import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Dictionary of available models
MODELS = {
    "SmolLM2-135M-Instruct": "HuggingFaceTB/SmolLM2-135M-Instruct",
    "SmolLM2-360M-Instruct": "HuggingFaceTB/SmolLM2-360M-Instruct",
    "SmolLM2-1.7B-Instruct": "HuggingFaceTB/SmolLM2-1.7B-Instruct"
}
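# The dictionary values are repository IDs on the Hugging Face Hub;
# from_pretrained() downloads and caches each checkpoint on first use.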

class ModelHandler:
    def __init__(self):
        self.current_model = None
        self.current_tokenizer = None
        # Use the GPU when available, otherwise fall back to CPU
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_model(self, model_name):
        try:
            checkpoint = MODELS[model_name]
            self.current_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            self.current_model = AutoModelForCausalLM.from_pretrained(
                checkpoint,
                torch_dtype=torch.bfloat16,
                device_map="auto"
            )
            return f"Successfully loaded {model_name}"
        except Exception as e:
            return f"Error loading model: {str(e)}"

model_handler = ModelHandler()
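# Minimal standalone check of the handler (hypothetical usage, not part of the
# app flow; assumes network access to the Hugging Face Hub for the download):
#   handler = ModelHandler()
#   print(handler.load_model("SmolLM2-135M-Instruct"))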

def generate_text(model_name, prompt, max_tokens, temperature, top_p):
    try:
        # Load the model if it differs from the one currently in memory
        if model_handler.current_model is None or MODELS[model_name] != model_handler.current_model.name_or_path:
            load_status = model_handler.load_model(model_name)
            if "Error" in load_status:
                return load_status

        # Format the input as a chat message; add_generation_prompt=True appends
        # the assistant turn so the instruct model answers rather than continuing
        # the user's text
        messages = [{"role": "user", "content": prompt}]
        input_text = model_handler.current_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Tokenize
        inputs = model_handler.current_tokenizer.encode(
            input_text,
            return_tensors="pt"
        ).to(model_handler.device)

        # Generate
        outputs = model_handler.current_model.generate(
            inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True
        )

        # Decode and return
        response = model_handler.current_tokenizer.decode(
            outputs[0],
            skip_special_tokens=True
        )
        return response
    except Exception as e:
        return f"Error during generation: {str(e)}"


# Create Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Dropdown(
            choices=list(MODELS.keys()),
            label="Select Model",
            value="SmolLM2-360M-Instruct"
        ),
        gr.Textbox(
            label="Enter your prompt",
            placeholder="What would you like to know?",
            lines=3
        ),
        gr.Slider(
            minimum=10,
            maximum=500,
            value=50,
            step=10,
            label="Maximum Tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.2,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.1,
            label="Top P"
        )
    ],
    outputs=gr.Textbox(label="Generated Response", lines=5),
    title="SmolLM2 Model Comparison",
    description="""
    Compare different sizes of SmolLM2 models:
    - SmolLM2-135M-Instruct: Smallest and fastest
    - SmolLM2-360M-Instruct: Balanced size and performance
    - SmolLM2-1.7B-Instruct: Largest and most capable
    """,
    examples=[
        ["SmolLM2-360M-Instruct", "What is the capital of France?", 50, 0.2, 0.9],
        ["SmolLM2-360M-Instruct", "Explain quantum computing in simple terms.", 200, 0.3, 0.9],
        ["SmolLM2-360M-Instruct", "Write a short poem about nature.", 100, 0.7, 0.9]
    ]
)

# Launch the application
if __name__ == "__main__":
    iface.launch(share=True)
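# Dependency note (an assumption, not taken from the original Space): a
# requirements.txt listing torch, transformers, and gradio should be enough
# for a Hugging Face Space to build and run this app.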