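# NOTE(review): the six lines below look like file-viewer page chrome captured
# together with the source, not part of the program; they are kept verbatim as
# comments so the module can be parsed.
# DexterSptizu's picture
# Update app.py
# ddd1c57 verified
# raw
# history blame
# 3.9 kB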
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
# Registry of selectable models: display name -> checkpoint identifier handed
# to ``from_pretrained``. Every checkpoint sits under the same "HuggingFaceTB"
# prefix, so the identifier is derived from the display name.
MODELS = {
    variant: f"HuggingFaceTB/{variant}"
    for variant in (
        "SmolLM2-135M-Instruct",
        "SmolLM2-360M-Instruct",
        "SmolLM2-1.7B-Instruct",
    )
}
class ModelHandler:
    """Keep track of the currently loaded model, its tokenizer and its device.

    Only one model/tokenizer pair is held at a time; ``load_model`` replaces
    the pair as a unit.
    """

    def __init__(self):
        # Both stay ``None`` until the first successful ``load_model`` call.
        self.current_model = None
        self.current_tokenizer = None
        # Initial guess for where input tensors should go. The previous
        # expression evaluated to "cpu" on both branches, which disagrees with
        # ``device_map="auto"`` whenever a GPU is present. ``load_model``
        # overwrites this with the device the model actually landed on.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_model(self, model_name):
        """Load the checkpoint registered under ``model_name`` in ``MODELS``.

        Args:
            model_name: A key of the module-level ``MODELS`` mapping.

        Returns:
            ``"Successfully loaded <model_name>"`` on success, otherwise
            ``"Error loading model: <details>"``. Exceptions raised while
            looking up or loading the checkpoint are reported through the
            returned string instead of propagating.
        """
        try:
            checkpoint = MODELS[model_name]
            # Load into locals first: if the model fails to load after the
            # tokenizer succeeded, the handler must not be left holding a new
            # tokenizer next to the old model.
            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            model = AutoModelForCausalLM.from_pretrained(
                checkpoint,
                torch_dtype=torch.bfloat16,
                device_map="auto",
            )
            device = model.device
        except Exception as e:
            return f"Error loading model: {str(e)}"

        # Swap the pair in together, and remember the model's real device so
        # callers move their inputs to the same place as the weights.
        self.current_tokenizer = tokenizer
        self.current_model = model
        self.device = device
        return f"Successfully loaded {model_name}"
# Module-level handler used by generate_text; keeping it here lets a model
# loaded for one call be reused by later calls that select the same model.
model_handler = ModelHandler()
def generate_text(model_name, prompt, max_tokens, temperature, top_p):
    """Generate a reply to ``prompt`` with the selected model.

    Args:
        model_name: Key of ``MODELS`` naming the model to use; the model is
            (re)loaded when it differs from the one currently held.
        prompt: User message; sent as a single-turn chat.
        max_tokens: Upper bound on newly generated tokens (coerced to ``int``).
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.

    Returns:
        The decoded text of the newly generated tokens, or an error string
        (``"Error loading model: ..."`` / ``"Error during generation: ..."``).
        Failures are reported through the returned string rather than raised,
        so the UI always has something to display.
    """
    # Reject empty input up front instead of loading a model and sampling
    # from a chat turn with no content.
    if not prompt or not prompt.strip():
        return "Error during generation: prompt is empty"

    try:
        # Load model if it's different from the current one
        if (
            model_handler.current_model is None
            or MODELS[model_name] != model_handler.current_model.name_or_path
        ):
            load_status = model_handler.load_model(model_name)
            if load_status.startswith("Error"):
                return load_status

        tokenizer = model_handler.current_tokenizer
        model = model_handler.current_model

        # Format input as a chat message. ``add_generation_prompt=True`` opens
        # the assistant turn so the model answers instead of possibly
        # continuing the user's message.
        messages = [{"role": "user", "content": prompt}]
        input_text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # Tokenize and move the ids to wherever the model weights live
        # (``device_map="auto"`` decides that at load time).
        inputs = tokenizer.encode(input_text, return_tensors="pt").to(model.device)

        # Generate; slider values may arrive as floats, but the token budget
        # has to be an integer.
        outputs = model.generate(
            inputs,
            max_new_tokens=int(max_tokens),
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        )

        # The returned sequence starts with the prompt tokens; drop them so
        # only the model's reply is shown.
        reply_ids = outputs[0][inputs.shape[-1]:]
        return tokenizer.decode(reply_ids, skip_special_tokens=True)
    except Exception as e:
        return f"Error during generation: {str(e)}"
# --- Gradio UI -------------------------------------------------------------
# The five input components are listed in the same order as the parameters of
# generate_text: model, prompt, max tokens, temperature, top-p.

_DEFAULT_MODEL = "SmolLM2-360M-Instruct"

# Temperature and Top P share the same 0.1-1.0 range and step.
_unit_range = dict(minimum=0.1, maximum=1.0, step=0.1)

_input_components = [
    gr.Dropdown(
        label="Select Model",
        choices=list(MODELS.keys()),
        value=_DEFAULT_MODEL,
    ),
    gr.Textbox(
        label="Enter your prompt",
        lines=3,
        placeholder="What would you like to know?",
    ),
    gr.Slider(
        label="Maximum Tokens",
        minimum=10,
        maximum=500,
        step=10,
        value=50,
    ),
    gr.Slider(label="Temperature", value=0.2, **_unit_range),
    gr.Slider(label="Top P", value=0.9, **_unit_range),
]

# Each example row is [model, prompt, max tokens, temperature, top-p]; the
# model and top-p are the same for every example.
_example_rows = [
    [_DEFAULT_MODEL, question, token_budget, temp, 0.9]
    for question, token_budget, temp in (
        ("What is the capital of France?", 50, 0.2),
        ("Explain quantum computing in simple terms.", 200, 0.3),
        ("Write a short poem about nature.", 100, 0.7),
    )
]

_description = """
Compare different sizes of SmolLM2 models:
- SmolLM2-135M-Instruct: Smallest and fastest
- SmolLM2-360M-Instruct: Balanced size and performance
- SmolLM2-1.7B-Instruct: Largest and most capable
"""

iface = gr.Interface(
    title="SmolLM2 Model Comparison",
    description=_description,
    fn=generate_text,
    inputs=_input_components,
    outputs=gr.Textbox(label="Generated Response", lines=5),
    examples=_example_rows,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch(share=True)