import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Initialize model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3.5-mini-instruct",
    device_map="cpu",
    torch_dtype=torch.float16,  # fp16 support on CPU is limited; fall back to torch.float32 if generation errors out
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
# Create pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
# Generation arguments
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,  # return only the newly generated reply
    "temperature": 0.0,
    "do_sample": False,  # greedy decoding; temperature is ignored when sampling is off
}
def chat(message, history, system_prompt):
    # Prepare messages with the system prompt first
    messages = [
        {"role": "system", "content": system_prompt},
    ]
    # Add history to messages
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    # Add current message
    messages.append({"role": "user", "content": message})
    # Generate response
    output = pipe(messages, **generation_args)
    response = output[0]['generated_text']
    return response
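# Optional sanity check: call chat() directly before wiring the UI.
# The prompt below is illustrative, not part of the original app;
# uncomment to test from a plain terminal run.
# print(chat("Hello! Who are you?", [], "You are a helpful AI assistant."))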
# Gradio interface
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")
    system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful AI assistant.")

    def respond(message, chat_history, system_prompt_text):
        bot_message = chat(message, chat_history, system_prompt_text)
        chat_history.append((message, bot_message))
        return "", chat_history

    # Pass the system prompt component as an event input so the handler
    # receives its current value; reading system_prompt.value inside the
    # handler would only ever return the initial default
    msg.submit(respond, [msg, chatbot, system_prompt], [msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch()
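Running the script locally serves the demo at http://127.0.0.1:7860 by default. To deploy it as a Hugging Face Space, the script is typically saved as app.py, with gradio, torch, transformers, and accelerate (required by device_map) listed in requirements.txt.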