Mistral-lab

Running on Zero

App Files Files Community

vilarin committed on Jul 17

Commit

3eed0af

•

1 Parent(s): 93fdc72

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -25

app.py CHANGED Viewed

@@ -2,22 +2,18 @@ import os
 import time
 #import spaces
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import gradio as gr
-MODEL_LIST = ["openbmb/MiniCPM-1B-sft-bf16", "openbmb/MiniCPM-S-1B-sft"]
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
-MODEL_ID = os.environ.get("MODEL_ID", None)
-MODEL_NAME = MODEL_ID.split("/")[-1]
-TITLE = "<h1><center>MiniCPM-S-1B-chat</center></h1>"
-DESCRIPTION = f"""
-<h3>MODEL NOW: <a href="https://hf.co/{MODEL_ID}">{MODEL_NAME}</a></h3>
-"""
 PLACEHOLDER = """
 <center>
-<p>MiniCPM is an End-Size LLM with only 1.2B parameters excluding embeddings.</p>
 </center>
 """
@@ -34,13 +30,22 @@ h3 {
 }
 """
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    device_map='auto',
-    low_cpu_mem_usage=True,
-    trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
 #@spaces.GPU()
 def stream_chat(
@@ -50,7 +55,8 @@ def stream_chat(
     max_new_tokens: int = 1024,
     top_p: float = 1.0,
     top_k: int = 20,
-    penalty: float = 1.2
 ):
     print(f'message: {message}')
     print(f'history: {history}')
@@ -61,26 +67,49 @@ def stream_chat(
             {"role": "user", "content": prompt},
             {"role": "assistant", "content": answer},
         ])
-    torch.manual_seed(0)
-    resp, history = model.chat(
-        tokenizer,
-        query = message,
-        history = conversation,
-        max_length = max_new_tokens,
         do_sample = False if temperature == 0 else True,
         top_p = top_p,
         top_k = top_k,
         temperature = temperature,
     )
-    return resp
 chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
 with gr.Blocks(css=CSS, theme="soft") as demo:
     gr.HTML(TITLE)
-    gr.HTML(DESCRIPTION)
     gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
     gr.ChatInterface(
         fn=stream_chat,
@@ -128,6 +157,12 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
                 label="Repetition penalty",
                 render=False,
             ),
         ],
         examples=[
             ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],

 import time
 #import spaces
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import gradio as gr
+from threading import Thread
+MODEL_LIST = ["HuggingFaceTB/SmolLM-1.7B-Instruct", "HuggingFaceTB/SmolLM-135M-Instruct", "HuggingFaceTB/SmolLM-360M-Instruct"]
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
+TITLE = "<h1><center>SmolLM-Instruct</center></h1>"
 PLACEHOLDER = """
 <center>
+<p>SmolLM is a series of state-of-the-art small language models available in three sizes: 135M, 360M, and 1.7B parameters.</p>
 </center>
 """
 }
 """
+# pip install transformers
+from transformers import AutoModelForCausalLM, AutoTokenizer
+device = "cpu" # use "cuda" for GPU usage or "cpu" for CPU usage
+tokenizer0 = AutoTokenizer.from_pretrained(MODEL_LIST[0])
+model0 = AutoModelForCausalLM.from_pretrained(MODEL_LIST[0]).to(device)
+tokenizer1 = AutoTokenizer.from_pretrained(MODEL_LIST[1])
+model1 = AutoModelForCausalLM.from_pretrained(MODEL_LIST[1]).to(device)
+tokenizer2 = AutoTokenizer.from_pretrained(MODEL_LIST[2])
+model2 = AutoModelForCausalLM.from_pretrained(MODEL_LIST[2]).to(device)
+messages = [{"role": "user", "content": "List the steps to bake a chocolate cake from scratch."}]
 #@spaces.GPU()
 def stream_chat(
     max_new_tokens: int = 1024,
     top_p: float = 1.0,
     top_k: int = 20,
+    penalty: float = 1.2,
+    choice: str = "1.7B"
 ):
     print(f'message: {message}')
     print(f'history: {history}')
             {"role": "user", "content": prompt},
             {"role": "assistant", "content": answer},
         ])
+    conversation.append({"role": "user", "content": message})
+    if choice == "1.7B":
+        tokenizer = tokenizer0
+        model = model0
+    elif choice == "135M":
+        model = model1
+        tokenizer = tokenizer1
+    else:
+        model = model2
+        tokenizer = tokenizer2
+    input_text=tokenizer.apply_chat_template(conversation, tokenize=False)
+    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        inputs,
+        max_new_tokens = max_new_tokens,
         do_sample = False if temperature == 0 else True,
         top_p = top_p,
         top_k = top_k,
         temperature = temperature,
+        streamer=streamer,
     )
+    with torch.no_grad():
+        thread = Thread(target=model.generate, kwargs=gen_kwargs)
+        thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        yield buffer
+    #print(tokenizer.decode(outputs[0]))
 chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
 with gr.Blocks(css=CSS, theme="soft") as demo:
     gr.HTML(TITLE)
     gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
     gr.ChatInterface(
         fn=stream_chat,
                 label="Repetition penalty",
                 render=False,
             ),
+            gr.Radio(
+                ["135M", "360M", "1.7B"],
+                value="1.7B",
+                label="Load Model",
+                render=False,
+            ),
         ],
         examples=[
             ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],