Mistral-lab

Running on Zero

App Files Files Community

vilarin committed on May 23

Commit

51a7d9e

•

1 Parent(s): 1e24216

Create app.py

Browse files

Files changed (1) hide show

app.py +123 -0

app.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import torch
+from PIL import Image
+import gradio as gr
+import spaces
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+import os
+import time
+# Value of the HF_TOKEN environment variable, or None when it is not set.
+HF_TOKEN = os.environ.get("HF_TOKEN")
+
+# Hub repository id of the model, and its short name (the text after the last "/").
+MODEL_ID = "CohereForAI/aya-23-35B"
+MODEL_NAME = MODEL_ID.rsplit("/", 1)[-1]
+
+# HTML fragments rendered at the top of the page.
+TITLE = "<h1><center>Aya-23-35B-Chatbox</center></h1>"
+DESCRIPTION = (
+    "<h3><center>MODEL: "
+    f'<a href="https://hf.co/{MODEL_ID}">{MODEL_NAME}</a>'
+    "</center></h3>"
+)
+
+# Stylesheet handed to gr.Blocks; styles elements with the "duplicate-button" class.
+CSS = """
+.duplicate-button {
+  margin: auto !important;
+  color: white !important;
+  background: black !important;
+  border-radius: 100vh !important;
+}
+"""
+# --- Quantization and model loading ---
+# When True, the weights are loaded in 4-bit NF4 precision through bitsandbytes.
+QUANTIZE_4BIT = True
+# When True, the model is asked to use the "flash_attention_2" attention backend.
+USE_FLASH_ATTENTION = False
+# NOTE(review): the four settings below are not read anywhere in the visible
+# part of this file; they look like leftovers from a fine-tuning script. They
+# are kept so that anything referring to them by name keeps working.
+USE_GRAD_CHECKPOINTING = True
+TRAIN_BATCH_SIZE = 2
+TRAIN_MAX_SEQ_LENGTH = 512
+GRAD_ACC_STEPS = 16
+
+quantization_config = None
+if QUANTIZE_4BIT:
+    quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
+
+# None leaves the choice of attention implementation to transformers.
+attn_implementation = "flash_attention_2" if USE_FLASH_ATTENTION else None
+
+# Load from the full repository id (MODEL_ID), not the short display name:
+# MODEL_NAME lacks the organisation prefix and does not identify the repository.
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    quantization_config=quantization_config,
+    attn_implementation=attn_implementation,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+@spaces.GPU
+def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int):
+    """Generate the assistant's reply to ``message`` given the prior chat turns.
+
+    Despite its name, this function returns the complete reply as one string
+    rather than yielding partial output.
+
+    Args:
+        message: The newest user message.
+        history: Earlier turns, iterated as ``(user_prompt, assistant_answer)``
+            pairs.
+        temperature: Sampling temperature. A value of 0 or below switches to
+            greedy decoding, because sampling needs a strictly positive
+            temperature.
+        max_new_tokens: Upper bound on the number of tokens to generate.
+
+    Returns:
+        The decoded text of the newly generated tokens only, with special
+        tokens removed.
+    """
+    conversation = []
+    for prompt, answer in history:
+        conversation.append({"role": "user", "content": prompt})
+        conversation.append({"role": "assistant", "content": answer})
+    conversation.append({"role": "user", "content": message})
+
+    input_ids = tokenizer.apply_chat_template(
+        conversation,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_tensors="pt",
+    ).to(model.device)
+
+    generate_kwargs = {"max_new_tokens": int(max_new_tokens)}
+    if temperature > 0:
+        generate_kwargs.update(do_sample=True, temperature=temperature)
+    else:
+        # The UI slider allows 0; fall back to deterministic decoding there.
+        generate_kwargs["do_sample"] = False
+
+    gen_tokens = model.generate(input_ids, **generate_kwargs)
+
+    # The generated sequence starts with the prompt tokens; drop them so the
+    # chat window shows only the new answer.
+    new_tokens = gen_tokens[0][input_ids.shape[-1]:]
+    return tokenizer.decode(new_tokens, skip_special_tokens=True)
+# Chat display component; created up front and handed to the ChatInterface below.
+chatbot = gr.Chatbot(height=450)
+
+# Starter prompts offered by the chat interface (one single-item list per example).
+example_prompts = [
+    ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
+    ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."],
+    ["Tell me a random fun fact about the Roman Empire."],
+    ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
+]
+
+with gr.Blocks(css=CSS) as demo:
+    # Page header: title, model link, and the "duplicate" button.
+    gr.HTML(TITLE)
+    gr.HTML(DESCRIPTION)
+    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
+
+    # Extra controls are built with render=False and passed to the
+    # ChatInterface, which receives them as its accordion and additional inputs.
+    parameters_accordion = gr.Accordion(label="⚙️ Parameters", open=False, render=False)
+    temperature_slider = gr.Slider(
+        minimum=0,
+        maximum=1,
+        step=0.1,
+        value=0.8,
+        label="Temperature",
+        render=False,
+    )
+    max_new_tokens_slider = gr.Slider(
+        minimum=128,
+        maximum=4096,
+        step=1,
+        value=1024,
+        label="Max new tokens",
+        render=False,
+    )
+
+    # The slider order matches stream_chat's (temperature, max_new_tokens) parameters.
+    gr.ChatInterface(
+        fn=stream_chat,
+        chatbot=chatbot,
+        fill_height=True,
+        additional_inputs_accordion=parameters_accordion,
+        additional_inputs=[temperature_slider, max_new_tokens_slider],
+        examples=example_prompts,
+        cache_examples=False,
+    )
+
+# Start the app only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch()