Mistral-lab

Running on Zero

vilarin committed on Jul 20

Commit

1d4c579

•

1 Parent(s): 4d71d31

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,3 +1,9 @@
 import os
 import time
 import spaces
@@ -37,6 +43,7 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL,
     torch_dtype=torch.bfloat16,
     device_map="auto",
     ignore_mismatched_sizes=True)
@@ -44,7 +51,7 @@ model = AutoModelForCausalLM.from_pretrained(
 def stream_chat(
     message: str,
     history: list,
-    temperature: float = 0.35,
     max_new_tokens: int = 1024,
     top_p: float = 1.0,
     top_k: int = 20,
@@ -101,7 +108,7 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
                 minimum=0,
                 maximum=1,
                 step=0.1,
-                value=0.35,
                 label="Temperature",
                 render=False,
             ),

+import os
+import subprocess
+
+# Install flash-attn when the app starts, before the model is loaded.
+# NOTE(review): FLASH_ATTENTION_SKIP_CUDA_BUILD presumably makes the package
+# skip compiling its CUDA extension during install -- confirm against the
+# flash-attn build documentation.
+#
+# The flag is merged into a copy of the current environment. `env=` replaces
+# the child's environment wholesale, so a dict holding only the flag would
+# drop PATH, HOME and any proxy/pip settings the install relies on.
+subprocess.run(
+    'pip install flash-attn --no-build-isolation',
+    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
+    shell=True
+)
 import os
 import time
 import spaces
 model = AutoModelForCausalLM.from_pretrained(
     MODEL,
     torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
     device_map="auto",
     ignore_mismatched_sizes=True)
 def stream_chat(
     message: str,
     history: list,
+    temperature: float = 0.3,
     max_new_tokens: int = 1024,
     top_p: float = 1.0,
     top_k: int = 20,
                 minimum=0,
                 maximum=1,
                 step=0.1,
+                value=0.3,
                 label="Temperature",
                 render=False,
             ),