indiejoseph committed
Commit
039d7bc
1 Parent(s): f9c87b8

Update app.py

Files changed (1)
  1. app.py +4 -3
app.py CHANGED
@@ -24,7 +24,7 @@ if not torch.cuda.is_available():
 
 if torch.cuda.is_available():
     model_id = "hon9kon9ize/CantoneseLLMChat-preview20240326"
-    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
     model = torch.compile(model)
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     tokenizer.use_default_system_prompt = False
@@ -48,7 +48,9 @@ def generate(
     conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
     conversation.append({"role": "user", "content": message})
 
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+    print(chat_history)
+
+    input_ids = tokenizer.apply_chat_template(conversation, tokenize=True, add_generation_prompt=True, return_tensors='pt')
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
@@ -60,7 +62,6 @@ def generate(
         top_p=top_p,
         top_k=top_k,
         temperature=temperature,
-        num_beams=1,
         repetition_penalty=repetition_penalty
     )
 
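
Why torch_dtype=torch.bfloat16: the updated from_pretrained call loads the weights in bfloat16 instead of the float32 default, roughly halving weight memory while keeping float32's exponent range. A minimal sketch of the new loading path, assuming only the model id from the diff (the dtype check at the end is illustrative):

import torch
from transformers import AutoModelForCausalLM

model_id = "hon9kon9ize/CantoneseLLMChat-preview20240326"
# device_map="auto" lets accelerate place layers on the available devices;
# torch_dtype=torch.bfloat16 loads the weights directly in half precision.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
print(next(model.parameters()).dtype)  # torch.bfloat16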
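
Why add_generation_prompt=True: without it, apply_chat_template ends the encoded prompt at the user's message, so the model may continue the user's turn; with it, the template's assistant header is appended and generation starts as the assistant. A small sketch against the same tokenizer, with a hypothetical one-turn conversation:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hon9kon9ize/CantoneseLLMChat-preview20240326")
conversation = [{"role": "user", "content": "你好"}]  # hypothetical single turn
# tokenize=True returns token ids directly; add_generation_prompt=True appends
# the assistant header defined by the model's chat template.
input_ids = tokenizer.apply_chat_template(
    conversation,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
print(input_ids.shape)  # (1, prompt_length)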
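
Two smaller points from the surrounding context: the unchanged trimming step keeps only the trailing MAX_INPUT_TOKEN_LENGTH tokens, so the oldest turns are dropped first, and removing num_beams=1 from the generation kwargs is a no-op, since 1 is the transformers default. A self-contained sketch of the trimming behaviour, with an assumed limit and a random stand-in tensor:

import torch

MAX_INPUT_TOKEN_LENGTH = 4096  # assumed value; the app defines its own constant
input_ids = torch.randint(0, 32000, (1, 5000))  # hypothetical over-long prompt
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
    # Truncate from the left so the most recent turns survive.
    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
print(input_ids.shape)  # torch.Size([1, 4096])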