Leri777 committed
Commit 76cf20f
1 Parent(s): a800c44

Update app.py

Files changed (1)
  1. app.py +14 -34
app.py CHANGED
@@ -8,7 +8,8 @@ from logging.handlers import RotatingFileHandler
 import torch
 import spaces
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
+from transformers import AutoTokenizer, BitsAndBytesConfig
+from langchain_huggingface import ChatHuggingFace
 
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
@@ -36,19 +37,16 @@ DESCRIPTION = f"This is the {MODEL_NAME} model designed for coding assistance an
 def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
     logger.debug(f"Received prediction request: message='{message}', system_prompt='{system_prompt}'")
     if CHAT_TEMPLATE == "Auto":
-        stop_tokens = [tokenizer.eos_token_id]
         instruction = system_prompt + "\n\n"
         for user, assistant in history:
             instruction += f"User: {user}\nAssistant: {assistant}\n"
         instruction += f"User: {message}\nAssistant:"
     elif CHAT_TEMPLATE == "ChatML":
-        stop_tokens = ["<|endoftext|>", "<|im_end|>"]
         instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
         for user, assistant in history:
             instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
         instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'
     elif CHAT_TEMPLATE == "Mistral Instruct":
-        stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
         instruction = f'<s>[INST] {system_prompt}\n'
         for user, assistant in history:
             instruction += f'{user} [/INST] {assistant}</s>[INST]'
@@ -57,33 +55,11 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k,
         raise Exception("Incorrect chat template, select 'Auto', 'ChatML' or 'Mistral Instruct'")
     print(instruction)
 
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
-    input_ids, attention_mask = enc.input_ids, enc.attention_mask
-
-    if input_ids.shape[1] > CONTEXT_LENGTH:
-        input_ids = input_ids[:, -CONTEXT_LENGTH:]
-        attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
-
-    generate_kwargs = dict(
-        input_ids=input_ids.to(device),
-        attention_mask=attention_mask.to(device),
-        streamer=streamer,
-        do_sample=True,
-        temperature=temperature,
-        max_new_tokens=max_new_tokens,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
-        top_p=top_p
-    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
+    response = chat_model.predict(instruction)
     outputs = []
     try:
-        for new_token in streamer:
-            outputs.append(new_token)
-            if new_token in stop_tokens:
-                break
+        for token in response:
+            outputs.append(token)
             yield "".join(outputs)
         logger.debug(f"Prediction completed successfully for message: '{message}'")
     except Exception as e:
@@ -96,11 +72,15 @@ quantization_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    device_map="auto",
-    quantization_config=quantization_config,
-    attn_implementation="flash_attention_2",
+
+chat_model = ChatHuggingFace(
+    model_name=MODEL_ID,
+    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
+    model_kwargs={
+        "device_map": "auto",
+        "quantization_config": quantization_config,
+        "attn_implementation": "flash_attention_2",
+    }
 )
 
 logger.debug("Model and tokenizer loaded successfully")
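
For context, below is a minimal, illustrative sketch of one common way to drive a locally loaded model through langchain_huggingface: wrap a transformers text-generation pipeline in HuggingFacePipeline, hand it to ChatHuggingFace, and stream chunks back to the UI. The model id, generation settings, and prompt are placeholders for illustration and are not taken from this commit.

# Illustrative sketch only; model id and generation settings below are
# placeholders, not values from this Space.
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

# Load a local text-generation pipeline and wrap it as a LangChain chat model.
llm = HuggingFacePipeline.from_model_id(
    model_id="HuggingFaceH4/zephyr-7b-beta",  # placeholder model id
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 256, "do_sample": True, "temperature": 0.7},
)
chat_model = ChatHuggingFace(llm=llm)

# stream() yields message chunks, so a Gradio generator can emit partial text
# as it arrives instead of waiting for the whole completion.
partial = ""
for chunk in chat_model.stream("Write a Python function that reverses a string."):
    partial += chunk.content
    print(partial)

If predict() returns the completed text as a single string (the legacy LangChain predict() interface does), iterating over it walks characters rather than streamed tokens; stream() is the chunk-by-chunk alternative.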