Update app.py
app.py CHANGED
@@ -8,7 +8,8 @@ from logging.handlers import RotatingFileHandler
 import torch
 import spaces
 import gradio as gr
-from transformers import
+from transformers import AutoTokenizer, BitsAndBytesConfig
+from langchain_huggingface import ChatHuggingFace

 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

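Note on the runtime install above: passing env={...} to subprocess.run replaces the child process environment with that single variable. A minimal sketch of the same flash-attn install that merges the flag into the existing environment instead (assumes os is imported in app.py, as the os.environ lookup later in this diff suggests; not part of the commit):

import os
import subprocess

# Install flash-attn at Space startup, skipping the local CUDA kernel build;
# merging os.environ keeps PATH and CUDA-related variables visible to pip.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)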
@@ -36,19 +37,16 @@ DESCRIPTION = f"This is the {MODEL_NAME} model designed for coding assistance an
 def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
     logger.debug(f"Received prediction request: message='{message}', system_prompt='{system_prompt}'")
     if CHAT_TEMPLATE == "Auto":
-        stop_tokens = [tokenizer.eos_token_id]
         instruction = system_prompt + "\n\n"
         for user, assistant in history:
             instruction += f"User: {user}\nAssistant: {assistant}\n"
         instruction += f"User: {message}\nAssistant:"
     elif CHAT_TEMPLATE == "ChatML":
-        stop_tokens = ["<|endoftext|>", "<|im_end|>"]
         instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
         for user, assistant in history:
             instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
         instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'
     elif CHAT_TEMPLATE == "Mistral Instruct":
-        stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
         instruction = f'<s>[INST] {system_prompt}\n'
         for user, assistant in history:
             instruction += f'{user} [/INST] {assistant}</s>[INST]'
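The hunk above keeps the hand-rolled prompt strings for the 'Auto', 'ChatML' and 'Mistral Instruct' branches. For comparison only, a minimal sketch of building the prompt with transformers' built-in chat templating, assuming the tokenizer for MODEL_ID ships a chat template (this helper is not part of the commit):

def build_prompt(message, history, system_prompt, tokenizer):
    # Convert the Gradio-style (user, assistant) history into role/content messages.
    messages = [{"role": "system", "content": system_prompt}]
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    # Render with the model's own template and leave the assistant turn open.
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)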
@@ -57,33 +55,11 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k,
         raise Exception("Incorrect chat template, select 'Auto', 'ChatML' or 'Mistral Instruct'")
     print(instruction)

-
-    enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
-    input_ids, attention_mask = enc.input_ids, enc.attention_mask
-
-    if input_ids.shape[1] > CONTEXT_LENGTH:
-        input_ids = input_ids[:, -CONTEXT_LENGTH:]
-        attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
-
-    generate_kwargs = dict(
-        input_ids=input_ids.to(device),
-        attention_mask=attention_mask.to(device),
-        streamer=streamer,
-        do_sample=True,
-        temperature=temperature,
-        max_new_tokens=max_new_tokens,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
-        top_p=top_p
-    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
+    response = chat_model.predict(instruction)
     outputs = []
     try:
-        for new_token in streamer:
-            outputs.append(new_token)
-            if new_token in stop_tokens:
-                break
+        for token in response:
+            outputs.append(token)
             yield "".join(outputs)
         logger.debug(f"Prediction completed successfully for message: '{message}'")
     except Exception as e:
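In the new version, chat_model.predict(instruction) returns the completed reply as a plain string, so the for token in response loop iterates over characters of a finished answer rather than streaming tokens the way the removed streamer-and-Thread code did; the temperature, max_new_tokens, top_k, repetition_penalty and top_p arguments are also no longer passed anywhere in the code shown here. A minimal sketch of incremental output via LangChain's streaming interface, assuming the wrapped backend supports it (hypothetical variant, not part of the commit):

def predict_streaming(instruction):
    # Hypothetical replacement for the tail of predict(): stream chunks from
    # the chat model instead of iterating over an already finished string.
    outputs = []
    for chunk in chat_model.stream(instruction):
        outputs.append(chunk.content)
        yield "".join(outputs)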
@@ -96,11 +72,15 @@ quantization_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-
-
-
-
-
-)
+
+chat_model = ChatHuggingFace(
+    model_name=MODEL_ID,
+    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
+    model_kwargs={
+        "device_map": "auto",
+        "quantization_config": quantization_config,
+        "attn_implementation": "flash_attention_2",
+    }
+)

 logger.debug("Model and tokenizer loaded successfully")
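ChatHuggingFace is constructed above directly from model_name, huggingfacehub_api_token and model_kwargs; depending on the installed langchain_huggingface version, the class may instead expect an llm wrapper such as HuggingFacePipeline or HuggingFaceEndpoint. A minimal sketch of a pipeline-backed construction under that assumption, reusing MODEL_ID, tokenizer and quantization_config from this file (not taken from the commit):

from transformers import AutoModelForCausalLM, pipeline
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

# Load the model in-process with the 4-bit quantization config and flash
# attention, then wrap the transformers pipeline for LangChain.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=quantization_config,
    attn_implementation="flash_attention_2",
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
llm = HuggingFacePipeline(pipeline=pipe, model_id=MODEL_ID)
chat_model = ChatHuggingFace(llm=llm)

Note that a LangChain chat model generally applies the model's own chat template to the messages it receives, so passing the pre-formatted instruction string built in predict() may wrap the prompt in a second template.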