Leri777 committed
Commit 76cf20f
1 Parent(s): a800c44

Update app.py

Files changed (1)
  1. app.py +14 -34
app.py CHANGED
@@ -8,7 +8,8 @@ from logging.handlers import RotatingFileHandler
 import torch
 import spaces
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
+from transformers import AutoTokenizer, BitsAndBytesConfig
+from langchain_huggingface import ChatHuggingFace
 
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
@@ -36,19 +37,16 @@ DESCRIPTION = f"This is the {MODEL_NAME} model designed for coding assistance an
 def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
     logger.debug(f"Received prediction request: message='{message}', system_prompt='{system_prompt}'")
     if CHAT_TEMPLATE == "Auto":
-        stop_tokens = [tokenizer.eos_token_id]
         instruction = system_prompt + "\n\n"
         for user, assistant in history:
             instruction += f"User: {user}\nAssistant: {assistant}\n"
         instruction += f"User: {message}\nAssistant:"
     elif CHAT_TEMPLATE == "ChatML":
-        stop_tokens = ["<|endoftext|>", "<|im_end|>"]
         instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
         for user, assistant in history:
             instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
         instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'
     elif CHAT_TEMPLATE == "Mistral Instruct":
-        stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
         instruction = f'<s>[INST] {system_prompt}\n'
         for user, assistant in history:
             instruction += f'{user} [/INST] {assistant}</s>[INST]'
@@ -57,33 +55,11 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k,
         raise Exception("Incorrect chat template, select 'Auto', 'ChatML' or 'Mistral Instruct'")
     print(instruction)
 
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
-    input_ids, attention_mask = enc.input_ids, enc.attention_mask
-
-    if input_ids.shape[1] > CONTEXT_LENGTH:
-        input_ids = input_ids[:, -CONTEXT_LENGTH:]
-        attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
-
-    generate_kwargs = dict(
-        input_ids=input_ids.to(device),
-        attention_mask=attention_mask.to(device),
-        streamer=streamer,
-        do_sample=True,
-        temperature=temperature,
-        max_new_tokens=max_new_tokens,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
-        top_p=top_p
-    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
+    response = chat_model.predict(instruction)
     outputs = []
     try:
-        for new_token in streamer:
-            outputs.append(new_token)
-            if new_token in stop_tokens:
-                break
+        for token in response:
+            outputs.append(token)
             yield "".join(outputs)
         logger.debug(f"Prediction completed successfully for message: '{message}'")
     except Exception as e:
@@ -96,11 +72,15 @@ quantization_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    device_map="auto",
-    quantization_config=quantization_config,
-    attn_implementation="flash_attention_2",
+
+chat_model = ChatHuggingFace(
+    model_name=MODEL_ID,
+    huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
+    model_kwargs={
+        "device_map": "auto",
+        "quantization_config": quantization_config,
+        "attn_implementation": "flash_attention_2",
+    }
 )
 
 logger.debug("Model and tokenizer loaded successfully")
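
For context, below is a minimal, illustrative sketch of one common way to drive a locally loaded model through langchain_huggingface: wrap a transformers text-generation pipeline in HuggingFacePipeline, hand it to ChatHuggingFace, and stream chunks back to the UI. The model id, generation settings, and prompt are placeholders for illustration and are not taken from this commit.

# Illustrative sketch only; model id and generation settings below are
# placeholders, not values from this Space.
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

# Load a local text-generation pipeline and wrap it as a LangChain chat model.
llm = HuggingFacePipeline.from_model_id(
    model_id="HuggingFaceH4/zephyr-7b-beta",  # placeholder model id
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 256, "do_sample": True, "temperature": 0.7},
)
chat_model = ChatHuggingFace(llm=llm)

# stream() yields message chunks, so a Gradio generator can emit partial text
# as it arrives instead of waiting for the whole completion.
partial = ""
for chunk in chat_model.stream("Write a Python function that reverses a string."):
    partial += chunk.content
    print(partial)

If predict() returns the completed text as a single string (the legacy LangChain predict() interface does), iterating over it walks characters rather than streamed tokens; stream() is the chunk-by-chunk alternative.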