Leri777 committed on
Commit 2e1cfa2
1 Parent(s): b149e3a

Update app.py

Files changed (1):
  1. app.py +36 -27
app.py CHANGED

@@ -8,8 +8,10 @@ from logging.handlers import RotatingFileHandler
import torch
import spaces
import gradio as gr
- from transformers import AutoTokenizer, BitsAndBytesConfig
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
from langchain_huggingface import ChatHuggingFace
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import LLMChain

subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

@@ -33,34 +35,42 @@ COLOR = "blue"
EMOJI = "🤖"
DESCRIPTION = f"This is the {MODEL_NAME} model designed for coding assistance and general AI tasks."

+ # Prompt template for the conversation, in ChatML format
+ template = """<|im_start|>system
+ {system_prompt}
+ <|im_end|>
+ {history}
+ <|im_start|>user
+ {human_input}
+ <|im_end|>
+ <|im_start|>assistant
+ """
+ prompt = PromptTemplate(template=template, input_variables=["system_prompt", "history", "human_input"])
+
+ # Format the conversation history into ChatML user/assistant turns
+ def format_history(history):
+     formatted = ""
+     for human, ai in history:
+         formatted += f"<|im_start|>user\n{human}\n<|im_end|>\n<|im_start|>assistant\n{ai}\n<|im_end|>\n"
+     return formatted
+
@spaces.GPU()
def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    logger.debug(f"Received prediction request: message='{message}', system_prompt='{system_prompt}'")
-     if CHAT_TEMPLATE == "Auto":
-         instruction = system_prompt + "\n\n"
-         for user, assistant in history:
-             instruction += f"User: {user}\nAssistant: {assistant}\n"
-         instruction += f"User: {message}\nAssistant:"
-     elif CHAT_TEMPLATE == "ChatML":
-         instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
-         for user, assistant in history:
-             instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
-         instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'
-     elif CHAT_TEMPLATE == "Mistral Instruct":
-         instruction = f'<s>[INST] {system_prompt}\n'
-         for user, assistant in history:
-             instruction += f'{user} [/INST] {assistant}</s>[INST]'
-         instruction += f' {message} [/INST]'
-     else:
-         raise Exception("Incorrect chat template, select 'Auto', 'ChatML' or 'Mistral Instruct'")
-     print(instruction)

-     response = chat_model.predict(instruction)
-     outputs = []
+     formatted_history = format_history(history)
+
+     chat_model.temperature = temperature
+     chat_model.max_new_tokens = max_new_tokens
+     chat_model.top_k = top_k
+     chat_model.repetition_penalty = repetition_penalty
+     chat_model.top_p = top_p
+
+     chain = LLMChain(llm=chat_model, prompt=prompt)
+
    try:
-         for token in response:
-             outputs.append(token)
-             yield "".join(outputs)
+         for chunk in chain.stream({"system_prompt": system_prompt, "history": formatted_history, "human_input": message}):
+             yield chunk["text"]
        logger.debug(f"Prediction completed successfully for message: '{message}'")
    except Exception as e:
        logger.exception(f"Error during prediction for message '{message}': {str(e)}")
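For reference, the template and format_history introduced in this hunk compose into a single ChatML string before the chain runs. A minimal standalone sketch (the one-turn history is hypothetical, just to show the rendered prompt):

from langchain.prompts import PromptTemplate

# Same template and formatter as in the hunk above.
template = """<|im_start|>system
{system_prompt}
<|im_end|>
{history}
<|im_start|>user
{human_input}
<|im_end|>
<|im_start|>assistant
"""
prompt = PromptTemplate(template=template, input_variables=["system_prompt", "history", "human_input"])

def format_history(history):
    formatted = ""
    for human, ai in history:
        formatted += f"<|im_start|>user\n{human}\n<|im_end|>\n<|im_start|>assistant\n{ai}\n<|im_end|>\n"
    return formatted

# Hypothetical history, purely to illustrate the output.
print(prompt.format(
    system_prompt="You are a code assistant.",
    history=format_history([("What is 2 + 2?", "4")]),
    human_input="And 3 + 3?",
))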
@@ -75,12 +85,12 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

chat_model = ChatHuggingFace(
    model_name=MODEL_ID,
-     huggingfacehub_api_token=os.environ["HF_TOKEN"],
    model_kwargs={
        "device_map": "auto",
        "quantization_config": quantization_config,
        "attn_implementation": "flash_attention_2",
-     }
+     },
+     tokenizer=tokenizer
)

logger.debug("Model and tokenizer loaded successfully")
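In released versions of langchain_huggingface, ChatHuggingFace typically wraps an llm object such as HuggingFacePipeline rather than taking model_name/model_kwargs directly. A sketch of that pattern, assuming the MODEL_ID, tokenizer, and quantization_config defined elsewhere in app.py:

from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

# Sketch only: MODEL_ID, tokenizer, and quantization_config are assumed
# to come from the surrounding app.py.
llm = HuggingFacePipeline.from_model_id(
    model_id=MODEL_ID,
    task="text-generation",
    device_map="auto",
    model_kwargs={
        "quantization_config": quantization_config,
        "attn_implementation": "flash_attention_2",
    },
)
chat_model = ChatHuggingFace(llm=llm, tokenizer=tokenizer)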
@@ -101,7 +111,6 @@ gr.ChatInterface(
        ["How do I find the maximum element in an array using Kotlin?"],
        ["Write a Rust program to generate the Fibonacci sequence up to the 10th number."]
    ],
-     additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
    additional_inputs=[
        gr.Textbox("You are a code assistant.", label="System prompt"),
        gr.Slider(0, 1, 0.3, label="Temperature"),
 
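The quantization_config referenced in the hunks above is defined outside the diff context. Purely as an assumption of what it may contain, a common 4-bit setup with transformers' BitsAndBytesConfig looks like this:

import torch
from transformers import BitsAndBytesConfig

# Hypothetical values; the actual config is not shown in this diff.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)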