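# Mental Health Chatbot (Gradio Space)
# Streams responses from TinyLlama-1.1B-Chat-v1.0 with a PEFT adapter
# fine-tuned on mental-health conversations
# (givyboy/TinyLlama-1.1B-Chat-v1.0-mental-health-conversational).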
import gradio as gr
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftConfig, PeftModel
import warnings
from threading import Thread
warnings.filterwarnings("ignore")
PEFT_MODEL = "givyboy/TinyLlama-1.1B-Chat-v1.0-mental-health-conversational"
SYSTEM_PROMPT = """Answer the following question truthfully.
If you don't know the answer, respond 'Sorry, I don't know the answer to this question.'.
If the question is too complex, respond 'Kindly, consult a psychiatrist for further queries.'."""
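# Prompt wrappers in the <HUMAN>/<ASSISTANT> format used by the adapter's
# training data (only referenced by the commented-out pipeline path below).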
USER_PROMPT = lambda x: f"""<HUMAN>: {x}\n<ASSISTANT>: """
ADD_RESPONSE = lambda x, y: f"""<HUMAN>: {x}\n<ASSISTANT>: {y}"""
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
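# 4-bit NF4 quantization settings. Currently unused: the quantized load below
# is commented out, so the base model loads in full precision with disk offload.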
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
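# Resolve the adapter's base model from the PEFT config, then load it with
# automatic device placement; weights that do not fit are offloaded to offload/.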
config = PeftConfig.from_pretrained(PEFT_MODEL)
peft_base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    # quantization_config=bnb_config,  # enable to load the base model in 4-bit
    device_map="auto",
    trust_remote_code=True,
    offload_folder="offload/",
    offload_state_dict=True,
)
peft_model = PeftModel.from_pretrained(
    peft_base_model,
    PEFT_MODEL,
    offload_folder="offload/",
    offload_state_dict=True,
)
# device_map="auto" has already dispatched the model (with disk offload), so an
# explicit peft_model.to(DEVICE) is unnecessary and errors on offloaded models.
peft_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
peft_tokenizer.pad_token = peft_tokenizer.eos_token  # base model defines no pad token; reuse EOS
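# Text-generation pipeline over the adapted model. Only the commented-out
# get_model_response() path below uses it; the streaming chat() handler calls
# peft_model.generate() directly.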
pipeline = transformers.pipeline(
    "text-generation",
    model=peft_model,
    tokenizer=peft_tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)
# def format_message(message: str, history: list[str], memory_limit: int = 3) -> str:
#     if len(history) > memory_limit:
#         history = history[-memory_limit:]
#     if len(history) == 0:
#         return f"{SYSTEM_PROMPT}\n{USER_PROMPT(message)}"
#     formatted_message = f"{SYSTEM_PROMPT}\n{ADD_RESPONSE(history[0][0], history[0][1])}"
#     for msg, ans in history[1:]:
#         formatted_message += f"\n{ADD_RESPONSE(msg, ans)}"
#     formatted_message += f"\n{USER_PROMPT(message)}"
#     return formatted_message
# def get_model_response(message: str, history: list[str]) -> str:
#     formatted_message = format_message(message, history)
#     sequences = pipeline(
#         formatted_message,
#         do_sample=True,
#         top_k=10,
#         num_return_sequences=1,
#         eos_token_id=peft_tokenizer.eos_token_id,
#         max_length=600,
#     )[0]
#     print(sequences["generated_text"])
#     output = sequences["generated_text"].split("<ASSISTANT>:")[-1].strip()
#     # print(f"Response: {output}")
#     return output
start_message = ""

def user(message, history):
    # Append the user's message to the conversation history.
    # (Unused: the ChatInterface below only needs fn=chat.)
    return "", history + [[message, ""]]
def chat(message, history):
    # Rebuild the conversation as role/content messages for the chat template.
    # The system prompt is passed once as a system message; prepending it (plus
    # the <HUMAN>/<ASSISTANT> tags) to the latest user turn would double-wrap
    # the prompt once the chat template is applied.
    chat_history = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_msg, assistant_msg in history:
        chat_history.append({"role": "user", "content": user_msg})
        if assistant_msg is not None:
            chat_history.append({"role": "assistant", "content": assistant_msg})
    chat_history.append({"role": "user", "content": message})
    messages = peft_tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
    # Tokenize the templated conversation
    model_inputs = peft_tokenizer([messages], return_tensors="pt").to(DEVICE)
    # Streamer yields decoded tokens as the background thread generates them
    streamer = transformers.TextIteratorStreamer(
        peft_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        model_inputs,  # unpacks input_ids and attention_mask
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        top_k=1000,
        temperature=0.75,
        num_beams=1,
    )
    # Run generation on a worker thread so tokens can be streamed as they arrive
    t = Thread(target=peft_model.generate, kwargs=generate_kwargs)
    t.start()
    # Accumulate the output and yield the running text so the UI updates live
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
demo = gr.ChatInterface(fn=chat, title="Mental Health Chatbot - SHEKHAR")
demo.launch(share=True)
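# --- Alternative implementation kept for reference: the same chatbot backed by
# --- a fine-tuned gpt-3.5-turbo model via the OpenAI API, streaming into Gradio.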
# import os
# from openai import OpenAI
# from dotenv import load_dotenv
# import gradio as gr
# load_dotenv()
# API_KEY = os.getenv("OPENAI_API_KEY")
# openai = OpenAI(api_key=API_KEY)
# create_msg = lambda x, y: {"role": x, "content": y}
# SYSTEM_PROMPT = create_msg(
#     "system",
#     """You are a helpful mental health chatbot, please answer with care. If you don't know the answer, respond 'Sorry, I don't know the answer to this question.'. If the question is too complex, respond 'Kindly, consult a psychiatrist for further queries.'.""".strip(),
# )
# def predict(message, history):
#     history_openai_format = []
#     history_openai_format.append(SYSTEM_PROMPT)
#     for human, assistant in history:
#         history_openai_format.append({"role": "user", "content": human})
#         history_openai_format.append({"role": "assistant", "content": assistant})
#     history_openai_format.append({"role": "user", "content": message})
#     response = openai.chat.completions.create(
#         model="ft:gpt-3.5-turbo-0613:personal::8kBTG8eh", messages=history_openai_format, temperature=0.35, stream=True
#     )
#     partial_message = ""
#     for chunk in response:
#         if chunk.choices[0].delta.content is not None:
#             partial_message = partial_message + chunk.choices[0].delta.content
#             yield partial_message
# gr.ChatInterface(fn=predict, title="Mental Health Chatbot").launch(share=True)