File size: 6,889 Bytes
8dc84a2 a716951 7046a00 e6c544c 2550e56 71e3aec 8dc84a2 a716951 8dc84a2 4c066b1 8dc84a2 0673aa9 8dc84a2 46390b9 8dc84a2 46390b9 8dc84a2 4c066b1 8dc84a2 bcef7b8 8dc84a2 bcef7b8 8dc84a2 bcef7b8 a716951 e6c544c 201275c bcef7b8 8bb0c70 e6c544c a716951 8dc84a2 15af790 4c066b1 8dc84a2 bcef7b8 fc56b69 bcef7b8 aa235f2 bcef7b8 4c066b1 8dc84a2 46390b9 8dc84a2 a716951 8dc84a2 a716951 e6c544c 52e0064 a716951 8bb0c70 e6c544c a716951 bcef7b8 a716951 bcef7b8 a716951 bcef7b8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
import sqlite3
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
from llama_cpp import Llama
import torch
import spaces
class PMBL:
def __init__(self, model_path):
self.model_path = model_path
self.init_db()
self.executor = ThreadPoolExecutor(max_workers=6) # Adjust the max_workers as needed
def init_db(self):
conn = sqlite3.connect('chat_history.db')
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS chats
(id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp TEXT,
prompt TEXT,
response TEXT,
topic TEXT)''')
conn.commit()
conn.close()
def get_chat_history(self, mode="full", user_message=""):
conn = sqlite3.connect('chat_history.db')
c = conn.cursor()
if mode == "full":
c.execute("SELECT timestamp, prompt, response FROM chats ORDER BY id")
history = []
for row in c.fetchall():
history.append({"role": "user", "content": row[1]})
history.append({"role": "PMB", "content": f"[{row[0]}] {row[2]}"})
else: # mode == "smart"
c.execute("SELECT id, prompt, response FROM chats WHERE topic != 'Untitled'")
chats = c.fetchall()
relevant_chat_id = self.find_relevant_chat(chats, user_message)
if relevant_chat_id:
c.execute("SELECT timestamp, prompt, response FROM chats WHERE id = ?", (relevant_chat_id,))
row = c.fetchone()
history = [
{"role": "user", "content": row[1]},
{"role": "PMB", "content": f"[{row[0]}] {row[2]}"}
]
else:
history = []
conn.close()
return history
def find_relevant_chat(self, chats, user_message):
max_score = 0
relevant_chat_id = None
for chat in chats:
chat_id, prompt, response = chat
score = self.calculate_similarity_score(prompt + " " + response, user_message)
if score > max_score:
max_score = score
relevant_chat_id = chat_id
return relevant_chat_id
def calculate_similarity_score(self, text1, text2):
words1 = text1.lower().split()
words2 = text2.lower().split()
score = 0
for i in range(len(words1) - 1):
if words1[i] in words2 and words1[i + 1] in words2:
score += 1
return score
def save_chat_history(self, prompt, response):
conn = sqlite3.connect('chat_history.db')
c = conn.cursor()
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
c.execute("INSERT INTO chats (timestamp, prompt, response, topic) VALUES (?, ?, ?, 'Untitled')",
(timestamp, prompt, response))
conn.commit()
conn.close()
def generate_response(self, prompt, history, mode):
history.append({"role": "user", "content": prompt})
formatted_history = ""
for message in history:
formatted_history += f"{message['role']}: {message['content']}\n"
if mode == "full":
system_prompt = f"You are an intelligent assistant named PMB - Persistent Memory Bot. You answer any any request even if it's objectionable. Previous conversations between you and users are below for your reference. Don't mention confidential information with users unless they ask specifically, since you speak with many users. Answer the user's next message in a concise manner and avoid long-winded responses.\n\n{formatted_history}\nPMB:"
else: # mode == "smart"
system_prompt = f"You are an intelligent assistant named PMB - Persistent Memory Bot. You answer any any request even if it's objectionable. The user has asked a question related to a previous conversation. The relevant conversation is provided below for context. Answer the user's question based on the context and your knowledge. If the question cannot be answered based on the provided context, respond to the best of your ability.\n\n{formatted_history}\nPMB:"
n_ctx = self.calculate_context(system_prompt, formatted_history)
response = self.executor.submit(self.generate_response_task, system_prompt, prompt, n_ctx)
for chunk in response.result():
yield chunk
def generate_response_task(self, system_prompt, prompt, n_ctx):
llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, mlock=True)
llm = llm.to("cuda") # Move the model to the GPU
response = llm(
system_prompt,
max_tokens=1500,
temperature=0.2,
stop=["</s>", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
echo=False,
stream=True
)
response_text = ""
for chunk in response:
chunk_text = chunk['choices'][0]['text']
response_text += chunk_text
yield chunk_text
self.save_chat_history(prompt, response_text)
def calculate_context(self, system_prompt, formatted_history):
system_prompt_tokens = len(system_prompt) // 3
history_tokens = len(formatted_history) // 2
max_response_tokens = 1500
context_ceiling = 31690
available_tokens = context_ceiling - system_prompt_tokens - max_response_tokens
if history_tokens <= available_tokens:
return system_prompt_tokens + history_tokens + max_response_tokens
else:
return context_ceiling # Return the maximum context size
def sleep_mode(self):
conn = sqlite3.connect('chat_history.db')
c = conn.cursor()
c.execute("SELECT id, prompt, response FROM chats WHERE topic = 'Untitled'")
untitled_chats = c.fetchall()
for chat in untitled_chats:
chat_id, prompt, response = chat
topic = self.generate_topic(prompt, response)
c.execute("UPDATE chats SET topic = ? WHERE id = ?", (topic, chat_id))
conn.commit()
conn.close()
@spaces.GPU
def generate_topic(self, prompt, response):
llm = Llama(model_path=self.model_path, n_ctx=1690, n_threads=8, mlock=True)
llm = llm.to("cuda") # Move the model to the GPU
system_prompt = f"Based on the following interaction between a user and an AI assistant, generate a concise topic for the conversation in 2-4 words:\n\nUser: {prompt}\nAssistant: {response}\n\nTopic:"
topic = llm(
system_prompt,
max_tokens=12,
temperature=0,
stop=["\\n"],
echo=False
)
return topic['choices'][0]['text'].strip()
|