Update modules/pmbl.py
modules/pmbl.py (+7 -15)
@@ -2,18 +2,13 @@ import sqlite3
 from datetime import datetime
 from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor
-import torch
-import os
 
-# Set CUDA device (assuming you have an NVIDIA T4 Medium)
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 class PMBL:
     def __init__(self, model_path):
         self.model_path = model_path
         self.init_db()
-        self.executor = ThreadPoolExecutor(max_workers=6)
+        self.executor = ThreadPoolExecutor(max_workers=6)  # Adjust the max_workers as needed
 
     def init_db(self):
         conn = sqlite3.connect('chat_history.db')
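The removed torch/os block pinned a CUDA device by hand, but llama-cpp-python manages its own CUDA context, so the heavyweight torch dependency can go entirely. A minimal sketch of the equivalent setup without torch, in case a specific GPU still needs to be pinned; the "model.gguf" path is a placeholder, not part of the commit:

    # Sketch under stated assumptions: pin a GPU without torch.
    # CUDA_VISIBLE_DEVICES must be set before the first Llama is
    # constructed, i.e. before any CUDA context exists in the process.
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    from llama_cpp import Llama

    # n_gpu_layers=-1 asks llama.cpp to offload every layer to the GPU;
    # "model.gguf" is a placeholder path.
    llm = Llama(model_path="model.gguf", n_ctx=2048, n_gpu_layers=-1)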
@@ -84,7 +79,8 @@ class PMBL:
         conn = sqlite3.connect('chat_history.db')
         c = conn.cursor()
         timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        c.execute("INSERT INTO chats (timestamp, prompt, response, topic) VALUES (?, ?, ?, 'Untitled')", (timestamp, prompt, response))
+        c.execute("INSERT INTO chats (timestamp, prompt, response, topic) VALUES (?, ?, ?, 'Untitled')",
+                  (timestamp, prompt, response))
         conn.commit()
         conn.close()
 
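The INSERT keeps its parameterized `?` placeholders, so prompt and response are bound as values rather than interpolated into the SQL string; the statement is only wrapped for line length. For orientation, a hypothetical shape of the chats table inferred from the column list; the actual CREATE TABLE lives in init_db, outside this diff's context:

    # Assumed schema, inferred from the INSERT's columns; the real
    # init_db body is not shown in this diff.
    import sqlite3

    conn = sqlite3.connect('chat_history.db')
    conn.execute("""CREATE TABLE IF NOT EXISTS chats (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        timestamp TEXT,
                        prompt TEXT,
                        response TEXT,
                        topic TEXT
                    )""")
    conn.commit()
    conn.close()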
@@ -107,16 +103,14 @@ class PMBL:
         for chunk in response.result():
             yield chunk
 
-
     def generate_response_task(self, system_prompt, prompt, n_ctx):
-
-        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, mlock=True, n_gpu_layers=42, device=device)
+        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, n_gpu_layers=-1, mlock=True)
 
         response = llm(
             system_prompt,
             max_tokens=1500,
             temperature=0.2,
-            stop=["
+            stop=["</s>", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
             echo=False,
             stream=True
         )
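With stream=True, the llm(...) call returns an iterator of completion chunks rather than a single dict, which is what lets the surrounding generator yield text incrementally. A usage sketch for draining such a stream, assuming llama-cpp-python's OpenAI-style completion chunks:

    # Each streamed chunk is a dict; the newly generated text sits at
    # chunk["choices"][0]["text"].
    for chunk in response:
        print(chunk["choices"][0]["text"], end="", flush=True)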
@@ -140,7 +134,7 @@ class PMBL:
             return system_prompt_tokens + history_tokens + max_response_tokens
         else:
             return context_ceiling  # Return the maximum context size
-
+
     def sleep_mode(self):
         conn = sqlite3.connect('chat_history.db')
         c = conn.cursor()
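The hunk above only touches a blank line inside calculate_context, but its visible tail shows the policy: use the sum of the prompt, history, and response budgets when it fits, otherwise fall back to the ceiling. The same cap in one expression, using only names visible in the context lines:

    # Equivalent to the visible if/else: cap the requested context size.
    return min(system_prompt_tokens + history_tokens + max_response_tokens,
               context_ceiling)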
@@ -155,10 +149,8 @@ class PMBL:
 
         conn.close()
 
-
     def generate_topic(self, prompt, response):
-
-        llm = Llama(model_path=self.model_path, n_ctx=1690, n_threads=8, mlock=True, n_gpu_layers=42, device=device)
+        llm = Llama(model_path=self.model_path, n_ctx=1690, n_threads=8, n_gpu_layers=-1, mlock=True)
 
         system_prompt = f"Based on the following interaction between a user and an AI assistant, generate a concise topic for the conversation in 2-4 words:\n\nUser: {prompt}\nAssistant: {response}\n\nTopic:"
 
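Both Llama constructions switch from the hard-coded n_gpu_layers=42 to -1, offloading all layers, and drop the device= keyword, which llama-cpp-python's Llama constructor does not define. The remainder of generate_topic is outside the diff's context; a hypothetical continuation, assuming a short non-streaming completion:

    # Hypothetical tail of generate_topic, not part of the commit: run the
    # topic prompt without streaming and trim the completion to a short label.
    result = llm(system_prompt, max_tokens=12, temperature=0.2)
    return result["choices"][0]["text"].strip()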