Sergidev committed
Commit: 4c066b1
Parent: 47968ad

Update modules/pmbl.py

Files changed (1)
  1. modules/pmbl.py +7 -15
modules/pmbl.py CHANGED
@@ -2,18 +2,13 @@ import sqlite3
 from datetime import datetime
 from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor
-import torch
-import os
 
-# Set CUDA device (assuming you have an NVIDIA T4 Medium)
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 class PMBL:
     def __init__(self, model_path):
         self.model_path = model_path
         self.init_db()
-        self.executor = ThreadPoolExecutor(max_workers=6)
+        self.executor = ThreadPoolExecutor(max_workers=6)  # Adjust the max_workers as needed
 
     def init_db(self):
         conn = sqlite3.connect('chat_history.db')
@@ -84,7 +79,8 @@ class PMBL:
         conn = sqlite3.connect('chat_history.db')
         c = conn.cursor()
         timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        c.execute("INSERT INTO chats (timestamp, prompt, response, topic) VALUES (?, ?, ?, 'Untitled')", (timestamp, prompt, response))
+        c.execute("INSERT INTO chats (timestamp, prompt, response, topic) VALUES (?, ?, ?, 'Untitled')",
+                  (timestamp, prompt, response))
         conn.commit()
         conn.close()
 
@@ -107,16 +103,14 @@ class PMBL:
         for chunk in response.result():
             yield chunk
 
-
     def generate_response_task(self, system_prompt, prompt, n_ctx):
-        # Load the model on the GPU
-        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, mlock=True, n_gpu_layers=42, device=device)
+        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, n_gpu_layers=-1, mlock=True)
 
         response = llm(
             system_prompt,
             max_tokens=1500,
             temperature=0.2,
-            stop=[" ", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
+            stop=["</s>", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
             echo=False,
             stream=True
         )
@@ -140,7 +134,7 @@ class PMBL:
             return system_prompt_tokens + history_tokens + max_response_tokens
         else:
            return context_ceiling  # Return the maximum context size
-
+
     def sleep_mode(self):
         conn = sqlite3.connect('chat_history.db')
         c = conn.cursor()
@@ -155,10 +149,8 @@ class PMBL:
 
         conn.close()
 
-
    def generate_topic(self, prompt, response):
-        # Load the model on the GPU
-        llm = Llama(model_path=self.model_path, n_ctx=1690, n_threads=8, mlock=True, n_gpu_layers=42, device=device)
+        llm = Llama(model_path=self.model_path, n_ctx=1690, n_threads=8, n_gpu_layers=-1, mlock=True)
 
         system_prompt = f"Based on the following interaction between a user and an AI assistant, generate a concise topic for the conversation in 2-4 words:\n\nUser: {prompt}\nAssistant: {response}\n\nTopic:"
 
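
For reference, the rewritten loading code leans on llama-cpp-python's own GPU offloading: n_gpu_layers=-1 offloads all layers, replacing the hard-coded n_gpu_layers=42 and the torch/CUDA device plumbing this commit deletes. Below is a minimal standalone sketch of the load-and-stream pattern now used by generate_response_task. The GGUF path and prompt are placeholders, not part of the commit, and the memory-lock flag is spelled use_mlock here, which is the library's documented keyword (the commit itself writes mlock=True).

from llama_cpp import Llama

# Placeholder GGUF path for illustration; the module passes self.model_path through.
llm = Llama(
    model_path="models/example-7b.Q4_K_M.gguf",
    n_ctx=1690,
    n_threads=8,
    n_gpu_layers=-1,   # -1 offloads every layer to the GPU (previously a fixed 42)
    use_mlock=True,    # documented keyword; the commit writes this as mlock=True
)

# Same streaming call shape as generate_response_task: completion chunks arrive incrementally.
for chunk in llm(
    "System: You are a helpful assistant.\nUser: Hello!\nAssistant:",
    max_tokens=1500,
    temperature=0.2,
    stop=["</s>", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
    echo=False,
    stream=True,
):
    print(chunk["choices"][0]["text"], end="", flush=True)

llama-cpp-python's Llama constructor does not take a torch device object; GPU placement is governed by n_gpu_layers at load time, which is presumably why the device= argument and the torch/os imports are removed in this commit.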