Sergidev committed on
Commit
b3a580a
1 Parent(s): 49b5488

Add torch

Files changed (1)
  1. modules/pmbl.py +11 -4
modules/pmbl.py CHANGED
@@ -2,12 +2,17 @@ import sqlite3
 from datetime import datetime
 from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor
+import torch
+
+# Set CUDA device (assuming you have an NVIDIA T4 Medium)
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 class PMBL:
     def __init__(self, model_path):
         self.model_path = model_path
         self.init_db()
-        self.executor = ThreadPoolExecutor(max_workers=6)  # Adjust the max_workers as needed
+        self.executor = ThreadPoolExecutor(max_workers=6)
 
     def init_db(self):
         conn = sqlite3.connect('chat_history.db')
@@ -103,13 +108,14 @@ class PMBL:
 
 
     def generate_response_task(self, system_prompt, prompt, n_ctx):
-        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, mlock=True, n_gpu_layers=42)
+        # Load the model on the GPU
+        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, mlock=True, n_gpu_layers=42, device=device)
 
         response = llm(
             system_prompt,
             max_tokens=1500,
             temperature=0.2,
-            stop=["</s>", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
+            stop=[" ", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
             echo=False,
             stream=True
         )
@@ -150,7 +156,8 @@ class PMBL:
 
 
     def generate_topic(self, prompt, response):
-        llm = Llama(model_path=self.model_path, n_ctx=1690, n_threads=8, mlock=True, n_gpu_layers=42)
+        # Load the model on the GPU
+        llm = Llama(model_path=self.model_path, n_ctx=1690, n_threads=8, mlock=True, n_gpu_layers=42, device=device)
 
         system_prompt = f"Based on the following interaction between a user and an AI assistant, generate a concise topic for the conversation in 2-4 words:\n\nUser: {prompt}\nAssistant: {response}\n\nTopic:"
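And a minimal usage sketch of loading a GGUF model with llama-cpp-python and streaming a completion, assuming the package was installed with CUDA support. It sticks to the documented constructor arguments (n_gpu_layers for GPU offload, use_mlock for memory locking); the model path, prompt, and token limits are placeholders rather than values from this repository.

    from llama_cpp import Llama

    # n_gpu_layers controls how many transformer layers are offloaded to the GPU
    llm = Llama(
        model_path="model.gguf",  # placeholder path
        n_ctx=2048,
        n_threads=8,
        use_mlock=True,           # keep the weights locked in RAM
        n_gpu_layers=42,
    )

    # stream=True makes the call yield completion chunks incrementally
    stream = llm(
        "System: You are a helpful assistant.\nUser: Hello!\nAssistant:",
        max_tokens=128,
        temperature=0.2,
        stop=["\nUser:", "\nSystem:"],
        echo=False,
        stream=True,
    )
    for chunk in stream:
        print(chunk["choices"][0]["text"], end="", flush=True)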