Cuda fix

Add torch

modules/pmbl.py (+11 -4)
```diff
@@ -2,12 +2,17 @@ import sqlite3
 from datetime import datetime
 from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor
+import torch
+
+# Set CUDA device (assuming you have an NVIDIA T4 Medium)
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 class PMBL:
     def __init__(self, model_path):
         self.model_path = model_path
         self.init_db()
-        self.executor = ThreadPoolExecutor(max_workers=6)
+        self.executor = ThreadPoolExecutor(max_workers=6)
 
     def init_db(self):
         conn = sqlite3.connect('chat_history.db')
```
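One gap in the new import block: `os` is used by `os.environ[...]` two lines later but is never imported in any hunk shown here. A minimal sketch of how the setup presumably needs to read, assuming no earlier line in the module already imports `os`:

```python
import os

import torch

# Pin CUDA work to the first visible GPU; this must run before the CUDA
# runtime initializes, so it belongs at the top of the module.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Fall back to the CPU when no CUDA device is present.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
```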
```diff
@@ -103,13 +108,14 @@ class PMBL:
 
 
     def generate_response_task(self, system_prompt, prompt, n_ctx):
-        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, mlock=True)
+        # Load the model on the GPU
+        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, mlock=True, n_gpu_layers=42, device=device)
 
         response = llm(
             system_prompt,
             max_tokens=1500,
             temperature=0.2,
-            stop=["
+            stop=[" ", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
             echo=False,
             stream=True
         )
```
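Two keywords in the new `Llama(...)` call are worth checking against the installed llama-cpp-python: its documented memory-lock flag is spelled `use_mlock` rather than `mlock`, and the constructor takes no torch-style `device` argument; GPU placement comes from `n_gpu_layers` together with `CUDA_VISIBLE_DEVICES` (or `main_gpu`). Also, the first entry of the new `stop` list renders as a lone space, which looks like an angle-bracketed stop token the page stripped; worth confirming against the raw file. A hedged sketch of the load and the streaming loop, with a hypothetical model path:

```python
from llama_cpp import Llama

# Sketch only: assumes a CUDA build of llama-cpp-python; GPU offload is
# controlled by n_gpu_layers, not a device object.
llm = Llama(
    model_path="models/model.gguf",  # hypothetical path
    n_ctx=2048,
    n_threads=8,
    use_mlock=True,   # documented spelling of the memory-lock flag
    n_gpu_layers=42,  # number of layers to offload to the GPU
)

# With stream=True the call yields completion chunks; the text of each
# chunk sits under choices[0]["text"].
for chunk in llm("Hello", max_tokens=16, stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)
```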
```diff
@@ -150,7 +156,8 @@ class PMBL:
 
 
     def generate_topic(self, prompt, response):
-        llm = Llama(model_path=self.model_path, n_ctx=1690, n_threads=8, mlock=True)
+        # Load the model on the GPU
+        llm = Llama(model_path=self.model_path, n_ctx=1690, n_threads=8, mlock=True, n_gpu_layers=42, device=device)
 
         system_prompt = f"Based on the following interaction between a user and an AI assistant, generate a concise topic for the conversation in 2-4 words:\n\nUser: {prompt}\nAssistant: {response}\n\nTopic:"
 
```
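For the topic call itself, a non-streaming completion keeps the handling simple. A sketch under the same assumptions, using llama-cpp-python's standard completion-response layout:

```python
# Hypothetical follow-up to the f-string above: ask for a short label;
# temperature 0.0 keeps the 2-4 word topic deterministic.
out = llm(system_prompt, max_tokens=12, temperature=0.0, echo=False)
topic = out["choices"][0]["text"].strip()
```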
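One interaction to watch: `__init__` keeps a `ThreadPoolExecutor(max_workers=6)` while both methods build a fresh `Llama` per call, so up to six copies of the model can land on the single T4 at once. A hypothetical variant (names are illustrative, not from the diff) that reuses one instance per context size:

```python
import threading

from llama_cpp import Llama

class LlamaCache:
    """Hypothetical helper, not part of the commit: one Llama per n_ctx."""

    def __init__(self, model_path):
        self.model_path = model_path
        self._cache = {}
        self._lock = threading.Lock()  # six executor workers share this

    def get(self, n_ctx):
        with self._lock:
            if n_ctx not in self._cache:
                self._cache[n_ctx] = Llama(
                    model_path=self.model_path, n_ctx=n_ctx,
                    n_threads=8, use_mlock=True, n_gpu_layers=42,
                )
            return self._cache[n_ctx]
```

Note that a llama.cpp context is not safe for concurrent calls, so a shared instance also needs its inference serialized, not just its construction.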