Cuda fix

Add torch

modules/pmbl.py (+11 -4)
```diff
@@ -2,12 +2,17 @@ import sqlite3
 from datetime import datetime
 from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor
+import torch
+
+# Set CUDA device (assuming you have an NVIDIA T4 Medium)
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 class PMBL:
     def __init__(self, model_path):
         self.model_path = model_path
         self.init_db()
-        self.executor = ThreadPoolExecutor(max_workers=6)
+        self.executor = ThreadPoolExecutor(max_workers=6)
 
     def init_db(self):
         conn = sqlite3.connect('chat_history.db')
```
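One gap in the new import block: `os` is used by `os.environ[...]` two lines later but is never imported in any hunk shown here. A minimal sketch of how the setup presumably needs to read, assuming no earlier line in the module already imports `os`:

```python
import os

import torch

# Pin CUDA work to the first visible GPU; this must run before the CUDA
# runtime initializes, so it belongs at the top of the module.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Fall back to the CPU when no CUDA device is present.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
```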
```diff
@@ -103,13 +108,14 @@ class PMBL:
 
 
     def generate_response_task(self, system_prompt, prompt, n_ctx):
-        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, mlock=True)
+        # Load the model on the GPU
+        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, mlock=True, n_gpu_layers=42, device=device)
 
         response = llm(
             system_prompt,
             max_tokens=1500,
             temperature=0.2,
-            stop=["
+            stop=[" ", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
             echo=False,
             stream=True
         )
```
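Two keywords in the new `Llama(...)` call are worth checking against the installed llama-cpp-python: its documented memory-lock flag is spelled `use_mlock` rather than `mlock`, and the constructor takes no torch-style `device` argument; GPU placement comes from `n_gpu_layers` together with `CUDA_VISIBLE_DEVICES` (or `main_gpu`). Also, the first entry of the new `stop` list renders as a lone space, which looks like an angle-bracketed stop token the page stripped; worth confirming against the raw file. A hedged sketch of the load and the streaming loop, with a hypothetical model path:

```python
from llama_cpp import Llama

# Sketch only: assumes a CUDA build of llama-cpp-python; GPU offload is
# controlled by n_gpu_layers, not a device object.
llm = Llama(
    model_path="models/model.gguf",  # hypothetical path
    n_ctx=2048,
    n_threads=8,
    use_mlock=True,   # documented spelling of the memory-lock flag
    n_gpu_layers=42,  # number of layers to offload to the GPU
)

# With stream=True the call yields completion chunks; the text of each
# chunk sits under choices[0]["text"].
for chunk in llm("Hello", max_tokens=16, stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)
```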
```diff
@@ -150,7 +156,8 @@ class PMBL:
 
 
     def generate_topic(self, prompt, response):
-        llm = Llama(model_path=self.model_path, n_ctx=1690, n_threads=8, mlock=True)
+        # Load the model on the GPU
+        llm = Llama(model_path=self.model_path, n_ctx=1690, n_threads=8, mlock=True, n_gpu_layers=42, device=device)
 
         system_prompt = f"Based on the following interaction between a user and an AI assistant, generate a concise topic for the conversation in 2-4 words:\n\nUser: {prompt}\nAssistant: {response}\n\nTopic:"
 
```
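For the topic call itself, a non-streaming completion keeps the handling simple. A sketch under the same assumptions, using llama-cpp-python's standard completion-response layout:

```python
# Hypothetical follow-up to the f-string above: ask for a short label;
# temperature 0.0 keeps the 2-4 word topic deterministic.
out = llm(system_prompt, max_tokens=12, temperature=0.0, echo=False)
topic = out["choices"][0]["text"].strip()
```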
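One interaction to watch: `__init__` keeps a `ThreadPoolExecutor(max_workers=6)` while both methods build a fresh `Llama` per call, so up to six copies of the model can land on the single T4 at once. A hypothetical variant (names are illustrative, not from the diff) that reuses one instance per context size:

```python
import threading

from llama_cpp import Llama

class LlamaCache:
    """Hypothetical helper, not part of the commit: one Llama per n_ctx."""

    def __init__(self, model_path):
        self.model_path = model_path
        self._cache = {}
        self._lock = threading.Lock()  # six executor workers share this

    def get(self, n_ctx):
        with self._lock:
            if n_ctx not in self._cache:
                self._cache[n_ctx] = Llama(
                    model_path=self.model_path, n_ctx=n_ctx,
                    n_threads=8, use_mlock=True, n_gpu_layers=42,
                )
            return self._cache[n_ctx]
```

Note that a llama.cpp context is not safe for concurrent calls, so a shared instance also needs its inference serialized, not just its construction.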