Sergidev committed
Commit: 4c066b1
Parent: 47968ad

Update modules/pmbl.py

Files changed (1)
  1. modules/pmbl.py +7 -15
modules/pmbl.py CHANGED
@@ -2,18 +2,13 @@ import sqlite3
 from datetime import datetime
 from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor
-import torch
-import os
 
-# Set CUDA device (assuming you have an NVIDIA T4 Medium)
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 class PMBL:
     def __init__(self, model_path):
         self.model_path = model_path
         self.init_db()
-        self.executor = ThreadPoolExecutor(max_workers=6)
+        self.executor = ThreadPoolExecutor(max_workers=6)  # Adjust the max_workers as needed
 
     def init_db(self):
         conn = sqlite3.connect('chat_history.db')
@@ -84,7 +79,8 @@ class PMBL:
         conn = sqlite3.connect('chat_history.db')
         c = conn.cursor()
         timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        c.execute("INSERT INTO chats (timestamp, prompt, response, topic) VALUES (?, ?, ?, 'Untitled')", (timestamp, prompt, response))
+        c.execute("INSERT INTO chats (timestamp, prompt, response, topic) VALUES (?, ?, ?, 'Untitled')",
+                  (timestamp, prompt, response))
         conn.commit()
         conn.close()
 
@@ -107,16 +103,14 @@ class PMBL:
         for chunk in response.result():
             yield chunk
 
-
     def generate_response_task(self, system_prompt, prompt, n_ctx):
-        # Load the model on the GPU
-        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, mlock=True, n_gpu_layers=42, device=device)
+        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, n_gpu_layers=-1, mlock=True)
 
         response = llm(
             system_prompt,
             max_tokens=1500,
             temperature=0.2,
-            stop=[" ", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
+            stop=["</s>", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
             echo=False,
             stream=True
         )
@@ -140,7 +134,7 @@ class PMBL:
             return system_prompt_tokens + history_tokens + max_response_tokens
         else:
            return context_ceiling  # Return the maximum context size
-
+
     def sleep_mode(self):
         conn = sqlite3.connect('chat_history.db')
         c = conn.cursor()
@@ -155,10 +149,8 @@ class PMBL:
 
         conn.close()
 
-
    def generate_topic(self, prompt, response):
-        # Load the model on the GPU
-        llm = Llama(model_path=self.model_path, n_ctx=1690, n_threads=8, mlock=True, n_gpu_layers=42, device=device)
+        llm = Llama(model_path=self.model_path, n_ctx=1690, n_threads=8, n_gpu_layers=-1, mlock=True)
 
         system_prompt = f"Based on the following interaction between a user and an AI assistant, generate a concise topic for the conversation in 2-4 words:\n\nUser: {prompt}\nAssistant: {response}\n\nTopic:"
 
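
For reference, the rewritten loading code leans on llama-cpp-python's own GPU offloading: n_gpu_layers=-1 offloads all layers, replacing the hard-coded n_gpu_layers=42 and the torch/CUDA device plumbing this commit deletes. Below is a minimal standalone sketch of the load-and-stream pattern now used by generate_response_task. The GGUF path and prompt are placeholders, not part of the commit, and the memory-lock flag is spelled use_mlock here, which is the library's documented keyword (the commit itself writes mlock=True).

from llama_cpp import Llama

# Placeholder GGUF path for illustration; the module passes self.model_path through.
llm = Llama(
    model_path="models/example-7b.Q4_K_M.gguf",
    n_ctx=1690,
    n_threads=8,
    n_gpu_layers=-1,   # -1 offloads every layer to the GPU (previously a fixed 42)
    use_mlock=True,    # documented keyword; the commit writes this as mlock=True
)

# Same streaming call shape as generate_response_task: completion chunks arrive incrementally.
for chunk in llm(
    "System: You are a helpful assistant.\nUser: Hello!\nAssistant:",
    max_tokens=1500,
    temperature=0.2,
    stop=["</s>", "\nUser:", "\nuser:", "\nSystem:", "\nsystem:"],
    echo=False,
    stream=True,
):
    print(chunk["choices"][0]["text"], end="", flush=True)

llama-cpp-python's Llama constructor does not take a torch device object; GPU placement is governed by n_gpu_layers at load time, which is presumably why the device= argument and the torch/os imports are removed in this commit.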