Update modules/pmbl.py
modules/pmbl.py (+4, -3)
@@ -102,7 +102,7 @@ class PMBL:
             yield chunk

     def generate_response_task(self, system_prompt, prompt, n_ctx):
-        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, n_gpu_layers=-1, offload_kqv=True,
+        llm = Llama(model_path=self.model_path, n_ctx=n_ctx, n_threads=8, n_gpu_layers=-1, offload_kqv=True, use_mlock=True)

         response = llm(
             system_prompt,
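The functional change in this hunk is the new `use_mlock=True` flag. In llama-cpp-python, `use_mlock` asks llama.cpp to mlock() the model weights so the OS cannot page them out to swap, trading pinned RAM for steadier generation latency. A minimal sketch of the updated constructor call, with a placeholder model path and context size standing in for `self.model_path` and the `n_ctx` argument:

```python
from llama_cpp import Llama

# Placeholder path and context size; the real values come from
# self.model_path and the n_ctx parameter in generate_response_task.
llm = Llama(
    model_path="models/model.gguf",
    n_ctx=4096,        # context window size
    n_threads=8,       # CPU threads for layers not offloaded to the GPU
    n_gpu_layers=-1,   # -1 offloads all layers to the GPU
    offload_kqv=True,  # keep the KV cache on the GPU as well
    use_mlock=True,    # pin model weights in RAM so they are never swapped out
)
```

If the process is not allowed to lock that much memory (RLIMIT_MEMLOCK), llama.cpp typically logs a warning and continues without locking rather than failing outright.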
@@ -132,6 +132,7 @@ class PMBL:
             return system_prompt_tokens + history_tokens + max_response_tokens
         else:
             return context_ceiling  # Return the maximum context size
+
     def sleep_mode(self):
         conn = sqlite3.connect('chat_history.db')
         c = conn.cursor()
@@ -147,7 +148,7 @@ class PMBL:
         conn.close()

     def generate_topic(self, prompt, response):
-        llm = Llama(model_path=self.model_path, n_ctx=2960, n_threads=4, n_gpu_layers=-1, offload_kqv=True,
+        llm = Llama(model_path=self.model_path, n_ctx=2960, n_threads=4, n_gpu_layers=-1, offload_kqv=True, use_mlock=True)

         system_prompt = f"Based on the following interaction between a user and an AI assistant, generate a concise topic for the conversation in 2-4 words:\n\nUser: {prompt}\nAssistant: {response}\n\nTopic:"

@@ -155,7 +156,7 @@ class PMBL:
             system_prompt,
             max_tokens=12,
             temperature=0,
-            stop=["
+            stop=["\n"],
             echo=False
         )

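The second fix completes the previously truncated `stop` argument: with `stop=["\n"]`, sampling halts at the first newline, so the generated topic stays on a single line, matching the 2-4 word format the prompt asks for. A sketch of the full completion call as it reads after this commit, assuming the `llm` instance from the hunk above; the final result-handling line is an assumption about how the caller consumes the response, not code from this diff:

```python
# Assumes `llm` was constructed as in generate_topic above.
response = llm(
    system_prompt,
    max_tokens=12,   # a 2-4 word topic fits comfortably in 12 tokens
    temperature=0,   # greedy decoding: same chat always yields the same topic
    stop=["\n"],     # cut generation at the first newline -> single-line topic
    echo=False,      # return only the completion, not the prompt
)
topic = response["choices"][0]["text"].strip()
```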