from typing import Dict, List

from llama_cpp import Llama

# Shared llama.cpp settings: offload up to 100 layers to GPU 0, silence logging.
llama_args = {"n_gpu_layers": 100, "main_gpu": 0, "verbose": False}


class Model:
    """Common interface for the llama.cpp-backed chat models below.

    A prompt is built as start() + conv(history) + starttok(next_role);
    the instance is then called with the prompt, stop strings, and a token limit.
    """

    def __init__(self):
        pass

    def __call__(self, msg: str, stop: List[str], max_tokens: int):
        raise NotImplementedError

    def conv(self, msgs: List[Dict[str, str]]) -> str:
        """Render a chat history into the model's prompt template."""
        raise NotImplementedError

    def starttok(self, user: str) -> str:
        """Return the header that opens a new turn for the given role."""
        raise NotImplementedError

    def start(self) -> str:
        """Return the prompt prefix (e.g. a begin-of-text token), if any."""
        return ""

    def close(self):
        pass


class Phi35RPMax(Model):
    modelname = "Phi35RPMax-fp16"

    def __init__(self):
        self.llm = Llama.from_pretrained(
            repo_id="ArliAI/Phi-3.5-mini-3.8B-ArliAI-RPMax-v1.1-GGUF",
            filename="ArliAI-RPMax-3.8B-v1.1-fp16.gguf",
            **llama_args,
        )

    def __call__(self, msg: str, stop: List[str], max_tokens: int):
        print("Autocomplete:", msg)
        return self.llm(msg, stop=stop, max_tokens=max_tokens)

    def conv(self, msgs: List[Dict[str, str]]):
        # Phi-3.5 chat template: <|role|>\ncontent<|end|>
        return "\n".join(f"<|{msg['role']}|>\n{msg['content']}<|end|>" for msg in msgs)

    def starttok(self, user: str):
        return f"<|{user}|>\n"

    def close(self):
        self.llm.close()


class Phi35(Model):
    modelname = "Phi35-f32"  # matches the f32 GGUF loaded below

    def __init__(self):
        self.llm = Llama.from_pretrained(
            repo_id="bartowski/Phi-3.5-mini-instruct-GGUF",
            filename="Phi-3.5-mini-instruct-f32.gguf",
            **llama_args,
        )

    def __call__(self, msg: str, stop: List[str], max_tokens: int):
        print("Autocomplete:", msg)
        return self.llm(msg, stop=stop, max_tokens=max_tokens)

    def conv(self, msgs: List[Dict[str, str]]):
        return "\n".join(f"<|{msg['role']}|>\n{msg['content']}<|end|>" for msg in msgs)

    def starttok(self, user: str):
        return f"<|{user}|>\n"

    def close(self):
        self.llm.close()


# TODO: Gemma 2 requires accepting a license; maybe try it in the future,
# but it doesn't seem worth it.
# class Gemma2(Model):
#     modelname = "Gemma2-2b-it-GGUF"
#
#     def __init__(self):
#         self.llm = Llama.from_pretrained(
#             repo_id="google/gemma-2-2b-it-GGUF",
#             filename="2b_it_v2.gguf",
#         )
#
#     def __call__(self, msg: str, stop: List[str], max_tokens: int):
#         return self.llm(msg, stop=stop, max_tokens=max_tokens)
#
#     def conv(self, msgs: List[Dict[str, str]]):  # https://ai.google.dev/gemma/docs/formatting?hl=de
#         return "\n".join(self.formatmessage(msg["content"], msg["role"]) for msg in msgs)
#
#     def formatmessage(self, msg: str, role: str):  # https://ai.google.dev/gemma/docs/formatting?hl=de
#         if role == "system":
#             # Gemma 2 does not support system messages / isn't trained for them.
#             # TODO: Turn them into assistant messages and test whether that improves results.
#             return ""
#         if role == "assistant":
#             role = "model"
#         return f"{role}\n{msg}"
#
#     def starttok(self, user: str):
#         return f"{user}\n"
#
#     def close(self):
#         self.llm.close()


class Llama31uncensored(Model):
    modelname = "Llama31-uncensored-fp16"

    def __init__(self):
        self.llm = Llama.from_pretrained(
            repo_id="Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2-GGUF",
            filename="Llama-3.1-8B-Lexi-Uncensored_V2_F16.gguf",
            **llama_args,
        )

    def __call__(self, msg: str, stop: List[str], max_tokens: int):
        print("Autocomplete:", msg)
        return self.llm(msg, stop=stop, max_tokens=max_tokens)

    def start(self):
        return "<|begin_of_text|>"

    def conv(self, msgs: List[Dict[str, str]]):
        # Llama 3.1 chat template: <|start_header_id|>role<|end_header_id|>\n\ncontent<|eot_id|>
        return "\n".join(
            f"<|start_header_id|>{msg['role']}<|end_header_id|>\n\n{msg['content']}<|eot_id|>"
            for msg in msgs
        )

    def starttok(self, user: str):
        return f"<|start_header_id|>{user}<|end_header_id|>\n\n"

    def close(self):
        self.llm.close()


class Llama31(Model):
    modelname = "Llama31-IQ4_XS"

    def __init__(self):
        self.llm = Llama.from_pretrained(
            repo_id="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
            filename="Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf",
            **llama_args,
        )

    def __call__(self, msg: str, stop: List[str], max_tokens: int):
        print("Autocomplete:", msg)
        return self.llm(msg, stop=stop, max_tokens=max_tokens)

    def start(self):
        return "<|begin_of_text|>"

    def conv(self, msgs: List[Dict[str, str]]):
        # <|begin_of_text|> belongs once at the start of the prompt (see start()),
        # not in front of every message, so it is emitted by start() as in
        # Llama31uncensored above.
        return "\n".join(
            f"<|start_header_id|>{msg['role']}<|end_header_id|>\n\n{msg['content']}<|eot_id|>"
            for msg in msgs
        )

    def starttok(self, user: str):
        return f"<|start_header_id|>{user}<|end_header_id|>\n\n"

    def close(self):
        self.llm.close()


models = [Phi35RPMax, Phi35, Llama31uncensored, Llama31]
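

# Usage sketch (illustrative, not part of the original interface): one way a
# caller might drive the Model classes above. The conversation content, the
# stop strings, and the joining "\n" between history and next-turn header are
# assumptions; adjust them to how the surrounding application builds prompts.
if __name__ == "__main__":
    model = models[0]()  # e.g. Phi35RPMax; downloads the GGUF on first use
    try:
        history = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Say hello."},
        ]
        # Optional prefix + rendered history + header cueing the assistant's turn.
        prompt = model.start() + model.conv(history) + "\n" + model.starttok("assistant")
        # Stop on the templates' end-of-turn markers so generation ends cleanly.
        out = model(prompt, stop=["<|end|>", "<|eot_id|>"], max_tokens=128)
        # llama_cpp returns an OpenAI-style completion dict.
        print(out["choices"][0]["text"])
    finally:
        model.close()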