import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the quantized GGUF weights from the Hugging Face Hub
model_name = "Mykes/med_tinyllama_gguf"
filename = "unsloth.Q4_K_M.gguf"
model_path = hf_hub_download(repo_id=model_name, filename=filename)

# Initialize the model with a small context window and thread count so it
# runs on constrained hardware. A larger configuration, kept for reference:
# model = Llama(model_path=model_path, n_ctx=2048, n_threads=4, n_batch=32, use_mmap=True, use_mlock=True, rope_freq_base=10000, rope_freq_scale=1.0)
model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)

# def preload_model(model, preload_tokens=1024):
#     # Dummy call to load the model into RAM by touching its weights
#     try:
#         dummy_input = " " * preload_tokens
#         _ = model(dummy_input, max_tokens=1)
#         print("Model preloaded into RAM.")
#     except Exception as e:
#         print(f"Error preloading model: {e}")
#
# # Preload the model into RAM
# preload_model(model)


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Keep only the last three exchanges so the prompt fits the small context window
    history = history[-3:]

    # Construct the prompt in the model's chat format
    prompt = f"{system_message}\n\n"
    for user_msg, assistant_msg in history:
        prompt += f"<|user|>{user_msg}<|end|> <|assistant|>{assistant_msg}<|end|>"
    prompt += f"<|user|>{message}<|end|> <|assistant|>"

    # Stream the response token by token
    response = ""
    for token in model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
        stop=["<|end|>"],  # an empty-string stop sequence would halt generation immediately
    ):
        response += token['choices'][0]['text']
        yield response.strip()


# Create the Gradio interface
demo = gr.ChatInterface(
    respond,
    undo_btn="Undo",
    clear_btn="Clear",
    additional_inputs=[
        # gr.Textbox(value="You are a friendly medical assistant.", label="System message"),
        gr.Textbox(value="", label="System message"),
        # Note: effective output length is also bounded by n_ctx set above
        gr.Slider(minimum=128, maximum=4096, value=2048, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="Med TinyLlama Chat",
    description="Chat with the Med TinyLlama model for medical information.",
)

if __name__ == "__main__":
    demo.launch()
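
# A minimal sketch of how the running app could be queried programmatically
# with gradio_client, kept commented out so the script stays runnable.
# Assumptions (not part of the original): the server is listening on the
# default local port, and the ChatInterface endpoint is exposed as "/chat";
# the additional inputs are passed positionally after the message.
#
#     from gradio_client import Client
#
#     client = Client("http://127.0.0.1:7860/")
#     result = client.predict(
#         "What are common symptoms of anemia?",  # message (hypothetical example)
#         "",                                     # system_message
#         128,                                    # max_tokens
#         0.7,                                    # temperature
#         0.9,                                    # top_p
#         api_name="/chat",
#     )
#     print(result)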