import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the quantized GGUF weights from the Hugging Face Hub
model_name = "Mykes/med_tinyllama_gguf"
filename = "unsloth.Q4_K_M.gguf"
model_path = hf_hub_download(repo_id=model_name, filename=filename)

# Initialize the model with a small context window and thread count so it
# runs on constrained hardware. A larger configuration, kept for reference:
# model = Llama(model_path=model_path, n_ctx=2048, n_threads=4, n_batch=32, use_mmap=True, use_mlock=True, rope_freq_base=10000, rope_freq_scale=1.0)
model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)

# def preload_model(model, preload_tokens=1024):
#     # Dummy call to load the model into RAM by touching its weights
#     try:
#         dummy_input = " " * preload_tokens
#         _ = model(dummy_input, max_tokens=1)
#         print("Model preloaded into RAM.")
#     except Exception as e:
#         print(f"Error preloading model: {e}")
#
# # Preload the model into RAM
# preload_model(model)


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Keep only the last three exchanges so the prompt fits the small context window
    history = history[-3:]

    # Construct the prompt in the model's chat format
    prompt = f"{system_message}\n\n"
    for user_msg, assistant_msg in history:
        prompt += f"<|user|>{user_msg}<|end|> <|assistant|>{assistant_msg}<|end|>"
    prompt += f"<|user|>{message}<|end|> <|assistant|>"

    # Stream the response token by token
    response = ""
    for token in model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
        stop=["<|end|>"],  # an empty-string stop sequence would halt generation immediately
    ):
        response += token['choices'][0]['text']
        yield response.strip()


# Create the Gradio interface
demo = gr.ChatInterface(
    respond,
    undo_btn="Undo",
    clear_btn="Clear",
    additional_inputs=[
        # gr.Textbox(value="You are a friendly medical assistant.", label="System message"),
        gr.Textbox(value="", label="System message"),
        # Note: effective output length is also bounded by n_ctx set above
        gr.Slider(minimum=128, maximum=4096, value=2048, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="Med TinyLlama Chat",
    description="Chat with the Med TinyLlama model for medical information.",
)

if __name__ == "__main__":
    demo.launch()
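
# A minimal sketch of how the running app could be queried programmatically
# with gradio_client, kept commented out so the script stays runnable.
# Assumptions (not part of the original): the server is listening on the
# default local port, and the ChatInterface endpoint is exposed as "/chat";
# the additional inputs are passed positionally after the message.
#
#     from gradio_client import Client
#
#     client = Client("http://127.0.0.1:7860/")
#     result = client.predict(
#         "What are common symptoms of anemia?",  # message (hypothetical example)
#         "",                                     # system_message
#         128,                                    # max_tokens
#         0.7,                                    # temperature
#         0.9,                                    # top_p
#         api_name="/chat",
#     )
#     print(result)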