import gradio as gr
import os
import requests
import dotenv
# Load variables from a .env file (when one is found) into the process
# environment before the settings below are read with os.getenv.
dotenv.load_dotenv()

# Runtime settings, read once at import time. Each value is None when the
# corresponding environment variable is not set.

# Endpoint that predict() POSTs every chat turn to.
url = os.environ.get("BACKEND_URL")

# Credentials passed to demo.launch(auth=...) when run as a script.
# NOTE(review): some operating systems already define USERNAME; by default
# load_dotenv() does not override existing variables -- confirm the value
# from the .env file is the one that ends up here.
username = os.environ.get("USERNAME")
password = os.environ.get("PASSWORD")

# System message sent with every request; rebound by update_system_prompt().
system_prompt_text = ""

# (connect, read) timeouts in seconds for the backend call. The read timeout
# is deliberately generous because text generation can be slow; without any
# timeout a stalled backend would block the handler forever.
_BACKEND_TIMEOUT = (10, 300)


def predict(message, history, max_new_tokens, top_k, top_p, temperature):
    """Send one chat turn to the backend and return its decoded JSON reply.

    The message, the conversation history, the current module-level system
    prompt and the sampling settings are forwarded unchanged as a JSON body
    in a POST request to the module-level ``url``.

    Args:
        message: The newest user message.
        history: The conversation so far, forwarded as received.
        max_new_tokens: Forwarded as ``max_new_tokens``.
        top_k: Forwarded as ``top_k``.
        top_p: Forwarded as ``top_p``.
        temperature: Forwarded as ``temperature``.

    Returns:
        The decoded JSON body of the backend response.

    Raises:
        requests.HTTPError: If the backend answers with a 4xx/5xx status.
        requests.Timeout: If the backend does not connect or respond within
            ``_BACKEND_TIMEOUT``.
        requests.RequestException: For other transport-level failures,
            including an unset or invalid ``url``.
    """
    payload = {
        "message": message,
        "system_message": system_prompt_text,
        "history": history,
        "max_new_tokens": max_new_tokens,
        "top_k": top_k,
        "top_p": top_p,
        "temperature": temperature,
    }
    headers = {
        "Content-Type": "application/json"
    }

    response = requests.post(
        url, json=payload, headers=headers, timeout=_BACKEND_TIMEOUT
    )

    # Raise on error statuses first, then decode. This way every successful
    # status (not only 200) yields the body instead of silently returning None.
    response.raise_for_status()
    return response.json()


def update_system_prompt(new_content):
    """Rebind the module-level ``system_prompt_text`` to ``new_content``.

    The value is stored as given, without validation, and nothing is returned.
    """
    # Writing through the module namespace has the same effect as a
    # ``global`` declaration followed by an assignment.
    globals()["system_prompt_text"] = new_content

with gr.Blocks(fill_height=True) as demo:
    # The four sliders are created with render=False and handed to the chat
    # interface through additional_inputs; their values reach predict() as
    # max_new_tokens, top_k, top_p and temperature, in that order.
    max_new_tokens_slider = gr.Slider(
        label="Max New Tokens (The maximum number of tokens to generate in the response. This limits the length of the generated text.)",
        minimum=1,
        maximum=500,
        step=1,
        value=50,
        render=False,
    )

    top_k_slider = gr.Slider(
        label="Top K (The number of highest probability vocabulary tokens to keep for top-k filtering. This controls the diversity of the generated text by limiting the number of token options at each step.)",
        minimum=0,
        maximum=100,
        step=1,
        value=50,
        render=False,
    )

    top_p_slider = gr.Slider(
        label="Top P (The cumulative probability threshold for nucleus sampling. This controls the diversity of the generated text by sampling tokens from the smallest possible set whose cumulative probability is above the threshold.)",
        minimum=0.0,
        maximum=1.0,
        step=0.01,
        value=1.0,
        render=False,
    )

    temperature_slider = gr.Slider(
        label="Temperature (The sampling temperature to use. This controls the randomness of predictions by scaling the logits before applying softmax. Lower values make the model more deterministic, while higher values increase diversity.)",
        minimum=0.0,
        maximum=2.0,
        step=0.01,
        value=0.9,
        render=False,
    )

    _sampling_controls = [
        max_new_tokens_slider,
        top_k_slider,
        top_p_slider,
        temperature_slider,
    ]

    _example_prompts = [
        "I'm in a bad mood.",
        "Do you have any hobbies or interests outside of work?",
        "Who created you?",
        "Please introduce yourself.",
        "Do you have any plans for the future?",
        "Does Emi play the piano?",
        "Can you feel pain?",
        "Do you feel like AI?",
        "Can you work 24/7?",
        "Do you ever update?",
    ]

    # Each example row is the prompt followed by one None per additional
    # input, so the examples carry no slider values of their own.
    _unset_controls = [None] * len(_sampling_controls)

    gr.ChatInterface(
        predict,
        additional_inputs=_sampling_controls,
        examples=[[prompt, *_unset_controls] for prompt in _example_prompts],
        cache_examples=False,
    )
    
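    # Disabled: an editable system-prompt box. If re-enabled, every edit would
    # be stored in system_prompt_text through update_system_prompt.
    # (The placeholder text below means "You are Emi".)
    # system_prompt = gr.Textbox(value=system_prompt_text, info="System Message:", placeholder="你是Emi",
    #                                        interactive=True, lines=5)
    # system_prompt.change(
    #     fn=update_system_prompt, inputs=system_prompt)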

if __name__ == "__main__":
    # Fail fast when the login credentials are missing: passing
    # auth=(None, None) would start the app behind a login that no one can
    # pass, which is much harder to diagnose than an explicit error here.
    if username is None or password is None:
        raise SystemExit(
            "USERNAME and PASSWORD must be set in the environment or .env file."
        )
    demo.launch(auth=(username, password))