Mistral-Nemo

Runtime error

File size: 4,302 Bytes

cc5b602
da59244
6f619d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27dc368
6f619d7
85585d6
e6367a7
51a7d9e
29c0142
 
51a7d9e
 
ffe6c17
e6367a7
 
b48b00e
51a7d9e
bd34f0b
6a0c6b9
bd34f0b
82b38de
bd34f0b
 
 
 
 
51a7d9e
 
 
bd34f0b
 
 
 
 
 
 
51a7d9e
 
da59244
 
3569c20
fd6304d
 
51a7d9e
 
85585d6
27dc368
690d573
85585d6
27dc368
51a7d9e
fd6304d
85585d6
29c0142
27dc368
 
f01a45c
3569c20
 
 
 
 
 
 
51a7d9e
27dc368
f01a45c
27dc368
f01a45c
 
27dc368
51a7d9e
 
82b38de
51a7d9e
82b38de
51a7d9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82b38de
51a7d9e
 
3569c20
51a7d9e
 
bd34f0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51a7d9e

import os
import spaces
import threading
import time
import subprocess

# Absolute path where the standalone Ollama binary is stored (in the user's home directory).
OLLAMA = os.path.expanduser("~/ollama")

# Download the binary only when it is not already present.
if not os.path.exists(OLLAMA):
    # Download to a temporary name first so that an interrupted transfer can
    # never leave a truncated file at OLLAMA, which later runs would mistake
    # for a complete install because of the existence check above.
    _ollama_download = OLLAMA + ".part"
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page as the "binary"; check=True turns that into an exception so
    # a broken download stops the start-up here rather than failing later.
    subprocess.run(
        [
            "curl",
            "--fail",
            "-L",
            "https://ollama.com/download/ollama-linux-amd64",
            "-o",
            _ollama_download,
        ],
        check=True,
    )
    os.chmod(_ollama_download, 0o755)
    # Atomically move the finished, executable file into place.
    os.replace(_ollama_download, OLLAMA)

def ollama_service_thread():
    """Run ``~/ollama serve`` and block until that process exits.

    Meant to be used as a ``threading.Thread`` target so the server keeps
    running while the rest of the script continues.
    """
    # Launch the binary directly from an argument list: no intermediate shell
    # process and no shell parsing of the command line. "~" is expanded here
    # because there is no shell to do it.
    subprocess.run([os.path.expanduser("~/ollama"), "serve"])

def _wait_for_port(host, port, timeout, interval=0.5):
    """Return True once a TCP connection to (host, port) succeeds.

    Retries every *interval* seconds and returns False if no connection
    could be made within *timeout* seconds.
    """
    import socket  # local import: only this helper needs it

    deadline = time.monotonic() + timeout
    while True:
        try:
            with socket.create_connection((host, port), timeout=1.0):
                return True
        except OSError:
            if time.monotonic() >= deadline:
                return False
            time.sleep(interval)


# Run `ollama serve` in a background thread so this script can carry on.
OLLAMA_SERVICE_THREAD = threading.Thread(target=ollama_service_thread)
OLLAMA_SERVICE_THREAD.start()

print("Giving ollama serve a moment")
# Wait until the server actually accepts connections on port 11434 (the port
# the client in this file connects to) instead of sleeping a fixed time:
# no wasted wait on a fast start, and no race on a slow one.
if not _wait_for_port("localhost", 11434, timeout=60):
    # Best effort, as before: keep going even if the server is not up yet.
    print("ollama serve is not accepting connections on port 11434 after 60 s; continuing anyway")

# Fetch the model used by the chat handler; report a failed pull instead of
# ignoring it silently.
_pull_result = subprocess.run("~/ollama pull gemma2", shell=True)
if _pull_result.returncode != 0:
    print(f"ollama pull gemma2 exited with status {_pull_result.returncode}; the model may be unavailable")


import copy
import gradio as gr
from ollama import Client
# Client for the Ollama HTTP API on localhost port 11434. timeout=60 is passed
# straight to the client (presumably seconds; confirm against the ollama package).
client = Client(
    host='http://localhost:11434',
    timeout=60,
)

# Optional settings taken from the environment. HF_TOKEN is None when unset;
# MODEL_ID falls back to the Gemma 2 9B instruct repository id.
HF_TOKEN = os.getenv("HF_TOKEN")
MODEL_ID = os.getenv("MODEL_ID", "google/gemma-2-9b-it")
# Display name: whatever follows the last "/" of the repository id.
MODEL_NAME = MODEL_ID.rsplit("/", 1)[-1]

# HTML heading shown at the top of the page.
TITLE = "<h1><center>Chatbox</center></h1>"

# HTML blurb under the title, linking to the model page on hf.co.
DESCRIPTION = (
    "\n"
    f'<h3>MODEL: <a href="https://hf.co/{MODEL_ID}">{MODEL_NAME}</a></h3>\n'
    "<center>\n"
    "<p>Gemma is the large language model built by Google.\n"
    "<br>\n"
    "Feel free to test without log.\n"
    "</p>\n"
    "</center>\n"
)

# Stylesheet passed to gr.Blocks: styles the duplicate button and centres h3 headings.
CSS = """
.duplicate-button {
    margin: auto !important;
    color: white !important;
    background: black !important;
    border-radius: 100vh !important;
}
h3 {
    text-align: center;
}
"""

@spaces.GPU()
def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
    """Stream the "gemma2" model's reply to *message* through the Ollama client.

    Args:
        message: The new user message.
        history: Earlier turns; each entry is unpacked as a
            (user prompt, assistant answer) pair.
        temperature: Sent as the ``temperature`` option.
        max_new_tokens: Sent as the ``num_predict`` option.
        top_p: Sent as the ``top_p`` option.
        top_k: Sent as the ``top_k`` option.
        penalty: Sent as the ``repeat_penalty`` option.

    Yields:
        The reply text accumulated so far, once per streamed chunk.
    """
    print(f'message is - {message}')
    print(f'history is - {history}')

    # Flatten the (prompt, answer) pairs into alternating user/assistant
    # messages, then add the new user message at the end.
    messages = [
        turn
        for prompt, answer in history
        for turn in (
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": answer},
        )
    ]
    messages.append({"role": "user", "content": message})

    print(f"Conversation is -\n{messages}")

    sampling_options = {
        'num_predict': max_new_tokens,
        'temperature': temperature,
        'top_p': top_p,
        'top_k': top_k,
        'repeat_penalty': penalty,
    }
    stream = client.chat(
        model="gemma2",
        messages=messages,
        stream=True,
        options=sampling_options,
    )

    # Yield the growing reply, not the individual chunk, on every step.
    reply_so_far = ""
    for part in stream:
        reply_so_far = reply_so_far + part["message"]["content"]
        yield reply_so_far



# Chat display widget handed to the ChatInterface below.
chatbot = gr.Chatbot(height=600)

with gr.Blocks(css=CSS, theme="soft") as demo:
    # Page header: title, model description and a "duplicate this Space" button.
    gr.HTML(TITLE)
    gr.HTML(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")

    # Collapsed panel holding the generation controls. It and the sliders are
    # created with render=False and passed to ChatInterface below.
    parameters_panel = gr.Accordion(label="⚙️ Parameters", open=False, render=False)

    # Listed in the order of stream_chat's parameters after message and
    # history: temperature, max_new_tokens, top_p, top_k, penalty.
    sampling_sliders = [
        gr.Slider(label="Temperature", minimum=0, maximum=1, step=0.1, value=0.8, render=False),
        gr.Slider(label="Max New Tokens", minimum=128, maximum=2048, step=1, value=1024, render=False),
        gr.Slider(label="top_p", minimum=0.0, maximum=1.0, step=0.1, value=0.8, render=False),
        gr.Slider(label="top_k", minimum=1, maximum=20, step=1, value=20, render=False),
        gr.Slider(label="Repetition penalty", minimum=0.0, maximum=2.0, step=0.1, value=1.0, render=False),
    ]

    # Example prompts offered in the UI; each one is a single-item row.
    example_prompts = [
        ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
        ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."],
        ["Tell me a random fun fact about the Roman Empire."],
        ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
    ]

    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=parameters_panel,
        additional_inputs=sampling_sliders,
        examples=example_prompts,
        cache_examples=False,
    )


# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()