File size: 2,314 Bytes
1d0039a
 
c111a7e
1d0039a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8f55d24
1d0039a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8f55d24
1d0039a
 
 
 
 
 
 
c111a7e
1d0039a
c111a7e
1d0039a
c111a7e
1d0039a
c111a7e
 
1d0039a
f33abcd
c111a7e
 
 
1d0039a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import re

import gradio as gr
from routellm.controller import Controller

# Initial value of the "Temperature" slider in the chat UI.
TEMPERATURE = 0.8
# Initial value of the "Threshold" slider in the chat UI.
THRESHOLD = 0.11593
# Router name; predict() embeds it in the "router-<name>-<threshold>" model string.
ROUTER = "mf"

# Load exactly the router that predict() asks for. Deriving the list from
# ROUTER (rather than repeating the literal) keeps the two from drifting
# apart if the router is ever changed.
client = Controller(
    routers=[ROUTER],
    strong_model="gpt-4-1106-preview",
    weak_model="anyscale/mistralai/Mixtral-8x7B-Instruct-v0.1",
)


def _build_messages(message, history):
    """Return the conversation as a list of OpenAI-style message dicts.

    The list starts with a fixed system prompt, followed by one user and one
    assistant entry per ``(user, assistant)`` pair in ``history``, and ends
    with ``message`` as the final user entry.
    """
    messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append(
            {
                "role": "assistant",
                # Strip the leading "**[model]**" tag that predict() prepends
                # to every reply so it is not fed back to the model. A turn
                # with no reply (None) is sent as an empty string instead of
                # crashing re.sub, which keeps user/assistant alternation.
                "content": re.sub(r"^\*\*\[.*?\]\*\*\s*", "", assistant or ""),
            }
        )
    messages.append({"role": "user", "content": message})
    return messages


def predict(message, history, threshold, temperature):
    """Stream a routed chat completion for the Gradio chat interface.

    Args:
        message: The latest user message.
        history: Previous turns as ``(user, assistant)`` pairs.
        threshold: Value interpolated into the ``router-<ROUTER>-<threshold>``
            model name sent to the controller.
        temperature: Sampling temperature passed to the completion request.

    Yields:
        First the ``**[model]**`` tag on its own, then the accumulated reply
        (tag included) after every streamed chunk.
    """
    # Create a chat completion request and send it through the router
    stream = client.chat.completions.create(
        model=f"router-{ROUTER}-{threshold}",  # Model name to use
        messages=_build_messages(message, history),  # Chat history
        temperature=temperature,  # Temperature for text generation
        stream=True,  # Stream response
    )
    print(stream)

    # Read and return generated text from response stream
    partial_message = ""
    for i, chunk in enumerate(stream):
        print(chunk)
        if i == 0:
            # The first chunk tells us which model the router picked; show it
            # as a bold tag before any generated text.
            model_prefix = f"**[{chunk.model}]**\n"
            yield model_prefix
            partial_message += model_prefix
        # Some chunks carry no choices (e.g. a usage-only chunk) or a delta
        # with no content; treat both as "no new text" instead of raising.
        choices = chunk.choices
        delta_text = choices[0].delta.content if choices else None
        partial_message += delta_text or ""
        yield partial_message


# Build the Gradio chat UI around predict(). The two sliders are passed as
# additional inputs, so their values reach predict() as its `threshold` and
# `temperature` arguments, in that order.
_threshold_slider = gr.Slider(
    minimum=0,
    maximum=1,
    step=0.01,
    value=THRESHOLD,
    label="Threshold",
)
_temperature_slider = gr.Slider(
    minimum=0,
    maximum=1,
    step=0.1,
    value=TEMPERATURE,
    label="Temperature",
)
_description = "This is a demo of our matrix factorization router, calibrated so that approximately 50% of calls (those that are harder) are routed to GPT-4, with remaining calls routed to Mixtral 8x7B.\n\nCheck out https://github.com/lm-sys/RouteLLM for details!"

demo = gr.ChatInterface(
    predict,
    additional_inputs=[_threshold_slider, _temperature_slider],
    title="RouteLLM",
    description=_description,
)

# Only start the web server when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()