import os
from typing import Iterator

import gradio as gr
import sambanova


def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream a reply to ``message``, yielding the text accumulated so far.

    The prompt sent to ``sambanova.Streamer`` is a list of role/content
    dicts: the system message first, then every non-empty user/assistant
    entry from ``chat_history`` in order, then ``message`` as the final
    user turn.

    Args:
        message: The new user message.
        chat_history: Earlier turns, indexed as ``(user_text, assistant_text)``;
            a falsy side of a turn is left out of the prompt.
        system_message: Content of the leading ``system`` entry.
        max_tokens: Forwarded to the streamer as ``new_tokens``.
        temperature: Forwarded to the streamer unchanged.
        top_p: Forwarded to the streamer unchanged.
        top_k: Forwarded to the streamer unchanged.
        repetition_penalty: Accepted but not used by this function.
            NOTE(review): it is never passed to ``sambanova.Streamer`` --
            confirm whether the streamer supports it or drop the parameter.

    Yields:
        The concatenation of every chunk received so far, once per chunk.
    """
    conversation = [{"role": "system", "content": system_message}]
    for turn in chat_history:
        user_text, assistant_text = turn[0], turn[1]
        if user_text:
            conversation.append({"role": "user", "content": user_text})
        if assistant_text:
            conversation.append({"role": "assistant", "content": assistant_text})
    conversation.append({"role": "user", "content": message})

    stream = sambanova.Streamer(
        conversation,
        new_tokens=max_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )

    # Each yield carries the whole response so far, not just the new chunk.
    partial = ""
    for chunk in stream:
        partial += chunk
        yield partial


# Upper bound and initial value for the "Max tokens" slider.
MAX_MAX_TOKENS = 2048
DEFAULT_MAX_TOKENS = 1024
# Read from the environment (default 4096); not referenced in the visible code.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Disabled variant of the interface. It refers to MAX_MAX_NEW_TOKENS and
# DEFAULT_MAX_NEW_TOKENS, which are not defined in the visible file.
# chat_interface = gr.ChatInterface(
#     fn=generate,
#     additional_inputs=[
#         gr.Slider(
#             label="Max new tokens",
#             minimum=1,
#             maximum=MAX_MAX_NEW_TOKENS,
#             step=1,
#             value=DEFAULT_MAX_NEW_TOKENS,
#         ),
#         gr.Slider(
#             label="Temperature",
#             minimum=0.1,
#             maximum=4.0,
#             step=0.1,
#             value=0.6,
#         ),
#         gr.Slider(
#             label="Top-p (nucleus sampling)",
#             minimum=0.05,
#             maximum=1.0,
#             step=0.05,
#             value=0.9,
#         ),
#         gr.Slider(
#             label="Top-k",
#             minimum=1,
#             maximum=1000,
#             step=1,
#             value=50,
#         ),
#         gr.Slider(
#             label="Repetition penalty",
#             minimum=1.0,
#             maximum=2.0,
#             step=0.05,
#             value=1.2,
#         ),
#     ],
#     stop_btn=None,
#     fill_height=True,
#     examples=[
#         ["Which one is bigger? 4.9 or 4.11"],
#         [
#             "Can you explain briefly to me what is the Python programming language?"
#         ],
#         ["Explain the plot of Cinderella in a sentence."],
#         ["How many hours does it take a man to eat a Helicopter?"],
#         [
#             "Write a 100-word article on 'Benefits of Open-Source in AI research'"
#         ],
#     ],
#     cache_examples=False,
# )

# Extra controls shown with the chat box. Their order matches the parameters
# of `generate` that follow (message, chat_history): system_message,
# max_tokens, temperature, top_p, top_k.
_additional_inputs = [
    gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
    gr.Slider(
        label="Max tokens",
        minimum=1,
        maximum=MAX_MAX_TOKENS,
        step=1,
        value=DEFAULT_MAX_TOKENS,
    ),
    gr.Slider(
        label="Temperature",
        minimum=0.1,
        maximum=4.0,
        step=0.1,
        value=0.6,
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        minimum=0.05,
        maximum=1.0,
        step=0.05,
        value=0.9,
    ),
    gr.Slider(
        label="Top-k",
        minimum=1,
        maximum=1000,
        step=1,
        value=50,
    ),
]

# Example prompts; each entry is a one-element list holding the message text.
_example_prompts = [
    ["Which one is bigger? 4.9 or 4.11"],
    ["Can you explain briefly to me what is the Python programming language?"],
    ["Explain the plot of Cinderella in a sentence."],
    ["How many hours does it take a man to eat a Helicopter?"],
    ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
]

chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=_additional_inputs,
    examples=_example_prompts,
    cache_examples=False,
)

# Page layout: a Markdown heading with the chat interface rendered below it.
with gr.Blocks(fill_height=True) as demo:
    gr.Markdown("# Sambanova model inference LLAMA 405B")
    chat_interface.render()

if __name__ == "__main__":
    # Queue is created with max_size=20 before the app is launched.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch()