"""Gradio playground for a GGUF Llama-3 Typhoon model served by llama-cpp.

On import this script downloads the model file from the Hugging Face Hub,
loads it with llama-cpp, and builds a Gradio Blocks UI (`demo`) with sliders
for the sampling parameters.  Each generation is streamed to the UI and
appended to a plain-text log file.
"""
import datetime
import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# MODEL SETTINGS also for DISPLAY
# Running transcript of every prompt/answer pair.  It is module-level, so it
# accumulates across all calls for the lifetime of the process.
convHistory = ''
# Repository and file name can be overridden through the environment.
modelfile = hf_hub_download(
    repo_id=os.environ.get(
        "REPO_ID",
        "RichardErkhov/scb10x_-_llama-3-typhoon-v1.5-8b-instruct-gguf",
    ),
    filename=os.environ.get(
        "MODEL_FILE",
        "llama-3-typhoon-v1.5-8b-instruct.Q4_K_M.gguf",
    ),
)
# Only shown in the info panel; the value used for sampling comes from the
# "Repetition Penalty" slider.
repetitionpenalty = 1.15
contextlength = 8192
logfile = 'typhoon-v1.5-8b-instruct_logs.txt'

print("loading model...")
stt = datetime.datetime.now()
# No GPU offload is configured here.  llama-cpp-python exposes `n_gpu_layers`
# for offloading layers to the GPU; leave it unset if no GPU acceleration is
# available on your system.
llm = Llama(
    model_path=modelfile,  # Path returned by the hub download above
    n_ctx=contextlength,   # The max sequence length to use - note that longer sequence lengths require much more resources
    n_threads=2,           # The number of CPU threads to use, tailor to your system and the resulting performance
)
dt = datetime.datetime.now() - stt
print(f"Model loaded in {dt}")


def writehistory(text):
    """Append *text* followed by a newline to the log file.

    The file is opened as UTF-8 explicitly so that emoji and non-ASCII model
    output cannot fail on platforms whose default encoding is not UTF-8.
    """
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')


# Built-in Gradio themes that can be passed to gr.Blocks(theme=...):
#   gr.themes.Base()
#   gr.themes.Default()
#   gr.themes.Glass()
#   gr.themes.Monochrome()
#   gr.themes.Soft()


def _count_tokens(text):
    """Return how many tokens `llm` produces for *text* (encoded as UTF-8)."""
    return len(llm.tokenize(text.encode('utf-8')))


def combine(a, b, c, d, e, f):
    """Stream a completion for the user prompt and log the finished result.

    Args:
        a: System prompt from the UI.  Accepted for interface compatibility
            but not inserted into the prompt.
        b: User prompt.
        c: Sampling temperature.
        d: Maximum number of new tokens to generate.
        e: Top-p value.
        f: Repetition penalty.

    Yields:
        Tuples of ``(generation, delta, prompt_tokens, answer_tokens,
        total_tokens)``: the text generated so far, the elapsed time since
        generation started, and three display strings with token counts.

    Side effects:
        After streaming finishes, prints the response, appends a record to
        the log file and extends the global ``convHistory`` transcript.
    """
    global convHistory
    temperature = c
    max_new_tokens = d
    top_p = e
    repeat_penalty = f

    # NOTE(review): the UI advertises a "Llama-3-8B" prompt template, but this
    # is a <|user|>/<|assistant|> style template and the system prompt is not
    # used - confirm this is intended for the loaded model.
    prompt = f"<|user|>\n{b}<|endoftext|>\n<|assistant|>"

    start = datetime.datetime.now()
    generation = ""
    delta = ""
    # The prompt does not change while streaming, so tokenize it only once.
    prompt_token_count = _count_tokens(prompt)
    prompt_tokens = f"Prompt Tokens: {prompt_token_count}"
    answer_tokens = ''
    total_tokens = ''

    # No explicit stop sequence is passed.  "<|eot_id|>" is a candidate stop
    # token, not necessarily correct for this specific model - check before
    # enabling it.
    stream = llm(
        prompt,
        max_tokens=max_new_tokens,
        temperature=temperature,
        repeat_penalty=repeat_penalty,
        top_p=top_p,
        echo=False,
        stream=True,
    )
    for chunk in stream:
        generation += chunk["choices"][0]["text"]
        # Tokenize the accumulated answer once and reuse it for both counters.
        answer_token_count = _count_tokens(generation)
        answer_tokens = f"Out Tkns: {answer_token_count}"
        total_tokens = f"Total Tkns: {prompt_token_count + answer_token_count}"
        delta = datetime.datetime.now() - start
        yield generation, delta, prompt_tokens, answer_tokens, total_tokens

    print(f"Response: {generation}")
    timestamp = datetime.datetime.now()
    # Record the repetition penalty that was actually used for this call.
    # NOTE(review): the "StableZephyr3B" label does not match the loaded
    # model; kept as-is in case existing logs are parsed on it.
    logger = (
        f"time: {timestamp}\n"
        f" Temp: {temperature} - MaxNewTokens: {max_new_tokens}"
        f" - RepPenalty: {repeat_penalty} \n"
        f"PROMPT: \n{prompt}\n"
        f"StableZephyr3B: {generation}\n"
        f"Generated in {delta}\n"
        f"PromptTokens: {prompt_tokens} Output Tokens: {answer_tokens}"
        f" Total Tokens: {total_tokens}\n\n---\n\n"
    )
    writehistory(logger)
    # NOTE(review): this transcript is never trimmed and is printed in full
    # after every request.
    convHistory = convHistory + prompt + "\n" + generation + "\n"
    print(convHistory)


# MAIN GRADIO INTERFACE
with gr.Blocks(theme='Medguy/base2') as demo:
    # Alternative themes: gr.themes.Glass(), 'remilia/Ghostly'

    # TITLE SECTION
    with gr.Row(variant='compact'):
        with gr.Column(scale=10):
            # NOTE(review): this literal contains no markup; the original
            # HTML tags look like they were stripped - restore them from
            # version control if a styled title is wanted.
            gr.HTML("\n" + "\n\n🐶 Paotung Typhoon\n\n")
            # Read-only statistics filled in while a generation is streaming.
            with gr.Row():
                with gr.Column(min_width=80):
                    gentime = gr.Textbox(value="", placeholder="Generation Time:", min_width=50, show_label=False)
                with gr.Column(min_width=80):
                    prompttokens = gr.Textbox(value="", placeholder="Prompt Tkn:", min_width=50, show_label=False)
                with gr.Column(min_width=80):
                    outputokens = gr.Textbox(value="", placeholder="Output Tkn:", min_width=50, show_label=False)
                with gr.Column(min_width=80):
                    totaltokens = gr.Textbox(value="", placeholder="Total Tokens:", min_width=50, show_label=False)

    # INTERACTIVE INFOGRAPHIC SECTION

    # PLAYGROUND INTERFACE SECTION
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Tunning Parameters")
            temp = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.01, value=0.42)
            top_p = gr.Slider(label="Top_P", minimum=0.0, maximum=1.0, step=0.01, value=0.8)
            repPen = gr.Slider(label="Repetition Penalty", minimum=0.0, maximum=4.0, step=0.01, value=1.2)
            # Upper bound leaves 500 tokens of the context window for the prompt.
            max_len = gr.Slider(label="Maximum output lenght", minimum=10, maximum=(contextlength - 500), step=2, value=900)
            gr.Markdown(
                "Fill the System Prompt and User Prompt\n"
                "And then click the Button below"
            )
            btn = gr.Button(value="💎🦜 Generate", variant='primary')
            gr.Markdown(
                "- **Prompt Template**: Llama-3-8B\n"
                f"- **Repetition Penalty**: {repetitionpenalty}\n"
                f"- **Context Lenght**: {contextlength} tokens\n"
                "- **LLM Engine**: llama-cpp\n"
                "- **Model**: 💎🦜 Llama-3-8B + typhoon-v1.5-8b-instruct\n"
                f"- **Log File**: {logfile}"
            )
        with gr.Column(scale=4):
            txt = gr.Textbox(label="System Prompt", value="", placeholder="This models does not have any System prompt...", lines=1, interactive=True)
            txt_2 = gr.Textbox(label="User Prompt", lines=5, show_copy_button=True)
            txt_3 = gr.Textbox(value="", label="Output", lines=10, show_copy_button=True)

    # Input order must match combine(a, b, c, d, e, f):
    # system prompt, user prompt, temperature, max tokens, top_p, rep. penalty.
    btn.click(
        combine,
        inputs=[txt, txt_2, temp, max_len, top_p, repPen],
        outputs=[txt_3, gentime, prompttokens, outputokens, totaltokens],
    )


if __name__ == "__main__":
    demo.launch(inbrowser=True)