Spaces:

nakcnx
/

paotung-typhoon

Sleeping

App Files Files Community

nakcnx committed on Jun 27

Commit

fe8bfde

•

1 Parent(s): d44f8f3

Update app.py

Browse files

Files changed (1) hide show

app.py +130 -49

app.py CHANGED Viewed

@@ -1,63 +1,144 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
 """
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+from llama_cpp import Llama
+import datetime
+import os
+import datetime
+from huggingface_hub import hf_hub_download
+# Model settings; repetitionpenalty, contextlength and logfile are also shown in the UI info panel.
+convHistory = ''  # running transcript; combine() appends each prompt and generation to it
+modelfile = hf_hub_download(  # fetch the GGUF weights from the Hugging Face Hub; the result is used as model_path below
+        repo_id=os.environ.get("REPO_ID", "RichardErkhov/scb10x_-_llama-3-typhoon-v1.5-8b-instruct-gguf"),  # REPO_ID env var overrides the default repo
+        filename=os.environ.get("MODEL_FILE", "llama-3-typhoon-v1.5-8b-instruct.Q4_K_M.gguf"),  # MODEL_FILE env var overrides the default file
+    )
+repetitionpenalty = 1.15  # display only in the visible code; generation uses the "Repetition Penalty" slider value
+contextlength=8192  # passed to Llama as n_ctx and used to bound the output-length slider
+logfile = 'typhoon-v1.5-8b-instruct_logs.txt'  # file that writehistory() appends to
+print("loading model...")
+stt = datetime.datetime.now()  # start time for the load-duration message
+# No GPU-offload argument is passed to Llama() below, so the library default applies.
+llm = Llama(
+  model_path=modelfile,  # path of the file obtained by hf_hub_download above
+  n_ctx=contextlength,  # The max sequence length to use - note that longer sequence lengths require much more resources
+  n_threads=2,            # The number of CPU threads to use, tailor to your system and the resulting performance
+)
+dt = datetime.datetime.now() - stt  # elapsed load time
+print(f"Model loaded in {dt}")
+def writehistory(text):
+    with open(logfile, 'a') as f:
+        f.write(text)
+        f.write('\n')
+    f.close()
 """
+gr.themes.Base()
+gr.themes.Default()
+gr.themes.Glass()
+gr.themes.Monochrome()
+gr.themes.Soft()
 """
+def combine(a, b, c, d,e,f):
+    global convHistory
+    import datetime
+    SYSTEM_PROMPT = f"""{a}
+    """
+    temperature = c
+    max_new_tokens = d
+    repeat_penalty = f
+    top_p = e
+    prompt = f"<|user|>\n{b}<|endoftext|>\n<|assistant|>"
+    # prompt = [
+    #     {"role": "system", "content": SYSTEM_PROMPT} ,
+    #     {"role": "user", "content": b},
+    # ]
+    prompt = f"""{prompt}"""
+    start = datetime.datetime.now()
+    generation = ""
+    delta = ""
+    prompt_tokens = f"Prompt Tokens: {len(llm.tokenize(bytes(prompt,encoding='utf-8')))}"
+    generated_text = ""
+    answer_tokens = ''
+    total_tokens = ''
+    for character in llm(prompt,
+                max_tokens=max_new_tokens,
+                #stop=["<|eot_id|>"],
+                temperature = temperature,
+                repeat_penalty = repeat_penalty,
+                top_p = top_p,   # Example stop token - not necessarily correct for this specific model! Please check before using.
+                echo=False,
+                stream=True):
+        generation += character["choices"][0]["text"]
+        answer_tokens = f"Out Tkns: {len(llm.tokenize(bytes(generation,encoding='utf-8')))}"
+        total_tokens = f"Total Tkns: {len(llm.tokenize(bytes(prompt,encoding='utf-8'))) + len(llm.tokenize(bytes(generation,encoding='utf-8')))}"
+        delta = datetime.datetime.now() - start
+        yield generation, delta, prompt_tokens, answer_tokens, total_tokens
+    print(f"Response: {generation}")
+    timestamp = datetime.datetime.now()
+    logger = f"""time: {timestamp}\n Temp: {temperature} - MaxNewTokens: {max_new_tokens} - RepPenalty: 1.5 \nPROMPT: \n{prompt}\nStableZephyr3B: {generation}\nGenerated in {delta}\nPromptTokens: {prompt_tokens}   Output Tokens: {answer_tokens}  Total Tokens: {total_tokens}\n\n---\n\n"""
+    writehistory(logger)
+    convHistory = convHistory + prompt + "\n" + generation + "\n"
+    print(convHistory)
+    return generation, delta, prompt_tokens, answer_tokens, total_tokens
+    #return generation, delta
+# Main Gradio interface: a header row of statistics boxes, a column of sampling controls, and the prompt/output text boxes.
+with gr.Blocks(theme='Medguy/base2') as demo:   # alternative themes: gr.themes.Glass(), 'remilia/Ghostly'
+    # Title section: app heading plus four boxes that receive the timing and token counts produced by combine().
+    with gr.Row(variant='compact'):
+            with gr.Column(scale=10):
+                gr.HTML("<center>"
+                + "<h2>🐶 Paotung Typhoon</h2></center>")
+                with gr.Row():
+                        with gr.Column(min_width=80):
+                            gentime = gr.Textbox(value="", placeholder="Generation Time:", min_width=50, show_label=False)  # 2nd output of combine(): elapsed time
+                        with gr.Column(min_width=80):
+                            prompttokens = gr.Textbox(value="", placeholder="Prompt Tkn:", min_width=50, show_label=False)  # 3rd output: prompt token string
+                        with gr.Column(min_width=80):
+                            outputokens = gr.Textbox(value="", placeholder="Output Tkn:", min_width=50, show_label=False)  # 4th output: output token string
+                        with gr.Column(min_width=80):
+                            totaltokens = gr.Textbox(value="", placeholder="Total Tokens:", min_width=50, show_label=False)  # 5th output: total token string
+    # (No "interactive infographic" section is defined here.)
+    # Playground section: sampling sliders and the Generate button in the first column, prompt and output boxes in the second.
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown(
+            f"""
+            ### Tunning Parameters""")
+            temp = gr.Slider(label="Temperature",minimum=0.0, maximum=1.0, step=0.01, value=0.42)  # passed to combine() as c
+            top_p = gr.Slider(label="Top_P",minimum=0.0, maximum=1.0, step=0.01, value=0.8)  # passed to combine() as e
+            repPen = gr.Slider(label="Repetition Penalty",minimum=0.0, maximum=4.0, step=0.01, value=1.2)  # passed to combine() as f
+            max_len = gr.Slider(label="Maximum output lenght", minimum=10,maximum=(contextlength-500),step=2, value=900)  # passed as d; the cap of contextlength-500 presumably leaves room for the prompt - confirm
+            gr.Markdown(
+            """
+            Fill the System Prompt and User Prompt
+            And then click the Button below
+            """)
+            btn = gr.Button(value="💎🦜 Generate", variant='primary')
+            gr.Markdown(  # static info panel; it prints the repetitionpenalty constant, not the slider value used for generation
+            f"""
+            - **Prompt Template**: Llama-3-8B
+            - **Repetition Penalty**: {repetitionpenalty}
+            - **Context Lenght**: {contextlength} tokens
+            - **LLM Engine**: llama-cpp
+            - **Model**: 💎🦜 Llama-3-8B + typhoon-v1.5-8b-instruct
+            - **Log File**: {logfile}
+            """)
+        with gr.Column(scale=4):
+            txt = gr.Textbox(label="System Prompt", value = "", placeholder = "This models does not have any System prompt...",lines=1, interactive = True)  # passed to combine() as a, which does not add it to the model prompt
+            txt_2 = gr.Textbox(label="User Prompt", lines=5, show_copy_button=True)  # passed to combine() as b
+            txt_3 = gr.Textbox(value="", label="Output", lines = 10, show_copy_button=True)  # 1st output of combine(): the generated text
+            btn.click(combine, inputs=[txt, txt_2,temp,max_len,top_p,repPen], outputs=[txt_3,gentime,prompttokens,outputokens,totaltokens])  # inputs map positionally to combine(a, b, c, d, e, f); outputs receive the five yielded values in order
 if __name__ == "__main__":  # start the app only when this file is run as a script
+    demo.launch(inbrowser=True)  # inbrowser=True presumably asks Gradio to open the UI in a browser tab - confirm against the Gradio docs