#!/usr/bin/env python
import os
from threading import Thread
from typing import Iterator

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# The heading reflects the model actually loaded below (codys12/MergeLlama-7b).
DESCRIPTION = "# MergeLlama-7B"
if not torch.cuda.is_available():
    DESCRIPTION += "\n\nRunning on CPU 🥶 This demo does not work on CPU.\n\n"
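# MergeLlama takes git-style merge conflicts as input, as in the examples
# further down: the "current" change between <<<<<<< and =======, the
# "incoming" change between ======= and >>>>>>>, and the model completes a
# suggested resolution. A minimal illustrative helper (not part of the
# original app; `current`/`incoming` are hypothetical names) showing how such
# a prompt can be assembled:
def format_merge_conflict(current: str, incoming: str) -> str:
    """Wrap two conflicting code versions in git conflict markers."""
    return f"<<<<<<<\n{current}\n=======\n{incoming}\n>>>>>>>"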
" MAX_MAX_NEW_TOKENS = 2048 DEFAULT_MAX_NEW_TOKENS = 256 MAX_INPUT_TOKEN_LENGTH = 4096 if torch.cuda.is_available(): model_id = "codys12/MergeLlama-7b" model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16, device_map=0, cache_dir="/data") tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf", trust_remote_code=True) tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "right" @spaces.GPU def generate( message: str, chat_history: list[tuple[str, str]], max_new_tokens: int = 1024, #temperature: float = 0.6, #top_p: float = 0.9, #top_k: int = 50, #repetition_penalty: float = 1.2, ) -> Iterator[str]: conversation = [] current_input = "" for user, assistant in chat_history: current_input += user current_input += assistant history = current_input current_input += message device = "cuda:0" input_ids = tokenizer(current_input, return_tensors="pt").input_ids.to(device) if len(input_ids) > MAX_INPUT_TOKEN_LENGTH: input_ids = input_ids[-MAX_INPUT_TOKEN_LENGTH:] gr.Warning("Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.") streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True) generate_kwargs = dict( {"input_ids": input_ids}, streamer=streamer, max_new_tokens=max_new_tokens, #do_sample=True, #top_p=top_p, #top_k=top_k, #temperature=temperature, #num_beams=1, #repetition_penalty=repetition_penalty, ) t = Thread(target=model.generate, kwargs=generate_kwargs) t.start() outputs = [] for text in streamer: outputs.append(text) combined_text = "".join(outputs) if "<<<<<<<" in combined_text: combined_text = combined_text.replace("<<<<<<<", "") # Remove the unwanted string yield combined_text break else: yield combined_text chat_interface = gr.ChatInterface( fn=generate, additional_inputs=[ gr.Slider( label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS, ), # gr.Slider( # label="Temperature", # minimum=0.1, # maximum=4.0, # step=0.1, # value=0.6, # ), # gr.Slider( # label="Top-p (nucleus sampling)", # minimum=0.05, # maximum=1.0, # step=0.05, # value=0.9, # ), # gr.Slider( # label="Top-k", # minimum=1, # maximum=1000, # step=1, # value=50, # ), # gr.Slider( # label="Repetition penalty", # minimum=0.1, # maximum=2.0, # step=0.05, # value=1.2, # ), ], stop_btn=None, examples=[ ["<<<<<<<\nimport org.apache.flink.api.java.tuple.Tuple2;\n\n=======\n\nimport org.apache.commons.collections.MapUtils;\nimport org.apache.flink.api.common.functions.RuntimeContext;\n\n>>>>>>>"], ["<<<<<<<\n // Simple check for whether our target app uses Recoil\n if (window[`$recoilDebugStates`]) {\n isRecoil = true;\n }\n\n=======\n\n if (\n memoizedState &&\n (tag === 0 || tag === 1 || tag === 2 || tag === 10) &&\n isRecoil === true\n ) {\n if (memoizedState.queue) {\n // Hooks states are stored as a linked list using memoizedState.next,\n // so we must traverse through the list and get the states.\n // We then store them along with the corresponding memoizedState.queue,\n // which includes the dispatch() function we use to change their state.\n const hooksStates = traverseRecoilHooks(memoizedState);\n hooksStates.forEach((state, i) => {\n\n hooksIndex = componentActionsRecord.saveNew(\n state.state,\n state.component\n );\n componentData.hooksIndex = hooksIndex;\n if (newState && newState.hooksState) {\n newState.push(state.state);\n } else if (newState) {\n newState = [state.state];\n } else {\n newState.push(state.state);\n 
}\n componentFound = true;\n });\n }\n }\n\n>>>>>>>"], ["Explain the plot of Cinderella in a sentence."], ["How many hours does it take a man to eat a Helicopter?"], ["Write a 100-word article on 'Benefits of Open-Source in AI research'"], ], ) with gr.Blocks(css="style.css") as demo: gr.Markdown(DESCRIPTION) gr.DuplicateButton( value="Duplicate Space for private use", elem_id="duplicate-button", visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1", ) chat_interface.render() if __name__ == "__main__": demo.queue(max_size=20).launch()
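# To run locally (assumed setup; dependencies inferred from the imports above):
#   pip install gradio spaces torch transformers
#   python app.py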