#!/usr/bin/env python
import os
from threading import Thread
from typing import Iterator
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
DESCRIPTION = "# Mistral-7B"
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = 4096
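
# Load the model weights onto the GPU; the tokenizer is loaded separately from
# the codellama/CodeLlama-7b-hf checkpoint.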
if torch.cuda.is_available():
    model_id = "codys12/MergeLlama-7b"
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map=0, cache_dir="/data")
    tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf", trust_remote_code=True)
    # tokenizer.pad_token = tokenizer.eos_token
    # tokenizer.padding_side = "right"

@spaces.GPU
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    # temperature: float = 0.6,
    # top_p: float = 0.9,
    # top_k: int = 50,
    # repetition_penalty: float = 1.2,
) -> Iterator[str]:
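    # Flatten the previous chat turns plus the new message into a single prompt string.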
    conversation = []
    current_input = ""
    for user, assistant in chat_history:
        current_input += user
        current_input += assistant
    history = current_input
    current_input += message
    device = "cuda"
    input_ids = tokenizer(current_input, return_tensors="pt").input_ids.to(device)
    original_input_length = input_ids.shape[1]  # Remember the input length
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Trim along the sequence dimension, keeping the most recent tokens.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        {"input_ids": input_ids},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        # do_sample=True,
        # top_p=top_p,
        # top_k=top_k,
        # temperature=temperature,
        # num_beams=1,
        repetition_penalty=1.0,  # repetition_penalty,
    )
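
    # Run generation in a background thread so the streamer can yield partial
    # output to the UI as tokens are produced.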
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    print()
    outputs = []
    for text in streamer:
        print(text, end="")
        outputs.append(text)
        yield "".join(outputs)
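
# Chat UI: generate() is the streaming callback; only a max-new-tokens slider is
# exposed, with the sampling controls left commented out.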
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        # gr.Slider(
        #     label="Temperature",
        #     minimum=0.1,
        #     maximum=4.0,
        #     step=0.1,
        #     value=0.6,
        # ),
        # gr.Slider(
        #     label="Top-p (nucleus sampling)",
        #     minimum=0.05,
        #     maximum=1.0,
        #     step=0.05,
        #     value=0.9,
        # ),
        # gr.Slider(
        #     label="Top-k",
        #     minimum=1,
        #     maximum=1000,
        #     step=1,
        #     value=50,
        # ),
        # gr.Slider(
        #     label="Repetition penalty",
        #     minimum=1.0,
        #     maximum=2.0,
        #     step=0.05,
        #     value=1.2,
        # ),
    ],
    stop_btn=None,
    examples=[
["<<<<<<<\nimport org.apache.flink.api.java.tuple.Tuple2;\n\n=======\n\nimport org.apache.commons.collections.MapUtils;\nimport org.apache.flink.api.common.functions.RuntimeContext;\n\n>>>>>>>"],
["<<<<<<<\n // Simple check for whether our target app uses Recoil\n if (window[`$recoilDebugStates`]) {\n isRecoil = true;\n }\n\n=======\n\n if (\n memoizedState &&\n (tag === 0 || tag === 1 || tag === 2 || tag === 10) &&\n isRecoil === true\n ) {\n if (memoizedState.queue) {\n // Hooks states are stored as a linked list using memoizedState.next,\n // so we must traverse through the list and get the states.\n // We then store them along with the corresponding memoizedState.queue,\n // which includes the dispatch() function we use to change their state.\n const hooksStates = traverseRecoilHooks(memoizedState);\n hooksStates.forEach((state, i) => {\n\n hooksIndex = componentActionsRecord.saveNew(\n state.state,\n state.component\n );\n componentData.hooksIndex = hooksIndex;\n if (newState && newState.hooksState) {\n newState.push(state.state);\n } else if (newState) {\n newState = [state.state];\n } else {\n newState.push(state.state);\n }\n componentFound = true;\n });\n }\n }\n\n>>>>>>>"],
["Explain the plot of Cinderella in a sentence."],
["How many hours does it take a man to eat a Helicopter?"],
["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
],
)
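
# Page layout: description, an optional "Duplicate Space" button, and the chat interface.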
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
    )
    chat_interface.render()

if __name__ == "__main__":
    demo.queue(max_size=20).launch()