#!/usr/bin/env python
import os
from threading import Thread
from typing import Iterator
import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from peft import PeftModel, PeftConfig
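
# Gradio chat demo on Hugging Face Spaces that streams completions from a causal language model.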
DESCRIPTION = "# Mistral-7B"
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
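
# Generation limits: the UI slider is capped at MAX_MAX_NEW_TOKENS, and prompts longer than
# MAX_INPUT_TOKEN_LENGTH tokens are trimmed from the left before generation.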
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 256
MAX_INPUT_TOKEN_LENGTH = 4096
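
# Load the model and tokenizer only when a GPU is available; on CPU the demo only shows the warning above.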
if torch.cuda.is_available():
    model_id = "codys12/MergeLlama-7b"
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16, device_map=0, cache_dir="/data")
    tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf", trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
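
# @spaces.GPU asks Spaces to allocate a GPU for the duration of each generate() call.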
@spaces.GPU
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    # temperature: float = 0.6,
    # top_p: float = 0.9,
    # top_k: int = 50,
    # repetition_penalty: float = 1.2,
) -> Iterator[str]:
    # Flatten the chat history and the new message into a single prompt string.
    current_input = ""
    for user, assistant in chat_history:
        current_input += user
        current_input += assistant
    current_input += message

    device = "cuda:0"
    input_ids = tokenizer(current_input, return_tensors="pt").input_ids
    # Keep only the most recent MAX_INPUT_TOKEN_LENGTH tokens so the prompt fits the context window.
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(device)
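
    # Run generation in a background thread and stream partial text to the UI as it is produced.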
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        {"input_ids": input_ids},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        # do_sample=True,
        # top_p=top_p,
        # top_k=top_k,
        # temperature=temperature,
        # num_beams=1,
        # repetition_penalty=repetition_penalty,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
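
# gr.ChatInterface wires generate() to a streaming chat UI; the slider below appears under "Additional Inputs".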
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        # gr.Slider(
        #     label="Temperature",
        #     minimum=0.1,
        #     maximum=4.0,
        #     step=0.1,
        #     value=0.6,
        # ),
        # gr.Slider(
        #     label="Top-p (nucleus sampling)",
        #     minimum=0.05,
        #     maximum=1.0,
        #     step=0.05,
        #     value=0.9,
        # ),
        # gr.Slider(
        #     label="Top-k",
        #     minimum=1,
        #     maximum=1000,
        #     step=1,
        #     value=50,
        # ),
        # gr.Slider(
        #     label="Repetition penalty",
        #     minimum=0.1,
        #     maximum=2.0,
        #     step=0.05,
        #     value=1.2,
        # ),
    ],
    stop_btn=None,
    examples=[
        ["<<<<<<<\nimport org.apache.flink.api.java.tuple.Tuple2;\n\n=======\n\nimport org.apache.commons.collections.MapUtils;\nimport org.apache.flink.api.common.functions.RuntimeContext;\n\n>>>>>>>"],
        ["<<<<<<<\n // Simple check for whether our target app uses Recoil\n if (window[`$recoilDebugStates`]) {\n isRecoil = true;\n }\n\n=======\n\n if (\n memoizedState &&\n (tag === 0 || tag === 1 || tag === 2 || tag === 10) &&\n isRecoil === true\n ) {\n if (memoizedState.queue) {\n // Hooks states are stored as a linked list using memoizedState.next,\n // so we must traverse through the list and get the states.\n // We then store them along with the corresponding memoizedState.queue,\n // which includes the dispatch() function we use to change their state.\n const hooksStates = traverseRecoilHooks(memoizedState);\n hooksStates.forEach((state, i) => {\n\n hooksIndex = componentActionsRecord.saveNew(\n state.state,\n state.component\n );\n componentData.hooksIndex = hooksIndex;\n if (newState && newState.hooksState) {\n newState.push(state.state);\n } else if (newState) {\n newState = [state.state];\n } else {\n newState.push(state.state);\n }\n componentFound = true;\n });\n }\n }\n\n>>>>>>>"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a Helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
    ],
)
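
# Page layout: description, an optional "Duplicate Space" button, and the chat interface.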
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
    )
    chat_interface.render()
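
# Queue requests (at most 20 waiting) and launch the app when run as a script.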
if __name__ == "__main__":
    demo.queue(max_size=20).launch()