vicuna-7b / README.md
helloollel's picture
Update README.md
43b3abc
|
raw
history blame
6.87 kB
# vicuna-7b
This README provides a step-by-step guide to set up and run the FastChat application with the required dependencies and model.
## Prerequisites
Before you proceed, ensure that you have `git` installed on your system.
## Installation
Follow the steps below to install the required packages and set up the environment.
1. Upgrade `pip`:
```bash
python3 -m pip install --upgrade pip
```
2. Install `accelerate`:
```bash
python3 -m pip install accelerate
```
3. Install `bitsandbytes`
3.1 Install it with `pip`:
```bash
python3 -m pip install bitsandbytes
```
3.2 Alternatively, clone the `bitsandbytes` repository and build it from source:
```bash
git clone https://github.com/TimDettmers/bitsandbytes.git
cd bitsandbytes
CUDA_VERSION=118 make cuda11x
python3 -m pip install .
cd ..
```
To find the value for `CUDA_VERSION`, list the installed CUDA toolkits (for example, `/usr/local/cuda-11.8` corresponds to `CUDA_VERSION=118`):
```bash
ls /usr/local/cuda*
```
4. Clone the `FastChat` repository and install it:
```bash
git clone https://github.com/lm-sys/FastChat.git
cd FastChat
python3 -m pip install -e .
cd ..
```
5. Install `git-lfs`:
```bash
curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
sudo apt-get install git-lfs
git lfs install
```
6. Clone the `vicuna-7b` model:
```bash
git clone https://huggingface.co/helloollel/vicuna-7b
```
## Running FastChat
After completing the installation, you can run FastChat with the following command:
```bash
python3 -m fastchat.serve.cli --model-name ./vicuna-7b
```
This will start the FastChat command-line chat interface using the `vicuna-7b` model.
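## Running in a Notebook
The following code loads the model with 8-bit weights and defines a `chat` helper that streams the reply to standard output: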
```python
import argparse
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer
from fastchat.conversation import conv_templates, SeparatorStyle
from fastchat.serve.monkey_patch_non_inplace import replace_llama_attn_with_non_inplace_operations
def load_model(model_name, device, num_gpus, load_8bit=False, *,
               max_gpu_memory="13GiB"):
    """Load a causal language model and its tokenizer for the given device.

    Args:
        model_name: Model identifier or local path passed to
            ``from_pretrained``.
        device: One of ``"cpu"``, ``"cuda"`` or ``"mps"``.
        num_gpus: Number of GPUs to use (int or numeric string), or
            ``"auto"``. Only consulted when ``device == "cuda"``.
        load_8bit: If true on CUDA, request 8-bit weights.
        max_gpu_memory: Per-GPU ``max_memory`` entry used when the model is
            spread over more than one GPU. Defaults to ``"13GiB"``, the value
            that used to be hard-coded.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``device`` is not supported, or (on CUDA) ``num_gpus``
            is neither ``"auto"`` nor convertible to ``int``.
    """
    if device == "cpu":
        kwargs = {}
    elif device == "cuda":
        kwargs = {"torch_dtype": torch.float16}
        if load_8bit:
            if num_gpus != "auto" and int(num_gpus) != 1:
                print("8-bit weights are not supported on multiple GPUs. Revert to use one GPU.")
            kwargs.update({"load_in_8bit": True, "device_map": "auto"})
        elif num_gpus == "auto":
            kwargs["device_map"] = "auto"
        else:
            # Normalise to int so the `num_gpus == 1` check below also works
            # when the caller passed a numeric string such as "1".
            num_gpus = int(num_gpus)
            if num_gpus != 1:
                kwargs.update({
                    "device_map": "auto",
                    "max_memory": {i: max_gpu_memory for i in range(num_gpus)},
                })
    elif device == "mps":
        # Avoid bugs in the mps backend by not using in-place operations.
        kwargs = {"torch_dtype": torch.float16}
        replace_llama_attn_with_non_inplace_operations()
    else:
        raise ValueError(f"Invalid device: {device}")

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, low_cpu_mem_usage=True, **kwargs)

    # Calling model.cuda() messes up the weights when loading 8-bit weights,
    # so the model is only moved explicitly in the single-GPU fp16 case.
    if device == "cuda" and num_gpus == 1 and not load_8bit:
        model.to("cuda")
    elif device == "mps":
        model.to("mps")

    return model, tokenizer
@torch.inference_mode()
def generate_stream(tokenizer, model, params, device,
                    context_len=2048, stream_interval=2):
    """Adapted from fastchat/serve/model_worker.py::generate_stream

    Generate tokens one at a time and periodically yield the decoded text.

    Args:
        tokenizer: Callable returning an object with ``input_ids`` for a
            prompt; must also provide ``decode`` and ``eos_token_id``.
        model: Callable returning an object with ``logits`` and
            ``past_key_values``.
        params: Dict with the key ``"prompt"`` and the optional keys
            ``"temperature"`` (default 1.0), ``"max_new_tokens"``
            (default 256) and ``"stop"`` (stop string, default none).
        device: Device name on which the input tensors are created.
        context_len: Context budget used to truncate the prompt tokens.
        stream_interval: Yield after every ``stream_interval`` generated
            tokens (and always on the last token or when stopping).

    Yields:
        The decoded text of the prompt plus everything generated so far, cut
        at the last occurrence of the stop string found at or after character
        offset ``len(prompt)``.
    """
    prompt = params["prompt"]
    l_prompt = len(prompt)
    temperature = float(params.get("temperature", 1.0))
    max_new_tokens = int(params.get("max_new_tokens", 256))
    stop_str = params.get("stop", None)

    input_ids = tokenizer(prompt).input_ids
    output_ids = list(input_ids)

    # NOTE: assumes context_len > max_new_tokens + 8; for max_src_len <= 0
    # the negative-index slice below no longer keeps the trailing tokens.
    max_src_len = context_len - max_new_tokens - 8
    input_ids = input_ids[-max_src_len:]

    # Bound up front so the final `del` is valid even if the loop never runs
    # (max_new_tokens <= 0).
    past_key_values = None

    for i in range(max_new_tokens):
        if i == 0:
            # First step: feed the whole (truncated) prompt.
            out = model(
                torch.as_tensor([input_ids], device=device), use_cache=True)
        else:
            # Later steps: feed only the newest token plus the cached keys/values.
            attention_mask = torch.ones(
                1, past_key_values[0][0].shape[-2] + 1, device=device)
            out = model(input_ids=torch.as_tensor([[token]], device=device),
                        use_cache=True,
                        attention_mask=attention_mask,
                        past_key_values=past_key_values)
        logits = out.logits
        past_key_values = out.past_key_values

        last_token_logits = logits[0][-1]

        if device == "mps":
            # Switch to CPU to avoid some bugs in the mps backend.
            last_token_logits = last_token_logits.float().to("cpu")

        if temperature < 1e-4:
            # Near-zero temperature: pick the most likely token.
            token = int(torch.argmax(last_token_logits))
        else:
            probs = torch.softmax(last_token_logits / temperature, dim=-1)
            token = int(torch.multinomial(probs, num_samples=1))

        output_ids.append(token)
        stopped = token == tokenizer.eos_token_id

        if i % stream_interval == 0 or i == max_new_tokens - 1 or stopped:
            output = tokenizer.decode(output_ids, skip_special_tokens=True)
            # A missing or empty stop string means "no stop string"; calling
            # rfind(None) would raise TypeError and rfind("") always matches.
            if stop_str:
                pos = output.rfind(stop_str, l_prompt)
                if pos != -1:
                    output = output[:pos]
                    stopped = True
            yield output

        if stopped:
            break

    del past_key_values
# Session settings, exposed as attributes in the same way parsed
# command-line arguments would be.
args = argparse.Namespace(
    model_name="./vicuna-7b",
    device="cuda",
    num_gpus="1",
    load_8bit=True,
    conv_template="v1",
    temperature=0.7,
    max_new_tokens=512,
    debug=False,
)
model_name = args.model_name

# Model
model, tokenizer = load_model(
    model_name, args.device, args.num_gpus, args.load_8bit
)

# Chat
conv = conv_templates[args.conv_template].copy()
def chat(inp):
    """Send one user message to the model and stream the reply to stdout.

    The message and an empty reply slot are appended to the module-level
    ``conv``; the reply is printed word by word as it is generated and then
    stored in that slot.

    Args:
        inp: The user's message text.
    """
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    params = {
        "model": model_name,
        "prompt": prompt,
        "temperature": args.temperature,
        "max_new_tokens": args.max_new_tokens,
        "stop": conv.sep if conv.sep_style == SeparatorStyle.SINGLE else conv.sep2,
    }

    print(f"{conv.roles[1]}: ", end="", flush=True)
    # Bound before the loop so the final print and the store below still work
    # if the stream yields nothing.
    words = [""]
    printed = 0
    for text in generate_stream(tokenizer, model, params, args.device):
        # Each yielded text starts with the prompt; keep only the reply.
        words = text[len(prompt) + 1:].strip().split(" ")
        # Hold back the last word: later tokens may still extend it.
        complete = len(words) - 1
        if complete > printed:
            print(" ".join(words[printed:complete]), end=" ", flush=True)
            printed = complete
    print(" ".join(words[printed:]), flush=True)

    conv.messages[-1][-1] = " ".join(words)
```
```python
# Ask one question: the reply is streamed to stdout and stored in `conv`.
chat("what's the meaning of life?")
```