yibolu committed · Commit dae7d0f · 0 Parent(s)

commit message

Files changed:
- .gitattributes +37 -0
- .gitignore +6 -0
- README.md +123 -0
- demo.py +19 -0
- lyra_llama/__init__.py +1 -0
- lyra_llama/config.py +33 -0
- lyra_llama/lyra_llama.py +169 -0
- lyra_llama/model.py +156 -0
- models/config.ini +13 -0
- models/special_tokens_map.json +27 -0
- models/tokenizer.json +0 -0
- models/tokenizer.model +3 -0
- models/tokenizer_config.json +33 -0
- requirements.txt +4 -0
.gitattributes ADDED
@@ -0,0 +1,37 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
models/glm6b-kv-cache-dy-bs8.ftm filter=lfs diff=lfs merge=lfs -text
models/glm6b-bs8.ftm filter=lfs diff=lfs merge=lfs -text
*.so filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
dist/
*.egg-info/
__pycache__
build/
.vscode
.idea
README.md ADDED
@@ -0,0 +1,123 @@
---
license: mit
language: en
tags:
- LLM
- LLaMA-13b
---
## Special Notes

Due to the license restrictions of LLaMA, we are not allowed to release the accelerated parameters directly. We hope to discuss with you to figure out **a legal way** to share lyraLLaMA. If you have any suggestions, please feel free to drop us a line at [email protected].

## Model Card for lyraLLaMA

lyraLLaMA is currently the **fastest LLaMA-13b** available. The inference speed of lyraLLaMA achieves a **4x** acceleration over the torch version.

Among its main features are:
- device: Nvidia GPU with Ampere architecture or Volta architecture (A100 or higher, V100).
- batch_size: compiled with dynamic batch size; the maximum depends on the device.

We use the LLaMA-13B model for measurement, but this optimized inference is applicable to LLaMA models of different sizes.

## Speed

* tested on A100 40G
* fp16 precision
* throughput in tokens/s

### LLaMA-Ziya-13B

| Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
| --- | --- | --- | --- | --- | --- |
| Torch LLaMA | 31.74 | 289.2 | 521.37 | 775.69 | OOM |
| lyraLLaMA | 73.2 | 565.6 | 1179.59 | 1795.63 | 3061.27 |

### LLaMA-Vicuna-13B

| Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
| --- | --- | --- | --- | --- | --- |
| Torch LLaMA | 24.65 | 167.3 | 322.97 | 407.99 | OOM |
| lyraLLaMA | 53.67 | 421.38 | 804.31 | 151.928 | 2679.82 |

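As a rough illustration (not part of the repository code), the snippet below derives the per-batch-size speedup of lyraLLaMA over the Torch baseline from the Ziya table above; the figures are copied verbatim from the table and the OOM entry is skipped.

```python
# Illustrative helper: per-batch-size speedup computed from the Ziya table above.
ziya = {
    "Torch LLaMA": [31.74, 289.2, 521.37, 775.69, None],   # None = OOM
    "lyraLLaMA":   [73.2, 565.6, 1179.59, 1795.63, 3061.27],
}
batch_sizes = [1, 8, 16, 32, 64]

for bs, torch_tps, lyra_tps in zip(batch_sizes, ziya["Torch LLaMA"], ziya["lyraLLaMA"]):
    if torch_tps is None:
        print(f"batch {bs}: torch OOM, lyraLLaMA {lyra_tps} tokens/s")
    else:
        # e.g. batch 8: 565.6 / 289.2 ≈ 1.96x
        print(f"batch {bs}: {lyra_tps / torch_tps:.2f}x speedup")
```
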
## Docker Environment Recommendation

- For CUDA 11.X: we recommend ```nvcr.io/nvidia/pytorch:22.12-py3```
- For CUDA 12.0: we recommend ```nvcr.io/nvidia/pytorch:23.02-py3```

```bash
docker pull nvcr.io/nvidia/pytorch:23.02-py3
docker run --rm -it --gpus all -v ./:/lyraLLaMA nvcr.io/nvidia/pytorch:23.02-py3

pip install -r requirements.txt
python demo.py
```

## Uses

```python
from lyra_llama import lyraLLaMA

model_path = "./models/lamma-13b-1-gpu-fp16.bin"
tokenizer_path = "./models/"
dtype = 'fp16'
prompt = "列出3个不同的机器学习算法,并说明它们的适用范围"
max_output_length = 512

model = lyraLLaMA(model_path, tokenizer_path, dtype)

prompt = '<human>:' + prompt.strip() + '\n<bot>:'

bs = 1
prompts = [prompt, ] * bs
output_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)

print(output_texts)
```

## Demo output

### LLaMA-Ziya-13B
#### input
列出3个不同的机器学习算法,并说明它们的适用范围. (In English: List 3 different machine learning algorithms and explain where they are applicable.)

#### output
以下是三个常见的机器学习算法及其适用范围:

1. 决策树(Decision Tree):决策树是一种基于分类和回归问题的朴素贝叶斯模型。它通过构建一系列逐步分裂的分支来预测结果。适用于那些具有简单特征、大量数据且数据集大小在可接受范围内的情况。

2. 随机森林(Random Forest):随机森林是一种集成学习算法,由多个决策树组成。它的优点是能够处理大规模数据和高维度的特征。适用于需要对多个变量进行建模的场景,例如医疗诊断、金融风险评估等。

3. 支持向量机(Support Vector Machine):支持向量机是一种监督学习方法,通常用于分类问题。它可以处理高维数据,并且具有较高的准确性。适用于需要对高维数据进行分类或回归的问题,例如图像识别、自然语言处理等。

### LLaMA-Vicuna-13B

#### input
List 3 different machine learning algorithms and explain where they are applicable.

#### output

Outputs:
1. Linear Regression: Linear regression is a simple, yet powerful algorithm that is used to predict the value of a continuous variable based on one or more input features. It is commonly used in prediction and forecasting tasks, such as predicting the price of a stock or the sales of a product.
2. Decision Trees: Decision Trees are a type of supervised learning algorithm that can be used for both classification and regression tasks. They work by partitioning the feature space into smaller subspaces, with each subspace corresponding to a leaf node in the tree. Decision Trees are commonly used in applications such as credit risk assessment and customer segmentation.
3. Support Vector Machines (SVMs): SVMs are a type of supervised learning algorithm that can be used for both classification and regression tasks. They work by finding the best hyperplane that separates the data into different classes. SVMs are commonly used in applications such as image classification and natural language processing.

## TODO
1. Support for int8 and int4.
2. Inference for longer contexts.
3. Streaming inference mode.

## Citation
```bibtex
@Misc{lyraLLaMA2023,
  author =       {Kangjian Wu, Zhengtao Wang, Yibo Lu, Bin Wu},
  title =        {lyraLLaMA: Accelerating LLaMA-13b(fp16) to 3000+ tokens/s},
  howpublished = {\url{https://huggingface.co/TMElyralab/lyraLLaMA}},
  year =         {2023}
}
```

## Report bug
- Start a discussion to report any bugs: https://huggingface.co/TMElyralab/lyraLLaMA/discussions
- Report bugs with a `[bug]` mark in the title.
demo.py ADDED
@@ -0,0 +1,19 @@
from lyra_llama import lyraLLaMA

model_path = "./models/lamma-13b-1-gpu-fp16.bin"
tokenizer_path = "./models/"
dtype = 'fp16'
prompt = "列出3个不同的机器学习算法,并说明它们的适用范围"
max_output_length = 512

model = lyraLLaMA(model_path, tokenizer_path, dtype)

prompt = '<human>:' + prompt.strip() + '\n<bot>:'

bs = 1
prompts = [prompt, ] * bs
output_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)

print(output_texts)
lyra_llama/__init__.py ADDED
@@ -0,0 +1 @@
from .lyra_llama import lyraLLaMA
lyra_llama/config.py ADDED
@@ -0,0 +1,33 @@
import dataclasses
from typing import Optional

@dataclasses.dataclass
class LyraLLaMAParam:
    num_heads: int = 40
    size_per_head: int = 128
    inter_size: int = 13824
    num_layers: int = 40
    vocab_size: int = 39424
    start_id: Optional[int] = 1
    end_id: Optional[int] = 2
    tensor_para_size: int = 1
    pipeline_para_size: int = 1
    remove_padding: bool = True
    shared_contexts_ratio: float = 1.0
    layernorm_eps: float = 1e-6
    weights_data_type: str = "fp16"
    rotary_embedding: int = 128
    use_gptj_residual: bool = False

    def __post_init__(self):
        if not 0.0 <= self.shared_contexts_ratio <= 1.0:
            raise ValueError(
                f'Got an invalid value of shared_context_ratio '
                f'{self.shared_contexts_ratio} - range: [0.0, 1.0]')

    def asdict(self):
        return dataclasses.asdict(self)


LYRA_LLAMA_PARAM = LyraLLaMAParam()
LIB_SO_PATH = '/app/LyraLLaMAPy/ftlib/libth_transformer_sm80_cu11.so'
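A minimal usage sketch of the `LyraLLaMAParam` dataclass above (illustrative only, not part of the committed files; the 32-head variant is hypothetical):

```python
# Illustrative only: exercising the LyraLLaMAParam defaults defined above.
import dataclasses

from lyra_llama.config import LYRA_LLAMA_PARAM, LyraLLaMAParam

print(LYRA_LLAMA_PARAM.asdict()["num_heads"])   # 40, the LLaMA-13B default

# Deriving a variant for a hypothetical smaller checkpoint.
small = dataclasses.replace(LYRA_LLAMA_PARAM, num_heads=32, num_layers=32)

# __post_init__ rejects out-of-range shared_contexts_ratio values.
try:
    LyraLLaMAParam(shared_contexts_ratio=1.5)
except ValueError as err:
    print(err)
```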
lyra_llama/lyra_llama.py ADDED
@@ -0,0 +1,169 @@
from __future__ import annotations

import configparser
import pathlib
import typing

import torch
import transformers
from torch.nn.utils.rnn import pad_sequence

from .config import LYRA_LLAMA_PARAM, LIB_SO_PATH
from .model import LLaMAModel

class lyraLLaMA:
    def __init__(self, model_path, tokenizer_path=None, dtype='fp16', int8_mode=0) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.dtype = dtype
        if dtype != 'int8':
            int8_mode = 0
        self.int8_mode = int8_mode

        self.model, self.tokenizer = self.load_model_and_tokenizer()
        print("Got model and tokenizer")

    def load_model_and_tokenizer(self):
        if self.tokenizer_path is None:
            tokenizer_path = self.model_path
        else:
            tokenizer_path = self.tokenizer_path

        print(f'Loading tokenizer from {tokenizer_path}')
        tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path)

        checkpoint_path = pathlib.Path(tokenizer_path)
        config_path = checkpoint_path / 'config.ini'

        if config_path.exists():
            # Read model params from config.
            cfg = configparser.ConfigParser()
            cfg.read(config_path)
            model_name = 'llama'
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = cfg.get(model_name, "weight_data_type")
            model_args = dict(
                head_num=cfg.getint(model_name, 'head_num'),
                size_per_head=cfg.getint(model_name, "size_per_head"),
                layer_num=cfg.getint(model_name, "num_layer"),
                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                vocab_size=cfg.getint(model_name, "vocab_size"),
                start_id=cfg.getint(model_name, "start_id"),
                end_id=cfg.getint(model_name, "end_id"),
                weights_data_type=cfg.get(model_name, "weight_data_type"),
                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
                inter_size=cfg.getint(model_name, 'inter_size'),
                inference_data_type=inference_data_type)
        else:
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = LYRA_LLAMA_PARAM.weights_data_type
            model_args = dict(head_num=LYRA_LLAMA_PARAM.num_heads,
                              size_per_head=LYRA_LLAMA_PARAM.size_per_head,
                              vocab_size=LYRA_LLAMA_PARAM.vocab_size,
                              start_id=LYRA_LLAMA_PARAM.start_id or tokenizer.bos_token_id,
                              end_id=LYRA_LLAMA_PARAM.end_id or tokenizer.eos_token_id,
                              layer_num=LYRA_LLAMA_PARAM.num_layers,
                              tensor_para_size=LYRA_LLAMA_PARAM.tensor_para_size,
                              weights_data_type=LYRA_LLAMA_PARAM.weights_data_type,
                              layernorm_eps=LYRA_LLAMA_PARAM.layernorm_eps,
                              rotary_embedding_dim=LYRA_LLAMA_PARAM.rotary_embedding,
                              inter_size=LYRA_LLAMA_PARAM.inter_size,
                              inference_data_type=inference_data_type)

        # update common parameters
        model_args.update(dict(
            lib_path=LIB_SO_PATH,
            model_path=self.model_path,
            max_seq_len=0,  # for position seq embedding
            pipeline_para_size=LYRA_LLAMA_PARAM.pipeline_para_size,
            use_gptj_residual=LYRA_LLAMA_PARAM.use_gptj_residual,
            # shared_contexts_ratio=LYRA_LLAMA_PARAM.shared_contexts_ratio,
        ))

        print('[FT][INFO] Load Our FT Highly Optimized LLaMA model')
        for k, v in model_args.items():
            print(f' - {k.ljust(25, ".")}: {v}')

        # Check sanity and consistency between the model and tokenizer.
        checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
                     'tensor_para_size', 'tensor_para_size', 'weights_data_type']
        if None in [model_args[k] for k in checklist]:
            none_params = [p for p in checklist if model_args[p] is None]
            print(f'[FT][WARNING] Found None parameters {none_params}. They must '
                  f'be provided either by config file or CLI arguments.')
        if model_args['start_id'] != tokenizer.bos_token_id:
            print('[FT][WARNING] Given start_id is not matched with the bos token '
                  'id of the pretrained tokenizer.')
        if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
            print('[FT][WARNING] Given end_id is not matched with neither pad '
                  'token id nor eos token id of the pretrained tokenizer.')

        print(f'Loading model from {self.model_path}')
        model = LLaMAModel(**model_args)

        return model, tokenizer

    def generate(self, prompts: typing.List[str] | str,
                 output_length: int = 512,
                 beam_width: int = 1,
                 top_k: typing.Optional[torch.IntTensor] = 1,
                 top_p: typing.Optional[torch.FloatTensor] = 1.0,
                 beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
                 temperature: typing.Optional[torch.FloatTensor] = 1.0,
                 len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
                 repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
                 presence_penalty: typing.Optional[torch.FloatTensor] = None,
                 min_length: typing.Optional[torch.IntTensor] = None,
                 bad_words_list: typing.Optional[torch.IntTensor] = None,
                 do_sample: bool = False,
                 return_output_length: bool = False,
                 return_cum_log_probs: int = 0):
        if isinstance(prompts, str):
            prompts = [prompts, ]

        inputs = prompts

        batch_size = len(inputs)
        ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
        ones_float = torch.ones(size=[batch_size], dtype=torch.float32)

        # Encode the raw prompt texts one by one in order to keep the length of each original prompt.
        input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze() for text in inputs]
        input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
        # With the per-prompt lengths known, batch the inputs into one tensor, padding on the right.
        input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)

        random_seed = None
        if do_sample:
            random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)

        outputs = self.model(start_ids=input_token_ids,
                             start_lengths=input_lengths,
                             output_len=output_length,
                             beam_width=beam_width,
                             top_k=top_k * ones_int,
                             top_p=top_p * ones_float,
                             beam_search_diversity_rate=beam_search_diversity_rate * ones_float,
                             temperature=temperature * ones_float,
                             len_penalty=len_penalty * ones_float,
                             repetition_penalty=repetition_penalty * ones_float,
                             random_seed=random_seed,
                             return_output_length=return_output_length,
                             return_cum_log_probs=return_cum_log_probs)

        if return_cum_log_probs > 0:
            outputs = outputs[0]  # output_token_ids.

        # Slice the generated token ids of the 1st beam result.
        # output = input tokens + generated tokens.
        output_token_ids = [out[0, length:].cpu()
                            for out, length in zip(outputs, input_lengths)]

        output_texts = self.tokenizer.batch_decode(
            output_token_ids, skip_special_tokens=True)

        return output_texts
lyra_llama/model.py ADDED
@@ -0,0 +1,156 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import copy
import os

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn

class LLaMAModel(nn.Module):
    def __init__(self,
                 head_num, size_per_head, inter_size,
                 vocab_size, rotary_embedding_dim,
                 start_id, end_id, layer_num,
                 max_seq_len, layernorm_eps,
                 tensor_para_size, pipeline_para_size,
                 use_gptj_residual,
                 lib_path,
                 model_path,
                 inference_data_type: str = "fp16",
                 weights_data_type: np.dtype = np.float16):
        super().__init__()
        self.head_num = head_num
        self.size_per_head = size_per_head
        self.inter_size = inter_size
        self.vocab_size = vocab_size
        self.rotary_embedding_dim = rotary_embedding_dim
        self.start_id = start_id
        self.end_id = end_id
        self.max_seq_len = max_seq_len
        self.layer_num = layer_num
        self.use_gptj_residual = use_gptj_residual
        self.layernorm_eps = layernorm_eps

        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size
        self.build_model = False
        self.weights_data_type = weights_data_type
        self.inference_data_type = inference_data_type

        assert torch.cuda.is_available(), "CUDA is required for this model."

        assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
        assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."

        # Load the C++ model into Pytorch model.
        torch.classes.load_library(os.path.abspath(lib_path))

        # Prepare for tensor/pipeline parallel
        try:
            dist.init_process_group(backend='mpi')
        except:
            print("[INFO] WARNING: Have initialized the process group")
        self.rank = dist.get_rank()
        self.device_count = torch.cuda.device_count()
        self.device = self.rank % self.device_count
        torch.cuda.set_device(self.device)

        world_size = dist.get_world_size()
        # print(tensor_para_size * pipeline_para_size)
        assert world_size == tensor_para_size * pipeline_para_size, "tensor_para_size * pipeline_para_size must be equal to world_size."

        self.tensor_para_rank = self.rank % self.tensor_para_size
        self.pipeline_para_rank = self.rank // self.tensor_para_size

        self.model = torch.classes.FasterTransformer.LlamaOp(
            self.head_num,
            self.size_per_head,
            self.inter_size,
            self.layer_num,
            self.vocab_size,
            self.rotary_embedding_dim,
            self.layernorm_eps,
            self.start_id,
            self.end_id,
            self.tensor_para_size,
            self.pipeline_para_size,
            self.max_seq_len,
            self.use_gptj_residual,
            model_path,
            inference_data_type)

        self.build_model = True

    def forward(self,
                start_ids: torch.Tensor,
                start_lengths: torch.Tensor,
                output_len,
                beam_width=1,
                top_k: torch.Tensor = None,
                top_p: torch.Tensor = None,
                beam_search_diversity_rate: torch.Tensor = None,
                temperature: torch.Tensor = None,
                len_penalty: torch.Tensor = None,
                repetition_penalty: torch.Tensor = None,
                random_seed: torch.Tensor = None,
                return_output_length=False,
                return_cum_log_probs=0):
        if not self.build_model:
            self.cuda()
        input_len = start_ids.size(1)
        assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."

        # Inputs to device
        input_ids = start_ids.cuda(self.device)
        input_lengths = start_lengths.cuda(self.device)
        # outputs: output_ids, output_lengths, output_cum_log_probs (optional)
        outputs = self.model.forward(input_ids,
                                     input_lengths,
                                     output_len,
                                     beam_width,  # optional, can be None
                                     top_k,  # optional, can be None
                                     top_p,  # optional, can be None
                                     beam_search_diversity_rate,  # optional, can be None
                                     temperature,  # optional, can be None
                                     len_penalty,  # optional, can be None
                                     repetition_penalty,  # optional, can be None
                                     random_seed,  # optional, can be None
                                     return_cum_log_probs)  # optional, can be None

        if return_cum_log_probs == 0:
            output_ids, output_lengths = outputs
        else:
            output_ids, output_lengths, output_cum_log_probs = outputs
        if return_output_length:
            if return_cum_log_probs > 0:
                return output_ids, output_lengths, output_cum_log_probs
            else:
                return output_ids, output_lengths
        else:
            return output_ids

    def set_input_tensor(self, input_tensor):
        """Set input tensor to be used instead of forward()'s input.

        When doing pipeline parallelism the input from the previous
        stage comes from communication, not from the input, so the
        model's forward_step_func won't have it. This function is thus
        used by internal code to bypass the input provided by the
        forward_step_func"""
        self.input_tensor = input_tensor
models/config.ini ADDED
@@ -0,0 +1,13 @@
[llama]
model_name = ziya-llama
head_num = 40
size_per_head = 128
inter_size = 13824
num_layer = 40
rotary_embedding = 128
layernorm_eps = 1e-06
vocab_size = 39424
start_id = 1
end_id = 2
weight_data_type = fp16
tensor_para_size = 1
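This is the file that `load_model_and_tokenizer()` in `lyra_llama/lyra_llama.py` parses with `configparser` when it sits next to the weights. A small cross-check sketch (illustrative only, not part of the commit), assuming it is run from the repository root:

```python
# Illustrative only: read models/config.ini the same way load_model_and_tokenizer()
# does and compare a few fields against the LYRA_LLAMA_PARAM fallback defaults.
import configparser

from lyra_llama.config import LYRA_LLAMA_PARAM

cfg = configparser.ConfigParser()
cfg.read("models/config.ini")

assert cfg.getint("llama", "head_num") == LYRA_LLAMA_PARAM.num_heads
assert cfg.getint("llama", "num_layer") == LYRA_LLAMA_PARAM.num_layers
assert cfg.getint("llama", "vocab_size") == LYRA_LLAMA_PARAM.vocab_size
print(cfg.get("llama", "model_name"), cfg.get("llama", "weight_data_type"))
```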
models/special_tokens_map.json ADDED
@@ -0,0 +1,27 @@
{
  "additional_special_tokens": [
    "<human>",
    "<bot>"
  ],
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
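The `<human>`/`<bot>` entries here back the prompt template used in `demo.py`. A short sketch (illustrative only, not part of the commit) that loads the bundled tokenizer and verifies those markers are registered; the example prompt string is made up:

```python
# Illustrative only: load the bundled tokenizer and confirm the <human>/<bot>
# markers from this file are registered as additional special tokens.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./models/")
print(tokenizer.additional_special_tokens)   # expected: ['<human>', '<bot>']

# Same template as demo.py, with a hypothetical English prompt.
prompt = '<human>:' + "List 3 machine learning algorithms.".strip() + '\n<bot>:'
ids = tokenizer(prompt, return_tensors="pt").input_ids
print(ids.shape, tokenizer.bos_token, tokenizer.eos_token)
```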
models/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff.
models/tokenizer.model ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6fd7e445833dd0889206aba242c2a51ecbae2437fd328d1759a35475fd8c0423
size 588619
models/tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
{
  "add_bos_token": true,
  "add_eos_token": false,
  "bos_token": {
    "__type": "AddedToken",
    "content": "<s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "clean_up_tokenization_spaces": false,
  "eos_token": {
    "__type": "AddedToken",
    "content": "</s>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": null,
  "sp_model_kwargs": {},
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<unk>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
requirements.txt ADDED
@@ -0,0 +1,4 @@
transformers
numpy
setuptools
torch