Commit 70750a1 by carsonhxsu
Parent: 03bfeeb

[NewFeature] Support inference of LLaMA (7B/13B) using int8 quantization

Files changed:
- README.md (+8 -5)
- lyra_llama/lyra_llama.py (+21 -19)
- lyra_llama/model.py (+28 -18)
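At a glance, the user-facing change is a new `int8_mode` argument on the model constructor. The sketch below mirrors the README example updated in this commit; the import line and the directory paths are assumptions, not shown in the diffed hunks.

```python
# Sketch only: mirrors the README snippet updated in this commit.
# The import path and the directories are placeholders/assumptions.
from lyra_llama import lyraLLaMA

model_path = "./models/"      # directory holding config.ini and 1-gpu-fp16.bin (see loader below)
tokenizer_path = "./models/"
dtype = 'fp16'
int8_mode = 1                 # 1 enables int8 weight quantization; 0 (the default) keeps fp16

model = lyraLLaMA(model_path, tokenizer_path, dtype, int8_mode)
```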
README.md CHANGED

```diff
@@ -23,21 +23,23 @@ We use the LLaMA.13B model for measurement, but this optimized inference is appl
 
 * Evaluated at tokens/s
 * test on A100 40G
-* fp16 precision
+* fp16 and int8 precision
 
 ### LLaMA-Ziya-13B
 
 | Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
 | --- | --- | --- | --- | --- | --- |
 | Torch LLaMA | 31.74 | 289.2 | 521.37 | 775.69 | OOM |
-| lyraLLaMA | 73.2 | 565.6 | 1179.59 | 1795.63 | 3061.27 |
+| lyraLLaMA fp16 | 73.2 | 565.6 | 1179.59 | 1795.63 | 3061.27 |
+| lyraLLaMA int8 | 104 | 770.5 | 1389.9 | 2390.4 | 3782.1 |
 
 ### LLaMA-Vicuna-13B
 
 | Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
 | --- | --- | --- | --- | --- | --- |
 | Torch LLaMA | 24.65 | 167.3 | 322.97 | 407.99 | OOM |
-| lyraLLaMA | 53.67 | 421.38 | 804.31 | 1519.28 | 2679.82 |
+| lyraLLaMA fp16 | 53.67 | 421.38 | 804.31 | 1519.28 | 2679.82 |
+| lyraLLaMA int8 | 138.48 | 993.22 | 1741 | 2816.81 | 4146.52 |
 
 ## Docker Environment Recommendation
 
@@ -62,8 +64,9 @@ tokenizer_path = "./models/"
 dtype='fp16'
 prompt = "今天天气大概 25度,有点小雨,吹着风,我想去户外散步,应该穿什么样的衣服 裤子鞋子搭配"
 max_output_length = 512
+int8_mode = 0 # To use int8 mode, set int8_mode=1
 
-model = lyraLLaMA(model_path, tokenizer_path, dtype)
+model = lyraLLaMA(model_path, tokenizer_path, dtype, int8_mode)
 
 prompt = '<human>:' + prompt.strip() + '\n<bot>:'
 
@@ -105,7 +108,7 @@ Outputs:
 3. Support Vector Machines (SVMs): SVMs are a type of supervised learning algorithm that can be used for both classification and regression tasks. They work by finding the best hyperplane that separates the data into different classes. SVMs are commonly used in applications such as image classification and natural language processing.
 
 ## TODO
-1. Support for
+1. Support for int4
 2. Inference for longer context situations
 3. Streaming inference mode.
 
```
lyra_llama/lyra_llama.py CHANGED

```diff
@@ -3,21 +3,23 @@ from __future__ import annotations
 import configparser
 import pathlib
 import typing
+import os
 
 import torch
 import transformers
 from torch.nn.utils.rnn import pad_sequence
 
 from .config import LYRA_LLAMA_PARAM, LIB_SO_PATH
-from .model import LLaMAModel
+from .model import LlamaModel
 
-class lyraLLaMA:
+
+class lyraLlama:
     def __init__(self, model_path, tokenizer_path=None, dtype='fp16', int8_mode=0) -> None:
         self.model_path = model_path
         self.tokenizer_path = tokenizer_path
         self.dtype = dtype
-        if dtype != 'int8':
-            int8_mode = 0
+        # if dtype != 'int8':
+        #     int8_mode = 0
         self.int8_mode = int8_mode
 
         self.model, self.tokenizer = self.load_model_and_tokenizer()
@@ -32,7 +34,7 @@ class lyraLLaMA:
         print(f'Loading tokenizer from {tokenizer_path}')
         tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path)
 
-        checkpoint_path = pathlib.Path(self.
+        checkpoint_path = pathlib.Path(self.model_path)
         config_path = checkpoint_path / 'config.ini'
 
         if config_path.exists():
@@ -46,15 +48,15 @@ class lyraLLaMA:
             model_args = dict(
                 head_num=cfg.getint(model_name, 'head_num'),
                 size_per_head=cfg.getint(model_name, "size_per_head"),
+                inter_size=cfg.getint(model_name, 'inter_size'),
                 layer_num=cfg.getint(model_name, "num_layer"),
+                rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
+                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                 vocab_size=cfg.getint(model_name, "vocab_size"),
                 start_id=cfg.getint(model_name, "start_id"),
                 end_id=cfg.getint(model_name, "end_id"),
                 weights_data_type=cfg.get(model_name, "weight_data_type"),
-                rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
-                inter_size=cfg.getint(model_name, 'inter_size'),
+                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                 inference_data_type=inference_data_type)
         else:
             inference_data_type = self.dtype
@@ -62,28 +64,29 @@ class lyraLLaMA:
             inference_data_type = LYRA_LLAMA_PARAM.weights_data_type
             model_args = dict(head_num=LYRA_LLAMA_PARAM.num_heads,
                               size_per_head=LYRA_LLAMA_PARAM.size_per_head,
+                              inter_size=LYRA_LLAMA_PARAM.inter_size,
+                              layer_num=LYRA_LLAMA_PARAM.num_layers,
+                              rotary_embedding_dim=LYRA_LLAMA_PARAM.rotary_embedding,
+                              layernorm_eps=LYRA_LLAMA_PARAM.layernorm_eps,
                               vocab_size=LYRA_LLAMA_PARAM.vocab_size,
                               start_id=LYRA_LLAMA_PARAM.start_id or tokenizer.bos_token_id,
                               end_id=LYRA_LLAMA_PARAM.end_id or tokenizer.eos_token_id,
-                              layer_num=LYRA_LLAMA_PARAM.num_layers,
-                              tensor_para_size=LYRA_LLAMA_PARAM.tensor_para_size,
                               weights_data_type=LYRA_LLAMA_PARAM.weights_data_type,
-                              rotary_embedding_dim=LYRA_LLAMA_PARAM.rotary_embedding,
-                              inter_size=LYRA_LLAMA_PARAM.inter_size,
+                              tensor_para_size=LYRA_LLAMA_PARAM.tensor_para_size,
                               inference_data_type=inference_data_type)
 
         # update common parameters
         model_args.update(dict(
             lib_path=LIB_SO_PATH,
-            model_path=self.model_path,
+            model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
             max_seq_len=0, # for position seq embedding
             pipeline_para_size=LYRA_LLAMA_PARAM.pipeline_para_size,
             use_gptj_residual=LYRA_LLAMA_PARAM.use_gptj_residual,
+            int8_mode=self.int8_mode
             # shared_contexts_ratio=LYRA_LLAMA_PARAM.shared_contexts_ratio,
         ))
 
-        print('[FT][INFO] Load Our FT Highly Optimized
+        print('[FT][INFO] Load Our FT Highly Optimized LLaMA model')
         for k, v in model_args.items():
             print(f' - {k.ljust(25, ".")}: {v}')
 
@@ -101,9 +104,8 @@ class lyraLLaMA:
             print('[FT][WARNING] Given end_id is not matched with neither pad '
                   'token id nor eos token id of the pretrained tokenizer.')
 
-        print(f'Loading
-        model =
+        print(f'Loading model from {self.model_path}')
+        model = LlamaModel(**model_args)
         return model, tokenizer
 
     def generate(self, prompts: typing.List[str] | str,
```
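For reference, the config.ini branch above expects the keys shown below in the model's section. The sketch writes such a file; the section name and every value are illustrative assumptions (typical LLaMA-13B shapes), not taken from this commit.

```python
# Illustrative config.ini matching the keys read by load_model_and_tokenizer above.
# Section name and all values are assumptions (LLaMA-13B-like), NOT from this commit.
import configparser

cfg = configparser.ConfigParser()
cfg["llama_13b"] = {               # hypothetical section name; the loader selects it via `model_name`
    "head_num": "40",
    "size_per_head": "128",
    "inter_size": "13824",
    "num_layer": "40",
    "rotary_embedding": "128",
    "layernorm_eps": "1e-6",
    "vocab_size": "32000",
    "start_id": "1",
    "end_id": "2",
    "weight_data_type": "fp16",
    "tensor_para_size": "1",
}

with open("config.ini", "w") as f:  # placed in model_path next to the converted weight file
    cfg.write(f)
```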
lyra_llama/model.py CHANGED

```diff
@@ -16,24 +16,34 @@ from __future__ import print_function
 
 import copy
 import os
+import pathlib
+import typing
 
 import numpy as np
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 
-class LLaMAModel(nn.Module):
+str_type_map = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
+
+class LlamaModel(nn.Module):
     def __init__(self,
-                 head_num,
+                 head_num,
+                 size_per_head,
+                 inter_size,
+                 vocab_size,
+                 rotary_embedding_dim,
                  start_id, end_id, layer_num,
-                 max_seq_len,
+                 max_seq_len: int,
+                 layernorm_eps,
+                 tensor_para_size: int,
+                 pipeline_para_size: int,
                  use_gptj_residual,
-                 lib_path,
+                 lib_path: typing.Union[str, pathlib.Path],
                  model_path,
+                 int8_mode: int = 0,
                  inference_data_type: str = "fp16",
-                 weights_data_type: np.dtype = np.float32):
+                 weights_data_type: typing.Union[str, np.dtype] = np.float32):
         super().__init__()
         self.head_num = head_num
         self.size_per_head = size_per_head
@@ -46,7 +56,9 @@ class LLaMAModel(nn.Module):
         self.layer_num = layer_num
         self.use_gptj_residual = use_gptj_residual
         self.layernorm_eps = layernorm_eps
+        self.int8_mode = int8_mode
 
+        # multi-gpu params
         self.tensor_para_size = tensor_para_size
         self.pipeline_para_size = pipeline_para_size
         self.build_model = False
@@ -79,23 +91,22 @@ class LLaMAModel(nn.Module):
         self.pipeline_para_rank = self.rank // self.tensor_para_size
 
         self.model = torch.classes.FasterTransformer.LlamaOp(
-            self.head_num,
-            self.size_per_head,
-            self.inter_size,
+            self.head_num, self.size_per_head, self.inter_size,
             self.layer_num,
             self.vocab_size,
             self.rotary_embedding_dim,
             self.layernorm_eps,
-            self.start_id,
-            self.end_id,
-            self.tensor_para_size,
-            self.pipeline_para_size,
+            self.start_id, self.end_id,
+            self.tensor_para_size, self.pipeline_para_size,
             self.max_seq_len,
             self.use_gptj_residual,
+            self.int8_mode,
             model_path,
+            self.weights_data_type,
+            self.inference_data_type)
+
         self.build_model = True
+        torch.cuda.empty_cache()
 
     def forward(self,
                 start_ids: torch.Tensor,
@@ -111,8 +122,7 @@ class LLaMAModel(nn.Module):
                 random_seed: torch.Tensor = None,
                 return_output_length=False,
                 return_cum_log_probs=0):
-        self.cuda()
+
         input_len = start_ids.size(1)
         assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."
 
```
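Taken together, lyraLlama assembles model_args and LlamaModel forwards them, including the new int8_mode flag, into the FasterTransformer LlamaOp. A direct-construction sketch follows; every value is a placeholder (LLaMA-13B-style numbers and an assumed library path), not something this commit specifies.

```python
# Hypothetical direct construction of LlamaModel; in practice lyraLlama builds
# these arguments from config.ini / LYRA_LLAMA_PARAM as shown in lyra_llama.py above.
from lyra_llama.model import LlamaModel

model = LlamaModel(
    head_num=40, size_per_head=128, inter_size=13824,   # illustrative LLaMA-13B shapes
    vocab_size=32000, rotary_embedding_dim=128,
    start_id=1, end_id=2, layer_num=40,
    max_seq_len=0,                                      # 0, as the loader above passes it
    layernorm_eps=1e-6,
    tensor_para_size=1, pipeline_para_size=1,
    use_gptj_residual=False,
    lib_path="/path/to/ft_libs.so",                     # placeholder path to the FT extension
    model_path="./models/1-gpu-fp16.bin",
    int8_mode=1,                                        # enable int8 weight quantization
    inference_data_type="fp16",
    weights_data_type="fp16",
)
```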