model = model.half().quantize(4).cuda() fails at runtime with "AttributeError: 'Linear' object has no attribute 'bias'"
#22 opened by Frank1983823
# Usage 2: users with 16 GB of VRAM or less can download the full model; fp16, int8, and int4 are supported
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("/media/lk/disk1/lk_git/6_NLPandCNN/LLM/chatglm-fitness-RLHF",
                                  device_map='auto', trust_remote_code=True)
model = model.half().quantize(4).cuda()  # int4
tokenizer = AutoTokenizer.from_pretrained("fb700/chatglm-fitness-RLHF", trust_remote_code=True)
sents = ['新冠肺炎怎么预防。\n答:']  # "How can COVID-19 be prevented?\nAnswer:"
for s in sents:
    # ChatGLM's chat() returns a (response, history) tuple
    response, history = model.chat(tokenizer, s, max_length=128, eos_token_id=tokenizer.eos_token_id)
    print(response)
Running it shows this error:
Failed to load cpm_kernels:File `cuda/embedding.fatbin` not found in `cpm_kernels.kernels.base`
NameError Traceback (most recent call last)
Cell In[3], line 1
----> 1 model = model.half().quantize(4).cuda() # int4
File ~/.cache/huggingface/modules/transformers_modules/chatglm-fitness-RLHF/modeling_chatglm.py:1434, in ChatGLMForConditionalGeneration.quantize(self, bits, empty_init, **kwargs)
1430 self.quantized = True
1432 self.config.quantization_bit = bits
-> 1434 self.transformer = quantize(self.transformer, bits, empty_init=empty_init, **kwargs)
1435 return self
File ~/.cache/huggingface/modules/transformers_modules/chatglm-fitness-RLHF/quantization.py:157, in quantize(model, weight_bit_width, empty_init, **kwargs)
154 """Replace fp16 linear with quantized linear"""
156 for layer in model.layers:
--> 157 layer.attention.query_key_value = QuantizedLinear(
158 weight_bit_width=weight_bit_width,
159 weight_tensor=layer.attention.query_key_value.weight.to(torch.cuda.current_device()),
160 bias_tensor=layer.attention.query_key_value.bias,
161 in_features=layer.attention.query_key_value.in_features,
162 out_features=layer.attention.query_key_value.out_features,
163 bias=True,
164 dtype=torch.half,
165 device=layer.attention.query_key_value.weight.device,
166 empty_init=empty_init
167 )
168 layer.attention.dense = QuantizedLinear(
169 weight_bit_width=weight_bit_width,
170 weight_tensor=layer.attention.dense.weight.to(torch.cuda.current_device()),
(...)
177 empty_init=empty_init
178 )
179 layer.mlp.dense_h_to_4h = QuantizedLinear(
180 weight_bit_width=weight_bit_width,
181 weight_tensor=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
(...)
188 empty_init=empty_init
189 )
File ~/.cache/huggingface/modules/transformers_modules/chatglm-fitness-RLHF/quantization.py:137, in QuantizedLinear.__init__(self, weight_bit_width, weight_tensor, bias_tensor, empty_init, *args, **kwargs)
135 self.weight = torch.round(weight_tensor / self.weight_scale[:, None]).to(torch.int8)
136 if weight_bit_width == 4:
--> 137 self.weight = compress_int4_weight(self.weight)
139 self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False)
140 self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False)
File ~/.cache/huggingface/modules/transformers_modules/chatglm-fitness-RLHF/quantization.py:76, in compress_int4_weight(weight)
73 stream = torch.cuda.current_stream()
75 gridDim = (n, 1, 1)
---> 76 blockDim = (min(round_up(m, 32), 1024), 1, 1)
78 kernels.int4WeightCompression(
79 gridDim,
80 blockDim,
(...)
83 [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
84 )
85 return out
NameError: name 'round_up' is not defined
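The NameError looks like a downstream symptom of the "Failed to load cpm_kernels" warning at the top: quantization.py imports round_up from cpm_kernels.kernels.base inside a try/except, so when that import fails the name is simply never defined. A minimal sanity check, assuming cpm_kernels exposes round_up as the traceback implies:

# Minimal sketch: verify that cpm_kernels is installed and actually imports,
# since quantization.py swallows the import error and only warns.
import importlib.util

if importlib.util.find_spec("cpm_kernels") is None:
    print("cpm_kernels is not installed; try: pip install cpm_kernels")
else:
    try:
        # the exact name the NameError above says is missing
        from cpm_kernels.kernels.base import round_up
        print("cpm_kernels loads; round_up(3, 32) =", round_up(3, 32))
    except Exception as e:
        print("cpm_kernels is installed but fails to load:", e)

If the package is installed but the bundled cuda/embedding.fatbin file is missing, reinstalling it (pip install --force-reinstall cpm_kernels) is worth trying so the packaged fatbin files are restored.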