model = model.half().quantize(4).cuda() fails at runtime with "AttributeError: 'Linear' object has no attribute 'bias'"
#22 opened by Frank1983823
# Usage 2: users with 16 GB of VRAM or less can download the full model; fp16, int8, and int4 are supported
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("/media/lk/disk1/lk_git/6_NLPandCNN/LLM/chatglm-fitness-RLHF",
                                  device_map='auto', trust_remote_code=True)
model = model.half().quantize(4).cuda()  # int4
tokenizer = AutoTokenizer.from_pretrained("fb700/chatglm-fitness-RLHF", trust_remote_code=True)
sents = ['新冠肺炎怎么预防。\n答:']  # "How can COVID-19 be prevented?\nAnswer:"
for s in sents:
    # ChatGLM's chat() returns a (response, history) tuple
    response, history = model.chat(tokenizer, s, max_length=128, eos_token_id=tokenizer.eos_token_id)
    print(response)
Running it shows this error:
Failed to load cpm_kernels:File `cuda/embedding.fatbin` not found in `cpm_kernels.kernels.base`
NameError Traceback (most recent call last)
Cell In[3], line 1
----> 1 model = model.half().quantize(4).cuda() # int4
File ~/.cache/huggingface/modules/transformers_modules/chatglm-fitness-RLHF/modeling_chatglm.py:1434, in ChatGLMForConditionalGeneration.quantize(self, bits, empty_init, **kwargs)
1430 self.quantized = True
1432 self.config.quantization_bit = bits
-> 1434 self.transformer = quantize(self.transformer, bits, empty_init=empty_init, **kwargs)
1435 return self
File ~/.cache/huggingface/modules/transformers_modules/chatglm-fitness-RLHF/quantization.py:157, in quantize(model, weight_bit_width, empty_init, **kwargs)
154 """Replace fp16 linear with quantized linear"""
156 for layer in model.layers:
--> 157 layer.attention.query_key_value = QuantizedLinear(
158 weight_bit_width=weight_bit_width,
159 weight_tensor=layer.attention.query_key_value.weight.to(torch.cuda.current_device()),
160 bias_tensor=layer.attention.query_key_value.bias,
161 in_features=layer.attention.query_key_value.in_features,
162 out_features=layer.attention.query_key_value.out_features,
163 bias=True,
164 dtype=torch.half,
165 device=layer.attention.query_key_value.weight.device,
166 empty_init=empty_init
167 )
168 layer.attention.dense = QuantizedLinear(
169 weight_bit_width=weight_bit_width,
170 weight_tensor=layer.attention.dense.weight.to(torch.cuda.current_device()),
(...)
177 empty_init=empty_init
178 )
179 layer.mlp.dense_h_to_4h = QuantizedLinear(
180 weight_bit_width=weight_bit_width,
181 weight_tensor=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
(...)
188 empty_init=empty_init
189 )
File ~/.cache/huggingface/modules/transformers_modules/chatglm-fitness-RLHF/quantization.py:137, in QuantizedLinear.__init__(self, weight_bit_width, weight_tensor, bias_tensor, empty_init, *args, **kwargs)
135 self.weight = torch.round(weight_tensor / self.weight_scale[:, None]).to(torch.int8)
136 if weight_bit_width == 4:
--> 137 self.weight = compress_int4_weight(self.weight)
139 self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False)
140 self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False)
File ~/.cache/huggingface/modules/transformers_modules/chatglm-fitness-RLHF/quantization.py:76, in compress_int4_weight(weight)
73 stream = torch.cuda.current_stream()
75 gridDim = (n, 1, 1)
---> 76 blockDim = (min(round_up(m, 32), 1024), 1, 1)
78 kernels.int4WeightCompression(
79 gridDim,
80 blockDim,
(...)
83 [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
84 )
85 return out
NameError: name 'round_up' is not defined
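The NameError looks like a downstream symptom of the "Failed to load cpm_kernels" warning at the top: quantization.py imports round_up from cpm_kernels.kernels.base inside a try/except, so when that import fails the name is simply never defined. A minimal sanity check, assuming cpm_kernels exposes round_up as the traceback implies:

# Minimal sketch: verify that cpm_kernels is installed and actually imports,
# since quantization.py swallows the import error and only warns.
import importlib.util

if importlib.util.find_spec("cpm_kernels") is None:
    print("cpm_kernels is not installed; try: pip install cpm_kernels")
else:
    try:
        # the exact name the NameError above says is missing
        from cpm_kernels.kernels.base import round_up
        print("cpm_kernels loads; round_up(3, 32) =", round_up(3, 32))
    except Exception as e:
        print("cpm_kernels is installed but fails to load:", e)

If the package is installed but the bundled cuda/embedding.fatbin file is missing, reinstalling it (pip install --force-reinstall cpm_kernels) is worth trying so the packaged fatbin files are restored.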