RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'

#1
by leonChen - opened

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

model_path = "WiNGPT2-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
model = model.eval()

device = 'cuda'

device = 'cpu'  # this overrides the line above, so the fp16 model ends up on CPU
model.to(device)

generation_config = GenerationConfig(
    num_beams=1,
    top_p=0.75,
    top_k=30,
    repetition_penalty=1.1,
    max_new_tokens=1024
)
text = 'User: :xx。xxx?<|endoftext|>\n Assistant: '
inputs = tokenizer.encode(text, return_tensors="pt").to(device)
outputs = model.generate(inputs, generation_config=generation_config)
output = tokenizer.decode(outputs[0])
# strip the prompt text so only the assistant's reply remains
response = output.replace(text, '')

RuntimeError Traceback (most recent call last)
Cell In[4], line 3
2 inputs = tokenizer.encode(text, return_tensors="pt").to(device)
----> 3 outputs = model.generate(inputs, generation_config=generation_config)
4 output = tokenizer.decode(outputs[0])
5 inputs_str = ' '.join([str(i) for i in inputs.tolist()])

File ~/.cache/huggingface/modules/transformers_modules/WiNGPT2-7B-Chat/modeling_qwen.py:1136, in QWenLMHeadModel.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, **kwargs)
1133 else:
1134 logits_processor.append(stop_words_logits_processor)
-> 1136 return super().generate(
1137 inputs,
1138 generation_config=generation_config,
1139 logits_processor=logits_processor,
1140 stopping_criteria=stopping_criteria,
1141 prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
1142 synced_gpus=synced_gpus,
1143 assistant_model=assistant_model,
1144 streamer=streamer,
1145 **kwargs,
1146 )

File ~/miniconda3/lib/python3.9/site-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)
24 @functools.wraps(func)
25 def decorate_context(*args, **kwargs):
26 with self.clone():
---> 27 return func(*args, **kwargs)

File ~/miniconda3/lib/python3.9/site-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
1190 # If we don't have any hooks, we want to skip the rest of the logic in
1191 # this function, and just call forward.
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []

File ~/.cache/huggingface/modules/transformers_modules/WiNGPT2-7B-Chat/modeling_qwen.py:926, in QWenLMHeadModel.forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, use_cache, output_attentions, output_hidden_states, return_dict)
904 def forward(
905 self,
906 input_ids: Optional[torch.LongTensor] = None,
(...)
919 return_dict: Optional[bool] = None,
920 ) -> Union[Tuple, CausalLMOutputWithPast]:
922 return_dict = (
923 return_dict if return_dict is not None else self.config.use_return_dict
924 )
--> 926 transformer_outputs = self.transformer(
927 input_ids,
928 past_key_values=past_key_values,
929 attention_mask=attention_mask,
930 token_type_ids=token_type_ids,
931 position_ids=position_ids,
932 head_mask=head_mask,
933 inputs_embeds=inputs_embeds,
934 encoder_hidden_states=encoder_hidden_states,
935 encoder_attention_mask=encoder_attention_mask,
936 use_cache=use_cache,
937 output_attentions=output_attentions,
938 output_hidden_states=output_hidden_states,
939 return_dict=return_dict,
940 )
941 hidden_states = transformer_outputs[0]
943 lm_logits = self.lm_head(hidden_states)

File ~/.cache/huggingface/modules/transformers_modules/WiNGPT2-7B-Chat/modeling_qwen.py:762, in QWenModel.forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict)
750 outputs = torch.utils.checkpoint.checkpoint(
751 create_custom_forward(block),
752 hidden_states,
(...)
759 encoder_attention_mask,
760 )
761 else:
--> 762 outputs = block(
763 hidden_states,
764 layer_past=layer_past,
765 rotary_pos_emb=rotary_pos_emb,
766 registered_causal_mask=self.registered_causal_mask,
767 attention_mask=attention_mask,
768 head_mask=head_mask[i],
769 encoder_hidden_states=encoder_hidden_states,
770 encoder_attention_mask=encoder_attention_mask,
771 use_cache=use_cache,
772 output_attentions=output_attentions,
773 )
775 hidden_states = outputs[0]
776 if use_cache is True:

File ~/.cache/huggingface/modules/transformers_modules/WiNGPT2-7B-Chat/modeling_qwen.py:483, in QWenBlock.forward(self, hidden_states, rotary_pos_emb, registered_causal_mask, layer_past, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions)
468 def forward(
469 self,
470 hidden_states: Optional[Tuple[torch.FloatTensor]],
(...)
479 output_attentions: Optional[bool] = False,
480 ):
481 layernorm_output = self.ln_1(hidden_states)
--> 483 attn_outputs = self.attn(
484 layernorm_output,
485 rotary_pos_emb,
486 registered_causal_mask=registered_causal_mask,
487 layer_past=layer_past,
488 attention_mask=attention_mask,
489 head_mask=head_mask,
490 use_cache=use_cache,
491 output_attentions=output_attentions,
492 )
493 attn_output = attn_outputs[0]
495 outputs = attn_outputs[1:]

File ~/.cache/huggingface/modules/transformers_modules/WiNGPT2-7B-Chat/modeling_qwen.py:349, in QWenAttention.forward(self, hidden_states, rotary_pos_emb, registered_causal_mask, layer_past, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions, use_cache)
335 def forward(
336 self,
337 hidden_states: Optional[Tuple[torch.FloatTensor]],
(...)
346 use_cache: Optional[bool] = False,
347 ):
--> 349 mixed_x_layer = self.c_attn(hidden_states)
351 query, key, value = mixed_x_layer.split(self.split_size, dim=2)
353 query = self._split_heads(query, self.num_heads, self.head_dim)

File ~/miniconda3/lib/python3.9/site-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
1190 # If we don't have any hooks, we want to skip the rest of the logic in
1191 # this function, and just call forward.
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []

File ~/miniconda3/lib/python3.9/site-packages/torch/nn/modules/linear.py:114, in Linear.forward(self, input)
113 def forward(self, input: Tensor) -> Tensor:
--> 114 return F.linear(input, self.weight, self.bias)

RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'

Set device to "cuda": the model is loaded in fp16, and the addmm_impl_cpu_ op does not support half precision (fp16) on CPU.
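
If CUDA is available, moving the model to it is the simplest fix; if you must stay on CPU, upcasting the fp16 weights to float32 also avoids the missing Half kernel. A minimal sketch, reusing model, tokenizer, text and generation_config from the snippet above:

# Option 1: run on GPU, where fp16 matmuls are supported
device = 'cuda'
model.to(device)

# Option 2: stay on CPU by upcasting the weights to float32,
# since the CPU addmm kernel has no Half implementation
# device = 'cpu'
# model = model.float().to(device)

inputs = tokenizer.encode(text, return_tensors="pt").to(device)
outputs = model.generate(inputs, generation_config=generation_config)
print(tokenizer.decode(outputs[0]))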

winninghealth changed discussion status to closed
