Update modeling_chatglm.py for inputs_embeds
When training models such as BLIP-2, the model needs to take inputs_embeds as its input rather than input_ids, but the current implementation does not appear to support this.
I modified part of the code to support inputs_embeds. input_ids still has to be passed alongside it so that attention_mask and position_ids can be constructed, but what is fed into the transformer is the provided inputs_embeds rather than embeddings looked up from input_ids.
Tested: the original generation path that uses only input_ids is unaffected.
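A minimal usage sketch of the patched model, not part of the diff below. The checkpoint name, the hypothetical prefix-feature injection, and passing inputs_embeds directly to generate are illustrative assumptions; whether generate forwards inputs_embeds into prepare_inputs_for_generation also depends on the installed transformers version.

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()  # assumes a GPU

# input_ids are still required so that attention_mask and position_ids can be built.
input_ids = tokenizer("描述这张图片", return_tensors="pt").input_ids.cuda()

# Build inputs_embeds with the SAME sequence length as input_ids: start from the
# ordinary token embeddings, then (hypothetically) overwrite some positions with
# externally computed features, e.g. projected BLIP-2 query outputs.
inputs_embeds = model.get_input_embeddings()(input_ids)
# inputs_embeds[:, :num_prefix] = projected_image_features  # hypothetical tensors, for illustration only

# The patched prepare_inputs_for_generation asserts that both tensors have the same
# length, uses inputs_embeds for the first forward pass, and falls back to the
# newly generated input_ids on later decoding steps.
outputs = model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, max_length=64)
print(tokenizer.decode(outputs[0]))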
- modeling_chatglm.py +24 -13
modeling_chatglm.py
CHANGED
@@ -913,12 +913,10 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
                 )
                 use_cache = False
 
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
+        if input_ids is not None:
             batch_size, seq_length = input_ids.shape[:2]
         elif inputs_embeds is not None:
-            batch_size, seq_length, _ = inputs_embeds.shape[:2]
+            batch_size, seq_length = inputs_embeds.shape[:2]
         else:
             raise ValueError("You have to specify either input_ids or inputs_embeds")
 
@@ -973,9 +971,6 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         if attention_mask is None:
             attention_mask = torch.zeros(1, 1, device=input_ids.device).bool()
 
-        else:
-            attention_mask = attention_mask.to(input_ids.device)
-
         for i, layer in enumerate(self.layers):
 
             if output_hidden_states:
@@ -1100,11 +1095,16 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
                 [position_ids, new_position_id], dim=-1
             )
 
+        # set to None as prepare_inputs_for_generation uses past for input embeds
+        if "inputs_embeds" in model_kwargs:
+            model_kwargs["inputs_embeds"] = None
+
         return model_kwargs
 
     def prepare_inputs_for_generation(
             self,
             input_ids: torch.LongTensor,
+            inputs_embeds: Optional[torch.Tensor] = None,
             past: Optional[torch.Tensor] = None,
             past_key_values: Optional[torch.Tensor] = None,
             attention_mask: Optional[torch.Tensor] = None,
@@ -1165,12 +1165,23 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
                 use_gmasks=use_gmasks
            )
 
-            return {
-                "input_ids": input_ids,
-                "past_key_values": past,
-                "position_ids": position_ids,
-                "attention_mask": attention_mask
-            }
+            if inputs_embeds is not None:
+                assert input_ids.size(1) == inputs_embeds.size(
+                    1
+                ), f"Make sure that both input_ids ({input_ids.size(1)}) and inputs_embeds ({inputs_embeds.size(1)}) have the same length."
+                return {
+                    "inputs_embeds": inputs_embeds,
+                    "past_key_values": past,
+                    "position_ids": position_ids,
+                    "attention_mask": attention_mask,
+                }
+            else:
+                return {
+                    "input_ids": input_ids,
+                    "past_key_values": past,
+                    "position_ids": position_ids,
+                    "attention_mask": attention_mask,
+                }
 
     def forward(
             self,