Update tokenization_chatglm.py
#93
by fengkaige - opened
tokenization_chatglm.py +31 -17

tokenization_chatglm.py CHANGED
@@ -380,8 +380,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         # Load from model defaults
         bos_token_id = self.sp_tokenizer[self.bos_token]
         mask_token_id = self.sp_tokenizer[self.mask_token]
-        gmask_token_id = self.sp_tokenizer[self.gmask_token]
-        assert self.padding_side == "left"
+        gmask_token_id = self.sp_tokenizer[self.gmask_token]
 
         required_input = encoded_inputs[self.model_input_names[0]]
         seq_length = len(required_input)
@@ -424,20 +423,35 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-            [15 lines removed: the previous left-padding-only logic; their content is not shown in this rendering]
+            if self.padding_side == "left":
+                if "attention_mask" in encoded_inputs:
+                    encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
+                                                              pad_width=[(0, 0), (difference, 0), (difference, 0)],
+                                                              mode='constant', constant_values=True)
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                        "token_type_ids"
+                    ]
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+                if "position_ids" in encoded_inputs:
+                    encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
+                                                            pad_width=[(0, 0), (difference, 0)])
+                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+            elif self.padding_side == "right":
+                if "attention_mask" in encoded_inputs:
+                    encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
+                                                              pad_width=[(0, 0), (0, difference), (0, difference)],
+                                                              mode='constant', constant_values=True)
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
+                if "position_ids" in encoded_inputs:
+                    encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
+                                                            pad_width=[(0, 0), (0, difference)])
+                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
+            else:
+                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
 
         return encoded_inputs
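For readers skimming the diff: np.pad's pad_width takes one (before, after) pair per axis, so the added branches pad the 3-D attention mask on both of its sequence axes and the 2-D position_ids on its sequence axis only, on the left or the right depending on self.padding_side. Below is a minimal sketch of the right-padding branch in plain NumPy; the shapes (1, seq, seq) for the attention mask and (2, seq) for the position ids are assumptions for illustration, not code from this repository.

import numpy as np

# Assumed stand-ins for what _pad receives (shapes are assumptions for this
# sketch): a (1, seq, seq) boolean attention mask and (2, seq) position ids.
seq_len, difference = 3, 2
attention_mask = np.tril(np.ones((1, seq_len, seq_len), dtype=bool))
position_ids = np.stack([np.arange(seq_len), np.zeros(seq_len, dtype=np.int64)])

# Right padding: pad the trailing end of each sequence axis. constant_values=True
# fills the new attention-mask positions with True, as in the diff above.
attention_mask = np.pad(attention_mask,
                        pad_width=[(0, 0), (0, difference), (0, difference)],
                        mode='constant', constant_values=True)
position_ids = np.pad(position_ids, pad_width=[(0, 0), (0, difference)])

print(attention_mask.shape)  # (1, 5, 5)
print(position_ids.shape)    # (2, 5)

With the assert removed, setting tokenizer.padding_side = "right" before batching should take the new right-padding branch instead of raising, while left padding keeps its previous behaviour.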