Update tokenization_chatglm.py

#93
by fengkaige - opened
Files changed (1)
  1. tokenization_chatglm.py +31 -17
tokenization_chatglm.py CHANGED
@@ -380,8 +380,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         # Load from model defaults
         bos_token_id = self.sp_tokenizer[self.bos_token]
         mask_token_id = self.sp_tokenizer[self.mask_token]
-        gmask_token_id = self.sp_tokenizer[self.gmask_token]
-        assert self.padding_side == "left"
+        gmask_token_id = self.sp_tokenizer[self.gmask_token]
 
         required_input = encoded_inputs[self.model_input_names[0]]
         seq_length = len(required_input)
@@ -424,20 +423,35 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-
-            if "attention_mask" in encoded_inputs:
-                encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
-                                                          pad_width=[(0, 0), (difference, 0), (difference, 0)],
-                                                          mode='constant', constant_values=True)
-            if "token_type_ids" in encoded_inputs:
-                encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
-                    "token_type_ids"
-                ]
-            if "special_tokens_mask" in encoded_inputs:
-                encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
-            if "position_ids" in encoded_inputs:
-                encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
-                                                        pad_width=[(0, 0), (difference, 0)])
-            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+            if self.padding_side == "left":
+                if "attention_mask" in encoded_inputs:
+                    encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
+                                                              pad_width=[(0, 0), (difference, 0), (difference, 0)],
+                                                              mode='constant', constant_values=True)
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                        "token_type_ids"
+                    ]
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+                if "position_ids" in encoded_inputs:
+                    encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
+                                                            pad_width=[(0, 0), (difference, 0)])
+                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+            elif self.padding_side == "right":
+                if "attention_mask" in encoded_inputs:
+                    encoded_inputs["attention_mask"] = np.pad(encoded_inputs["attention_mask"],
+                                                              pad_width=[(0, 0), (0, difference), (0, difference)],
+                                                              mode='constant', constant_values=True)
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
+                if "position_ids" in encoded_inputs:
+                    encoded_inputs["position_ids"] = np.pad(encoded_inputs["position_ids"],
+                                                            pad_width=[(0, 0), (0, difference)])
+                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
+            else:
+                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
 
         return encoded_inputs
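
Taken together, the change drops the hard assert self.padding_side == "left" and makes _pad branch on padding_side, so input_ids, attention_mask, token_type_ids, special_tokens_mask, and position_ids are padded on the left or the right accordingly. A minimal sketch of how the new right-padding path would be exercised; the checkpoint id and loading flags are the usual ChatGLM-6B remote-code setup and are assumptions, not part of this PR:

    # Illustrative usage only -- not part of the diff; the checkpoint id is an assumption.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

    # Before this patch, _pad() asserted left padding; with it, right padding is accepted too.
    tokenizer.padding_side = "right"
    batch = tokenizer(["Hello", "A somewhat longer example sentence"],
                      padding=True, return_tensors="pt")
    print(batch["input_ids"].shape)  # shorter sequences are now padded on the right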