add pad token and default eos token

#5
by tianxie-sf - opened
Files changed (1)
  1. tokenization_xgen.py +26 -5
tokenization_xgen.py CHANGED
@@ -25,7 +25,7 @@ MAX_MODEL_INPUT_SIZES = {
25
  }
26
 
27
 
28
- def tiktoken_tokenizer(base="gpt2", add_special=True):
29
  if not add_special:
30
  return tiktoken.get_encoding(base)
31
 
@@ -83,6 +83,9 @@ def tiktoken_tokenizer(base="gpt2", add_special=True):
83
  special_tokens[sp] = idx
84
  idx += 1
85
 
 
 
 
86
  # In production, load the arguments directly instead of accessing private attributes
87
  # See openai_public.py for examples of arguments for specific encodings
88
  enc = tiktoken.Encoding(
@@ -112,25 +115,40 @@ class XgenTokenizer(PreTrainedTokenizer):
112
  def __init__(
113
  self,
114
  pad_token=None,
 
115
  add_eos_token=False,
116
  add_special_tokens=True,
117
  **kwargs,
118
  ):
119
- pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
 
120
  super().__init__(
121
- pad_token=pad_token,
 
122
  add_eos_token=add_eos_token,
123
  add_special_tokens=add_special_tokens,
124
  **kwargs,
125
  )
126
  self.add_eos_token = add_eos_token
127
- self.encoder = tiktoken_tokenizer(base="gpt2", add_special=add_special_tokens)
128
 
129
  @property
130
  def vocab_size(self):
131
  """Returns vocab size"""
132
  return self.encoder.n_vocab
133
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  def get_vocab(self):
135
  """Returns vocab as a dict"""
136
  vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
@@ -142,6 +160,9 @@ class XgenTokenizer(PreTrainedTokenizer):
142
 
143
  def _convert_token_to_id(self, token):
144
  """Converts a token (str) in an id using the vocab."""
 
 
 
145
  return token
146
 
147
  def _convert_id_to_token(self, index):
@@ -216,4 +237,4 @@ class XgenTokenizer(PreTrainedTokenizer):
216
  if token_ids_1 is not None:
217
  output += [1] * len(token_ids_1 + eos_token_id)
218
 
219
- return output
 
25
  }
26
 
27
 
28
+ def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
29
  if not add_special:
30
  return tiktoken.get_encoding(base)
31
 
 
83
  special_tokens[sp] = idx
84
  idx += 1
85
 
86
+ if pad_token and pad_token not in tokenizer._special_tokens and pad_token not in special_tokens:
87
+ special_tokens[pad_token] = idx
88
+ idx += 1
89
  # In production, load the arguments directly instead of accessing private attributes
90
  # See openai_public.py for examples of arguments for specific encodings
91
  enc = tiktoken.Encoding(
 
115
  def __init__(
116
  self,
117
  pad_token=None,
118
+ eos_token="<|endoftext|>",
119
  add_eos_token=False,
120
  add_special_tokens=True,
121
  **kwargs,
122
  ):
123
+ pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
124
+ eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
125
  super().__init__(
126
+ pad_token=pad_token_added,
127
+ eos_token=eos_token_added,
128
  add_eos_token=add_eos_token,
129
  add_special_tokens=add_special_tokens,
130
  **kwargs,
131
  )
132
  self.add_eos_token = add_eos_token
133
+ self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
134
 
135
  @property
136
  def vocab_size(self):
137
  """Returns vocab size"""
138
  return self.encoder.n_vocab
139
 
140
+ @property
141
+ def eos_token_id(self):
142
+ if self.eos_token is not None:
143
+ return self.encoder.encode(self.eos_token, allowed_special="all")[0]
144
+ return None
145
+
146
+ @property
147
+ def pad_token_id(self):
148
+ if self.pad_token is not None:
149
+ return self.encoder.encode(self.pad_token, allowed_special="all")[0]
150
+ return None
151
+
152
  def get_vocab(self):
153
  """Returns vocab as a dict"""
154
  vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
 
160
 
161
  def _convert_token_to_id(self, token):
162
  """Converts a token (str) in an id using the vocab."""
163
+ if isinstance(token, str):
164
+ ids = self._tokenize(token)
165
+ return ids[0]
166
  return token
167
 
168
  def _convert_id_to_token(self, index):
 
237
  if token_ids_1 is not None:
238
  output += [1] * len(token_ids_1 + eos_token_id)
239
 
240
+ return output