add pad token and default eos token
#5, opened by tianxie-sf
Files changed: tokenization_xgen.py (+26 -5)

tokenization_xgen.py CHANGED
@@ -25,7 +25,7 @@ MAX_MODEL_INPUT_SIZES = {
 }
 
 
-def tiktoken_tokenizer(base="gpt2", add_special=True):
+def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
     if not add_special:
         return tiktoken.get_encoding(base)
 
@@ -83,6 +83,9 @@ def tiktoken_tokenizer(base="gpt2", add_special=True):
         special_tokens[sp] = idx
         idx += 1
 
+    if pad_token and pad_token not in tokenizer._special_tokens and pad_token not in special_tokens:
+        special_tokens[pad_token] = idx
+        idx += 1
     # In production, load the arguments directly instead of accessing private attributes
     # See openai_public.py for examples of arguments for specific encodings
     enc = tiktoken.Encoding(
@@ -112,25 +115,40 @@ class XgenTokenizer(PreTrainedTokenizer):
     def __init__(
         self,
         pad_token=None,
+        eos_token="<|endoftext|>",
         add_eos_token=False,
         add_special_tokens=True,
         **kwargs,
     ):
-
+        pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+        eos_token_added = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
         super().__init__(
-            pad_token=
+            pad_token=pad_token_added,
+            eos_token=eos_token_added,
             add_eos_token=add_eos_token,
             add_special_tokens=add_special_tokens,
             **kwargs,
         )
         self.add_eos_token = add_eos_token
-        self.encoder = tiktoken_tokenizer(base="gpt2", add_special=add_special_tokens)
+        self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
 
     @property
     def vocab_size(self):
         """Returns vocab size"""
         return self.encoder.n_vocab
 
+    @property
+    def eos_token_id(self):
+        if self.eos_token is not None:
+            return self.encoder.encode(self.eos_token, allowed_special="all")[0]
+        return None
+
+    @property
+    def pad_token_id(self):
+        if self.pad_token is not None:
+            return self.encoder.encode(self.pad_token, allowed_special="all")[0]
+        return None
+
     def get_vocab(self):
         """Returns vocab as a dict"""
         vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
@@ -142,6 +160,9 @@ class XgenTokenizer(PreTrainedTokenizer):
 
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
+        if isinstance(token, str):
+            ids = self._tokenize(token)
+            return ids[0]
         return token
 
     def _convert_id_to_token(self, index):
@@ -216,4 +237,4 @@ class XgenTokenizer(PreTrainedTokenizer):
         if token_ids_1 is not None:
            output += [1] * len(token_ids_1 + eos_token_id)
 
-        return output
+        return output
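
For context, the tiktoken_tokenizer() change follows the usual tiktoken recipe of rebuilding an Encoding from a base encoding's merge ranks plus an extended special-token table. A minimal standalone sketch of that recipe (not part of this PR; the encoding name and the "<|pad|>" string are placeholders, and the private attributes are the same ones the diff already touches):

import tiktoken

# Standalone sketch (assumption, not from the PR): extend a base encoding
# with one extra special token, the way tiktoken_tokenizer() does for pad_token.
base = tiktoken.get_encoding("gpt2")

special_tokens = dict(base._special_tokens)      # private attrs, as in the diff
pad_token = "<|pad|>"                            # placeholder pad string
if pad_token not in special_tokens:
    special_tokens[pad_token] = base.n_vocab     # next free id after the base vocab

enc = tiktoken.Encoding(
    name="gpt2-with-pad",
    pat_str=base._pat_str,
    mergeable_ranks=base._mergeable_ranks,
    special_tokens=special_tokens,
)

pad_id = enc.encode(pad_token, allowed_special="all")[0]
print(pad_id, enc.decode([pad_id]))              # the new id round-trips to "<|pad|>"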
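And a rough usage sketch of the patched tokenizer, assuming tokenization_xgen.py is importable, tiktoken is installed, and a transformers version this class is compatible with; "<|pad|>" is only an example pad string, and the printed ids depend on the base vocabulary:

from tokenization_xgen import XgenTokenizer

# Rough usage sketch (assumption): construct the tokenizer with an explicit
# pad token; eos_token now defaults to "<|endoftext|>".
tok = XgenTokenizer(pad_token="<|pad|>", add_eos_token=True)

# Both ids resolve through the tiktoken encoder via the new
# eos_token_id / pad_token_id properties.
print(tok.eos_token)       # "<|endoftext|>"
print(tok.eos_token_id)    # id of "<|endoftext|>" in the extended encoding
print(tok.pad_token_id)    # id appended for "<|pad|>" at the end of the vocab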