JustinLin610
committed on
Commit
•
d00ebe5
1
Parent(s):
62bf1c6
fix decoder, and provide an option to remove attack rejection
Browse files- tokenization_qwen.py +13 -5
tokenization_qwen.py
CHANGED
@@ -126,6 +126,7 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
126 |
self.mergeable_ranks = mergeable_ranks
|
127 |
self.encoder = self.mergeable_ranks
|
128 |
self.decoder = {v: k for k, v in self.encoder.items()}
|
|
|
129 |
self.tokenizer = enc # type: tiktoken.Encoding
|
130 |
self.eod_id = self.tokenizer.eot_token
|
131 |
self.im_start_id = special_tokens[IMSTART]
|
@@ -182,16 +183,20 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
182 |
text (`str`):
|
183 |
The sequence to be encoded.
|
184 |
kwargs (additional keyword arguments, *optional*):
|
185 |
-
Will be passed to the underlying model specific encode method.
|
186 |
-
|
|
|
|
|
187 |
|
188 |
Returns:
|
189 |
`List[str]`: The list of tokens.
|
190 |
"""
|
191 |
tokens = []
|
192 |
text = unicodedata.normalize("NFC", text)
|
193 |
-
|
|
|
194 |
tokens.append(self.decoder[t])
|
|
|
195 |
return tokens
|
196 |
|
197 |
def convert_tokens_to_string(self, tokens: List[str]) -> str:
|
@@ -216,7 +221,10 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
216 |
|
217 |
def _convert_token_to_id(self, token: str) -> int:
|
218 |
"""Converts a token to an id using the vocab."""
|
219 |
-
return self.encoder.get(
|
|
|
|
|
|
|
220 |
|
221 |
@property
|
222 |
def all_special_tokens(self) -> List[str]:
|
@@ -255,4 +263,4 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
255 |
token_ids = [token_ids]
|
256 |
if skip_special_tokens:
|
257 |
token_ids = [i for i in token_ids if i not in self.all_special_ids]
|
258 |
-
return self.tokenizer.decode(token_ids)
|
|
|
126 |
self.mergeable_ranks = mergeable_ranks
|
127 |
self.encoder = self.mergeable_ranks
|
128 |
self.decoder = {v: k for k, v in self.encoder.items()}
|
129 |
+
self.decoder.update({v: k for k, v in self.special_tokens.items()})
|
130 |
self.tokenizer = enc # type: tiktoken.Encoding
|
131 |
self.eod_id = self.tokenizer.eot_token
|
132 |
self.im_start_id = special_tokens[IMSTART]
|
|
|
183 |
text (`str`):
|
184 |
The sequence to be encoded.
|
185 |
kwargs (additional keyword arguments, *optional*):
|
186 |
+
Will be passed to the underlying model specific encode method.
|
187 |
+
Tiktoken allows users to allow the tokenization of special tokens with the following args:
|
188 |
+
`allowed_special`: set to 'all' or a `set` of special tokens.
|
189 |
+
`disallowed_special`: set to 'all' or a `Collection` of special tokens. NOT RECOMMENDED, AS IT MAY BE CONFLICTED WITH `allowed_special`.
|
190 |
|
191 |
Returns:
|
192 |
`List[str]`: The list of tokens.
|
193 |
"""
|
194 |
tokens = []
|
195 |
text = unicodedata.normalize("NFC", text)
|
196 |
+
|
197 |
+
for t in self.tokenizer.encode(text, **kwargs):
|
198 |
tokens.append(self.decoder[t])
|
199 |
+
|
200 |
return tokens
|
201 |
|
202 |
def convert_tokens_to_string(self, tokens: List[str]) -> str:
|
|
|
221 |
|
222 |
def _convert_token_to_id(self, token: str) -> int:
|
223 |
"""Converts a token to an id using the vocab."""
|
224 |
+
return self.encoder.get(
|
225 |
+
token.encode("UTF-8"),
|
226 |
+
self.tokenizer.encode(self.unk_token, allowed_special="all")[0],
|
227 |
+
)
|
228 |
|
229 |
@property
|
230 |
def all_special_tokens(self) -> List[str]:
|
|
|
263 |
token_ids = [token_ids]
|
264 |
if skip_special_tokens:
|
265 |
token_ids = [i for i in token_ids if i not in self.all_special_ids]
|
266 |
+
return self.tokenizer.decode(token_ids)
|