revert convert_tokens_to_string
Browse files- tokenization_qwen.py +6 -5
tokenization_qwen.py
CHANGED
@@ -198,15 +198,16 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
198 |
|
199 |
return tokens
|
200 |
|
201 |
-
def convert_tokens_to_string(self, tokens: List[
|
202 |
"""
|
203 |
Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
|
204 |
often want to remove sub-word tokenization artifacts at the same time.
|
205 |
"""
|
206 |
-
text =
|
207 |
-
for
|
208 |
-
|
209 |
-
|
|
|
210 |
|
211 |
@property
|
212 |
def vocab_size(self):
|
|
|
198 |
|
199 |
return tokens
|
200 |
|
201 |
+
def convert_tokens_to_string(self, tokens: List[str]) -> str:
    """
    Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
    often want to remove sub-word tokenization artifacts at the same time.
    """
    # Concatenate the byte-level token strings, then map every character back
    # to its raw byte value via the tokenizer's byte decoder table.
    joined = "".join(tokens)
    raw_bytes = bytearray(self.byte_decoder[ch] for ch in joined)
    # Decode the recovered byte sequence as UTF-8, honoring the configured
    # error-handling policy (e.g. "replace" / "strict").
    return raw_bytes.decode("utf-8", errors=self.errors)
|
211 |
|
212 |
@property
|
213 |
def vocab_size(self):
|