Qwen
/

yangapku committed on
Commit
f2e5005
1 Parent(s): 7f6821c

revert convert_tokens_to_string

Browse files
Files changed (1) hide show
  1. tokenization_qwen.py +6 -5
tokenization_qwen.py CHANGED
@@ -198,15 +198,16 @@ class QWenTokenizer(PreTrainedTokenizer):
198
 
199
  return tokens
200
 
201
- def convert_tokens_to_string(self, tokens: List[bytes]) -> str:
202
  """
203
  Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
204
  often want to remove sub-word tokenization artifacts at the same time.
205
  """
206
- text = b""
207
- for token in tokens:
208
- text += token
209
- return text.decode('utf-8')
 
210
 
211
  @property
212
  def vocab_size(self):
 
198
 
199
  return tokens
200
 
201
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
202
  """
203
  Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
204
  often want to remove sub-word tokenization artifacts at the same time.
205
  """
206
+ text = "".join(tokens)
207
+ text = bytearray([self.byte_decoder[c] for c in text]).decode(
208
+ "utf-8", errors=self.errors
209
+ )
210
+ return text
211
 
212
  @property
213
  def vocab_size(self):