Chaeseung committed
Commit aa4a95c
Parent: 7102a01

Upload tokenizer

special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": true
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": true
+   },
+   "pad_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": true
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": true
+   }
+ }
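
For context, this map pins `<s>`/`</s>` as the BOS/EOS tokens and reuses `<unk>` as both the pad and unknown token. Below is a minimal sketch (not part of the commit) of checking those values on a constructed tokenizer; it assumes a hypothetical local checkout of this repository at `./orion_tokenizer` and the `tokenization_orion.py` module added below on the import path:

```python
# Minimal sketch: build the custom tokenizer from tokenizer.model and confirm it reports
# the special tokens declared in special_tokens_map.json.
# "./orion_tokenizer" is a hypothetical local checkout of this repository.
from tokenization_orion import OrionTokenizer

tok = OrionTokenizer(vocab_file="./orion_tokenizer/tokenizer.model", pad_token="<unk>")

print(tok.bos_token, tok.eos_token)  # expected: <s> </s>
print(tok.pad_token, tok.unk_token)  # expected: <unk> <unk> (pad reuses the unk token)
```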
tokenization_orion.py ADDED
@@ -0,0 +1,272 @@
+ # Copyright (c) 2024, OrionStar Inc. All rights reserved.
+
+ import os
+ from shutil import copyfile
+ from typing import Any, Dict, List, Optional, Tuple
+ import re
+
+ import sentencepiece as spm
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+ PRETRAINED_VOCAB_FILES_MAP = {
+     "vocab_file": {},
+     "tokenizer_file": {},
+ }
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
+
+
+ class OrionTokenizer(PreTrainedTokenizer):
+     """
+     Construct an Orion tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+     Args:
+         vocab_file (`str`):
+             Path to the vocabulary file.
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         vocab_file,
+         unk_token="<unk>",
+         bos_token="<s>",
+         eos_token="</s>",
+         pad_token=None,
+         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+         add_bos_token=True,
+         add_eos_token=False,
+         clean_up_tokenization_spaces=False,
+         **kwargs,
+     ):
+         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+         bos_token = (
+             AddedToken(bos_token, lstrip=False, rstrip=False)
+             if isinstance(bos_token, str)
+             else bos_token
+         )
+         eos_token = (
+             AddedToken(eos_token, lstrip=False, rstrip=False)
+             if isinstance(eos_token, str)
+             else eos_token
+         )
+         unk_token = (
+             AddedToken(unk_token, lstrip=False, rstrip=False)
+             if isinstance(unk_token, str)
+             else unk_token
+         )
+         pad_token = (
+             AddedToken(pad_token, lstrip=False, rstrip=False)
+             if isinstance(pad_token, str)
+             else pad_token
+         )
+         self.vocab_file = vocab_file
+         self.add_bos_token = add_bos_token
+         self.add_eos_token = add_eos_token
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(vocab_file)
+
+         super().__init__(
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             add_bos_token=add_bos_token,
+             add_eos_token=add_eos_token,
+             sp_model_kwargs=self.sp_model_kwargs,
+             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+             **kwargs,
+         )
+
+     def __getstate__(self):
+         state = self.__dict__.copy()
+         state["sp_model"] = None
+         return state
+
+     def __setstate__(self, d):
+         self.__dict__ = d
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(self.vocab_file)
+
+     @property
+     def vocab_size(self):
+         """Returns vocab size"""
+         return self.sp_model.get_piece_size()
+
+     def get_vocab(self):
+         """Returns vocab as a dict"""
+         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+         vocab.update(self.added_tokens_encoder)
+         return vocab
+
+     def _tokenize(self, text):
+         """Returns a tokenized string."""
+         return self.sp_model.encode(text, out_type=str)
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) to an id using the vocab."""
+         return self.sp_model.piece_to_id(token)
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) to a token (str) using the vocab."""
+         token = self.sp_model.IdToPiece(index)
+         return token
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (string) into a single string."""
+         zhPattern = re.compile(u"[\u4e00-\u9fa5]+")
+         need_convert_punctuation = (",", ";", "!", "?", ":", "(", ")")
+         current_sub_tokens = []
+         out_string = ""
+         prev_is_special = False
+         for i, token in enumerate(tokens):
+             # make sure that special tokens are not decoded using the sentencepiece model
+             if token in self.all_special_tokens:
+                 if not prev_is_special and i != 0:
+                     out_string += " "
+                 out_string += self.sp_model.decode(current_sub_tokens) + token
+                 prev_is_special = True
+                 current_sub_tokens = []
+             elif any(punctuation in token for punctuation in need_convert_punctuation):
+                 out_string += self.sp_model.decode(current_sub_tokens)
+                 token = self.sp_model.decode(token)
+                 # use full-width punctuation when the recent context is Chinese
+                 if zhPattern.search(out_string[-20:]):
+                     token = self.to_zh_punctuation(token)
+                 out_string += token
+                 current_sub_tokens = []
+             else:
+                 current_sub_tokens.append(token)
+                 prev_is_special = False
+         out_string += self.sp_model.decode(current_sub_tokens)
+         return out_string
+
+     def to_zh_punctuation(self, token):
+         return token.replace(",", ",").replace(";", ";").replace("!", "!").replace("?", "?").replace(":", ":").replace("(", "(").replace(")", ")")
+
+     def save_vocabulary(
+         self, save_directory, filename_prefix: Optional[str] = None
+     ) -> Tuple[str]:
+         """
+         Save the vocabulary and special tokens file to a directory.
+
+         Args:
+             save_directory (`str`):
+                 The directory in which to save the vocabulary.
+
+         Returns:
+             `Tuple(str)`: Paths to the files saved.
+         """
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+         out_vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "")
+             + VOCAB_FILES_NAMES["vocab_file"],
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(
+             out_vocab_file
+         ) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+         elif not os.path.isfile(self.vocab_file):
+             with open(out_vocab_file, "wb") as fi:
+                 content_spiece_model = self.sp_model.serialized_model_proto()
+                 fi.write(content_spiece_model)
+
+         return (out_vocab_file,)
+
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+         output = bos_token_id + token_ids_0 + eos_token_id
+
+         if token_ids_1 is not None:
+             output = output + bos_token_id + token_ids_1 + eos_token_id
+
+         return output
+
+     def get_special_tokens_mask(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None,
+         already_has_special_tokens: bool = False,
+     ) -> List[int]:
+         """
+         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer `prepare_for_model` method.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                 Whether or not the token list is already formatted with special tokens for the model.
+
+         Returns:
+             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0,
+                 token_ids_1=token_ids_1,
+                 already_has_special_tokens=True,
+             )
+
+         bos_token_id = [1] if self.add_bos_token else []
+         eos_token_id = [1] if self.add_eos_token else []
+
+         if token_ids_1 is None:
+             return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+         return (
+             bos_token_id
+             + ([0] * len(token_ids_0))
+             + eos_token_id
+             + bos_token_id
+             + ([0] * len(token_ids_1))
+             + eos_token_id
+         )
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
+         sequence pair mask has the following format:
+
+         ```
+         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+         | first sequence    | second sequence |
+         ```
+
+         If `token_ids_1` is `None`, only returns the first portion of the mask (0s).
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of ids.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+         """
+         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+         output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+
+         if token_ids_1 is not None:
+             output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+
+         return output
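
The decoding path in `convert_tokens_to_string` swaps ASCII punctuation for its full-width form whenever the last few decoded characters contain Chinese text. Below is a standalone sketch (not part of the commit) of that rule, runnable without the SentencePiece model; the helper names here are illustrative only:

```python
# Standalone sketch of the punctuation rule used by OrionTokenizer.convert_tokens_to_string:
# ASCII punctuation becomes full-width only when the recent output context is Chinese.
import re

zh_pattern = re.compile(u"[\u4e00-\u9fa5]+")  # same CJK range as the tokenizer

def to_zh_punctuation(token: str) -> str:
    # same character mapping as OrionTokenizer.to_zh_punctuation
    table = {",": ",", ";": ";", "!": "!", "?": "?", ":": ":", "(": "(", ")": ")"}
    return "".join(table.get(ch, ch) for ch in token)

def maybe_convert(context: str, token: str) -> str:
    # mirrors the tokenizer's check: only the last 20 decoded characters are inspected
    return to_zh_punctuation(token) if zh_pattern.search(context[-20:]) else token

print(maybe_convert("你好", ","))         # -> , (full-width; Chinese context)
print(maybe_convert("hello world", ","))  # -> , (unchanged; no Chinese context)
```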
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ded43118b7418f56db97a4eed08a5c265c03120158229ddd4fbcc9658241d5f0
+ size 1520600
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "add_bos_token": false,
+   "add_eos_token": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": true,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": true,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": true,
+       "special": true
+     }
+   },
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_orion.OrionTokenizer",
+       null
+     ]
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "model_max_length": 4096,
+   "pad_token": "<unk>",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "OrionTokenizer",
+   "unk_token": "<unk>"
+ }
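
Given the `auto_map` entry above, the files in this commit can be loaded through `AutoTokenizer` with remote code enabled. A usage sketch (not part of the commit): `./orion_tokenizer` is an assumed local path holding the four files, and the same call works with the repository's Hub id.

```python
# Usage sketch: auto_map routes AutoTokenizer to OrionTokenizer in tokenization_orion.py,
# so trust_remote_code=True is required. "./orion_tokenizer" is a hypothetical local path.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./orion_tokenizer", trust_remote_code=True)

ids = tok("Hello, world!")["input_ids"]
# With "add_bos_token": false and "add_eos_token": false in this config, no <s>/</s>
# ids are expected at the ends of the sequence.
print(ids)
print(tok.decode(ids))
```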