not-lain committed
Commit 9dd6df3 (1 parent: ccf3284)

Upload tokenizer

Files changed (2)
  1. tokenizer.json +12 -16
  2. tokenizer_config.json +2 -0
tokenizer.json CHANGED
@@ -159,21 +159,17 @@
   ],
   "normalizer": null,
   "pre_tokenizer": {
-    "type": "Sequence",
-    "pretokenizers": [
-      {
-        "type": "Digits",
-        "individual_digits": true
-      },
-      {
-        "type": "ByteLevel",
-        "add_prefix_space": false,
-        "trim_offsets": true,
-        "use_regex": true
-      }
-    ]
+    "type": "ByteLevel",
+    "add_prefix_space": false,
+    "trim_offsets": true,
+    "use_regex": true
+  },
+  "post_processor": {
+    "type": "ByteLevel",
+    "add_prefix_space": true,
+    "trim_offsets": false,
+    "use_regex": true
   },
-  "post_processor": null,
   "decoder": {
     "type": "ByteLevel",
     "add_prefix_space": true,
@@ -184,8 +180,8 @@
     "type": "BPE",
     "dropout": null,
     "unk_token": null,
-    "continuing_subword_prefix": null,
-    "end_of_word_suffix": null,
+    "continuing_subword_prefix": "",
+    "end_of_word_suffix": "",
     "fuse_unk": false,
     "byte_fallback": false,
     "ignore_merges": false,
tokenizer_config.json CHANGED
@@ -1,4 +1,5 @@
 {
+  "add_bos_token": false,
   "add_prefix_space": false,
   "added_tokens_decoder": {
     "0": {
@@ -146,6 +147,7 @@
   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
+  "errors": "replace",
   "model_max_length": 2048,
   "pad_token": "<|im_end|>",
   "tokenizer_class": "GPT2Tokenizer",