kenchiayy committed on
Commit
d22b6bb
1 Parent(s): 7c4184f

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer_config.json +28 -28
  2. vocab.json +28 -28
tokenizer_config.json CHANGED
@@ -17,7 +17,7 @@
17
  "special": false
18
  },
19
  "28": {
20
- "content": "n</w>",
21
  "lstrip": true,
22
  "normalized": false,
23
  "rstrip": true,
@@ -25,7 +25,7 @@
25
  "special": false
26
  },
27
  "29": {
28
- "content": "r</w>",
29
  "lstrip": true,
30
  "normalized": false,
31
  "rstrip": true,
@@ -33,7 +33,7 @@
33
  "special": false
34
  },
35
  "30": {
36
- "content": "h</w>",
37
  "lstrip": true,
38
  "normalized": false,
39
  "rstrip": true,
@@ -41,7 +41,7 @@
41
  "special": false
42
  },
43
  "31": {
44
- "content": "f</w>",
45
  "lstrip": true,
46
  "normalized": false,
47
  "rstrip": true,
@@ -49,7 +49,7 @@
49
  "special": false
50
  },
51
  "32": {
52
- "content": "u</w>",
53
  "lstrip": true,
54
  "normalized": false,
55
  "rstrip": true,
@@ -57,7 +57,7 @@
57
  "special": false
58
  },
59
  "33": {
60
- "content": "t</w>",
61
  "lstrip": true,
62
  "normalized": false,
63
  "rstrip": true,
@@ -65,7 +65,7 @@
65
  "special": false
66
  },
67
  "34": {
68
- "content": "d</w>",
69
  "lstrip": true,
70
  "normalized": false,
71
  "rstrip": true,
@@ -73,7 +73,7 @@
73
  "special": false
74
  },
75
  "35": {
76
- "content": "m</w>",
77
  "lstrip": true,
78
  "normalized": false,
79
  "rstrip": true,
@@ -89,7 +89,7 @@
89
  "special": false
90
  },
91
  "37": {
92
- "content": "o</w>",
93
  "lstrip": true,
94
  "normalized": false,
95
  "rstrip": true,
@@ -97,7 +97,7 @@
97
  "special": false
98
  },
99
  "38": {
100
- "content": "e</w>",
101
  "lstrip": true,
102
  "normalized": false,
103
  "rstrip": true,
@@ -105,7 +105,7 @@
105
  "special": false
106
  },
107
  "39": {
108
- "content": "y</w>",
109
  "lstrip": true,
110
  "normalized": false,
111
  "rstrip": true,
@@ -113,7 +113,7 @@
113
  "special": false
114
  },
115
  "40": {
116
- "content": "k</w>",
117
  "lstrip": true,
118
  "normalized": false,
119
  "rstrip": true,
@@ -121,7 +121,7 @@
121
  "special": false
122
  },
123
  "41": {
124
- "content": "s</w>",
125
  "lstrip": true,
126
  "normalized": false,
127
  "rstrip": true,
@@ -129,7 +129,7 @@
129
  "special": false
130
  },
131
  "42": {
132
- "content": "l</w>",
133
  "lstrip": true,
134
  "normalized": false,
135
  "rstrip": true,
@@ -137,7 +137,7 @@
137
  "special": false
138
  },
139
  "43": {
140
- "content": "p</w>",
141
  "lstrip": true,
142
  "normalized": false,
143
  "rstrip": true,
@@ -145,7 +145,7 @@
145
  "special": false
146
  },
147
  "44": {
148
- "content": "g</w>",
149
  "lstrip": true,
150
  "normalized": false,
151
  "rstrip": true,
@@ -153,7 +153,7 @@
153
  "special": false
154
  },
155
  "45": {
156
- "content": "v</w>",
157
  "lstrip": true,
158
  "normalized": false,
159
  "rstrip": true,
@@ -161,7 +161,7 @@
161
  "special": false
162
  },
163
  "46": {
164
- "content": "c</w>",
165
  "lstrip": true,
166
  "normalized": false,
167
  "rstrip": true,
@@ -169,7 +169,7 @@
169
  "special": false
170
  },
171
  "47": {
172
- "content": "w</w>",
173
  "lstrip": true,
174
  "normalized": false,
175
  "rstrip": true,
@@ -177,7 +177,7 @@
177
  "special": false
178
  },
179
  "48": {
180
- "content": "i</w>",
181
  "lstrip": true,
182
  "normalized": false,
183
  "rstrip": true,
@@ -185,7 +185,7 @@
185
  "special": false
186
  },
187
  "49": {
188
- "content": "x</w>",
189
  "lstrip": true,
190
  "normalized": false,
191
  "rstrip": true,
@@ -193,7 +193,7 @@
193
  "special": false
194
  },
195
  "50": {
196
- "content": "b</w>",
197
  "lstrip": true,
198
  "normalized": false,
199
  "rstrip": true,
@@ -201,7 +201,7 @@
201
  "special": false
202
  },
203
  "51": {
204
- "content": "q</w>",
205
  "lstrip": true,
206
  "normalized": false,
207
  "rstrip": true,
@@ -209,7 +209,7 @@
209
  "special": false
210
  },
211
  "52": {
212
- "content": "z</w>",
213
  "lstrip": true,
214
  "normalized": false,
215
  "rstrip": true,
@@ -9113,7 +9113,7 @@
9113
  "special": false
9114
  },
9115
  "1165": {
9116
- "content": "thr</w>",
9117
  "lstrip": true,
9118
  "normalized": false,
9119
  "rstrip": true,
@@ -9121,7 +9121,7 @@
9121
  "special": false
9122
  },
9123
  "1166": {
9124
- "content": "tha</w>",
9125
  "lstrip": true,
9126
  "normalized": false,
9127
  "rstrip": true,
@@ -9897,7 +9897,7 @@
9897
  "special": false
9898
  },
9899
  "1263": {
9900
- "content": "out</w>",
9901
  "lstrip": true,
9902
  "normalized": false,
9903
  "rstrip": true,
@@ -9905,7 +9905,7 @@
9905
  "special": false
9906
  },
9907
  "1264": {
9908
- "content": "ous</w>",
9909
  "lstrip": true,
9910
  "normalized": false,
9911
  "rstrip": true,
 
17
  "special": false
18
  },
19
  "28": {
20
+ "content": "g</w>",
21
  "lstrip": true,
22
  "normalized": false,
23
  "rstrip": true,
 
25
  "special": false
26
  },
27
  "29": {
28
+ "content": "d</w>",
29
  "lstrip": true,
30
  "normalized": false,
31
  "rstrip": true,
 
33
  "special": false
34
  },
35
  "30": {
36
+ "content": "s</w>",
37
  "lstrip": true,
38
  "normalized": false,
39
  "rstrip": true,
 
41
  "special": false
42
  },
43
  "31": {
44
+ "content": "e</w>",
45
  "lstrip": true,
46
  "normalized": false,
47
  "rstrip": true,
 
49
  "special": false
50
  },
51
  "32": {
52
+ "content": "t</w>",
53
  "lstrip": true,
54
  "normalized": false,
55
  "rstrip": true,
 
57
  "special": false
58
  },
59
  "33": {
60
+ "content": "u</w>",
61
  "lstrip": true,
62
  "normalized": false,
63
  "rstrip": true,
 
65
  "special": false
66
  },
67
  "34": {
68
+ "content": "f</w>",
69
  "lstrip": true,
70
  "normalized": false,
71
  "rstrip": true,
 
73
  "special": false
74
  },
75
  "35": {
76
+ "content": "y</w>",
77
  "lstrip": true,
78
  "normalized": false,
79
  "rstrip": true,
 
89
  "special": false
90
  },
91
  "37": {
92
+ "content": "i</w>",
93
  "lstrip": true,
94
  "normalized": false,
95
  "rstrip": true,
 
97
  "special": false
98
  },
99
  "38": {
100
+ "content": "h</w>",
101
  "lstrip": true,
102
  "normalized": false,
103
  "rstrip": true,
 
105
  "special": false
106
  },
107
  "39": {
108
+ "content": "k</w>",
109
  "lstrip": true,
110
  "normalized": false,
111
  "rstrip": true,
 
113
  "special": false
114
  },
115
  "40": {
116
+ "content": "n</w>",
117
  "lstrip": true,
118
  "normalized": false,
119
  "rstrip": true,
 
121
  "special": false
122
  },
123
  "41": {
124
+ "content": "r</w>",
125
  "lstrip": true,
126
  "normalized": false,
127
  "rstrip": true,
 
129
  "special": false
130
  },
131
  "42": {
132
+ "content": "m</w>",
133
  "lstrip": true,
134
  "normalized": false,
135
  "rstrip": true,
 
137
  "special": false
138
  },
139
  "43": {
140
+ "content": "c</w>",
141
  "lstrip": true,
142
  "normalized": false,
143
  "rstrip": true,
 
145
  "special": false
146
  },
147
  "44": {
148
+ "content": "v</w>",
149
  "lstrip": true,
150
  "normalized": false,
151
  "rstrip": true,
 
153
  "special": false
154
  },
155
  "45": {
156
+ "content": "l</w>",
157
  "lstrip": true,
158
  "normalized": false,
159
  "rstrip": true,
 
161
  "special": false
162
  },
163
  "46": {
164
+ "content": "x</w>",
165
  "lstrip": true,
166
  "normalized": false,
167
  "rstrip": true,
 
169
  "special": false
170
  },
171
  "47": {
172
+ "content": "o</w>",
173
  "lstrip": true,
174
  "normalized": false,
175
  "rstrip": true,
 
177
  "special": false
178
  },
179
  "48": {
180
+ "content": "w</w>",
181
  "lstrip": true,
182
  "normalized": false,
183
  "rstrip": true,
 
185
  "special": false
186
  },
187
  "49": {
188
+ "content": "p</w>",
189
  "lstrip": true,
190
  "normalized": false,
191
  "rstrip": true,
 
193
  "special": false
194
  },
195
  "50": {
196
+ "content": "z</w>",
197
  "lstrip": true,
198
  "normalized": false,
199
  "rstrip": true,
 
201
  "special": false
202
  },
203
  "51": {
204
+ "content": "b</w>",
205
  "lstrip": true,
206
  "normalized": false,
207
  "rstrip": true,
 
209
  "special": false
210
  },
211
  "52": {
212
+ "content": "q</w>",
213
  "lstrip": true,
214
  "normalized": false,
215
  "rstrip": true,
 
9113
  "special": false
9114
  },
9115
  "1165": {
9116
+ "content": "tha</w>",
9117
  "lstrip": true,
9118
  "normalized": false,
9119
  "rstrip": true,
 
9121
  "special": false
9122
  },
9123
  "1166": {
9124
+ "content": "thr</w>",
9125
  "lstrip": true,
9126
  "normalized": false,
9127
  "rstrip": true,
 
9897
  "special": false
9898
  },
9899
  "1263": {
9900
+ "content": "ous</w>",
9901
  "lstrip": true,
9902
  "normalized": false,
9903
  "rstrip": true,
 
9905
  "special": false
9906
  },
9907
  "1264": {
9908
+ "content": "out</w>",
9909
  "lstrip": true,
9910
  "normalized": false,
9911
  "rstrip": true,
vocab.json CHANGED
@@ -119,7 +119,7 @@
119
  "aw": 245,
120
  "ay</w>": 290,
121
  "b": 3,
122
- "b</w>": 50,
123
  "ba": 305,
124
  "back</w>": 332,
125
  "baf": 1015,
@@ -183,7 +183,7 @@
183
  "by</w>": 398,
184
  "bye</w>": 135,
185
  "c": 4,
186
- "c</w>": 46,
187
  "cal": 202,
188
  "call": 836,
189
  "call</w>": 224,
@@ -286,7 +286,7 @@
286
  "cut</w>": 799,
287
  "cy</w>": 512,
288
  "d": 5,
289
- "d</w>": 34,
290
  "da</w>": 616,
291
  "day</w>": 369,
292
  "de": 77,
@@ -326,7 +326,7 @@
326
  "du</w>": 634,
327
  "due</w>": 556,
328
  "e": 6,
329
- "e</w>": 38,
330
  "ea": 326,
331
  "ear": 1221,
332
  "earlier</w>": 1353,
@@ -390,7 +390,7 @@
390
  "expedite</w>": 804,
391
  "expediti": 1330,
392
  "f": 7,
393
- "f</w>": 31,
394
  "fa</w>": 232,
395
  "famili": 1140,
396
  "familiar</w>": 1197,
@@ -445,7 +445,7 @@
445
  "fusse</w>": 368,
446
  "futura</w>": 985,
447
  "g": 8,
448
- "g</w>": 44,
449
  "ga": 617,
450
  "ga</w>": 840,
451
  "gal</w>": 816,
@@ -490,7 +490,7 @@
490
  "gulf</w>": 576,
491
  "guten</w>": 412,
492
  "h": 9,
493
- "h</w>": 30,
494
  "ha": 186,
495
  "ha</w>": 1084,
496
  "hal": 1228,
@@ -533,7 +533,7 @@
533
  "hundred</w>": 287,
534
  "huss</w>": 252,
535
  "i": 10,
536
- "i</w>": 48,
537
  "ia</w>": 289,
538
  "iber": 542,
539
  "iberia</w>": 546,
@@ -607,7 +607,7 @@
607
  "juliett</w>": 456,
608
  "just</w>": 825,
609
  "k": 12,
610
- "k</w>": 40,
611
  "ka": 951,
612
  "kair</w>": 774,
613
  "kamas</w>": 986,
@@ -639,7 +639,7 @@
639
  "ks</w>": 952,
640
  "ky</w>": 1086,
641
  "l": 13,
642
- "l</w>": 42,
643
  "la": 374,
644
  "la</w>": 686,
645
  "lan": 431,
@@ -714,7 +714,7 @@
714
  "ly</w>": 312,
715
  "lyon</w>": 1356,
716
  "m": 14,
717
- "m</w>": 35,
718
  "ma": 98,
719
  "ma</w>": 762,
720
  "mach</w>": 691,
@@ -784,7 +784,7 @@
784
  "munich</w>": 1341,
785
  "my</w>": 1019,
786
  "n": 15,
787
- "n</w>": 28,
788
  "na": 384,
789
  "na</w>": 972,
790
  "nato</w>": 698,
@@ -841,7 +841,7 @@
841
  "nu": 735,
842
  "number</w>": 740,
843
  "o": 16,
844
- "o</w>": 37,
845
  "ob": 1238,
846
  "occ": 1051,
847
  "occupied</w>": 1074,
@@ -884,9 +884,9 @@
884
  "our": 302,
885
  "our</w>": 65,
886
  "ous": 386,
887
- "ous</w>": 1264,
888
  "ously</w>": 1314,
889
- "out</w>": 1263,
890
  "outh</w>": 1094,
891
  "ov": 1147,
892
  "over</w>": 1198,
@@ -895,7 +895,7 @@
895
  "ow</w>": 1089,
896
  "own</w>": 264,
897
  "p": 17,
898
- "p</w>": 43,
899
  "pa": 192,
900
  "pa</w>": 322,
901
  "pace</w>": 1300,
@@ -944,12 +944,12 @@
944
  "psa</w>": 709,
945
  "pt</w>": 721,
946
  "q": 18,
947
- "q</w>": 51,
948
  "qu": 216,
949
  "quen": 509,
950
  "qui": 1302,
951
  "r": 19,
952
- "r</w>": 29,
953
  "ra": 84,
954
  "ra</w>": 313,
955
  "rad": 124,
@@ -1037,7 +1037,7 @@
1037
  "rvsm</w>": 680,
1038
  "ry</w>": 382,
1039
  "s": 20,
1040
- "s</w>": 41,
1041
  "sa": 163,
1042
  "sa</w>": 110,
1043
  "sabena</w>": 215,
@@ -1145,7 +1145,7 @@
1145
  "swiss</w>": 240,
1146
  "switch</w>": 1175,
1147
  "t": 21,
1148
- "t</w>": 33,
1149
  "ta": 74,
1150
  "ta</w>": 834,
1151
  "tag</w>": 227,
@@ -1172,7 +1172,7 @@
1172
  "testar</w>": 1295,
1173
  "th": 55,
1174
  "th</w>": 405,
1175
- "tha</w>": 1166,
1176
  "than": 111,
1177
  "than</w>": 1260,
1178
  "thank</w>": 508,
@@ -1194,7 +1194,7 @@
1194
  "this</w>": 894,
1195
  "thous": 387,
1196
  "thousand</w>": 389,
1197
- "thr</w>": 1165,
1198
  "three</w>": 57,
1199
  "throu": 1262,
1200
  "through</w>": 1362,
@@ -1255,7 +1255,7 @@
1255
  "two</w>": 63,
1256
  "ty</w>": 744,
1257
  "u": 22,
1258
- "u</w>": 32,
1259
  "uda</w>": 650,
1260
  "uh": 404,
1261
  "ui": 629,
@@ -1281,7 +1281,7 @@
1281
  "ut</w>": 736,
1282
  "uter</w>": 1156,
1283
  "v": 23,
1284
- "v</w>": 45,
1285
  "va</w>": 507,
1286
  "val": 1158,
1287
  "valda</w>": 1201,
@@ -1305,7 +1305,7 @@
1305
  "vsm</w>": 673,
1306
  "vus</w>": 1250,
1307
  "w": 24,
1308
- "w</w>": 47,
1309
  "wal": 483,
1310
  "wald</w>": 488,
1311
  "warb": 1252,
@@ -1347,12 +1347,12 @@
1347
  "wron": 955,
1348
  "wrong</w>": 962,
1349
  "x": 25,
1350
- "x</w>": 49,
1351
  "xeui": 722,
1352
  "xra": 956,
1353
  "xray</w>": 963,
1354
  "y": 26,
1355
- "y</w>": 39,
1356
  "yan": 924,
1357
  "yankee</w>": 931,
1358
  "yd</w>": 187,
@@ -1368,7 +1368,7 @@
1368
  "ysian</w>": 578,
1369
  "ystar</w>": 1254,
1370
  "z": 27,
1371
- "z</w>": 52,
1372
  "zero</w>": 70,
1373
  "zulu</w>": 1054,
1374
  "zur": 150,
 
119
  "aw": 245,
120
  "ay</w>": 290,
121
  "b": 3,
122
+ "b</w>": 51,
123
  "ba": 305,
124
  "back</w>": 332,
125
  "baf": 1015,
 
183
  "by</w>": 398,
184
  "bye</w>": 135,
185
  "c": 4,
186
+ "c</w>": 43,
187
  "cal": 202,
188
  "call": 836,
189
  "call</w>": 224,
 
286
  "cut</w>": 799,
287
  "cy</w>": 512,
288
  "d": 5,
289
+ "d</w>": 29,
290
  "da</w>": 616,
291
  "day</w>": 369,
292
  "de": 77,
 
326
  "du</w>": 634,
327
  "due</w>": 556,
328
  "e": 6,
329
+ "e</w>": 31,
330
  "ea": 326,
331
  "ear": 1221,
332
  "earlier</w>": 1353,
 
390
  "expedite</w>": 804,
391
  "expediti": 1330,
392
  "f": 7,
393
+ "f</w>": 34,
394
  "fa</w>": 232,
395
  "famili": 1140,
396
  "familiar</w>": 1197,
 
445
  "fusse</w>": 368,
446
  "futura</w>": 985,
447
  "g": 8,
448
+ "g</w>": 28,
449
  "ga": 617,
450
  "ga</w>": 840,
451
  "gal</w>": 816,
 
490
  "gulf</w>": 576,
491
  "guten</w>": 412,
492
  "h": 9,
493
+ "h</w>": 38,
494
  "ha": 186,
495
  "ha</w>": 1084,
496
  "hal": 1228,
 
533
  "hundred</w>": 287,
534
  "huss</w>": 252,
535
  "i": 10,
536
+ "i</w>": 37,
537
  "ia</w>": 289,
538
  "iber": 542,
539
  "iberia</w>": 546,
 
607
  "juliett</w>": 456,
608
  "just</w>": 825,
609
  "k": 12,
610
+ "k</w>": 39,
611
  "ka": 951,
612
  "kair</w>": 774,
613
  "kamas</w>": 986,
 
639
  "ks</w>": 952,
640
  "ky</w>": 1086,
641
  "l": 13,
642
+ "l</w>": 45,
643
  "la": 374,
644
  "la</w>": 686,
645
  "lan": 431,
 
714
  "ly</w>": 312,
715
  "lyon</w>": 1356,
716
  "m": 14,
717
+ "m</w>": 42,
718
  "ma": 98,
719
  "ma</w>": 762,
720
  "mach</w>": 691,
 
784
  "munich</w>": 1341,
785
  "my</w>": 1019,
786
  "n": 15,
787
+ "n</w>": 40,
788
  "na": 384,
789
  "na</w>": 972,
790
  "nato</w>": 698,
 
841
  "nu": 735,
842
  "number</w>": 740,
843
  "o": 16,
844
+ "o</w>": 47,
845
  "ob": 1238,
846
  "occ": 1051,
847
  "occupied</w>": 1074,
 
884
  "our": 302,
885
  "our</w>": 65,
886
  "ous": 386,
887
+ "ous</w>": 1263,
888
  "ously</w>": 1314,
889
+ "out</w>": 1264,
890
  "outh</w>": 1094,
891
  "ov": 1147,
892
  "over</w>": 1198,
 
895
  "ow</w>": 1089,
896
  "own</w>": 264,
897
  "p": 17,
898
+ "p</w>": 49,
899
  "pa": 192,
900
  "pa</w>": 322,
901
  "pace</w>": 1300,
 
944
  "psa</w>": 709,
945
  "pt</w>": 721,
946
  "q": 18,
947
+ "q</w>": 52,
948
  "qu": 216,
949
  "quen": 509,
950
  "qui": 1302,
951
  "r": 19,
952
+ "r</w>": 41,
953
  "ra": 84,
954
  "ra</w>": 313,
955
  "rad": 124,
 
1037
  "rvsm</w>": 680,
1038
  "ry</w>": 382,
1039
  "s": 20,
1040
+ "s</w>": 30,
1041
  "sa": 163,
1042
  "sa</w>": 110,
1043
  "sabena</w>": 215,
 
1145
  "swiss</w>": 240,
1146
  "switch</w>": 1175,
1147
  "t": 21,
1148
+ "t</w>": 32,
1149
  "ta": 74,
1150
  "ta</w>": 834,
1151
  "tag</w>": 227,
 
1172
  "testar</w>": 1295,
1173
  "th": 55,
1174
  "th</w>": 405,
1175
+ "tha</w>": 1165,
1176
  "than": 111,
1177
  "than</w>": 1260,
1178
  "thank</w>": 508,
 
1194
  "this</w>": 894,
1195
  "thous": 387,
1196
  "thousand</w>": 389,
1197
+ "thr</w>": 1166,
1198
  "three</w>": 57,
1199
  "throu": 1262,
1200
  "through</w>": 1362,
 
1255
  "two</w>": 63,
1256
  "ty</w>": 744,
1257
  "u": 22,
1258
+ "u</w>": 33,
1259
  "uda</w>": 650,
1260
  "uh": 404,
1261
  "ui": 629,
 
1281
  "ut</w>": 736,
1282
  "uter</w>": 1156,
1283
  "v": 23,
1284
+ "v</w>": 44,
1285
  "va</w>": 507,
1286
  "val": 1158,
1287
  "valda</w>": 1201,
 
1305
  "vsm</w>": 673,
1306
  "vus</w>": 1250,
1307
  "w": 24,
1308
+ "w</w>": 48,
1309
  "wal": 483,
1310
  "wald</w>": 488,
1311
  "warb": 1252,
 
1347
  "wron": 955,
1348
  "wrong</w>": 962,
1349
  "x": 25,
1350
+ "x</w>": 46,
1351
  "xeui": 722,
1352
  "xra": 956,
1353
  "xray</w>": 963,
1354
  "y": 26,
1355
+ "y</w>": 35,
1356
  "yan": 924,
1357
  "yankee</w>": 931,
1358
  "yd</w>": 187,
 
1368
  "ysian</w>": 578,
1369
  "ystar</w>": 1254,
1370
  "z": 27,
1371
+ "z</w>": 50,
1372
  "zero</w>": 70,
1373
  "zulu</w>": 1054,
1374
  "zur": 150,