robinq committed on
Commit 4d918b0
1 Parent(s): a202610

Upload tokenizer

added_tokens.json ADDED
@@ -0,0 +1,26 @@
+ {
+ " ": 64000,
+ " ": 64001,
+ " ": 64002,
+ " ": 64003,
+ " ": 64004,
+ " ": 64005,
+ " ": 64006,
+ " ": 64007,
+ " ": 64008,
+ " ": 64009,
+ " ": 64010,
+ " ": 64011,
+ " ": 64012,
+ " ": 64013,
+ " ": 64014,
+ " ": 64015,
+ " ": 64016,
+ " ": 64017,
+ " ": 64018,
+ " ": 64019,
+ " ": 64020,
+ " ": 64021,
+ " ": 64022,
+ " ": 64023
+ }
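The 24 entries above extend the base vocabulary at IDs 64000-64023 (the token strings render as blanks in this diff, so the exact characters are not recoverable here). A minimal sketch of how `transformers` exposes such added tokens after loading; the repo path is hypothetical:

```python
from transformers import AutoTokenizer

# "robinq/tokenizer-upload" is a hypothetical path; substitute the real repo id.
tok = AutoTokenizer.from_pretrained("robinq/tokenizer-upload")

# Entries from added_tokens.json are appended after the base vocabulary,
# so len(tok) grows by 24 and the new tokens occupy IDs 64000-64023.
print(len(tok))
print(tok.get_added_vocab())  # {token_string: token_id} for the added entries
```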
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "bos_token": "<s>",
+ "eos_token": "</s>",
+ "pad_token": "<pad>",
+ "unk_token": "<unk>"
+ }
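special_tokens_map.json wires the four control tokens to the tokenizer's named slots. A short sketch of how they surface at runtime (same hypothetical repo path as above):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("robinq/tokenizer-upload")  # hypothetical path

# The four mapped tokens become attributes, with IDs resolved from vocab.json.
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)  # <s> </s> <pad> <unk>
print(tok.pad_token_id)  # used by data collators when padding batches
```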
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+ "add_prefix_space": true,
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "</s>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "tokenizer_class": "GPT2Tokenizer",
+ "unk_token": "<unk>"
+ }
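Two config keys are worth noting: `"tokenizer_class": "GPT2Tokenizer"` marks this as a byte-level BPE tokenizer driven by the `vocab.json`/`merges.txt` pair below, and `"model_max_length"` holds `transformers`' very-large-integer sentinel (`int(1e30)`), meaning no maximum sequence length was recorded. With `"add_prefix_space": true`, a leading space is prepended so the first word tokenizes the same way it would mid-sentence. A hedged sketch, again with a hypothetical repo path:

```python
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("robinq/tokenizer-upload")  # hypothetical path

# add_prefix_space=True makes "hello" at position 0 encode as the
# space-prefixed byte-level token (shown with the Ġ marker), matching
# how the same word would be split mid-sentence.
ids = tok("hello world")["input_ids"]
print(tok.convert_ids_to_tokens(ids))  # e.g. ['Ġhello', 'Ġworld']
```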
vocab.json ADDED
The diff for this file is too large to render. See raw diff