jtatman committed on
Commit
a0104d5
1 Parent(s): 8ef0f43

Upload tokenizer

Browse files
added_tokens.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "<|im_end|>": 32001,
3
  "<|im_start|>": 32000
4
  }
 
1
  {
2
+ "<|endoftext|>": 32002,
3
  "<|im_end|>": 32001,
4
  "<|im_start|>": 32000
5
  }
special_tokens_map.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "additional_special_tokens": [
3
  {
4
- "content": "<|im_start|>",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
8
  "single_word": false
9
  },
10
  {
11
- "content": "<|im_end|>",
12
  "lstrip": false,
13
  "normalized": false,
14
  "rstrip": false,
@@ -29,7 +29,13 @@
29
  "rstrip": false,
30
  "single_word": false
31
  },
32
- "pad_token": "</s>",
 
 
 
 
 
 
33
  "unk_token": {
34
  "content": "<unk>",
35
  "lstrip": false,
 
1
  {
2
  "additional_special_tokens": [
3
  {
4
+ "content": "<|im_end|>",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
8
  "single_word": false
9
  },
10
  {
11
+ "content": "<|endoftext|>",
12
  "lstrip": false,
13
  "normalized": false,
14
  "rstrip": false,
 
29
  "rstrip": false,
30
  "single_word": false
31
  },
32
+ "pad_token": {
33
+ "content": "</s>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ },
39
  "unk_token": {
40
  "content": "<unk>",
41
  "lstrip": false,
tokenizer.json CHANGED
@@ -47,6 +47,15 @@
47
  "rstrip": false,
48
  "normalized": false,
49
  "special": true
 
 
 
 
 
 
 
 
 
50
  }
51
  ],
52
  "normalizer": {
 
47
  "rstrip": false,
48
  "normalized": false,
49
  "special": true
50
+ },
51
+ {
52
+ "id": 32002,
53
+ "content": "<|endoftext|>",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
  }
60
  ],
61
  "normalizer": {
tokenizer_config.json CHANGED
@@ -42,11 +42,19 @@
42
  "rstrip": false,
43
  "single_word": false,
44
  "special": true
 
 
 
 
 
 
 
 
45
  }
46
  },
47
  "additional_special_tokens": [
48
- "<|im_start|>",
49
- "<|im_end|>"
50
  ],
51
  "bos_token": "<s>",
52
  "clean_up_tokenization_spaces": false,
 
42
  "rstrip": false,
43
  "single_word": false,
44
  "special": true
45
+ },
46
+ "32002": {
47
+ "content": "<|endoftext|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
  }
54
  },
55
  "additional_special_tokens": [
56
+ "<|im_end|>",
57
+ "<|endoftext|>"
58
  ],
59
  "bos_token": "<s>",
60
  "clean_up_tokenization_spaces": false,