ArthurZ HF staff jonatanklosko committed on
Commit
a179016
1 Parent(s): bfeb067

Changes for fast tokenizer (#5)

Browse files

- Add tokenizer.json (dcca07232bfb1028e499333730f868b87fd3d043)
- Update unknown token (8852c40b30c9b7b981faf4fa77167fd862fd5fdb)
- Move <|endoftext|> from added_tokens.json to vocab.json (3e9581879a6134abfb58f5788096027dd1756a63)


Co-authored-by: Jonatan Kłosko <[email protected]>

added_tokens.json CHANGED
@@ -17,7 +17,6 @@
17
  "<|da|>": 50285,
18
  "<|de|>": 50261,
19
  "<|el|>": 50281,
20
- "<|endoftext|>": 50257,
21
  "<|en|>": 50259,
22
  "<|es|>": 50262,
23
  "<|et|>": 50307,
 
17
  "<|da|>": 50285,
18
  "<|de|>": 50261,
19
  "<|el|>": 50281,
 
20
  "<|en|>": 50259,
21
  "<|es|>": 50262,
22
  "<|et|>": 50307,
special_tokens_map.json CHANGED
@@ -124,7 +124,7 @@
124
  },
125
  "pad_token": "<|endoftext|>",
126
  "unk_token": {
127
- "content": "",
128
  "lstrip": false,
129
  "normalized": true,
130
  "rstrip": false,
 
124
  },
125
  "pad_token": "<|endoftext|>",
126
  "unk_token": {
127
+ "content": "<|endoftext|>",
128
  "lstrip": false,
129
  "normalized": true,
130
  "rstrip": false,
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -27,7 +27,7 @@
27
  "tokenizer_class": "WhisperTokenizer",
28
  "unk_token": {
29
  "__type": "AddedToken",
30
- "content": "",
31
  "lstrip": false,
32
  "normalized": true,
33
  "rstrip": false,
 
27
  "tokenizer_class": "WhisperTokenizer",
28
  "unk_token": {
29
  "__type": "AddedToken",
30
+ "content": "<|endoftext|>",
31
  "lstrip": false,
32
  "normalized": true,
33
  "rstrip": false,
vocab.json CHANGED
@@ -1,5 +1,6 @@
1
  {
2
  "": 50256,
 
3
  "!": 0,
4
  "!!": 1432,
5
  "!!!": 4589,
 
1
  {
2
  "": 50256,
3
+ "<|endoftext|>": 50257,
4
  "!": 0,
5
  "!!": 1432,
6
  "!!!": 4589,