huseinzol05 committed on
Commit
e42b6e8
1 Parent(s): c2b9a32

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +7 -111
  2. tokenizer.json +0 -0
  3. tokenizer_config.json +0 -0
special_tokens_map.json CHANGED
@@ -1,136 +1,32 @@
1
  {
2
  "additional_special_tokens": [
3
- "<|endoftext|>",
4
- "<|startoftranscript|>",
5
- "<|en|>",
6
- "<|zh|>",
7
- "<|de|>",
8
- "<|es|>",
9
- "<|ru|>",
10
- "<|ko|>",
11
- "<|fr|>",
12
- "<|ja|>",
13
- "<|pt|>",
14
- "<|tr|>",
15
- "<|pl|>",
16
- "<|ca|>",
17
- "<|nl|>",
18
- "<|ar|>",
19
- "<|sv|>",
20
- "<|it|>",
21
- "<|id|>",
22
- "<|hi|>",
23
- "<|fi|>",
24
- "<|vi|>",
25
- "<|he|>",
26
- "<|uk|>",
27
- "<|el|>",
28
- "<|ms|>",
29
- "<|cs|>",
30
- "<|ro|>",
31
- "<|da|>",
32
- "<|hu|>",
33
- "<|ta|>",
34
- "<|no|>",
35
- "<|th|>",
36
- "<|ur|>",
37
- "<|hr|>",
38
- "<|bg|>",
39
- "<|lt|>",
40
- "<|la|>",
41
- "<|mi|>",
42
- "<|ml|>",
43
- "<|cy|>",
44
- "<|sk|>",
45
- "<|te|>",
46
- "<|fa|>",
47
- "<|lv|>",
48
- "<|bn|>",
49
- "<|sr|>",
50
- "<|az|>",
51
- "<|sl|>",
52
- "<|kn|>",
53
- "<|et|>",
54
- "<|mk|>",
55
- "<|br|>",
56
- "<|eu|>",
57
- "<|is|>",
58
- "<|hy|>",
59
- "<|ne|>",
60
- "<|mn|>",
61
- "<|bs|>",
62
- "<|kk|>",
63
- "<|sq|>",
64
- "<|sw|>",
65
- "<|gl|>",
66
- "<|mr|>",
67
- "<|pa|>",
68
- "<|si|>",
69
- "<|km|>",
70
- "<|sn|>",
71
- "<|yo|>",
72
- "<|so|>",
73
- "<|af|>",
74
- "<|oc|>",
75
- "<|ka|>",
76
- "<|be|>",
77
- "<|tg|>",
78
- "<|sd|>",
79
- "<|gu|>",
80
- "<|am|>",
81
- "<|yi|>",
82
- "<|lo|>",
83
- "<|uz|>",
84
- "<|fo|>",
85
- "<|ht|>",
86
- "<|ps|>",
87
- "<|tk|>",
88
- "<|nn|>",
89
- "<|mt|>",
90
- "<|sa|>",
91
- "<|lb|>",
92
- "<|my|>",
93
- "<|bo|>",
94
- "<|tl|>",
95
- "<|mg|>",
96
- "<|as|>",
97
- "<|tt|>",
98
- "<|haw|>",
99
- "<|ln|>",
100
- "<|ha|>",
101
- "<|ba|>",
102
- "<|jw|>",
103
- "<|su|>",
104
- "<|translate|>",
105
- "<|transcribe|>",
106
- "<|startoflm|>",
107
- "<|startofprev|>",
108
- "<|nocaptions|>",
109
- "<|notimestamps|>"
110
  ],
111
  "bos_token": {
112
- "content": "<|endoftext|>",
113
  "lstrip": false,
114
  "normalized": false,
115
  "rstrip": false,
116
  "single_word": false
117
  },
118
  "eos_token": {
119
- "content": "<|endoftext|>",
120
  "lstrip": false,
121
  "normalized": false,
122
  "rstrip": false,
123
  "single_word": false
124
  },
125
  "pad_token": {
126
- "content": "<|endoftext|>",
127
  "lstrip": false,
128
  "normalized": false,
129
  "rstrip": false,
130
  "single_word": false
131
  },
132
  "unk_token": {
133
- "content": "<|endoftext|>",
134
  "lstrip": false,
135
  "normalized": false,
136
  "rstrip": false,
 
1
  {
2
  "additional_special_tokens": [
3
+ "<unk>",
4
+ "<s>",
5
+ "</s>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  ],
7
  "bos_token": {
8
+ "content": "<s>",
9
  "lstrip": false,
10
  "normalized": false,
11
  "rstrip": false,
12
  "single_word": false
13
  },
14
  "eos_token": {
15
+ "content": "</s>",
16
  "lstrip": false,
17
  "normalized": false,
18
  "rstrip": false,
19
  "single_word": false
20
  },
21
  "pad_token": {
22
+ "content": "<unk>",
23
  "lstrip": false,
24
  "normalized": false,
25
  "rstrip": false,
26
  "single_word": false
27
  },
28
  "unk_token": {
29
+ "content": "<unk>",
30
  "lstrip": false,
31
  "normalized": false,
32
  "rstrip": false,
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff