Thunpitcha commited on
Commit
2c8ffcd
1 Parent(s): dfd232b

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +2 -2
  2. tokenizer_config.json +4 -4
  3. vocab.json +83 -67
added_tokens.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
- "</s>": 69,
3
- "<s>": 68
4
  }
 
1
  {
2
+ "</s>": 85,
3
+ "<s>": 84
4
  }
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "added_tokens_decoder": {
3
- "66": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
@@ -8,7 +8,7 @@
8
  "single_word": false,
9
  "special": false
10
  },
11
- "67": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
@@ -16,7 +16,7 @@
16
  "single_word": false,
17
  "special": false
18
  },
19
- "68": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": true,
@@ -24,7 +24,7 @@
24
  "single_word": false,
25
  "special": true
26
  },
27
- "69": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": true,
 
1
  {
2
  "added_tokens_decoder": {
3
+ "82": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
 
8
  "single_word": false,
9
  "special": false
10
  },
11
+ "83": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
 
16
  "single_word": false,
17
  "special": false
18
  },
19
+ "84": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": true,
 
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "85": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": true,
vocab.json CHANGED
@@ -1,70 +1,86 @@
1
  {
2
- "'": 4,
3
- "[PAD]": 67,
4
- "[UNK]": 66,
5
- "|": 40,
6
- "": 0,
7
- "": 53,
8
- "": 51,
9
- "": 17,
10
- "": 26,
11
- "": 52,
12
- "": 25,
13
- "": 33,
14
- "": 22,
15
- "": 16,
16
- "": 23,
17
- "": 65,
18
- "": 43,
19
- "": 21,
20
- "": 56,
21
- "": 1,
22
- "": 60,
23
- "": 45,
24
- "": 62,
25
- "": 3,
26
- "": 63,
27
- "": 24,
28
- "": 12,
29
- "": 18,
30
- "": 61,
31
- "": 11,
32
- "": 31,
33
- "": 29,
34
- "": 64,
35
- "": 46,
36
- "": 19,
37
- "": 41,
38
- "": 54,
39
- "": 35,
40
- "": 59,
41
- "": 44,
42
- "": 37,
43
- "": 57,
44
- "": 42,
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  "ฬ": 2,
46
- "อ": 49,
47
- "ฮ": 27,
48
- "ะ": 47,
49
- "ั": 15,
50
- "า": 9,
51
- "ำ": 48,
52
- "ิ": 14,
53
- "ี": 38,
54
- "ึ": 20,
55
- "ื": 28,
56
- "ุ": 10,
57
- "ู": 30,
58
- "เ": 58,
59
- "แ": 34,
60
- "โ": 32,
61
- "ใ": 50,
62
- "ไ": 6,
63
- "": 39,
64
- "": 13,
65
- "": 5,
66
- "": 55,
67
- "": 7,
68
- "": 8,
69
- "": 36
 
 
 
70
  }
 
1
  {
2
+ "'": 57,
3
+ "[PAD]": 83,
4
+ "[UNK]": 82,
5
+ "_": 65,
6
+ "a": 59,
7
+ "c": 20,
8
+ "e": 31,
9
+ "h": 25,
10
+ "i": 18,
11
+ "j": 71,
12
+ "n": 12,
13
+ "o": 44,
14
+ "s": 69,
15
+ "t": 53,
16
+ "|": 72,
17
+ "~": 46,
18
+ "": 60,
19
+ "": 64,
20
+ "": 41,
21
+ "": 52,
22
+ "": 42,
23
+ "": 66,
24
+ "": 70,
25
+ "": 9,
26
+ "": 11,
27
+ "": 15,
28
+ "": 32,
29
+ "": 37,
30
+ "": 28,
31
+ "": 49,
32
+ "": 33,
33
+ "": 61,
34
+ "": 81,
35
+ "": 56,
36
+ "": 39,
37
+ "": 58,
38
+ "": 76,
39
+ "": 26,
40
+ "": 35,
41
+ "": 48,
42
+ "": 13,
43
+ "": 36,
44
+ "": 16,
45
+ "พ": 6,
46
+ "ฟ": 40,
47
+ "ภ": 24,
48
+ "ม": 34,
49
+ "ย": 7,
50
+ "ร": 45,
51
+ "ฤ": 19,
52
+ "ล": 10,
53
+ "ว": 67,
54
+ "ศ": 14,
55
+ "ษ": 79,
56
+ "ส": 29,
57
+ "ห": 8,
58
  "ฬ": 2,
59
+ "อ": 68,
60
+ "ฮ": 47,
61
+ "ะ": 51,
62
+ "ั": 50,
63
+ "า": 63,
64
+ "ำ": 5,
65
+ "ิ": 0,
66
+ "ี": 3,
67
+ "ึ": 62,
68
+ "ื": 27,
69
+ "ุ": 1,
70
+ "ู": 17,
71
+ "เ": 21,
72
+ "แ": 43,
73
+ "โ": 80,
74
+ "ใ": 54,
75
+ "ไ": 73,
76
+ "": 55,
77
+ "": 22,
78
+ "": 74,
79
+ "": 23,
80
+ "": 77,
81
+ "": 78,
82
+ "": 75,
83
+ "์": 30,
84
+ "ํ": 38,
85
+ "’": 4
86
  }