Aravindan committed
Commit 1950fb4
1 Parent(s): 7b99ce8

Upload tokenizer

Files changed (3):
  1. special_tokens_map.json +11 -11
  2. tokenizer.json +2 -2
  3. tokenizer_config.json +7 -62
special_tokens_map.json CHANGED
@@ -1,23 +1,23 @@
 {
   "additional_special_tokens": [
-    "<|im_start|>user",
-    "<|im_start|>assistant",
-    "<|im_start|>system",
-    "<|im_start|>function-call",
-    "<|im_start|>function-response",
-    "<|im_end|>",
-    "<s>",
-    "<pad>"
+    "<start_of_turn>",
+    "<end_of_turn>"
   ],
   "bos_token": {
-    "content": "<s>",
+    "content": "<bos>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
-  "eos_token": "<|im_end|>",
-  "pad_token": "<|im_end|>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<eos>",
   "unk_token": {
     "content": "<unk>",
     "lstrip": false,
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e1edfa481a0c0013cddd64b0d40d4e91e31073b043c4bd1f6db0ef6df5eeb9f8
-size 17519695
+oid sha256:cd1df914b205b134e52f15364524a919423b48c9056d725b45c5b89187541de0
+size 17518965
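
tokenizer.json is tracked with Git LFS, so this diff changes only the pointer file (the sha256 oid and byte size), not the ~17.5 MB serialized tokenizer itself; a plain git checkout without LFS yields the pointer text rather than the file. A sketch of fetching the resolved file with huggingface_hub; the repo_id is a placeholder, since the commit page does not name the repository:

from huggingface_hub import hf_hub_download

# Placeholder repo_id: substitute the actual model repository.
path = hf_hub_download(repo_id="user/model", filename="tokenizer.json")
print(path)  # local cache path of the resolved file, not the LFS pointer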
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "add_bos_token": true,
-  "add_eos_token": false,
+  "add_eos_token": true,
   "added_tokens_decoder": {
     "0": {
       "content": "<pad>",
@@ -1640,7 +1640,7 @@
       "normalized": false,
       "rstrip": false,
       "single_word": false,
-      "special": true
+      "special": false
     },
     "205": {
       "content": "<sub>",
@@ -1737,72 +1737,17 @@
       "rstrip": false,
       "single_word": false,
       "special": false
-    },
-    "256000": {
-      "content": "<|im_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "256001": {
-      "content": "<|im_start|>user",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "256002": {
-      "content": "<|im_start|>assistant",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "256003": {
-      "content": "<|im_start|>system",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "256004": {
-      "content": "<|im_start|>function-call",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "256005": {
-      "content": "<|im_start|>function-response",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
     }
   },
   "additional_special_tokens": [
-    "<|im_start|>user",
-    "<|im_start|>assistant",
-    "<|im_start|>system",
-    "<|im_start|>function-call",
-    "<|im_start|>function-response",
-    "<|im_end|>",
-    "<s>",
-    "<pad>"
+    "<start_of_turn>",
+    "<end_of_turn>"
   ],
-  "bos_token": "<s>",
-  "chat_template": "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}",
+  "bos_token": "<bos>",
   "clean_up_tokenization_spaces": false,
-  "eos_token": "<|im_end|>",
+  "eos_token": "<eos>",
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "<|im_end|>",
+  "pad_token": "<eos>",
   "sp_model_kwargs": {},
   "spaces_between_special_tokens": false,
   "tokenizer_class": "GemmaTokenizer",