Create the tokenizer.json properly (with TemplateProcessing included).
Turns out, transformers overrides the TemplateProcessing post-processor at load time, so in Python land the bug is not seen:
https://github.com/huggingface/transformers/blame/main/src/transformers/models/llama/tokenization_llama_fast.py#L203-L205
In Rust land, however, the definition is taken exactly from the tokenizer.json file, so the TemplateProcessing post-processor is effectively missing.
Adding it fixes the issue in Rust land without affecting Python land (since the override is still in place).
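A quick way to see the difference is to load the same file through both paths: the standalone tokenizers library (the same Rust implementation, exposed through Python bindings) reads tokenizer.json verbatim, while transformers rebuilds the post-processor on load. A minimal sketch, assuming a hypothetical local directory ./llama that holds the tokenizer files:

from tokenizers import Tokenizer
from transformers import AutoTokenizer

# Rust-land behaviour: the post_processor is exactly what tokenizer.json says.
raw = Tokenizer.from_file("./llama/tokenizer.json")
print(raw.encode("Hello world").ids)
# Without the fix the template never inserts "<s>", so the ids do not start with 1.

# Python-land behaviour: transformers rebuilds the post-processor when loading,
# so the BOS token is prepended even with the incomplete tokenizer.json.
hf = AutoTokenizer.from_pretrained("./llama")
print(hf.encode("Hello world"))
# Starts with 1 (the id of "<s>") either way.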
tokenizer.json  (+31, -2)
@@ -128,7 +128,7 @@
       "rstrip": true,
       "normalized": false,
       "special": true
-    }
+    }
   ],
   "normalizer": {
     "type": "Sequence",
@@ -150,6 +150,12 @@
   "post_processor": {
     "type": "TemplateProcessing",
     "single": [
+      {
+        "SpecialToken": {
+          "id": "<s>",
+          "type_id": 0
+        }
+      },
       {
         "Sequence": {
           "id": "A",
@@ -158,12 +164,24 @@
       }
     ],
     "pair": [
+      {
+        "SpecialToken": {
+          "id": "<s>",
+          "type_id": 0
+        }
+      },
       {
         "Sequence": {
           "id": "A",
           "type_id": 0
         }
       },
+      {
+        "SpecialToken": {
+          "id": "<s>",
+          "type_id": 1
+        }
+      },
       {
         "Sequence": {
           "id": "B",
@@ -171,7 +189,17 @@
         }
       }
     ],
-    "special_tokens": {}
+    "special_tokens": {
+      "<s>": {
+        "id": "<s>",
+        "ids": [
+          1
+        ],
+        "tokens": [
+          "<s>"
+        ]
+      }
+    }
   },
   "decoder": {
     "type": "Sequence",
@@ -205,6 +233,7 @@
       "end_of_word_suffix": null,
       "fuse_unk": true,
       "byte_fallback": true,
+    "ignore_merges": false,
       "vocab": {
         "<unk>": 0,
         "<s>": 1,