Create the tokenizer.json properly (with TemplateProcessing included).
Turns out, transformers overrides the TemplateProcessing post-processor at load time, so in Python land the bug is not seen:
https://github.com/huggingface/transformers/blame/main/src/transformers/models/llama/tokenization_llama_fast.py#L203-L205
In Rust land, however, the definition is taken exactly from the tokenizer.json file, so the TemplateProcessing post-processor is effectively missing.
Adding it fixes the issue in Rust land without affecting Python land (since the override is still in place).
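A quick way to see the difference is to load the same file through both paths: the standalone tokenizers library (the same Rust implementation, exposed through Python bindings) reads tokenizer.json verbatim, while transformers rebuilds the post-processor on load. A minimal sketch, assuming a hypothetical local directory ./llama that holds the tokenizer files:

from tokenizers import Tokenizer
from transformers import AutoTokenizer

# Rust-land behaviour: the post_processor is exactly what tokenizer.json says.
raw = Tokenizer.from_file("./llama/tokenizer.json")
print(raw.encode("Hello world").ids)
# Without the fix the template never inserts "<s>", so the ids do not start with 1.

# Python-land behaviour: transformers rebuilds the post-processor when loading,
# so the BOS token is prepended even with the incomplete tokenizer.json.
hf = AutoTokenizer.from_pretrained("./llama")
print(hf.encode("Hello world"))
# Starts with 1 (the id of "<s>") either way.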
tokenizer.json  (+31, -2)
@@ -128,7 +128,7 @@
       "rstrip": true,
       "normalized": false,
       "special": true
-    }
+    }
   ],
   "normalizer": {
     "type": "Sequence",
@@ -150,6 +150,12 @@
   "post_processor": {
     "type": "TemplateProcessing",
     "single": [
+      {
+        "SpecialToken": {
+          "id": "<s>",
+          "type_id": 0
+        }
+      },
       {
         "Sequence": {
           "id": "A",
@@ -158,12 +164,24 @@
       }
     ],
     "pair": [
+      {
+        "SpecialToken": {
+          "id": "<s>",
+          "type_id": 0
+        }
+      },
       {
         "Sequence": {
           "id": "A",
           "type_id": 0
         }
       },
+      {
+        "SpecialToken": {
+          "id": "<s>",
+          "type_id": 1
+        }
+      },
       {
         "Sequence": {
           "id": "B",
@@ -171,7 +189,17 @@
         }
       }
     ],
-    "special_tokens": {}
+    "special_tokens": {
+      "<s>": {
+        "id": "<s>",
+        "ids": [
+          1
+        ],
+        "tokens": [
+          "<s>"
+        ]
+      }
+    }
   },
   "decoder": {
     "type": "Sequence",
@@ -205,6 +233,7 @@
       "end_of_word_suffix": null,
       "fuse_unk": true,
       "byte_fallback": true,
+    "ignore_merges": false,
       "vocab": {
         "<unk>": 0,
         "<s>": 1,