diff --git "a/tokenizer.json" "b/tokenizer.json" --- "a/tokenizer.json" +++ "b/tokenizer.json" @@ -12,14 +12,14 @@ }, "direction": "Right", "pad_to_multiple_of": null, - "pad_id": 1, + "pad_id": 0, "pad_type_id": 0, - "pad_token": "" + "pad_token": "[PAD]" }, "added_tokens": [ { "id": 0, - "content": "", + "content": "[PAD]", "single_word": false, "lstrip": false, "rstrip": false, @@ -28,7 +28,7 @@ }, { "id": 1, - "content": "", + "content": "[UNK]", "single_word": false, "lstrip": false, "rstrip": false, @@ -37,7 +37,7 @@ }, { "id": 2, - "content": "", + "content": "[MASK]", "single_word": false, "lstrip": false, "rstrip": false, @@ -46,7 +46,7 @@ }, { "id": 3, - "content": "", + "content": "[CLS]", "single_word": false, "lstrip": false, "rstrip": false, @@ -55,54 +55,115 @@ }, { "id": 4, - "content": "", + "content": "[SEP]", "single_word": false, - "lstrip": true, + "lstrip": false, "rstrip": false, "normalized": false, "special": true } ], - "normalizer": null, + "normalizer": { + "type": "BertNormalizer", + "clean_text": true, + "handle_chinese_chars": true, + "strip_accents": null, + "lowercase": true + }, "pre_tokenizer": { - "type": "ByteLevel", - "add_prefix_space": false, - "trim_offsets": true, - "use_regex": true + "type": "BertPreTokenizer" }, "post_processor": { - "type": "RobertaProcessing", - "sep": [ - "", - 2 + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "[CLS]", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "[SEP]", + "type_id": 0 + } + } ], - "cls": [ - "", - 0 + "pair": [ + { + "SpecialToken": { + "id": "[CLS]", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "[SEP]", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + }, + { + "SpecialToken": { + "id": "[SEP]", + "type_id": 1 + } + } ], - "trim_offsets": true, - "add_prefix_space": false + "special_tokens": { + "[CLS]": { + "id": "[CLS]", + "ids": [ + 3 + ], + "tokens": [ + "[CLS]" + ] + }, + "[SEP]": { + "id": "[SEP]", + "ids": [ + 4 + ], + "tokens": [ + "[SEP]" + ] + } + } }, "decoder": { - "type": "ByteLevel", - "add_prefix_space": true, - "trim_offsets": true, - "use_regex": true + "type": "WordPiece", + "prefix": "##", + "cleanup": true }, "model": { - "type": "BPE", - "dropout": null, - "unk_token": null, - "continuing_subword_prefix": "", - "end_of_word_suffix": "", - "fuse_unk": false, - "byte_fallback": false, + "type": "WordPiece", + "unk_token": "[UNK]", + "continuing_subword_prefix": "##", + "max_input_chars_per_word": 100, "vocab": { - "": 0, - "": 1, - "": 2, - "": 3, - "": 4, + "[PAD]": 0, + "[UNK]": 1, + "[MASK]": 2, + "[CLS]": 3, + "[SEP]": 4, "!": 5, "\"": 6, "#": 7, @@ -135,103710 +196,31892 @@ ">": 34, "?": 35, "@": 36, - "A": 37, - "B": 38, - "C": 39, - "D": 40, - "E": 41, - "F": 42, - "G": 43, - "H": 44, - "I": 45, - "J": 46, - "K": 47, - "L": 48, - "M": 49, - "N": 50, - "O": 51, - "P": 52, - "Q": 53, - "R": 54, - "S": 55, - "T": 56, - "U": 57, - "V": 58, - "W": 59, - "X": 60, - "Y": 61, - "Z": 62, - "[": 63, - "\\": 64, - "]": 65, - "^": 66, - "_": 67, - "`": 68, - "a": 69, - "b": 70, - "c": 71, - "d": 72, - "e": 73, - "f": 74, - "g": 75, - "h": 76, - "i": 77, - "j": 78, - "k": 79, - "l": 80, - "m": 81, - "n": 82, - "o": 83, - "p": 84, - "q": 85, - "r": 86, - "s": 87, - "t": 88, - "u": 89, - "v": 90, - "w": 91, - "x": 92, - "y": 93, - "z": 94, - "{": 95, - "|": 96, - "}": 97, - "~": 98, - "¡": 99, - "¢": 100, - "£": 101, - "¤": 102, - "¥": 103, - "¦": 104, - "§": 105, - "¨": 106, - "©": 107, - "ª": 108, - "«": 109, - "¬": 110, - "®": 111, - "¯": 112, - "°": 113, - "±": 114, - "²": 115, - "³": 116, - "´": 117, - "µ": 118, - "¶": 119, - "·": 120, - "¸": 121, - "¹": 122, - "º": 123, - "»": 124, - "¼": 125, - "½": 126, - "¾": 127, - "¿": 128, - "À": 129, - "Á": 130, - "Â": 131, - "Ã": 132, - "Ä": 133, - "Å": 134, - "Æ": 135, - "Ç": 136, - "È": 137, - "É": 138, - "Ê": 139, - "Ë": 140, - "Ì": 141, - "Í": 142, - "Î": 143, - "Ï": 144, - "Ð": 145, - "Ñ": 146, - "Ò": 147, - "Ó": 148, - "Ô": 149, - "Õ": 150, - "Ö": 151, - "×": 152, - "Ø": 153, - "Ù": 154, - "Ú": 155, - "Û": 156, - "Ü": 157, - "Ý": 158, - "Þ": 159, - "ß": 160, - "à": 161, - "á": 162, - "â": 163, - "ã": 164, - "ä": 165, - "å": 166, - "æ": 167, - "ç": 168, - "è": 169, - "é": 170, - "ê": 171, - "ë": 172, - "ì": 173, - "í": 174, - "î": 175, - "ï": 176, - "ð": 177, - "ñ": 178, - "ò": 179, - "ó": 180, - "ô": 181, - "õ": 182, - "ö": 183, - "÷": 184, - "ø": 185, - "ù": 186, - "ú": 187, - "û": 188, - "ü": 189, - "ý": 190, - "þ": 191, - "ÿ": 192, - "Ā": 193, - "ā": 194, - "Ă": 195, - "ă": 196, - "Ą": 197, - "ą": 198, - "Ć": 199, - "ć": 200, - "Ĉ": 201, - "ĉ": 202, - "Ċ": 203, - "ċ": 204, - "Č": 205, - "č": 206, - "Ď": 207, - "ď": 208, - "Đ": 209, - "đ": 210, - "Ē": 211, - "ē": 212, - "Ĕ": 213, - "ĕ": 214, - "Ė": 215, - "ė": 216, - "Ę": 217, - "ę": 218, - "Ě": 219, - "ě": 220, - "Ĝ": 221, - "ĝ": 222, - "Ğ": 223, - "ğ": 224, - "Ġ": 225, - "ġ": 226, - "Ģ": 227, - "ģ": 228, - "Ĥ": 229, - "ĥ": 230, - "Ħ": 231, - "ħ": 232, - "Ĩ": 233, - "ĩ": 234, - "Ī": 235, - "ī": 236, - "Ĭ": 237, - "ĭ": 238, - "Į": 239, - "į": 240, - "İ": 241, - "ı": 242, - "IJ": 243, - "ij": 244, - "Ĵ": 245, - "ĵ": 246, - "Ķ": 247, - "ķ": 248, - "ĸ": 249, - "Ĺ": 250, - "ĺ": 251, - "Ļ": 252, - "ļ": 253, - "Ľ": 254, - "ľ": 255, - "Ŀ": 256, - "ŀ": 257, - "Ł": 258, - "ł": 259, - "Ń": 260, - "an": 261, - "Ġd": 262, - "er": 263, - "en": 264, - "ar": 265, - "Ġm": 266, - "la": 267, - "ang": 268, - "Ġs": 269, - "Ġp": 270, - "in": 271, - "at": 272, - "Ġk": 273, - "Ġt": 274, - "Ġb": 275, - "da": 276, - "Ġdi": 277, - "un": 278, - "as": 279, - "kan": 280, - "em": 281, - "ah": 282, - "al": 283, - "ya": 284, - "am": 285, - "Ġse": 286, - "ada": 287, - "Ġmen": 288, - "si": 289, - "yang": 290, - "Ġdan": 291, - "Ġyang": 292, - "tu": 293, - "on": 294, - "ga": 295, - "ĠS": 296, - "ak": 297, - "ari": 298, - "lah": 299, - "il": 300, - "es": 301, - "or": 302, - "di": 303, - "Ġke": 304, - "is": 305, - "Ġber": 306, - "ik": 307, - "ĠP": 308, - "eng": 309, - "ĠA": 310, - "bu": 311, - "Ġter": 312, - "us": 313, - "ta": 314, - "ol": 315, - "ing": 316, - "el": 317, - "um": 318, - "ĠK": 319, - "ur": 320, - "Ġin": 321, - "ĠM": 322, - "Ġdari": 323, - "ara": 324, - "ba": 325, - "ti": 326, - "nya": 327, - "lam": 328, - "ap": 329, - "ĠB": 330, - "Ġper": 331, - "ek": 332, - "Ġini": 333, - "akan": 334, - "Ġmem": 335, - "it": 336, - "Ġ1": 337, - "uk": 338, - "ĠI": 339, - "ja": 340, - "ĠD": 341, - "Ġ\"": 342, - "ia": 343, - "ĠT": 344, - "Ġada": 345, - "engan": 346, - "Ġpada": 347, - "Ġj": 348, - "up": 349, - "gi": 350, - "tuk": 351, - "Ġdengan": 352, - "Ġh": 353, - "im": 354, - "ul": 355, - "ir": 356, - "alam": 357, - "om": 358, - "Ġ(": 359, - "et": 360, - "asi": 361, - "ut": 362, - "ung": 363, - "ama": 364, - "Ġadalah": 365, - "Ġun": 366, - "Ġmer": 367, - "Ġ2": 368, - "Ġl": 369, - "Ġmeng": 370, - "Ġuntuk": 371, - "ro": 372, - "eb": 373, - "Ġdalam": 374, - "ri": 375, - "esi": 376, - "anya": 377, - "eh": 378, - "ĠC": 379, - "ĠJ": 380, - "Ġpen": 381, - "ter": 382, - "ahun": 383, - "ĠR": 384, - "Ġr": 385, - "gai": 386, - "se": 387, - "ela": 388, - "Ġn": 389, - "ata": 390, - "ĠN": 391, - "Ġf": 392, - "Ġtahun": 393, - "Ġla": 394, - "ĠL": 395, - "oleh": 396, - "wa": 397, - "bagai": 398, - "Ġba": 399, - "Ġg": 400, - "id": 401, - "per": 402, - "Ġoleh": 403, - "Ġa": 404, - "pat": 405, - "ĠH": 406, - "Ġ20": 407, - "Ġju": 408, - "ĠIn": 409, - "emb": 410, - "Ġ19": 411, - "