CodonTransformer / tokenizer.json
adibvafa's picture
Upload tokenizer
f8cf390 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Lowercase"
}
]
},
"pre_tokenizer": {
"type": "Sequence",
"pretokenizers": [
{
"type": "Split",
"pattern": {
"String": " "
},
"behavior": "Isolated",
"invert": false
},
{
"type": "Whitespace"
}
]
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
1
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
2
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": null,
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[UNK]": 0,
"[CLS]": 1,
"[SEP]": 2,
"[PAD]": 3,
"[MASK]": 4,
"a_unk": 5,
"c_unk": 6,
"d_unk": 7,
"e_unk": 8,
"f_unk": 9,
"g_unk": 10,
"h_unk": 11,
"i_unk": 12,
"k_unk": 13,
"l_unk": 14,
"m_unk": 15,
"n_unk": 16,
"p_unk": 17,
"q_unk": 18,
"r_unk": 19,
"s_unk": 20,
"t_unk": 21,
"v_unk": 22,
"w_unk": 23,
"y_unk": 24,
"__unk": 25,
"k_aaa": 26,
"n_aac": 27,
"k_aag": 28,
"n_aat": 29,
"t_aca": 30,
"t_acc": 31,
"t_acg": 32,
"t_act": 33,
"r_aga": 34,
"s_agc": 35,
"r_agg": 36,
"s_agt": 37,
"i_ata": 38,
"i_atc": 39,
"m_atg": 40,
"i_att": 41,
"q_caa": 42,
"h_cac": 43,
"q_cag": 44,
"h_cat": 45,
"p_cca": 46,
"p_ccc": 47,
"p_ccg": 48,
"p_cct": 49,
"r_cga": 50,
"r_cgc": 51,
"r_cgg": 52,
"r_cgt": 53,
"l_cta": 54,
"l_ctc": 55,
"l_ctg": 56,
"l_ctt": 57,
"e_gaa": 58,
"d_gac": 59,
"e_gag": 60,
"d_gat": 61,
"a_gca": 62,
"a_gcc": 63,
"a_gcg": 64,
"a_gct": 65,
"g_gga": 66,
"g_ggc": 67,
"g_ggg": 68,
"g_ggt": 69,
"v_gta": 70,
"v_gtc": 71,
"v_gtg": 72,
"v_gtt": 73,
"__taa": 74,
"y_tac": 75,
"__tag": 76,
"y_tat": 77,
"s_tca": 78,
"s_tcc": 79,
"s_tcg": 80,
"s_tct": 81,
"__tga": 82,
"c_tgc": 83,
"w_tgg": 84,
"c_tgt": 85,
"l_tta": 86,
"f_ttc": 87,
"l_ttg": 88,
"f_ttt": 89
}
}
}