ndeclarke's picture
Upload tokenizer
3e487d2 verified
{
"mal": {
"[PAD]": 72,
"[UNK]": 71,
"|": 0,
"ം": 1,
"ഃ": 2,
"അ": 3,
"ആ": 4,
"ഇ": 5,
"ഈ": 6,
"ഉ": 7,
"ഊ": 8,
"എ": 9,
"ഏ": 10,
"ഐ": 11,
"ഒ": 12,
"ഓ": 13,
"ഔ": 14,
"ക": 15,
"ഖ": 16,
"ഗ": 17,
"ഘ": 18,
"ങ": 19,
"ച": 20,
"ഛ": 21,
"ജ": 22,
"ഞ": 23,
"ട": 24,
"ഠ": 25,
"ഡ": 26,
"ഢ": 27,
"ണ": 28,
"ത": 29,
"ഥ": 30,
"ദ": 31,
"ധ": 32,
"ന": 33,
"പ": 34,
"ഫ": 35,
"ബ": 36,
"ഭ": 37,
"മ": 38,
"യ": 39,
"ര": 40,
"റ": 41,
"ല": 42,
"ള": 43,
"ഴ": 44,
"വ": 45,
"ശ": 46,
"ഷ": 47,
"സ": 48,
"ഹ": 49,
"ാ": 50,
"ി": 51,
"ീ": 52,
"ു": 53,
"ൂ": 54,
"ൃ": 55,
"െ": 56,
"േ": 57,
"ൈ": 58,
"ൊ": 59,
"ോ": 60,
"ൌ": 61,
"്": 62,
"ൗ": 63,
"ൺ": 64,
"ൻ": 65,
"ർ": 66,
"ൽ": 67,
"ൾ": 68,
"ൿ": 69,
"’": 70
},
"tam": {
"&": 1,
"[PAD]": 53,
"[UNK]": 52,
"_": 2,
"|": 0,
"ஃ": 3,
"அ": 4,
"ஆ": 5,
"இ": 6,
"ஈ": 7,
"உ": 8,
"ஊ": 9,
"எ": 10,
"ஏ": 11,
"ஐ": 12,
"ஒ": 13,
"ஓ": 14,
"ஔ": 15,
"க": 16,
"ங": 17,
"ச": 18,
"ஜ": 19,
"ஞ": 20,
"ட": 21,
"ண": 22,
"த": 23,
"ந": 24,
"ன": 25,
"ப": 26,
"ம": 27,
"ய": 28,
"ர": 29,
"ற": 30,
"ல": 31,
"ள": 32,
"ழ": 33,
"வ": 34,
"ஷ": 35,
"ஸ": 36,
"ஹ": 37,
"ா": 38,
"ி": 39,
"ீ": 40,
"ு": 41,
"ூ": 42,
"ெ": 43,
"ே": 44,
"ை": 45,
"ொ": 46,
"ோ": 47,
"ௌ": 48,
"்": 49,
"ௗ": 50,
"ഥ": 51
},
"tel": {
"[PAD]": 53,
"[UNK]": 52,
"|": 0,
"ం": 1,
"అ": 2,
"ఆ": 3,
"ఇ": 4,
"ఈ": 5,
"ఉ": 6,
"ఊ": 7,
"ఎ": 8,
"ఏ": 9,
"ఒ": 10,
"క": 11,
"ఖ": 12,
"గ": 13,
"ఘ": 14,
"చ": 15,
"జ": 16,
"ట": 17,
"డ": 18,
"ణ": 19,
"త": 20,
"థ": 21,
"ద": 22,
"ధ": 23,
"న": 24,
"ప": 25,
"ఫ": 26,
"బ": 27,
"భ": 28,
"మ": 29,
"య": 30,
"ర": 31,
"ల": 32,
"ళ": 33,
"వ": 34,
"శ": 35,
"ష": 36,
"స": 37,
"హ": 38,
"ా": 39,
"ి": 40,
"ీ": 41,
"ు": 42,
"ూ": 43,
"ృ": 44,
"ె": 45,
"ే": 46,
"ై": 47,
"ొ": 48,
"ో": 49,
"ౌ": 50,
"్": 51
},
"yor": {
"[PAD]": 43,
"[UNK]": 42,
"a": 1,
"b": 2,
"d": 3,
"e": 4,
"f": 5,
"g": 6,
"h": 7,
"i": 8,
"j": 9,
"k": 10,
"l": 11,
"m": 12,
"n": 13,
"o": 14,
"p": 15,
"r": 16,
"s": 17,
"t": 18,
"u": 19,
"w": 20,
"y": 21,
"|": 0,
"à": 22,
"á": 23,
"è": 24,
"é": 25,
"ì": 26,
"í": 27,
"ò": 28,
"ó": 29,
"ù": 30,
"ú": 31,
"ń": 32,
"ǹ": 33,
"̀": 34,
"́": 35,
"̄": 36,
"̣": 37,
"ṣ": 38,
"ẹ": 39,
"ọ": 40,
"ụ": 41
}
}