# Tokenizer registry: one entry per sub-tokenizer, followed by token-id limits.
#
# NOTE(review): this file was recovered from a copy in which every line carried
# a trailing "| |" table-cell residue and all indentation had been stripped.
# The residue is removed and the nesting below is reconstructed from key order;
# confirm the placement of the three *_token_id keys against the code that
# loads this file.
tokenizers_info:
  # Each entry lists a name, a numeric id, two JSON paths (both point to the
  # same file in every entry here), and the start/end delimiter strings.
  - name: AA
    tokenizer_id: 0
    json_path: ./t5_tokenizer_AA_special.json
    modular_json_path: ./t5_tokenizer_AA_special.json
    start_delimiter: <start_AA>
    end_delimiter: <end_AA>
  - name: SMILES
    tokenizer_id: 1
    json_path: ./bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
    modular_json_path: ./bpe_tokenizer_trained_on_chembl_zinc_with_aug_4272372_samples_balanced_1_1.json
    start_delimiter: <start_SMILES>
    end_delimiter: <end_SMILES>
  - name: CELL_ATTRIBUTES
    tokenizer_id: 2
    json_path: ./cell_attributes_tokenizer.json
    modular_json_path: ./cell_attributes_tokenizer.json
    start_delimiter: <start_CELL_ATTRIBUTES>
    end_delimiter: <end_CELL_ATTRIBUTES>
  - name: GENE
    tokenizer_id: 3
    json_path: ./gene_tokenizer.json
    modular_json_path: ./gene_tokenizer.json
    start_delimiter: <start_GENE>
    end_delimiter: <end_GENE>
    # NOTE(review): assumed to belong to the GENE entry because it directly
    # follows it in the flattened original; presumably the lowest id assigned
    # to this tokenizer's tokens - verify.
    minimal_token_id: 5000
# NOTE(review): assumed to be top-level settings (siblings of tokenizers_info),
# presumably upper bounds on all token ids and on special-token ids - verify.
max_possible_token_id: 100000
max_special_token_id: 500