"""Train a SentencePiece tokenizer model on the full-sentences LM corpus.

Running this script reads the training text file given as ``input`` and
asks SentencePiece to write its output files using the ``spiece`` prefix
(per the SentencePiece docs: ``spiece.model`` and ``spiece.vocab``) in the
current working directory.
"""
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input="/researchdisk/lm_training_dataset_full_sentences/train.txt",
    model_prefix="spiece",
    vocab_size=32000,
    # 1.0 => every character seen in the corpus is kept in the vocabulary.
    character_coverage=1.0,
    # Special-token ids. Per the SentencePiece docs an id of -1 disables
    # that token, so no BOS piece is reserved here.
    # NOTE(review): pad=0 / eos=1 / unk=2 looks like the T5-style layout --
    # confirm against the downstream model config.
    pad_id=0,
    unk_id=2,
    eos_id=1,
    bos_id=-1,
    # Required by SentencePiece for very large corpora.
    train_extremely_large_corpus=True,
    num_threads=96,
    # Cap the number of sentences loaded for training; with shuffling on,
    # the subset is randomly sampled rather than taken from the file head.
    input_sentence_size=50000000,
    shuffle_input_sentence=True,
)