from datasets import load_dataset
from tqdm import tqdm

from models.bpe_trainer import BpeTrainer

# Load the raw (native-script) and phonemized CulturaX bn+hi corpora.
raw_ds = load_dataset("parquet", data_files={'train': 'data/culturaX_bnhi_500Kx2.parquet'})
raw_ds = raw_ds['train']
phn_ds = load_dataset("parquet", data_files={'train': 'data/culturaX_bnhi_500Kx2_phonemized.parquet'})
phn_ds = phn_ds['train']

# vocab_sizes = [size for size in range(2000, 34000, 2000)]
vocab_sizes = [16000]

for vocab_size in tqdm(vocab_sizes):
    # BpeTrainer is assumed to run training on instantiation and write the
    # trained tokenizer to output_dir.
    BpeTrainer(dataset=raw_ds, vocab_size=vocab_size, batch_size=50000,
               output_dir=f"trained_tokenizers/multi/multi_raw_bnhi_bpetokenizer_{vocab_size//1000}K")
    BpeTrainer(dataset=phn_ds, vocab_size=vocab_size, batch_size=50000,
               output_dir=f"trained_tokenizers/multi/multi_phn_bnhi_bpetokenizer_{vocab_size//1000}K")

# Vocab-size reasoning:
# - 8K for one language with a native-script tokenizer.
# - < 8K for one language with a phonemized tokenizer.
# - 16K for two languages (vocabularies are mutually exclusive, since each
#   script uses different characters).
# - How much lower than 16K can we go? The lower limit is 8K.
# - Anywhere between 8K and 16K; at 12K the phonemized tokenizer had the same
#   FS as the 16K one.

'''
Benchmarking how much time phonemization takes:
    NUM_SAMPLES = 50,000
    Convert to text --> run the phonemization script under `time` --> time/500000.
------------------------------------------------
Data prep:
    Native script: directly from Sangraha.
    Phonemization: HF dataset --> convert to text files and store in a dir
    --> phonemization script --> phonemized text files --> convert back to an
    HF dataset (parquet format).
------------------------------------------------
1st exp:
    Hi, Phn_Hi --> plot FS over vocab sizes 4K to 16K. Train 12 tokenizers.
    Ur, Phn_Ur --> plot FS over vocab sizes 4K to 16K.
2nd exp:
    HiUr, Phn_HiUr --> plot FS over vocab sizes 8K to 16K. 8 tokenizers in total.
'''
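
# --- Sketch: the data-prep pipeline from the notes above (HF dataset -->
# text files --> phonemization script --> phonemized text files --> HF
# parquet dataset). The "phonemize.py" invocation and the "text" column name
# are assumptions; substitute the actual phonemization script and column.
import os
import subprocess

from datasets import Dataset


def phonemize_dataset(ds, work_dir="phn_work", text_column="text"):
    os.makedirs(work_dir, exist_ok=True)
    in_path = os.path.join(work_dir, "input.txt")
    out_path = os.path.join(work_dir, "output.txt")
    # One sample per line so the external script can stream the file.
    with open(in_path, "w", encoding="utf-8") as f:
        for text in ds[text_column]:
            f.write(text.replace("\n", " ") + "\n")
    # Placeholder command; replace with the real phonemization script.
    subprocess.run(["python", "phonemize.py", in_path, out_path], check=True)
    with open(out_path, encoding="utf-8") as f:
        lines = [line.rstrip("\n") for line in f]
    # Back to an HF dataset, stored as parquet alongside the text files.
    phn = Dataset.from_dict({text_column: lines})
    phn.to_parquet(os.path.join(work_dir, "phonemized.parquet"))
    return phn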
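
# --- Sketch: the timing benchmark from the notes, using the
# phonemize_dataset helper above: phonemize NUM_SAMPLES and extrapolate to
# the full corpus. The 50K/500K figures come from the notes; the projection
# assumes phonemization time scales linearly with sample count.
import time


def benchmark_phonemization(ds, num_samples=50_000, total_samples=500_000):
    start = time.perf_counter()
    phonemize_dataset(ds.select(range(num_samples)), work_dir="bench_work")
    elapsed = time.perf_counter() - start
    per_sample = elapsed / num_samples
    print(f"{elapsed:.1f}s for {num_samples} samples; "
          f"~{per_sample * total_samples / 3600:.2f}h projected for {total_samples}")

# benchmark_phonemization(raw_ds)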
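
# --- Sketch: the FS-vs-vocab-size sweep for the experiments in the notes,
# assuming FS means fertility (subword tokens per whitespace word) and that
# BpeTrainer writes a tokenizers-compatible tokenizer.json into output_dir;
# both are assumptions, as is the 1,000-sample evaluation slice.
from tokenizers import Tokenizer


def fertility(tokenizer_path, texts):
    tok = Tokenizer.from_file(tokenizer_path)
    n_tokens = sum(len(tok.encode(t).ids) for t in texts)
    n_words = sum(len(t.split()) for t in texts)
    return n_tokens / n_words


def fertility_sweep(ds, prefix, low=4000, high=16000, step=2000):
    texts = ds.select(range(1000))["text"]
    return {
        vs: fertility(f"{prefix}_{vs//1000}K/tokenizer.json", texts)
        for vs in range(low, high + step, step)
    }

# fertility_sweep(raw_ds, "trained_tokenizers/multi/multi_raw_bnhi_bpetokenizer")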