|
|
|
|
|
|
|
import sys |
|
import os |
|
|
|
sys.path.append(os.getcwd()) |
|
|
|
import json |
|
from tqdm import tqdm |
|
from concurrent.futures import ProcessPoolExecutor |
|
|
|
import torchaudio |
|
from datasets import Dataset |
|
|
|
from model.utils import convert_char_to_pinyin |
|
|
|
|
|
def deal_with_sub_path_files(dataset_path, sub_path): |
|
print(f"Dealing with: {sub_path}") |
|
|
|
text_dir = os.path.join(dataset_path, sub_path, "txts") |
|
audio_dir = os.path.join(dataset_path, sub_path, "wavs") |
|
text_files = os.listdir(text_dir) |
|
|
|
audio_paths, texts, durations = [], [], [] |
|
for text_file in tqdm(text_files): |
|
with open(os.path.join(text_dir, text_file), "r", encoding="utf-8") as file: |
|
first_line = file.readline().split("\t") |
|
audio_nm = first_line[0] |
|
audio_path = os.path.join(audio_dir, audio_nm + ".wav") |
|
text = first_line[1].strip() |
|
|
|
audio_paths.append(audio_path) |
|
|
|
if tokenizer == "pinyin": |
|
texts.extend(convert_char_to_pinyin([text], polyphone=polyphone)) |
|
elif tokenizer == "char": |
|
texts.append(text) |
|
|
|
audio, sample_rate = torchaudio.load(audio_path) |
|
durations.append(audio.shape[-1] / sample_rate) |
|
|
|
return audio_paths, texts, durations |
|
|
|
|
|
def main(): |
|
assert tokenizer in ["pinyin", "char"] |
|
|
|
audio_path_list, text_list, duration_list = [], [], [] |
|
|
|
executor = ProcessPoolExecutor(max_workers=max_workers) |
|
futures = [] |
|
for dataset_path in dataset_paths: |
|
sub_items = os.listdir(dataset_path) |
|
sub_paths = [item for item in sub_items if os.path.isdir(os.path.join(dataset_path, item))] |
|
for sub_path in sub_paths: |
|
futures.append(executor.submit(deal_with_sub_path_files, dataset_path, sub_path)) |
|
for future in tqdm(futures, total=len(futures)): |
|
audio_paths, texts, durations = future.result() |
|
audio_path_list.extend(audio_paths) |
|
text_list.extend(texts) |
|
duration_list.extend(durations) |
|
executor.shutdown() |
|
|
|
if not os.path.exists("data"): |
|
os.makedirs("data") |
|
|
|
print(f"\nSaving to data/{dataset_name}_{tokenizer} ...") |
|
dataset = Dataset.from_dict({"audio_path": audio_path_list, "text": text_list, "duration": duration_list}) |
|
dataset.save_to_disk(f"data/{dataset_name}_{tokenizer}/raw", max_shard_size="2GB") |
|
|
|
with open(f"data/{dataset_name}_{tokenizer}/duration.json", "w", encoding="utf-8") as f: |
|
json.dump( |
|
{"duration": duration_list}, f, ensure_ascii=False |
|
) |
|
|
|
print("\nEvaluating vocab size (all characters and symbols / all phonemes) ...") |
|
text_vocab_set = set() |
|
for text in tqdm(text_list): |
|
text_vocab_set.update(list(text)) |
|
|
|
|
|
if tokenizer == "pinyin": |
|
text_vocab_set.update([chr(i) for i in range(32, 127)] + [chr(i) for i in range(192, 256)]) |
|
|
|
with open(f"data/{dataset_name}_{tokenizer}/vocab.txt", "w") as f: |
|
for vocab in sorted(text_vocab_set): |
|
f.write(vocab + "\n") |
|
print(f"\nFor {dataset_name}, sample count: {len(text_list)}") |
|
print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}\n") |
|
|
|
|
|
if __name__ == "__main__": |
|
max_workers = 32 |
|
|
|
tokenizer = "pinyin" |
|
polyphone = True |
|
dataset_choice = 1 |
|
|
|
dataset_name = ["WenetSpeech4TTS_Premium", "WenetSpeech4TTS_Standard", "WenetSpeech4TTS_Basic"][dataset_choice - 1] |
|
dataset_paths = [ |
|
"<SOME_PATH>/WenetSpeech4TTS/Basic", |
|
"<SOME_PATH>/WenetSpeech4TTS/Standard", |
|
"<SOME_PATH>/WenetSpeech4TTS/Premium", |
|
][-dataset_choice:] |
|
print(f"\nChoose Dataset: {dataset_name}\n") |
|
|
|
main() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|