# generate audio-text map for WenetSpeech4TTS
# and evaluate the vocab size
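#
# Expected on-disk layout per subset folder (inferred from the parsing code below;
# verify against your own WenetSpeech4TTS download):
#   <dataset_path>/<sub_path>/txts/<name>.txt   (first line: "<audio_name>\t<transcript>")
#   <dataset_path>/<sub_path>/wavs/<audio_name>.wav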

import os
import sys

sys.path.append(os.getcwd())

import json
from concurrent.futures import ProcessPoolExecutor
from importlib.resources import files
from tqdm import tqdm

import torchaudio
from datasets import Dataset

from f5_tts.model.utils import convert_char_to_pinyin


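# NOTE: the worker below reads the module-level globals `tokenizer` and `polyphone` that are
# set under `if __name__ == "__main__"`; this relies on fork-started ProcessPoolExecutor
# workers (the default on Linux) inheriting them.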
def deal_with_sub_path_files(dataset_path, sub_path):
    print(f"Dealing with: {sub_path}")

    text_dir = os.path.join(dataset_path, sub_path, "txts")
    audio_dir = os.path.join(dataset_path, sub_path, "wavs")
    text_files = os.listdir(text_dir)

    audio_paths, texts, durations = [], [], []
    for text_file in tqdm(text_files):
        with open(os.path.join(text_dir, text_file), "r", encoding="utf-8") as file:
            first_line = file.readline().split("\t")
        audio_nm = first_line[0]
        audio_path = os.path.join(audio_dir, audio_nm + ".wav")
        text = first_line[1].strip()

        audio_paths.append(audio_path)

        if tokenizer == "pinyin":
            texts.extend(convert_char_to_pinyin([text], polyphone=polyphone))
        elif tokenizer == "char":
            texts.append(text)

        audio, sample_rate = torchaudio.load(audio_path)
        durations.append(audio.shape[-1] / sample_rate)  # duration in seconds

    return audio_paths, texts, durations


def main():
    assert tokenizer in ["pinyin", "char"]

    audio_path_list, text_list, duration_list = [], [], []

    executor = ProcessPoolExecutor(max_workers=max_workers)
    futures = []
    for dataset_path in dataset_paths:
        sub_items = os.listdir(dataset_path)
        sub_paths = [item for item in sub_items if os.path.isdir(os.path.join(dataset_path, item))]
        for sub_path in sub_paths:
            futures.append(executor.submit(deal_with_sub_path_files, dataset_path, sub_path))
    for future in tqdm(futures, total=len(futures)):
        audio_paths, texts, durations = future.result()
        audio_path_list.extend(audio_paths)
        text_list.extend(texts)
        duration_list.extend(durations)
    executor.shutdown()

    os.makedirs(save_dir, exist_ok=True)

    print(f"\nSaving to {save_dir} ...")
    dataset = Dataset.from_dict({"audio_path": audio_path_list, "text": text_list, "duration": duration_list})
    dataset.save_to_disk(f"{save_dir}/raw", max_shard_size="2GB")  # arrow format

    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
        json.dump(
            {"duration": duration_list}, f, ensure_ascii=False
        )  # also save durations to a separate json, for convenient use with DynamicBatchSampler

    print("\nEvaluating vocab size (all characters and symbols / all phonemes) ...")
    text_vocab_set = set()
    for text in tqdm(text_list):
        text_vocab_set.update(list(text))

    # optionally add Latin alphabets, accented letters and symbols (useful if planning to fine-tune on de/fr etc.)
    if tokenizer == "pinyin":
        text_vocab_set.update([chr(i) for i in range(32, 127)] + [chr(i) for i in range(192, 256)])

    with open(f"{save_dir}/vocab.txt", "w", encoding="utf-8") as f:
        for vocab in sorted(text_vocab_set):
            f.write(vocab + "\n")
    print(f"\nFor {dataset_name}, sample count: {len(text_list)}")
    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}\n")


if __name__ == "__main__":
    max_workers = 32

    tokenizer = "pinyin"  # "pinyin" | "char"
    polyphone = True
    dataset_choice = 1  # 1: Premium, 2: Standard, 3: Basic

    dataset_name = (
        ["WenetSpeech4TTS_Premium", "WenetSpeech4TTS_Standard", "WenetSpeech4TTS_Basic"][dataset_choice - 1]
        + "_"
        + tokenizer
    )
    dataset_paths = [
        "<SOME_PATH>/WenetSpeech4TTS/Basic",
        "<SOME_PATH>/WenetSpeech4TTS/Standard",
        "<SOME_PATH>/WenetSpeech4TTS/Premium",
    ][-dataset_choice:]  # e.g. choosing Standard also reads the Premium folder; Basic reads all three
    save_dir = str(files("f5_tts").joinpath("../../")) + f"/data/{dataset_name}"
    print(f"\nChoose Dataset: {dataset_name}, will save to {save_dir}\n")

    main()

    # Results (if adding alphabets with accents and symbols):
    # WenetSpeech4TTS       Basic     Standard     Premium
    # samples count       3932473      1941220      407494
    # pinyin vocab size      1349         1348        1344   (no polyphone)
    #                           -            -        1459   (polyphone)
    # char   vocab size      5264         5219        5042

    # vocab size may differ slightly with different jieba / pypinyin versions (e.g. polyphone handling)
    # be careful when using a pretrained model: make sure vocab.txt matches the one it was trained with
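
    # To compare a newly generated vocab.txt against a pretrained model's vocab, a minimal
    # sketch (the pretrained path is hypothetical, not part of this script):
    #   pretrained = set(open("<PRETRAINED_DIR>/vocab.txt", encoding="utf-8").read().splitlines())
    #   current = set(open(f"{save_dir}/vocab.txt", encoding="utf-8").read().splitlines())
    #   print(current - pretrained)  # symbols unknown to the pretrained model (ideally empty)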