File size: 3,801 Bytes
6eb1048
 
 
 
 
4d7078a
 
 
 
 
 
 
 
6eb1048
 
27e87f7
 
 
 
 
4d7078a
 
6eb1048
27e87f7
 
 
 
 
 
 
 
 
6eb1048
27e87f7
 
6eb1048
4d7078a
6eb1048
 
 
2bfbd08
 
 
6eb1048
4d7078a
 
1d3c496
4d7078a
 
27e87f7
4d7078a
2bfbd08
6eb1048
4d7078a
 
 
2bfbd08
4d7078a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2bfbd08
4d7078a
 
 
 
 
 
 
 
6eb1048
 
 
27e87f7
 
 
d7b3885
27e87f7
 
 
4d7078a
27e87f7
4d7078a
 
 
 
 
27e87f7
6eb1048
 
 
 
 
27e87f7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
from TTS.utils.download import download_url
from TTS.utils.synthesizer import Synthesizer
import gradio as gr
import tempfile
import torch
import json
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import Vits, VitsCharacters
from TTS.tts.utils.text.tokenizer import TTSTokenizer
import numpy as np
from TTS.utils.audio.numpy_transforms import save_wav

# Upper bound on the number of input characters that get synthesized.
MAX_TXT_LEN = 800

# Template for the local model directory; "{}" receives "male" or "female".
BASE_DIR = "kbd-vits-tts-{}"

# Common "resolve/main" prefixes of the two Hugging Face model repositories.
_MALE_REPO = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main"
_FEMALE_REPO = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main"

# PyTorch checkpoints and their configs.
MALE_MODEL_URL = _MALE_REPO + "/checkpoint_56000.pth"
MALE_CONFIG_URL = _MALE_REPO + "/config_35000.json"
FEMALE_MODEL_URL = _FEMALE_REPO + "/best_model_56351.pth"
FEMALE_CONFIG_URL = _FEMALE_REPO + "/config.json"

# ONNX exports.
MALE_ONNX_MODEL_URL = _MALE_REPO + "/onnx/kbd_vits_male.onnx"
# NOTE(review): the female ONNX file is addressed inside the *male* repository;
# confirm that is where it is actually hosted.
FEMALE_ONNX_MODEL_URL = _MALE_REPO + "/onnx/kbd_vits_female.onnx"

def download_model_and_config(gender):
    """Ensure the checkpoint and config for ``gender`` exist locally.

    The files are stored as ``model.pth`` and ``config.json`` inside
    ``BASE_DIR.format(gender)``. A file that is already present is left
    untouched, so calling this repeatedly (e.g. on every app restart) is
    idempotent and does not depend on how ``download_url`` treats an
    existing destination file.

    Args:
        gender: ``"male"`` selects the male URLs; any other value selects
            the female URLs.

    Returns:
        The directory path that holds the two files.
    """
    dir_path = BASE_DIR.format(gender)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir_path, exist_ok=True)

    is_male = gender == "male"
    targets = (
        (MALE_MODEL_URL if is_male else FEMALE_MODEL_URL, "model.pth"),
        (MALE_CONFIG_URL if is_male else FEMALE_CONFIG_URL, "config.json"),
    )
    for url, filename in targets:
        # Only fetch what is missing.
        if not os.path.exists(os.path.join(dir_path, filename)):
            download_url(url, dir_path, filename)
    return dir_path

# Fetch both voices once at import time so requests never wait on a download.
for _gender in ("male", "female"):
    download_model_and_config(_gender)

def tts(text: str, voice: str = "Male", use_onnx: bool = True):
    """Synthesize speech for ``text`` and return the path of a WAV file.

    Args:
        text: Text to speak. Anything past ``MAX_TXT_LEN`` characters is
            dropped (a notice is printed).
        voice: ``"Male"`` selects the male model; any other value selects
            the female model.
        use_onnx: When true, run the exported ONNX model; otherwise run the
            PyTorch checkpoint through ``Synthesizer``.

    Returns:
        Path of a temporary ``.wav`` file. The file is created with
        ``delete=False`` and is not removed by this function.
    """
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")

    is_male = voice == "Male"
    model_dir = BASE_DIR.format("male" if is_male else "female")
    config_file = f"{model_dir}/config.json"

    text = text.replace("I", "ӏ")  # Replace capital "I" with "Palochka" symbol
    text = text.lower()

    if use_onnx:
        onnx_model_url = MALE_ONNX_MODEL_URL if is_male else FEMALE_ONNX_MODEL_URL

        # load_onnx needs a file on disk, not a URL: fetch the export once
        # into the model directory and reuse it on later calls.
        onnx_filename = "model.onnx"
        onnx_path = os.path.join(model_dir, onnx_filename)
        if not os.path.exists(onnx_path):
            os.makedirs(model_dir, exist_ok=True)
            download_url(onnx_model_url, model_dir, onnx_filename)

        config = VitsConfig()
        config.load_json(config_file)

        vits = Vits.init_from_config(config)
        vits.load_onnx(onnx_path)

        # Add a leading batch axis: the ids become a (1, n_tokens) int64 array.
        text_inputs = np.asarray(
            vits.tokenizer.text_to_ids(text),
            dtype=np.int64,
        )[None, :]
        audio = vits.inference_onnx(text_inputs)

        # Reserve a temporary path, then write after the handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            out_path = temp_file.name
        save_wav(wav=audio[0], path=out_path, sample_rate=24000)
    else:
        # Synthesize with the PyTorch checkpoint.
        synthesizer = Synthesizer(f"{model_dir}/model.pth", config_file)
        wavs = synthesizer.tts(text)

        # Reserve a temporary path, then write after the handle is closed so
        # the file is not open twice (which fails on some platforms).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            out_path = temp_file.name
        synthesizer.save_wav(wavs, out_path)

    return out_path

# Input widgets, in the same order as the parameters of tts().
text_box = gr.Textbox(label="Text", value="Дауэ ущыт?")
voice_radio = gr.Radio(choices=["Male", "Female"], value="Male", label="Voice")
onnx_checkbox = gr.Checkbox(label="Use ONNX", value=True)

# tts() returns a file path, so the audio component is configured for paths.
audio_output = gr.Audio(label="Output", type="filepath")

iface = gr.Interface(
    fn=tts,
    inputs=[text_box, voice_radio, onnx_checkbox],
    outputs=audio_output,
    title="KBD TTS",
    live=False,
)

# Start the web UI without requesting a public share link.
iface.launch(share=False)