Spaces:

anzorq
/

vits-kbd-male

Running

File size: 3,801 Bytes

import os
from TTS.utils.download import download_url
from TTS.utils.synthesizer import Synthesizer
import gradio as gr
import tempfile
import torch
import json
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import Vits, VitsCharacters
from TTS.tts.utils.text.tokenizer import TTSTokenizer
import numpy as np
from TTS.utils.audio.numpy_transforms import save_wav

# Longest input, in characters, that is synthesized; anything beyond it is cut off.
MAX_TXT_LEN = 800
# Template of the local per-voice model directory; "{}" receives "male" or "female".
BASE_DIR = "kbd-vits-tts-{}"

# "resolve/main" roots of the two Hugging Face model repositories.
_MALE_REPO_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main"
_FEMALE_REPO_URL = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main"

# PyTorch checkpoints and the configs that go with them.
MALE_MODEL_URL = f"{_MALE_REPO_URL}/checkpoint_56000.pth"
MALE_CONFIG_URL = f"{_MALE_REPO_URL}/config_35000.json"
FEMALE_MODEL_URL = f"{_FEMALE_REPO_URL}/best_model_56351.pth"
FEMALE_CONFIG_URL = f"{_FEMALE_REPO_URL}/config.json"

# ONNX exports of the two voices.
# NOTE(review): the female ONNX file is addressed inside the *male* repository --
# confirm that this is where it is actually hosted.
MALE_ONNX_MODEL_URL = f"{_MALE_REPO_URL}/onnx/kbd_vits_male.onnx"
FEMALE_ONNX_MODEL_URL = f"{_MALE_REPO_URL}/onnx/kbd_vits_female.onnx"

def download_model_and_config(gender):
    """Make sure the checkpoint and config of one voice are present on disk.

    Args:
        gender: Substituted into ``BASE_DIR`` to form the directory name. The
            value ``"male"`` selects the male URLs; any other value selects the
            female URLs.

    Returns:
        Path of the directory that holds ``model.pth`` and ``config.json``.
    """
    dir_path = BASE_DIR.format(gender)
    # exist_ok replaces the separate existence check and its check-then-create race.
    os.makedirs(dir_path, exist_ok=True)

    if gender == "male":
        model_url, config_url = MALE_MODEL_URL, MALE_CONFIG_URL
    else:
        model_url, config_url = FEMALE_MODEL_URL, FEMALE_CONFIG_URL

    for url, filename in ((model_url, "model.pth"), (config_url, "config.json")):
        # Reuse files from an earlier run instead of fetching them again; Coqui's
        # download_url rejects an already existing target unless resume=True, so
        # an unconditional call would break every start after the first one.
        if not os.path.exists(os.path.join(dir_path, filename)):
            download_url(url, dir_path, filename)
    return dir_path

# Fetch both voices once, when the module is loaded, male first.
for _voice_gender in ("male", "female"):
    download_model_and_config(_voice_gender)

def _new_temp_wav_path():
    """Reserve a temporary ``.wav`` file and return its path.

    The handle is closed before the path is handed out so the caller can reopen
    the file for writing on every platform; ``delete=False`` keeps the file on
    disk after the handle is closed.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        out_path = temp_file.name
    return out_path


def _synthesize_onnx(text, voice, model_dir, config_file):
    """Synthesize ``text`` with the ONNX export of ``voice``; return the WAV path."""
    onnx_model_url = MALE_ONNX_MODEL_URL if voice == "Male" else FEMALE_ONNX_MODEL_URL

    # ONNX Runtime loads a model from a local file (or bytes), not from a URL,
    # so fetch the export once into the voice directory and load it from there.
    onnx_model_path = os.path.join(model_dir, "model.onnx")
    if not os.path.exists(onnx_model_path):
        download_url(onnx_model_url, model_dir, "model.onnx")

    config = VitsConfig()
    config.load_json(config_file)

    vits = Vits.init_from_config(config)
    vits.load_onnx(onnx_model_path)

    # Token ids come from the model's own tokenizer; [None, :] adds the leading
    # batch axis.
    text_inputs = np.asarray(
        vits.tokenizer.text_to_ids(text),
        dtype=np.int64,
    )[None, :]
    audio = vits.inference_onnx(text_inputs)

    out_path = _new_temp_wav_path()
    # NOTE(review): the sample rate is hard-coded; confirm it matches the rate
    # stored in config.json.
    save_wav(wav=audio[0], path=out_path, sample_rate=24000)
    return out_path


def _synthesize_pytorch(text, model_dir, config_file):
    """Synthesize ``text`` with the PyTorch checkpoint; return the WAV path."""
    synthesizer = Synthesizer(f"{model_dir}/model.pth", config_file)
    wavs = synthesizer.tts(text)

    # Write only after the temporary handle is closed, like the ONNX path does.
    out_path = _new_temp_wav_path()
    synthesizer.save_wav(wavs, out_path)
    return out_path


def tts(text: str, voice: str = "Male", use_onnx: bool = True):
    """Convert text to speech and return the path of the generated WAV file.

    Args:
        text: Text to speak. Input longer than ``MAX_TXT_LEN`` characters is
            truncated and a notice is printed.
        voice: ``"Male"`` selects the male model; any other value selects the
            female model.
        use_onnx: When true, run the ONNX export of the model; otherwise run
            the PyTorch checkpoint through ``Synthesizer``.

    Returns:
        Path of a temporary ``.wav`` file. The file is not deleted by this
        function.
    """
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")

    model_dir = BASE_DIR.format("male" if voice == "Male" else "female")
    config_file = f"{model_dir}/config.json"

    # Replace capital "I" with the "Palochka" symbol. This has to happen before
    # lowercasing, which would otherwise turn the "I" into a plain "i".
    text = text.replace("I", "ӏ")
    text = text.lower()

    if use_onnx:
        return _synthesize_onnx(text, voice, model_dir, config_file)
    return _synthesize_pytorch(text, model_dir, config_file)

# Input widgets, listed in the positional order of tts(text, voice, use_onnx).
text_box = gr.Textbox(label="Text", value="Дауэ ущыт?")
voice_radio = gr.Radio(choices=["Male", "Female"], value="Male", label="Voice")
onnx_checkbox = gr.Checkbox(label="Use ONNX", value=True)

# tts() hands back a file path, hence type='filepath'.
audio_output = gr.Audio(label="Output", type='filepath')

# live=False: the interface is not put into live mode.
iface = gr.Interface(
    fn=tts,
    inputs=[text_box, voice_radio, onnx_checkbox],
    outputs=audio_output,
    title="KBD TTS",
    live=False,
)

# Start the app; share=False requests no public share link.
iface.launch(share=False)