# Hugging Face Space: vits-kbd-male / app.py
# Author: anzorq -- commit 2bfbd08 ("Update app.py"), 3.8 kB
import os
from TTS.utils.download import download_url
from TTS.utils.synthesizer import Synthesizer
import gradio as gr
import tempfile
import torch
import json
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import Vits, VitsCharacters
from TTS.tts.utils.text.tokenizer import TTSTokenizer
import numpy as np
from TTS.utils.audio.numpy_transforms import save_wav
# Longest input, in characters, that is passed on to synthesis.
MAX_TXT_LEN = 800

# Template for the local per-voice model directory; "{}" is filled with the gender.
BASE_DIR = "kbd-vits-tts-{}"

# Roots of the two Hugging Face model repositories the files are fetched from.
_MALE_REPO = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main"
_FEMALE_REPO = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main"

# PyTorch checkpoints and their configs.
MALE_MODEL_URL = _MALE_REPO + "/checkpoint_56000.pth"
MALE_CONFIG_URL = _MALE_REPO + "/config_35000.json"
FEMALE_MODEL_URL = _FEMALE_REPO + "/best_model_56351.pth"
FEMALE_CONFIG_URL = _FEMALE_REPO + "/config.json"

# ONNX exports.
# NOTE(review): the female ONNX file is addressed inside the *male* repository --
# confirm this is where it is actually hosted.
MALE_ONNX_MODEL_URL = _MALE_REPO + "/onnx/kbd_vits_male.onnx"
FEMALE_ONNX_MODEL_URL = _MALE_REPO + "/onnx/kbd_vits_female.onnx"
def download_model_and_config(gender):
    """Make sure the checkpoint and config for one voice exist locally.

    Args:
        gender: ``"male"`` selects the male URLs; any other value selects
            the female URLs. The value is also substituted into
            ``BASE_DIR`` to form the target directory name.

    Returns:
        The directory that holds ``model.pth`` and ``config.json``.
    """
    dir_path = BASE_DIR.format(gender)
    os.makedirs(dir_path, exist_ok=True)

    if gender == "male":
        sources = (
            ("model.pth", MALE_MODEL_URL),
            ("config.json", MALE_CONFIG_URL),
        )
    else:
        sources = (
            ("model.pth", FEMALE_MODEL_URL),
            ("config.json", FEMALE_CONFIG_URL),
        )

    # Decide per file, not per directory: if a previous run created the
    # directory but was interrupted before both files were fetched, the
    # missing file is downloaded now instead of being skipped forever.
    for filename, url in sources:
        if not os.path.exists(os.path.join(dir_path, filename)):
            download_url(url, dir_path, filename)

    return dir_path
# Fetch both voices at import time, male first, so that tts() finds them on disk.
for _gender in ("male", "female"):
    download_model_and_config(_gender)
def tts(text: str, voice: str = "Male", use_onnx: bool = True):
    """Synthesize speech for ``text`` and return the path of the WAV file.

    Args:
        text: Text to speak. Anything beyond ``MAX_TXT_LEN`` characters is
            dropped (a notice is printed).
        voice: ``"Male"`` selects the male model; any other value selects
            the female model.
        use_onnx: When true, run the ONNX export via ``Vits.inference_onnx``;
            otherwise run the PyTorch checkpoint via ``Synthesizer``.

    Returns:
        Path of a temporary ``.wav`` file. It is created with
        ``delete=False``, so it is not removed automatically.
    """
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")

    is_male = voice == "Male"
    model_dir = BASE_DIR.format("male" if is_male else "female")
    config_file = f"{model_dir}/config.json"

    # Replace capital "I" with the Cyrillic Palochka. The literal is written
    # as an escape (U+04CF, the lowercase form) because the character is
    # visually indistinguishable from Latin "I"/"l" and is easily corrupted
    # by re-encoding; the text is lowercased right after, so this is also the
    # form the capital Palochka (U+04C0) would end up in.
    text = text.replace("I", "\u04cf")
    text = text.lower()

    if use_onnx:
        onnx_model_url = MALE_ONNX_MODEL_URL if is_male else FEMALE_ONNX_MODEL_URL
        onnx_model_path = os.path.join(model_dir, "model.onnx")

        # load_onnx() is given a model path that is opened from disk, so the
        # remote file has to be fetched first; it is kept for later calls.
        if not os.path.exists(onnx_model_path):
            os.makedirs(model_dir, exist_ok=True)
            download_url(onnx_model_url, model_dir, "model.onnx")

        config = VitsConfig()
        config.load_json(config_file)

        vits = Vits.init_from_config(config)
        vits.load_onnx(onnx_model_path)

        # Add a leading batch axis: the ids become an array of shape (1, n).
        text_inputs = np.asarray(
            vits.tokenizer.text_to_ids(text),
            dtype=np.int64,
        )[None, :]
        audio = vits.inference_onnx(text_inputs)

        # Reserve a temp file name, then write after the handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            out_path = temp_file.name
        # NOTE(review): the sample rate is hard-coded; presumably it matches
        # the model's audio config -- confirm.
        save_wav(wav=audio[0], path=out_path, sample_rate=24000)
    else:
        synthesizer = Synthesizer(f"{model_dir}/model.pth", config_file)
        wavs = synthesizer.tts(text)

        # Reserve a temp file name, then write after the handle is closed.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            out_path = temp_file.name
        synthesizer.save_wav(wavs, out_path)

    return out_path
# Gradio UI: a text box, a voice selector and an ONNX toggle feed tts();
# the returned file path is played back by the audio component.
iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            label="Text",
            # Default sample sentence in Cyrillic script (restored from a
            # mis-decoded UTF-8 literal).
            value="Дауэ ущыт?",
        ),
        gr.Radio(
            choices=["Male", "Female"],
            value="Male",
            label="Voice",
        ),
        gr.Checkbox(
            label="Use ONNX",
            value=True,
        ),
    ],
    outputs=gr.Audio(label="Output", type='filepath'),
    title="KBD TTS",
    live=False,
)

iface.launch(share=False)