Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,6 +3,15 @@ from TTS.utils.download import download_url
|
|
3 |
from TTS.utils.synthesizer import Synthesizer
|
4 |
import gradio as gr
|
5 |
import tempfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
MAX_TXT_LEN = 800
|
8 |
BASE_DIR = "kbd-vits-tts-{}"
|
@@ -10,6 +19,8 @@ MALE_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main/c
|
|
10 |
MALE_CONFIG_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main/config_35000.json"
|
11 |
FEMALE_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main/best_model_56351.pth"
|
12 |
FEMALE_CONFIG_URL = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main/config.json"
|
|
|
|
|
13 |
|
14 |
def download_model_and_config(gender):
|
15 |
dir_path = BASE_DIR.format(gender)
|
@@ -24,24 +35,61 @@ def download_model_and_config(gender):
|
|
24 |
download_model_and_config("male")
|
25 |
download_model_and_config("female")
|
26 |
|
27 |
-
def tts(text: str, voice: str="Male"):
|
28 |
if len(text) > MAX_TXT_LEN:
|
29 |
text = text[:MAX_TXT_LEN]
|
30 |
print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
|
31 |
print(text)
|
32 |
|
33 |
-
text = text.replace("I", "ӏ")
|
|
|
34 |
|
35 |
-
|
|
|
|
|
|
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
wavs = synthesizer.tts(text)
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
iface = gr.Interface(
|
47 |
fn=tts,
|
@@ -52,9 +100,13 @@ iface = gr.Interface(
|
|
52 |
),
|
53 |
gr.Radio(
|
54 |
choices=["Male", "Female"],
|
55 |
-
value="Male",
|
56 |
label="Voice"
|
57 |
-
)
|
|
|
|
|
|
|
|
|
58 |
],
|
59 |
outputs=gr.Audio(label="Output", type='filepath'),
|
60 |
title="KBD TTS",
|
|
|
3 |
from TTS.utils.synthesizer import Synthesizer
|
4 |
import gradio as gr
|
5 |
import tempfile
|
6 |
+
import torch
|
7 |
+
import onnxruntime as ort
|
8 |
+
import json
|
9 |
+
from TTS.tts.utils.synthesis import synthesis
|
10 |
+
from TTS.tts.configs.vits_config import VitsConfig
|
11 |
+
from TTS.tts.models.vits import Vits, VitsCharacters
|
12 |
+
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
13 |
+
import numpy as np
|
14 |
+
from TTS.utils.audio.numpy_transforms import save_wav
|
15 |
|
16 |
MAX_TXT_LEN = 800
|
17 |
BASE_DIR = "kbd-vits-tts-{}"
|
|
|
19 |
MALE_CONFIG_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main/config_35000.json"
|
20 |
FEMALE_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main/best_model_56351.pth"
|
21 |
FEMALE_CONFIG_URL = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main/config.json"
|
22 |
+
MALE_ONNX_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main/onnx/kbd_vits_male.onnx"
|
23 |
+
# NOTE(review): this URL points into the "kbd-vits-tts-male" repository while the
# other FEMALE_* URLs use "kbd-vits-tts-female" — confirm this is intentional.
FEMALE_ONNX_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main/onnx/kbd_vits_female.onnx"
|
24 |
|
25 |
def download_model_and_config(gender):
|
26 |
dir_path = BASE_DIR.format(gender)
|
|
|
35 |
download_model_and_config("male")
|
36 |
download_model_and_config("female")
|
37 |
|
38 |
+
def _ensure_local_onnx(model_dir: str, onnx_url: str) -> str:
    """Return a local path to the ONNX model at *onnx_url*, downloading it once."""
    # Local import: only part of the file's top-level import list is visible here.
    import os

    filename = os.path.basename(onnx_url)
    onnx_path = os.path.join(model_dir, filename)
    # Check first so an already-downloaded model is reused rather than fetched
    # again on every request.
    if not os.path.exists(onnx_path):
        os.makedirs(model_dir, exist_ok=True)
        download_url(onnx_url, model_dir, filename=filename)
    return onnx_path


def _new_temp_wav_path() -> str:
    """Create an empty, persistent temporary ``.wav`` file and return its path."""
    # The handle is closed before anything writes to the path, so the writer
    # never competes with an open handle on the same file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        return temp_file.name


def tts(text: str, voice: str = "Male", use_onnx: bool = True):
    """Synthesize *text* to a temporary WAV file and return the file path.

    Args:
        text: Input text. Anything past ``MAX_TXT_LEN`` characters is dropped.
        voice: ``"Male"`` selects the male model; any other value selects the
            female model.
        use_onnx: When true, run the exported ONNX model through
            ``Vits.inference_onnx``; otherwise use the ``Synthesizer`` with the
            PyTorch checkpoint.

    Returns:
        Path of the generated ``.wav`` file. The file is not deleted
        automatically.
    """
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
    print(text)

    text = text.replace("I", "ӏ")  # Replace capital "I" with "Palochka" symbol
    text = text.lower()

    is_male = voice == "Male"
    model_dir = BASE_DIR.format("male" if is_male else "female")
    config_path = f"{model_dir}/config.json"

    if use_onnx:
        # The ONNX loader needs a file on disk, not a remote URL, so fetch the
        # model next to the other assets of the selected voice first.
        onnx_url = MALE_ONNX_MODEL_URL if is_male else FEMALE_ONNX_MODEL_URL
        onnx_path = _ensure_local_onnx(model_dir, onnx_url)

        # Use the config of the selected voice (same file the Synthesizer
        # branch reads) instead of a hard-coded config name.
        config = VitsConfig()
        config.load_json(config_path)

        # init_from_config builds the model together with its tokenizer, so no
        # separately constructed tokenizer is needed.
        vits = Vits.init_from_config(config)
        vits.load_onnx(onnx_path)

        text_inputs = np.asarray(
            vits.tokenizer.text_to_ids(text),
            dtype=np.int64,
        )[None, :]
        audio = vits.inference_onnx(text_inputs)

        out_path = _new_temp_wav_path()
        # Take the sample rate from the model config so the written file
        # matches the rate the model was configured with.
        save_wav(wav=audio[0], path=out_path, sample_rate=config.audio.sample_rate)
    else:
        synthesizer = Synthesizer(f"{model_dir}/model.pth", config_path)
        wavs = synthesizer.tts(text)

        out_path = _new_temp_wav_path()
        synthesizer.save_wav(wavs, out_path)

    return out_path
|
93 |
|
94 |
iface = gr.Interface(
|
95 |
fn=tts,
|
|
|
100 |
),
|
101 |
gr.Radio(
|
102 |
choices=["Male", "Female"],
|
103 |
+
value="Male",
|
104 |
label="Voice"
|
105 |
+
),
|
106 |
+
gr.Checkbox(
|
107 |
+
label="Use ONNX",
|
108 |
+
value=True,
|
109 |
+
),
|
110 |
],
|
111 |
outputs=gr.Audio(label="Output", type='filepath'),
|
112 |
title="KBD TTS",
|