Spaces:

anzorq
/

vits-kbd-male

Running

App Files Files Community

anzorq committed on Jun 9

Commit

4d7078a

•

1 Parent(s): d7b3885

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -12

app.py CHANGED Viewed

@@ -3,6 +3,15 @@ from TTS.utils.download import download_url
 from TTS.utils.synthesizer import Synthesizer
 import gradio as gr
 import tempfile
 MAX_TXT_LEN = 800
 BASE_DIR = "kbd-vits-tts-{}"
@@ -10,6 +19,8 @@ MALE_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main/c
 MALE_CONFIG_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main/config_35000.json"
 FEMALE_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main/best_model_56351.pth"
 FEMALE_CONFIG_URL = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main/config.json"
 def download_model_and_config(gender):
     dir_path = BASE_DIR.format(gender)
@@ -24,24 +35,61 @@ def download_model_and_config(gender):
 download_model_and_config("male")
 download_model_and_config("female")
-def tts(text: str, voice: str="Male"):
     if len(text) > MAX_TXT_LEN:
         text = text[:MAX_TXT_LEN]
         print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
     print(text)
-    text = text.replace("I", "ӏ") #replace capital is with "Palochka" symbol
-    model_dir = BASE_DIR.format("male" if voice == "Male" else "female")
-    # synthesize
-    synthesizer = Synthesizer(f"{model_dir}/model.pth", f"{model_dir}/config.json")
-    wavs = synthesizer.tts(text)
-    # return output
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
-        synthesizer.save_wav(wavs, fp)
-        return fp.name
 iface = gr.Interface(
     fn=tts,
@@ -52,9 +100,13 @@ iface = gr.Interface(
         ),
         gr.Radio(
             choices=["Male", "Female"],
-            value="Male",  # Set Male as the default choice
             label="Voice"
-        )
     ],
     outputs=gr.Audio(label="Output", type='filepath'),
     title="KBD TTS",

 from TTS.utils.synthesizer import Synthesizer
 import gradio as gr
 import tempfile
+import torch
+import onnxruntime as ort
+import json
+from TTS.tts.utils.synthesis import synthesis
+from TTS.tts.configs.vits_config import VitsConfig
+from TTS.tts.models.vits import Vits, VitsCharacters
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
+import numpy as np
+from TTS.utils.audio.numpy_transforms import save_wav
 MAX_TXT_LEN = 800
 BASE_DIR = "kbd-vits-tts-{}"
 MALE_CONFIG_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main/config_35000.json"
 FEMALE_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main/best_model_56351.pth"
 FEMALE_CONFIG_URL = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main/config.json"
+MALE_ONNX_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main/onnx/kbd_vits_male.onnx"
+FEMALE_ONNX_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main/onnx/kbd_vits_female.onnx"
 def download_model_and_config(gender):
     dir_path = BASE_DIR.format(gender)
 download_model_and_config("male")
 download_model_and_config("female")
+def tts(text: str, voice: str = "Male", use_onnx: bool = True):
+    """Synthesize speech for ``text`` and return the path of the resulting WAV file.
+
+    Args:
+        text: Input text. Anything beyond ``MAX_TXT_LEN`` characters is dropped.
+        voice: ``"Male"`` selects the male model; any other value selects the
+            female model.
+        use_onnx: When true, run the exported ONNX model; otherwise run the
+            PyTorch checkpoint through ``Synthesizer``.
+
+    Returns:
+        Path of a temporary ``.wav`` file. The file is created with
+        ``delete=False``, so it is not removed automatically.
+    """
     if len(text) > MAX_TXT_LEN:
         text = text[:MAX_TXT_LEN]
         print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")
     print(text)
+    text = text.replace("I", "ӏ")  # Replace capital "I" with "Palochka" symbol
+    text = text.lower()
+    model_dir = BASE_DIR.format("male" if voice == "Male" else "female")
+    if use_onnx:
+        import os  # local import: the file's top-level imports are only partly visible here
+        # onnxruntime opens a local file (or serialized bytes), not an HTTP URL,
+        # so fetch the ONNX export into the voice's model directory once.
+        onnx_path = os.path.join(model_dir, "model.onnx")
+        if not os.path.exists(onnx_path):
+            onnx_model_url = MALE_ONNX_MODEL_URL if voice == "Male" else FEMALE_ONNX_MODEL_URL
+            download_url(onnx_model_url, model_dir, "model.onnx")
+        # Load the config of the selected voice -- the same file the PyTorch
+        # branch below reads -- instead of a fixed "config_35000.json".
+        config = VitsConfig()
+        config.load_json(f"{model_dir}/config.json")
+        # Create the Vits model instance and attach the ONNX session to it
+        vits = Vits.init_from_config(config)
+        vits.load_onnx(onnx_path)
+        # The model's own tokenizer encodes the text; add a batch dimension
+        text_inputs = np.asarray(
+            vits.tokenizer.text_to_ids(text),
+            dtype=np.int64,
+        )[None, :]
+        audio = vits.inference_onnx(text_inputs)
+        # Reserve a temporary WAV path and close the handle before writing to it
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+            out_path = temp_file.name
+        # Write with the sample rate declared in the model config rather than
+        # a hard-coded value.
+        save_wav(wav=audio[0], path=out_path, sample_rate=config.audio.sample_rate)
+    else:
+        # Synthesize with the PyTorch checkpoint
+        synthesizer = Synthesizer(f"{model_dir}/model.pth", f"{model_dir}/config.json")
+        wavs = synthesizer.tts(text)
+        # Reserve a temporary WAV path and close the handle before writing to it
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+            out_path = temp_file.name
+        synthesizer.save_wav(wavs, out_path)
+    return out_path
 iface = gr.Interface(
     fn=tts,
         ),
         gr.Radio(
             choices=["Male", "Female"],
+            value="Male",
             label="Voice"
+        ),
+        gr.Checkbox(
+            label="Use ONNX",
+            value=True,
+        ),
     ],
     outputs=gr.Audio(label="Output", type='filepath'),
     title="KBD TTS",