tts-silero / app.py
NeuroSenko's picture
formatting fix
5d68da9
raw
history blame contribute delete
No virus
4.22 kB
import os
from datetime import datetime
from inspect import signature
import gradio as gr
import torch
from omegaconf import OmegaConf
# Fetch the current Silero model catalogue. The language and model dropdowns
# further down take their choices from this file.
torch.hub.download_url_to_file(
    "https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml",
    "latest_silero_models.yml",
    progress=False,
)
all_models = OmegaConf.load("latest_silero_models.yml")

# Start-up defaults.
language = "ru"
model_id = "v3_1_ru"
device = torch.device("cpu")

# Load the default model exactly once. An identical second torch.hub.load
# call used to follow the settings below; it only repeated this work and
# overwrote the hand-written example text with the one returned by the hub.
model, example_text = torch.hub.load(
    repo_or_dir="snakers4/silero-models",
    model="silero_tts",
    language=language,
    speaker=model_id,
)
model.to(device)  # gpu or cpu

# Synthesis settings read by generate_audio_by_text on every request.
sample_rate = 48000
speaker = "aidar"
put_accent = True
put_yo = True

# Hand-written example that replaces the text returned by the hub.
example_text = "В недрах тундры выдры в г+етрах т+ырят в вёдра ядра к+едров."

# Model ids the catalogue lists for the default language.
models = list(all_models.tts_models.get(language).keys())
def change_language(language):
    """Refresh the model dropdown after a language has been picked.

    Args:
        language: Language key looked up in ``all_models.tts_models``.

    Returns:
        The result of ``model_input.update`` with ``choices`` set to the
        model ids the catalogue lists for ``language``.
    """
    catalogue_entry = all_models.tts_models.get(language)
    return model_input.update(choices=[*catalogue_entry.keys()])
def change_model(language, model_name):
    """Load the selected model and refresh the speaker dropdown.

    Args:
        language: Language key passed to ``torch.hub.load``.
        model_name: Model id passed to ``torch.hub.load`` as ``speaker``.

    Returns:
        The result of ``speaker_input.update`` with ``choices`` set to the
        speakers of the newly loaded model.
    """
    # generate_audio_by_text reads the module-level ``model``. Without this
    # declaration the assignment below would only create a local, and
    # synthesis would keep using the model loaded at start-up.
    global model

    # The example text returned alongside the model is not used here.
    model, _ = torch.hub.load(
        repo_or_dir="snakers4/silero-models",
        model="silero_tts",
        language=language,
        speaker=model_name,
    )
    model.to(device)  # same placement as the model loaded at start-up
    return speaker_input.update(choices=model.speakers)
def generate_audio_by_text(text, text_type, speaker):
    """Synthesize ``text`` with the currently loaded model and save it as WAV.

    Args:
        text: Plain text or SSML markup to synthesize.
        text_type: ``"SSML"`` passes ``text`` to the model as ``ssml_text``;
            any other value passes it as plain ``text``.
        speaker: Speaker name forwarded to ``model.save_wav``.

    Returns:
        Whatever ``model.save_wav`` returns; the UI feeds it to the audio
        output component.
    """
    output_dir = "out_audio"
    # Nothing else in this file creates the directory, so make sure it
    # exists before the model tries to write into it.
    os.makedirs(output_dir, exist_ok=True)

    # Colons in the ISO timestamp are swapped for dashes, presumably to keep
    # the file name valid on file systems that reject ':'.
    timestamp = datetime.now().isoformat().replace(":", "-")
    output = os.path.join(output_dir, f"{timestamp}.wav")

    # Only the keyword carrying the input differs between the two modes.
    text_kwarg = {"ssml_text": text} if text_type == "SSML" else {"text": text}
    return model.save_wav(
        audio_path=output,
        speaker=speaker,
        sample_rate=sample_rate,
        put_accent=put_accent,
        put_yo=put_yo,
        **text_kwarg,
    )
# UI layout: a row with the input controls in the left column and the
# generated audio plus the trigger button in the right column.
# NOTE(review): the nesting below was reconstructed from the context-manager
# structure; confirm where the two Markdown notes are meant to sit.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            language_input = gr.Dropdown(
                choices=list(all_models.tts_models.keys()),
                value="ru",
                label="Language",
                interactive=True,
            )
            model_input = gr.Dropdown(
                choices=models,
                value="v3_1_ru",
                label="Model (based on selected language)",
                interactive=True,
            )
            speaker_input = gr.Dropdown(
                choices=model.speakers,
                value="kseniya",
                label="Speaker (based on selected model)",
                interactive=True,
            )
            text_input = gr.Textbox(
                value="В недрах тундры выдры в г+етрах т+ырят в вёдра +ядра к+едров.",
                label="Text for generating",
                lines=5,
                interactive=True,
            )
            text_type_input = gr.Radio(
                choices=["Common", "SSML"],
                value="Common",
                label="Text type",
                interactive=True,
            )

            # Cascade: a new language refreshes the model list, and a new
            # model refreshes the speaker list.
            language_input.change(
                change_language,
                inputs=language_input,
                outputs=model_input,
            )
            model_input.change(
                change_model,
                inputs=[language_input, model_input],
                outputs=speaker_input,
            )

        with gr.Column():
            audio_output = gr.Audio(label="Output audio")
            generate_btn = gr.Button(value="Generate", variant="primary")
            generate_btn.click(
                generate_audio_by_text,
                inputs=[text_input, text_type_input, speaker_input],
                outputs=audio_output,
            )

    # Short notes pointing at the upstream project and its SSML reference.
    gr.Markdown(
        "This is a simple frontend for [silero](https://github.com/snakers4/silero-models) project (Text-To-Speech part only)."
    )
    gr.Markdown(
        "You can check [official docs](https://github.com/snakers4/silero-models/wiki/SSML) to find information about SSML syntax."
    )

demo.launch()