import torch
import scipy
import os
import streamlit as st
import pandas as pd
from transformers import pipeline  # set_seed,
from transformers import VitsTokenizer, VitsModel
from datasets import load_dataset, Audio
from huggingface_hub.inference_api import InferenceApi
from src import *
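# NOTE: the wildcard import above is assumed to supply the helpers used below:
# language_list, decode_iso, placeholders, models, synth_mms, synth_coqui,
# synth_espeakng, and convert_coqui.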
########################
st.title("Mockingbird")
st.header("A demo of open Text to Speech tools")

tts, about = st.tabs(["Text to speech", "**About**"])

########################
with tts:
    # Configuration -- language choice and text
    tts_lang = st.selectbox('Language of text', language_list, format_func=decode_iso)
    tts_text = st.text_area(label="Please enter your sentence here:",
                            value="", placeholder=placeholders[tts_lang])
    target_speaker_file = st.file_uploader(
        "If you would like to test voice conversion, you may upload your own audio below. "
        "Please upload a single .wav file; if you don't, a default file will be used.",
        type=['wav'])
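    # tts_lang is an ISO 639-3 code (e.g. 'swh', 'rus', 'fas'); placeholders and
    # models are assumed to be dicts from src keyed by those codes.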
    # Inference
    if st.button("Generate"):

        # Warning about alphabet support
        if tts_lang in ['rus', 'fas']:
            st.warning("WARNING! On Windows, ESpeak-NG has trouble synthesizing output when input is provided in non-Latin alphabets.")
        st.divider()

        # Synthesis
        with st.spinner(":rainbow[Synthesizing, please wait... (this will be slowest the first time you generate audio in a new language)]"):
            if tts_text == "":
                tts_text = placeholders[tts_lang]

            # First, make the audio
            base_mms = synth_mms(tts_text, models[tts_lang]['mms'])
            base_coqui = synth_coqui(tts_text, models[tts_lang]['coqui'])
            base_espeakng = synth_espeakng(tts_text, models[tts_lang]['espeakng'])
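            # Each synth_* helper is assumed to return a (waveform, sample_rate)
            # tuple, or None when the selected language has no model for that
            # synthesizer (hence the None checks before display below).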
if tts_lang=="swh": | |
finetuned_mms1 = synth_mms(tts_text, "khof312/mms-tts-swh-female-1") | |
finetuned_mms2 = synth_mms(tts_text, "khof312/mms-tts-swh-female-2") | |
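                # (Swahili only) These two fine-tuned female voices are MMS
                # checkpoints hosted on the Hugging Face Hub.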
            # vc_mms
            # vc_coqui
            # vc_espeakng
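            # Bare string literals below (e.g. "## Synthesis") are rendered as
            # Markdown by Streamlit's "magic" feature.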
"## Synthesis" | |
"### Default models" | |
row1 = st.columns([1,1,2]) | |
row2 = st.columns([1,1,2]) | |
row3 = st.columns([1,1,2]) | |
row4 = st.columns([1,1,2]) | |
row1[0].write("**Model**") | |
row1[1].write("**Configuration**") | |
row1[2].write("**Audio**") | |
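            # One row per synthesizer; a row is left empty when that synthesizer
            # does not support the selected language.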
            if base_mms is not None:
                row2[0].write("Meta MMS")
                row2[1].write("default")
                row2[2].audio(base_mms[0], sample_rate=base_mms[1])

            if base_coqui is not None:
                row3[0].write("Coqui")
                row3[1].write("default")
                row3[2].audio(base_coqui[0], sample_rate=base_coqui[1])

            if base_espeakng is not None:
                row4[0].write("Espeak-ng")
                row4[1].write("default")
                row4[2].audio(base_espeakng[0], sample_rate=base_espeakng[1])
            #################################################################
            if tts_lang == "swh":
                "### Fine-tuned"
                row1 = st.columns([1, 1, 2])
                row2 = st.columns([1, 1, 2])
                row3 = st.columns([1, 1, 2])

                row1[0].write("**Model**")
                row1[1].write("**Configuration**")
                row1[2].write("**Audio**")

                row2[0].write("Meta MMS")
                row2[1].write("female 1")
                row2[2].audio(finetuned_mms1[0], sample_rate=finetuned_mms1[1])

                row3[0].write("Meta MMS")
                row3[1].write("female 2")
                row3[2].audio(finetuned_mms2[0], sample_rate=finetuned_mms2[1])

            st.divider()
"## Voice conversion" ################################################################# | |
st.warning('''Note: The naturalness of the audio will only be as good as that of the audio in "default models" above.''') | |
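            # Voice conversion flow: each synthesized waveform is written to a
            # temporary .wav file, then converted toward the target speaker's
            # voice with Coqui (convert_coqui).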
            if target_speaker_file is not None:
                rate, wav = scipy.io.wavfile.read(target_speaker_file)
                scipy.io.wavfile.write("target_speaker_custom.wav", rate=rate, data=wav)
                target_speaker = "target_speaker_custom.wav"
            else:
                target_speaker = "target_speaker.wav"
            if base_mms is not None:
                scipy.io.wavfile.write("source_speaker_mms.wav", rate=base_mms[1], data=base_mms[0].T)
                converted_mms = convert_coqui('source_speaker_mms.wav', target_speaker)

            if base_coqui is not None:
                scipy.io.wavfile.write("source_speaker_coqui.wav", rate=base_coqui[1], data=base_coqui[0].T)
                converted_coqui = convert_coqui('source_speaker_coqui.wav', target_speaker)

            if base_espeakng is not None:
                scipy.io.wavfile.write("source_speaker_espeakng.wav", rate=base_espeakng[1], data=base_espeakng[0].T)
                converted_espeakng = convert_coqui('source_speaker_espeakng.wav', target_speaker)
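            # convert_coqui is assumed to return a (waveform, sample_rate) tuple,
            # mirroring the synth_* helpers above.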
            row1 = st.columns([1, 1, 2])
            row2 = st.columns([1, 1, 2])
            row3 = st.columns([1, 1, 2])
            row4 = st.columns([1, 1, 2])

            row1[0].write("**Model**")
            row1[1].write("**Configuration**")
            row1[2].write("**Audio**")

            if base_mms is not None:
                row2[0].write("Meta MMS")
                row2[1].write("converted")
                row2[2].audio(converted_mms[0], sample_rate=converted_mms[1])

            if base_coqui is not None:
                row3[0].write("Coqui")
                row3[1].write("converted")
                row3[2].audio(converted_coqui[0], sample_rate=converted_coqui[1])

            if base_espeakng is not None:
                row4[0].write("Espeak-ng")
                row4[1].write("converted")
                row4[2].audio(converted_espeakng[0], sample_rate=converted_espeakng[1])
with about:
    # st.header("How it works")
    st.markdown('''# Mockingbird TTS Demo

This page is a demo of openly available Text to Speech models for various languages of interest. Currently, three synthesizers are supported:

- [**Meta's Massively Multilingual Speech (MMS)**](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model, which supports over 1,000 languages.[^1]
- [**Coqui's TTS**](https://docs.coqui.ai/en/latest/#) package;[^2] although no longer maintained, Coqui acted as a hub for TTS model hosting, and these models are still available.
- [**ESpeak-NG**](https://github.com/espeak-ng/espeak-ng/tree/master)'s synthetic voices.[^3]

Voice conversion is achieved through Coqui.

Notes:
1. ESpeak-NG seems to have the worst performance out of the box, but it offers many options for controlling voice output.
2. Where a synthesizer supports multiple models/voices, I manually pick the appropriate model.
3. Not all synthesizers support every language.

[^1]: Endpoints used are of the form https://huggingface.co/facebook/mms-tts-[LANG].
      Learn more: [Docs](https://huggingface.co/docs/transformers/model_doc/mms) | [Paper](https://arxiv.org/abs/2305.13516) | [Supported languages](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html)
[^2]: [Available models](https://github.com/coqui-ai/TTS/blob/dev/TTS/.models.json)
[^3]: [Language list](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)
''')
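
# ---------------------------------------------------------------------------
# For reference, a minimal sketch (not part of the app's execution path) of how
# an MMS endpoint of the form facebook/mms-tts-[LANG] can be used directly with
# the VitsTokenizer/VitsModel classes imported above. The src helper synth_mms
# presumably wraps something similar; the checkpoint ("facebook/mms-tts-swh")
# and sample text here are illustrative assumptions.
#
#   tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-swh")
#   model = VitsModel.from_pretrained("facebook/mms-tts-swh")
#   inputs = tokenizer("Habari ya asubuhi", return_tensors="pt")
#   with torch.no_grad():
#       waveform = model(**inputs).waveform[0].numpy()
#   # The waveform can then be played back at model.config.sampling_rate.
# ---------------------------------------------------------------------------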