import os
import re
import tempfile

import gradio as gr
import numpy as np
import torch
from transformers import VitsModel, VitsTokenizer

# One VITS model and tokenizer per supported language.
models = {
    "English": VitsModel.from_pretrained("Matthijs/mms-tts-eng"),
    "German": VitsModel.from_pretrained("Matthijs/mms-tts-deu"),
    "Korean": VitsModel.from_pretrained("Matthijs/mms-tts-kor"),
}

tokenizers = {
    "English": VitsTokenizer.from_pretrained("Matthijs/mms-tts-eng"),
    "German": VitsTokenizer.from_pretrained("Matthijs/mms-tts-deu"),
    "Korean": VitsTokenizer.from_pretrained("Matthijs/mms-tts-kor"),
}


# For certain checkpoints, the input text needs to be romanized first.
# MMS-TTS uses uroman.pl for this, from https://github.com/isi-nlp/uroman,
# which needs to be checked out into the folder "uroman".
def uromanize(text, uroman_pl):
    iso = "xxx"  # placeholder language code; uroman applies its default romanization rules
    with tempfile.NamedTemporaryFile() as tf, tempfile.NamedTemporaryFile() as tf2:
        with open(tf.name, "w") as f:
            f.write(text)

        # Run uroman.pl on the temporary input file and collect its output.
        cmd = f"perl {uroman_pl} -l {iso} < {tf.name} > {tf2.name}"
        os.system(cmd)

        outtexts = []
        with open(tf2.name) as f:
            for line in f:
                # Collapse runs of whitespace introduced by uroman.
                line = re.sub(r"\s+", " ", line).strip()
                outtexts.append(line)
        outtext = outtexts[0]
    return outtext


def predict(text, language=None):
    if len(text.strip()) == 0:
        # Return empty audio and an empty string so both Gradio outputs stay consistent.
        return (16000, np.zeros(0).astype(np.int16)), ""

    if language == "Korean":
        # The Korean checkpoint expects romanized input.
        uroman_pl = os.path.join("uroman", "bin", "uroman.pl")
        text = uromanize(text, uroman_pl)

    tokenizer = tokenizers[language]
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]

    if language != "Korean":
        # Show the user the normalized text that is actually fed to the model.
        text = tokenizer.batch_decode(input_ids)[0]

    model = models[language]
    with torch.no_grad():
        outputs = model(input_ids)

    # Convert the float waveform in [-1, 1] to 16-bit PCM; MMS-TTS generates 16 kHz audio.
    speech = outputs.waveform[0]
    speech = (speech.numpy() * 32767).astype(np.int16)

    return (16000, speech), text


title = "MMS-TTS speech synthesis"

description = """
Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project aims to provide
speech technology across a diverse range of languages. The MMS-TTS part of the project contains a
collection of over 1,000 text-to-speech (TTS) models.

This demo runs MMS-TTS with 🤗 Transformers. Since MMS-TTS is based on the VITS model, the same code
can also be used to run other VITS checkpoints. For a full list of checkpoints,
[click here](https://huggingface.co/models?filter=vits).

Because the model performs random sampling, the generated speech is slightly different each time, and
the voice may vary between runs, or sometimes even within the same sentence. (Note that 🤗 Transformers
also supports multispeaker VITS checkpoints, but the MMS-TTS checkpoints are not conditioned on a
speaker ID.)
"""

article = """
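If you want to run MMS-TTS yourself outside of this demo, a minimal sketch of the same steps with
🤗 Transformers might look like this (the example sentence and output filename are arbitrary; the
checkpoint is the English one used by this demo, which generates audio at 16 kHz):

```python
import torch
import scipy.io.wavfile
from transformers import VitsModel, VitsTokenizer

model = VitsModel.from_pretrained("Matthijs/mms-tts-eng")
tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-eng")

# Tokenize the text and generate the waveform.
inputs = tokenizer("Hello, this is a test.", return_tensors="pt")
with torch.no_grad():
    waveform = model(inputs["input_ids"]).waveform[0]

# Save as a 16 kHz WAV file.
scipy.io.wavfile.write("output.wav", 16000, waveform.numpy())
```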
References: MMS paper | blog post | original weights | original MMS space
```
@article{pratap2023mms,
  title={Scaling Speech Technology to 1,000+ Languages},
  author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
  journal={arXiv},
  year={2023}
}
```
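
For languages whose MMS-TTS checkpoints expect romanized text (Korean in this demo), the input is
first passed through [uroman](https://github.com/isi-nlp/uroman). A rough, hypothetical equivalent of
that preprocessing step, assuming uroman has been checked out into a local `uroman/` folder (the
`romanize` helper below is not part of the demo code):

```python
import subprocess

def romanize(text, uroman_pl="uroman/bin/uroman.pl"):
    # Pipe the text through uroman.pl; "xxx" leaves the language unspecified,
    # matching what this demo does before tokenizing Korean input.
    result = subprocess.run(
        ["perl", uroman_pl, "-l", "xxx"],
        input=text, capture_output=True, text=True, check=True,
    )
    # Collapse whitespace in uroman's output.
    return " ".join(result.stdout.split())
```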