from turtle import title import gradio as gr import git import os os.system('git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS') os.system('pip install -q -e TTS/') os.system('pip install -q torchaudio==0.9.0') import sys TTS_PATH = "TTS/" sys.path.append(TTS_PATH) # set this if TTS is not installed globally import os import string import time import argparse import json import numpy as np import IPython from IPython.display import Audio import torch from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols try: from TTS.utils.audio import AudioProcessor except: from TTS.utils.audio import AudioProcessor from TTS.tts.models import setup_model from TTS.config import load_config from TTS.tts.models.vits import * OUT_PATH = 'out/' os.makedirs(OUT_PATH, exist_ok=True) MODEL_PATH = '/home/user/app/best_model_latest.pth.tar' CONFIG_PATH = '/home/user/app/config.json' TTS_LANGUAGES = "/home/user/app/language_ids.json" TTS_SPEAKERS = "/home/user/app/speakers.json" USE_CUDA = torch.cuda.is_available() C = load_config(CONFIG_PATH) ap = AudioProcessor(**C.audio) speaker_embedding = None C.model_args['d_vector_file'] = TTS_SPEAKERS C.model_args['use_speaker_encoder_as_loss'] = False model = setup_model(C) model.language_manager.set_language_ids_from_file(TTS_LANGUAGES) cp = torch.load(MODEL_PATH, map_location=torch.device('cpu')) model_weights = cp['model'].copy() for key in list(model_weights.keys()): if "speaker_encoder" in key: del model_weights[key] model.load_state_dict(model_weights) model.eval() if USE_CUDA: model = model.cuda() use_griffin_lim = False os.system('pip install -q pydub ffmpeg-normalize') CONFIG_SE_PATH = "config_se.json" CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar" from TTS.tts.utils.speakers import SpeakerManager from pydub import AudioSegment import librosa SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH, encoder_config_path=CONFIG_SE_PATH, use_cuda=USE_CUDA) def compute_spec(ref_file): y, sr = librosa.load(ref_file, sr=ap.sample_rate) spec = ap.spectrogram(y) spec = torch.FloatTensor(spec).unsqueeze(0) return spec def greet(Text,Voicetoclone,VoiceMicrophone): text= "%s" % (Text) if Voicetoclone is not None: reference_files= "%s" % (Voicetoclone) print("path url") print(Voicetoclone) sample= str(Voicetoclone) else: reference_files= "%s" % (VoiceMicrophone) print("path url") print(VoiceMicrophone) sample= str(VoiceMicrophone) size= len(reference_files)*sys.getsizeof(reference_files) size2= size / 1000000 if (size2 > 0.012) or len(text)>2000: message="File is greater than 30mb or Text inserted is longer than 2000 characters. Please re-try with smaller sizes." print(message) raise SystemExit("File is greater than 30mb. Please re-try or Text inserted is longer than 2000 characters. Please re-try with smaller sizes.") else: os.system('ffmpeg-normalize $sample -nt rms -t=-27 -o $sample -ar 16000 -f') reference_emb = SE_speaker_manager.compute_d_vector_from_clip(reference_files) model.length_scale = 1 # scaler for the duration predictor. The larger it is, the slower the speech. model.inference_noise_scale = 0.3 # defines the noise variance applied to the random z vector at inference. model.inference_noise_scale_dp = 0.3 # defines the noise variance applied to the duration predictor z vector at inference. text = text model.language_manager.language_id_mapping language_id = 0 print(" > text: {}".format(text)) wav, alignment, _, _ = synthesis( model, text, C, "cuda" in str(next(model.parameters()).device), ap, speaker_id=None, d_vector=reference_emb, style_wav=None, language_id=language_id, enable_eos_bos_chars=C.enable_eos_bos_chars, use_griffin_lim=True, do_trim_silence=False, ).values() print("Generated Audio") IPython.display.display(Audio(wav, rate=ap.sample_rate)) #file_name = text.replace(" ", "_") #file_name = file_name.translate(str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav' file_name="Audio.wav" out_path = os.path.join(OUT_PATH, file_name) print(" > Saving output to {}".format(out_path)) ap.save_wav(wav, out_path) return out_path demo = gr.Interface( fn=greet, inputs=[ gr.Audio(source="microphone", type="filepath", streaming=True, label='Record voice to clone with microphone'), gr.inputs.Textbox(label='Enter text for cloned voice to speak aloud.'), gr.Audio(type="filepath",source="upload", label='Upload a voice to clone from audio file'), ], outputs="audio", title="🗣️ Clone Voice and Speak Aloud 🔊" ) demo.launch()