Spaces:
Running
Running
import os | |
import re | |
import torch | |
import torchaudio | |
import gradio as gr | |
import numpy as np | |
import tempfile | |
from einops import rearrange | |
from ema_pytorch import EMA | |
from vocos import Vocos | |
from pydub import AudioSegment | |
from model import CFM, UNetT, DiT, MMDiT | |
from cached_path import cached_path | |
from model.utils import ( | |
get_tokenizer, | |
convert_char_to_pinyin, | |
save_spectrogram, | |
) | |
from transformers import pipeline | |
import spaces | |
import librosa | |
device = "cuda" if torch.cuda.is_available() else "cpu" | |
pipe = pipeline( | |
"automatic-speech-recognition", | |
model="openai/whisper-large-v3-turbo", | |
torch_dtype=torch.float16, | |
device=device, | |
) | |
# --------------------- Settings -------------------- # | |
target_sample_rate = 24000 | |
n_mel_channels = 100 | |
hop_length = 256 | |
target_rms = 0.1 | |
nfe_step = 32 # 16, 32 | |
cfg_strength = 2.0 | |
ode_method = 'euler' | |
sway_sampling_coef = -1.0 | |
speed = 1.0 | |
# fix_duration = 27 # None or float (duration in seconds) | |
fix_duration = None | |
def load_model(exp_name, model_cls, model_cfg, ckpt_step): | |
checkpoint = torch.load(str(cached_path(f"hf://SWivid/F5-TTS/{exp_name}/model_{ckpt_step}.pt")), map_location=device) | |
vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin") | |
model = CFM( | |
transformer=model_cls( | |
**model_cfg, | |
text_num_embeds=vocab_size, | |
mel_dim=n_mel_channels | |
), | |
mel_spec_kwargs=dict( | |
target_sample_rate=target_sample_rate, | |
n_mel_channels=n_mel_channels, | |
hop_length=hop_length, | |
), | |
odeint_kwargs=dict( | |
method=ode_method, | |
), | |
vocab_char_map=vocab_char_map, | |
).to(device) | |
ema_model = EMA(model, include_online_model=False).to(device) | |
ema_model.load_state_dict(checkpoint['ema_model_state_dict']) | |
ema_model.copy_params_from_ema_to_model() | |
return ema_model, model | |
# load models | |
F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4) | |
E2TTS_model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4) | |
F5TTS_ema_model, F5TTS_base_model = load_model("F5TTS_Base", DiT, F5TTS_model_cfg, 1200000) | |
E2TTS_ema_model, E2TTS_base_model = load_model("E2TTS_Base", UNetT, E2TTS_model_cfg, 1200000) | |
def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence): | |
print(gen_text) | |
if len(gen_text) > 200: | |
raise gr.Error("Please keep your text under 200 chars.") | |
gr.Info("Converting audio...") | |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f: | |
aseg = AudioSegment.from_file(ref_audio_orig) | |
audio_duration = len(aseg) | |
if audio_duration > 15000: | |
gr.Warning("Audio is over 15s, clipping to only first 15s.") | |
aseg = aseg[:15000] | |
aseg.export(f.name, format="wav") | |
ref_audio = f.name | |
if exp_name == "F5-TTS": | |
ema_model = F5TTS_ema_model | |
base_model = F5TTS_base_model | |
elif exp_name == "E2-TTS": | |
ema_model = E2TTS_ema_model | |
base_model = E2TTS_base_model | |
if not ref_text.strip(): | |
gr.Info("No reference text provided, transcribing reference audio...") | |
ref_text = outputs = pipe( | |
ref_audio, | |
chunk_length_s=30, | |
batch_size=128, | |
generate_kwargs={"task": "transcribe"}, | |
return_timestamps=False, | |
)['text'].strip() | |
gr.Info("Finished transcription") | |
else: | |
gr.Info("Using custom reference text...") | |
audio, sr = torchaudio.load(ref_audio) | |
rms = torch.sqrt(torch.mean(torch.square(audio))) | |
if rms < target_rms: | |
audio = audio * target_rms / rms | |
if sr != target_sample_rate: | |
resampler = torchaudio.transforms.Resample(sr, target_sample_rate) | |
audio = resampler(audio) | |
audio = audio.to(device) | |
# Prepare the text | |
text_list = [ref_text + gen_text] | |
final_text_list = convert_char_to_pinyin(text_list) | |
# Calculate duration | |
ref_audio_len = audio.shape[-1] // hop_length | |
# if fix_duration is not None: | |
# duration = int(fix_duration * target_sample_rate / hop_length) | |
# else: | |
zh_pause_punc = r"。,、;:?!" | |
ref_text_len = len(ref_text) + len(re.findall(zh_pause_punc, ref_text)) | |
gen_text_len = len(gen_text) + len(re.findall(zh_pause_punc, gen_text)) | |
duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed) | |
# inference | |
gr.Info(f"Generating audio using {exp_name}") | |
with torch.inference_mode(): | |
generated, _ = base_model.sample( | |
cond=audio, | |
text=final_text_list, | |
duration=duration, | |
steps=nfe_step, | |
cfg_strength=cfg_strength, | |
sway_sampling_coef=sway_sampling_coef, | |
) | |
generated = generated[:, ref_audio_len:, :] | |
generated_mel_spec = rearrange(generated, '1 n d -> 1 d n') | |
gr.Info("Running vocoder") | |
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz") | |
generated_wave = vocos.decode(generated_mel_spec.cpu()) | |
if rms < target_rms: | |
generated_wave = generated_wave * rms / target_rms | |
# wav -> numpy | |
generated_wave = generated_wave.squeeze().cpu().numpy() | |
if remove_silence: | |
gr.Info("Removing audio silences") | |
non_silent_intervals = librosa.effects.split(generated_wave, top_db=30) | |
non_silent_wave = np.array([]) | |
for interval in non_silent_intervals: | |
start, end = interval | |
non_silent_wave = np.concatenate([non_silent_wave, generated_wave[start:end]]) | |
generated_wave = non_silent_wave | |
# spectogram | |
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram: | |
spectrogram_path = tmp_spectrogram.name | |
save_spectrogram(generated_mel_spec[0].cpu().numpy(), spectrogram_path) | |
return (target_sample_rate, generated_wave), spectrogram_path | |
with gr.Blocks() as app: | |
gr.Markdown(""" | |
# E2/F5 TTS | |
This is an unofficial E2/F5 TTS demo. This demo supports the following TTS models: | |
* [E2-TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS) | |
* [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching) | |
This demo is based on the [F5-TTS](https://github.com/SWivid/F5-TTS) codebase, which is based on an [unofficial E2-TTS implementation](https://github.com/lucidrains/e2-tts-pytorch). | |
The checkpoints support English and Chinese. | |
**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.** | |
""") | |
ref_audio_input = gr.Audio(label="Reference Audio", type="filepath") | |
gen_text_input = gr.Textbox(label="Text to Generate (max 200 chars.)", lines=4) | |
model_choice = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS") | |
generate_btn = gr.Button("Synthesize", variant="primary") | |
with gr.Accordion("Advanced Settings", open=False): | |
ref_text_input = gr.Textbox(label="Reference Text", info="Leave blank to automatically transcribe the reference audio. If you enter text it will override automatic transcription.", lines=2) | |
remove_silence = gr.Checkbox(label="[EXPERIMENTAL] Remove Silences", info="The model tends to leave silences, we can manually remove silences if needed. This may produce strange results and is not guarenteed to work.") | |
audio_output = gr.Audio(label="Synthesized Audio") | |
spectrogram_output = gr.Image(label="Spectrogram") | |
generate_btn.click(infer, inputs=[ref_audio_input, ref_text_input, gen_text_input, model_choice, remove_silence], outputs=[audio_output, spectrogram_output]) | |
gr.Markdown("Unofficial demo by [mrfakename](https://x.com/realmrfakename)") | |
app.queue().launch() |