Spaces: Running on Zero

mrfakename committed
Commit • 45012e5
1 Parent(s): ca722aa

add chunking
app.py
CHANGED
@@ -19,8 +19,9 @@ from model.utils import (
 from transformers import pipeline
 import spaces
 import librosa
+from txtsplit import txtsplit
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
+device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
 
 pipe = pipeline(
     "automatic-speech-recognition",
@@ -77,7 +78,7 @@ F5TTS_ema_model, F5TTS_base_model = load_model("F5TTS_Base", DiT, F5TTS_model_cf
 E2TTS_ema_model, E2TTS_base_model = load_model("E2TTS_Base", UNetT, E2TTS_model_cfg, 1200000)
 
 @spaces.GPU
-def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence):
+def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, progress=gr.Progress()):
     print(gen_text)
     if len(gen_text) > 200:
         raise gr.Error("Please keep your text under 200 chars.")
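The new device line chains two conditional expressions, which can be hard to scan. An equivalent long-hand form is sketched below; pick_device is a hypothetical helper name, not part of app.py:

import torch

def pick_device() -> str:
    # Hypothetical helper mirroring the one-line conditional in the diff:
    # prefer CUDA, then Apple-silicon MPS, then fall back to CPU.
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

device = pick_device()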
@@ -122,44 +123,49 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence):
     resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
     audio = resampler(audio)
     audio = audio.to(device)
-
-    # prepare the text
-    text_list = [ref_text + gen_text]
-    final_text_list = convert_char_to_pinyin(text_list)
-
-    # Calculate duration
-    ref_audio_len = audio.shape[-1] // hop_length
-    # if fix_duration is not None:
-    #     duration = int(fix_duration * target_sample_rate / hop_length)
-    # else:
-    zh_pause_punc = r"。，、；：？！"
-    ref_text_len = len(ref_text) + len(re.findall(zh_pause_punc, ref_text))
-    gen_text_len = len(gen_text) + len(re.findall(zh_pause_punc, gen_text))
-    duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
-
-    # inference
-    gr.Info(f"Generating audio using {exp_name}")
-    with torch.inference_mode():
-        generated, _ = base_model.sample(
-            cond=audio,
-            text=final_text_list,
-            duration=duration,
-            steps=nfe_step,
-            cfg_strength=cfg_strength,
-            sway_sampling_coef=sway_sampling_coef,
-        )
-
-    generated = generated[:, ref_audio_len:, :]
-    generated_mel_spec = rearrange(generated, '1 n d -> 1 d n')
-    gr.Info("Running vocoder")
-    vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
-    generated_wave = vocos.decode(generated_mel_spec.cpu())
-    if rms < target_rms:
-        generated_wave = generated_wave * rms / target_rms
-
-    # wav -> numpy
-    generated_wave = generated_wave.squeeze().cpu().numpy()
-
+    # Chunk the text so long inputs are synthesized piece by piece
+    chunks = txtsplit(gen_text, 100, 150)  # 100 chars preferred, 150 max
+    results = []
+    generated_mel_specs = []
+    for chunk in progress.tqdm(chunks):
+        # Prepare the text
+        text_list = [ref_text + chunk]
+        final_text_list = convert_char_to_pinyin(text_list)
+
+        # Calculate duration
+        ref_audio_len = audio.shape[-1] // hop_length
+        # if fix_duration is not None:
+        #     duration = int(fix_duration * target_sample_rate / hop_length)
+        # else:
+        zh_pause_punc = r"。，、；：？！"
+        ref_text_len = len(ref_text) + len(re.findall(zh_pause_punc, ref_text))
+        gen_text_len = len(chunk) + len(re.findall(zh_pause_punc, chunk))  # measure this chunk, not the full text
+        duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
+
+        # inference
+        gr.Info(f"Generating audio using {exp_name}")
+        with torch.inference_mode():
+            generated, _ = base_model.sample(
+                cond=audio,
+                text=final_text_list,
+                duration=duration,
+                steps=nfe_step,
+                cfg_strength=cfg_strength,
+                sway_sampling_coef=sway_sampling_coef,
+            )
+
+        generated = generated[:, ref_audio_len:, :]
+        generated_mel_specs.append(rearrange(generated, '1 n d -> 1 d n'))
+        gr.Info("Running vocoder")
+        vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
+        generated_wave = vocos.decode(generated_mel_specs[-1].cpu())  # decode this chunk's mel
+        if rms < target_rms:
+            generated_wave = generated_wave * rms / target_rms
+
+        # wav -> numpy
+        generated_wave = generated_wave.squeeze().cpu().numpy()
+        results.append(generated_wave)
+    generated_wave = np.concatenate(results)
     if remove_silence:
         gr.Info("Removing audio silences... This may take a moment")
         non_silent_intervals = librosa.effects.split(generated_wave, top_db=30)
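The duration heuristic in this hunk scales the reference audio's frame count by the ratio of chunk text length to reference text length. A worked sketch with assumed constants (24 kHz sample rate, hop_length of 256, speed of 1.0; app.py defines the real values elsewhere):

# Assumed constants for illustration only.
target_sample_rate, hop_length, speed = 24000, 256, 1.0

ref_audio_len = 4 * target_sample_rate // hop_length  # 4 s of reference audio -> 375 frames
ref_text_len, chunk_len = 50, 100                     # character counts (CJK pause marks add extra)

duration = ref_audio_len + int(ref_audio_len / ref_text_len * chunk_len / speed)
print(duration)  # 375 reference frames + 750 generated frames = 1125

One design note: Vocos.from_pretrained runs inside the chunk loop, so the vocoder is re-fetched for every chunk; loading it once before the loop would do the same work with less overhead.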
@@ -171,11 +177,11 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence):
 
 
     # spectrogram
-    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
-        spectrogram_path = tmp_spectrogram.name
-        save_spectrogram(generated_mel_spec[0].cpu().numpy(), spectrogram_path)
+    # with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
+    #     spectrogram_path = tmp_spectrogram.name
+    #     save_spectrogram(generated_mel_spec[0].cpu().numpy(), spectrogram_path)
 
     return (target_sample_rate, generated_wave)
 
 with gr.Blocks() as app:
     gr.Markdown("""
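The mels collected in generated_mel_specs go unused now that the spectrogram rendering is commented out. If that output were ever revived, one possible sketch, written as a hypothetical helper that takes the diff's names as parameters and assumes each mel is shaped (1, n_mels, n_frames):

import tempfile
import torch

def render_spectrogram(generated_mel_specs, save_spectrogram):
    # Sketch only: join the per-chunk mels along the time axis, then
    # render them with the app's existing save_spectrogram helper.
    full_mel = torch.cat(generated_mel_specs, dim=-1)
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
        spectrogram_path = tmp_spectrogram.name
    save_spectrogram(full_mel[0].cpu().numpy(), spectrogram_path)
    return spectrogram_path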
@@ -206,9 +212,9 @@ Long-form/batched inference + speech editing is coming soon!
     remove_silence = gr.Checkbox(label="Remove Silences", info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.", value=True)
 
     audio_output = gr.Audio(label="Synthesized Audio")
-    spectrogram_output = gr.Image(label="Spectrogram")
+    # spectrogram_output = gr.Image(label="Spectrogram")
 
-    generate_btn.click(infer, inputs=[ref_audio_input, ref_text_input, gen_text_input, model_choice, remove_silence], outputs=[audio_output, spectrogram_output])
+    generate_btn.click(infer, inputs=[ref_audio_input, ref_text_input, gen_text_input, model_choice, remove_silence], outputs=[audio_output])
     gr.Markdown("""
 ## Run Locally
 
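Since the whole commit hinges on how txtsplit cuts the input, a small usage sketch may help. It assumes the positional txtsplit(text, desired_length, max_length) signature implied by the call in the diff:

from txtsplit import txtsplit

# Inspect how a long passage is chunked with the same limits the app uses:
# roughly 100 characters preferred per chunk, 150 maximum.
text = "This is a long passage of text to synthesize. " * 10
for i, chunk in enumerate(txtsplit(text, 100, 150)):
    print(i, len(chunk), repr(chunk[:40]))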