Spaces:

mrfakename
/

E2-F5-TTS

Running on Zero

App Files Files Community

mrfakename committed 16 days ago

Commit

b624c42

•

1 Parent(s): 118c154

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.

Files changed (4) hide show

api.py +13 -11
model/backbones/dit.py +1 -1
model/backbones/unett.py +1 -1
model/utils_infer.py +27 -23

api.py CHANGED Viewed

@@ -69,6 +69,10 @@ class F5TTS:
         ref_file,
         ref_text,
         gen_text,
         sway_sampling_coef=-1,
         cfg_strength=2,
         nfe_step=32,
@@ -77,23 +81,21 @@ class F5TTS:
         remove_silence=False,
         file_wave=None,
         file_spect=None,
-        cross_fade_duration=0.15,
-        show_info=print,
-        progress=tqdm,
     ):
         wav, sr, spect = infer_process(
             ref_file,
             ref_text,
             gen_text,
             self.ema_model,
-            cross_fade_duration,
-            speed,
-            show_info,
-            progress,
-            nfe_step,
-            cfg_strength,
-            sway_sampling_coef,
-            fix_duration,
         )
         if file_wave is not None:

         ref_file,
         ref_text,
         gen_text,
+        show_info=print,
+        progress=tqdm,
+        target_rms=0.1,
+        cross_fade_duration=0.15,
         sway_sampling_coef=-1,
         cfg_strength=2,
         nfe_step=32,
         remove_silence=False,
         file_wave=None,
         file_spect=None,
     ):
         wav, sr, spect = infer_process(
             ref_file,
             ref_text,
             gen_text,
             self.ema_model,
+            show_info=show_info,
+            progress=progress,
+            target_rms=target_rms,
+            cross_fade_duration=cross_fade_duration,
+            nfe_step=nfe_step,
+            cfg_strength=cfg_strength,
+            sway_sampling_coef=sway_sampling_coef,
+            speed=speed,
+            fix_duration=fix_duration,
         )
         if file_wave is not None:

model/backbones/dit.py CHANGED Viewed

@@ -45,9 +45,9 @@ class TextEmbedding(nn.Module):
             self.extra_modeling = False
     def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
-        batch, text_len = text.shape[0], text.shape[1]
         text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
         text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
         text = F.pad(text, (0, seq_len - text_len), value=0)
         if drop_text:  # cfg for text

             self.extra_modeling = False
     def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
         text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
         text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
+        batch, text_len = text.shape[0], text.shape[1]
         text = F.pad(text, (0, seq_len - text_len), value=0)
         if drop_text:  # cfg for text

model/backbones/unett.py CHANGED Viewed

@@ -48,9 +48,9 @@ class TextEmbedding(nn.Module):
             self.extra_modeling = False
     def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
-        batch, text_len = text.shape[0], text.shape[1]
         text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
         text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
         text = F.pad(text, (0, seq_len - text_len), value=0)
         if drop_text:  # cfg for text

             self.extra_modeling = False
     def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
         text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
         text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
+        batch, text_len = text.shape[0], text.shape[1]
         text = F.pad(text, (0, seq_len - text_len), value=0)
         if drop_text:  # cfg for text

model/utils_infer.py CHANGED Viewed

@@ -31,12 +31,13 @@ target_sample_rate = 24000
 n_mel_channels = 100
 hop_length = 256
 target_rms = 0.1
-# nfe_step = 32  # 16, 32
-# cfg_strength = 2.0
-# ode_method = "euler"
-# sway_sampling_coef = -1.0
-# speed = 1.0
-# fix_duration = None
 # -----------------------------------------
@@ -107,7 +108,7 @@ def initialize_asr_pipeline(device=device):
 # load model for inference
-def load_model(model_cls, model_cfg, ckpt_path, vocab_file="", ode_method="euler", use_ema=True, device=device):
     if vocab_file == "":
         vocab_file = "Emilia_ZH_EN"
         tokenizer = "pinyin"
@@ -192,14 +193,15 @@ def infer_process(
     ref_text,
     gen_text,
     model_obj,
-    cross_fade_duration=0.15,
-    speed=1.0,
     show_info=print,
     progress=tqdm,
-    nfe_step=32,
-    cfg_strength=2,
-    sway_sampling_coef=-1,
-    fix_duration=None,
 ):
     # Split the input text into batches
     audio, sr = torchaudio.load(ref_audio)
@@ -214,13 +216,14 @@ def infer_process(
         ref_text,
         gen_text_batches,
         model_obj,
-        cross_fade_duration,
-        speed,
-        progress,
-        nfe_step,
-        cfg_strength,
-        sway_sampling_coef,
-        fix_duration,
     )
@@ -232,12 +235,13 @@ def infer_batch_process(
     ref_text,
     gen_text_batches,
     model_obj,
-    cross_fade_duration=0.15,
-    speed=1,
     progress=tqdm,
     nfe_step=32,
     cfg_strength=2.0,
     sway_sampling_coef=-1,
     fix_duration=None,
 ):
     audio, sr = ref_audio
@@ -262,11 +266,11 @@ def infer_batch_process(
         text_list = [ref_text + gen_text]
         final_text_list = convert_char_to_pinyin(text_list)
         if fix_duration is not None:
             duration = int(fix_duration * target_sample_rate / hop_length)
         else:
             # Calculate duration
-            ref_audio_len = audio.shape[-1] // hop_length
             ref_text_len = len(ref_text.encode("utf-8"))
             gen_text_len = len(gen_text.encode("utf-8"))
             duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)

 n_mel_channels = 100
 hop_length = 256
 target_rms = 0.1
+cross_fade_duration = 0.15
+ode_method = "euler"
+nfe_step = 32  # 16, 32
+cfg_strength = 2.0
+sway_sampling_coef = -1.0
+speed = 1.0
+fix_duration = None
 # -----------------------------------------
 # load model for inference
+def load_model(model_cls, model_cfg, ckpt_path, vocab_file="", ode_method=ode_method, use_ema=True, device=device):
     if vocab_file == "":
         vocab_file = "Emilia_ZH_EN"
         tokenizer = "pinyin"
     ref_text,
     gen_text,
     model_obj,
     show_info=print,
     progress=tqdm,
+    target_rms=target_rms,
+    cross_fade_duration=cross_fade_duration,
+    nfe_step=nfe_step,
+    cfg_strength=cfg_strength,
+    sway_sampling_coef=sway_sampling_coef,
+    speed=speed,
+    fix_duration=fix_duration,
 ):
     # Split the input text into batches
     audio, sr = torchaudio.load(ref_audio)
         ref_text,
         gen_text_batches,
         model_obj,
+        progress=progress,
+        target_rms=target_rms,
+        cross_fade_duration=cross_fade_duration,
+        nfe_step=nfe_step,
+        cfg_strength=cfg_strength,
+        sway_sampling_coef=sway_sampling_coef,
+        speed=speed,
+        fix_duration=fix_duration,
     )
     ref_text,
     gen_text_batches,
     model_obj,
     progress=tqdm,
+    target_rms=0.1,
+    cross_fade_duration=0.15,
     nfe_step=32,
     cfg_strength=2.0,
     sway_sampling_coef=-1,
+    speed=1,
     fix_duration=None,
 ):
     audio, sr = ref_audio
         text_list = [ref_text + gen_text]
         final_text_list = convert_char_to_pinyin(text_list)
+        ref_audio_len = audio.shape[-1] // hop_length
         if fix_duration is not None:
             duration = int(fix_duration * target_sample_rate / hop_length)
         else:
             # Calculate duration
             ref_text_len = len(ref_text.encode("utf-8"))
             gen_text_len = len(gen_text.encode("utf-8"))
             duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)