mrfakename committed • Commit ad63082 • Parent(s): dd4e6c8

Sync from GitHub repo

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.

Files changed:
- README_REPO.md +7 -1
- app.py +1 -2
- inference-cli.py +35 -21
- model/utils.py +6 -7
- requirements.txt +2 -8
- requirements_eval.txt +5 -0
README_REPO.md  CHANGED

````diff
@@ -62,7 +62,7 @@ An initial guidance on Finetuning [#57](https://github.com/SWivid/F5-TTS/discuss
 
 ## Inference
 
-
+The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [⭐ Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or automatically downloaded with `inference-cli` and `gradio_app`.
 
 Currently support 30s for a single generation, which is the **TOTAL** length of prompt audio and the generated. Batch inference with chunks is supported by `inference-cli` and `gradio_app`.
 - To avoid possible inference failures, make sure you have seen through the following instructions.
@@ -148,6 +148,12 @@ bash scripts/eval_infer_batch.sh
 
 ### Objective Evaluation
 
+Install packages for evaluation:
+
+```bash
+pip install -r requirements_eval.txt
+```
+
 **Some Notes**
 
 For faster-whisper with CUDA 11:
````
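The added README line points out that `inference-cli` and `gradio_app` download the checkpoints automatically. As a rough sketch of what that download reduces to, using the `cached_path` call this commit adds to inference-cli.py (the `SWivid/F5-TTS` repo id is from the diff; the experiment name `F5TTS_Base` and step `1200000` are illustrative assumptions, not stated in this commit):

```python
# Sketch only: fetch an F5-TTS checkpoint the way the updated inference-cli.py does.
# "SWivid/F5-TTS" comes from this commit; the exp name / ckpt step below are assumed.
from cached_path import cached_path

ckpt_path = str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"))
print(f"Checkpoint cached at: {ckpt_path}")
```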
app.py  CHANGED

```diff
@@ -1,4 +1,3 @@
-import os
 import re
 import torch
 import torchaudio
@@ -17,7 +16,6 @@ from model.utils import (
     save_spectrogram,
 )
 from transformers import pipeline
-import librosa
 import click
 import soundfile as sf
 
@@ -429,6 +427,7 @@ with gr.Blocks() as app_credits:
 
 * [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
 * [RootingInLoad](https://github.com/RootingInLoad) for the podcast generation
+* [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation
 """)
 with gr.Blocks() as app_tts:
     gr.Markdown("# Batched TTS")
```
inference-cli.py  CHANGED

```diff
@@ -1,26 +1,24 @@
+import argparse
+import codecs
 import re
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+import tomli
 import torch
 import torchaudio
-import
-import
+import tqdm
+from cached_path import cached_path
 from einops import rearrange
-from vocos import Vocos
 from pydub import AudioSegment, silence
-from model import CFM, UNetT, DiT, MMDiT
-from cached_path import cached_path
-from model.utils import (
-    load_checkpoint,
-    get_tokenizer,
-    convert_char_to_pinyin,
-    save_spectrogram,
-)
 from transformers import pipeline
-
-
-import
-import
-
-import codecs
+from vocos import Vocos
+
+from model import CFM, DiT, MMDiT, UNetT
+from model.utils import (convert_char_to_pinyin, get_tokenizer,
+                         load_checkpoint, save_spectrogram)
 
 parser = argparse.ArgumentParser(
     prog="python3 inference-cli.py",
@@ -73,6 +71,11 @@ parser.add_argument(
     "--remove_silence",
     help="Remove silence.",
 )
+parser.add_argument(
+    "--load_vocoder_from_local",
+    action="store_true",
+    help="load vocoder from local. Default: ../checkpoints/charactr/vocos-mel-24khz",
+)
 args = parser.parse_args()
 
 config = tomli.load(open(args.config, "rb"))
@@ -88,6 +91,7 @@ model = args.model if args.model else config["model"]
 remove_silence = args.remove_silence if args.remove_silence else config["remove_silence"]
 wave_path = Path(output_dir)/"out.wav"
 spectrogram_path = Path(output_dir)/"out.png"
+vocos_local_path = "../checkpoints/charactr/vocos-mel-24khz"
 
 SPLIT_WORDS = [
     "but", "however", "nevertheless", "yet", "still",
@@ -105,7 +109,16 @@ device = (
     if torch.cuda.is_available()
     else "mps" if torch.backends.mps.is_available() else "cpu"
 )
-
+
+if args.load_vocoder_from_local:
+    print(f"Load vocos from local path {vocos_local_path}")
+    vocos = Vocos.from_hparams(f"{vocos_local_path}/config.yaml")
+    state_dict = torch.load(f"{vocos_local_path}/pytorch_model.bin", map_location=device)
+    vocos.load_state_dict(state_dict)
+    vocos.eval()
+else:
+    print("Donwload Vocos from huggingface charactr/vocos-mel-24khz")
+    vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
 
 print(f"Using {device} device")
 
@@ -124,8 +137,9 @@ speed = 1.0
 fix_duration = None
 
 def load_model(repo_name, exp_name, model_cls, model_cfg, ckpt_step):
-    ckpt_path =
-
+    ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt"  # .pt | .safetensors
+    if not Path(ckpt_path).exists():
+        ckpt_path = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
     vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
     model = CFM(
         transformer=model_cls(
@@ -385,4 +399,4 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, custom_spli
     return infer_batch((audio, sr), ref_text, gen_text_batches, model, remove_silence)
 
 
-infer(ref_audio, ref_text, gen_text, model, remove_silence, ",".join(SPLIT_WORDS))
+infer(ref_audio, ref_text, gen_text, model, remove_silence, ",".join(SPLIT_WORDS))
```
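The inference-cli.py hunks above add a `--load_vocoder_from_local` flag: when set, Vocos is restored from `../checkpoints/charactr/vocos-mel-24khz` (a `config.yaml` plus `pytorch_model.bin`) instead of being pulled from the Hub. A minimal self-contained sketch of that fallback, condensed from the diff (the helper name `load_vocos` is ours, not part of the commit):

```python
import torch
from vocos import Vocos


def load_vocos(local_path: str = "../checkpoints/charactr/vocos-mel-24khz",
               from_local: bool = False, device: str = "cpu") -> Vocos:
    """Load the Vocos vocoder from a local checkpoint dir, or fall back to the Hub."""
    if from_local:
        # Local layout expected by the diff: config.yaml + pytorch_model.bin
        vocos = Vocos.from_hparams(f"{local_path}/config.yaml")
        state_dict = torch.load(f"{local_path}/pytorch_model.bin", map_location=device)
        vocos.load_state_dict(state_dict)
        return vocos.eval()
    # Default behaviour: download charactr/vocos-mel-24khz from Hugging Face
    return Vocos.from_pretrained("charactr/vocos-mel-24khz")
```

With the flag in place, the CLI would be invoked as `python inference-cli.py --load_vocoder_from_local ...`, with the other arguments unchanged.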
model/utils.py  CHANGED

```diff
@@ -22,12 +22,6 @@ from einops import rearrange, reduce
 
 import jieba
 from pypinyin import lazy_pinyin, Style
-import zhconv
-from zhon.hanzi import punctuation
-from jiwer import compute_measures
-
-from funasr import AutoModel
-from faster_whisper import WhisperModel
 
 from model.ecapa_tdnn import ECAPA_TDNN_SMALL
 from model.modules import MelSpec
@@ -432,6 +426,7 @@ def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path
 
 def load_asr_model(lang, ckpt_dir = ""):
     if lang == "zh":
+        from funasr import AutoModel
         model = AutoModel(
             model = os.path.join(ckpt_dir, "paraformer-zh"),
             # vad_model = os.path.join(ckpt_dir, "fsmn-vad"),
@@ -440,6 +435,7 @@ def load_asr_model(lang, ckpt_dir = ""):
             disable_update=True,
         ) # following seed-tts setting
     elif lang == "en":
+        from faster_whisper import WhisperModel
         model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
         model = WhisperModel(model_size, device="cuda", compute_type="float16")
     return model
@@ -451,6 +447,7 @@
     rank, lang, test_set, ckpt_dir = args
 
     if lang == "zh":
+        import zhconv
         torch.cuda.set_device(rank)
     elif lang == "en":
         os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
@@ -458,10 +455,12 @@
         raise NotImplementedError("lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now.")
 
     asr_model = load_asr_model(lang, ckpt_dir = ckpt_dir)
-
+
+    from zhon.hanzi import punctuation
     punctuation_all = punctuation + string.punctuation
     wers = []
 
+    from jiwer import compute_measures
     for gen_wav, prompt_wav, truth in tqdm(test_set):
         if lang == "zh":
             res = asr_model.generate(input=gen_wav, batch_size_s=300, disable_pbar=True)
```
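The model/utils.py change moves the evaluation-only imports (`funasr`, `faster_whisper`, `zhconv`, `zhon`, `jiwer`) from module level into the functions that use them, so importing `model.utils` for training or inference no longer requires those packages. A minimal illustration of the deferred-import pattern, simplified from `load_asr_model` in the diff (paths and kwargs reduced for brevity):

```python
def load_asr_model(lang: str, ckpt_dir: str = ""):
    """Only import the heavy ASR backends when evaluation actually runs."""
    if lang == "zh":
        # Deferred import: fails only if/when Chinese ASR evaluation is requested,
        # so the base install (requirements.txt) can omit funasr.
        from funasr import AutoModel
        return AutoModel(model="paraformer-zh", disable_update=True)
    elif lang == "en":
        # Same idea for faster-whisper on the English side.
        from faster_whisper import WhisperModel
        return WhisperModel("large-v3" if ckpt_dir == "" else ckpt_dir,
                            device="cuda", compute_type="float16")
    raise NotImplementedError("lang support only 'zh' and 'en' for now.")
```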
requirements.txt  CHANGED

```diff
@@ -5,25 +5,19 @@ datasets
 einops>=0.8.0
 einx>=0.3.0
 ema_pytorch>=0.5.2
-faster_whisper
-funasr
 gradio
 jieba
-jiwer
 librosa
 matplotlib
-numpy
+numpy<=1.26.4
 pydub
 pypinyin
 safetensors
 soundfile
-
-# torchaudio>=2.3.0
+tomli
 torchdiffeq
 tqdm>=4.65.0
 transformers
 vocos
 wandb
 x_transformers>=1.31.14
-zhconv
-zhon
```
requirements_eval.txt  ADDED

```diff
@@ -0,0 +1,5 @@
+faster_whisper
+funasr
+jiwer
+zhconv
+zhon
```
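Taken together, the requirements changes pin `numpy<=1.26.4`, add `tomli` (used by inference-cli.py to read its config), and move the five evaluation-only packages into the new requirements_eval.txt, matching the deferred imports above. A hypothetical pre-flight check (not part of this commit) that the evaluation extras are installed before running the eval scripts:

```python
# Illustrative helper, not from this commit: verify the optional evaluation
# dependencies listed in requirements_eval.txt are importable.
import importlib.util

EVAL_PACKAGES = ["faster_whisper", "funasr", "jiwer", "zhconv", "zhon"]

missing = [pkg for pkg in EVAL_PACKAGES if importlib.util.find_spec(pkg) is None]
if missing:
    raise SystemExit(
        f"Missing evaluation packages: {', '.join(missing)}. "
        "Install them with: pip install -r requirements_eval.txt"
    )
```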