mrfakename committed • Commit ad63082 • Parent(s): dd4e6c8

Sync from GitHub repo

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.

Files changed:
- README_REPO.md +7 -1
- app.py +1 -2
- inference-cli.py +35 -21
- model/utils.py +6 -7
- requirements.txt +2 -8
- requirements_eval.txt +5 -0
README_REPO.md  CHANGED

````diff
@@ -62,7 +62,7 @@ An initial guidance on Finetuning [#57](https://github.com/SWivid/F5-TTS/discuss
 
 ## Inference
 
-
+The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [⭐ Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or automatically downloaded with `inference-cli` and `gradio_app`.
 
 Currently support 30s for a single generation, which is the **TOTAL** length of prompt audio and the generated. Batch inference with chunks is supported by `inference-cli` and `gradio_app`.
 - To avoid possible inference failures, make sure you have seen through the following instructions.
@@ -148,6 +148,12 @@ bash scripts/eval_infer_batch.sh
 
 ### Objective Evaluation
 
+Install packages for evaluation:
+
+```bash
+pip install -r requirements_eval.txt
+```
+
 **Some Notes**
 
 For faster-whisper with CUDA 11:
````
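The added README line points out that `inference-cli` and `gradio_app` download the checkpoints automatically. As a rough sketch of what that download reduces to, using the `cached_path` call this commit adds to inference-cli.py (the `SWivid/F5-TTS` repo id is from the diff; the experiment name `F5TTS_Base` and step `1200000` are illustrative assumptions, not stated in this commit):

```python
# Sketch only: fetch an F5-TTS checkpoint the way the updated inference-cli.py does.
# "SWivid/F5-TTS" comes from this commit; the exp name / ckpt step below are assumed.
from cached_path import cached_path

ckpt_path = str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"))
print(f"Checkpoint cached at: {ckpt_path}")
```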
app.py  CHANGED

```diff
@@ -1,4 +1,3 @@
-import os
 import re
 import torch
 import torchaudio
@@ -17,7 +16,6 @@ from model.utils import (
     save_spectrogram,
 )
 from transformers import pipeline
-import librosa
 import click
 import soundfile as sf
 
@@ -429,6 +427,7 @@ with gr.Blocks() as app_credits:
 
 * [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
 * [RootingInLoad](https://github.com/RootingInLoad) for the podcast generation
+* [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation
 """)
 with gr.Blocks() as app_tts:
     gr.Markdown("# Batched TTS")
```
inference-cli.py  CHANGED

```diff
@@ -1,26 +1,24 @@
+import argparse
+import codecs
 import re
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+import tomli
 import torch
 import torchaudio
-import
-import
+import tqdm
+from cached_path import cached_path
 from einops import rearrange
-from vocos import Vocos
 from pydub import AudioSegment, silence
-from model import CFM, UNetT, DiT, MMDiT
-from cached_path import cached_path
-from model.utils import (
-    load_checkpoint,
-    get_tokenizer,
-    convert_char_to_pinyin,
-    save_spectrogram,
-)
 from transformers import pipeline
-
-
-import
-import
-
-import codecs
+from vocos import Vocos
+
+from model import CFM, DiT, MMDiT, UNetT
+from model.utils import (convert_char_to_pinyin, get_tokenizer,
+                         load_checkpoint, save_spectrogram)
 
 parser = argparse.ArgumentParser(
     prog="python3 inference-cli.py",
@@ -73,6 +71,11 @@ parser.add_argument(
     "--remove_silence",
     help="Remove silence.",
 )
+parser.add_argument(
+    "--load_vocoder_from_local",
+    action="store_true",
+    help="load vocoder from local. Default: ../checkpoints/charactr/vocos-mel-24khz",
+)
 args = parser.parse_args()
 
 config = tomli.load(open(args.config, "rb"))
@@ -88,6 +91,7 @@ model = args.model if args.model else config["model"]
 remove_silence = args.remove_silence if args.remove_silence else config["remove_silence"]
 wave_path = Path(output_dir)/"out.wav"
 spectrogram_path = Path(output_dir)/"out.png"
+vocos_local_path = "../checkpoints/charactr/vocos-mel-24khz"
 
 SPLIT_WORDS = [
     "but", "however", "nevertheless", "yet", "still",
@@ -105,7 +109,16 @@ device = (
     if torch.cuda.is_available()
     else "mps" if torch.backends.mps.is_available() else "cpu"
 )
-
+
+if args.load_vocoder_from_local:
+    print(f"Load vocos from local path {vocos_local_path}")
+    vocos = Vocos.from_hparams(f"{vocos_local_path}/config.yaml")
+    state_dict = torch.load(f"{vocos_local_path}/pytorch_model.bin", map_location=device)
+    vocos.load_state_dict(state_dict)
+    vocos.eval()
+else:
+    print("Donwload Vocos from huggingface charactr/vocos-mel-24khz")
+    vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
 
 print(f"Using {device} device")
 
@@ -124,8 +137,9 @@ speed = 1.0
 fix_duration = None
 
 def load_model(repo_name, exp_name, model_cls, model_cfg, ckpt_step):
-    ckpt_path =
-
+    ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.pt"  # .pt | .safetensors
+    if not Path(ckpt_path).exists():
+        ckpt_path = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
     vocab_char_map, vocab_size = get_tokenizer("Emilia_ZH_EN", "pinyin")
     model = CFM(
         transformer=model_cls(
@@ -385,4 +399,4 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, custom_spli
     return infer_batch((audio, sr), ref_text, gen_text_batches, model, remove_silence)
 
 
-infer(ref_audio, ref_text, gen_text, model, remove_silence, ",".join(SPLIT_WORDS))
+infer(ref_audio, ref_text, gen_text, model, remove_silence, ",".join(SPLIT_WORDS))
```
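The inference-cli.py hunks above add a `--load_vocoder_from_local` flag: when set, Vocos is restored from `../checkpoints/charactr/vocos-mel-24khz` (a `config.yaml` plus `pytorch_model.bin`) instead of being pulled from the Hub. A minimal self-contained sketch of that fallback, condensed from the diff (the helper name `load_vocos` is ours, not part of the commit):

```python
import torch
from vocos import Vocos


def load_vocos(local_path: str = "../checkpoints/charactr/vocos-mel-24khz",
               from_local: bool = False, device: str = "cpu") -> Vocos:
    """Load the Vocos vocoder from a local checkpoint dir, or fall back to the Hub."""
    if from_local:
        # Local layout expected by the diff: config.yaml + pytorch_model.bin
        vocos = Vocos.from_hparams(f"{local_path}/config.yaml")
        state_dict = torch.load(f"{local_path}/pytorch_model.bin", map_location=device)
        vocos.load_state_dict(state_dict)
        return vocos.eval()
    # Default behaviour: download charactr/vocos-mel-24khz from Hugging Face
    return Vocos.from_pretrained("charactr/vocos-mel-24khz")
```

With the flag in place, the CLI would be invoked as `python inference-cli.py --load_vocoder_from_local ...`, with the other arguments unchanged.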
model/utils.py  CHANGED

```diff
@@ -22,12 +22,6 @@ from einops import rearrange, reduce
 
 import jieba
 from pypinyin import lazy_pinyin, Style
-import zhconv
-from zhon.hanzi import punctuation
-from jiwer import compute_measures
-
-from funasr import AutoModel
-from faster_whisper import WhisperModel
 
 from model.ecapa_tdnn import ECAPA_TDNN_SMALL
 from model.modules import MelSpec
@@ -432,6 +426,7 @@ def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path
 
 def load_asr_model(lang, ckpt_dir = ""):
     if lang == "zh":
+        from funasr import AutoModel
         model = AutoModel(
             model = os.path.join(ckpt_dir, "paraformer-zh"),
             # vad_model = os.path.join(ckpt_dir, "fsmn-vad"),
@@ -440,6 +435,7 @@ def load_asr_model(lang, ckpt_dir = ""):
             disable_update=True,
         ) # following seed-tts setting
     elif lang == "en":
+        from faster_whisper import WhisperModel
         model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
         model = WhisperModel(model_size, device="cuda", compute_type="float16")
     return model
@@ -451,6 +447,7 @@
     rank, lang, test_set, ckpt_dir = args
 
     if lang == "zh":
+        import zhconv
         torch.cuda.set_device(rank)
     elif lang == "en":
         os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
@@ -458,10 +455,12 @@
         raise NotImplementedError("lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now.")
 
     asr_model = load_asr_model(lang, ckpt_dir = ckpt_dir)
-
+
+    from zhon.hanzi import punctuation
     punctuation_all = punctuation + string.punctuation
     wers = []
 
+    from jiwer import compute_measures
     for gen_wav, prompt_wav, truth in tqdm(test_set):
         if lang == "zh":
             res = asr_model.generate(input=gen_wav, batch_size_s=300, disable_pbar=True)
```
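The model/utils.py change moves the evaluation-only imports (`funasr`, `faster_whisper`, `zhconv`, `zhon`, `jiwer`) from module level into the functions that use them, so importing `model.utils` for training or inference no longer requires those packages. A minimal illustration of the deferred-import pattern, simplified from `load_asr_model` in the diff (paths and kwargs reduced for brevity):

```python
def load_asr_model(lang: str, ckpt_dir: str = ""):
    """Only import the heavy ASR backends when evaluation actually runs."""
    if lang == "zh":
        # Deferred import: fails only if/when Chinese ASR evaluation is requested,
        # so the base install (requirements.txt) can omit funasr.
        from funasr import AutoModel
        return AutoModel(model="paraformer-zh", disable_update=True)
    elif lang == "en":
        # Same idea for faster-whisper on the English side.
        from faster_whisper import WhisperModel
        return WhisperModel("large-v3" if ckpt_dir == "" else ckpt_dir,
                            device="cuda", compute_type="float16")
    raise NotImplementedError("lang support only 'zh' and 'en' for now.")
```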
requirements.txt  CHANGED

```diff
@@ -5,25 +5,19 @@ datasets
 einops>=0.8.0
 einx>=0.3.0
 ema_pytorch>=0.5.2
-faster_whisper
-funasr
 gradio
 jieba
-jiwer
 librosa
 matplotlib
-numpy
+numpy<=1.26.4
 pydub
 pypinyin
 safetensors
 soundfile
-
-# torchaudio>=2.3.0
+tomli
 torchdiffeq
 tqdm>=4.65.0
 transformers
 vocos
 wandb
 x_transformers>=1.31.14
-zhconv
-zhon
```
requirements_eval.txt  ADDED

```diff
@@ -0,0 +1,5 @@
+faster_whisper
+funasr
+jiwer
+zhconv
+zhon
```
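Taken together, the requirements changes pin `numpy<=1.26.4`, add `tomli` (used by inference-cli.py to read its config), and move the five evaluation-only packages into the new requirements_eval.txt, matching the deferred imports above. A hypothetical pre-flight check (not part of this commit) that the evaluation extras are installed before running the eval scripts:

```python
# Illustrative helper, not from this commit: verify the optional evaluation
# dependencies listed in requirements_eval.txt are importable.
import importlib.util

EVAL_PACKAGES = ["faster_whisper", "funasr", "jiwer", "zhconv", "zhon"]

missing = [pkg for pkg in EVAL_PACKAGES if importlib.util.find_spec(pkg) is None]
if missing:
    raise SystemExit(
        f"Missing evaluation packages: {', '.join(missing)}. "
        "Install them with: pip install -r requirements_eval.txt"
    )
```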