mrfakename committed
Commit 971a624
1 Parent(s): 342cd99

Sync from GitHub repo
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
app.py
CHANGED
@@ -31,19 +31,6 @@ def gpu_decorator(func):
     else:
         return func
 
-
-
-SPLIT_WORDS = [
-    "but", "however", "nevertheless", "yet", "still",
-    "therefore", "thus", "hence", "consequently",
-    "moreover", "furthermore", "additionally",
-    "meanwhile", "alternatively", "otherwise",
-    "namely", "specifically", "for example", "such as",
-    "in fact", "indeed", "notably",
-    "in contrast", "on the other hand", "conversely",
-    "in conclusion", "to summarize", "finally"
-]
-
 device = (
     "cuda"
     if torch.cuda.is_available()
@@ -71,7 +58,6 @@ cfg_strength = 2.0
 ode_method = "euler"
 sway_sampling_coef = -1.0
 speed = 1.0
-# fix_duration = 27  # None or float (duration in seconds)
 fix_duration = None
 
 
@@ -142,7 +128,7 @@ def chunk_text(text, max_chars=135):
     return chunks
 
 @gpu_decorator
-def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, progress=gr.Progress()):
+def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration=0.15, progress=gr.Progress()):
     if exp_name == "F5-TTS":
         ema_model = F5TTS_ema_model
     elif exp_name == "E2-TTS":
@@ -200,8 +186,44 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
         generated_waves.append(generated_wave)
         spectrograms.append(generated_mel_spec[0].cpu().numpy())
 
-    # Combine all generated waves
-    final_wave = np.concatenate(generated_waves)
+    # Combine all generated waves with cross-fading
+    if cross_fade_duration <= 0:
+        # Simply concatenate
+        final_wave = np.concatenate(generated_waves)
+    else:
+        final_wave = generated_waves[0]
+        for i in range(1, len(generated_waves)):
+            prev_wave = final_wave
+            next_wave = generated_waves[i]
+
+            # Calculate cross-fade samples, ensuring it does not exceed wave lengths
+            cross_fade_samples = int(cross_fade_duration * target_sample_rate)
+            cross_fade_samples = min(cross_fade_samples, len(prev_wave), len(next_wave))
+
+            if cross_fade_samples <= 0:
+                # No overlap possible, concatenate
+                final_wave = np.concatenate([prev_wave, next_wave])
+                continue
+
+            # Overlapping parts
+            prev_overlap = prev_wave[-cross_fade_samples:]
+            next_overlap = next_wave[:cross_fade_samples]
+
+            # Fade out and fade in
+            fade_out = np.linspace(1, 0, cross_fade_samples)
+            fade_in = np.linspace(0, 1, cross_fade_samples)
+
+            # Cross-faded overlap
+            cross_faded_overlap = prev_overlap * fade_out + next_overlap * fade_in
+
+            # Combine
+            new_wave = np.concatenate([
+                prev_wave[:-cross_fade_samples],
+                cross_faded_overlap,
+                next_wave[cross_fade_samples:]
+            ])
+
+            final_wave = new_wave
 
     # Remove silence
     if remove_silence:
@@ -227,11 +249,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence,
     return (target_sample_rate, final_wave), spectrogram_path
 
 @gpu_decorator
-def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_split_words=''):
-    if not custom_split_words.strip():
-        custom_words = [word.strip() for word in custom_split_words.split(',')]
-        global SPLIT_WORDS
-        SPLIT_WORDS = custom_words
+def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, cross_fade_duration=0.15):
 
     print(gen_text)
 
@@ -283,7 +301,8 @@ def infer(ref_audio_orig, ref_text, gen_text, exp_name, remove_silence, custom_s
         print(f'gen_text {i}', batch_text)
 
     gr.Info(f"Generating audio using {exp_name} in {len(gen_text_batches)} batches")
-    return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence)
+    return infer_batch((audio, sr), ref_text, gen_text_batches, exp_name, remove_silence, cross_fade_duration)
+
 
 @gpu_decorator
 def generate_podcast(script, speaker1_name, ref_audio1, ref_text1, speaker2_name, ref_audio2, ref_text2, exp_name, remove_silence):
@@ -388,12 +407,7 @@ with gr.Blocks() as app_tts:
     remove_silence = gr.Checkbox(
         label="Remove Silences",
        info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
-        value=
-    )
-    split_words_input = gr.Textbox(
-        label="Custom Split Words",
-        info="Enter custom words to split on, separated by commas. Leave blank to use default list.",
-        lines=2,
+        value=False,
     )
     speed_slider = gr.Slider(
         label="Speed",
@@ -403,6 +417,14 @@ with gr.Blocks() as app_tts:
         step=0.1,
         info="Adjust the speed of the audio.",
     )
+    cross_fade_duration_slider = gr.Slider(
+        label="Cross-Fade Duration (s)",
+        minimum=0.0,
+        maximum=1.0,
+        value=0.15,
+        step=0.01,
+        info="Set the duration of the cross-fade between audio clips.",
+    )
     speed_slider.change(update_speed, inputs=speed_slider)
 
     audio_output = gr.Audio(label="Synthesized Audio")
@@ -416,7 +438,7 @@ with gr.Blocks() as app_tts:
             gen_text_input,
             model_choice,
             remove_silence,
-            split_words_input,
+            cross_fade_duration_slider,
         ],
         outputs=[audio_output, spectrogram_output],
     )
@@ -664,7 +686,7 @@ with gr.Blocks() as app_emotional:
             ref_text = speech_types[current_emotion].get('ref_text', '')
 
             # Generate speech for this segment
-            audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence,
+            audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, 0)
             sr, audio_data = audio
 
             generated_audio_segments.append(audio_data)