mrfakename committed on
Commit
0978fba
1 Parent(s): 0559e57

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the GitHub repo there, not to this Space.

Files changed (1) hide show
  1. app.py +37 -34
app.py CHANGED
@@ -79,8 +79,10 @@ def generate_response(messages, model, tokenizer):
79
 
80
 
81
  @gpu_decorator
82
- def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1):
83
- ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=gr.Info)
 
 
84
 
85
  if model == "F5-TTS":
86
  ema_model = F5TTS_ema_model
@@ -94,7 +96,7 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_
94
  ema_model,
95
  cross_fade_duration=cross_fade_duration,
96
  speed=speed,
97
- show_info=gr.Info,
98
  progress=gr.Progress(),
99
  )
100
 
@@ -183,24 +185,24 @@ def parse_speechtypes_text(gen_text):
183
 
184
  segments = []
185
 
186
- current_emotion = "Regular"
187
 
188
  for i in range(len(tokens)):
189
  if i % 2 == 0:
190
  # This is text
191
  text = tokens[i].strip()
192
  if text:
193
- segments.append({"emotion": current_emotion, "text": text})
194
  else:
195
- # This is emotion
196
- emotion = tokens[i].strip()
197
- current_emotion = emotion
198
 
199
  return segments
200
 
201
 
202
  with gr.Blocks() as app_multistyle:
203
- # New section for emotional generation
204
  gr.Markdown(
205
  """
206
  # Multiple Speech-Type Generation
@@ -313,29 +315,29 @@ with gr.Blocks() as app_multistyle:
313
  delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
314
 
315
  # Text input for the prompt
316
- gen_text_input_emotional = gr.Textbox(
317
  label="Text to Generate",
318
  lines=10,
319
  placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
320
  )
321
 
322
  # Model choice
323
- model_choice_emotional = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
324
 
325
  with gr.Accordion("Advanced Settings", open=False):
326
- remove_silence_emotional = gr.Checkbox(
327
  label="Remove Silences",
328
  value=False,
329
  )
330
 
331
  # Generate button
332
- generate_emotional_btn = gr.Button("Generate Emotional Speech", variant="primary")
333
 
334
  # Output audio
335
- audio_output_emotional = gr.Audio(label="Synthesized Audio")
336
 
337
  @gpu_decorator
338
- def generate_emotional_speech(
339
  regular_audio,
340
  regular_ref_text,
341
  gen_text,
@@ -362,23 +364,23 @@ with gr.Blocks() as app_multistyle:
362
 
363
  # For each segment, generate speech
364
  generated_audio_segments = []
365
- current_emotion = "Regular"
366
 
367
  for segment in segments:
368
- emotion = segment["emotion"]
369
  text = segment["text"]
370
 
371
- if emotion in speech_types:
372
- current_emotion = emotion
373
  else:
374
- # If emotion not available, default to Regular
375
- current_emotion = "Regular"
376
 
377
- ref_audio = speech_types[current_emotion]["audio"]
378
- ref_text = speech_types[current_emotion].get("ref_text", "")
379
 
380
  # Generate speech for this segment
381
- audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, 0)
382
  sr, audio_data = audio
383
 
384
  generated_audio_segments.append(audio_data)
@@ -391,21 +393,21 @@ with gr.Blocks() as app_multistyle:
391
  gr.Warning("No audio generated.")
392
  return None
393
 
394
- generate_emotional_btn.click(
395
- generate_emotional_speech,
396
  inputs=[
397
  regular_audio,
398
  regular_ref_text,
399
- gen_text_input_emotional,
400
  ]
401
  + speech_type_names
402
  + speech_type_audios
403
  + speech_type_ref_texts
404
  + [
405
- model_choice_emotional,
406
- remove_silence_emotional,
407
  ],
408
- outputs=audio_output_emotional,
409
  )
410
 
411
  # Validation function to disable Generate button if speech types are missing
@@ -423,7 +425,7 @@ with gr.Blocks() as app_multistyle:
423
 
424
  # Parse the gen_text to get the speech types used
425
  segments = parse_speechtypes_text(gen_text)
426
- speech_types_in_text = set(segment["emotion"] for segment in segments)
427
 
428
  # Check if all speech types in text are available
429
  missing_speech_types = speech_types_in_text - speech_types_available
@@ -435,10 +437,10 @@ with gr.Blocks() as app_multistyle:
435
  # Enable the generate button
436
  return gr.update(interactive=True)
437
 
438
- gen_text_input_emotional.change(
439
  validate_speech_types,
440
- inputs=[gen_text_input_emotional, regular_name] + speech_type_names,
441
- outputs=generate_emotional_btn,
442
  )
443
 
444
 
@@ -576,6 +578,7 @@ Have a conversation with an AI using your reference voice!
576
  remove_silence,
577
  cross_fade_duration=0.15,
578
  speed=1.0,
 
579
  )
580
  return audio_result
581
 
 
79
 
80
 
81
  @gpu_decorator
82
+ def infer(
83
+ ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
84
+ ):
85
+ ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
86
 
87
  if model == "F5-TTS":
88
  ema_model = F5TTS_ema_model
 
96
  ema_model,
97
  cross_fade_duration=cross_fade_duration,
98
  speed=speed,
99
+ show_info=show_info,
100
  progress=gr.Progress(),
101
  )
102
 
 
185
 
186
  segments = []
187
 
188
+ current_style = "Regular"
189
 
190
  for i in range(len(tokens)):
191
  if i % 2 == 0:
192
  # This is text
193
  text = tokens[i].strip()
194
  if text:
195
+ segments.append({"style": current_style, "text": text})
196
  else:
197
+ # This is style
198
+ style = tokens[i].strip()
199
+ current_style = style
200
 
201
  return segments
202
 
203
 
204
  with gr.Blocks() as app_multistyle:
205
+ # New section for multistyle generation
206
  gr.Markdown(
207
  """
208
  # Multiple Speech-Type Generation
 
315
  delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
316
 
317
  # Text input for the prompt
318
+ gen_text_input_multistyle = gr.Textbox(
319
  label="Text to Generate",
320
  lines=10,
321
  placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
322
  )
323
 
324
  # Model choice
325
+ model_choice_multistyle = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
326
 
327
  with gr.Accordion("Advanced Settings", open=False):
328
+ remove_silence_multistyle = gr.Checkbox(
329
  label="Remove Silences",
330
  value=False,
331
  )
332
 
333
  # Generate button
334
+ generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
335
 
336
  # Output audio
337
+ audio_output_multistyle = gr.Audio(label="Synthesized Audio")
338
 
339
  @gpu_decorator
340
+ def generate_multistyle_speech(
341
  regular_audio,
342
  regular_ref_text,
343
  gen_text,
 
364
 
365
  # For each segment, generate speech
366
  generated_audio_segments = []
367
+ current_style = "Regular"
368
 
369
  for segment in segments:
370
+ style = segment["style"]
371
  text = segment["text"]
372
 
373
+ if style in speech_types:
374
+ current_style = style
375
  else:
376
+ # If style not available, default to Regular
377
+ current_style = "Regular"
378
 
379
+ ref_audio = speech_types[current_style]["audio"]
380
+ ref_text = speech_types[current_style].get("ref_text", "")
381
 
382
  # Generate speech for this segment
383
+ audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, 0, show_info=None)
384
  sr, audio_data = audio
385
 
386
  generated_audio_segments.append(audio_data)
 
393
  gr.Warning("No audio generated.")
394
  return None
395
 
396
+ generate_multistyle_btn.click(
397
+ generate_multistyle_speech,
398
  inputs=[
399
  regular_audio,
400
  regular_ref_text,
401
+ gen_text_input_multistyle,
402
  ]
403
  + speech_type_names
404
  + speech_type_audios
405
  + speech_type_ref_texts
406
  + [
407
+ model_choice_multistyle,
408
+ remove_silence_multistyle,
409
  ],
410
+ outputs=audio_output_multistyle,
411
  )
412
 
413
  # Validation function to disable Generate button if speech types are missing
 
425
 
426
  # Parse the gen_text to get the speech types used
427
  segments = parse_speechtypes_text(gen_text)
428
+ speech_types_in_text = set(segment["style"] for segment in segments)
429
 
430
  # Check if all speech types in text are available
431
  missing_speech_types = speech_types_in_text - speech_types_available
 
437
  # Enable the generate button
438
  return gr.update(interactive=True)
439
 
440
+ gen_text_input_multistyle.change(
441
  validate_speech_types,
442
+ inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
443
+ outputs=generate_multistyle_btn,
444
  )
445
 
446
 
 
578
  remove_silence,
579
  cross_fade_duration=0.15,
580
  speed=1.0,
581
+ show_info=None,
582
  )
583
  return audio_result
584