jason-on-salt-a40 committed on
Commit
cf33c41
1 Parent(s): 93adc07

use updated model and prompt

Browse files
Files changed (2) hide show
  1. app.py +29 -42
  2. demo/YOU1000000115_S0000252.wav +0 -0
app.py CHANGED
@@ -78,9 +78,14 @@ class WhisperxModel:
78
  def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
79
  global transcribe_model, align_model, voicecraft_model
80
 
81
- if voicecraft_model_name == "giga330M_TTSEnhanced":
 
 
 
 
82
  voicecraft_model_name = "gigaHalfLibri330M_TTSEnhanced_max16s"
83
-
 
84
  if alignment_model_name is not None:
85
  align_model = WhisperxAlignModel()
86
 
@@ -365,50 +370,32 @@ If disabled, you should write the target transcript yourself:</br>
365
  - In Edit mode write full prompt</br>
366
  """
367
 
368
- demo_original_transcript = "But when I had approached so near to them, the common object, which the sense deceives, lost not by distance any of its marks."
369
 
370
  demo_text = {
371
  "TTS": {
372
  "smart": "I cannot believe that the same model can also do text to speech synthesis too!",
373
- "regular": "But when I had approached so near to them, the common I cannot believe that the same model can also do text to speech synthesis too!"
374
  },
375
  "Edit": {
376
- "smart": "saw the mirage of the lake in the distance,",
377
- "regular": "But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,"
378
  },
379
  "Long TTS": {
380
  "smart": "You can run the model on a big text!\n"
381
  "Just write it line-by-line. Or sentence-by-sentence.\n"
382
  "If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!",
383
- "regular": "But when I had approached so near to them, the common You can run the model on a big text!\n"
384
- "But when I had approached so near to them, the common Just write it line-by-line. Or sentence-by-sentence.\n"
385
- "But when I had approached so near to them, the common If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!"
386
  }
387
  }
388
 
389
  all_demo_texts = {vv for k, v in demo_text.items() for kk, vv in v.items()}
390
 
391
- demo_words = [
392
- '0.029 But 0.149', '0.189 when 0.33', '0.43 I 0.49', '0.53 had 0.65', '0.711 approached 1.152', '1.352 so 1.593',
393
- '1.693 near 1.933', '1.994 to 2.074', '2.134 them, 2.354', '2.535 the 2.655', '2.695 common 3.016', '3.196 object, 3.577',
394
- '3.717 which 3.898', '3.958 the 4.058', '4.098 sense 4.359', '4.419 deceives, 4.92', '5.101 lost 5.481', '5.682 not 5.963',
395
- '6.043 by 6.183', '6.223 distance 6.644', '6.905 any 7.065', '7.125 of 7.185', '7.245 its 7.346', '7.406 marks. 7.727'
396
- ]
397
-
398
- demo_words_info = [
399
- {'word': 'But', 'start': 0.029, 'end': 0.149, 'score': 0.834}, {'word': 'when', 'start': 0.189, 'end': 0.33, 'score': 0.879},
400
- {'word': 'I', 'start': 0.43, 'end': 0.49, 'score': 0.984}, {'word': 'had', 'start': 0.53, 'end': 0.65, 'score': 0.998},
401
- {'word': 'approached', 'start': 0.711, 'end': 1.152, 'score': 0.822}, {'word': 'so', 'start': 1.352, 'end': 1.593, 'score': 0.822},
402
- {'word': 'near', 'start': 1.693, 'end': 1.933, 'score': 0.752}, {'word': 'to', 'start': 1.994, 'end': 2.074, 'score': 0.924},
403
- {'word': 'them,', 'start': 2.134, 'end': 2.354, 'score': 0.914}, {'word': 'the', 'start': 2.535, 'end': 2.655, 'score': 0.818},
404
- {'word': 'common', 'start': 2.695, 'end': 3.016, 'score': 0.971}, {'word': 'object,', 'start': 3.196, 'end': 3.577, 'score': 0.823},
405
- {'word': 'which', 'start': 3.717, 'end': 3.898, 'score': 0.701}, {'word': 'the', 'start': 3.958, 'end': 4.058, 'score': 0.798},
406
- {'word': 'sense', 'start': 4.098, 'end': 4.359, 'score': 0.797}, {'word': 'deceives,', 'start': 4.419, 'end': 4.92, 'score': 0.802},
407
- {'word': 'lost', 'start': 5.101, 'end': 5.481, 'score': 0.71}, {'word': 'not', 'start': 5.682, 'end': 5.963, 'score': 0.781},
408
- {'word': 'by', 'start': 6.043, 'end': 6.183, 'score': 0.834}, {'word': 'distance', 'start': 6.223, 'end': 6.644, 'score': 0.899},
409
- {'word': 'any', 'start': 6.905, 'end': 7.065, 'score': 0.893}, {'word': 'of', 'start': 7.125, 'end': 7.185, 'score': 0.772},
410
- {'word': 'its', 'start': 7.245, 'end': 7.346, 'score': 0.778}, {'word': 'marks.', 'start': 7.406, 'end': 7.727, 'score': 0.955}
411
- ]
412
 
413
 
414
  def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word):
@@ -435,19 +422,19 @@ def get_app():
435
  with gr.Column(scale=5):
436
  with gr.Accordion("Select models", open=False) as models_selector:
437
  with gr.Row():
438
- voicecraft_model_choice = gr.Radio(label="VoiceCraft model", value="giga830M",
439
- choices=["giga330M", "giga830M", "giga330M_TTSEnhanced"])
440
- whisper_backend_choice = gr.Radio(label="Whisper backend", value="whisperX", choices=["whisper", "whisperX"])
441
  whisper_model_choice = gr.Radio(label="Whisper model", value="base.en",
442
  choices=[None, "base.en", "small.en", "medium.en", "large"])
443
- align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=[None, "whisperX"])
444
 
445
  with gr.Row():
446
  with gr.Column(scale=2):
447
- input_audio = gr.Audio(value=f"{DEMO_PATH}/84_121550_000074_000000.wav", label="Input Audio", type="filepath", interactive=True)
448
  with gr.Group():
449
  original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
450
- info="Use whisper model to get the transcript. Fix and align it if necessary.")
451
  with gr.Accordion("Word start time", open=False):
452
  transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
453
  with gr.Accordion("Word end time", open=False):
@@ -472,16 +459,16 @@ def get_app():
472
  info="What to do with first and last word", visible=False)
473
 
474
  with gr.Group() as tts_mode_controls:
475
- prompt_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[10], interactive=True)
476
- prompt_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.93, step=0.001, value=3.016)
477
 
478
  with gr.Group(visible=False) as edit_mode_controls:
479
  with gr.Row():
480
- edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[2], interactive=True)
481
  edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[12], interactive=True)
482
  with gr.Row():
483
- edit_start_time = gr.Slider(label="Edit from time", minimum=0, maximum=7.93, step=0.001, value=0.46)
484
- edit_end_time = gr.Slider(label="Edit to time", minimum=0, maximum=7.93, step=0.001, value=3.808)
485
 
486
  run_btn = gr.Button(value="Run")
487
 
@@ -498,7 +485,7 @@ def get_app():
498
 
499
  with gr.Row():
500
  with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
501
- stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=3,
502
  info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
503
  sample_batch_size = gr.Number(label="speech rate", value=4, precision=0,
504
  info="The higher the number, the faster the output will be. "
 
78
  def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
79
  global transcribe_model, align_model, voicecraft_model
80
 
81
+ if voicecraft_model_name == "330M":
82
+ voicecraft_model_name = "giga330M"
83
+ elif voicecraft_model_name == "830M":
84
+ voicecraft_model_name = "giga830M"
85
+ elif voicecraft_model_name == "330M_TTSEnhanced":
86
  voicecraft_model_name = "gigaHalfLibri330M_TTSEnhanced_max16s"
87
+ elif voicecraft_model_name == "830M_TTSEnhanced":
88
+ voicecraft_model_name = "830M_TTSEnhanced"
89
  if alignment_model_name is not None:
90
  align_model = WhisperxAlignModel()
91
 
 
370
  - In Edit mode write full prompt</br>
371
  """
372
 
373
+ demo_original_transcript = "And again in two thousand and eight when the United States Central Bank, the Federal Reserve, printed over two trillion dollars."
374
 
375
  demo_text = {
376
  "TTS": {
377
  "smart": "I cannot believe that the same model can also do text to speech synthesis too!",
378
+ "regular": "And again in two thousand and eight when the united states central bank, I cannot believe that the same model can also do text to speech synthesis too!"
379
  },
380
  "Edit": {
381
+ "smart": "Central Bank of the United States, also called",
382
+ "regular": "And again in two thousand and eight when the Central Bank of the United States, also called the Federal Reserve, printed over two trillion dollars."
383
  },
384
  "Long TTS": {
385
  "smart": "You can run the model on a big text!\n"
386
  "Just write it line-by-line. Or sentence-by-sentence.\n"
387
  "If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!",
388
+ "regular": "And again in two thousand and eight when the united states central bank, You can run the model on a big text!\n"
389
+ "And again in two thousand and eight when the united states central bank, Just write it line-by-line. Or sentence-by-sentence.\n"
390
+ "And again in two thousand and eight when the united states central bank, If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!"
391
  }
392
  }
393
 
394
  all_demo_texts = {vv for k, v in demo_text.items() for kk, vv in v.items()}
395
 
396
+ demo_words = ['0.12 And 0.221', '0.261 again 0.561', '0.622 in 0.682', '0.742 two 0.922', '0.983 thousand 1.464', '1.504 and 1.584', '1.684 eight 1.865', '1.945 when 2.085', '2.125 the 2.206', '2.266 United 2.667', '2.707 States 2.968', '3.008 Central 3.349', '3.389 Bank, 3.649', '3.83 the 3.93', '4.01 Federal 4.451', '4.532 Reserve, 5.113', '5.314 printed 5.674', '5.835 over 6.035', '6.176 two 6.517', '6.637 trillion 7.098', '7.118 dollars. 7.479']
397
+
398
+ demo_words_info = [{'word': 'And', 'start': 0.12, 'end': 0.221, 'score': 0.792}, {'word': 'again', 'start': 0.261, 'end': 0.561, 'score': 0.795}, {'word': 'in', 'start': 0.622, 'end': 0.682, 'score': 0.75}, {'word': 'two', 'start': 0.742, 'end': 0.922, 'score': 0.755}, {'word': 'thousand', 'start': 0.983, 'end': 1.464, 'score': 0.82}, {'word': 'and', 'start': 1.504, 'end': 1.584, 'score': 0.715}, {'word': 'eight', 'start': 1.684, 'end': 1.865, 'score': 0.885}, {'word': 'when', 'start': 1.945, 'end': 2.085, 'score': 0.987}, {'word': 'the', 'start': 2.125, 'end': 2.206, 'score': 0.833}, {'word': 'United', 'start': 2.266, 'end': 2.667, 'score': 0.818}, {'word': 'States', 'start': 2.707, 'end': 2.968, 'score': 0.842}, {'word': 'Central', 'start': 3.008, 'end': 3.349, 'score': 0.852}, {'word': 'Bank,', 'start': 3.389, 'end': 3.649, 'score': 0.98}, {'word': 'the', 'start': 3.83, 'end': 3.93, 'score': 0.996}, {'word': 'Federal', 'start': 4.01, 'end': 4.451, 'score': 0.795}, {'word': 'Reserve,', 'start': 4.532, 'end': 5.113, 'score': 0.852}, {'word': 'printed', 'start': 5.314, 'end': 5.674, 'score': 0.785}, {'word': 'over', 'start': 5.835, 'end': 6.035, 'score': 0.84}, {'word': 'two', 'start': 6.176, 'end': 6.517, 'score': 0.757}, {'word': 'trillion', 'start': 6.637, 'end': 7.098, 'score': 0.796}, {'word': 'dollars.', 'start': 7.118, 'end': 7.479, 'score': 0.939}]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
 
400
 
401
  def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word):
 
422
  with gr.Column(scale=5):
423
  with gr.Accordion("Select models", open=False) as models_selector:
424
  with gr.Row():
425
+ voicecraft_model_choice = gr.Radio(label="VoiceCraft model", value="830M_TTSEnhanced",
426
+ choices=["330M", "830M", "330M_TTSEnhanced", "830M_TTSEnhanced"])
427
+ whisper_backend_choice = gr.Radio(label="Whisper backend", value="whisperX", choices=["whisperX", "whisper"])
428
  whisper_model_choice = gr.Radio(label="Whisper model", value="base.en",
429
  choices=[None, "base.en", "small.en", "medium.en", "large"])
430
+ align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
431
 
432
  with gr.Row():
433
  with gr.Column(scale=2):
434
+ input_audio = gr.Audio(value=f"{DEMO_PATH}/YOU1000000115_S0000252.wav", label="Input Audio", type="filepath", interactive=True)
435
  with gr.Group():
436
  original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
437
+ info="Use whisperx model to get the transcript. Fix and align it if necessary.")
438
  with gr.Accordion("Word start time", open=False):
439
  transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
440
  with gr.Accordion("Word end time", open=False):
 
459
  info="What to do with first and last word", visible=False)
460
 
461
  with gr.Group() as tts_mode_controls:
462
+ prompt_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[12], interactive=True)
463
+ prompt_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.479, step=0.001, value=3.700)
464
 
465
  with gr.Group(visible=False) as edit_mode_controls:
466
  with gr.Row():
467
+ edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[9], interactive=True)
468
  edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[12], interactive=True)
469
  with gr.Row():
470
+ edit_start_time = gr.Slider(label="Edit from time", minimum=0, maximum=7.479, step=0.001, value=2.266)
471
+ edit_end_time = gr.Slider(label="Edit to time", minimum=0, maximum=7.479, step=0.001, value=3.649)
472
 
473
  run_btn = gr.Button(value="Run")
474
 
 
485
 
486
  with gr.Row():
487
  with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
488
+ stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
489
  info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
490
  sample_batch_size = gr.Number(label="speech rate", value=4, precision=0,
491
  info="The higher the number, the faster the output will be. "
demo/YOU1000000115_S0000252.wav ADDED
Binary file (252 kB). View file