Spaces:

pyp1
/

VoiceCraft_gradio

Running on A10G

App Files Community

jason-on-salt-a40 committed on Apr 21

Commit

cf33c41

•

1 Parent(s): 93adc07

use updated model and prompt

Browse files

Files changed (2) hide show

app.py +29 -42
demo/YOU1000000115_S0000252.wav +0 -0

app.py CHANGED Viewed

@@ -78,9 +78,14 @@ class WhisperxModel:
 def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
     global transcribe_model, align_model, voicecraft_model
-    if voicecraft_model_name == "giga330M_TTSEnhanced":
         voicecraft_model_name = "gigaHalfLibri330M_TTSEnhanced_max16s"
     if alignment_model_name is not None:
         align_model = WhisperxAlignModel()
@@ -365,50 +370,32 @@ If disabled, you should write the target transcript yourself:</br>
  - In Edit mode write full prompt</br>
 """
-demo_original_transcript = "But when I had approached so near to them, the common object, which the sense deceives, lost not by distance any of its marks."
 demo_text = {
     "TTS": {
         "smart": "I cannot believe that the same model can also do text to speech synthesis too!",
-        "regular": "But when I had approached so near to them, the common I cannot believe that the same model can also do text to speech synthesis too!"
     },
     "Edit": {
-        "smart": "saw the mirage of the lake in the distance,",
-        "regular": "But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,"
     },
     "Long TTS": {
         "smart": "You can run the model on a big text!\n"
                  "Just write it line-by-line. Or sentence-by-sentence.\n"
                  "If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!",
-        "regular": "But when I had approached so near to them, the common You can run the model on a big text!\n"
-                   "But when I had approached so near to them, the common Just write it line-by-line. Or sentence-by-sentence.\n"
-                   "But when I had approached so near to them, the common If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!"
     }
 }
 all_demo_texts = {vv for k, v in demo_text.items() for kk, vv in v.items()}
-demo_words = [
-    '0.029 But 0.149', '0.189 when 0.33', '0.43 I 0.49', '0.53 had 0.65', '0.711 approached 1.152', '1.352 so 1.593',
-    '1.693 near 1.933', '1.994 to 2.074', '2.134 them, 2.354', '2.535 the 2.655', '2.695 common 3.016', '3.196 object, 3.577',
-    '3.717 which 3.898', '3.958 the 4.058', '4.098 sense 4.359', '4.419 deceives, 4.92', '5.101 lost 5.481', '5.682 not 5.963',
-    '6.043 by 6.183', '6.223 distance 6.644', '6.905 any 7.065', '7.125 of 7.185', '7.245 its 7.346', '7.406 marks. 7.727'
-]
-demo_words_info = [
-    {'word': 'But', 'start': 0.029, 'end': 0.149, 'score': 0.834}, {'word': 'when', 'start': 0.189, 'end': 0.33, 'score': 0.879},
-    {'word': 'I', 'start': 0.43, 'end': 0.49, 'score': 0.984}, {'word': 'had', 'start': 0.53, 'end': 0.65, 'score': 0.998},
-    {'word': 'approached', 'start': 0.711, 'end': 1.152, 'score': 0.822}, {'word': 'so', 'start': 1.352, 'end': 1.593, 'score': 0.822},
-    {'word': 'near', 'start': 1.693, 'end': 1.933, 'score': 0.752}, {'word': 'to', 'start': 1.994, 'end': 2.074, 'score': 0.924},
-    {'word': 'them,', 'start': 2.134, 'end': 2.354, 'score': 0.914}, {'word': 'the', 'start': 2.535, 'end': 2.655, 'score': 0.818},
-    {'word': 'common', 'start': 2.695, 'end': 3.016, 'score': 0.971}, {'word': 'object,', 'start': 3.196, 'end': 3.577, 'score': 0.823},
-    {'word': 'which', 'start': 3.717, 'end': 3.898, 'score': 0.701}, {'word': 'the', 'start': 3.958, 'end': 4.058, 'score': 0.798},
-    {'word': 'sense', 'start': 4.098, 'end': 4.359, 'score': 0.797}, {'word': 'deceives,', 'start': 4.419, 'end': 4.92, 'score': 0.802},
-    {'word': 'lost', 'start': 5.101, 'end': 5.481, 'score': 0.71}, {'word': 'not', 'start': 5.682, 'end': 5.963, 'score': 0.781},
-    {'word': 'by', 'start': 6.043, 'end': 6.183, 'score': 0.834}, {'word': 'distance', 'start': 6.223, 'end': 6.644, 'score': 0.899},
-    {'word': 'any', 'start': 6.905, 'end': 7.065, 'score': 0.893}, {'word': 'of', 'start': 7.125, 'end': 7.185, 'score': 0.772},
-    {'word': 'its', 'start': 7.245, 'end': 7.346, 'score': 0.778}, {'word': 'marks.', 'start': 7.406, 'end': 7.727, 'score': 0.955}
-]
 def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word):
@@ -435,19 +422,19 @@ def get_app():
             with gr.Column(scale=5):
                 with gr.Accordion("Select models", open=False) as models_selector:
                     with gr.Row():
-                        voicecraft_model_choice = gr.Radio(label="VoiceCraft model", value="giga830M",
-                                                        choices=["giga330M", "giga830M", "giga330M_TTSEnhanced"])
-                        whisper_backend_choice = gr.Radio(label="Whisper backend", value="whisperX", choices=["whisper", "whisperX"])
                         whisper_model_choice = gr.Radio(label="Whisper model", value="base.en",
                                                         choices=[None, "base.en", "small.en", "medium.en", "large"])
-                        align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=[None, "whisperX"])
         with gr.Row():
             with gr.Column(scale=2):
-                input_audio = gr.Audio(value=f"{DEMO_PATH}/84_121550_000074_000000.wav", label="Input Audio", type="filepath", interactive=True)
                 with gr.Group():
                     original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
-                                                    info="Use whisper model to get the transcript. Fix and align it if necessary.")
                     with gr.Accordion("Word start time", open=False):
                         transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
                     with gr.Accordion("Word end time", open=False):
@@ -472,16 +459,16 @@ def get_app():
                                                 info="What to do with first and last word", visible=False)
                     with gr.Group() as tts_mode_controls:
-                        prompt_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[10], interactive=True)
-                        prompt_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.93, step=0.001, value=3.016)
                     with gr.Group(visible=False) as edit_mode_controls:
                         with gr.Row():
-                            edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[2], interactive=True)
                             edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[12], interactive=True)
                         with gr.Row():
-                            edit_start_time = gr.Slider(label="Edit from time", minimum=0, maximum=7.93, step=0.001, value=0.46)
-                            edit_end_time = gr.Slider(label="Edit to time", minimum=0, maximum=7.93, step=0.001, value=3.808)
                     run_btn = gr.Button(value="Run")
@@ -498,7 +485,7 @@ def get_app():
         with gr.Row():
             with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
-                stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=3,
                                         info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
                 sample_batch_size = gr.Number(label="speech rate", value=4, precision=0,
                                             info="The higher the number, the faster the output will be. "

 def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
     global transcribe_model, align_model, voicecraft_model
+    if voicecraft_model_name == "330M":
+        voicecraft_model_name = "giga330M"
+    elif voicecraft_model_name == "830M":
+        voicecraft_model_name = "giga830M"
+    elif voicecraft_model_name == "330M_TTSEnhanced":
         voicecraft_model_name = "gigaHalfLibri330M_TTSEnhanced_max16s"
+    elif voicecraft_model_name == "830M_TTSEnhanced":
+        voicecraft_model_name = "830M_TTSEnhanced"
     if alignment_model_name is not None:
         align_model = WhisperxAlignModel()
  - In Edit mode write full prompt</br>
 """
+demo_original_transcript = "And again in two thousand and eight when the United States Central Bank, the Federal Reserve, printed over two trillion dollars."
 demo_text = {
     "TTS": {
         "smart": "I cannot believe that the same model can also do text to speech synthesis too!",
+        "regular": "And again in two thousand and eight when the united states central bank, I cannot believe that the same model can also do text to speech synthesis too!"
     },
     "Edit": {
+        "smart": "Central Bank of the United States, also called",
+        "regular": "And again in two thousand and eight when the Central Bank of the United States, also called the Federal Reserve, printed over two trillion dollars."
     },
     "Long TTS": {
         "smart": "You can run the model on a big text!\n"
                  "Just write it line-by-line. Or sentence-by-sentence.\n"
                  "If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!",
+        "regular": "And again in two thousand and eight when the united states central bank, You can run the model on a big text!\n"
+                   "And again in two thousand and eight when the united states central bank, Just write it line-by-line. Or sentence-by-sentence.\n"
+                   "And again in two thousand and eight when the united states central bank, If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!"
     }
 }
 all_demo_texts = {vv for k, v in demo_text.items() for kk, vv in v.items()}
+demo_words = ['0.12 And 0.221', '0.261 again 0.561', '0.622 in 0.682', '0.742 two 0.922', '0.983 thousand 1.464', '1.504 and 1.584', '1.684 eight 1.865', '1.945 when 2.085', '2.125 the 2.206', '2.266 United 2.667', '2.707 States 2.968', '3.008 Central 3.349', '3.389 Bank, 3.649', '3.83 the 3.93', '4.01 Federal 4.451', '4.532 Reserve, 5.113', '5.314 printed 5.674', '5.835 over 6.035', '6.176 two 6.517', '6.637 trillion 7.098', '7.118 dollars. 7.479']
+demo_words_info = [{'word': 'And', 'start': 0.12, 'end': 0.221, 'score': 0.792}, {'word': 'again', 'start': 0.261, 'end': 0.561, 'score': 0.795}, {'word': 'in', 'start': 0.622, 'end': 0.682, 'score': 0.75}, {'word': 'two', 'start': 0.742, 'end': 0.922, 'score': 0.755}, {'word': 'thousand', 'start': 0.983, 'end': 1.464, 'score': 0.82}, {'word': 'and', 'start': 1.504, 'end': 1.584, 'score': 0.715}, {'word': 'eight', 'start': 1.684, 'end': 1.865, 'score': 0.885}, {'word': 'when', 'start': 1.945, 'end': 2.085, 'score': 0.987}, {'word': 'the', 'start': 2.125, 'end': 2.206, 'score': 0.833}, {'word': 'United', 'start': 2.266, 'end': 2.667, 'score': 0.818}, {'word': 'States', 'start': 2.707, 'end': 2.968, 'score': 0.842}, {'word': 'Central', 'start': 3.008, 'end': 3.349, 'score': 0.852}, {'word': 'Bank,', 'start': 3.389, 'end': 3.649, 'score': 0.98}, {'word': 'the', 'start': 3.83, 'end': 3.93, 'score': 0.996}, {'word': 'Federal', 'start': 4.01, 'end': 4.451, 'score': 0.795}, {'word': 'Reserve,', 'start': 4.532, 'end': 5.113, 'score': 0.852}, {'word': 'printed', 'start': 5.314, 'end': 5.674, 'score': 0.785}, {'word': 'over', 'start': 5.835, 'end': 6.035, 'score': 0.84}, {'word': 'two', 'start': 6.176, 'end': 6.517, 'score': 0.757}, {'word': 'trillion', 'start': 6.637, 'end': 7.098, 'score': 0.796}, {'word': 'dollars.', 'start': 7.118, 'end': 7.479, 'score': 0.939}]
 def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word):
             with gr.Column(scale=5):
                 with gr.Accordion("Select models", open=False) as models_selector:
                     with gr.Row():
+                        voicecraft_model_choice = gr.Radio(label="VoiceCraft model", value="830M_TTSEnhanced",
+                                                        choices=["330M", "830M", "330M_TTSEnhanced", "830M_TTSEnhanced"])
+                        whisper_backend_choice = gr.Radio(label="Whisper backend", value="whisperX", choices=["whisperX", "whisper"])
                         whisper_model_choice = gr.Radio(label="Whisper model", value="base.en",
                                                         choices=[None, "base.en", "small.en", "medium.en", "large"])
+                        align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
         with gr.Row():
             with gr.Column(scale=2):
+                input_audio = gr.Audio(value=f"{DEMO_PATH}/YOU1000000115_S0000252.wav", label="Input Audio", type="filepath", interactive=True)
                 with gr.Group():
                     original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
+                                                    info="Use whisperx model to get the transcript. Fix and align it if necessary.")
                     with gr.Accordion("Word start time", open=False):
                         transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
                     with gr.Accordion("Word end time", open=False):
                                                 info="What to do with first and last word", visible=False)
                     with gr.Group() as tts_mode_controls:
+                        prompt_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[12], interactive=True)
+                        prompt_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.479, step=0.001, value=3.700)
                     with gr.Group(visible=False) as edit_mode_controls:
                         with gr.Row():
+                            edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[9], interactive=True)
                             edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[12], interactive=True)
                         with gr.Row():
+                            edit_start_time = gr.Slider(label="Edit from time", minimum=0, maximum=7.479, step=0.001, value=2.266)
+                            edit_end_time = gr.Slider(label="Edit to time", minimum=0, maximum=7.479, step=0.001, value=3.649)
                     run_btn = gr.Button(value="Run")
         with gr.Row():
             with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
+                stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=2,
                                         info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
                 sample_batch_size = gr.Number(label="speech rate", value=4, precision=0,
                                             info="The higher the number, the faster the output will be. "

demo/YOU1000000115_S0000252.wav ADDED Viewed

Binary file (252 kB). View file