Spaces:

whispy
/

Whisper-Image

Runtime error

App Files Community

whispy committed on Dec 7, 2022

Commit

803b60d

•

1 Parent(s): 3da8fbb

Upload app.py

Browse files

Files changed (1) hide show

app.py +15 -9

app.py CHANGED Viewed

@@ -27,18 +27,22 @@ translator_ppl = pipeline(
 # model producing an image from text
 image_ppl = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=YOUR_TOKEN)
-def transcribe(microphone, file_upload):
     warn_output = ""
-    if (microphone is not None) and (file_upload is not None):
         warn_output = (
             "WARNING: You've uploaded an audio file and used the microphone. "
             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
         )
-    elif (microphone is None) and (file_upload is None):
         return "ERROR: You have to either use the microphone or upload an audio file"
-    file = microphone if microphone is not None else file_upload
     text = speech_ppl(file)["text"]
     print("Text: ", text)
@@ -48,7 +52,7 @@ def transcribe(microphone, file_upload):
     print("Translate 2: ", translate)
     print("Building image .....")
     #image = image_ppl(translate).images[0]
-    image = image_ppl(translate)["sample"]
     print("Image: ", image)
     image.save("text-to-image.png")
@@ -86,14 +90,14 @@ mf_transcribe = gr.Interface(
     fn=transcribe,
     inputs=[
         gr.inputs.Audio(source="microphone", type="filepath", optional=True),
-        gr.inputs.Audio(source="upload", type="filepath", optional=True),
     ],
     outputs=[gr.Textbox(label="Transcribed text"),
              gr.Textbox(label="Summarized text"),
              gr.Image(type="pil", label="Output image")],
     layout="horizontal",
     theme="huggingface",
-    title="Whisper Demo: Transcribe Audio",
     description=(
         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
@@ -101,7 +105,7 @@ mf_transcribe = gr.Interface(
     ),
     allow_flagging="never",
 )
 yt_transcribe = gr.Interface(
     fn=yt_transcribe,
     inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
@@ -116,8 +120,10 @@ yt_transcribe = gr.Interface(
     ),
     allow_flagging="never",
 )
 with demo:
-    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
 demo.launch(enable_queue=True)

 # model producing an image from text
 image_ppl = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=YOUR_TOKEN)
+#def transcribe(microphone, file_upload):
+def transcribe(microphone):
     warn_output = ""
+#    if (microphone is not None) and (file_upload is not None):
+    if (microphone is not None):
         warn_output = (
             "WARNING: You've uploaded an audio file and used the microphone. "
             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
         )
+#    elif (microphone is None) and (file_upload is None):
+    elif (microphone is None):
         return "ERROR: You have to either use the microphone or upload an audio file"
+#    file = microphone if microphone is not None else file_upload
+    file = microphone
     text = speech_ppl(file)["text"]
     print("Text: ", text)
     print("Translate 2: ", translate)
     print("Building image .....")
     #image = image_ppl(translate).images[0]
+    image = image_ppl(translate, num_inference_steps=15)["sample"]
     print("Image: ", image)
     image.save("text-to-image.png")
     fn=transcribe,
     inputs=[
         gr.inputs.Audio(source="microphone", type="filepath", optional=True),
+        #gr.inputs.Audio(source="upload", type="filepath", optional=True),
     ],
     outputs=[gr.Textbox(label="Transcribed text"),
              gr.Textbox(label="Summarized text"),
              gr.Image(type="pil", label="Output image")],
     layout="horizontal",
     theme="huggingface",
+    title="Whisper Demo: Transcribe Audio to Image",
     description=(
         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
     ),
     allow_flagging="never",
 )
+'''
 yt_transcribe = gr.Interface(
     fn=yt_transcribe,
     inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
     ),
     allow_flagging="never",
 )
+'''
 with demo:
+    #gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
+    gr.TabbedInterface(mf_transcribe, "Transcribe Audio to Image")
 demo.launch(enable_queue=True)