whispy committed on
Commit
803b60d
1 Parent(s): 3da8fbb

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -9
app.py CHANGED
@@ -27,18 +27,22 @@ translator_ppl = pipeline(
27
  # model producing an image from text
28
  image_ppl = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=YOUR_TOKEN)
29
 
30
- def transcribe(microphone, file_upload):
 
31
  warn_output = ""
32
- if (microphone is not None) and (file_upload is not None):
 
33
  warn_output = (
34
  "WARNING: You've uploaded an audio file and used the microphone. "
35
  "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
36
  )
37
 
38
- elif (microphone is None) and (file_upload is None):
 
39
  return "ERROR: You have to either use the microphone or upload an audio file"
40
 
41
- file = microphone if microphone is not None else file_upload
 
42
 
43
  text = speech_ppl(file)["text"]
44
  print("Text: ", text)
@@ -48,7 +52,7 @@ def transcribe(microphone, file_upload):
48
  print("Translate 2: ", translate)
49
  print("Building image .....")
50
  #image = image_ppl(translate).images[0]
51
- image = image_ppl(translate)["sample"]
52
  print("Image: ", image)
53
  image.save("text-to-image.png")
54
 
@@ -86,14 +90,14 @@ mf_transcribe = gr.Interface(
86
  fn=transcribe,
87
  inputs=[
88
  gr.inputs.Audio(source="microphone", type="filepath", optional=True),
89
- gr.inputs.Audio(source="upload", type="filepath", optional=True),
90
  ],
91
  outputs=[gr.Textbox(label="Transcribed text"),
92
  gr.Textbox(label="Summarized text"),
93
  gr.Image(type="pil", label="Output image")],
94
  layout="horizontal",
95
  theme="huggingface",
96
- title="Whisper Demo: Transcribe Audio",
97
  description=(
98
  "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned"
99
  f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
@@ -101,7 +105,7 @@ mf_transcribe = gr.Interface(
101
  ),
102
  allow_flagging="never",
103
  )
104
-
105
  yt_transcribe = gr.Interface(
106
  fn=yt_transcribe,
107
  inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
@@ -116,8 +120,10 @@ yt_transcribe = gr.Interface(
116
  ),
117
  allow_flagging="never",
118
  )
 
119
 
120
  with demo:
121
- gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
 
122
 
123
  demo.launch(enable_queue=True)
 
27
  # model producing an image from text
28
  image_ppl = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=YOUR_TOKEN)
29
 
30
+ #def transcribe(microphone, file_upload):
31
+ def transcribe(microphone):
32
  warn_output = ""
33
+ # if (microphone is not None) and (file_upload is not None):
34
+ if (microphone is not None):
35
  warn_output = (
36
  "WARNING: You've uploaded an audio file and used the microphone. "
37
  "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
38
  )
39
 
40
+ # elif (microphone is None) and (file_upload is None):
41
+ elif (microphone is None):
42
  return "ERROR: You have to either use the microphone or upload an audio file"
43
 
44
+ # file = microphone if microphone is not None else file_upload
45
+ file = microphone
46
 
47
  text = speech_ppl(file)["text"]
48
  print("Text: ", text)
 
52
  print("Translate 2: ", translate)
53
  print("Building image .....")
54
  #image = image_ppl(translate).images[0]
55
+ image = image_ppl(translate, num_inference_steps=15)["sample"]
56
  print("Image: ", image)
57
  image.save("text-to-image.png")
58
 
 
90
  fn=transcribe,
91
  inputs=[
92
  gr.inputs.Audio(source="microphone", type="filepath", optional=True),
93
+ #gr.inputs.Audio(source="upload", type="filepath", optional=True),
94
  ],
95
  outputs=[gr.Textbox(label="Transcribed text"),
96
  gr.Textbox(label="Summarized text"),
97
  gr.Image(type="pil", label="Output image")],
98
  layout="horizontal",
99
  theme="huggingface",
100
+ title="Whisper Demo: Transcribe Audio to Image",
101
  description=(
102
  "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned"
103
  f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
 
105
  ),
106
  allow_flagging="never",
107
  )
108
+ '''
109
  yt_transcribe = gr.Interface(
110
  fn=yt_transcribe,
111
  inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
 
120
  ),
121
  allow_flagging="never",
122
  )
123
+ '''
124
 
125
  with demo:
126
+ #gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
127
+ gr.TabbedInterface(mf_transcribe, "Transcribe Audio to Image")
128
 
129
  demo.launch(enable_queue=True)