whispy committed on
Commit
1cd9790
1 Parent(s): 1e4fa1e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -20
app.py CHANGED
@@ -6,23 +6,26 @@ from diffusers import StableDiffusionPipeline
6
 
7
 
8
  MODEL_NAME = "whispy/whisper_italian"
9
-
10
- summarizer = pipeline(
11
- "summarization",
12
- model="it5/it5-efficient-small-el32-news-summarization",
13
- )
14
-
15
- pipe = pipeline(
16
  task="automatic-speech-recognition",
17
  model=MODEL_NAME,
18
  chunk_length_s=30,
19
- device="cpu",
20
- )
21
-
22
- YOUR_TOKEN="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"
23
- image_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=YOUR_TOKEN)
24
-
25
- translator = pipeline("translation", model="Helsinki-NLP/opus-mt-it-en")
 
 
 
 
 
 
 
26
 
27
  def transcribe(microphone, file_upload):
28
  warn_output = ""
@@ -37,12 +40,15 @@ def transcribe(microphone, file_upload):
37
 
38
  file = microphone if microphone is not None else file_upload
39
 
40
- text = pipe(file)["text"]
41
-
42
- translate = translator(text)
 
43
  translate = translate[0]["translation_text"]
44
-
45
- image = image_pipe(translate)["sample"][0]
 
 
46
 
47
  return warn_output + text, translate, image
48
 
@@ -80,7 +86,9 @@ mf_transcribe = gr.Interface(
80
  gr.inputs.Audio(source="microphone", type="filepath", optional=True),
81
  gr.inputs.Audio(source="upload", type="filepath", optional=True),
82
  ],
83
- outputs=["text", "text", "image"],
 
 
84
  layout="horizontal",
85
  theme="huggingface",
86
  title="Whisper Demo: Transcribe Audio",
 
6
 
7
 
8
  MODEL_NAME = "whispy/whisper_italian"
9
+ YOUR_TOKEN="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"
10
+ # Whisper model fine-tuned for Italian
11
+ speech_ppl = pipeline(
 
 
 
 
12
  task="automatic-speech-recognition",
13
  model=MODEL_NAME,
14
  chunk_length_s=30,
15
+ device="cpu"
16
+ )
17
+ # model summarizing text
18
+ summarizer_ppl = pipeline(
19
+ "summarization",
20
+ model="it5/it5-efficient-small-el32-news-summarization"
21
+ )
22
+ # model translating text from Italian to English
23
+ translator_ppl = pipeline(
24
+ "translation",
25
+ model="Helsinki-NLP/opus-mt-it-en"
26
+ )
27
+ # model producing an image from text
28
+ image_ppl = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=YOUR_TOKEN)
29
 
30
  def transcribe(microphone, file_upload):
31
  warn_output = ""
 
40
 
41
  file = microphone if microphone is not None else file_upload
42
 
43
+ text = speech_ppl(file)["text"]
44
+ print("Text: ", text)
45
+ translate = translator_ppl(text)
46
+ print("Translate: ", translate)
47
  translate = translate[0]["translation_text"]
48
+ print("Translate 2: ", translate)
49
+ image = image_ppl(translate).images[0]
50
+ print("Image: ", image)
51
+ image.save("text-to-image.png")
52
 
53
  return warn_output + text, translate, image
54
 
 
86
  gr.inputs.Audio(source="microphone", type="filepath", optional=True),
87
  gr.inputs.Audio(source="upload", type="filepath", optional=True),
88
  ],
89
+ outputs=[gr.Textbox(label="Transcribed text"),
90
+ gr.Textbox(label="Summarized text"),
91
+ gr.Image(type="pil", label="Output image")],
92
  layout="horizontal",
93
  theme="huggingface",
94
  title="Whisper Demo: Transcribe Audio",