
Afrinetwork7 committed
Commit 09ab406
Parent: 384d281

Update app.py

Files changed (1):
  app.py +55 -93
app.py CHANGED
@@ -1,95 +1,57 @@
-import gradio as gr
-import librosa
-from asr import transcribe, ASR_EXAMPLES, ASR_LANGUAGES, ASR_NOTE
-from tts import synthesize, TTS_EXAMPLES, TTS_LANGUAGES
-from lid import identify, LID_EXAMPLES
-
-
-
-mms_transcribe = gr.Interface(
-    fn=transcribe,
-    inputs=[
-        gr.Audio(),
-        gr.Dropdown(
-            [f"{k} ({v})" for k, v in ASR_LANGUAGES.items()],
-            label="Language",
-            value="eng English",
-        ),
-        # gr.Checkbox(label="Use Language Model (if available)", default=True),
-    ],
-    outputs="text",
-    examples=ASR_EXAMPLES,
-    title="Speech-to-text",
-    description=(
-        "Transcribe audio from a microphone or input file in your desired language."
-    ),
-    article=ASR_NOTE,
-    allow_flagging="never",
-)
-
-mms_synthesize = gr.Interface(
-    fn=synthesize,
-    inputs=[
-        gr.Text(label="Input text"),
-        gr.Dropdown(
-            [f"{k} ({v})" for k, v in TTS_LANGUAGES.items()],
-            label="Language",
-            value="eng English",
-        ),
-        gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Speed"),
-    ],
-    outputs=[
-        gr.Audio(label="Generated Audio", type="numpy"),
-        gr.Text(label="Filtered text after removing OOVs"),
-    ],
-    examples=TTS_EXAMPLES,
-    title="Text-to-speech",
-    description=("Generate audio in your desired language from input text."),
-    allow_flagging="never",
-)
-
-mms_identify = gr.Interface(
-    fn=identify,
-    inputs=[
-        gr.Audio(),
-    ],
-    outputs=gr.Label(num_top_classes=10),
-    examples=LID_EXAMPLES,
-    title="Language Identification",
-    description=("Identity the language of input audio."),
-    allow_flagging="never",
-)
-
-tabbed_interface = gr.TabbedInterface(
-    [mms_transcribe, mms_synthesize, mms_identify],
-    ["Speech-to-text", "Text-to-speech", "Language Identification"],
-)
-
-with gr.Blocks() as demo:
-    gr.Markdown(
-        "<p align='center' style='font-size: 20px;'>MMS: Scaling Speech Technology to 1000+ languages demo. See our <a href='https://ai.facebook.com/blog/multilingual-model-speech-recognition/'>blog post</a> and <a href='https://arxiv.org/abs/2305.13516'>paper</a>.</p>"
-    )
-    gr.HTML(
-        """<center>Click on the appropriate tab to explore Speech-to-text (ASR), Text-to-speech (TTS) and Language identification (LID) demos. </center>"""
-    )
-    gr.HTML(
-        """<center>You can also finetune MMS models on your data using the recipes provides here - <a href='https://huggingface.co/blog/mms_adapters'>ASR</a> <a href='https://github.com/ylacombe/finetune-hf-vits'>TTS</a> </center>"""
-    )
-    gr.HTML(
-        """<center><a href="https://huggingface.co/spaces/facebook/MMS?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"><img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for more control and no queue.</center>"""
-    )
-
-    tabbed_interface.render()
-    gr.HTML(
-        """
-        <div class="footer" style="text-align:center">
-            <p>
-                Model by <a href="https://ai.facebook.com" style="text-decoration: underline;" target="_blank">Meta AI</a> - Gradio Demo by 🤗 Hugging Face
-            </p>
-        </div>
-        """
     )
 
-if __name__ == "__main__":
-    demo.queue()
-    demo.launch()
+from fastapi import FastAPI, UploadFile, File, Form
+from fastapi.responses import JSONResponse, StreamingResponse
+import uvicorn
+from pydantic import BaseModel
+import numpy as np
+import io
+import soundfile as sf
+
+from asr import transcribe, ASR_LANGUAGES
+from tts import synthesize, TTS_LANGUAGES
+from lid import identify
+
+app = FastAPI(title="MMS: Scaling Speech Technology to 1000+ languages")
+
+class TTSRequest(BaseModel):
+    text: str
+    language: str
+    speed: float
+
+@app.post("/transcribe")
+async def transcribe_audio(audio: UploadFile = File(...), language: str = Form(...)):
+    contents = await audio.read()
+    audio_array, sample_rate = sf.read(io.BytesIO(contents))
+
+    result = transcribe(audio_array, language)
+    return JSONResponse(content={"transcription": result})
+
+@app.post("/synthesize")
+async def synthesize_speech(request: TTSRequest):
+    audio, filtered_text = synthesize(request.text, request.language, request.speed)
+
+    # Convert numpy array to in-memory WAV bytes
+    buffer = io.BytesIO()
+    sf.write(buffer, audio, 22050, format='wav')
+    buffer.seek(0)
+
+    # FileResponse expects a filesystem path; stream the in-memory buffer instead
+    return StreamingResponse(
+        buffer,
+        media_type="audio/wav",
+        headers={"Content-Disposition": "attachment; filename=synthesized_audio.wav"}
     )
 
+@app.post("/identify")
+async def identify_language(audio: UploadFile = File(...)):
+    contents = await audio.read()
+    audio_array, sample_rate = sf.read(io.BytesIO(contents))
+
+    result = identify(audio_array)
+    return JSONResponse(content={"language_identification": result})
+
+@app.get("/asr_languages")
+async def get_asr_languages():
+    return JSONResponse(content=ASR_LANGUAGES)
+
+@app.get("/tts_languages")
+async def get_tts_languages():
+    return JSONResponse(content=TTS_LANGUAGES)
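Note that the new file imports uvicorn but, as committed, never starts a server. A minimal entrypoint sketch, not part of this commit, assuming the app is launched directly; the host and port are assumptions (7860 is the usual Hugging Face Spaces default):

# Hypothetical entrypoint (not in this commit): serve the FastAPI app with uvicorn.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)  # assumed host/port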
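For reference, a minimal client sketch against the new endpoints. The paths and field names (audio, language, text, speed) come from the diff above; the base URL, file names, and the exact language string format are illustrative assumptions:

import requests

BASE = "http://localhost:7860"  # assumed base URL; adjust to the deployed Space

# Speech-to-text: multipart file upload plus a form field for the language
with open("sample.wav", "rb") as f:  # hypothetical input file
    r = requests.post(f"{BASE}/transcribe", files={"audio": f}, data={"language": "eng English"})
print(r.json())  # {"transcription": "..."}

# Text-to-speech: JSON body matching the TTSRequest model; response body is WAV bytes
r = requests.post(f"{BASE}/synthesize", json={"text": "Hello", "language": "eng English", "speed": 1.0})
with open("out.wav", "wb") as f:
    f.write(r.content)

# Language identification: same upload pattern as /transcribe
with open("sample.wav", "rb") as f:
    r = requests.post(f"{BASE}/identify", files={"audio": f})
print(r.json())  # {"language_identification": ...}

# List supported languages for ASR and TTS
print(requests.get(f"{BASE}/asr_languages").json())
print(requests.get(f"{BASE}/tts_languages").json())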