import torch
import numpy as np
import soundfile as sf
import gradio as gr
from transformers import pipeline, BarkModel, AutoProcessor
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
)
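# Whisper large-v2 performs the speech recognition step; translate() below forces
# language="chinese" at generation time, so the output text is Chinese even for
# non-Chinese input, which is what turns transcription into translation here.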
label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
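# MMS-LID-126 only identifies the spoken language; its scores are displayed
# alongside the result and do not feed into the translation itself.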
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")
model = model.to(device)
synthesised_rate = model.generation_config.sample_rate
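# Bark handles the text-to-speech step; its generation config exposes the output
# sample rate that Gradio needs in order to play the generated waveform.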
def translate(audio_file):
    audio, sampling_rate = sf.read(audio_file)
    # Pass the sampling rate along with the raw array so the pipeline can resample
    # to Whisper's expected 16 kHz instead of assuming the file is already 16 kHz.
    outputs = pipe(
        {"array": audio, "sampling_rate": sampling_rate},
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe", "language": "chinese"},
    )
    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
    label_outputs = {}
    for pred in language_prediction:
        label_outputs[pred["label"]] = pred["score"]
    return outputs["text"], label_outputs
def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
    inputs = processor(text_prompt, voice_preset=voice_preset)
    speech_output = model.generate(**inputs.to(device), pad_token_id=10000)
    return speech_output
def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    translated_text, label_outputs = translate(audio)
    synthesised_speech = synthesise(translated_text, voice_preset)
    # Move the waveform back to the CPU before converting to 16-bit PCM for Gradio.
    synthesised_speech = (synthesised_speech.cpu().numpy() * 32767).astype(np.int16)
    return (synthesised_rate, synthesised_speech.T), translated_text, label_outputs
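# Optional local smoke test (a minimal sketch; assumes one of the bundled example
# files, e.g. "./fr.mp3", sits next to this script). Uncomment to check the cascade
# end to end without starting the Gradio app:
# if __name__ == "__main__":
#     (rate, waveform), text, langs = speech_to_speech_translation("./fr.mp3")
#     sf.write("s2st_demo.wav", waveform, rate)
#     print(text, langs)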
title = "Foreign Speech to Mandarin"
description = """
As the final project for the [Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse), this demo chains three large natural language processing models: one translates foreign speech into Chinese, one identifies which language is being spoken, and one converts the Chinese text into spoken Mandarin. Both audio upload and microphone input are supported. Conversion is fairly slow because a GPU server is too expensive to rent (roughly a 20x cost increase), so the pre-cached Examples are the recommended way to try it. Feel free to add my WeChat (ESGGTP) to chat with my parallel persona.
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
examples = [
    # ["./en.mp3", None],
    # ["./de.mp3", None],
    ["./fr.mp3", None],
    ["./it.mp3", None],
    ["./nl.mp3", None],
    ["./fi.mp3", None],
    # ["./cs.mp3", None],
    # ["./pl.mp3", None],
]
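# Example clips referenced by the Space; the commented-out entries can be re-enabled
# if the corresponding audio files are uploaded alongside app.py.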
demo = gr.Blocks()
file_transcribe = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
        gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
    examples=examples,
)
mic_transcribe = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
        gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
)
with demo:
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["Transcribe Audio File", "Transcribe Microphone"],
    )
demo.launch()