import torch
import numpy as np
import soundfile as sf
from transformers import pipeline
from transformers import BarkModel
from transformers import AutoProcessor

device = "cuda:0" if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
)
label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark")
model = model.to(device)
synthesised_rate = model.generation_config.sample_rate

def translate(audio_file):
    audio, sampling_rate = sf.read(audio_file)
    outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
    label_outputs = {}
    for pred in language_prediction:
        label_outputs[pred["label"]] = pred["score"]
    return outputs["text"],label_outputs
def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
    inputs = processor(text_prompt, voice_preset=voice_preset)
    speech_output = model.generate(**inputs.to(device),pad_token_id=10000)  
    return speech_output
def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
    translated_text, label_outputs= translate(audio)
    synthesised_speech = synthesise(translated_text,voice_preset)
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return (synthesised_rate , synthesised_speech.T),translated_text,label_outputs

title = "外国话转中文话"
description = """
本演示调用了三个自然语言处理的大模型，一个用于将外国话翻译成中文，一个用于判断说的哪个国家的话，一个用于将中文转成语音输出。同时支持语音上传和麦克风输入。转换速度比较慢因为租不起GPU的服务器（支出增加20倍），建议您通过已经缓存Examples体验效果。欢迎添加我的微信号：ESGGTP 与我的平行人交流。
![外国话转中文话](https://huggingface.co/spaces/zongxiao/speech-to-speech/blob/main/QRcode.jpg "Diagram of my Wechat QR code")
"""
examples = [
    ["./cs.wav", None],
    ["./de.wav", None],
    ["./fr.wav", None],
    ["./it.wav", None],
    ["./nl.wav", None],
    ["./pl.wav", None],
    ["./ro.wav", None],
    ["./hr.wav", None],
    ["./fi.wav", None],
    ["./sl.wav", None],
]
import gradio as gr

demo = gr.Blocks()
file_transcribe = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
        gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
    examples=examples,
)
mic_transcribe = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
        gr.Label(label="Language prediction"),
    ],
    title=title,
    description=description,
)
with demo:
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["Transcribe Audio File", "Transcribe Microphone"],
    )

demo.launch(share=True)