"""Zero-shot audio-classification demo: adult vs. child speech.

Test correct replication of speaker phonemes: records audio from the
microphone, normalizes it, and scores it against candidate text labels
using LAION's CLAP model via the transformers
``zero-shot-audio-classification`` pipeline, displayed in a Gradio UI.
"""
import numpy as np
import gradio as gr
from transformers import pipeline

# Load the CLAP zero-shot classifier once at import time (the model is
# downloaded on first run, which may take a while).
pipe = pipeline(
    task="zero-shot-audio-classification",
    model="laion/clap-htsat-unfused",
)


def get_labels(target=None):
    """Return the candidate text labels for zero-shot classification.

    ``target`` is currently unused; it is kept (now optional) for
    backward compatibility with callers that pass it.
    """
    return ["An adult speaking.", "A child speaking."]


def classify_audio(audio, target=None):
    """Classify a Gradio audio tuple ``(sample_rate, samples)``.

    Returns a dict mapping each candidate label to its score, the shape
    expected by a ``gr.Label`` output component.
    """
    sr, y = audio
    y = y.astype(np.float32)
    # Collapse stereo to mono: the pipeline expects a 1-D waveform.
    if y.ndim > 1:
        y = y.mean(axis=1)
    # Peak-normalize to [-1, 1]; guard against all-zero (silent) input,
    # which would otherwise divide by zero and feed NaNs to the model.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    preds = pipe(y, candidate_labels=get_labels(target))
    return {p["label"]: p["score"] for p in preds}


demo = gr.Interface(
    fn=classify_audio,
    # Gradio 4 API: `source=` became `sources=[...]` and the `gr.outputs`
    # namespace was removed in favor of top-level components.
    inputs=gr.Audio(sources=["microphone"]),
    outputs=gr.Label(),
)

if __name__ == "__main__":
    demo.launch(debug=False)