"""Zero-shot audio-classification demo: adult vs. child speech.

Test correct replication of speaker phonemes: records audio from the
microphone, normalizes it, and scores it against candidate text labels
using LAION's CLAP model via the transformers
``zero-shot-audio-classification`` pipeline, displayed in a Gradio UI.
"""
import numpy as np
import gradio as gr
from transformers import pipeline

# Load the CLAP zero-shot classifier once at import time (the model is
# downloaded on first run, which may take a while).
pipe = pipeline(
    task="zero-shot-audio-classification",
    model="laion/clap-htsat-unfused",
)


def get_labels(target=None):
    """Return the candidate text labels for zero-shot classification.

    ``target`` is currently unused; it is kept (now optional) for
    backward compatibility with callers that pass it.
    """
    return ["An adult speaking.", "A child speaking."]


def classify_audio(audio, target=None):
    """Classify a Gradio audio tuple ``(sample_rate, samples)``.

    Returns a dict mapping each candidate label to its score, the shape
    expected by a ``gr.Label`` output component.
    """
    sr, y = audio
    y = y.astype(np.float32)
    # Collapse stereo to mono: the pipeline expects a 1-D waveform.
    if y.ndim > 1:
        y = y.mean(axis=1)
    # Peak-normalize to [-1, 1]; guard against all-zero (silent) input,
    # which would otherwise divide by zero and feed NaNs to the model.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    preds = pipe(y, candidate_labels=get_labels(target))
    return {p["label"]: p["score"] for p in preds}


demo = gr.Interface(
    fn=classify_audio,
    # Gradio 4 API: `source=` became `sources=[...]` and the `gr.outputs`
    # namespace was removed in favor of top-level components.
    inputs=gr.Audio(sources=["microphone"]),
    outputs=gr.Label(),
)

if __name__ == "__main__":
    demo.launch(debug=False)