from fastai.vision.all import *
from fastai.callback.wandb import *
import librosa
import matplotlib.pyplot as plt
import numpy as np
import gradio as gr
import wandb

wandb.init()

# Load the exported learner and read the digit labels (0-9) from its vocab
learn = load_learner('audio_mnist_classifier_v1.pkl')
categories = learn.dls.vocab

def mel_spectrogram_tfm(file):
    # Load the audio, trim leading/trailing silence, and compute a mel spectrogram in dB
    y, sr = librosa.load(file)
    y, _ = librosa.effects.trim(y)
    spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512)
    spec_db = librosa.amplitude_to_db(spec, ref=np.max)
    return spec_db

def classify(audio):
    # Convert the recorded clip to a spectrogram image, then classify it with the learner
    spec_db = mel_spectrogram_tfm(audio)
    img = 'tmp.png'
    plt.imsave(img, spec_db)
    pred, idx, prob = learn.predict(img)
    # Map each digit label to its predicted probability for the Label output
    return dict(zip(categories, map(float, prob)))

interface = gr.Interface(
    fn=classify,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Label(num_top_classes=10),
    title='Audio MNIST Classification',
    description='Identifying digits (from 0 to 9) from an audio clip',
)
interface.launch()