"""Gradio demo: classify an utterance as a question or a declarative sentence.

openSMILE ``emobase`` functionals are extracted from the audio file and fed
either to a RandomForest (joblib) or to a DNN exported to ONNX.
"""
import datetime
import os
import wave
from base64 import b64decode

import gradio as gr
import joblib
import onnx
import onnxruntime
import opensmile
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler

model_names = ["DNN", "RandomForest"]

rf_model_path = "RF_emobase_20_model_top1_score0.6863_20231207_1537.joblib"
dnn_model_path = "NN_emobase_allfeature_model_score_68.00_20240304_1451.onnx"

# Both models are loaded once, at import time, and shared by every request.
dnn_model = onnxruntime.InferenceSession(dnn_model_path)
# NOTE: joblib.load unpickles the file; only load model files you trust.
rf_model = joblib.load(rf_model_path)

# emobase functionals kept for the RandomForest model.
RF_FEATURE_NAMES = [
    'F0env_sma_de_amean',
    'lspFreq_sma_de[5]_linregc1',
    'mfcc_sma[3]_linregc1',
    'lspFreq_sma[6]_quartile1',
    'lspFreq_sma_de[6]_linregerrQ',
    'lspFreq_sma_de[6]_maxPos',
    'lspFreq_sma_de[6]_iqr2-3',
    'lspFreq_sma_de[7]_minPos',
    'lspFreq_sma_de[4]_linregc1',
    'lspFreq_sma_de[6]_linregerrA',
    'lspFreq_sma_de[6]_linregc2',
    'lspFreq_sma[5]_amean',
    'lspFreq_sma_de[6]_iqr1-2',
    'mfcc_sma[1]_minPos',
    'mfcc_sma[4]_linregc1',
    'mfcc_sma[9]_iqr2-3',
    'lspFreq_sma[5]_kurtosis',
    'lspFreq_sma_de[3]_skewness',
    'mfcc_sma[3]_minPos',
    'mfcc_sma[12]_linregc1',
]


def _extract_emobase_functionals(audio_path, feature_names=None, scaler=None):
    """Extract scaled emobase functionals for the first file in *audio_path*.

    Args:
        audio_path: List of audio file paths handed to
            ``opensmile.Smile.process_files``; only the first result row is
            used.
        feature_names: Optional collection of feature names to keep. ``None``
            keeps every emobase functional. Kept columns stay in openSMILE's
            column order, not in the order of *feature_names*.
        scaler: Optional already-fitted scaler exposing ``transform``. When
            ``None`` a fresh ``StandardScaler`` is fitted on this one sample
            (the original behaviour; see the note in the body).

    Returns:
        The array returned by the scaler, with one row.
    """
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    feature_df = smile.process_files(audio_path)

    # Build a frame indexed by feature name so rows can be filtered by name,
    # then transpose it into a single row with one column per feature.
    df = pd.DataFrame(feature_df.values[0], index=feature_df.columns)
    if feature_names is not None:
        df = df[df.index.isin(feature_names)]
    df = df.T

    if scaler is None:
        # NOTE(review): fitting a StandardScaler on a single sample makes the
        # mean equal to that sample, so every scaled feature becomes 0 and the
        # model receives the same all-zero vector for any audio. The scaler
        # fitted at training time should be persisted and passed in via
        # `scaler`; the fallback is kept only for backward compatibility.
        feature = StandardScaler().fit_transform(df)
    else:
        feature = scaler.transform(df)
    print(df.shape)
    return feature


def extract_features_rf(audio_path, scaler=None):
    """Return the scaled feature subset used by the RandomForest model.

    Args:
        audio_path: List of audio file paths; only the first is used.
        scaler: Optional fitted scaler; see ``_extract_emobase_functionals``.
    """
    return _extract_emobase_functionals(
        audio_path, feature_names=RF_FEATURE_NAMES, scaler=scaler
    )


def predict_rf(input):
    """Classify the audio file at path *input* with the RandomForest model.

    Returns:
        Whatever ``rf_model.predict`` returns for the single extracted sample.
    """
    # Extract features with openSMILE.
    feature_vector = extract_features_rf([input])
    # Run inference with the loaded model.
    prediction = rf_model.predict(feature_vector)
    return prediction


def extract_features_dnn(audio_path, scaler=None):
    """Return all scaled emobase functionals, as used by the DNN model.

    Args:
        audio_path: List of audio file paths; only the first is used.
        scaler: Optional fitted scaler; see ``_extract_emobase_functionals``.
    """
    return _extract_emobase_functionals(audio_path, scaler=scaler)


def softmax_calc_(pred):
    """Map model scores to a label: index 0 is "question", else "declarative".

    Despite the name, no softmax is computed; only the argmax over the
    flattened tensor is taken.
    """
    return "question" if int(torch.argmax(pred)) == 0 else "declarative"


def predict_dnn(input):
    """Classify the audio file at path *input* with the ONNX DNN model.

    Returns:
        "question" or "declarative".
    """
    # Extract features with openSMILE.
    feature_vector = extract_features_dnn([input])
    # Run inference with the loaded model.
    onnx_outs = dnn_model.run(None, {"model_input": feature_vector})
    print(onnx_outs)
    # Assumes the first (presumably only) output holds the class scores.
    prediction = softmax_calc_(torch.as_tensor(onnx_outs[0]))
    print(f"Prediction: {prediction}")
    return prediction


def main(model, audio):
    """Gradio callback: run the selected *model* on the *audio* file path.

    Because the interface is live, this is also invoked while the model or
    the audio is still unset; a short prompt is returned in that case instead
    of raising.
    """
    if audio is None:
        return "Please record or upload audio."
    if model == "DNN":
        return predict_dnn(audio)
    if model == "RandomForest":
        return predict_rf(audio)
    return "Please select a model."


iface = gr.Interface(
    fn=main,
    inputs=[
        gr.Dropdown(choices=model_names),
        gr.Audio(sources=["microphone", "upload"], type="filepath"),
    ],
    outputs=[
        "textbox",
    ],
    live=True,
    description="demo for Audio to question classifier",
)

# Launched at module level, as before, so running or importing the file
# starts the demo.
iface.launch()