import opensmile
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import onnxruntime
import torch
import gradio as gr

model_names = ["DNN", "RandomForest"]

rf_model_path = "RF_emobase_20_model_top1_score0.6863_20231207_1537.joblib"
dnn_model_path = "NN_emobase_allfeature_model_score_69.00_20240304_1432.onnx"

dnn_model = onnxruntime.InferenceSession(dnn_model_path)
rf_model = joblib.load(rf_model_path)


def extract_features_rf(audio_path):
    # Extract emobase functionals with openSMILE
    smile = opensmile.Smile(
        # feature_set=opensmile.FeatureSet.GeMAPSv01b,
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    feature_df = smile.process_files(audio_path)

    # The 20 features the RandomForest model was trained on
    output_features = [
        'F0env_sma_de_amean', 'lspFreq_sma_de[5]_linregc1', 'mfcc_sma[3]_linregc1',
        'lspFreq_sma[6]_quartile1', 'lspFreq_sma_de[6]_linregerrQ', 'lspFreq_sma_de[6]_maxPos',
        'lspFreq_sma_de[6]_iqr2-3', 'lspFreq_sma_de[7]_minPos', 'lspFreq_sma_de[4]_linregc1',
        'lspFreq_sma_de[6]_linregerrA', 'lspFreq_sma_de[6]_linregc2', 'lspFreq_sma[5]_amean',
        'lspFreq_sma_de[6]_iqr1-2', 'mfcc_sma[1]_minPos', 'mfcc_sma[4]_linregc1',
        'mfcc_sma[9]_iqr2-3', 'lspFreq_sma[5]_kurtosis', 'lspFreq_sma_de[3]_skewness',
        'mfcc_sma[3]_minPos', 'mfcc_sma[12]_linregc1',
    ]

    df = pd.DataFrame(feature_df.values[0], index=feature_df.columns)
    df = df[df.index.isin(output_features)]
    df = df.T

    scaler = StandardScaler()
    feature = scaler.fit_transform(df)
    print(df.shape)
    return feature


def predict_rf(audio_file):
    # Extract features with openSMILE
    feature_vector = extract_features_rf([audio_file])
    # Run inference with the loaded model
    prediction = rf_model.predict(feature_vector)
    # print(f"Prediction: {prediction}")
    return prediction


def extract_features_dnn(audio_path):
    # Extract emobase functionals with openSMILE (all features, no selection)
    smile = opensmile.Smile(
        # feature_set=opensmile.FeatureSet.GeMAPSv01b,
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    feature_df = smile.process_files(audio_path)

    df = pd.DataFrame(feature_df.values[0], index=feature_df.columns)
    df = df.T

    scaler = StandardScaler()
    feature = scaler.fit_transform(df)
    print(df.shape)
    return feature


def softmax_calc_(pred):
    # Index 0 corresponds to "question", index 1 to "declarative"
    if torch.argmax(pred) == torch.tensor(0):
        prediction = "question"
    else:
        prediction = "declarative"
    return prediction


def predict_dnn(audio_file):
    # Extract features with openSMILE
    feature_vector = extract_features_dnn([audio_file])
    # Run inference with the loaded ONNX model.
    # ONNX Runtime is strict about dtypes: the exported network most likely
    # expects float32 input, while scikit-learn returns float64, so cast here.
    feature_vector = feature_vector.astype(np.float32)
    onnx_outs = dnn_model.run(None, {"model_input": feature_vector})
    print(onnx_outs)
    prediction = softmax_calc_(torch.FloatTensor(onnx_outs))
    print(f"Prediction: {prediction}")
    return prediction


def main(model, audio_file):
    # Route the recording to whichever model was selected in the UI
    if model == "DNN":
        return predict_dnn(audio_file)
    elif model == "RandomForest":
        return predict_rf(audio_file)


demo = gr.Interface(
    fn=main,
    inputs=[
        gr.Dropdown(choices=model_names, label="Model"),
        gr.Audio(sources=["microphone", "upload"], type="filepath"),
    ],
    outputs=["textbox"],
    live=True,
    description="demo for Audio to question classifier",
)

demo.launch()
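
# --- Optional local smoke test (a sketch, left commented out) ---
# "sample.wav" is a hypothetical mono recording and is not part of this
# repository; replace it with a real file. Uncomment (and place above
# demo.launch(), which blocks) to run both predictors once without the UI:
#
# print(predict_rf("sample.wav"))
# print(predict_dnn("sample.wav"))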