# NOTE: Hugging Face Space file-viewer residue from the original copy/paste
# (ttt-tkmr, "Update app.py", commit af16a0b verified, 3.93 kB) — kept as a
# comment so the module remains valid Python.
import opensmile
import joblib
import wave
import datetime
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from base64 import b64decode
import onnx
import onnxruntime
import torch
import gradio as gr
# Dropdown choices shown in the UI; must match the branches in main().
model_names = ["DNN", "RandomForest"]
# Pre-trained classifier files shipped alongside this app (question vs.
# declarative utterance classification on openSMILE emobase functionals;
# the scores/dates in the filenames come from the training runs).
rf_model_path = "RF_emobase_20_model_top1_score0.6863_20231207_1537.joblib"
dnn_model_path = "NN_emobase_allfeature_model_score_69.00_20240304_1432.onnx"
# Load both models once at import time so every request reuses them.
dnn_model = onnxruntime.InferenceSession(dnn_model_path)
rf_model = joblib.load(rf_model_path)
def extract_features_rf(audio_path):
    """Extract the 20 emobase functionals the RandomForest model was trained on.

    Parameters
    ----------
    audio_path : list[str]
        Audio file path(s); opensmile's process_files expects a list.

    Returns
    -------
    numpy.ndarray
        Shape (1, 20) array of the selected feature values.
    """
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    feature_df = smile.process_files(audio_path)
    # Feature names selected during training (top-20 importance subset).
    output_features = ['F0env_sma_de_amean', 'lspFreq_sma_de[5]_linregc1', 'mfcc_sma[3]_linregc1', 'lspFreq_sma[6]_quartile1', 'lspFreq_sma_de[6]_linregerrQ', 'lspFreq_sma_de[6]_maxPos', 'lspFreq_sma_de[6]_iqr2-3', 'lspFreq_sma_de[7]_minPos', 'lspFreq_sma_de[4]_linregc1', 'lspFreq_sma_de[6]_linregerrA', 'lspFreq_sma_de[6]_linregc2', 'lspFreq_sma[5]_amean', 'lspFreq_sma_de[6]_iqr1-2', 'mfcc_sma[1]_minPos', 'mfcc_sma[4]_linregc1', 'mfcc_sma[9]_iqr2-3', 'lspFreq_sma[5]_kurtosis', 'lspFreq_sma_de[3]_skewness', 'mfcc_sma[3]_minPos', 'mfcc_sma[12]_linregc1']
    df = pd.DataFrame(feature_df.values[0], index=feature_df.columns)
    df = df[df.index.isin(output_features)]
    df = df.T
    # BUG FIX: the previous code fit a fresh StandardScaler on this single
    # sample. With one row the mean equals the sample and the variance is
    # zero, so fit_transform always returned an all-zero vector — the model
    # received identical input for every audio file.
    # TODO(review): persist the StandardScaler fitted on the training data
    # and load+apply it here; until then the raw features are passed through.
    return df.to_numpy()
def predict_rf(input):
    """Classify the audio file at path *input* with the RandomForest model.

    Returns the predicted label as a plain string, consistent with
    predict_dnn.
    """
    # Extract openSMILE features (process_files expects a list of paths).
    feature_vector = extract_features_rf([input])
    prediction = rf_model.predict(feature_vector)
    # predict() returns a length-1 array; unwrap it so the Gradio textbox
    # shows a clean label instead of the ndarray repr like "['question']".
    return prediction[0]
def extract_features_dnn(audio_path):
    """Extract the full emobase functional set for the ONNX DNN model.

    Parameters
    ----------
    audio_path : list[str]
        Audio file path(s); opensmile's process_files expects a list.

    Returns
    -------
    numpy.ndarray
        Shape (1, n_features) float32 array of all emobase functionals.
    """
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    feature_df = smile.process_files(audio_path)
    df = pd.DataFrame(feature_df.values[0], index=feature_df.columns)
    df = df.T
    # BUG FIX: the previous code fit a fresh StandardScaler on this single
    # sample, which always produced an all-zero vector (single-row mean ==
    # the sample, zero variance) — the DNN received identical input for
    # every audio file.
    # TODO(review): persist and apply the training-time scaler here instead.
    # Cast to float32, the input dtype ONNX Runtime sessions typically
    # require — confirm against the exported model's input spec.
    return df.to_numpy(dtype="float32")
def softmax_calc_(pred):
    """Map the model's raw output tensor to a class label.

    The highest-scoring class wins: index 0 -> "question", anything
    else -> "declarative".
    """
    top_class = torch.argmax(pred).item()
    return "question" if top_class == 0 else "declarative"
def predict_dnn(input):
    """Classify the audio file at path *input* with the ONNX DNN model.

    Returns the predicted label ("question" or "declarative") as a string.
    """
    # Extract openSMILE features (process_files expects a list of paths).
    feature_vector = extract_features_dnn([input])
    # run() returns a list of output arrays; this model exposes a single
    # output, so take element 0 instead of wrapping the whole list in a
    # tensor (the old code added a spurious leading dimension and would
    # break if the session ever had a second output). Debug prints removed.
    onnx_outs = dnn_model.run(None, {"model_input": feature_vector})
    return softmax_calc_(torch.FloatTensor(onnx_outs[0]))
def main(model, audio):
    """Dispatch the Gradio inputs to the selected model's predictor.

    Parameters
    ----------
    model : str or None
        One of model_names; None until the user picks (live=True fires on
        every input change, including before a selection is made).
    audio : str or None
        File path of the recorded/uploaded audio, or None if absent.

    Returns
    -------
    str
        The predicted label, or a hint while inputs are incomplete.
    """
    # Guards: previously a None model/audio left `predict` unbound and
    # raised UnboundLocalError (or crashed inside feature extraction).
    if audio is None:
        return "Waiting for audio input"
    if model == "DNN":
        return predict_dnn(audio)
    if model == "RandomForest":
        return predict_rf(audio)
    return "Please select a model"
# Gradio UI: pick a model, record or upload audio, show the predicted label.
iface = gr.Interface(
    fn = main,
    inputs=[
        gr.Dropdown(choices=model_names),
        gr.Audio(sources=["microphone","upload"], type="filepath")
    ],
    outputs=[
        "textbox"
    ],
    # live=True re-runs main() on every input change — including before a
    # model has been selected, so main() must tolerate partial inputs.
    live=True,
    description="demo for Audio to question classifier"
)
iface.launch()