# --- HuggingFace Space file-viewer residue (not code), preserved as comments ---
# ttt-tkmr's picture
# Update app.py
# f8ad6dd verified
# raw | history blame | 3.9 kB
import opensmile
import joblib
import wave
import datetime
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from base64 import b64decode
import onnx
import onnxruntime
import torch
import gradio as gr
# Labels shown in the UI dropdown; must match the branches dispatched in main().
model_names = ["DNN", "RandomForest"]
# Pre-trained artifacts expected next to this script (filenames encode feature
# set, validation score, and training timestamp).
rf_model_path = "RF_emobase_20_model_top1_score0.6863_20231207_1537.joblib"
dnn_model_path = "NN_emobase_allfeature_model_score_69.00_20240304_1432.onnx"
# Both models are loaded eagerly at import time; a missing file raises here,
# before the Gradio UI ever starts.
dnn_model = onnxruntime.InferenceSession(dnn_model_path)
rf_model = joblib.load(rf_model_path)
def extract_features_rf(audio_path):
    """Extract the 20 hand-picked emobase functionals for the RF model.

    audio_path: list of audio file paths, as accepted by
        opensmile.Smile.process_files.
    Returns a (1, 20) numpy array of scaled features.

    NOTE(review): a fresh StandardScaler is fit on this single sample, so
    each feature becomes 0.0 (the mean of one row is the row itself).
    The scaler fitted on the *training* data should be persisted and
    reused here — confirm against the training pipeline.
    """
    extractor = opensmile.Smile(
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    functionals = extractor.process_files(audio_path)
    # The 20 features selected during RF training.
    selected = ['F0env_sma_de_amean', 'lspFreq_sma_de[5]_linregc1', 'mfcc_sma[3]_linregc1', 'lspFreq_sma[6]_quartile1', 'lspFreq_sma_de[6]_linregerrQ', 'lspFreq_sma_de[6]_maxPos', 'lspFreq_sma_de[6]_iqr2-3', 'lspFreq_sma_de[7]_minPos', 'lspFreq_sma_de[4]_linregc1', 'lspFreq_sma_de[6]_linregerrA', 'lspFreq_sma_de[6]_linregc2', 'lspFreq_sma[5]_amean', 'lspFreq_sma_de[6]_iqr1-2', 'mfcc_sma[1]_minPos', 'mfcc_sma[4]_linregc1', 'mfcc_sma[9]_iqr2-3', 'lspFreq_sma[5]_kurtosis', 'lspFreq_sma_de[3]_skewness', 'mfcc_sma[3]_minPos', 'mfcc_sma[12]_linregc1']
    # First (only) result row, re-indexed by feature name, then filtered and
    # transposed back into a single (1, 20) sample row.
    # NOTE(review): .isin keeps openSMILE's column order, not the order of
    # `selected` — verify this matches the feature order the RF was trained on.
    per_feature = pd.DataFrame(functionals.values[0], index=functionals.columns)
    per_feature = per_feature[per_feature.index.isin(selected)].T
    scaled = StandardScaler().fit_transform(per_feature)
    print(per_feature.shape)
    return scaled
def predict_rf(input):
    """Classify one audio clip with the RandomForest model.

    input: path to an audio file (Gradio passes a filepath string).
    Returns the raw sklearn prediction array.
    """
    # Feature extraction via openSMILE (expects a list of paths), then
    # inference with the pre-loaded joblib model.
    features = extract_features_rf([input])
    return rf_model.predict(features)
def extract_features_dnn(audio_path):
    """Extract the full emobase functional set for the ONNX DNN model.

    audio_path: list of audio file paths, as accepted by
        opensmile.Smile.process_files.
    Returns a (1, n_features) numpy array of scaled features.

    NOTE(review): as in extract_features_rf, fitting a new StandardScaler
    on a single sample zeroes every feature — the training-time scaler
    should be persisted and reused. Also, the returned array is float64;
    confirm the ONNX model's declared input type accepts it.
    """
    extractor = opensmile.Smile(
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    functionals = extractor.process_files(audio_path)
    # First (only) result row as a one-column frame indexed by feature name,
    # transposed back into a single (1, n_features) sample row.
    per_feature = pd.DataFrame(functionals.values[0], index=functionals.columns).T
    scaled = StandardScaler().fit_transform(per_feature)
    print(per_feature.shape)
    return scaled
def softmax_calc_(pred):
    """Map raw model outputs to a class label.

    Index 0 is the "question" class; any other argmax index means
    "declarative". (Despite the name, no softmax is applied — argmax on
    raw scores selects the same index.)
    """
    return "question" if int(torch.argmax(pred)) == 0 else "declarative"
def predict_dnn(input):
    """Classify one audio clip with the ONNX DNN model.

    input: path to an audio file (Gradio passes a filepath string).
    Returns "question" or "declarative".
    """
    # Feature extraction via openSMILE (expects a list of paths).
    features = extract_features_dnn([input])
    # onnxruntime returns a list of output arrays; dump it for debugging,
    # as the original did.
    outputs = dnn_model.run(None, {"model_input": features})
    print(outputs)
    prediction = softmax_calc_(torch.FloatTensor(outputs))
    print(f"Prediction: {prediction}")
    return prediction
def main(model, audio):
    """Gradio callback: route the audio clip to the selected model.

    model: one of the entries in model_names ("DNN" / "RandomForest").
    audio: filepath of the recorded/uploaded clip.
    Returns the chosen model's prediction, or None for an unknown model
    name (matching the original if/elif fall-through).
    """
    dispatch = {"DNN": predict_dnn, "RandomForest": predict_rf}
    handler = dispatch.get(model)
    return handler(audio) if handler is not None else None
# Build the Gradio UI: a model-selector dropdown plus a microphone/upload
# audio input (delivered to main() as a filepath), with a single textbox
# output. live=True re-runs main() whenever an input changes.
iface = gr.Interface(
    fn = main,
    inputs=[
        gr.Dropdown(choices=model_names),
        gr.Audio(sources=["microphone","upload"], type="filepath")
    ],
    outputs=[
        "textbox"
    ],
    live=True,
    description="demo for Audio to question classifier"
)
# Start the web server; blocks until stopped.
iface.launch()