Ar4ikov's picture
Update app.py
9585e6d
raw
history blame
1.98 kB
from transformers import pipeline
import gradio as gr
from pyctcdecode import BeamSearchDecoderCTC
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, AutoModel, Wav2Vec2FeatureExtractor
import librosa
import numpy as np
import subprocess
def speech_file_to_array_fn(path, sampling_rate):
speech_array, _sampling_rate = torchaudio.load(path)
resampler = torchaudio.transforms.Resample(_sampling_rate)
speech = resampler(speech_array).squeeze().numpy()
return speech
def predict(speech, sampling_rate):
inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
inputs = {key: inputs[key].to(device) for key in inputs}
with torch.no_grad():
logits = model_(**inputs).logits
scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
outputs = [{"Emotion": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
return outputs
TRUST = True
config = AutoConfig.from_pretrained('Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition', trust_remote_code=TRUST)
model = AutoModel.from_pretrained("Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition", trust_remote_code=TRUST)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def transcribe(audio):
print(audio)
return predict(audio, 16000)
def get_asr_interface():
return gr.Interface(
fn=transcribe,
inputs=[
gr.inputs.Audio(source="upload", type="numpy")
],
outputs=[
"textbox"
])
interfaces = [
get_asr_interface()
]
names = [
"Russian Emotion Recognition"
]
gr.TabbedInterface(interfaces, names).launch(server_name = "0.0.0.0", enable_queue=False)