import gradio as gr
import librosa
import numpy as np
import torch
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, AutoModel, Wav2Vec2FeatureExtractor

TRUST = True  # trust_remote_code: the checkpoint ships a custom model class
SR = 16000    # target sampling rate expected by the wav2vec2 model


def resample(speech_array, sampling_rate):
    # Resample the raw microphone signal to the 16 kHz the model expects.
    # torchaudio's Resample requires a floating-point tensor, so cast the
    # int16 microphone samples to float32 first (the original cuda.ShortTensor
    # cast would crash both the resampler and the later .numpy() call).
    speech = torch.from_numpy(np.asarray(speech_array, dtype=np.float32))
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=SR)
    return resampler(speech).squeeze().numpy()


def predict(speech_array, sampling_rate):
    # Extract features, run the classifier, and map scores to label names.
    speech = resample(speech_array, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=SR, return_tensors="pt", padding=True)
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    scores = F.softmax(logits, dim=-1).cpu().numpy()[0]
    return {config.id2label[i]: f"{score * 100:.1f}%" for i, score in enumerate(scores)}
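

# Illustrative helper (an addition, not part of the original Space): classify a
# local audio file instead of a microphone recording. A minimal sketch assuming
# librosa is installed and `path` points to a readable audio file; loading with
# sr=SR already resamples, so the resample() call inside predict() is a no-op.
def predict_file(path):
    speech, sr = librosa.load(path, sr=SR)  # sr equals SR after loading
    return predict(speech, sr)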


config = AutoConfig.from_pretrained("Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition", trust_remote_code=TRUST)
model = AutoModel.from_pretrained("Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition", trust_remote_code=TRUST)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition")

# Move the model to the device once at startup instead of on every request.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(device)


def recognize(audio):
    # gr.Audio with type="numpy" yields a (sampling_rate, samples) tuple.
    sr, audio_array = audio
    return predict(audio_array, sr)


with gr.Blocks() as blocks:
    audio = gr.Audio(source="microphone", type="numpy", label="Say something...")
    recognize_button = gr.Button("Recognize emotions")
    output = gr.JSON(label="Emotions")

    recognize_button.click(fn=recognize, inputs=[audio], outputs=[output])

blocks.launch(enable_queue=True, debug=True)
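
# To run this locally (a sketch; pinning Gradio below 4 is an assumption, since
# source="microphone" and enable_queue=True belong to the Gradio 3.x API):
#   pip install "gradio<4" transformers torch torchaudio librosa numpy
#   python app.py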