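"""Gradio demo: speech emotion recognition for Russian audio, using the
Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition checkpoint."""
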
import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, AutoModel, Wav2Vec2FeatureExtractor

TRUST = True  # the checkpoint ships custom model code, so trust_remote_code is required
SR = 16000  # wav2vec2 models expect 16 kHz mono input


def resample(speech_array, sampling_rate):
    """Resample a 1-D float waveform from `sampling_rate` to the model's 16 kHz."""
    speech = torch.from_numpy(speech_array)
    resampler = torchaudio.transforms.Resample(sampling_rate, SR)
    return resampler(speech).squeeze().numpy()


def predict(speech_array, sampling_rate):
    """Run the emotion classifier on a raw waveform and return label probabilities."""
    speech = resample(speech_array, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=SR, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}

    with torch.no_grad():
        logits = model(**inputs).logits

    scores = F.softmax(logits, dim=-1).cpu().numpy()[0]
    return {config.id2label[i]: f"{score * 100:.1f}%" for i, score in enumerate(scores)}
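# The returned dict maps each label from the checkpoint's config.id2label to a
# formatted probability, e.g. (label names are illustrative; the real ones come
# from the model config): {"neutral": "72.4%", "happiness": "12.1%", ...}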


MODEL_ID = "Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition"

config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=TRUST)
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=TRUST)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)

# Run on GPU when available; move the model once here instead of on every request.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


def recognize(audio):
    """Gradio callback: `audio` arrives as a (sample_rate, numpy array) tuple."""
    sr, audio_array = audio
    if audio_array.ndim > 1:  # downmix stereo recordings to mono
        audio_array = audio_array.mean(axis=1)
    audio_array = audio_array.astype(np.float32)
    return predict(audio_array, sr)

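
# A minimal local sanity check (a sketch, not part of the app itself): one
# second of random noise at 8 kHz exercises the full resample + predict path.
# Call it manually from a REPL before deploying.
def smoke_test():
    dummy = np.random.randn(8000).astype(np.float32)
    print(predict(dummy, 8000))
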

interface = gr.Interface(
    fn=recognize,
    inputs=[
        gr.Audio(source="microphone", type="numpy", label="Скажите что-нибудь...")  # "Say something..."
    ],
    outputs=[
        gr.JSON(label="Эмоции")  # "Emotions"
    ],
    live=True,
)

interface.launch(debug=True)