Spaces:
Build error
Build error
File size: 2,233 Bytes
5557412 feec576 89bf06f 6b4e503 5557412 03b4fa3 89bf06f 638437d 55f31c4 84a3e63 638437d 84a3e63 89bf06f 638437d 3032b2a 638437d 89bf06f 142f6ec 89bf06f cd6af6a 89bf06f 879ffa6 5557412 55c2d79 638437d 55f31c4 6586f8d 302433f 55c2d79 89bf06f 5557412 7f185f4 c5e3e64 55c2d79 c5e3e64 55c2d79 c5e3e64 55c2d79 c5e3e64 3124643 e8122f3 c5e3e64 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
from transformers import pipeline
import gradio as gr
from pyctcdecode import BeamSearchDecoderCTC
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, AutoModel, Wav2Vec2FeatureExtractor
import librosa
import numpy as np
import subprocess
TRUST = True
SR = 16000
def resample(speech_array, sampling_rate):
    """Resample a waveform to the model's expected rate (SR = 16 kHz).

    Parameters
    ----------
    speech_array : np.ndarray
        Float waveform. NOTE(review): assumes mono input — a stereo
        (n, 2) array would not be flattened by the squeeze below; confirm
        against the gr.Audio source.
    sampling_rate : int
        Native sampling rate of `speech_array`.

    Returns
    -------
    np.ndarray
        The waveform resampled to SR.
    """
    speech = torch.from_numpy(speech_array)
    print(speech, speech.shape, sampling_rate)  # debug trace
    # Pass new_freq explicitly: the original relied on torchaudio's
    # default new_freq of 16000 happening to equal SR.
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=SR)
    speech = resampler(speech).squeeze().numpy()
    return speech
def predict(speech_array, sampling_rate):
    """Classify the emotion expressed in a raw waveform.

    Parameters
    ----------
    speech_array : np.ndarray
        Float32 waveform (as produced by `recognize`).
    sampling_rate : int
        Native sampling rate of `speech_array`; the audio is resampled
        to SR before feature extraction.

    Returns
    -------
    dict
        Mapping of emotion label -> probability, rounded to 3 decimals.
    """
    speech = resample(speech_array, sampling_rate)
    print(speech, speech.shape)  # debug trace
    inputs = feature_extractor(speech, sampling_rate=SR, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        # NOTE(review): .to(device) runs on every call; moving the model
        # once at startup would avoid the repeated (no-op after the first
        # call) transfer.
        logits = model.to(device)(**inputs).logits
    # dim=-1 normalizes over the label axis (identical to dim=1 for the
    # (batch, num_labels) logits); .detach() is redundant under no_grad.
    scores = F.softmax(logits, dim=-1).cpu().numpy()[0]
    return {config.id2label[i]: round(float(score), 3) for i, score in enumerate(scores)}
# Load the pretrained Russian speech-emotion model, its config (provides
# id2label used in predict), and the matching feature extractor.
# trust_remote_code is needed because the repo ships custom model code.
config = AutoConfig.from_pretrained('Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition', trust_remote_code=TRUST)
model = AutoModel.from_pretrained("Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition", trust_remote_code=TRUST)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("Aniemore/wav2vec2-xlsr-53-russian-emotion-recognition")
# Prefer GPU when available; predict() moves the model and inputs here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
def recognize(audio, state=None):
    """Gradio callback: classify the emotion in a microphone recording.

    Parameters
    ----------
    audio : tuple[int, np.ndarray]
        (sample_rate, samples) as delivered by gr.Audio.
    state : dict or None
        Previous session state. It is replaced (never read) on every
        call; the None default replaces the original mutable-default
        `state={}` anti-pattern without changing behavior.

    Returns
    -------
    tuple[dict, dict]
        The prediction dict twice: once for the Label output, once for
        the session state.
    """
    sr, audio_array = audio
    # The model pipeline expects float input; gradio delivers int PCM.
    audio_array = audio_array.astype(np.float32)
    print(sr, audio_array)  # debug trace
    state = predict(audio_array, sr)
    return state, state
def test_some(audio):
sr, audio_array = audio
audio_array = audio_array.astype(np.float32)
return (sr, audio_array)
# Live Gradio UI: microphone audio in -> top-7 emotion probabilities out.
# The "state" input/output pair threads the last prediction through the
# session (recognize returns the same dict for both outputs).
interface = gr.Interface(
    fn=recognize,
    inputs=[
        gr.Audio(source="microphone", label="Скажите что-нибудь..."),
        "state"
    ],
    outputs=[
        gr.Label(num_top_classes=7),
        "state"
    ],
    live=True,
    theme="huggingface")
# Fixed: removed a stray " |" (copy/extraction residue) fused onto the
# launch() line, which made the file a SyntaxError.
interface.launch(debug=True)