Spaces:

SeyedAli
/

Persian-Speech-Emotion-Detection

Runtime error

App Files Community

SeyedAli committed on Sep 21, 2023

Commit

54063ad

•

1 Parent(s): 827628e

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -24

app.py CHANGED Viewed

@@ -1,37 +1,44 @@
 import tempfile
 import torch
 import torchaudio
 import gradio as gr
-from transformers import Wav2Vec2FeatureExtractor,AutoConfig,pipeline
 config = AutoConfig.from_pretrained("SeyedAli/Persian-Speech-Emotion-HuBert-V1")
 model = Wav2Vec2FeatureExtractor.from_pretrained("SeyedAli/Persian-Speech-Emotion-HuBert-V1")
-def speech_file_to_array_fn(path, sampling_rate):
-   with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio_file:
-    # Copy the contents of the uploaded audio file to the temporary file
-    temp_audio_file.write(open(path, "rb").read())
-    temp_audio_file.flush()
-    # Load the audio file using torchaudio
-    speech_array, _sampling_rate = torchaudio.load(temp_audio_file.name)
-    resampler = torchaudio.transforms.Resample(_sampling_rate)
-    speech = resampler(speech_array).squeeze().numpy()
-    return speech
-def predict(path, sampling_rate):
-    speech = speech_file_to_array_fn(path, sampling_rate)
-    inputs = model(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
-    inputs = {key: inputs[key].to(device) for key in inputs}
-    with torch.no_grad():
-        logits = model(**inputs).logits
-    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
-    outputs = [{"Label": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
-    return outputs
-def SER(audio):
-    return predict(audio,model.sampling_rate)
-iface = gr.Interface(fn=SER, inputs="audio", outputs="text")
 iface.launch(share=False)

 import tempfile
 import torch
+import torch.nn as nn
+import torch.nn.functional as F
 import torchaudio
 import gradio as gr
+from transformers import Wav2Vec2FeatureExtractor,AutoConfig
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.models.wav2vec2.modeling_wav2vec2 import (
+    Wav2Vec2PreTrainedModel,
+    Wav2Vec2Model
+)
+from transformers.models.hubert.modeling_hubert import (
+    HubertPreTrainedModel,
+    HubertModel
+)
 config = AutoConfig.from_pretrained("SeyedAli/Persian-Speech-Emotion-HuBert-V1")
 model = Wav2Vec2FeatureExtractor.from_pretrained("SeyedAli/Persian-Speech-Emotion-HuBert-V1")
+audio_input = gr.Audio(label="صوت گفتار فارسی",type="filepath")
+text_output = gr.TextArea(label="هیجان موجود در صوت گفتار",text_align="right",rtl=True,type="text")
+def SER(audio):
+    with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio_file:
+        # Copy the contents of the uploaded audio file to the temporary file
+        temp_audio_file.write(open(audio, "rb").read())
+        temp_audio_file.flush()
+        # Load the audio file using torchaudio
+        speech_array, _sampling_rate = torchaudio.load(temp_audio_file.name)
+        resampler = torchaudio.transforms.Resample(_sampling_rate)
+        speech = resampler(speech_array).squeeze().numpy()
+        inputs = model(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+        inputs = {key: inputs[key].to(device) for key in inputs}
+        with torch.no_grad():
+            logits = model(**inputs).logits
+        scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
+        outputs = [{"Label": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
+        return outputs
+iface = gr.Interface(fn=SER, inputs=audio_input, outputs=text_output)
 iface.launch(share=False)