import torch
import torch.nn as nn
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
import gradio as gr
from joblib import load

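# Class indices the classifier was trained with, keyed by language name.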
language_mapping = {'malayalam': 0, 'english': 1, 'tamil': 2, 'hindi': 3}


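# Small feed-forward head that maps the 300-dimensional PCA features to 4 language logits.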
class ANNModel(nn.Module):
    def __init__(self):
        super(ANNModel, self).__init__()
        self.fc1 = nn.Linear(300, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 4)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x


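# Load the trained classifier weights and the PCA transform fitted during training.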
ann_model = ANNModel()
ann_model.load_state_dict(torch.load('ann_model_256_01_94.pth', map_location='cpu'))

pca = load('pca_256_01_94.pkl')

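# Pretrained VGG16 convolutional layers, used here as a frozen feature extractor.
# (Recent torchvision releases prefer weights=models.VGG16_Weights.DEFAULT over pretrained=True.)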
vgg16 = models.vgg16(pretrained=True).features


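# Convert one clip of raw audio into the PCA-reduced VGG16 feature vector the classifier expects.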
def preprocess_single_audio_vgg16(audio_data, sr, vgg16_model, pca_instance):
    y = audio_data

    # Log-scaled mel spectrogram, standardized to zero mean and unit variance.
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / np.std(log_mel_spec)

    # Resize to 224x224 and stack into 3 identical channels to match VGG16's expected input.
    target_shape = (224, 224)
    resized_mel_spec = zoom(
        norm_mel_spec,
        (target_shape[0] / norm_mel_spec.shape[0],
         target_shape[1] / norm_mel_spec.shape[1]),
        mode='nearest',
    )
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)

    # (H, W, C) -> (1, C, H, W) float tensor.
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()

    # Extract convolutional features without tracking gradients.
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Flatten the feature map and reduce it with the PCA fitted during training.
    features_np = features.squeeze().numpy()
    features_flattened = features_np.flatten().reshape(1, -1)
    features_pca = pca_instance.transform(features_flattened)

    features_tensor = torch.from_numpy(features_pca).float()
    return features_tensor


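# Predict the language of one clip; accepts a file path or a Gradio (sample_rate, samples) tuple.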
def predict_language(audio_input):
    if isinstance(audio_input, str):
        # A file path: load and resample with librosa.
        audio, sr = librosa.load(audio_input, sr=22050)
    else:
        # A (sample_rate, samples) tuple from Gradio; the samples may be integer
        # PCM and stereo, so cast to float and average the channels to mono.
        sr, audio = audio_input
        audio = audio.astype(np.float32)
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

    preprocessed_features = preprocess_single_audio_vgg16(audio, sr, vgg16, pca)

    # Classify and map the winning class index back to its language name.
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
        _, predicted_class = torch.max(output, 1)

    predicted_label = {v: k for k, v in language_mapping.items()}[predicted_class.item()]
    return predicted_label


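# Minimal Gradio demo: submit audio, get the predicted language back as text.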
iface = gr.Interface(fn=predict_language, inputs="audio", outputs="text")
iface.launch()