import torch
import torch.nn as nn
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
import gradio as gr
from joblib import load
# Label indices used during training; the inverse map turns a predicted
# class index back into a language name.
language_mapping = {'malayalam': 0, 'english': 1, 'tamil': 2, 'hindi': 3}
index_to_language = {v: k for k, v in language_mapping.items()}
class ANNModel(nn.Module):
    """Small fully connected classifier: 300 PCA features -> 4 language logits."""
    def __init__(self):
        super(ANNModel, self).__init__()
        self.fc1 = nn.Linear(300, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 4)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x
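
# Quick shape check (illustrative, safe to delete): a random 300-dim feature
# vector should map to one logit per language (4 outputs).
with torch.no_grad():
    assert ANNModel()(torch.randn(1, 300)).shape == (1, 4)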
# Create an instance of the model and load the trained weights.
# map_location='cpu' keeps the load working on machines without a GPU.
ann_model = ANNModel()
ann_model.load_state_dict(torch.load('ann_model_256_01_94.pth', map_location='cpu'))
# Load the fitted PCA instance (reduces flattened VGG16 features to 300 dims).
pca = load('pca_256_01_94.pkl')
# Pretrained VGG16 convolutional stack, used as a fixed feature extractor.
# (On torchvision >= 0.13, pretrained=True is deprecated in favour of
# weights=models.VGG16_Weights.DEFAULT.)
vgg16 = models.vgg16(pretrained=True).features
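
# Sanity check (assumption: the saved PCA was fit with 300 components, to
# match fc1's input size above; n_components_ is the standard sklearn
# attribute set when a PCA is fitted).
assert pca.n_components_ == 300, "PCA output size must match the ANN input size"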
# Load and preprocess a single audio clip into a 300-dim feature tensor.
def preprocess_single_audio_vgg16(audio_data, sr, vgg16_model, pca_instance):
    y = audio_data
    # Compute the Mel spectrogram, log-scale it, and normalize per clip.
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / np.std(log_mel_spec)
    # Resize the spectrogram to the VGG16 input size (224, 224) using zoom.
    target_shape = (224, 224)
    resized_mel_spec = zoom(
        norm_mel_spec,
        (target_shape[0] / norm_mel_spec.shape[0], target_shape[1] / norm_mel_spec.shape[1]),
        mode='nearest',
    )
    # Stack the single channel three times to mimic an RGB image.
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)
    # To tensor: channels first, plus a batch dimension.
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()
    # Extract convolutional features with VGG16 (no gradients needed).
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)
    # Flatten the feature map and reduce it to 300 dimensions with PCA.
    features_np = features.squeeze().detach().numpy()
    features_flattened = features_np.flatten().reshape(1, -1)
    features_pca = pca_instance.transform(features_flattened)
    # Return a float tensor ready for the ANN classifier.
    features_tensor = torch.from_numpy(features_pca).float()
    return features_tensor
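
# Example usage (illustrative, kept commented so the app starts without the
# extra VGG16 forward pass): one second of white noise at 22.05 kHz should
# come out as a single row of 300 PCA features.
# _noise = np.random.randn(22050).astype(np.float32)
# _feats = preprocess_single_audio_vgg16(_noise, 22050, vgg16, pca)
# print(_feats.shape)  # torch.Size([1, 300])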
def predict_language(audio_input):
    # Gradio may hand us a file path or a (sample_rate, samples) tuple.
    if isinstance(audio_input, str):
        # librosa.load returns mono float32 samples in [-1, 1].
        audio, sr = librosa.load(audio_input, sr=22050)
    else:
        sr, audio = audio_input
        audio = audio.astype(np.float32)
        # Gradio delivers raw PCM (often int16, 2-D when stereo): downmix to
        # mono and rescale to [-1, 1] to match librosa-loaded audio. The exact
        # scale matters little since the spectrogram is standardized per clip.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio /= peak
    # Extract VGG16 + PCA features for this clip.
    preprocessed_features = preprocess_single_audio_vgg16(audio, sr, vgg16, pca)
    # Run the classifier and pick the highest-scoring language.
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
    _, predicted_class = torch.max(output, 1)
    # Map the predicted class index back to its language name.
    return index_to_language[predicted_class.item()]
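
# Example call (hypothetical filename, kept commented so the app launches
# without needing a local file):
# print(predict_language('sample_clip.wav'))  # e.g. -> 'tamil'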
# Minimal Gradio UI: record or upload audio, get the predicted language back.
iface = gr.Interface(fn=predict_language, inputs="audio", outputs="text")
iface.launch()