Spaces:

nouamanetazi
/

emotion_recognition

Build error

App Files Files Community

emotion_recognition / utils /audio.py

nouamanetazi HF staff

linting

c731c61 over 2 years ago

raw

history blame contribute delete

4.66 kB

	# -*- coding: utf-8 -*-
	# /usr/bin/python2
	"""
	By kyubyong park. [email protected].
	https://www.github.com/kyubyong/dc_tts
	"""
	from __future__ import print_function, division

	import numpy as np
	import librosa
	import os, copy
	import matplotlib

	matplotlib.use("pdf")
	import matplotlib.pyplot as plt
	from scipy import signal

	from .audio_params import Hyperparams as hp
	import tensorflow as tf


	def get_spectrograms(fpath):
	    """Parse the wave file in `fpath` and return normalized spectrograms.

	    Args:
	      fpath: A string. The full path of a sound file.

	    Returns:
	      mel: A 2d array of shape (T, n_mels) and dtype of float32.
	      mag: A 2d array of shape (T, 1+n_fft//2) and dtype of float32.
	    """
	    # Load the sound file, resampled to the configured rate.
	    y, _ = librosa.load(fpath, sr=hp.sr)

	    # Trim leading/trailing silence.
	    y, _ = librosa.effects.trim(y)

	    # Pre-emphasis: y[n] - preemphasis * y[n-1]; the first sample is kept as is.
	    y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1])

	    # Short-time Fourier transform.
	    linear = librosa.stft(
	        y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length
	    )

	    # Magnitude spectrogram.
	    mag = np.abs(linear)  # (1+n_fft//2, T)

	    # Mel spectrogram. Keyword arguments are used because recent librosa
	    # releases reject positional sr/n_fft/n_mels.
	    mel_basis = librosa.filters.mel(
	        sr=hp.sr, n_fft=hp.n_fft, n_mels=hp.n_mels
	    )  # (n_mels, 1+n_fft//2)
	    mel = np.dot(mel_basis, mag)  # (n_mels, T)

	    # To decibel; the 1e-5 floor avoids log10(0).
	    mel = 20 * np.log10(np.maximum(1e-5, mel))
	    mag = 20 * np.log10(np.maximum(1e-5, mag))

	    # Normalize with ref_db/max_db and clip into [1e-8, 1].
	    mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
	    mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)

	    # Transpose so that time is the leading axis.
	    mel = mel.T.astype(np.float32)  # (T, n_mels)
	    mag = mag.T.astype(np.float32)  # (T, 1+n_fft//2)

	    return mel, mag


	def spectrogram2wav(mag):
	    """Generate a waveform from a normalized linear magnitude spectrogram.

	    Args:
	      mag: A numpy array of (T, 1+n_fft//2)

	    Returns:
	      wav: A 1-D numpy array of dtype float32.
	    """
	    # Transpose to (1+n_fft//2, T) and clamp to the normalized range.
	    clipped = np.clip(mag.T, 0, 1)

	    # De-normalize back to decibels.
	    decibels = (clipped * hp.max_db) - hp.max_db + hp.ref_db

	    # Decibels to amplitude.
	    amplitude = np.power(10.0, decibels * 0.05)

	    # Waveform reconstruction from the sharpened magnitudes.
	    waveform = griffin_lim(amplitude ** hp.power)

	    # De-preemphasis: inverse of the pre-emphasis filter.
	    waveform = signal.lfilter([1], [1, -hp.preemphasis], waveform)

	    # Trim leading/trailing silence.
	    trimmed, _ = librosa.effects.trim(waveform)

	    return trimmed.astype(np.float32)


	def griffin_lim(spectrogram):
	    """Reconstruct a waveform from a magnitude spectrogram with Griffin-Lim.

	    Args:
	      spectrogram: Magnitude spectrogram laid out as [1+n_fft//2, t].

	    Returns:
	      The real part of the signal obtained by inverting the final estimate.
	    """
	    X_best = copy.deepcopy(spectrogram)
	    for _ in range(hp.n_iter):
	        X_t = invert_spectrogram(X_best)
	        # Keyword arguments: recent librosa releases reject positional
	        # n_fft/hop_length, and this matches the stft call in get_spectrograms.
	        est = librosa.stft(
	            y=X_t,
	            n_fft=hp.n_fft,
	            hop_length=hp.hop_length,
	            win_length=hp.win_length,
	        )
	        # Keep only the phase of the estimate; the floor avoids division by zero.
	        phase = est / np.maximum(1e-8, np.abs(est))
	        # Combine the given magnitudes with the estimated phase.
	        X_best = spectrogram * phase
	    # Final inversion with the last phase estimate.
	    X_t = invert_spectrogram(X_best)
	    y = np.real(X_t)

	    return y


	def invert_spectrogram(spectrogram):
	    """Applies inverse fft.

	    Args:
	      spectrogram: [1+n_fft//2, t]

	    Returns:
	      The time-domain signal returned by `librosa.istft`.
	    """
	    # hop_length is passed by keyword because recent librosa releases
	    # accept only the STFT matrix positionally.
	    return librosa.istft(
	        spectrogram,
	        hop_length=hp.hop_length,
	        win_length=hp.win_length,
	        window="hann",
	    )


	def plot_alignment(alignment, gs, dir=hp.logdir):
	    """Plots the alignment and saves it as `alignment_<gs>.png` in `dir`.

	    Args:
	      alignment: A numpy array with shape of (encoder_steps, decoder_steps)
	      gs: (int) global step.
	      dir: Output path. Created (including missing parents) if absent.
	    """
	    # makedirs rather than mkdir so a nested output path such as "a/b"
	    # does not fail when "a" does not exist yet.
	    if not os.path.exists(dir):
	        os.makedirs(dir)

	    fig, ax = plt.subplots()
	    im = ax.imshow(alignment)

	    fig.colorbar(im)
	    # Address the figure/axes explicitly instead of relying on pyplot's
	    # implicit "current figure" state.
	    ax.set_title("{} Steps".format(gs))
	    fig.savefig("{}/alignment_{}.png".format(dir, gs), format="png")
	    # Release the figure so repeated calls do not accumulate open figures.
	    plt.close(fig)


	def guided_attention(g=0.2):
	    """Build the guided-attention weight matrix (page 3 of the paper).

	    Args:
	      g: Width parameter; each entry uses 2 * g * g as the denominator
	        inside the exponential.

	    Returns:
	      A float32 array of shape (hp.max_N, hp.max_T) whose (n, t) entry is
	      1 - exp(-(t / max_T - n / max_N) ** 2 / (2 * g * g)).
	    """
	    weights = np.zeros((hp.max_N, hp.max_T), dtype=np.float32)
	    n_rows, n_cols = weights.shape
	    for row in range(n_rows):
	        for col in range(n_cols):
	            # Difference between the relative column and row positions.
	            gap = col / float(hp.max_T) - row / float(hp.max_N)
	            weights[row, col] = 1 - np.exp(-(gap ** 2) / (2 * g * g))
	    return weights


	def learning_rate_decay(init_lr, global_step, warmup_steps=4000.0):
	    """Noam scheme from tensor2tensor.

	    Computes
	        init_lr * warmup_steps**0.5 * min(step * warmup_steps**-1.5, step**-0.5)
	    with step = global_step + 1, i.e. a linear warm-up over `warmup_steps`
	    steps followed by inverse-square-root decay.

	    Args:
	      init_lr: Base learning rate.
	      global_step: Current global step (0-based); may be a tensor.
	      warmup_steps: Number of warm-up steps.

	    Returns:
	      A float32 tensor holding the learning rate for this step.
	    """
	    # tf.cast works in both TF1 and TF2; tf.to_float was removed in TF2.
	    step = tf.cast(global_step + 1, tf.float32)
	    return (
	        init_lr
	        * warmup_steps ** 0.5
	        # The exponents are essential: without `**` this degenerates into
	        # plain subtractions and no longer implements the schedule.
	        * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5)
	    )


	def load_spectrograms(fpath):
	    """Read the wave file in `fpath` and extract padded spectrograms.

	    Args:
	      fpath: Path of the sound file.

	    Returns:
	      fname: Base name of `fpath`.
	      mel: Mel spectrogram, zero-padded along time to a multiple of hp.r
	        and then subsampled to every hp.r-th frame.
	      mag: Linear spectrogram, zero-padded along time to the same length.
	    """
	    fname = os.path.basename(fpath)
	    mel, mag = get_spectrograms(fpath)

	    # Marginal padding so the frame count is divisible by the reduction factor.
	    remainder = mel.shape[0] % hp.r
	    if remainder != 0:
	        num_paddings = hp.r - remainder
	    else:
	        num_paddings = 0
	    pad_spec = [[0, num_paddings], [0, 0]]
	    mel = np.pad(mel, pad_spec, mode="constant")
	    mag = np.pad(mag, pad_spec, mode="constant")

	    # Reduction: keep every hp.r-th mel frame.
	    reduced_mel = mel[:: hp.r, :]
	    return fname, reduced_mel, mag