Spaces:
Build error
Build error
# -*- coding: utf-8 -*- | |
# /usr/bin/python2 | |
""" | |
By kyubyong park. [email protected]. | |
https://www.github.com/kyubyong/dc_tts | |
""" | |
from __future__ import print_function, division | |
import numpy as np | |
import librosa | |
import os, copy | |
import matplotlib | |
matplotlib.use("pdf") | |
import matplotlib.pyplot as plt | |
from scipy import signal | |
from .audio_params import Hyperparams as hp | |
import tensorflow as tf | |
def get_spectrograms(fpath):
    """Parse the wave file in `fpath` and return normalized spectrograms.

    Args:
      fpath: A string. The full path of a sound file.

    Returns:
      mel: A 2d array of shape (T, n_mels) and dtype of float32.
      mag: A 2d array of shape (T, 1+n_fft//2) and dtype of float32.
    """
    # Load the sound file at the configured sample rate.
    y, _ = librosa.load(fpath, sr=hp.sr)

    # Trimming
    y, _ = librosa.effects.trim(y)

    # Preemphasis: y[n] - preemphasis * y[n-1]; the first sample is kept as is.
    y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1])

    # stft
    linear = librosa.stft(
        y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length
    )

    # magnitude spectrogram
    mag = np.abs(linear)  # (1+n_fft//2, T)

    # mel spectrogram
    # Keyword arguments are required by librosa >= 0.10 (positional use was
    # removed there) and are accepted by older releases as well.
    mel_basis = librosa.filters.mel(
        sr=hp.sr, n_fft=hp.n_fft, n_mels=hp.n_mels
    )  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, T)

    # to decibel; the 1e-5 floor avoids log10(0)
    mel = 20 * np.log10(np.maximum(1e-5, mel))
    mag = 20 * np.log10(np.maximum(1e-5, mag))

    # normalize, clipping the result into [1e-8, 1]
    mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
    mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)

    # Transpose so that time is the leading axis
    mel = mel.T.astype(np.float32)  # (T, n_mels)
    mag = mag.T.astype(np.float32)  # (T, 1+n_fft//2)
    return mel, mag
def spectrogram2wav(mag):
    """Generate a waveform from a normalized linear magnitude spectrogram.

    Args:
      mag: A numpy array of (T, 1+n_fft//2)

    Returns:
      wav: A 1-D numpy array of dtype float32.
    """
    # Put frequency on the leading axis, then undo the [0, 1] normalization
    # to get back to decibels.
    transposed = mag.T
    decibels = (np.clip(transposed, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db

    # Decibels back to amplitude.
    amplitude = np.power(10.0, decibels * 0.05)

    # Reconstruct a waveform from the amplitude raised to hp.power.
    emphasized = griffin_lim(amplitude ** hp.power)

    # Undo the pre-emphasis with the inverse filter.
    restored = signal.lfilter([1], [1, -hp.preemphasis], emphasized)

    # Trim the reconstructed signal.
    trimmed, _ = librosa.effects.trim(restored)
    return trimmed.astype(np.float32)
def griffin_lim(spectrogram):
    """Reconstruct a waveform from a magnitude spectrogram with Griffin-Lim.

    Runs `hp.n_iter` rounds of: invert the current estimate to a signal,
    re-analyse it with an STFT, keep only the phase of that STFT and combine
    it with the given magnitudes.

    Args:
      spectrogram: Magnitude spectrogram; presumably [1+n_fft//2, t], the
        layout expected by `invert_spectrogram` -- confirm against callers.

    Returns:
      The real part of the signal obtained by inverting the final estimate.
    """
    X_best = copy.deepcopy(spectrogram)
    for _ in range(hp.n_iter):
        X_t = invert_spectrogram(X_best)
        # Keyword arguments: librosa >= 0.10 rejects positional n_fft /
        # hop_length; older releases accept the keyword form too.
        est = librosa.stft(
            y=X_t,
            n_fft=hp.n_fft,
            hop_length=hp.hop_length,
            win_length=hp.win_length,
        )
        # Unit-magnitude phase; the 1e-8 floor avoids division by zero.
        phase = est / np.maximum(1e-8, np.abs(est))
        X_best = spectrogram * phase
    X_t = invert_spectrogram(X_best)
    return np.real(X_t)
def invert_spectrogram(spectrogram):
    """Applies inverse fft.

    Args:
      spectrogram: [1+n_fft//2, t]

    Returns:
      The signal returned by `librosa.istft` using the configured hop and
      window lengths and a "hann" window.
    """
    # hop_length is passed by keyword: librosa >= 0.10 no longer accepts it
    # positionally, and older releases accept the keyword form as well.
    return librosa.istft(
        spectrogram,
        hop_length=hp.hop_length,
        win_length=hp.win_length,
        window="hann",
    )
def plot_alignment(alignment, gs, dir=hp.logdir):
    """Plots the alignment and saves it as a PNG file.

    The image is written to `<dir>/alignment_<gs>.png`; `dir` is created
    (including missing parent directories) when it does not exist yet.

    Args:
      alignment: A numpy array with shape of (encoder_steps, decoder_steps)
      gs: (int) global step.
      dir: Output path.

    Raises:
      OSError: If `dir` cannot be created and is not already a directory.
    """
    # makedirs handles nested paths; tolerating "already exists" avoids the
    # check-then-create race (kept py2-compatible, hence no exist_ok).
    try:
        os.makedirs(dir)
    except OSError:
        if not os.path.isdir(dir):
            raise

    fig, ax = plt.subplots()
    im = ax.imshow(alignment)
    fig.colorbar(im)
    # Use the explicit figure/axes rather than pyplot's implicit "current"
    # ones so the title and file always belong to this figure.
    ax.set_title("{} Steps".format(gs))
    fig.savefig(os.path.join(dir, "alignment_{}.png".format(gs)), format="png")
    plt.close(fig)
def guided_attention(g=0.2):
    """Guided attention. Refer to page 3 on the paper.

    Builds the weight matrix
    W[n, t] = 1 - exp(-(t / max_T - n / max_N) ** 2 / (2 * g * g)).

    Args:
      g: Width parameter of the Gaussian-shaped term. Must be non-zero.

    Returns:
      A float32 numpy array of shape (hp.max_N, hp.max_T).

    Raises:
      ZeroDivisionError: If `2 * g * g` is zero.
    """
    denom = 2 * g * g
    if denom == 0:
        # Raise explicitly: array division would only warn and produce
        # nan/inf values instead of failing.
        raise ZeroDivisionError("float division by zero")

    # Relative positions along each axis, computed in float64.
    n_ratio = np.arange(hp.max_N) / float(hp.max_N)  # (max_N,)
    t_ratio = np.arange(hp.max_T) / float(hp.max_T)  # (max_T,)

    # Broadcasting builds the (max_N, max_T) grid without Python-level loops.
    diff = t_ratio[np.newaxis, :] - n_ratio[:, np.newaxis]
    W = 1 - np.exp(-(diff ** 2) / denom)
    return W.astype(np.float32)
def learning_rate_decay(init_lr, global_step, warmup_steps=4000.0):
    """Noam scheme from tensor2tensor.

    With step = global_step + 1, the result is
    init_lr * warmup_steps**0.5 * min(step * warmup_steps**-1.5, step**-0.5),
    i.e. init_lr * step / warmup_steps while step < warmup_steps and
    init_lr * sqrt(warmup_steps / step) afterwards.

    Args:
      init_lr: Base learning rate.
      global_step: Global step; presumably an integer tensor -- confirm
        against the caller.
      warmup_steps: Number of warm-up steps.

    Returns:
      The decayed learning rate as a float32 tensor expression.
    """
    # tf.cast replaces the deprecated tf.to_float (absent from the TF2
    # namespace); tf.to_float(x) is defined as tf.cast(x, tf.float32).
    step = tf.cast(global_step + 1, tf.float32)
    return (
        init_lr
        * warmup_steps ** 0.5
        * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5)
    )
def load_spectrograms(fpath):
    """Read the wave file in `fpath` and extract its spectrograms.

    Args:
      fpath: Path of the sound file.

    Returns:
      fname: Base name of `fpath`.
      mel: Mel spectrogram, zero-padded along its first axis to a multiple
        of hp.r and then subsampled to every hp.r-th row.
      mag: Linear spectrogram with the same zero-padding, not subsampled.
    """
    fname = os.path.basename(fpath)
    mel, mag = get_spectrograms(fpath)

    # Marginal padding so the frame count becomes a multiple of hp.r.
    remainder = mel.shape[0] % hp.r
    num_paddings = hp.r - remainder if remainder else 0
    pad_width = [[0, num_paddings], [0, 0]]
    mel = np.pad(mel, pad_width, mode="constant")
    mag = np.pad(mag, pad_width, mode="constant")

    # Reduction: only the mel spectrogram is subsampled.
    reduced_mel = mel[:: hp.r, :]
    return fname, reduced_mel, mag