Spaces:
Running
Running
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
import functools | |
from torchdiffeq import odeint | |
from models.estimator import Decoder | |
# Modified from Matcha-TTS: https://github.com/shivammehta25/Matcha-TTS/blob/main/matcha/models/components/flow_matching.py
class CFMDecoder(torch.nn.Module):
    """Conditional flow matching (CFM) decoder.

    Wraps a ``Decoder`` estimator that is used as the ODE right-hand side
    during sampling (``forward``) and regressed onto the target flow during
    training (``compute_loss``). ``cfg_wrapper`` adds classifier-free guidance
    on top of the same estimator.
    """

    def __init__(
        self,
        noise_channels,
        cond_channels,
        hidden_channels,
        out_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        gin_channels,
        sigma_min=1e-4,
    ):
        """Build the decoder and its estimator network.

        Args:
            noise_channels, cond_channels, hidden_channels, out_channels,
            filter_channels, n_heads, n_layers, kernel_size, p_dropout,
            gin_channels: forwarded to ``Decoder``; the first five and
                ``gin_channels`` are also stored as attributes.
            sigma_min (float, optional): minimum noise scale used when
                building the conditional flow in ``compute_loss``.
                Defaults to 1e-4.
        """
        super().__init__()
        self.noise_channels = noise_channels
        self.cond_channels = cond_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.gin_channels = gin_channels
        self.sigma_min = sigma_min
        # NOTE: the positional order below (p_dropout, n_layers, n_heads,
        # kernel_size) differs from this constructor's own parameter order;
        # it is kept exactly as in the original call to ``Decoder``.
        self.estimator = Decoder(
            noise_channels,
            cond_channels,
            hidden_channels,
            out_channels,
            filter_channels,
            p_dropout,
            n_layers,
            n_heads,
            kernel_size,
            gin_channels,
        )

    def forward(self, mu, mask, n_timesteps, temperature=1.0, c=None, solver=None, cfg_kwargs=None):
        """Sample by integrating the estimator from t=0 (noise) to t=1.

        Args:
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): output_mask
                shape: (batch_size, 1, mel_timesteps)
            n_timesteps (int): number of integration intervals; the time grid
                has ``n_timesteps + 1`` evenly spaced points in [0, 1].
            temperature (float, optional): scale applied to the initial
                Gaussian noise. Defaults to 1.0.
            c (torch.Tensor, optional): speaker embedding
                shape: (batch_size, gin_channels)
            solver: solver name passed to ``odeint`` as ``method``; see
                https://github.com/rtqichen/torchdiffeq for supported solvers.
                ``None`` defers to torchdiffeq's default.
            cfg_kwargs (dict, optional): when given, sampling goes through
                ``cfg_wrapper``; must contain the keys ``'fake_speaker'``,
                ``'fake_content'`` and ``'cfg_strength'``.

        Returns:
            sample: generated mel-spectrogram (final state of the trajectory)
                shape: (batch_size, n_feats, mel_timesteps)
        """
        z = torch.randn_like(mu) * temperature
        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)

        # odeint calls the function as f(t, x); bind everything else up front.
        if cfg_kwargs is None:
            estimator = functools.partial(self.estimator, mask=mask, mu=mu, c=c)
        else:
            estimator = functools.partial(self.cfg_wrapper, mask=mask, mu=mu, c=c, cfg_kwargs=cfg_kwargs)

        trajectory = odeint(estimator, z, t_span, method=solver, rtol=1e-5, atol=1e-5)
        # Only the state at the last time point (t=1) is the sample.
        return trajectory[-1]

    def cfg_wrapper(self, t, x, mask, mu, c, cfg_kwargs):
        """Evaluate the estimator with classifier-free guidance.

        Runs the estimator once with the real conditions (``mu``, ``c``) and
        once with the placeholder conditions from ``cfg_kwargs``, then
        extrapolates from the unconditional output towards the conditional one.

        Args:
            t (torch.Tensor): current ODE time.
            x (torch.Tensor): current ODE state.
            mask (torch.Tensor): output mask.
            mu (torch.Tensor): real content condition.
            c (torch.Tensor): real speaker condition.
            cfg_kwargs (dict): must provide
                ``'fake_speaker'``: tiled along dim 0 to the batch size
                    (presumably shape (1, gin_channels) -- TODO confirm),
                ``'fake_content'``: tiled along dim 0 to the batch size and
                    along the last dim to ``x.size(-1)`` (presumably shape
                    (1, n_feats, 1) -- TODO confirm),
                ``'cfg_strength'``: guidance weight; 1.0 reproduces the
                    conditional output, 0.0 the unconditional one.

        Returns:
            torch.Tensor: ``uncond + cfg_strength * (cond - uncond)``.
        """
        fake_speaker = cfg_kwargs['fake_speaker'].repeat(x.size(0), 1)
        fake_content = cfg_kwargs['fake_content'].repeat(x.size(0), 1, x.size(-1))
        cfg_strength = cfg_kwargs['cfg_strength']

        cond_output = self.estimator(t, x, mask, mu, c)
        uncond_output = self.estimator(t, x, mask, fake_content, fake_speaker)
        return uncond_output + cfg_strength * (cond_output - uncond_output)

    def compute_loss(self, x1, mask, mu, c):
        """Compute the conditional flow matching loss.

        Args:
            x1 (torch.Tensor): Target
                shape: (batch_size, n_feats, mel_timesteps)
            mask (torch.Tensor): target mask
                shape: (batch_size, 1, mel_timesteps)
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_feats, mel_timesteps)
            c (torch.Tensor, optional): speaker condition.

        Returns:
            loss: masked sum of squared errors between the estimator output
                and the target flow, divided by ``sum(mask) * n_feats``
            y: conditional flow (noisy interpolation fed to the estimator)
                shape: (batch_size, n_feats, mel_timesteps)
        """
        b = mu.size(0)

        # Random timestep per sample, warped by the cosine scheduler from
        # CosyVoice:
        # https://github.com/FunAudioLLM/CosyVoice/blob/main/cosyvoice/flow/flow_matching.py
        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
        t = 1 - torch.cos(t * 0.5 * torch.pi)

        # Sample noise p(x_0) and build the interpolated point y and the
        # regression target u.
        z = torch.randn_like(x1)
        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
        u = x1 - (1 - self.sigma_min) * z

        # NOTE: squeeze() yields shape (batch_size,) for batch_size > 1 and a
        # 0-dim tensor for batch_size == 1.
        pred = self.estimator(t.squeeze(), y, mask, mu, c)

        # Mask both sides so padded frames do not contribute to the numerator;
        # the denominator already counts only unmasked frames.
        loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.size(1))
        return loss, y