File size: 1,002 Bytes
4f6613a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# Root entry of this config: a dotted path ending in FireflyArchitecture.
# NOTE(review): `_target_` looks like Hydra-style instantiation, with the
# sibling top-level keys (spec_transform, backbone, head, quantizer) passed
# as constructor arguments -- confirm against the code that loads this file.
_target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
# Settings for the `spec_transform` component; its `_target_` path ends in
# LogMelSpectrogram.
spec_transform:
  _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
  # NOTE(review): presumably in Hz -- confirm.
  sample_rate: 44100
  # Same value as backbone.input_channels (160).
  # NOTE(review): presumably the two must be changed together -- confirm.
  n_mels: 160
  n_fft: 2048
  # Same value as head.hop_length (512). If sample_rate is in Hz and this is
  # in samples, 44100 / 512 is about 86.1 hops per second -- TODO confirm.
  hop_length: 512
  # Same value as n_fft (2048).
  win_length: 2048
# Settings for the `backbone` component; its `_target_` path ends in
# ConvNeXtEncoder.
backbone:
  _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
  # Same value as spec_transform.n_mels (160).
  input_channels: 160
  # `depths` and `dims` each have four entries.
  # NOTE(review): presumably paired by index, one pair per encoder stage
  # (block count and channel width) -- confirm against ConvNeXtEncoder.
  depths: [3, 3, 9, 3]
  # The last entry (512) equals head.num_mels and quantizer.input_dim.
  dims: [128, 256, 384, 512]
  drop_path_rate: 0.2
  kernel_size: 7
# Settings for the `head` component; its `_target_` path ends in
# HiFiGANGenerator.
head:
  _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
  # Same value as spec_transform.hop_length (512).
  hop_length: 512
  # The product of the rates is 8 * 8 * 2 * 2 * 2 = 512, equal to hop_length.
  # NOTE(review): presumably required so the output length lines up with the
  # spectrogram hop -- confirm before changing either value.
  upsample_rates: [8, 8, 2, 2, 2]  # aka. strides
  # Five entries, like upsample_rates; each value is twice the rate at the
  # same index (16 = 2 * 8, 4 = 2 * 2).
  upsample_kernel_sizes: [16, 16, 4, 4, 4]
  # Three kernel sizes here and three dilation lists below.
  # NOTE(review): presumably paired by index -- confirm.
  resblock_kernel_sizes: [3, 7, 11]
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  # Equals the last backbone.dims entry and quantizer.input_dim (512), not
  # spec_transform.n_mels (160).
  # NOTE(review): presumably the head's input channel count -- confirm.
  num_mels: 512
  upsample_initial_channel: 512
  # The pre- and post-convolution kernel sizes are both 13.
  pre_conv_kernel_size: 13
  post_conv_kernel_size: 13
# Settings for the `quantizer` component; its `_target_` path ends in
# DownsampleFiniteScalarQuantize.
quantizer:
  _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
  # Equals the last backbone.dims entry and head.num_mels (512).
  input_dim: 512
  n_groups: 8
  n_codebooks: 1
  # The product of the levels is 8 * 5 * 5 * 5 = 1000.
  # NOTE(review): presumably the number of distinct codes per codebook --
  # confirm against the quantizer implementation.
  levels: [8, 5, 5, 5]
  # The product of the factors is 2 * 2 = 4.
  # NOTE(review): presumably the total downsampling applied along time
  # before quantization -- confirm.
  downsample_factor: [2, 2]