upload config
Browse files
- config/decoder.yaml +12 -0
- config/dvae.yaml +14 -0
- config/gpt.yaml +20 -0
- config/path.yaml +11 -0
- config/vocos.yaml +24 -0
config/decoder.yaml
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
dim: 384
|
4 |
+
|
5 |
+
decoder_config:
|
6 |
+
  idim: ${dim}
|
7 |
+
  odim: ${dim}
|
8 |
+
  hidden: 512
|
9 |
+
  n_layer: 12
|
10 |
+
  bn_dim: 128
|
11 |
+
|
12 |
+
vq_config: null
|
config/dvae.yaml
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
dim: 512
|
4 |
+
decoder_config:
|
5 |
+
  idim: ${dim}
|
6 |
+
  odim: ${dim}
|
7 |
+
  n_layer: 12
|
8 |
+
  bn_dim: 128
|
9 |
+
|
10 |
+
vq_config:
|
11 |
+
  dim: 1024
|
12 |
+
  levels: [5,5,5,5]
|
13 |
+
  G: 2
|
14 |
+
  R: 2
|
config/gpt.yaml
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
num_audio_tokens: 626
|
4 |
+
num_text_tokens: 21178
|
5 |
+
|
6 |
+
gpt_config:
|
7 |
+
  hidden_size: 768
|
8 |
+
  intermediate_size: 3072
|
9 |
+
  num_attention_heads: 12
|
10 |
+
  num_hidden_layers: 20
|
11 |
+
  use_cache: False
|
12 |
+
  max_position_embeddings: 4096
|
13 |
+
# attn_implementation: flash_attention_2
|
14 |
+
|
15 |
+
  spk_emb_dim: 192
|
16 |
+
  spk_KL: False
|
17 |
+
  num_audio_tokens: 626
|
18 |
+
  num_text_tokens: null
|
19 |
+
  num_vq: 4
|
20 |
+
|
config/path.yaml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
vocos_config_path: config/vocos.yaml
|
4 |
+
vocos_ckpt_path: asset/Vocos.pt
|
5 |
+
dvae_config_path: config/dvae.yaml
|
6 |
+
dvae_ckpt_path: asset/DVAE.pt
|
7 |
+
gpt_config_path: config/gpt.yaml
|
8 |
+
gpt_ckpt_path: asset/GPT.pt
|
9 |
+
decoder_config_path: config/decoder.yaml
|
10 |
+
decoder_ckpt_path: asset/Decoder.pt
|
11 |
+
tokenizer_path: asset/tokenizer.pt
|
config/vocos.yaml
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
feature_extractor:
|
2 |
+
  class_path: vocos.feature_extractors.MelSpectrogramFeatures
|
3 |
+
  init_args:
|
4 |
+
    sample_rate: 24000
|
5 |
+
    n_fft: 1024
|
6 |
+
    hop_length: 256
|
7 |
+
    n_mels: 100
|
8 |
+
    padding: center
|
9 |
+
|
10 |
+
backbone:
|
11 |
+
  class_path: vocos.models.VocosBackbone
|
12 |
+
  init_args:
|
13 |
+
    input_channels: 100
|
14 |
+
    dim: 512
|
15 |
+
    intermediate_dim: 1536
|
16 |
+
    num_layers: 8
|
17 |
+
|
18 |
+
head:
|
19 |
+
  class_path: vocos.heads.ISTFTHead
|
20 |
+
  init_args:
|
21 |
+
    dim: 512
|
22 |
+
    n_fft: 1024
|
23 |
+
    hop_length: 256
|
24 |
+
    padding: center
|