ftshijt committed on
Commit
2cc117a
1 Parent(s): 72e0752

Update model

Browse files
README.md CHANGED
@@ -1,3 +1,377 @@
1
  ---
2
- license: apache-2.0
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - singing-voice-synthesis
6
+ language: zh
7
+ datasets:
8
+ - m4singer
9
+ license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 SVS model
13
+
14
+ ### `espnet/m4singer_svs_naive_rnn_dp`
15
+
16
+ This model was trained by ftshijt using the m4singer recipe in [ESPnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 5c4d7cf7feba8461de2e1080bf82182f0efaef38
26
+ pip install -e .
27
+ cd egs2/m4singer/svs1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/m4singer_svs_naive_rnn_dp
29
+ ```
30
+
31
+
32
+
33
+ ## SVS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/train_naive_rnn_dp.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: sequence
44
+ valid_iterator_type: null
45
+ output_dir: exp/svs_train_naive_rnn_dp_raw_phn_None_zh
46
+ ngpu: 1
47
+ seed: 0
48
+ num_workers: 8
49
+ num_att_plot: 3
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: null
53
+ dist_rank: null
54
+ local_rank: 0
55
+ dist_master_addr: null
56
+ dist_master_port: null
57
+ dist_launcher: null
58
+ multiprocessing_distributed: false
59
+ unused_parameters: false
60
+ sharded_ddp: false
61
+ cudnn_enabled: true
62
+ cudnn_benchmark: false
63
+ cudnn_deterministic: true
64
+ collect_stats: false
65
+ write_collected_feats: false
66
+ max_epoch: 500
67
+ patience: null
68
+ val_scheduler_criterion:
69
+ - valid
70
+ - loss
71
+ early_stopping_criterion:
72
+ - valid
73
+ - loss
74
+ - min
75
+ best_model_criterion:
76
+ - - valid
77
+ - loss
78
+ - min
79
+ - - train
80
+ - loss
81
+ - min
82
+ keep_nbest_models: 2
83
+ nbest_averaging_interval: 0
84
+ grad_clip: 1.0
85
+ grad_clip_type: 2.0
86
+ grad_noise: false
87
+ accum_grad: 1
88
+ no_forward_run: false
89
+ resume: true
90
+ train_dtype: float32
91
+ use_amp: false
92
+ log_interval: null
93
+ use_matplotlib: true
94
+ use_tensorboard: true
95
+ create_graph_in_tensorboard: false
96
+ use_wandb: false
97
+ wandb_project: null
98
+ wandb_id: null
99
+ wandb_entity: null
100
+ wandb_name: null
101
+ wandb_model_log_interval: -1
102
+ detect_anomaly: false
103
+ use_lora: false
104
+ save_lora_only: true
105
+ lora_conf: {}
106
+ pretrain_path: null
107
+ init_param: []
108
+ ignore_init_mismatch: false
109
+ freeze_param: []
110
+ num_iters_per_epoch: null
111
+ batch_size: 16
112
+ valid_batch_size: null
113
+ batch_bins: 1000000
114
+ valid_batch_bins: null
115
+ train_shape_file:
116
+ - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn
117
+ - exp/svs_stats_raw_phn_None_zh/train/singing_shape
118
+ valid_shape_file:
119
+ - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn
120
+ - exp/svs_stats_raw_phn_None_zh/valid/singing_shape
121
+ batch_type: sorted
122
+ valid_batch_type: null
123
+ fold_length:
124
+ - 150
125
+ - 240000
126
+ sort_in_batch: descending
127
+ shuffle_within_batch: false
128
+ sort_batch: descending
129
+ multiple_iterator: false
130
+ chunk_length: 500
131
+ chunk_shift_ratio: 0.5
132
+ num_cache_chunks: 1024
133
+ chunk_excluded_key_prefixes: []
134
+ chunk_default_fs: null
135
+ train_data_path_and_name_and_type:
136
+ - - dump/raw/tr_no_dev/text
137
+ - text
138
+ - text
139
+ - - dump/raw/tr_no_dev/wav.scp
140
+ - singing
141
+ - sound
142
+ - - dump/raw/tr_no_dev/label
143
+ - label
144
+ - duration
145
+ - - dump/raw/tr_no_dev/score.scp
146
+ - score
147
+ - score
148
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/pitch.scp
149
+ - pitch
150
+ - npy
151
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/feats.scp
152
+ - feats
153
+ - npy
154
+ - - dump/raw/tr_no_dev/utt2sid
155
+ - sids
156
+ - text_int
157
+ valid_data_path_and_name_and_type:
158
+ - - dump/raw/dev/text
159
+ - text
160
+ - text
161
+ - - dump/raw/dev/wav.scp
162
+ - singing
163
+ - sound
164
+ - - dump/raw/dev/label
165
+ - label
166
+ - duration
167
+ - - dump/raw/dev/score.scp
168
+ - score
169
+ - score
170
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/pitch.scp
171
+ - pitch
172
+ - npy
173
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/feats.scp
174
+ - feats
175
+ - npy
176
+ - - dump/raw/dev/utt2sid
177
+ - sids
178
+ - text_int
179
+ allow_variable_data_keys: false
180
+ max_cache_size: 0.0
181
+ max_cache_fd: 32
182
+ allow_multi_rates: false
183
+ valid_max_cache_size: null
184
+ exclude_weight_decay: false
185
+ exclude_weight_decay_conf: {}
186
+ optim: adam
187
+ optim_conf:
188
+ lr: 0.001
189
+ eps: 1.0e-06
190
+ weight_decay: 0.0
191
+ scheduler: null
192
+ scheduler_conf: {}
193
+ token_list:
194
+ - <blank>
195
+ - <unk>
196
+ - i
197
+ - <AP>
198
+ - <SP>
199
+ - e
200
+ - d
201
+ - uo
202
+ - ai
203
+ - sh
204
+ - u
205
+ - ian
206
+ - n
207
+ - l
208
+ - h
209
+ - x
210
+ - j
211
+ - b
212
+ - zh
213
+ - m
214
+ - en
215
+ - uei
216
+ - an
217
+ - a
218
+ - eng
219
+ - iou
220
+ - z
221
+ - g
222
+ - ang
223
+ - ing
224
+ - ou
225
+ - q
226
+ - ei
227
+ - ao
228
+ - iang
229
+ - t
230
+ - ie
231
+ - ong
232
+ - r
233
+ - iao
234
+ - ch
235
+ - k
236
+ - f
237
+ - v
238
+ - in
239
+ - uang
240
+ - uan
241
+ - c
242
+ - s
243
+ - ve
244
+ - van
245
+ - p
246
+ - uen
247
+ - o
248
+ - ia
249
+ - ua
250
+ - iong
251
+ - uai
252
+ - vn
253
+ - er
254
+ - <sos/eos>
255
+ odim: null
256
+ model_conf: {}
257
+ use_preprocessor: true
258
+ token_type: phn
259
+ bpemodel: null
260
+ non_linguistic_symbols: null
261
+ cleaner: null
262
+ g2p: null
263
+ fs: 24000
264
+ score_feats_extract: syllable_score_feats
265
+ score_feats_extract_conf:
266
+ fs: 24000
267
+ n_fft: 2048
268
+ win_length: 1200
269
+ hop_length: 300
270
+ feats_extract: fbank
271
+ feats_extract_conf:
272
+ n_fft: 2048
273
+ hop_length: 300
274
+ win_length: 1200
275
+ fs: 24000
276
+ fmin: 80
277
+ fmax: 7600
278
+ n_mels: 80
279
+ normalize: global_mvn
280
+ normalize_conf:
281
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz
282
+ svs: naive_rnn_dp
283
+ svs_conf:
284
+ midi_dim: 129
285
+ embed_dim: 512
286
+ duration_dim: 1000
287
+ eprenet_conv_layers: 0
288
+ eprenet_conv_chans: 256
289
+ eprenet_conv_filts: 3
290
+ elayers: 3
291
+ eunits: 256
292
+ ebidirectional: true
293
+ midi_embed_integration_type: add
294
+ dlayers: 2
295
+ dunits: 256
296
+ dbidirectional: true
297
+ postnet_layers: 5
298
+ postnet_chans: 512
299
+ postnet_filts: 5
300
+ use_batch_norm: true
301
+ reduction_factor: 1
302
+ eprenet_dropout_rate: 0.2
303
+ edropout_rate: 0.1
304
+ ddropout_rate: 0.1
305
+ postnet_dropout_rate: 0.5
306
+ init_type: pytorch
307
+ use_masking: true
308
+ spks: 21
309
+ pitch_extract: dio
310
+ pitch_extract_conf:
311
+ use_token_averaged_f0: false
312
+ fs: 24000
313
+ n_fft: 2048
314
+ hop_length: 300
315
+ f0max: 800
316
+ f0min: 80
317
+ reduction_factor: 1
318
+ pitch_normalize: global_mvn
319
+ pitch_normalize_conf:
320
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz
321
+ ying_extract: null
322
+ ying_extract_conf: {}
323
+ energy_extract: null
324
+ energy_extract_conf: {}
325
+ energy_normalize: null
326
+ energy_normalize_conf: {}
327
+ required:
328
+ - output_dir
329
+ - token_list
330
+ version: '202310'
331
+ distributed: false
332
+ ```
333
+
334
+ </details>
335
+
336
+
337
+
338
+ ### Citing ESPnet
339
+
340
+ ```bibtex
341
+ @inproceedings{watanabe2018espnet,
342
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
343
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
344
+ year={2018},
345
+ booktitle={Proceedings of Interspeech},
346
+ pages={2207--2211},
347
+ doi={10.21437/Interspeech.2018-1456},
348
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
349
+ }
350
+
351
+
352
+
353
+
354
+
355
+
356
+ @inproceedings{shi22d_interspeech,
357
+ author={Jiatong Shi and Shuai Guo and Tao Qian and Tomoki Hayashi and Yuning Wu and Fangzheng Xu and Xuankai Chang and Huazhe Li and Peter Wu and Shinji Watanabe and Qin Jin},
358
+ title={{Muskits: an End-to-end Music Processing Toolkit for Singing Voice Synthesis}},
359
+ year={2022},
360
+ booktitle={Proc. Interspeech 2022},
361
+ pages={4277--4281},
362
+ doi={10.21437/Interspeech.2022-10039}
363
+ }
364
+ ```
365
+
366
+ or arXiv:
367
+
368
+ ```bibtex
369
+ @misc{watanabe2018espnet,
370
+ title={ESPnet: End-to-End Speech Processing Toolkit},
371
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
372
+ year={2018},
373
+ eprint={1804.00015},
374
+ archivePrefix={arXiv},
375
+ primaryClass={cs.CL}
376
+ }
377
+ ```
dump/raw/org/tr_no_dev/spk2sid ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <unk> 0
2
+ Alto-1 1
3
+ Alto-2 2
4
+ Alto-3 3
5
+ Alto-4 4
6
+ Alto-5 5
7
+ Alto-6 6
8
+ Alto-7 7
9
+ Bass-1 8
10
+ Bass-2 9
11
+ Bass-3 10
12
+ Soprano-1 11
13
+ Soprano-2 12
14
+ Soprano-3 13
15
+ Tenor-1 14
16
+ Tenor-2 15
17
+ Tenor-3 16
18
+ Tenor-4 17
19
+ Tenor-5 18
20
+ Tenor-6 19
21
+ Tenor-7 20
exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d1ad19168ccba3b69b08f2afdc7df4fdf08a0d62e407c593f7a7167116343f0
3
+ size 1402
exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38b211c522897d8dfb4f111139ae98f245f559533cf71bf533078e6c160bdde9
3
+ size 770
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/109epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4efcbfefd02536a852140d109c2ba79ce3c0f928774af50419bedaff30fe5abf
3
+ size 87472181
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/config.yaml ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_naive_rnn_dp.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/svs_train_naive_rnn_dp_raw_phn_None_zh
9
+ ngpu: 1
10
+ seed: 0
11
+ num_workers: 8
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: true
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 500
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - valid
40
+ - loss
41
+ - min
42
+ - - train
43
+ - loss
44
+ - min
45
+ keep_nbest_models: 2
46
+ nbest_averaging_interval: 0
47
+ grad_clip: 1.0
48
+ grad_clip_type: 2.0
49
+ grad_noise: false
50
+ accum_grad: 1
51
+ no_forward_run: false
52
+ resume: true
53
+ train_dtype: float32
54
+ use_amp: false
55
+ log_interval: null
56
+ use_matplotlib: true
57
+ use_tensorboard: true
58
+ create_graph_in_tensorboard: false
59
+ use_wandb: false
60
+ wandb_project: null
61
+ wandb_id: null
62
+ wandb_entity: null
63
+ wandb_name: null
64
+ wandb_model_log_interval: -1
65
+ detect_anomaly: false
66
+ use_lora: false
67
+ save_lora_only: true
68
+ lora_conf: {}
69
+ pretrain_path: null
70
+ init_param: []
71
+ ignore_init_mismatch: false
72
+ freeze_param: []
73
+ num_iters_per_epoch: null
74
+ batch_size: 16
75
+ valid_batch_size: null
76
+ batch_bins: 1000000
77
+ valid_batch_bins: null
78
+ train_shape_file:
79
+ - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn
80
+ - exp/svs_stats_raw_phn_None_zh/train/singing_shape
81
+ valid_shape_file:
82
+ - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn
83
+ - exp/svs_stats_raw_phn_None_zh/valid/singing_shape
84
+ batch_type: sorted
85
+ valid_batch_type: null
86
+ fold_length:
87
+ - 150
88
+ - 240000
89
+ sort_in_batch: descending
90
+ shuffle_within_batch: false
91
+ sort_batch: descending
92
+ multiple_iterator: false
93
+ chunk_length: 500
94
+ chunk_shift_ratio: 0.5
95
+ num_cache_chunks: 1024
96
+ chunk_excluded_key_prefixes: []
97
+ chunk_default_fs: null
98
+ train_data_path_and_name_and_type:
99
+ - - dump/raw/tr_no_dev/text
100
+ - text
101
+ - text
102
+ - - dump/raw/tr_no_dev/wav.scp
103
+ - singing
104
+ - sound
105
+ - - dump/raw/tr_no_dev/label
106
+ - label
107
+ - duration
108
+ - - dump/raw/tr_no_dev/score.scp
109
+ - score
110
+ - score
111
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/pitch.scp
112
+ - pitch
113
+ - npy
114
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/feats.scp
115
+ - feats
116
+ - npy
117
+ - - dump/raw/tr_no_dev/utt2sid
118
+ - sids
119
+ - text_int
120
+ valid_data_path_and_name_and_type:
121
+ - - dump/raw/dev/text
122
+ - text
123
+ - text
124
+ - - dump/raw/dev/wav.scp
125
+ - singing
126
+ - sound
127
+ - - dump/raw/dev/label
128
+ - label
129
+ - duration
130
+ - - dump/raw/dev/score.scp
131
+ - score
132
+ - score
133
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/pitch.scp
134
+ - pitch
135
+ - npy
136
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/feats.scp
137
+ - feats
138
+ - npy
139
+ - - dump/raw/dev/utt2sid
140
+ - sids
141
+ - text_int
142
+ allow_variable_data_keys: false
143
+ max_cache_size: 0.0
144
+ max_cache_fd: 32
145
+ allow_multi_rates: false
146
+ valid_max_cache_size: null
147
+ exclude_weight_decay: false
148
+ exclude_weight_decay_conf: {}
149
+ optim: adam
150
+ optim_conf:
151
+ lr: 0.001
152
+ eps: 1.0e-06
153
+ weight_decay: 0.0
154
+ scheduler: null
155
+ scheduler_conf: {}
156
+ token_list:
157
+ - <blank>
158
+ - <unk>
159
+ - i
160
+ - <AP>
161
+ - <SP>
162
+ - e
163
+ - d
164
+ - uo
165
+ - ai
166
+ - sh
167
+ - u
168
+ - ian
169
+ - n
170
+ - l
171
+ - h
172
+ - x
173
+ - j
174
+ - b
175
+ - zh
176
+ - m
177
+ - en
178
+ - uei
179
+ - an
180
+ - a
181
+ - eng
182
+ - iou
183
+ - z
184
+ - g
185
+ - ang
186
+ - ing
187
+ - ou
188
+ - q
189
+ - ei
190
+ - ao
191
+ - iang
192
+ - t
193
+ - ie
194
+ - ong
195
+ - r
196
+ - iao
197
+ - ch
198
+ - k
199
+ - f
200
+ - v
201
+ - in
202
+ - uang
203
+ - uan
204
+ - c
205
+ - s
206
+ - ve
207
+ - van
208
+ - p
209
+ - uen
210
+ - o
211
+ - ia
212
+ - ua
213
+ - iong
214
+ - uai
215
+ - vn
216
+ - er
217
+ - <sos/eos>
218
+ odim: null
219
+ model_conf: {}
220
+ use_preprocessor: true
221
+ token_type: phn
222
+ bpemodel: null
223
+ non_linguistic_symbols: null
224
+ cleaner: null
225
+ g2p: null
226
+ fs: 24000
227
+ score_feats_extract: syllable_score_feats
228
+ score_feats_extract_conf:
229
+ fs: 24000
230
+ n_fft: 2048
231
+ win_length: 1200
232
+ hop_length: 300
233
+ feats_extract: fbank
234
+ feats_extract_conf:
235
+ n_fft: 2048
236
+ hop_length: 300
237
+ win_length: 1200
238
+ fs: 24000
239
+ fmin: 80
240
+ fmax: 7600
241
+ n_mels: 80
242
+ normalize: global_mvn
243
+ normalize_conf:
244
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz
245
+ svs: naive_rnn_dp
246
+ svs_conf:
247
+ midi_dim: 129
248
+ embed_dim: 512
249
+ duration_dim: 1000
250
+ eprenet_conv_layers: 0
251
+ eprenet_conv_chans: 256
252
+ eprenet_conv_filts: 3
253
+ elayers: 3
254
+ eunits: 256
255
+ ebidirectional: true
256
+ midi_embed_integration_type: add
257
+ dlayers: 2
258
+ dunits: 256
259
+ dbidirectional: true
260
+ postnet_layers: 5
261
+ postnet_chans: 512
262
+ postnet_filts: 5
263
+ use_batch_norm: true
264
+ reduction_factor: 1
265
+ eprenet_dropout_rate: 0.2
266
+ edropout_rate: 0.1
267
+ ddropout_rate: 0.1
268
+ postnet_dropout_rate: 0.5
269
+ init_type: pytorch
270
+ use_masking: true
271
+ spks: 21
272
+ pitch_extract: dio
273
+ pitch_extract_conf:
274
+ use_token_averaged_f0: false
275
+ fs: 24000
276
+ n_fft: 2048
277
+ hop_length: 300
278
+ f0max: 800
279
+ f0min: 80
280
+ reduction_factor: 1
281
+ pitch_normalize: global_mvn
282
+ pitch_normalize_conf:
283
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz
284
+ ying_extract: null
285
+ ying_extract_conf: {}
286
+ energy_extract: null
287
+ energy_extract_conf: {}
288
+ energy_normalize: null
289
+ energy_normalize_conf: {}
290
+ required:
291
+ - output_dir
292
+ - token_list
293
+ version: '202310'
294
+ distributed: false
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/images/backward_time.png ADDED
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/images/clip.png ADDED
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/images/duration_loss.png ADDED
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/images/forward_time.png ADDED
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/images/gpu_max_cached_mem_GB.png ADDED
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/images/grad_norm.png ADDED
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/images/iter_time.png ADDED
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/images/l1_loss.png ADDED
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/images/loss.png ADDED
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/images/loss_scale.png ADDED
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/images/optim0_lr0.png ADDED
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/images/optim_step_time.png ADDED
exp/svs_train_naive_rnn_dp_raw_phn_None_zh/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202310'
2
+ files:
3
+ model_file: exp/svs_train_naive_rnn_dp_raw_phn_None_zh/109epoch.pth
4
+ python: "3.9.16 (main, Mar 8 2023, 14:00:05) \n[GCC 11.2.0]"
5
+ timestamp: 1702901147.259539
6
+ torch: 1.13.1+cu117
7
+ yaml_files:
8
+ train_config: exp/svs_train_naive_rnn_dp_raw_phn_None_zh/config.yaml