{
"framework": "PyTorch",
"task" : "text-to-speech",
"model" : {
"type" : "sambert-hifigan",
"lang_type" : "zhcn",
"sample_rate" : 16000,
"custom_ckpt": {
"voice_name" : "F7",
"am_ckpt" : "basemodel_16k/sambert/ckpt",
"am_config" : "basemodel_16k/sambert/config.yaml",
"voc_ckpt" : "basemodel_16k/hifigan/ckpt",
"voc_config" : "basemodel_16k/hifigan/config.yaml",
"audio_config" : "basemodel_16k/audio_config_se_16k.yaml",
"se_model" : "basemodel_16k/speaker_embedding/se.onnx"
},
"am": {
"am": {
"max_len": 800,
"embedding_dim": 512,
"encoder_num_layers": 8,
"encoder_num_heads": 8,
"encoder_num_units": 128,
"encoder_ffn_inner_dim": 1024,
"encoder_dropout": 0.1,
"encoder_attention_dropout": 0.1,
"encoder_relu_dropout": 0.1,
"encoder_projection_units": 32,
"speaker_units": 512,
"emotion_units": 32,
"predictor_filter_size": 41,
"predictor_fsmn_num_layers": 3,
"predictor_num_memory_units": 128,
"predictor_ffn_inner_dim": 256,
"predictor_dropout": 0.1,
"predictor_shift": 0,
"predictor_lstm_units": 128,
"dur_pred_prenet_units": [128, 128],
"dur_pred_lstm_units": 128,
"decoder_prenet_units": [256, 256],
"decoder_num_layers": 12,
"decoder_num_heads": 8,
"decoder_num_units": 128,
"decoder_ffn_inner_dim": 1024,
"decoder_dropout": 0.1,
"decoder_attention_dropout": 0.1,
"decoder_relu_dropout": 0.1,
"outputs_per_step": 3,
"num_mels": 82,
"postnet_filter_size": 41,
"postnet_fsmn_num_layers": 4,
"postnet_num_memory_units": 256,
"postnet_ffn_inner_dim": 512,
"postnet_dropout": 0.1,
"postnet_shift": 17,
"postnet_lstm_units": 128,
"nsf_f0_global_maximum": 730.0,
"nsf_f0_global_minimum": 30.0,
"nsf_norm_type": "global"
},
"audio": {
"frame_shift_ms": 12.5
},
"linguistic_unit": {
"cleaners": "english_cleaners",
"lfeat_type_list": "sy,tone,syllable_flag,word_segment,emo_category,speaker_category",
"sy": "dict/sy_dict.txt",
"tone": "dict/tone_dict.txt",
"syllable_flag": "dict/syllable_flag_dict.txt",
"word_segment": "dict/word_segment_dict.txt",
"emo_category": "dict/emo_category_dict.txt",
"speaker_category": "dict/speaker_dict.txt"
},
"num_gpus": 1,
"batch_size": 32,
"group_size": 1024,
"learning_rate": 0.001,
"adam_b1": 0.9,
"adam_b2": 0.98,
"seed": 1234,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:11111",
"world_size": 1
}
},
"vocoder" : {
"resblock": "1",
"num_gpus": 1,
"batch_size": 16,
"learning_rate": 0.0002,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.999,
"seed": 1234,
"bias": true,
"causal": false,
"nsf_params" : {
"nb_harmonics": 7,
"nsf_f0_global_maximum": 730.0,
"nsf_f0_global_minimum": 30.0,
"nsf_norm_type": "global",
"sampling_rate": 16000
},
"upsample_rates": [10,5,2,2],
"upsample_kernel_sizes": [20,11,4,4],
"upsample_initial_channel": 256,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5,7], [1,3,5,7], [1,3,5,7]],
"segment_size": 6400,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 2048,
"hop_size": 200,
"win_size": 1000,
"sampling_rate": 16000,
"fmin": 0,
"fmax": 8000,
"fmax_for_loss": null,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54312",
"world_size": 1
}
}
},
"train": {
},
"pipeline": {
"type": "sambert-hifigan-tts"
}
}