File size: 4,166 Bytes
778a50b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
{
  "framework": "PyTorch",
  "task" : "text-to-speech",
  "model" : {
    "type" : "sambert-hifigan",
    "lang_type" : "zhcn",
    "sample_rate" : 16000,
    "custom_ckpt": {
      "voice_name" : "F7",
      "am_ckpt" : "basemodel_16k/sambert/ckpt",
      "am_config" : "basemodel_16k/sambert/config.yaml",
      "voc_ckpt" : "basemodel_16k/hifigan/ckpt",
      "voc_config" : "basemodel_16k/hifigan/config.yaml",
      "audio_config" : "basemodel_16k/audio_config_se_16k.yaml",
      "se_model" : "basemodel_16k/speaker_embedding/se.onnx"
    },
    "am": {
       "am": {
          "max_len": 800,

          "embedding_dim": 512, 
          "encoder_num_layers": 8,
          "encoder_num_heads": 8,
          "encoder_num_units": 128,
          "encoder_ffn_inner_dim": 1024,
          "encoder_dropout": 0.1,
          "encoder_attention_dropout": 0.1,
          "encoder_relu_dropout": 0.1,
          "encoder_projection_units": 32,

          "speaker_units": 512,
          "emotion_units": 32,

          "predictor_filter_size": 41,
          "predictor_fsmn_num_layers": 3,
          "predictor_num_memory_units": 128,
          "predictor_ffn_inner_dim": 256,
          "predictor_dropout": 0.1,
          "predictor_shift": 0,
          "predictor_lstm_units": 128,
          "dur_pred_prenet_units": [128, 128],
          "dur_pred_lstm_units": 128,

          "decoder_prenet_units": [256, 256],
          "decoder_num_layers": 12,
          "decoder_num_heads": 8,
          "decoder_num_units": 128,
          "decoder_ffn_inner_dim": 1024,
          "decoder_dropout": 0.1,
          "decoder_attention_dropout": 0.1,
          "decoder_relu_dropout": 0.1,

          "outputs_per_step": 3,
          "num_mels": 82,

          "postnet_filter_size": 41,
          "postnet_fsmn_num_layers": 4,
          "postnet_num_memory_units": 256,
          "postnet_ffn_inner_dim": 512,
          "postnet_dropout": 0.1,
          "postnet_shift": 17,
          "postnet_lstm_units": 128,


          "nsf_f0_global_maximum": 730.0,
          "nsf_f0_global_minimum": 30.0,
          "nsf_norm_type": "global"
      },

      "audio": {
          "frame_shift_ms": 12.5
      },

      "linguistic_unit": {
        "cleaners": "english_cleaners",
        "lfeat_type_list": "sy,tone,syllable_flag,word_segment,emo_category,speaker_category",
        "sy": "dict/sy_dict.txt",
        "tone": "dict/tone_dict.txt",
        "syllable_flag": "dict/syllable_flag_dict.txt",
        "word_segment": "dict/word_segment_dict.txt",
        "emo_category": "dict/emo_category_dict.txt",
        "speaker_category": "dict/speaker_dict.txt"
      },

      "num_gpus": 1,
      "batch_size": 32,
      "group_size": 1024,
      "learning_rate": 0.001,
      "adam_b1": 0.9,
      "adam_b2": 0.98,
      "seed": 1234,

      "num_workers": 4,

      "dist_config": {
          "dist_backend": "nccl",
          "dist_url": "tcp://localhost:11111",
          "world_size": 1
      }

    },
    "vocoder" : {
      "resblock": "1",
      "num_gpus": 1,
      "batch_size": 16,
      "learning_rate": 0.0002,
      "adam_b1": 0.8,
      "adam_b2": 0.99,
      "lr_decay": 0.999,
      "seed": 1234,

      "bias": true,
      "causal": false,
      "nsf_params" : {
        "nb_harmonics": 7, 
        "nsf_f0_global_maximum": 730.0,
        "nsf_f0_global_minimum": 30.0,
        "nsf_norm_type": "global", 
        "sampling_rate": 16000
      },

      "upsample_rates": [10,5,2,2],
      "upsample_kernel_sizes": [20,11,4,4],
      "upsample_initial_channel": 256,
      "resblock_kernel_sizes": [3,7,11],
      "resblock_dilation_sizes": [[1,3,5,7], [1,3,5,7], [1,3,5,7]],

      "segment_size": 6400,
      "num_mels": 80,
      "num_freq": 1025,
      "n_fft": 2048,
      "hop_size": 200,
      "win_size": 1000,

      "sampling_rate": 16000,

      "fmin": 0,
      "fmax": 8000,
      "fmax_for_loss": null,

      "num_workers": 4,

      "dist_config": {
          "dist_backend": "nccl",
          "dist_url": "tcp://localhost:54312",
          "world_size": 1
      }
    }
  },
  "train": {
  },
  "pipeline": {
     "type": "sambert-hifigan-tts"
  }
}