# This config contains the default values for training a modified ContextNet ASR model, large size (~144M params),
# with Transducer (RNNT) loss and sub-word (BPE) encoding.
# In contrast to the original ContextNet, the same number of filters is used throughout the model.

# Architecture and training config:
# Default learning parameters in this config are set for an effective batch size of 1K (originally on 32 GPUs). To train with
# smaller effective batch sizes, you may need to re-tune the learning parameters or use a higher accumulate_grad_batches.
# Here are the recommended configs for different variants of ContextNet; other parameters are the same as in this config file.
#
# +--------------+---------+------------+
# | Model        | filters | time_masks |
# +==============+=========+============+
# | Small (14M)  |   256   |     2      |
# +--------------+---------+------------+
# | Medium (40M) |   512   |     5      |
# +--------------+---------+------------+
# | Large (145M) |  1024   |     10     |
# +--------------+---------+------------+

name: &name "ContextNet-8x-Stride-RNNT"

model:
  sample_rate: 16000
  compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.

  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16 # Can be increased if memory allows or when using smaller model
    trim_silence: false
    max_duration: 16.7
    shuffle: true
    use_start_end_token: false
    num_workers: 16
    pin_memory: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: "scatter"
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 8
    shuffle: false
    use_start_end_token: false
    num_workers: 16
    pin_memory: true

  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    batch_size: 8
    shuffle: false
    use_start_end_token: false
    num_workers: 16
    pin_memory: true

  model_defaults:
    filters: 1024
    repeat: 5
    dropout: 0.1
    separable: true
    se: true
    se_context_size: -1
    kernel_size_factor: 1.0
    # encoder / decoder / joint values
    enc_hidden: 640
    pred_hidden: 640
    joint_hidden: 640

  tokenizer:
    dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
    type: ??? # Can be either bpe or wpe

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: &n_mels 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 16
    stft_conv: false

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2 # should be kept at 2
    time_masks: 10 # can be 5 for small-med models, 10 for larger models.
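    # Note: freq_width below is measured in mel-frequency bins; a float time_width (< 1.0), as used
    # here, is typically interpreted by SpecAugment as an adaptive mask width, i.e. a fraction of the
    # utterance length rather than a fixed number of frames.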
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: swish
    conv_mask: true
    init_mode: "tds_uniform"

    jasper:
      - filters: ${model.model_defaults.filters}
        repeat: 1
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2] # *stride
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2] # stride
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.enc_hidden}
        repeat: 1
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

  decoder:
    _target_: nemo.collections.asr.modules.RNNTDecoder
    normalization_mode: null # Currently only null is supported for export.
    random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf
    blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference.

    prednet:
      pred_hidden: ${model.model_defaults.pred_hidden}
      pred_rnn_layers: 1 # only 1 layer LSTM networks are exportable.
      t_max: null # Maximum possible target seq length used for Chrono Initialization - https://arxiv.org/abs/1804.11188. Disabled by default.
      dropout: 0.1

  joint:
    _target_: nemo.collections.asr.modules.RNNTJoint
    log_softmax: null # sets it according to cpu/gpu device
    preserve_memory: false # dramatically slows down training, but might preserve some memory

    # Fuses the computation of prediction net + joint net + loss + WER calculation
    # to be run on sub-batches of size `fused_batch_size`.
    # When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size.
    # `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss.
    # Using small values here will preserve a lot of memory during training, but will make training slower as well.
    # An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1.
    # However, to preserve memory, this ratio can be 1:8 or even 1:16.
    # Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow.
    fuse_loss_wer: true
    fused_batch_size: 16

    jointnet:
      joint_hidden: ${model.model_defaults.joint_hidden}
      activation: "relu"
      dropout: 0.1

  # RNNT decoding strategy
  decoding:
    strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd.
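    # Note: "greedy_batch" runs the same greedy decoding as "greedy" but vectorized over the batch,
    # which keeps validation fast. The search-based strategies configured below (beam, tsd, alsd,
    # and the maes_* options) trade decoding speed for accuracy and are usually reserved for final evaluation.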
    # greedy strategy config
    greedy:
      max_symbols: 10

    # beam strategy config
    beam:
      beam_size: 4
      score_norm: true
      return_best_hypothesis: false
      softmax_temperature: 1.0 # scale the logits by some temperature prior to softmax
      tsd_max_sym_exp: 10 # for Time Synchronous Decoding, int > 0
      alsd_max_target_len: 5.0 # for Alignment-Length Synchronous Decoding, float > 1.0
      maes_num_steps: 2 # for modified Adaptive Expansion Search, int > 0
      maes_prefix_alpha: 1 # for modified Adaptive Expansion Search, int > 0
      maes_expansion_beta: 2 # for modified Adaptive Expansion Search, int >= 0
      maes_expansion_gamma: 2.3 # for modified Adaptive Expansion Search, float >= 0

  # RNNT loss config
  loss:
    loss_name: "default"

    warprnnt_numba_kwargs:
      # FastEmit regularization: https://arxiv.org/abs/2010.11148
      fastemit_lambda: 0.001 # Values can be in range [1e-4, 1e-2]. Generally, 0.001 is a good start.
      clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.

  optim:
    name: novograd
    lr: 0.05

    # optimizer arguments
    betas: [0.9, 0.0]
    weight_decay: 0.001

    # scheduler setup
    sched:
      name: CosineAnnealing

      # scheduler config override
      warmup_steps: 5000
      warmup_ratio: null
      min_lr: 1e-6
      last_epoch: -1
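
# A rough worked example for the "effective batch size of 1K" note at the top of this file:
#   effective_batch = num_gpus * model.train_ds.batch_size * accumulate_grad_batches
#   e.g. 32 GPUs * 16 * 2 = 1024; with fewer GPUs, raise accumulate_grad_batches (or batch_size) accordingly.
# The fields marked ??? (manifest paths, tokenizer dir/type) are mandatory and are normally supplied as
# Hydra-style command-line overrides when launching training, for example (paths are placeholders):
#   model.train_ds.manifest_filepath=<train manifest> \
#   model.validation_ds.manifest_filepath=<dev manifest> \
#   model.tokenizer.dir=<tokenizer directory> \
#   model.tokenizer.type=bpe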