# This config contains the default values for training a Conformer-Transducer ASR model, XL size (~0.6B params), with Transducer loss and sub-word encoding.

# You may find more info about Conformer-Transducer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-transducer
# Pre-trained models of Conformer-Transducer can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html
# The checkpoint of the xlarge model trained on NeMo ASRSET with this recipe can be found here: https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_xlarge

name: "Conformer-Transducer-BPE"

model:
  sample_rate: 16000
  compute_eval_loss: false # eval samples can be very long and exhaust memory; set this flag to disable computation of the transducer loss during validation/testing
  log_prediction: true # enables logging of sample predictions in the output during training
  skip_nan_grad: false

  model_defaults:
    enc_hidden: ${model.encoder.d_model}
    pred_hidden: 640
    joint_hidden: 640

  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 16.7 # this value is set for LibriSpeech; you may need to update it for your dataset
    min_duration: 0.1
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16
    shuffle: false
    num_workers: 8
    pin_memory: true
    use_start_end_token: false

  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    batch_size: 16
    shuffle: false
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
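  # For reference, each `manifest_filepath` above points to a NeMo-style manifest: a plain text
  # file with one JSON object per line. A minimal entry (hypothetical paths, shown only for
  # illustration) looks like:
  #   {"audio_filepath": "/data/audio/sample_0001.wav", "duration": 4.32, "text": "hello world"}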
  # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
  tokenizer:
    dir: ??? # path to the directory which contains either tokenizer.model (for bpe) or vocab.txt (for wpe)
    type: bpe # can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2 # set to zero to disable it
    time_masks: 10 # set to zero to disable it
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1 # you may set it if you need a different output size than the default d_model
    n_layers: 24
    d_model: 1024

    # Sub-sampling params
    subsampling: striding # vggnet, striding, stacking, stacking_norm, or dw_striding
    subsampling_factor: 4 # must be a power of 2 for striding and vggnet
    subsampling_conv_channels: -1 # set to -1 to make it equal to d_model
    causal_downsampling: false

    # Feed forward module's params
    ff_expansion_factor: 4

    # Multi-headed Attention Module's params
    self_attention_model: rel_pos # rel_pos or abs_pos
    n_heads: 8 # may need to be lower for smaller d_models
    # [left, right] specifies the number of steps to be seen from the left and right of each step in self-attention
    att_context_size: [-1, -1] # -1 means unlimited context
    att_context_style: regular # regular or chunked_limited
    xscaling: true # scales up the input embeddings by sqrt(d_model)
    untie_biases: true # unties the biases of the TransformerXL layers
    pos_emb_max_len: 5000

    # Convolution module's params
    conv_kernel_size: 5
    conv_norm_type: 'batch_norm' # batch_norm, layer_norm, or groupnormN (N specifies the number of groups)
    # conv_context_size can be "causal" or a list of two integers with conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
    # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
    conv_context_size: null

    ### regularization
    dropout: 0.1 # the dropout used in most of the Conformer modules
    dropout_emb: 0.0 # the dropout used for embeddings
    dropout_att: 0.1 # the dropout for multi-headed attention modules
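  # For orientation, the effective encoder frame rate follows from the values above: with
  # window_stride: 0.01 (10 ms hops) and subsampling_factor: 4, each encoder output step covers
  # 4 x 10 ms = 40 ms of audio, so a max_duration utterance of 16.7 s yields roughly
  # 16.7 / 0.04 ≈ 417 encoder frames.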
  decoder:
    _target_: nemo.collections.asr.modules.RNNTDecoder
    normalization_mode: null # currently only null is supported for export
    random_state_sampling: false # random state sampling: https://arxiv.org/pdf/1910.11455.pdf
    blank_as_pad: true # this flag must be set in order to support export of RNNT models + efficient inference

    prednet:
      pred_hidden: ${model.model_defaults.pred_hidden}
      pred_rnn_layers: 2
      t_max: null
      dropout: 0.1

  joint:
    _target_: nemo.collections.asr.modules.RNNTJoint
    log_softmax: null # 'null' would set it automatically according to CPU/GPU device
    preserve_memory: false # dramatically slows down training, but might preserve some memory

    # Fuses the computation of prediction net + joint net + loss + WER calculation
    # to be run on sub-batches of size `fused_batch_size`.
    # When this flag is set to true, consider the `batch_size` of *_ds to be just the `encoder` batch size.
    # `fused_batch_size` is the actual batch size of the prediction net, joint net, and transducer loss.
    # Using small values here will preserve a lot of memory during training, but will also make training slower.
    # The optimal ratio of fused_batch_size : *_ds.batch_size is 1:1.
    # However, to preserve memory, this ratio can be 1:8 or even 1:16.
    # The extreme case of 1:B (i.e. fused_batch_size=1) should be avoided, as training speed would be very slow.
    fuse_loss_wer: true
    fused_batch_size: 16

    jointnet:
      joint_hidden: ${model.model_defaults.joint_hidden}
      activation: "relu"
      dropout: 0.1

  decoding:
    strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, or alsd

    # greedy strategy config
    greedy:
      max_symbols: 10

    # beam strategy config
    beam:
      beam_size: 2
      return_best_hypothesis: false
      score_norm: true
      tsd_max_sym_exp: 50 # for Time Synchronous Decoding
      alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding

  loss:
    loss_name: "default"

    warprnnt_numba_kwargs:
      # FastEmit regularization: https://arxiv.org/abs/2010.11148
      # You may enable FastEmit to reduce the latency of the model for streaming.
      fastemit_lambda: 0.0 # recommended values are in the range [1e-4, 1e-2]; 0.001 is a good start
      clamp: -1.0 # if > 0, applies gradient clamping in the range [-clamp, clamp] for the joint tensor only

  # Adds Gaussian noise to the gradients of the decoder to avoid overfitting
  variational_noise:
    start_step: 0
    std: 0.0

  optim:
    name: adamw
    lr: 5.0 # with NoamAnnealing, lr acts as a scale on the schedule (the effective LR also depends on d_model and step), not as a literal learning rate
    # optimizer arguments
    betas: [0.9, 0.98]
    weight_decay: 1e-3

    # scheduler setup
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      # scheduler config override
      warmup_steps: 10000
      warmup_ratio: null
      min_lr: 1e-6
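# A typical way to launch training with this config, sketched below as Hydra overrides. The
# script path, config name, and manifest/tokenizer paths are assumptions based on the standard
# NeMo repo layout and may differ across NeMo versions:
#
#   python examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py \
#     --config-path=<path to the dir containing this file> \
#     --config-name=<this file's name, without .yaml> \
#     model.train_ds.manifest_filepath=/path/to/train_manifest.json \
#     model.validation_ds.manifest_filepath=/path/to/val_manifest.json \
#     model.tokenizer.dir=/path/to/tokenizer_dir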