# File: conformer-transducer-xl-ami/conf/conformer_transducer_bpe_large.yaml
# Repository listing metadata: "Push to Hub", commit 4ee7109, about 2 years ago, 8.79 kB

# This config contains the default values for training a large Conformer-Transducer ASR model (~120M parameters) with Transducer loss and sub-word encoding.

# Architecture and training config:
# The default learning parameters in this config are set for an effective batch size of 2K. To train with a smaller
# effective batch size, you may need to re-tune the learning parameters or use a higher accumulate_grad_batches.
# The recommended configs for the different variants of Conformer-Transducer are listed below; all other parameters
# are the same as in this config file.
#
# +--------------+---------+---------+----------+--------------+--------------------------+
# | Model        | d_model | n_heads | n_layers | weight_decay | pred_hidden/joint_hidden |
# +==============+=========+=========+==========+==============+==========================+
# | Small (14M)  |   176   |    4    |    16    |     0.0      |           320            |
# +--------------+---------+---------+----------+--------------+--------------------------+
# | Medium (32M) |   256   |    4    |    16    |     1e-3     |           640            |
# +--------------+---------+---------+----------+--------------+--------------------------+
# | Large (120M) |   512   |    8    |    17    |     1e-3     |           640            |
# +--------------+---------+---------+----------+--------------+--------------------------+
	#

# You may find more info about Conformer-Transducer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#conformer-transducer
# Pre-trained models of Conformer-Transducer can be found here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/results.html
# The checkpoint of the large model trained on NeMo ASRSET with this recipe can be found here: https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_large

	name: "Conformer-Transducer-BPE"

	model:
	sample_rate: 16000
	compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.
	log_prediction: true # enables logging sample predictions in the output during training
	skip_nan_grad: false

	model_defaults:
	enc_hidden: ${model.encoder.d_model}
	pred_hidden: 640
	joint_hidden: 640

	train_ds:
	manifest_filepath: ???
	sample_rate: ${model.sample_rate}
	batch_size: 16 # you may increase batch_size if your memory allows
	shuffle: true
	num_workers: 8
	pin_memory: true
	use_start_end_token: false
	trim_silence: false
	max_duration: 16.7 # it is set for LibriSpeech, you may need to update it for your dataset
	min_duration: 0.1
	# tarred datasets
	is_tarred: false
	tarred_audio_filepaths: null
	shuffle_n: 2048
	# bucketing params
	bucketing_strategy: "synced_randomized"
	bucketing_batch_size: null

	validation_ds:
	manifest_filepath: ???
	sample_rate: ${model.sample_rate}
	batch_size: 16
	shuffle: false
	num_workers: 8
	pin_memory: true
	use_start_end_token: false

	test_ds:
	manifest_filepath: null
	sample_rate: ${model.sample_rate}
	batch_size: 16
	shuffle: false
	num_workers: 8
	pin_memory: true
	use_start_end_token: false

	# You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
	tokenizer:
	dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
	type: bpe # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)

	preprocessor:
	_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
	sample_rate: ${model.sample_rate}
	normalize: "per_feature"
	window_size: 0.025
	window_stride: 0.01
	window: "hann"
	features: 80
	n_fft: 512
	frame_splicing: 1
	dither: 0.00001
	pad_to: 0

	spec_augment:
	_target_: nemo.collections.asr.modules.SpectrogramAugmentation
	freq_masks: 2 # set to zero to disable it
	time_masks: 10 # set to zero to disable it
	freq_width: 27
	time_width: 0.05

	encoder:
	_target_: nemo.collections.asr.modules.ConformerEncoder
	feat_in: ${model.preprocessor.features}
	feat_out: -1 # you may set it if you need different output size other than the default d_model
	n_layers: 17
	d_model: 512

	# Sub-sampling params
	subsampling: striding # vggnet, striding, stacking or stacking_norm, dw_striding
	subsampling_factor: 4 # must be power of 2 for striding and vggnet
	subsampling_conv_channels: -1 # set to -1 to make it equal to the d_model
	causal_downsampling: false

	# Feed forward module's params
	ff_expansion_factor: 4

	# Multi-headed Attention Module's params
	self_attention_model: rel_pos # rel_pos or abs_pos
	n_heads: 8 # may need to be lower for smaller d_models
	# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
	att_context_size: [-1, -1] # -1 means unlimited context
	att_context_style: regular # regular or chunked_limited
	xscaling: true # scales up the input embeddings by sqrt(d_model)
	untie_biases: true # unties the biases of the TransformerXL layers
	pos_emb_max_len: 5000

	# Convolution module's params
	conv_kernel_size: 31
	conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
	# conv_context_size can be"causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
	# null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
	conv_context_size: null

	### regularization
	dropout: 0.1 # The dropout used in most of the Conformer Modules
	dropout_emb: 0.0 # The dropout used for embeddings
	dropout_att: 0.1 # The dropout for multi-headed attention modules

	decoder:
	_target_: nemo.collections.asr.modules.RNNTDecoder
	normalization_mode: null # Currently only null is supported for export.
	random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf
	blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference.

	prednet:
	pred_hidden: ${model.model_defaults.pred_hidden}
	pred_rnn_layers: 1
	t_max: null
	dropout: 0.2

	joint:
	_target_: nemo.collections.asr.modules.RNNTJoint
	log_softmax: null # 'null' would set it automatically according to CPU/GPU device
	preserve_memory: false # dramatically slows down training, but might preserve some memory

	# Fuses the computation of prediction net + joint net + loss + WER calculation
	# to be run on sub-batches of size `fused_batch_size`.
	# When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size.
	# `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss.
	# Using small values here will preserve a lot of memory during training, but will make training slower as well.
	# An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1.
	# However, to preserve memory, this ratio can be 1:8 or even 1:16.
	# Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow.
	fuse_loss_wer: true
	fused_batch_size: 16

	jointnet:
	joint_hidden: ${model.model_defaults.joint_hidden}
	activation: "relu"
	dropout: 0.2

	decoding:
	strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd.

	# greedy strategy config
	greedy:
	max_symbols: 10

	# beam strategy config
	beam:
	beam_size: 2
	return_best_hypothesis: False
	score_norm: true
	tsd_max_sym_exp: 50 # for Time Synchronous Decoding
	alsd_max_target_len: 2.0 # for Alignment-Length Synchronous Decoding

	loss:
	loss_name: "default"

	warprnnt_numba_kwargs:
	# FastEmit regularization: https://arxiv.org/abs/2010.11148
	# You may enable FastEmit to reduce the latency of the model for streaming
	fastemit_lambda: 0.0 # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start.
	clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.

	# Adds Gaussian noise to the gradients of the decoder to avoid overfitting
	variational_noise:
	start_step: 0
	std: 0.0

	optim:
	name: adamw
	lr: 5.0
	# optimizer arguments
	betas: [0.9, 0.98]
	weight_decay: 1e-3

	# scheduler setup
	sched:
	name: NoamAnnealing
	d_model: ${model.encoder.d_model}
	# scheduler config override
	warmup_steps: 10000
	warmup_ratio: null
	min_lr: 1e-6