# conformer-transducer-xl-ami / conf / contextnet_rnnt.yaml
# This config contains the default values for training a modified ContextNet model, large size (~144M params),
# with Transducer (RNN-T) loss and a BPE-based sub-word vocabulary.
# In contrast to the original ContextNet, the same number of filters is used throughout the model.
#
# Architecture and training config:
# Default learning parameters in this config are set for an effective batch size of 1K (e.g. on 32 GPUs).
# To train with smaller effective batch sizes, you may need to re-tune the learning parameters or use a
# higher accumulate_grad_batches.
# Recommended configs for the different ContextNet variants are listed below; all other parameters are the
# same as in this config file.
#
# +--------------+---------+------------+
# | Model        | filters | time_masks |
# +==============+=========+============+
# | Small  (14M) |     256 |          2 |
# +--------------+---------+------------+
# | Medium (40M) |     512 |          5 |
# +--------------+---------+------------+
# | Large (145M) |    1024 |         10 |
# +--------------+---------+------------+
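#
# As a worked example of the effective-batch-size note above (values other than batch_size=16 are
# illustrative): effective_batch = train_ds.batch_size * num_gpus * accumulate_grad_batches,
# e.g. 16 * 32 * 2 = 1024 ~= 1K. With fewer GPUs, raise trainer.accumulate_grad_batches (a standard
# PyTorch Lightning Trainer argument) to keep the product near 1K, e.g. 16 * 8 * 8 = 1024.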
name: &name "ContextNet-8x-Stride-RNNT"
model:
sample_rate: 16000
compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.
train_ds:
manifest_filepath: ???
sample_rate: ${model.sample_rate}
    batch_size: 16 # Can be increased if memory allows or when using a smaller model
trim_silence: false
max_duration: 16.7
shuffle: true
use_start_end_token: false
num_workers: 16
pin_memory: true
# tarred datasets
is_tarred: false
tarred_audio_filepaths: null
tarred_shard_strategy: "scatter"
shuffle_n: 2048
# bucketing params
bucketing_strategy: "synced_randomized"
bucketing_batch_size: null
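    # The `???` manifest paths in this file must be filled in before training. As a sketch (paths are
    # hypothetical), a NeMo ASR manifest is a JSON-lines file with one utterance per line, e.g.:
    #   {"audio_filepath": "/data/ami/train/utt_0001.wav", "duration": 3.42, "text": "hello everyone"}
    # Multiple manifests can typically be supplied as a list or a comma-separated string.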
validation_ds:
manifest_filepath: ???
sample_rate: ${model.sample_rate}
batch_size: 8
shuffle: false
use_start_end_token: false
num_workers: 16
pin_memory: true
test_ds:
manifest_filepath: null
sample_rate: ${model.sample_rate}
batch_size: 8
shuffle: false
use_start_end_token: false
num_workers: 16
pin_memory: true
model_defaults:
filters: 1024
repeat: 5
dropout: 0.1
separable: true
se: true
se_context_size: -1
kernel_size_factor: 1.0
# encoder / decoder / joint values
enc_hidden: 640
pred_hidden: 640
joint_hidden: 640
tokenizer:
dir: ??? # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
type: ??? # Can be either bpe or wpe
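    # The tokenizer must be built before training. A hedged sketch using NeMo's tokenizer-building script
    # (script path, flags, and data paths assume a recent NeMo checkout and are illustrative; verify against
    # your NeMo version):
    #   python <NeMo_root>/scripts/tokenizers/process_asr_text_tokenizer.py \
    #       --manifest=/data/ami/train_manifest.json \
    #       --data_root=/data/tokenizers/ \
    #       --vocab_size=1024 --tokenizer=spe --spe_type=unigram
    # Then point model.tokenizer.dir at the generated tokenizer directory and set model.tokenizer.type=bpe.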
preprocessor:
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
sample_rate: ${model.sample_rate}
normalize: "per_feature"
window_size: 0.025
window_stride: 0.01
window: "hann"
features: &n_mels 80
n_fft: 512
frame_splicing: 1
dither: 0.00001
pad_to: 16
stft_conv: false
spec_augment:
_target_: nemo.collections.asr.modules.SpectrogramAugmentation
freq_masks: 2 # should be kept at 2
time_masks: 10 # can be 5 for small-med models, 10 for larger models.
freq_width: 27
time_width: 0.05
encoder:
_target_: nemo.collections.asr.modules.ConvASREncoder
feat_in: *n_mels
activation: swish
conv_mask: true
init_mode: "tds_uniform"
jasper:
- filters: ${model.model_defaults.filters}
repeat: 1
kernel: [5]
stride: [1]
dilation: [1]
dropout: 0.0
residual: false
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [2]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
stride_last: true
residual_mode: "stride_add"
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [2] # *stride
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
stride_last: true
residual_mode: "stride_add"
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [2] # stride
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
stride_last: true
residual_mode: "stride_add"
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.filters}
repeat: ${model.model_defaults.repeat}
kernel: [5]
stride: [1]
dilation: [1]
dropout: ${model.model_defaults.dropout}
residual: true
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
- filters: ${model.model_defaults.enc_hidden}
repeat: 1
kernel: [5]
stride: [1]
dilation: [1]
dropout: 0.0
residual: false
separable: ${model.model_defaults.separable}
se: ${model.model_defaults.se}
se_context_size: ${model.model_defaults.se_context_size}
kernel_size_factor: ${model.model_defaults.kernel_size_factor}
decoder:
_target_: nemo.collections.asr.modules.RNNTDecoder
normalization_mode: null # Currently only null is supported for export.
random_state_sampling: false # Random state sampling: https://arxiv.org/pdf/1910.11455.pdf
blank_as_pad: true # This flag must be set in order to support exporting of RNNT models + efficient inference.
prednet:
pred_hidden: ${model.model_defaults.pred_hidden}
pred_rnn_layers: 1 # only 1 layer LSTM networks are exportable.
t_max: null # Maximum possible target seq length used for Chrono Initialization - https://arxiv.org/abs/1804.11188. Disabled by default.
dropout: 0.1
joint:
_target_: nemo.collections.asr.modules.RNNTJoint
log_softmax: null # sets it according to cpu/gpu device
preserve_memory: false # dramatically slows down training, but might preserve some memory
# Fuses the computation of prediction net + joint net + loss + WER calculation
# to be run on sub-batches of size `fused_batch_size`.
# When this flag is set to true, consider the `batch_size` of *_ds to be just `encoder` batch size.
# `fused_batch_size` is the actual batch size of the prediction net, joint net and transducer loss.
# Using small values here will preserve a lot of memory during training, but will make training slower as well.
# An optimal ratio of fused_batch_size : *_ds.batch_size is 1:1.
# However, to preserve memory, this ratio can be 1:8 or even 1:16.
# Extreme case of 1:B (i.e. fused_batch_size=1) should be avoided as training speed would be very slow.
fuse_loss_wer: true
fused_batch_size: 16
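    # Illustrative arithmetic for the ratio described above (values are examples, not recommendations):
    # with train_ds.batch_size=16 and fused_batch_size=16 the ratio is 1:1 (fastest); lowering
    # fused_batch_size to 2 gives a 1:8 ratio, trading training speed for lower memory use.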
jointnet:
joint_hidden: ${model.model_defaults.joint_hidden}
activation: "relu"
dropout: 0.1
# RNNT decoding strategy
decoding:
strategy: "greedy_batch" # can be greedy, greedy_batch, beam, tsd, alsd.
# greedy strategy config
greedy:
max_symbols: 10
# beam strategy config
beam:
beam_size: 4
score_norm: true
      return_best_hypothesis: false
softmax_temperature: 1.0 # scale the logits by some temperature prior to softmax
tsd_max_sym_exp: 10 # for Time Synchronous Decoding, int > 0
alsd_max_target_len: 5.0 # for Alignment-Length Synchronous Decoding, float > 1.0
maes_num_steps: 2 # for modified Adaptive Expansion Search, int > 0
maes_prefix_alpha: 1 # for modified Adaptive Expansion Search, int > 0
maes_expansion_beta: 2 # for modified Adaptive Expansion Search, int >= 0
maes_expansion_gamma: 2.3 # for modified Adaptive Expansion Search, float >= 0
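    # Any decoding parameter above can be overridden from the command line via Hydra, e.g. appending
    # `model.decoding.strategy=beam model.decoding.beam.beam_size=8` (illustrative values) to the
    # training or fine-tuning command switches from greedy_batch to beam search.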
# RNNT loss config
loss:
loss_name: "default"
warprnnt_numba_kwargs:
# FastEmit regularization: https://arxiv.org/abs/2010.11148
      fastemit_lambda: 0.001 # Values can be in the range [1e-4, 1e-2]. Generally, 0.001 is a good start.
clamp: -1.0 # if > 0, applies gradient clamping in range [-clamp, clamp] for the joint tensor only.
optim:
name: novograd
lr: 0.05
# optimizer arguments
betas: [0.9, 0.0]
weight_decay: 0.001
# scheduler setup
sched:
name: CosineAnnealing
# scheduler config override
warmup_steps: 5000
warmup_ratio: null
min_lr: 1e-6
last_epoch: -1
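
# Putting it together: a hedged example of launching training with the required `???` fields filled in via
# Hydra overrides. The script location and all paths below are illustrative and depend on your NeMo
# version and data layout:
#   python examples/asr/asr_transducer/speech_to_text_rnnt_bpe.py \
#       --config-path=<dir containing this config> --config-name=contextnet_rnnt \
#       model.train_ds.manifest_filepath=/data/ami/train_manifest.json \
#       model.validation_ds.manifest_filepath=/data/ami/dev_manifest.json \
#       model.tokenizer.dir=/data/tokenizers/tokenizer_spe_unigram_v1024 \
#       model.tokenizer.type=bpe \
#       trainer.devices=8 trainer.accumulate_grad_batches=8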