# Adding the acoustic model (.nemo) and the architecture (.yaml) to the Repo
# bbb3d6f
#
# Configuration for the "QuartzNet15x5" speech-recognition model. The
# `_target_` entries point at classes in `nemo.collections.asr.modules`.
#
# Layout:
#   name         - experiment/model name, reused by exp_manager.name
#   model        - data sets, feature extraction, encoder/decoder, optimizer
#   trainer      - training-loop settings
#   exp_manager  - logging and checkpointing settings
#
# YAML anchors (&x) declare a value once; aliases (*x) reuse it, so changing
# the anchored value updates every place that refers to it.
#
# NOTE(review): fields set to `???` carry no value in this file; presumably
# they are mandatory and must be supplied at launch time - confirm.

name: &name "QuartzNet15x5"

model:
  # Shared values, referenced below through aliases.
  sample_rate: &sample_rate 16000
  repeat: &repeat 5
  dropout: &dropout 0.0
  separable: &separable true
  # 37 symbols: space, a-z, and ten additional letters (á æ é í ð ó ö ú ý þ).
  # The count matches decoder.num_classes below.
  labels: &labels [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "á", "æ", "é", "í", "ð", "ó", "ö", "ú", "ý", "þ"]

  train_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: *labels
    # NOTE(review): flagged with a '#####' marker in the original file;
    # presumably a locally tuned value - confirm.
    batch_size: 16
    trim_silence: True
    max_duration: 16.7
    shuffle: True
    num_workers: 8
    pin_memory: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: *labels
    # NOTE(review): flagged with a '#####' marker in the original file;
    # presumably a locally tuned value - confirm.
    batch_size: 16
    shuffle: False
    num_workers: 8
    pin_memory: true

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: "per_feature"
    window_size: 0.02
    sample_rate: *sample_rate
    window_stride: 0.01
    window: "hann"
    # Number of features produced; the encoder's feat_in reuses this value.
    features: &n_mels 64
    n_fft: 512
    frame_splicing: 1
    dither: 1.0e-05

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    rect_freq: 50
    rect_masks: 5
    rect_time: 120

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    # 18 block definitions. Blocks 2-16 share the anchored `repeat` value and
    # use residual connections; blocks 1, 17 and 18 set repeat: 1 and
    # residual: false.
    jasper:
      # 1
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [33]
        repeat: 1
        residual: false
        separable: *separable
        stride: [2]
      # 2
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [33]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 3
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [33]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 4
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [33]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 5
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [39]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 6
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [39]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 7
      - dilation: [1]
        dropout: *dropout
        filters: 256
        kernel: [39]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 8
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [51]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 9
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [51]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 10
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [51]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 11
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [63]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 12
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [63]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 13
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [63]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 14
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [75]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 15
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [75]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 16
      - dilation: [1]
        dropout: *dropout
        filters: 512
        kernel: [75]
        repeat: *repeat
        residual: true
        separable: *separable
        stride: [1]
      # 17
      - dilation: [2]
        dropout: *dropout
        filters: 512
        kernel: [87]
        repeat: 1
        residual: false
        separable: *separable
        stride: [1]
      # 18 (no `separable` key; its filter count feeds the decoder's feat_in)
      - dilation: [1]
        dropout: *dropout
        filters: &enc_filters 1024
        kernel: [1]
        repeat: 1
        residual: false
        stride: [1]

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: *enc_filters
    num_classes: 37
    vocabulary: *labels

  optim:
    name: novograd
    # _target_: nemo.core.optim.optimizers.Novograd
    lr: 0.0012
    # optimizer arguments
    betas: [0.95, 0.25]
    weight_decay: 0.001

    # scheduler setup
    sched:
      name: CosineAnnealing

      # pytorch lightning args
      # monitor: val_loss
      # reduce_on_plateau: false

      # Scheduler params
      warmup_steps: null
      warmup_ratio: null
      min_lr: 0.0
      last_epoch: -1

trainer:
  devices: 1 # number of gpus
  max_epochs: 5
  max_steps: -1 # computed at runtime if not set
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: False # Provided by exp_manager
  logger: False # Provided by exp_manager
  log_every_n_steps: 1 # Interval of logging.
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  benchmark: false # needs to be false for models with variable-length speech input as it slows down training

exp_manager:
  exp_dir: null
  name: *name
  create_tensorboard_logger: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: "val_wer"
    mode: "min"
  create_wandb_logger: False
  wandb_logger_kwargs:
    name: null
    project: null