# stt_is_quartznet15x5_ft_ep56_875h / QuartzNet_FT15x5_Icelandic.yaml
#
# Adding the acoustic model (.nemo) and the architecture (.yaml) to the repo.
#
# Revision bbb3d6f, almost 2 years ago.
#
# File size: 6.14 kB

	name: &name "QuartzNet15x5"

	model:
	sample_rate: &sample_rate 16000
	repeat: &repeat 5
	dropout: &dropout 0.0
	separable: &separable true
	labels: &labels [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "á", "æ", "é", "í", "ð", "ó", "ö", "ú", "ý", "þ"]

	train_ds:
	manifest_filepath: ???
	sample_rate: 16000
	labels: *labels
	batch_size: 16 ##########################
	trim_silence: True
	max_duration: 16.7
	shuffle: True
	num_workers: 8
	pin_memory: true
	# tarred datasets
	is_tarred: false
	tarred_audio_filepaths: null
	shuffle_n: 2048
	# bucketing params
	bucketing_strategy: "synced_randomized"
	bucketing_batch_size: null

	validation_ds:
	manifest_filepath: ???
	sample_rate: 16000
	labels: *labels
	batch_size: 16 ##########################
	shuffle: False
	num_workers: 8
	pin_memory: true

	preprocessor:
	_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
	normalize: "per_feature"
	window_size: 0.02
	sample_rate: *sample_rate
	window_stride: 0.01
	window: "hann"
	features: &n_mels 64
	n_fft: 512
	frame_splicing: 1
	dither: 1.0e-05

	spec_augment:
	_target_: nemo.collections.asr.modules.SpectrogramAugmentation
	rect_freq: 50
	rect_masks: 5
	rect_time: 120

	encoder:
	_target_: nemo.collections.asr.modules.ConvASREncoder
	feat_in: *n_mels
	activation: relu
	conv_mask: true

	jasper:
	#1
	- dilation: [1]
	dropout: *dropout
	filters: 256
	kernel: [33]
	repeat: 1
	residual: false
	separable: *separable
	stride: [2]
	#2
	- dilation: [1]
	dropout: *dropout
	filters: 256
	kernel: [33]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#3
	- dilation: [1]
	dropout: *dropout
	filters: 256
	kernel: [33]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#4
	- dilation: [1]
	dropout: *dropout
	filters: 256
	kernel: [33]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#5
	- dilation: [1]
	dropout: *dropout
	filters: 256
	kernel: [39]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#6
	- dilation: [1]
	dropout: *dropout
	filters: 256
	kernel: [39]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#7
	- dilation: [1]
	dropout: *dropout
	filters: 256
	kernel: [39]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#8
	- dilation: [1]
	dropout: *dropout
	filters: 512
	kernel: [51]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#9
	- dilation: [1]
	dropout: *dropout
	filters: 512
	kernel: [51]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#10
	- dilation: [1]
	dropout: *dropout
	filters: 512
	kernel: [51]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#11
	- dilation: [1]
	dropout: *dropout
	filters: 512
	kernel: [63]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#12
	- dilation: [1]
	dropout: *dropout
	filters: 512
	kernel: [63]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#13
	- dilation: [1]
	dropout: *dropout
	filters: 512
	kernel: [63]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#14
	- dilation: [1]
	dropout: *dropout
	filters: 512
	kernel: [75]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#15
	- dilation: [1]
	dropout: *dropout
	filters: 512
	kernel: [75]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#16
	- dilation: [1]
	dropout: *dropout
	filters: 512
	kernel: [75]
	repeat: *repeat
	residual: true
	separable: *separable
	stride: [1]
	#17
	- dilation: [2]
	dropout: *dropout
	filters: 512
	kernel: [87]
	repeat: 1
	residual: false
	separable: *separable
	stride: [1]
	#18
	- dilation: [1]
	dropout: *dropout
	filters: &enc_filters 1024
	kernel: [1]
	repeat: 1
	residual: false
	stride: [1]

	decoder:
	_target_: nemo.collections.asr.modules.ConvASRDecoder
	feat_in: *enc_filters
	num_classes: 37
	vocabulary: *labels

	optim:
	name: novograd
	# _target_: nemo.core.optim.optimizers.Novograd
	lr: 0.0012
	# optimizer arguments
	betas: [0.95, 0.25]
	weight_decay: 0.001

	# scheduler setup
	sched:
	name: CosineAnnealing

	# pytorch lightning args
	# monitor: val_loss
	# reduce_on_plateau: false

	# Scheduler params
	warmup_steps: null
	warmup_ratio: null
	min_lr: 0.0
	last_epoch: -1

	trainer:
	devices: 1 # number of gpus
	max_epochs: 5
	max_steps: -1 # computed at runtime if not set
	num_nodes: 1
	accelerator: gpu
	strategy: ddp
	accumulate_grad_batches: 1
	enable_checkpointing: False # Provided by exp_manager
	logger: False # Provided by exp_manager
	log_every_n_steps: 1 # Interval of logging.
	val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
	benchmark: false # needs to be false for models with variable-length speech input as it slows down training

	exp_manager:
	exp_dir: null
	name: *name
	create_tensorboard_logger: True
	create_checkpoint_callback: True
	checkpoint_callback_params:
	monitor: "val_wer"
	mode: "min"
	create_wandb_logger: False
	wandb_logger_kwargs:
	name: null
	project: null