# Hyperparameters for an attention-based sequence-to-sequence
# grapheme-to-phoneme (G2P) model built with SpeechBrain (HyperPyYAML format).
# The seed is set at the top of the file, before any objects with parameters
# are instantiated.
seed: 1234
__set_seed: !apply:torch.manual_seed [!ref <seed>]
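
# Usage sketch (assumes the standard HyperPyYAML workflow; file name is a
# placeholder): this file is typically loaded as
#
#     from hyperpyyaml import load_hyperpyyaml
#     with open("hyperparams.yaml") as fin:
#         hparams = load_hyperpyyaml(fin)
#
# after which each top-level key (e.g. hparams["model"]) is either a plain
# value or an instantiated object.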

char_tokenize: False
char_token_type: unigram
char_token_output: 512
char_token_wordwise: True
phn_tokenize: False
phn_token_type: unigram
phn_token_output: 512
phn_token_wordwise: True
character_coverage: 1.0

phonemes_count: 43
graphemes_count: 31
phonemes_enable_space: True
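
# Note: graphemes_count and phonemes_count appear to include the three special
# indices defined further below (bos: 0, eos: 1, blank/unk: 2):
# 28 listed graphemes (A-Z, apostrophe, space) + 3 = 31, and
# 40 listed phonemes (39 ARPABET symbols + space) + 3 = 43.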

ctc_weight: 0.5
ctc_window_size: 0
homograph_loss_weight: 2.0

output_neurons: !apply:speechbrain.utils.hparams.choice
    value: !ref <phn_tokenize>
    choices:
        True: !ref <phn_token_output> + 1
        False: !ref <phonemes_count>

enc_num_embeddings: !apply:speechbrain.utils.hparams.choice
    value: !ref <char_tokenize>
    choices:
        True: !ref <char_token_output> + 1
        False: !ref <graphemes_count>
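
# Note: speechbrain.utils.hparams.choice returns the entry of `choices` whose
# key matches `value`. With the defaults above (phn_tokenize: False,
# char_tokenize: False) this resolves to:
#     output_neurons     -> phonemes_count  = 43
#     enc_num_embeddings -> graphemes_count = 31
# With tokenization enabled, the tokenizer vocabulary size plus one extra unit
# is used instead (e.g. 512 + 1).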

enc_dropout: 0.5
enc_neurons: 512
enc_num_layers: 4
dec_dropout: 0.5
dec_neurons: 512
dec_att_neurons: 256
dec_num_layers: 4
embedding_dim: 512

grapheme_sequence_mode: bos
phoneme_sequence_mode: bos

bos_index: 0
eos_index: 1
blank_index: 2
unk_index: 2
token_space_index: 512

lm_emb_dim: 256
lm_rnn_size: 512
lm_layers: 2
lm_output_neurons: 43

beam_search_min_decode_ratio: 0
beam_search_max_decode_ratio: 1.0
beam_search_beam_size: 16
beam_search_beam_size_valid: 16
beam_search_eos_threshold: 10.0
beam_search_using_max_attn_shift: False
beam_search_max_attn_shift: 10
beam_search_coverage_penalty: 5.0
beam_search_lm_weight: 0.5
beam_search_ctc_weight_decode: 0.4
beam_search_temperature: 1.25
beam_search_temperature_lm: 1.0
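
# Note: in SpeechBrain's seq2seq searchers the min/max decode ratios are
# multiplied by the encoder output length to bound hypothesis length, so
# max_decode_ratio: 1.0 allows phoneme sequences up to the length of the
# encoded grapheme sequence.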

use_word_emb: True
word_emb_model: bert-base-uncased
word_emb_dim: 768
word_emb_enc_dim: 256
word_emb_norm_type: batch
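
# Note: word_emb_dim: 768 matches the hidden size of bert-base-uncased; the
# WordEmbeddingEncoder defined below projects these vectors down to
# word_emb_enc_dim (256) before they enter the sequence encoder alongside the
# grapheme embeddings (see enc_input_dim).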

graphemes:
    - A
    - B
    - C
    - D
    - E
    - F
    - G
    - H
    - I
    - J
    - K
    - L
    - M
    - N
    - O
    - P
    - Q
    - R
    - S
    - T
    - U
    - V
    - W
    - X
    - Y
    - Z
    - "'"
    - ' '

phonemes:
    - AA
    - AE
    - AH
    - AO
    - AW
    - AY
    - B
    - CH
    - D
    - DH
    - EH
    - ER
    - EY
    - F
    - G
    - HH
    - IH
    - IY
    - JH
    - K
    - L
    - M
    - N
    - NG
    - OW
    - OY
    - P
    - R
    - S
    - SH
    - T
    - TH
    - UH
    - UW
    - V
    - W
    - Y
    - Z
    - ZH
    - ' '

enc_input_dim: !apply:speechbrain.lobes.models.g2p.model.input_dim
    use_word_emb: !ref <use_word_emb>
    word_emb_enc_dim: !ref <word_emb_enc_dim>
    embedding_dim: !ref <embedding_dim>

phn_char_map: !apply:speechbrain.lobes.models.g2p.dataio.build_token_char_map
    tokens: !ref <phonemes>

char_phn_map: !apply:speechbrain.lobes.models.g2p.dataio.flip_map
    map_dict: !ref <phn_char_map>

enc: !new:speechbrain.nnet.RNN.LSTM
    input_shape: [null, null, !ref <enc_input_dim>]
    bidirectional: True
    hidden_size: !ref <enc_neurons>
    num_layers: !ref <enc_num_layers>
    dropout: !ref <enc_dropout>

lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>
    bias: False

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref 2 * <enc_neurons>
    n_neurons: !ref <output_neurons>
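
# Note: ctc_lin takes 2 * enc_neurons inputs because the LSTM encoder above is
# bidirectional, so its forward and backward outputs are concatenated (the
# decoder's enc_dim below uses the same 2x factor).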

encoder_emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <enc_num_embeddings>
    embedding_dim: !ref <embedding_dim>

emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <embedding_dim>

dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <enc_neurons> * 2
    input_size: !ref <embedding_dim>
    rnn_type: gru
    attn_type: content
    dropout: !ref <dec_dropout>
    hidden_size: !ref <dec_neurons>
    attn_dim: !ref <dec_att_neurons>
    num_layers: !ref <dec_num_layers>

word_emb_enc: !new:speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder
    word_emb_dim: !ref <word_emb_dim>
    word_emb_enc_dim: !ref <word_emb_enc_dim>
    norm_type: !ref <word_emb_norm_type>

word_emb: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
    init: !name:speechbrain.wordemb.transformer.TransformerWordEmbeddings
        model: !ref <word_emb_model>

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

model: !new:speechbrain.lobes.models.g2p.model.AttentionSeq2Seq
    enc: !ref <enc>
    encoder_emb: !ref <encoder_emb>
    emb: !ref <emb>
    dec: !ref <dec>
    lin: !ref <lin>
    out: !ref <log_softmax>
    use_word_emb: !ref <use_word_emb>
    word_emb_enc: !ref <word_emb_enc>

modules:
    model: !ref <model>
    enc: !ref <enc>
    encoder_emb: !ref <encoder_emb>
    emb: !ref <emb>
    dec: !ref <dec>
    lin: !ref <lin>
    ctc_lin: !ref <ctc_lin>
    out: !ref <log_softmax>
    word_emb: !ref <word_emb>
    word_emb_enc: !ref <word_emb_enc>

lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
    embedding_dim: !ref <lm_emb_dim>
    rnn_layers: !ref <lm_layers>
    rnn_neurons: !ref <lm_rnn_size>
    output_neurons: !ref <lm_output_neurons>
    return_hidden: True

ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    ctc_fc: !ref <ctc_lin>
    ctc_window_size: !ref <ctc_window_size>

coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
    vocab_size: !ref <output_neurons>

scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers: [!ref <coverage_scorer>, !ref <ctc_scorer>]
    weights:
        coverage: !ref <beam_search_coverage_penalty>
        ctc: !ref <ctc_weight>
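
# Note: ScorerBuilder applies the listed full scorers at every decoding step;
# the coverage and CTC scores are weighted by the values above and added to
# the decoder log-probabilities during beam search.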

beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <emb>
    decoder: !ref <dec>
    linear: !ref <lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <beam_search_min_decode_ratio>
    max_decode_ratio: !ref <beam_search_max_decode_ratio>
    beam_size: !ref <beam_search_beam_size>
    eos_threshold: !ref <beam_search_eos_threshold>
    using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
    max_attn_shift: !ref <beam_search_max_attn_shift>
    temperature: !ref <beam_search_temperature>
    scorer: !ref <scorer>

beam_searcher_valid: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <emb>
    decoder: !ref <dec>
    linear: !ref <lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <beam_search_min_decode_ratio>
    max_decode_ratio: !ref <beam_search_max_decode_ratio>
    beam_size: !ref <beam_search_beam_size_valid>
    eos_threshold: !ref <beam_search_eos_threshold>
    using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
    max_attn_shift: !ref <beam_search_max_attn_shift>
    temperature: !ref <beam_search_temperature>
    scorer: !ref <scorer>

homograph_extractor: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceExtractor

model_input_keys:
    - grapheme_encoded
    - word_emb

model_output_keys:
    - p_seq
    - char_lens
    - encoder_out

grapheme_encoder: &id027 !new:speechbrain.dataio.encoder.TextEncoder
phoneme_encoder: &id024 !new:speechbrain.dataio.encoder.TextEncoder

grapheme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
    init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
        model_dir: grapheme_tokenizer
        bos_id: !ref <bos_index>
        eos_id: !ref <eos_index>
        unk_id: !ref <unk_index>
        vocab_size: !ref <char_token_output>
        annotation_train: null
        annotation_read: char
        model_type: !ref <char_token_type>
        character_coverage: !ref <character_coverage>
        annotation_format: json
        text_file: grapheme_annotations.txt

phoneme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
    init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
        model_dir: phoneme_tokenizer
        bos_id: !ref <bos_index>
        eos_id: !ref <eos_index>
        unk_id: !ref <unk_index>
        vocab_size: !ref <phn_token_output>
        annotation_train: null
        annotation_read: phn
        model_type: !ref <phn_token_type>
        character_coverage: !ref <character_coverage>
        annotation_format: json
        text_file: null

out_phoneme_decoder_tok: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
    tokenizer: !ref <phoneme_tokenizer>
    char_map: !ref <char_phn_map>
    token_space_index: !ref <token_space_index>
    wordwise: !ref <phn_token_wordwise>

out_phoneme_decoder_raw: !name:speechbrain.lobes.models.g2p.dataio.text_decode
    encoder: !ref <phoneme_encoder>

out_phoneme_decoder: !apply:speechbrain.utils.hparams.choice
    value: False
    choices:
        True: !ref <out_phoneme_decoder_tok>
        False: !ref <out_phoneme_decoder_raw>

encode_pipeline:
    batch: False
    use_padded_data: True
    output_keys:
        - grapheme_list
        - grapheme_encoded_list
        - grapheme_encoded
        - word_emb
    init:
        - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
          encoder: !ref <grapheme_encoder>
          tokens: !ref <graphemes>
          bos_index: !ref <bos_index>
          eos_index: !ref <eos_index>
        - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
          encoder: !ref <phoneme_encoder>
          tokens: !ref <phonemes>
          bos_index: !ref <bos_index>
          eos_index: !ref <eos_index>
    steps:
        - func: !name:speechbrain.lobes.models.g2p.dataio.clean_pipeline
          graphemes: !ref <graphemes>
          takes: txt
          provides: txt_cleaned
        - func: !name:speechbrain.lobes.models.g2p.dataio.grapheme_pipeline
          grapheme_encoder: !ref <grapheme_encoder>
          takes: txt_cleaned
          provides:
              - grapheme_list
              - grapheme_encoded_list
              - grapheme_encoded_raw
        - func: !name:speechbrain.lobes.models.g2p.dataio.add_bos_eos
          encoder: !ref <grapheme_encoder>
          takes: grapheme_encoded_list
          provides:
              - grapheme_encoded
              - grapheme_len
              - grapheme_encoded_eos
              - grapheme_len_eos
        - func: !name:speechbrain.lobes.models.g2p.dataio.word_emb_pipeline
          word_emb: !ref <word_emb>
          grapheme_encoder: !ref <grapheme_encoder>
          use_word_emb: !ref <use_word_emb>
          takes:
              - txt
              - grapheme_encoded
              - grapheme_len
          provides: word_emb

decode_pipeline:
    batch: True
    output_keys:
        - phonemes
    steps:
        - func: !name:speechbrain.lobes.models.g2p.dataio.beam_search_pipeline
          beam_searcher: !ref <beam_searcher>
          takes:
              - char_lens
              - encoder_out
          provides:
              - hyps
              - scores
        - func: !apply:speechbrain.utils.hparams.choice
              value: False
              choices:
                  True: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
                      tokenizer: !ref <phoneme_tokenizer>
                      char_map: !ref <char_phn_map>
                      token_space_index: !ref <token_space_index>
                      wordwise: !ref <phn_token_wordwise>
                  False: !name:speechbrain.lobes.models.g2p.dataio.phoneme_decoder_pipeline
                      phoneme_encoder: !ref <phoneme_encoder>
          takes:
              - hyps
          provides:
              - phonemes

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        ctc_lin: !ref <ctc_lin>
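
# Usage sketch (assumes the standard SpeechBrain workflow; the source path is
# a placeholder): after the YAML is instantiated, the pretrainer fetches and
# loads the listed checkpoints, roughly:
#
#     hparams["pretrainer"].collect_files(default_source="/path/to/pretrained/model")
#     hparams["pretrainer"].load_collected()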