# soundchoice-g2p / hyperparams.yaml
# Generated 2022-07-09 from:
# /notebooks/speechbrain/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml
# yamllint disable
# ################################
# Model: LSTM (encoder) + GRU (decoder) (tokenized)
# Authors:
# Loren Lugosch & Mirco Ravanelli 2020
# Artem Ploujnikov 2021
# ################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !apply:torch.manual_seed [!ref <seed>]
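# (What the line above does, as a sketch: HyperPyYAML runs
# torch.manual_seed(1234) at parse time, so every module constructed
# below starts from the same random state.)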
# Tokenizers
char_tokenize: False
char_token_type: unigram # ["unigram", "bpe", "char"]
char_token_output: 512
char_token_wordwise: True
phn_tokenize: False
phn_token_type: unigram # ["unigram", "bpe", "char"]
phn_token_output: 512 # index(blank/eos/bos/unk) = 0
phn_token_wordwise: True
character_coverage: 1.0
phonemes_count: 43
graphemes_count: 31
phonemes_enable_space: True
ctc_weight: 0.5
ctc_window_size: 0
homograph_loss_weight: 2.0
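# How these weights typically enter the objective (a sketch, not the exact
# recipe code):
#   loss = (1 - ctc_weight) * seq2seq_nll + ctc_weight * ctc_loss
# with the homograph disambiguation term additionally scaled by
# homograph_loss_weight.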
# Model parameters
output_neurons: !apply:speechbrain.utils.hparams.choice
  value: !ref <phn_tokenize>
  choices:
    True: !ref <phn_token_output> + 1
    False: !ref <phonemes_count>
enc_num_embeddings: !apply:speechbrain.utils.hparams.choice
  value: !ref <char_tokenize>
  choices:
    True: !ref <char_token_output> + 1
    False: !ref <graphemes_count>
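# hparams.choice resolves to the `choices` entry matching `value`: with
# phn_tokenize: False and char_tokenize: False above, output_neurons
# becomes 43 (phonemes_count) and enc_num_embeddings 31 (graphemes_count).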
enc_dropout: 0.5
enc_neurons: 512
enc_num_layers: 4
dec_dropout: 0.5
dec_neurons: 512
dec_att_neurons: 256
dec_num_layers: 4
embedding_dim: 512
# Determines whether to use BOS (beginning-of-sequence) or EOS (end-of-sequence) tokens
# Available modes:
# raw: no BOS/EOS tokens are added
# bos: a beginning-of-sequence token is added
# eos: an end-of-sequence token is added
grapheme_sequence_mode: bos
phoneme_sequence_mode: bos
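# Illustrative example: in "bos" mode the grapheme sequence for "CAT" is
# encoded as [<bos>, C, A, T]; "eos" mode would yield [C, A, T, <eos>];
# "raw" adds neither token.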
# Special Token information
bos_index: 0
eos_index: 1
blank_index: 2
unk_index: 2
token_space_index: 512
# Language Model
lm_emb_dim: 256 # dimension of the embeddings
lm_rnn_size: 512 # dimension of hidden layers
lm_layers: 2 # number of hidden layers
lm_output_neurons: 43
# Beam Searcher
beam_search_min_decode_ratio: 0
beam_search_max_decode_ratio: 1.0
beam_search_beam_size: 16
beam_search_beam_size_valid: 16
beam_search_eos_threshold: 10.0
beam_search_using_max_attn_shift: false
beam_search_max_attn_shift: 10
beam_search_coverage_penalty: 5.0
beam_search_lm_weight: 0.5
beam_search_ctc_weight_decode: 0.4
beam_search_temperature: 1.25
beam_search_temperature_lm: 1.0
# Word embeddings
use_word_emb: true
word_emb_model: bert-base-uncased
word_emb_dim: 768
word_emb_enc_dim: 256
word_emb_norm_type: batch
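# bert-base-uncased emits 768-dimensional hidden states (word_emb_dim: 768);
# word_emb_enc below projects them to word_emb_enc_dim (256) so they can be
# concatenated with the grapheme embeddings (see enc_input_dim).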
graphemes:
- A
- B
- C
- D
- E
- F
- G
- H
- I
- J
- K
- L
- M
- N
- O
- P
- Q
- R
- S
- T
- U
- V
- W
- X
- Y
- Z
- "'"
- ' '
phonemes:
- AA
- AE
- AH
- AO
- AW
- AY
- B
- CH
- D
- DH
- EH
- ER
- EY
- F
- G
- HH
- IH
- IY
- JH
- K
- L
- M
- N
- NG
- OW
- OY
- P
- R
- S
- SH
- T
- TH
- UH
- UW
- V
- W
- Y
- Z
- ZH
- ' '
enc_input_dim: !apply:speechbrain.lobes.models.g2p.model.input_dim
  use_word_emb: !ref <use_word_emb>
  word_emb_enc_dim: !ref <word_emb_enc_dim>
  embedding_dim: !ref <embedding_dim>
phn_char_map: !apply:speechbrain.lobes.models.g2p.dataio.build_token_char_map
  tokens: !ref <phonemes>
char_phn_map: !apply:speechbrain.lobes.models.g2p.dataio.flip_map
  map_dict: !ref <phn_char_map>
enc: !new:speechbrain.nnet.RNN.LSTM
  input_shape: [null, null, !ref <enc_input_dim>]
  bidirectional: True
  hidden_size: !ref <enc_neurons>
  num_layers: !ref <enc_num_layers>
  dropout: !ref <enc_dropout>
lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <dec_neurons>
  n_neurons: !ref <output_neurons>
  bias: false
ctc_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref 2 * <enc_neurons>
  n_neurons: !ref <output_neurons>
encoder_emb: !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: !ref <enc_num_embeddings>
  embedding_dim: !ref <embedding_dim>
emb: !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: !ref <output_neurons>
  embedding_dim: !ref <embedding_dim>
dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
  enc_dim: !ref <enc_neurons> * 2
  input_size: !ref <embedding_dim>
  rnn_type: gru
  attn_type: content
  dropout: !ref <dec_dropout>
  hidden_size: !ref <dec_neurons>
  attn_dim: !ref <dec_att_neurons>
  num_layers: !ref <dec_num_layers>
word_emb_enc: !new:speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder
  word_emb_dim: !ref <word_emb_dim>
  word_emb_enc_dim: !ref <word_emb_enc_dim>
  norm_type: batch
word_emb: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
  init: !name:speechbrain.wordemb.transformer.TransformerWordEmbeddings
    model: bert-base-uncased
log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: true
model: !new:speechbrain.lobes.models.g2p.model.AttentionSeq2Seq
  enc: !ref <enc>
  encoder_emb: !ref <encoder_emb>
  emb: !ref <emb>
  dec: !ref <dec>
  lin: !ref <lin>
  out: !ref <log_softmax>
  use_word_emb: !ref <use_word_emb>
  word_emb_enc: !ref <word_emb_enc>
modules:
  model: !ref <model>
  enc: !ref <enc>
  encoder_emb: !ref <encoder_emb>
  emb: !ref <emb>
  dec: !ref <dec>
  lin: !ref <lin>
  ctc_lin: !ref <ctc_lin>
  out: !ref <log_softmax>
  word_emb: !ref <word_emb>
  word_emb_enc: !ref <word_emb_enc>
lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
  embedding_dim: !ref <lm_emb_dim>
  rnn_layers: !ref <lm_layers>
  rnn_neurons: !ref <lm_rnn_size>
  output_neurons: !ref <lm_output_neurons>
  return_hidden: True
ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
  eos_index: !ref <eos_index>
  blank_index: !ref <blank_index>
  ctc_fc: !ref <ctc_lin>
  ctc_window_size: !ref <ctc_window_size>
coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
  vocab_size: !ref <output_neurons>
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
  full_scorers: [!ref <coverage_scorer>, !ref <ctc_scorer>]
  weights:
    coverage: !ref <beam_search_coverage_penalty>
    ctc: !ref <ctc_weight>
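# Behavior sketch: during beam search the ScorerBuilder adds
#   beam_search_coverage_penalty * coverage_score + ctc_weight * ctc_score
# to the decoder log-probabilities of each partial hypothesis.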
beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: !ref <emb>
  decoder: !ref <dec>
  linear: !ref <lin>
  bos_index: !ref <bos_index>
  eos_index: !ref <eos_index>
  min_decode_ratio: !ref <beam_search_min_decode_ratio>
  max_decode_ratio: !ref <beam_search_max_decode_ratio>
  beam_size: !ref <beam_search_beam_size>
  eos_threshold: !ref <beam_search_eos_threshold>
  using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
  max_attn_shift: !ref <beam_search_max_attn_shift>
  temperature: !ref <beam_search_temperature>
  scorer: !ref <scorer>
beam_searcher_valid: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: !ref <emb>
  decoder: !ref <dec>
  linear: !ref <lin>
  bos_index: !ref <bos_index>
  eos_index: !ref <eos_index>
  min_decode_ratio: !ref <beam_search_min_decode_ratio>
  max_decode_ratio: !ref <beam_search_max_decode_ratio>
  beam_size: !ref <beam_search_beam_size_valid>
  eos_threshold: !ref <beam_search_eos_threshold>
  using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
  max_attn_shift: !ref <beam_search_max_attn_shift>
  temperature: !ref <beam_search_temperature>
  scorer: !ref <scorer>
homograph_extractor: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceExtractor
# keys to forward from the encoding pipeline to the model
model_input_keys:
- grapheme_encoded
- word_emb
model_output_keys:
- p_seq
- char_lens
- encoder_out
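# Sketch: at inference the wrapper effectively calls
#   p_seq, char_lens, encoder_out = model(grapheme_encoded, word_emb)
# i.e. model_input_keys name the pipeline outputs passed to the model and
# model_output_keys name the values it returns.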
grapheme_encoder: &id027 !new:speechbrain.dataio.encoder.TextEncoder
phoneme_encoder: &id024 !new:speechbrain.dataio.encoder.TextEncoder
grapheme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
  init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
    model_dir: grapheme_tokenizer
    bos_id: !ref <bos_index>
    eos_id: !ref <eos_index>
    unk_id: !ref <unk_index>
    vocab_size: !ref <char_token_output>
    annotation_train: null
    annotation_read: char
    model_type: !ref <char_token_type> # ["unigram", "bpe", "char"]
    character_coverage: !ref <character_coverage>
    annotation_format: json
    text_file: grapheme_annotations.txt
phoneme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
  init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
    model_dir: phoneme_tokenizer
    bos_id: !ref <bos_index>
    eos_id: !ref <eos_index>
    unk_id: !ref <unk_index>
    vocab_size: !ref <phn_token_output>
    annotation_train: null
    annotation_read: phn
    model_type: !ref <phn_token_type> # ["unigram", "bpe", "char"]
    character_coverage: !ref <character_coverage>
    annotation_format: json
    text_file: null
out_phoneme_decoder_tok: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
  tokenizer: !ref <phoneme_tokenizer>
  char_map: !ref <char_phn_map>
  token_space_index: !ref <token_space_index>
  wordwise: !ref <phn_token_wordwise>
out_phoneme_decoder_raw: !name:speechbrain.lobes.models.g2p.dataio.text_decode
  encoder: !ref <phoneme_encoder>
out_phoneme_decoder: !apply:speechbrain.utils.hparams.choice
  value: false
  choices:
    True: !ref <out_phoneme_decoder_tok>
    False: !ref <out_phoneme_decoder_raw>
encode_pipeline:
  batch: false
  use_padded_data: true
  output_keys:
  - grapheme_list
  - grapheme_encoded_list
  - grapheme_encoded
  - word_emb
  init:
  - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
      encoder: !ref <grapheme_encoder>
      tokens: !ref <graphemes>
      bos_index: !ref <bos_index>
      eos_index: !ref <eos_index>
  - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
      encoder: !ref <phoneme_encoder>
      tokens: !ref <phonemes>
      bos_index: !ref <bos_index>
      eos_index: !ref <eos_index>
  steps:
  - func: !name:speechbrain.lobes.models.g2p.dataio.clean_pipeline
      graphemes: !ref <graphemes>
    takes: txt
    provides: txt_cleaned
  - func: !name:speechbrain.lobes.models.g2p.dataio.grapheme_pipeline
      grapheme_encoder: !ref <grapheme_encoder>
    takes: txt_cleaned
    provides:
    - grapheme_list
    - grapheme_encoded_list
    - grapheme_encoded_raw
  - func: !name:speechbrain.lobes.models.g2p.dataio.add_bos_eos
      encoder: !ref <grapheme_encoder>
    takes: grapheme_encoded_list
    provides:
    - grapheme_encoded
    - grapheme_len
    - grapheme_encoded_eos
    - grapheme_len_eos
  - func: !name:speechbrain.lobes.models.g2p.dataio.word_emb_pipeline
      word_emb: !ref <word_emb>
      grapheme_encoder: !ref <grapheme_encoder>
      use_word_emb: !ref <use_word_emb>
    takes:
    - txt
    - grapheme_encoded
    - grapheme_len
    provides: word_emb
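# Net effect of encode_pipeline on one input string (a sketch):
#   txt -> txt_cleaned -> grapheme_list / grapheme_encoded_list
#       -> grapheme_encoded (with BOS) -> word_emb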
decode_pipeline:
  batch: true
  output_keys:
  - phonemes
  steps:
  - func: !name:speechbrain.lobes.models.g2p.dataio.beam_search_pipeline
      beam_searcher: !ref <beam_searcher>
    takes:
    - char_lens
    - encoder_out
    provides:
    - hyps
    - scores
  - func: !apply:speechbrain.utils.hparams.choice
      value: false
      choices:
        True: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
          tokenizer: !ref <phoneme_tokenizer>
          char_map: !ref <char_phn_map>
          token_space_index: !ref <token_space_index>
          wordwise: !ref <phn_token_wordwise>
        False: !name:speechbrain.lobes.models.g2p.dataio.phoneme_decoder_pipeline
          phoneme_encoder: !ref <phoneme_encoder>
    takes:
    - hyps
    provides:
    - phonemes
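# Net effect of decode_pipeline (a sketch): beam search over the encoder
# states produces token hypotheses (hyps), which phoneme_decoder_pipeline
# maps back to phoneme labels through the phoneme_encoder.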
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    model: !ref <model>
    ctc_lin: !ref <ctc_lin>
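# Typical use of this file through the pretrained interface (a sketch; the
# savedir path is an arbitrary example, and older SpeechBrain versions
# expose the same class as speechbrain.pretrained.GraphemeToPhoneme):
#   from speechbrain.inference.text import GraphemeToPhoneme
#   g2p = GraphemeToPhoneme.from_hparams(
#       "speechbrain/soundchoice-g2p",
#       savedir="pretrained_models/soundchoice-g2p",
#   )
#   phonemes = g2p("To be or not to be, that is the question")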