# Classifier-free guidance settings.
# NOTE(review): nesting reconstructed from a flattened listing (stray `|`
# lines removed, child keys re-indented) — confirm against the consumer.
classifier_free_guidance:
  training_dropout: 0.1
  inference_coef: 3.0

# Dropout values keyed by attribute name, grouped under `text` and `wav`.
# NOTE(review): nesting reconstructed from a flattened listing. `args`, `text`
# and `wav` carried no inline value and are treated as parent mappings of the
# key that follows each of them — confirm against the consumer's schema.
attribute_dropout:
  args:
    active_on_eval: false
  text:
    description: 0.4
  wav:
    self_wav: 0.4

# Fuser settings. Only `prepend` lists any attributes (`self_wav` and
# `description`); `sum`, `cross` and `input_interpolate` are explicitly empty.
# NOTE(review): nesting reconstructed from a flattened listing — confirm
# against the consumer's schema.
fuser:
  cross_attention_pos_emb: false
  cross_attention_pos_emb_scale: 1
  sum: []
  prepend: [self_wav, description]
  cross: []
  input_interpolate: []

# Conditioner definitions, one entry per attribute name.
# NOTE(review): nesting reconstructed from a flattened listing. Each entry is
# assumed to hold a `model` selector plus a sub-mapping named after that model
# (`style`, `t5`); a flat reading would duplicate the `model` key. Confirm
# against the consumer's schema.
conditioners:
  self_wav:
    model: style
    style:
      model_name: mert
      transformer_scale: default
      # `${...}` values are presumably interpolations resolved from other
      # parts of the configuration — verify the referenced keys exist.
      sample_rate: ${sample_rate}
      encodec_checkpoint: '//pretrained/facebook/encodec_32khz'
      encodec_n_q: 3
      length: 3.0
      ds_factor: 15
      n_q_out: 6
      eval_q: 3
      q_dropout: true
      bins: 1024
      varying_lengths: [1.5, 4.5]
      batch_norm: true
      compute_mask: true
      num_codebooks_lm: ${transformer_lm.n_q}
      ds_rate_compression: 640
      use_middle_of_segment: false
      rvq_threshold_ema_dead_code: 0.1

  description:
    model: t5
    t5:
      name: t5-base
      finetune: false
      word_dropout: 0.2
      normalize_text: false

# Dataset overrides for the training split.
# NOTE(review): nesting reconstructed from a flattened listing. `train` had no
# inline value, so the keys after it are placed beneath it. Whether `shuffle`
# belongs under `train` or directly under `dataset` cannot be told from the
# flattened text — confirm against the consumer's schema.
dataset:
  train:
    merge_text_p: 0.25
    drop_desc_p: 0.5
    drop_other_p: 0.5
    shuffle: true