{
  "_name_or_path": "facebook/wav2vec2-base",
  "activation_dropout": 0.0,
  "adapter_attn_dim": null,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [512, 512, 512, 512, 512, 512, 512],
  "conv_kernel": [10, 3, 3, 3, 3, 2, 2],
  "conv_stride": [5, 2, 2, 2, 2, 2, 2],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.0,
  "freeze_feat_extract_train": true,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "VIVOSSPK01",
    "1": "VIVOSSPK02",
    "10": "VIVOSSPK11",
    "11": "VIVOSSPK12",
    "12": "VIVOSSPK13",
    "13": "VIVOSSPK14",
    "14": "VIVOSSPK15",
    "15": "VIVOSSPK16",
    "16": "VIVOSSPK17",
    "17": "VIVOSSPK18",
    "18": "VIVOSSPK19",
    "19": "VIVOSSPK20",
    "2": "VIVOSSPK03",
    "20": "VIVOSSPK21",
    "21": "VIVOSSPK22",
    "22": "VIVOSSPK23",
    "23": "VIVOSSPK24",
    "24": "VIVOSSPK25",
    "25": "VIVOSSPK26",
    "26": "VIVOSSPK27",
    "27": "VIVOSSPK28",
    "28": "VIVOSSPK29",
    "29": "VIVOSSPK30",
    "3": "VIVOSSPK04",
    "30": "VIVOSSPK31",
    "31": "VIVOSSPK32",
    "32": "VIVOSSPK33",
    "33": "VIVOSSPK34",
    "34": "VIVOSSPK35",
    "35": "VIVOSSPK36",
    "36": "VIVOSSPK37",
    "37": "VIVOSSPK38",
    "38": "VIVOSSPK39",
    "39": "VIVOSSPK40",
    "4": "VIVOSSPK05",
    "40": "VIVOSSPK41",
    "41": "VIVOSSPK42",
    "42": "VIVOSSPK43",
    "43": "VIVOSSPK44",
    "44": "VIVOSSPK45",
    "45": "VIVOSSPK46",
    "5": "VIVOSSPK06",
    "6": "VIVOSSPK07",
    "7": "VIVOSSPK08",
    "8": "VIVOSSPK09",
    "9": "VIVOSSPK10"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "VIVOSSPK01": 0,
    "VIVOSSPK02": 1,
    "VIVOSSPK03": 2,
    "VIVOSSPK04": 3,
    "VIVOSSPK05": 4,
    "VIVOSSPK06": 5,
    "VIVOSSPK07": 6,
    "VIVOSSPK08": 7,
    "VIVOSSPK09": 8,
    "VIVOSSPK10": 9,
    "VIVOSSPK11": 10,
    "VIVOSSPK12": 11,
    "VIVOSSPK13": 12,
    "VIVOSSPK14": 13,
    "VIVOSSPK15": 14,
    "VIVOSSPK16": 15,
    "VIVOSSPK17": 16,
    "VIVOSSPK18": 17,
    "VIVOSSPK19": 18,
    "VIVOSSPK20": 19,
    "VIVOSSPK21": 20,
    "VIVOSSPK22": 21,
    "VIVOSSPK23": 22,
    "VIVOSSPK24": 23,
    "VIVOSSPK25": 24,
    "VIVOSSPK26": 25,
    "VIVOSSPK27": 26,
    "VIVOSSPK28": 27,
    "VIVOSSPK29": 28,
    "VIVOSSPK30": 29,
    "VIVOSSPK31": 30,
    "VIVOSSPK32": 31,
    "VIVOSSPK33": 32,
    "VIVOSSPK34": 33,
    "VIVOSSPK35": 34,
    "VIVOSSPK36": 35,
    "VIVOSSPK37": 36,
    "VIVOSSPK38": 37,
    "VIVOSSPK39": 38,
    "VIVOSSPK40": 39,
    "VIVOSSPK41": 40,
    "VIVOSSPK42": 41,
    "VIVOSSPK43": 42,
    "VIVOSSPK44": 43,
    "VIVOSSPK45": 44,
    "VIVOSSPK46": 45
  },
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.0,
  "mask_channel_length": 10,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.0,
  "mask_channel_selection": "static",
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_min_space": 1,
  "mask_time_other": 0.0,
  "mask_time_prob": 0.05,
  "mask_time_selection": "static",
  "model_type": "wav2vec2",
  "no_mask_channel_overlap": false,
  "no_mask_time_overlap": false,
  "num_adapter_layers": 3,
  "num_attention_heads": 12,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "num_negatives": 100,
  "output_hidden_size": 768,
  "pad_token_id": 0,
  "proj_codevector_dim": 256,
  "tdnn_dilation": [1, 2, 3, 1, 1],
  "tdnn_dim": [512, 512, 512, 512, 1500],
  "tdnn_kernel": [5, 3, 3, 1, 1],
  "torch_dtype": "float32",
  "transformers_version": "4.42.4",
  "use_weighted_layer_sum": false,
  "vocab_size": 32,
  "xvector_output_dim": 512
}