diff --git a/model.safetensors b/model.safetensors index 4bab8f4b54c1f98c14358a5ceeca50e590739e9e..2bd0c74bd3b4926b50964a7bb09c6aa8baae4f50 100644 --- a/model.safetensors +++ b/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bcd38943271d452fb41e0cc1c9ac715cd036eaf1fd6859055f6d2b6c6d5d0ccb +oid sha256:e0ba6fbcbb9cb83150dc1524c9934db5438ee17d96fb72054875ac1e12dab680 size 94763496 diff --git a/run-10/checkpoint-16/config.json b/run-10/checkpoint-16/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-10/checkpoint-16/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-10/checkpoint-16/model.safetensors b/run-10/checkpoint-16/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7f67e62f17f03c7c497d7b0dc5a3a4deb7a11178 --- /dev/null +++ b/run-10/checkpoint-16/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09e0e1ed09ab4236c54bad86ac83bba0721a3c45be23d5ad5a4636f2cf20fd03 +size 94763496 diff --git a/run-10/checkpoint-16/optimizer.pt b/run-10/checkpoint-16/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f300558d6da98ae2abe9c791ad6463353b5d2b05 --- /dev/null +++ b/run-10/checkpoint-16/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c18bc5bb40fa3da2d7b3c237eeba0c375395a207aa64bc8f2838dea14110129 +size 189552570 diff --git a/run-10/checkpoint-16/preprocessor_config.json b/run-10/checkpoint-16/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-10/checkpoint-16/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-10/checkpoint-16/rng_state.pth b/run-10/checkpoint-16/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a346ad642796f7d0ec066bc31d527acb6fe66d82 --- /dev/null +++ b/run-10/checkpoint-16/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10f9b0d9dbbdb79c182bae9eaf551291f2d8c3d49777e82f95f3cb8e351f2f17 +size 14244 diff --git a/run-10/checkpoint-16/scheduler.pt b/run-10/checkpoint-16/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0d7c591180a81af8acbe25daefd669b49fea76b --- /dev/null +++ b/run-10/checkpoint-16/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e9b6b2b845d24d24d86b7c13870d7343a78a4085a1b0978b583b73d3fe1ca4f +size 1064 diff --git a/run-10/checkpoint-16/trainer_state.json b/run-10/checkpoint-16/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..45051792ffc6c1615478695ce6ee6f8681a44cae --- /dev/null +++ b/run-10/checkpoint-16/trainer_state.json @@ -0,0 +1,54 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-10/checkpoint-16", + "epoch": 1.0, + "eval_steps": 500, + "global_step": 16, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.31, + "grad_norm": 0.7738416194915771, + "learning_rate": 2.2081627577150035e-05, + "loss": 0.6945, + "step": 5 + }, + { + "epoch": 0.62, + "grad_norm": 0.5010543465614319, + "learning_rate": 4.416325515430007e-05, + "loss": 0.659, + "step": 10 + }, + { + "epoch": 0.94, + "grad_norm": 1.2033077478408813, + "learning_rate": 6.624488273145011e-05, + "loss": 0.615, + "step": 15 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7318763732910156, + "eval_runtime": 1.3731, + "eval_samples_per_second": 46.611, + "eval_steps_per_second": 5.826, + "step": 16 + } + ], + "logging_steps": 5, + "max_steps": 160, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 1203912462770640.0, + "train_batch_size": 12, + "trial_name": null, + "trial_params": { + "learning_rate": 7.066120824688011e-05, + "per_device_train_batch_size": 12 + } +} diff --git a/run-10/checkpoint-16/training_args.bin b/run-10/checkpoint-16/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..de07d79e263d54b7e5c58f3db207d95ab5a0f893 --- /dev/null +++ b/run-10/checkpoint-16/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb898505a247b10cd68e31d96a6b50b269be05c44e3b90be28396b49a7bcd1a +size 4920 diff --git a/run-10/checkpoint-32/config.json b/run-10/checkpoint-32/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-10/checkpoint-32/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-10/checkpoint-32/model.safetensors b/run-10/checkpoint-32/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f798e0d4f1b35c64e90af8bb1ad61ef63d95aea3 --- /dev/null +++ b/run-10/checkpoint-32/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12376f39eb35250b59bb0f0376276c24b351b2a610bcf9fb5950ff754c74bc34 +size 94763496 diff --git a/run-10/checkpoint-32/optimizer.pt b/run-10/checkpoint-32/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3563eb16a8e9c8342d337bcecbcd700481ce249d --- /dev/null +++ b/run-10/checkpoint-32/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11f9e27ee8577972086ddbeafb6521ca90aa22213daaff0734c4a8bc5632d7b3 +size 189552570 diff --git a/run-10/checkpoint-32/preprocessor_config.json b/run-10/checkpoint-32/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-10/checkpoint-32/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-10/checkpoint-32/rng_state.pth b/run-10/checkpoint-32/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..78d5f7e6798130fa1c69672f9503b7a58c9760d0 --- /dev/null +++ b/run-10/checkpoint-32/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:754d53121980eacb00801af5e57714a1cb2c0271b33c592667ed8ec1e79458ee +size 14244 diff --git a/run-10/checkpoint-32/scheduler.pt b/run-10/checkpoint-32/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..85a250c1bf0f63fda9333d3a3ef4dc055795cfd5 --- /dev/null +++ b/run-10/checkpoint-32/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baedbef244c8d55355c67e7327e12990762aab1a7f2992d792fbb61c790562df +size 1064 diff --git a/run-10/checkpoint-32/trainer_state.json b/run-10/checkpoint-32/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0a59fbd19645a06e185e43644f8c47f54a0f9ead --- /dev/null +++ b/run-10/checkpoint-32/trainer_state.json @@ -0,0 +1,84 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-10/checkpoint-16", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 32, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.31, + "grad_norm": 0.7738416194915771, + "learning_rate": 2.2081627577150035e-05, + "loss": 0.6945, + "step": 5 + }, + { + "epoch": 0.62, + "grad_norm": 0.5010543465614319, + "learning_rate": 4.416325515430007e-05, + "loss": 0.659, + "step": 10 + }, + { + "epoch": 0.94, + "grad_norm": 1.2033077478408813, + "learning_rate": 6.624488273145011e-05, + "loss": 0.615, + "step": 15 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7318763732910156, + "eval_runtime": 1.3731, + "eval_samples_per_second": 46.611, + "eval_steps_per_second": 5.826, + "step": 16 + }, + { + "epoch": 1.25, + "grad_norm": 1.2026809453964233, + "learning_rate": 6.8698396906689e-05, + "loss": 0.6035, + "step": 20 + }, + { + "epoch": 1.56, + "grad_norm": 0.6029912233352661, + "learning_rate": 6.624488273145011e-05, + "loss": 0.5965, + "step": 25 + }, + { + "epoch": 1.88, + "grad_norm": 0.5203647613525391, + "learning_rate": 6.379136855621121e-05, + "loss": 0.6076, + "step": 30 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6890983581542969, + "eval_runtime": 1.371, + "eval_samples_per_second": 46.68, + "eval_steps_per_second": 5.835, + "step": 32 + } + ], + "logging_steps": 5, + "max_steps": 160, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 2082692062957104.0, + "train_batch_size": 12, + "trial_name": null, + "trial_params": { + "learning_rate": 7.066120824688011e-05, + "per_device_train_batch_size": 12 + } +} diff --git a/run-10/checkpoint-32/training_args.bin b/run-10/checkpoint-32/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..de07d79e263d54b7e5c58f3db207d95ab5a0f893 --- /dev/null +++ b/run-10/checkpoint-32/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb898505a247b10cd68e31d96a6b50b269be05c44e3b90be28396b49a7bcd1a +size 4920 diff --git a/run-10/checkpoint-48/config.json b/run-10/checkpoint-48/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-10/checkpoint-48/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-10/checkpoint-48/model.safetensors b/run-10/checkpoint-48/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f6f2989875903676258c5b1bb96df7e8bb030c80 --- /dev/null +++ b/run-10/checkpoint-48/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0d8c951518fa6a0dfac25d6764e32df79ec5ab273a64772d0e6cd71f13f588f +size 94763496 diff --git a/run-10/checkpoint-48/optimizer.pt b/run-10/checkpoint-48/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1d8dc2b1757e0435b6d850ef4b6903796571620 --- /dev/null +++ b/run-10/checkpoint-48/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4932c57b40291139f6e5dfb1b0904bac1231c4a1d478e31c8d8f2fdce122142d +size 189552570 diff --git a/run-10/checkpoint-48/preprocessor_config.json b/run-10/checkpoint-48/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-10/checkpoint-48/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-10/checkpoint-48/rng_state.pth b/run-10/checkpoint-48/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..06baaca658adaf2eb8af67432fed2be9c2931ac5 --- /dev/null +++ b/run-10/checkpoint-48/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6057f3e71568989f3d2442d841f7f161902200ee453a6d60795ac4142ad66214 +size 14244 diff --git a/run-10/checkpoint-48/scheduler.pt b/run-10/checkpoint-48/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c43059072851c1dea6b38871f0c55cc854a5325b --- /dev/null +++ b/run-10/checkpoint-48/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b868aa2a261996895bb2841a3902ab7ce1afc8713bba40c7d045d170c940efbe +size 1064 diff --git a/run-10/checkpoint-48/trainer_state.json b/run-10/checkpoint-48/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0797012679a0384051a08fdc10809f7c13f17975 --- /dev/null +++ b/run-10/checkpoint-48/trainer_state.json @@ -0,0 +1,114 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-10/checkpoint-16", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 48, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.31, + "grad_norm": 0.7738416194915771, + "learning_rate": 2.2081627577150035e-05, + "loss": 0.6945, + "step": 5 + }, + { + "epoch": 0.62, + "grad_norm": 0.5010543465614319, + "learning_rate": 4.416325515430007e-05, + "loss": 0.659, + "step": 10 + }, + { + "epoch": 0.94, + "grad_norm": 1.2033077478408813, + "learning_rate": 6.624488273145011e-05, + "loss": 0.615, + "step": 15 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7318763732910156, + "eval_runtime": 1.3731, + "eval_samples_per_second": 46.611, + "eval_steps_per_second": 5.826, + "step": 16 + }, + { + "epoch": 1.25, + "grad_norm": 1.2026809453964233, + "learning_rate": 6.8698396906689e-05, + "loss": 0.6035, + "step": 20 + }, + { + "epoch": 1.56, + "grad_norm": 0.6029912233352661, + "learning_rate": 6.624488273145011e-05, + "loss": 0.5965, + "step": 25 + }, + { + "epoch": 1.88, + "grad_norm": 0.5203647613525391, + "learning_rate": 6.379136855621121e-05, + "loss": 0.6076, + "step": 30 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6890983581542969, + "eval_runtime": 1.371, + "eval_samples_per_second": 46.68, + "eval_steps_per_second": 5.835, + "step": 32 + }, + { + "epoch": 2.19, + "grad_norm": 1.1778842210769653, + "learning_rate": 6.133785438097233e-05, + "loss": 0.5949, + "step": 35 + }, + { + "epoch": 2.5, + "grad_norm": 1.1317474842071533, + "learning_rate": 5.888434020573343e-05, + "loss": 0.6002, + "step": 40 + }, + { + "epoch": 2.81, + "grad_norm": 1.060381293296814, + "learning_rate": 5.643082603049454e-05, + "loss": 0.5339, + "step": 45 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7039279937744141, + "eval_runtime": 1.3552, + "eval_samples_per_second": 47.224, + "eval_steps_per_second": 5.903, + "step": 48 + } + ], + "logging_steps": 5, + "max_steps": 160, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 2941889291680896.0, + "train_batch_size": 12, + "trial_name": null, + "trial_params": { + "learning_rate": 7.066120824688011e-05, + "per_device_train_batch_size": 12 + } +} diff --git a/run-10/checkpoint-48/training_args.bin b/run-10/checkpoint-48/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..de07d79e263d54b7e5c58f3db207d95ab5a0f893 --- /dev/null +++ b/run-10/checkpoint-48/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb898505a247b10cd68e31d96a6b50b269be05c44e3b90be28396b49a7bcd1a +size 4920 diff --git a/run-10/checkpoint-64/config.json b/run-10/checkpoint-64/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-10/checkpoint-64/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-10/checkpoint-64/model.safetensors b/run-10/checkpoint-64/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cfdb75acd1b4440c9321fb7b361087fd8daccb26 --- /dev/null +++ b/run-10/checkpoint-64/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1581f951841b6771954fb4086e2134ba1e1b915ae9fe4ee6203cd39ff01c7df3 +size 94763496 diff --git a/run-10/checkpoint-64/optimizer.pt b/run-10/checkpoint-64/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b86b51fdec49f7780b2f0c8512a33a4cc115e5c --- /dev/null +++ b/run-10/checkpoint-64/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c91c98f0a35df12aa517f1166c8b7d9fd35d18a9e7d9a39b81487e4c6827fcf3 +size 189552570 diff --git a/run-10/checkpoint-64/preprocessor_config.json b/run-10/checkpoint-64/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-10/checkpoint-64/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-10/checkpoint-64/rng_state.pth b/run-10/checkpoint-64/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f92d50bc52d5722476550ccb117013456ff6f3b7 --- /dev/null +++ b/run-10/checkpoint-64/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f99cb24be430ce14b1c70379f28ee448a7f503908f1c4823ffcd9cfae3f7f0aa +size 14244 diff --git a/run-10/checkpoint-64/scheduler.pt b/run-10/checkpoint-64/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f60b1270bb221efe30631cb14ef9765998d04e09 --- /dev/null +++ b/run-10/checkpoint-64/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f229c4468d27be61716660c3591bfe04cdd650b021c9cad1f8fda6801caf8435 +size 1064 diff --git a/run-10/checkpoint-64/trainer_state.json b/run-10/checkpoint-64/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f2d23f95c3d7fd211c519472fd614677d7ac5e95 --- /dev/null +++ b/run-10/checkpoint-64/trainer_state.json @@ -0,0 +1,144 @@ +{ + "best_metric": 0.7474747474747475, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-10/checkpoint-64", + "epoch": 4.0, + "eval_steps": 500, + "global_step": 64, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.31, + "grad_norm": 0.7738416194915771, + "learning_rate": 2.2081627577150035e-05, + "loss": 0.6945, + "step": 5 + }, + { + "epoch": 0.62, + "grad_norm": 0.5010543465614319, + "learning_rate": 4.416325515430007e-05, + "loss": 0.659, + "step": 10 + }, + { + "epoch": 0.94, + "grad_norm": 1.2033077478408813, + "learning_rate": 6.624488273145011e-05, + "loss": 0.615, + "step": 15 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7318763732910156, + "eval_runtime": 1.3731, + "eval_samples_per_second": 46.611, + "eval_steps_per_second": 5.826, + "step": 16 + }, + { + "epoch": 1.25, + "grad_norm": 1.2026809453964233, + "learning_rate": 6.8698396906689e-05, + "loss": 0.6035, + "step": 20 + }, + { + "epoch": 1.56, + "grad_norm": 0.6029912233352661, + "learning_rate": 6.624488273145011e-05, + "loss": 0.5965, + "step": 25 + }, + { + "epoch": 1.88, + "grad_norm": 0.5203647613525391, + "learning_rate": 6.379136855621121e-05, + "loss": 0.6076, + "step": 30 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6890983581542969, + "eval_runtime": 1.371, + "eval_samples_per_second": 46.68, + "eval_steps_per_second": 5.835, + "step": 32 + }, + { + "epoch": 2.19, + "grad_norm": 1.1778842210769653, + "learning_rate": 6.133785438097233e-05, + "loss": 0.5949, + "step": 35 + }, + { + "epoch": 2.5, + "grad_norm": 1.1317474842071533, + "learning_rate": 5.888434020573343e-05, + "loss": 0.6002, + "step": 40 + }, + { + "epoch": 2.81, + "grad_norm": 1.060381293296814, + "learning_rate": 5.643082603049454e-05, + "loss": 0.5339, + "step": 45 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7039279937744141, + "eval_runtime": 1.3552, + "eval_samples_per_second": 47.224, + "eval_steps_per_second": 5.903, + "step": 48 + }, + { + "epoch": 3.12, + "grad_norm": 1.5415284633636475, + "learning_rate": 5.397731185525564e-05, + "loss": 0.4807, + "step": 50 + }, + { + "epoch": 3.44, + "grad_norm": 5.591915607452393, + "learning_rate": 5.2014500515064535e-05, + "loss": 0.4641, + "step": 55 + }, + { + "epoch": 3.75, + "grad_norm": 7.002699375152588, + "learning_rate": 5.005168917487342e-05, + "loss": 0.4142, + "step": 60 + }, + { + "epoch": 4.0, + "eval_f1": 0.7474747474747475, + "eval_loss": 0.8193864822387695, + "eval_runtime": 1.3515, + "eval_samples_per_second": 47.355, + "eval_steps_per_second": 5.919, + "step": 64 + } + ], + "logging_steps": 5, + "max_steps": 160, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 3761891076086928.0, + "train_batch_size": 12, + "trial_name": null, + "trial_params": { + "learning_rate": 7.066120824688011e-05, + "per_device_train_batch_size": 12 + } +} diff --git a/run-10/checkpoint-64/training_args.bin b/run-10/checkpoint-64/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..de07d79e263d54b7e5c58f3db207d95ab5a0f893 --- /dev/null +++ b/run-10/checkpoint-64/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb898505a247b10cd68e31d96a6b50b269be05c44e3b90be28396b49a7bcd1a +size 4920 diff --git a/run-10/checkpoint-80/config.json b/run-10/checkpoint-80/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-10/checkpoint-80/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-10/checkpoint-80/model.safetensors b/run-10/checkpoint-80/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c7ce6c4cdba86e34e6d13059d9133c69210b6dac --- /dev/null +++ b/run-10/checkpoint-80/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ec5ee3bf50cedb48f0dee3150eaf553deda3fdad31b0d813045a6edc378914d +size 94763496 diff --git a/run-10/checkpoint-80/optimizer.pt b/run-10/checkpoint-80/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6addcffa00f2f4b58adfa21f01c845e98b741800 --- /dev/null +++ b/run-10/checkpoint-80/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f55d5bf90826e6d69aad5c9424f0408a84e3ef1d758da633844a2ff47ef9554d +size 189552570 diff --git a/run-10/checkpoint-80/preprocessor_config.json b/run-10/checkpoint-80/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-10/checkpoint-80/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-10/checkpoint-80/rng_state.pth b/run-10/checkpoint-80/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..69a5340d50aba07724a0c8dab527a5c2a986c330 --- /dev/null +++ b/run-10/checkpoint-80/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c493c99bdb0fe1a782ab86260b28274c099aa481353b3ca24854228ef501405 +size 14244 diff --git a/run-10/checkpoint-80/scheduler.pt b/run-10/checkpoint-80/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..65778a66d6ac548500a50a70ca7a0939ae63ec95 --- /dev/null +++ b/run-10/checkpoint-80/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d785017e8466f929d98f2cf97bb63497f2ba34fd42559c091da6e149a7393901 +size 1064 diff --git a/run-10/checkpoint-80/trainer_state.json b/run-10/checkpoint-80/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f39393265e6892e320d9fc2725c42c6b5c2cc8d5 --- /dev/null +++ b/run-10/checkpoint-80/trainer_state.json @@ -0,0 +1,181 @@ +{ + "best_metric": 0.7474747474747475, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-10/checkpoint-64", + "epoch": 5.0, + "eval_steps": 500, + "global_step": 80, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.31, + "grad_norm": 0.7738416194915771, + "learning_rate": 2.2081627577150035e-05, + "loss": 0.6945, + "step": 5 + }, + { + "epoch": 0.62, + "grad_norm": 0.5010543465614319, + "learning_rate": 4.416325515430007e-05, + "loss": 0.659, + "step": 10 + }, + { + "epoch": 0.94, + "grad_norm": 1.2033077478408813, + "learning_rate": 6.624488273145011e-05, + "loss": 0.615, + "step": 15 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7318763732910156, + "eval_runtime": 1.3731, + "eval_samples_per_second": 46.611, + "eval_steps_per_second": 5.826, + "step": 16 + }, + { + "epoch": 1.25, + "grad_norm": 1.2026809453964233, + "learning_rate": 6.8698396906689e-05, + "loss": 0.6035, + "step": 20 + }, + { + "epoch": 1.56, + "grad_norm": 0.6029912233352661, + "learning_rate": 6.624488273145011e-05, + "loss": 0.5965, + "step": 25 + }, + { + "epoch": 1.88, + "grad_norm": 0.5203647613525391, + "learning_rate": 6.379136855621121e-05, + "loss": 0.6076, + "step": 30 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6890983581542969, + "eval_runtime": 1.371, + "eval_samples_per_second": 46.68, + "eval_steps_per_second": 5.835, + "step": 32 + }, + { + "epoch": 2.19, + "grad_norm": 1.1778842210769653, + "learning_rate": 6.133785438097233e-05, + "loss": 0.5949, + "step": 35 + }, + { + "epoch": 2.5, + "grad_norm": 1.1317474842071533, + "learning_rate": 5.888434020573343e-05, + "loss": 0.6002, + "step": 40 + }, + { + "epoch": 2.81, + "grad_norm": 1.060381293296814, + "learning_rate": 5.643082603049454e-05, + "loss": 0.5339, + "step": 45 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7039279937744141, + "eval_runtime": 1.3552, + "eval_samples_per_second": 47.224, + "eval_steps_per_second": 5.903, + "step": 48 + }, + { + "epoch": 3.12, + "grad_norm": 1.5415284633636475, + "learning_rate": 5.397731185525564e-05, + "loss": 0.4807, + "step": 50 + }, + { + "epoch": 3.44, + "grad_norm": 5.591915607452393, + "learning_rate": 5.2014500515064535e-05, + "loss": 0.4641, + "step": 55 + }, + { + "epoch": 3.75, + "grad_norm": 7.002699375152588, + "learning_rate": 5.005168917487342e-05, + "loss": 0.4142, + "step": 60 + }, + { + "epoch": 4.0, + "eval_f1": 0.7474747474747475, + "eval_loss": 0.8193864822387695, + "eval_runtime": 1.3515, + "eval_samples_per_second": 47.355, + "eval_steps_per_second": 5.919, + "step": 64 + }, + { + "epoch": 4.06, + "grad_norm": 5.908353328704834, + "learning_rate": 4.759817499963453e-05, + "loss": 0.4754, + "step": 65 + }, + { + "epoch": 4.38, + "grad_norm": 6.322163105010986, + "learning_rate": 4.5144660824395625e-05, + "loss": 0.316, + "step": 70 + }, + { + "epoch": 4.69, + "grad_norm": 8.423164367675781, + "learning_rate": 4.2691146649156735e-05, + "loss": 0.3177, + "step": 75 + }, + { + "epoch": 5.0, + "grad_norm": 18.0563907623291, + "learning_rate": 4.0237632473917844e-05, + "loss": 0.2076, + "step": 80 + }, + { + "epoch": 5.0, + "eval_f1": 0.7142857142857143, + "eval_loss": 0.738029956817627, + "eval_runtime": 1.3684, + "eval_samples_per_second": 46.769, + "eval_steps_per_second": 5.846, + "step": 80 + } + ], + "logging_steps": 5, + "max_steps": 160, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 4921636174601328.0, + "train_batch_size": 12, + "trial_name": null, + "trial_params": { + "learning_rate": 7.066120824688011e-05, + "per_device_train_batch_size": 12 + } +} diff --git a/run-10/checkpoint-80/training_args.bin b/run-10/checkpoint-80/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..de07d79e263d54b7e5c58f3db207d95ab5a0f893 --- /dev/null +++ b/run-10/checkpoint-80/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb898505a247b10cd68e31d96a6b50b269be05c44e3b90be28396b49a7bcd1a +size 4920 diff --git a/run-10/checkpoint-96/config.json b/run-10/checkpoint-96/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-10/checkpoint-96/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-10/checkpoint-96/model.safetensors b/run-10/checkpoint-96/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0975f699563e7427af3d513ea4d25f5199c43396 --- /dev/null +++ b/run-10/checkpoint-96/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6074092ec3e0c3719e90aca48cee3ebf8dfbfd42c3148a6817615f32433dd788 +size 94763496 diff --git a/run-10/checkpoint-96/optimizer.pt b/run-10/checkpoint-96/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1408353724451839fd3c1a5c518a501ef55ddc09 --- /dev/null +++ b/run-10/checkpoint-96/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:997aff2af133caa28979c3ace21fe9c1ea09a8d793ba895997264c93050ca7ea +size 189552570 diff --git a/run-10/checkpoint-96/preprocessor_config.json b/run-10/checkpoint-96/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-10/checkpoint-96/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-10/checkpoint-96/rng_state.pth b/run-10/checkpoint-96/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d17e4c93111149cb6e5dcc5811c5068ba8d12a24 --- /dev/null +++ b/run-10/checkpoint-96/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53f396ce65ed9535364efa4f874662b8f07e93a8b1799db89be140bf009657c2 +size 14244 diff --git a/run-10/checkpoint-96/scheduler.pt b/run-10/checkpoint-96/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..13f202e676331c7fc284402d1ca3b67dacc9502c --- /dev/null +++ b/run-10/checkpoint-96/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1abc4621edce8d381e622ce3c0621a0d70c692d20523191a971e408b800ebad +size 1064 diff --git a/run-10/checkpoint-96/trainer_state.json b/run-10/checkpoint-96/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e657a63c8d0e2e3a35d9b2e52ae6b767209efad4 --- /dev/null +++ b/run-10/checkpoint-96/trainer_state.json @@ -0,0 +1,211 @@ +{ + "best_metric": 0.7474747474747475, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-10/checkpoint-64", + "epoch": 6.0, + "eval_steps": 500, + "global_step": 96, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.31, + "grad_norm": 0.7738416194915771, + "learning_rate": 2.2081627577150035e-05, + "loss": 0.6945, + "step": 5 + }, + { + "epoch": 0.62, + "grad_norm": 0.5010543465614319, + "learning_rate": 4.416325515430007e-05, + "loss": 0.659, + "step": 10 + }, + { + "epoch": 0.94, + "grad_norm": 1.2033077478408813, + "learning_rate": 6.624488273145011e-05, + "loss": 0.615, + "step": 15 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7318763732910156, + "eval_runtime": 1.3731, + "eval_samples_per_second": 46.611, + "eval_steps_per_second": 5.826, + "step": 16 + }, + { + "epoch": 1.25, + "grad_norm": 1.2026809453964233, + "learning_rate": 6.8698396906689e-05, + "loss": 0.6035, + "step": 20 + }, + { + "epoch": 1.56, + "grad_norm": 0.6029912233352661, + "learning_rate": 6.624488273145011e-05, + "loss": 0.5965, + "step": 25 + }, + { + "epoch": 1.88, + "grad_norm": 0.5203647613525391, + "learning_rate": 6.379136855621121e-05, + "loss": 0.6076, + "step": 30 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6890983581542969, + "eval_runtime": 1.371, + "eval_samples_per_second": 46.68, + "eval_steps_per_second": 5.835, + "step": 32 + }, + { + "epoch": 2.19, + "grad_norm": 1.1778842210769653, + "learning_rate": 6.133785438097233e-05, + "loss": 0.5949, + "step": 35 + }, + { + "epoch": 2.5, + "grad_norm": 1.1317474842071533, + "learning_rate": 5.888434020573343e-05, + "loss": 0.6002, + "step": 40 + }, + { + "epoch": 2.81, + "grad_norm": 1.060381293296814, + "learning_rate": 5.643082603049454e-05, + "loss": 0.5339, + "step": 45 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7039279937744141, + "eval_runtime": 1.3552, + "eval_samples_per_second": 47.224, + "eval_steps_per_second": 5.903, + "step": 48 + }, + { + "epoch": 3.12, + "grad_norm": 1.5415284633636475, + "learning_rate": 5.397731185525564e-05, + "loss": 0.4807, + "step": 50 + }, + { + "epoch": 3.44, + "grad_norm": 5.591915607452393, + "learning_rate": 5.2014500515064535e-05, + "loss": 0.4641, + "step": 55 + }, + { + "epoch": 3.75, + "grad_norm": 7.002699375152588, + "learning_rate": 5.005168917487342e-05, + "loss": 0.4142, + "step": 60 + }, + { + "epoch": 4.0, + "eval_f1": 0.7474747474747475, + "eval_loss": 0.8193864822387695, + "eval_runtime": 1.3515, + "eval_samples_per_second": 47.355, + "eval_steps_per_second": 5.919, + "step": 64 + }, + { + "epoch": 4.06, + "grad_norm": 5.908353328704834, + "learning_rate": 4.759817499963453e-05, + "loss": 0.4754, + "step": 65 + }, + { + "epoch": 4.38, + "grad_norm": 6.322163105010986, + "learning_rate": 4.5144660824395625e-05, + "loss": 0.316, + "step": 70 + }, + { + "epoch": 4.69, + "grad_norm": 8.423164367675781, + "learning_rate": 4.2691146649156735e-05, + "loss": 0.3177, + "step": 75 + }, + { + "epoch": 5.0, + "grad_norm": 18.0563907623291, + "learning_rate": 4.0237632473917844e-05, + "loss": 0.2076, + "step": 80 + }, + { + "epoch": 5.0, + "eval_f1": 0.7142857142857143, + "eval_loss": 0.738029956817627, + "eval_runtime": 1.3684, + "eval_samples_per_second": 46.769, + "eval_steps_per_second": 5.846, + "step": 80 + }, + { + "epoch": 5.31, + "grad_norm": 18.247249603271484, + "learning_rate": 3.778411829867895e-05, + "loss": 0.2001, + "step": 85 + }, + { + "epoch": 5.62, + "grad_norm": 8.62263298034668, + "learning_rate": 3.582130695848783e-05, + "loss": 0.236, + "step": 90 + }, + { + "epoch": 5.94, + "grad_norm": 31.015357971191406, + "learning_rate": 3.336779278324894e-05, + "loss": 0.2392, + "step": 95 + }, + { + "epoch": 6.0, + "eval_f1": 0.6857142857142857, + "eval_loss": 0.8250775337219238, + "eval_runtime": 1.351, + "eval_samples_per_second": 47.372, + "eval_steps_per_second": 5.922, + "step": 96 + } + ], + "logging_steps": 5, + "max_steps": 160, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 5757413357810448.0, + "train_batch_size": 12, + "trial_name": null, + "trial_params": { + "learning_rate": 7.066120824688011e-05, + "per_device_train_batch_size": 12 + } +} diff --git a/run-10/checkpoint-96/training_args.bin b/run-10/checkpoint-96/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..de07d79e263d54b7e5c58f3db207d95ab5a0f893 --- /dev/null +++ b/run-10/checkpoint-96/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb898505a247b10cd68e31d96a6b50b269be05c44e3b90be28396b49a7bcd1a +size 4920 diff --git a/run-11/checkpoint-144/config.json b/run-11/checkpoint-144/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-11/checkpoint-144/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-11/checkpoint-144/model.safetensors b/run-11/checkpoint-144/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eb2b311006c0e0a54cf8abf0d1fed03f13ee91df --- /dev/null +++ b/run-11/checkpoint-144/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e49643f91ab8713ba5b01b3d265ed655a8716946103857daef116458aca433b +size 94763496 diff --git a/run-11/checkpoint-144/optimizer.pt b/run-11/checkpoint-144/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4cd719a6641265c0c15fdf603a9f49d4dd70992 --- /dev/null +++ b/run-11/checkpoint-144/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cc7c304f10d63d383de94a4cfd8f22a6653c873c46b4a4da5f3e52151e9bb52 +size 189552570 diff --git a/run-11/checkpoint-144/preprocessor_config.json b/run-11/checkpoint-144/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-11/checkpoint-144/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-11/checkpoint-144/rng_state.pth b/run-11/checkpoint-144/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..35674c44088a3585f8cd11a1eb144d356856a804 --- /dev/null +++ b/run-11/checkpoint-144/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87b8bdd7b4355fd23f0b8256efb0158e4240e11263e992a13d50944c37692500 +size 14244 diff --git a/run-11/checkpoint-144/scheduler.pt b/run-11/checkpoint-144/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0e89219334048e79498d5160a1ff185481e1a80 --- /dev/null +++ b/run-11/checkpoint-144/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:590bd6d52a677075277905f4584dc0b299d568b15a2520aee7f8d837ac8da60e +size 1064 diff --git a/run-11/checkpoint-144/trainer_state.json b/run-11/checkpoint-144/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..51d23e092895d2b0ea8f17e5d0242734a5b9dede --- /dev/null +++ b/run-11/checkpoint-144/trainer_state.json @@ -0,0 +1,247 @@ +{ + "best_metric": 0.7474747474747475, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-11/checkpoint-144", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 144, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2641795873641968, + "learning_rate": 4.277342370667828e-06, + "loss": 0.702, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.1789261102676392, + "learning_rate": 8.554684741335655e-06, + "loss": 0.6864, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.3848576545715332, + "learning_rate": 1.2832027112003484e-05, + "loss": 0.6855, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 0.953545093536377, + "learning_rate": 1.710936948267131e-05, + "loss": 0.6668, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 0.9704298377037048, + "learning_rate": 2.138671185333914e-05, + "loss": 0.6426, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 0.7949383854866028, + "learning_rate": 2.5664054224006968e-05, + "loss": 0.6293, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 2.390880823135376, + "learning_rate": 2.994139659467479e-05, + "loss": 0.5631, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 1.7147374153137207, + "learning_rate": 3.421873896534262e-05, + "loss": 0.6562, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 3.123836040496826, + "learning_rate": 3.849608133601045e-05, + "loss": 0.6243, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7374954223632812, + "eval_runtime": 1.3701, + "eval_samples_per_second": 46.713, + "eval_steps_per_second": 5.839, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 4.445352077484131, + "learning_rate": 4.0872382653048134e-05, + "loss": 0.6965, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 0.8689994215965271, + "learning_rate": 4.03971223896406e-05, + "loss": 0.6945, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 1.744273066520691, + "learning_rate": 3.992186212623306e-05, + "loss": 0.4097, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 0.6568699479103088, + "learning_rate": 3.944660186282552e-05, + "loss": 0.5621, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 1.516032099723816, + "learning_rate": 3.8971341599417986e-05, + "loss": 0.4826, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": Infinity, + "learning_rate": 3.8591133388691955e-05, + "loss": 0.7666, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 1.788261890411377, + "learning_rate": 3.811587312528442e-05, + "loss": 0.6187, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 1.7383731603622437, + "learning_rate": 3.764061286187688e-05, + "loss": 0.5974, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 1.9014606475830078, + "learning_rate": 3.7165352598469344e-05, + "loss": 0.5979, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 2.1836445331573486, + "learning_rate": 3.669009233506181e-05, + "loss": 0.6104, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6798496246337891, + "eval_runtime": 1.3616, + "eval_samples_per_second": 47.003, + "eval_steps_per_second": 5.875, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 1.0497145652770996, + "learning_rate": 3.6214832071654276e-05, + "loss": 0.5777, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.1734980344772339, + "learning_rate": 3.573957180824674e-05, + "loss": 0.4793, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 1.6985560655593872, + "learning_rate": 3.52643115448392e-05, + "loss": 0.5708, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.110007882118225, + "learning_rate": 3.4789051281431665e-05, + "loss": 0.5446, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": Infinity, + "learning_rate": 3.4408843070705634e-05, + "loss": 0.5224, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 4.383241653442383, + "learning_rate": 3.39335828072981e-05, + "loss": 0.4492, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 20.106430053710938, + "learning_rate": 3.355337459657207e-05, + "loss": 0.6228, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 3.188138008117676, + "learning_rate": 3.3078114333164536e-05, + "loss": 0.3671, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 3.840233087539673, + "learning_rate": 3.2602854069757e-05, + "loss": 0.3217, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7474747474747475, + "eval_loss": 0.8114051818847656, + "eval_runtime": 1.4005, + "eval_samples_per_second": 45.699, + "eval_steps_per_second": 5.712, + "step": 144 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 2226487719780480.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 4.1062486758411146e-05, + "per_device_train_batch_size": 4 + } +} diff --git a/run-11/checkpoint-144/training_args.bin b/run-11/checkpoint-144/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eee85774a254ee9284e54bbcaa537c00a3ec285b --- /dev/null +++ b/run-11/checkpoint-144/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711d11e21d598840c9a2dcfe02c910b7483cbe50927c465ca1b21773cad22965 +size 4920 diff --git a/run-11/checkpoint-192/config.json b/run-11/checkpoint-192/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-11/checkpoint-192/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-11/checkpoint-192/model.safetensors b/run-11/checkpoint-192/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7aa6f63b169466bb2674f20f0e57c2995eb69ee6 --- /dev/null +++ b/run-11/checkpoint-192/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5958dfc88d1ce593029d2a7b83ac41c448df84d41ebc73895e463f8624f26371 +size 94763496 diff --git a/run-11/checkpoint-192/optimizer.pt b/run-11/checkpoint-192/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2835cee619523aaa2b270c34109e027190460059 --- /dev/null +++ b/run-11/checkpoint-192/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e357403872c4a0a3d2aa98909ade4933ec2a296838abf27cadf1fe1142d00a37 +size 189552570 diff --git a/run-11/checkpoint-192/preprocessor_config.json b/run-11/checkpoint-192/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-11/checkpoint-192/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-11/checkpoint-192/rng_state.pth b/run-11/checkpoint-192/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4d76370ee36700e6a498a1fbfff621aca7984a77 --- /dev/null +++ b/run-11/checkpoint-192/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5aef020ca2df517540ac9ff4e195e1c41a7b85939e93195d118078f119bc949 +size 14244 diff --git a/run-11/checkpoint-192/scheduler.pt b/run-11/checkpoint-192/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..19bde5920317068bd42f2ac5078ad9452e901289 --- /dev/null +++ b/run-11/checkpoint-192/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9be988732abd20dc50c7fa3f4ac877acc8d2f3a37a9c7e3f84fcf95e642a2c2e +size 1064 diff --git a/run-11/checkpoint-192/trainer_state.json b/run-11/checkpoint-192/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ad17501bc3b5d3ec254c9c279b28c20b6ed16fdb --- /dev/null +++ b/run-11/checkpoint-192/trainer_state.json @@ -0,0 +1,326 @@ +{ + "best_metric": 0.7500000000000001, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-11/checkpoint-192", + "epoch": 4.0, + "eval_steps": 500, + "global_step": 192, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2641795873641968, + "learning_rate": 4.277342370667828e-06, + "loss": 0.702, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.1789261102676392, + "learning_rate": 8.554684741335655e-06, + "loss": 0.6864, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.3848576545715332, + "learning_rate": 1.2832027112003484e-05, + "loss": 0.6855, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 0.953545093536377, + "learning_rate": 1.710936948267131e-05, + "loss": 0.6668, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 0.9704298377037048, + "learning_rate": 2.138671185333914e-05, + "loss": 0.6426, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 0.7949383854866028, + "learning_rate": 2.5664054224006968e-05, + "loss": 0.6293, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 2.390880823135376, + "learning_rate": 2.994139659467479e-05, + "loss": 0.5631, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 1.7147374153137207, + "learning_rate": 3.421873896534262e-05, + "loss": 0.6562, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 3.123836040496826, + "learning_rate": 3.849608133601045e-05, + "loss": 0.6243, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7374954223632812, + "eval_runtime": 1.3701, + "eval_samples_per_second": 46.713, + "eval_steps_per_second": 5.839, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 4.445352077484131, + "learning_rate": 4.0872382653048134e-05, + "loss": 0.6965, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 0.8689994215965271, + "learning_rate": 4.03971223896406e-05, + "loss": 0.6945, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 1.744273066520691, + "learning_rate": 3.992186212623306e-05, + "loss": 0.4097, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 0.6568699479103088, + "learning_rate": 3.944660186282552e-05, + "loss": 0.5621, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 1.516032099723816, + "learning_rate": 3.8971341599417986e-05, + "loss": 0.4826, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": Infinity, + "learning_rate": 3.8591133388691955e-05, + "loss": 0.7666, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 1.788261890411377, + "learning_rate": 3.811587312528442e-05, + "loss": 0.6187, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 1.7383731603622437, + "learning_rate": 3.764061286187688e-05, + "loss": 0.5974, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 1.9014606475830078, + "learning_rate": 3.7165352598469344e-05, + "loss": 0.5979, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 2.1836445331573486, + "learning_rate": 3.669009233506181e-05, + "loss": 0.6104, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6798496246337891, + "eval_runtime": 1.3616, + "eval_samples_per_second": 47.003, + "eval_steps_per_second": 5.875, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 1.0497145652770996, + "learning_rate": 3.6214832071654276e-05, + "loss": 0.5777, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.1734980344772339, + "learning_rate": 3.573957180824674e-05, + "loss": 0.4793, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 1.6985560655593872, + "learning_rate": 3.52643115448392e-05, + "loss": 0.5708, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.110007882118225, + "learning_rate": 3.4789051281431665e-05, + "loss": 0.5446, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": Infinity, + "learning_rate": 3.4408843070705634e-05, + "loss": 0.5224, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 4.383241653442383, + "learning_rate": 3.39335828072981e-05, + "loss": 0.4492, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 20.106430053710938, + "learning_rate": 3.355337459657207e-05, + "loss": 0.6228, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 3.188138008117676, + "learning_rate": 3.3078114333164536e-05, + "loss": 0.3671, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 3.840233087539673, + "learning_rate": 3.2602854069757e-05, + "loss": 0.3217, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7474747474747475, + "eval_loss": 0.8114051818847656, + "eval_runtime": 1.4005, + "eval_samples_per_second": 45.699, + "eval_steps_per_second": 5.712, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 2.789161205291748, + "learning_rate": 3.212759380634946e-05, + "loss": 0.4985, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 20.845867156982422, + "learning_rate": 3.1652333542941924e-05, + "loss": 0.3366, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 3.9109976291656494, + "learning_rate": 3.117707327953439e-05, + "loss": 0.2834, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 5.4618916511535645, + "learning_rate": 3.070181301612686e-05, + "loss": 0.4844, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 4.305176258087158, + "learning_rate": 3.0226552752719317e-05, + "loss": 0.2045, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 15.64925765991211, + "learning_rate": 2.9751292489311783e-05, + "loss": 0.2373, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 10.34076976776123, + "learning_rate": 2.9276032225904242e-05, + "loss": 0.4372, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": Infinity, + "learning_rate": 2.8895824015178215e-05, + "loss": 0.2174, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 1.866976022720337, + "learning_rate": 2.8420563751770678e-05, + "loss": 0.2674, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 12.544903755187988, + "learning_rate": 2.794530348836314e-05, + "loss": 0.3761, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7500000000000001, + "eval_loss": 0.9909818172454834, + "eval_runtime": 1.3645, + "eval_samples_per_second": 46.904, + "eval_steps_per_second": 5.863, + "step": 192 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 2996368343978784.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 4.1062486758411146e-05, + "per_device_train_batch_size": 4 + } +} diff --git a/run-11/checkpoint-192/training_args.bin b/run-11/checkpoint-192/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eee85774a254ee9284e54bbcaa537c00a3ec285b --- /dev/null +++ b/run-11/checkpoint-192/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711d11e21d598840c9a2dcfe02c910b7483cbe50927c465ca1b21773cad22965 +size 4920 diff --git a/run-11/checkpoint-240/config.json b/run-11/checkpoint-240/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-11/checkpoint-240/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-11/checkpoint-240/model.safetensors b/run-11/checkpoint-240/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2c66ab4cd162a2659a5649ab1dc4e5066eeedf53 --- /dev/null +++ b/run-11/checkpoint-240/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79d73e5f8a719630b7a344545b472c346b3dc9c8c9ee321ce15340e867db4f9a +size 94763496 diff --git a/run-11/checkpoint-240/optimizer.pt b/run-11/checkpoint-240/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd11d6a5a79a38712063929c9031e0b035905809 --- /dev/null +++ b/run-11/checkpoint-240/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7656720c1d61ea5ecc49fbf9988a8afdd83edb96c9680ce1f1bf8f7d84f067c2 +size 189552570 diff --git a/run-11/checkpoint-240/preprocessor_config.json b/run-11/checkpoint-240/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-11/checkpoint-240/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-11/checkpoint-240/rng_state.pth b/run-11/checkpoint-240/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6935c8ee6dffa468628cd166bcbf40c96bd4b606 --- /dev/null +++ b/run-11/checkpoint-240/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8eb071c49709b6f4047e7f48105f0dd51daaf73e0a11fd742255aa4c3526f42 +size 14244 diff --git a/run-11/checkpoint-240/scheduler.pt b/run-11/checkpoint-240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf66d3168c58cce96f687918c7b27f3a644c23b2 --- /dev/null +++ b/run-11/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e40867b67ff240e3f031d74e394bc8d763e6fe820ca8585e5a2e0bea349054d +size 1064 diff --git a/run-11/checkpoint-240/trainer_state.json b/run-11/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e6d3f931152b7ba200ed7e11a6fac0f7373a269b --- /dev/null +++ b/run-11/checkpoint-240/trainer_state.json @@ -0,0 +1,405 @@ +{ + "best_metric": 0.7586206896551725, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-11/checkpoint-240", + "epoch": 5.0, + "eval_steps": 500, + "global_step": 240, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2641795873641968, + "learning_rate": 4.277342370667828e-06, + "loss": 0.702, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.1789261102676392, + "learning_rate": 8.554684741335655e-06, + "loss": 0.6864, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.3848576545715332, + "learning_rate": 1.2832027112003484e-05, + "loss": 0.6855, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 0.953545093536377, + "learning_rate": 1.710936948267131e-05, + "loss": 0.6668, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 0.9704298377037048, + "learning_rate": 2.138671185333914e-05, + "loss": 0.6426, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 0.7949383854866028, + "learning_rate": 2.5664054224006968e-05, + "loss": 0.6293, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 2.390880823135376, + "learning_rate": 2.994139659467479e-05, + "loss": 0.5631, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 1.7147374153137207, + "learning_rate": 3.421873896534262e-05, + "loss": 0.6562, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 3.123836040496826, + "learning_rate": 3.849608133601045e-05, + "loss": 0.6243, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7374954223632812, + "eval_runtime": 1.3701, + "eval_samples_per_second": 46.713, + "eval_steps_per_second": 5.839, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 4.445352077484131, + "learning_rate": 4.0872382653048134e-05, + "loss": 0.6965, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 0.8689994215965271, + "learning_rate": 4.03971223896406e-05, + "loss": 0.6945, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 1.744273066520691, + "learning_rate": 3.992186212623306e-05, + "loss": 0.4097, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 0.6568699479103088, + "learning_rate": 3.944660186282552e-05, + "loss": 0.5621, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 1.516032099723816, + "learning_rate": 3.8971341599417986e-05, + "loss": 0.4826, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": Infinity, + "learning_rate": 3.8591133388691955e-05, + "loss": 0.7666, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 1.788261890411377, + "learning_rate": 3.811587312528442e-05, + "loss": 0.6187, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 1.7383731603622437, + "learning_rate": 3.764061286187688e-05, + "loss": 0.5974, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 1.9014606475830078, + "learning_rate": 3.7165352598469344e-05, + "loss": 0.5979, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 2.1836445331573486, + "learning_rate": 3.669009233506181e-05, + "loss": 0.6104, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6798496246337891, + "eval_runtime": 1.3616, + "eval_samples_per_second": 47.003, + "eval_steps_per_second": 5.875, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 1.0497145652770996, + "learning_rate": 3.6214832071654276e-05, + "loss": 0.5777, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.1734980344772339, + "learning_rate": 3.573957180824674e-05, + "loss": 0.4793, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 1.6985560655593872, + "learning_rate": 3.52643115448392e-05, + "loss": 0.5708, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.110007882118225, + "learning_rate": 3.4789051281431665e-05, + "loss": 0.5446, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": Infinity, + "learning_rate": 3.4408843070705634e-05, + "loss": 0.5224, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 4.383241653442383, + "learning_rate": 3.39335828072981e-05, + "loss": 0.4492, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 20.106430053710938, + "learning_rate": 3.355337459657207e-05, + "loss": 0.6228, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 3.188138008117676, + "learning_rate": 3.3078114333164536e-05, + "loss": 0.3671, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 3.840233087539673, + "learning_rate": 3.2602854069757e-05, + "loss": 0.3217, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7474747474747475, + "eval_loss": 0.8114051818847656, + "eval_runtime": 1.4005, + "eval_samples_per_second": 45.699, + "eval_steps_per_second": 5.712, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 2.789161205291748, + "learning_rate": 3.212759380634946e-05, + "loss": 0.4985, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 20.845867156982422, + "learning_rate": 3.1652333542941924e-05, + "loss": 0.3366, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 3.9109976291656494, + "learning_rate": 3.117707327953439e-05, + "loss": 0.2834, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 5.4618916511535645, + "learning_rate": 3.070181301612686e-05, + "loss": 0.4844, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 4.305176258087158, + "learning_rate": 3.0226552752719317e-05, + "loss": 0.2045, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 15.64925765991211, + "learning_rate": 2.9751292489311783e-05, + "loss": 0.2373, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 10.34076976776123, + "learning_rate": 2.9276032225904242e-05, + "loss": 0.4372, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": Infinity, + "learning_rate": 2.8895824015178215e-05, + "loss": 0.2174, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 1.866976022720337, + "learning_rate": 2.8420563751770678e-05, + "loss": 0.2674, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 12.544903755187988, + "learning_rate": 2.794530348836314e-05, + "loss": 0.3761, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7500000000000001, + "eval_loss": 0.9909818172454834, + "eval_runtime": 1.3645, + "eval_samples_per_second": 46.904, + "eval_steps_per_second": 5.863, + "step": 192 + }, + { + "epoch": 4.06, + "grad_norm": 0.7645444869995117, + "learning_rate": 2.7470043224955607e-05, + "loss": 0.2581, + "step": 195 + }, + { + "epoch": 4.17, + "grad_norm": 3.4740447998046875, + "learning_rate": 2.699478296154807e-05, + "loss": 0.3123, + "step": 200 + }, + { + "epoch": 4.27, + "grad_norm": 6.953164100646973, + "learning_rate": 2.6519522698140533e-05, + "loss": 0.2039, + "step": 205 + }, + { + "epoch": 4.38, + "grad_norm": 7.376092433929443, + "learning_rate": 2.6044262434732996e-05, + "loss": 0.1341, + "step": 210 + }, + { + "epoch": 4.48, + "grad_norm": 1.5489851236343384, + "learning_rate": 2.5569002171325462e-05, + "loss": 0.1285, + "step": 215 + }, + { + "epoch": 4.58, + "grad_norm": 13.857163429260254, + "learning_rate": 2.5093741907917925e-05, + "loss": 0.0793, + "step": 220 + }, + { + "epoch": 4.69, + "grad_norm": 2.46573543548584, + "learning_rate": 2.4618481644510388e-05, + "loss": 0.1254, + "step": 225 + }, + { + "epoch": 4.79, + "grad_norm": 0.2868492305278778, + "learning_rate": 2.4143221381102847e-05, + "loss": 0.205, + "step": 230 + }, + { + "epoch": 4.9, + "grad_norm": 1.8789399862289429, + "learning_rate": 2.376301317037682e-05, + "loss": 0.1137, + "step": 235 + }, + { + "epoch": 5.0, + "grad_norm": 0.7913978099822998, + "learning_rate": 2.3287752906969286e-05, + "loss": 0.1204, + "step": 240 + }, + { + "epoch": 5.0, + "eval_f1": 0.7586206896551725, + "eval_loss": 1.0789520740509033, + "eval_runtime": 1.3527, + "eval_samples_per_second": 47.312, + "eval_steps_per_second": 5.914, + "step": 240 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 3758976149440320.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 4.1062486758411146e-05, + "per_device_train_batch_size": 4 + } +} diff --git a/run-11/checkpoint-240/training_args.bin b/run-11/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eee85774a254ee9284e54bbcaa537c00a3ec285b --- /dev/null +++ b/run-11/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711d11e21d598840c9a2dcfe02c910b7483cbe50927c465ca1b21773cad22965 +size 4920 diff --git a/run-11/checkpoint-288/config.json b/run-11/checkpoint-288/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-11/checkpoint-288/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-11/checkpoint-288/model.safetensors b/run-11/checkpoint-288/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f304f8df4e78e1fa206dc7c96157d1a58e5e209c --- /dev/null +++ b/run-11/checkpoint-288/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd8b631ebc23024d612e2348bddea4c8539518f42cecd445a90bdb394e2174fc +size 94763496 diff --git a/run-11/checkpoint-288/optimizer.pt b/run-11/checkpoint-288/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1aad1e6b3ecfd95ed2519fae012b085f83403506 --- /dev/null +++ b/run-11/checkpoint-288/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dec2f38c4ba4de6611b6f6ec2b125aab74557f87c69b9e32301959630715e9d7 +size 189552570 diff --git a/run-11/checkpoint-288/preprocessor_config.json b/run-11/checkpoint-288/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-11/checkpoint-288/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-11/checkpoint-288/rng_state.pth b/run-11/checkpoint-288/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..de638b27cad47a23bcba70558a870ddb08a0f7e8 --- /dev/null +++ b/run-11/checkpoint-288/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9903236b654011babeaee26ea70e1c6278fa670549b900c6df1d64732428a642 +size 14244 diff --git a/run-11/checkpoint-288/scheduler.pt b/run-11/checkpoint-288/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..578beff4c4152bdd0dc0eb014035b71feb01e434 --- /dev/null +++ b/run-11/checkpoint-288/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1faab9e360596e74a92e5b31354317b717732da9902f9100eefd9cece5fe0574 +size 1064 diff --git a/run-11/checkpoint-288/trainer_state.json b/run-11/checkpoint-288/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fd886aa648abf610c91a2ae9e4c49f0160f0237c --- /dev/null +++ b/run-11/checkpoint-288/trainer_state.json @@ -0,0 +1,477 @@ +{ + "best_metric": 0.7741935483870968, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-11/checkpoint-288", + "epoch": 6.0, + "eval_steps": 500, + "global_step": 288, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2641795873641968, + "learning_rate": 4.277342370667828e-06, + "loss": 0.702, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.1789261102676392, + "learning_rate": 8.554684741335655e-06, + "loss": 0.6864, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.3848576545715332, + "learning_rate": 1.2832027112003484e-05, + "loss": 0.6855, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 0.953545093536377, + "learning_rate": 1.710936948267131e-05, + "loss": 0.6668, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 0.9704298377037048, + "learning_rate": 2.138671185333914e-05, + "loss": 0.6426, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 0.7949383854866028, + "learning_rate": 2.5664054224006968e-05, + "loss": 0.6293, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 2.390880823135376, + "learning_rate": 2.994139659467479e-05, + "loss": 0.5631, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 1.7147374153137207, + "learning_rate": 3.421873896534262e-05, + "loss": 0.6562, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 3.123836040496826, + "learning_rate": 3.849608133601045e-05, + "loss": 0.6243, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7374954223632812, + "eval_runtime": 1.3701, + "eval_samples_per_second": 46.713, + "eval_steps_per_second": 5.839, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 4.445352077484131, + "learning_rate": 4.0872382653048134e-05, + "loss": 0.6965, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 0.8689994215965271, + "learning_rate": 4.03971223896406e-05, + "loss": 0.6945, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 1.744273066520691, + "learning_rate": 3.992186212623306e-05, + "loss": 0.4097, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 0.6568699479103088, + "learning_rate": 3.944660186282552e-05, + "loss": 0.5621, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 1.516032099723816, + "learning_rate": 3.8971341599417986e-05, + "loss": 0.4826, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": Infinity, + "learning_rate": 3.8591133388691955e-05, + "loss": 0.7666, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 1.788261890411377, + "learning_rate": 3.811587312528442e-05, + "loss": 0.6187, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 1.7383731603622437, + "learning_rate": 3.764061286187688e-05, + "loss": 0.5974, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 1.9014606475830078, + "learning_rate": 3.7165352598469344e-05, + "loss": 0.5979, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 2.1836445331573486, + "learning_rate": 3.669009233506181e-05, + "loss": 0.6104, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6798496246337891, + "eval_runtime": 1.3616, + "eval_samples_per_second": 47.003, + "eval_steps_per_second": 5.875, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 1.0497145652770996, + "learning_rate": 3.6214832071654276e-05, + "loss": 0.5777, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.1734980344772339, + "learning_rate": 3.573957180824674e-05, + "loss": 0.4793, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 1.6985560655593872, + "learning_rate": 3.52643115448392e-05, + "loss": 0.5708, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.110007882118225, + "learning_rate": 3.4789051281431665e-05, + "loss": 0.5446, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": Infinity, + "learning_rate": 3.4408843070705634e-05, + "loss": 0.5224, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 4.383241653442383, + "learning_rate": 3.39335828072981e-05, + "loss": 0.4492, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 20.106430053710938, + "learning_rate": 3.355337459657207e-05, + "loss": 0.6228, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 3.188138008117676, + "learning_rate": 3.3078114333164536e-05, + "loss": 0.3671, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 3.840233087539673, + "learning_rate": 3.2602854069757e-05, + "loss": 0.3217, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7474747474747475, + "eval_loss": 0.8114051818847656, + "eval_runtime": 1.4005, + "eval_samples_per_second": 45.699, + "eval_steps_per_second": 5.712, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 2.789161205291748, + "learning_rate": 3.212759380634946e-05, + "loss": 0.4985, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 20.845867156982422, + "learning_rate": 3.1652333542941924e-05, + "loss": 0.3366, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 3.9109976291656494, + "learning_rate": 3.117707327953439e-05, + "loss": 0.2834, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 5.4618916511535645, + "learning_rate": 3.070181301612686e-05, + "loss": 0.4844, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 4.305176258087158, + "learning_rate": 3.0226552752719317e-05, + "loss": 0.2045, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 15.64925765991211, + "learning_rate": 2.9751292489311783e-05, + "loss": 0.2373, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 10.34076976776123, + "learning_rate": 2.9276032225904242e-05, + "loss": 0.4372, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": Infinity, + "learning_rate": 2.8895824015178215e-05, + "loss": 0.2174, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 1.866976022720337, + "learning_rate": 2.8420563751770678e-05, + "loss": 0.2674, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 12.544903755187988, + "learning_rate": 2.794530348836314e-05, + "loss": 0.3761, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7500000000000001, + "eval_loss": 0.9909818172454834, + "eval_runtime": 1.3645, + "eval_samples_per_second": 46.904, + "eval_steps_per_second": 5.863, + "step": 192 + }, + { + "epoch": 4.06, + "grad_norm": 0.7645444869995117, + "learning_rate": 2.7470043224955607e-05, + "loss": 0.2581, + "step": 195 + }, + { + "epoch": 4.17, + "grad_norm": 3.4740447998046875, + "learning_rate": 2.699478296154807e-05, + "loss": 0.3123, + "step": 200 + }, + { + "epoch": 4.27, + "grad_norm": 6.953164100646973, + "learning_rate": 2.6519522698140533e-05, + "loss": 0.2039, + "step": 205 + }, + { + "epoch": 4.38, + "grad_norm": 7.376092433929443, + "learning_rate": 2.6044262434732996e-05, + "loss": 0.1341, + "step": 210 + }, + { + "epoch": 4.48, + "grad_norm": 1.5489851236343384, + "learning_rate": 2.5569002171325462e-05, + "loss": 0.1285, + "step": 215 + }, + { + "epoch": 4.58, + "grad_norm": 13.857163429260254, + "learning_rate": 2.5093741907917925e-05, + "loss": 0.0793, + "step": 220 + }, + { + "epoch": 4.69, + "grad_norm": 2.46573543548584, + "learning_rate": 2.4618481644510388e-05, + "loss": 0.1254, + "step": 225 + }, + { + "epoch": 4.79, + "grad_norm": 0.2868492305278778, + "learning_rate": 2.4143221381102847e-05, + "loss": 0.205, + "step": 230 + }, + { + "epoch": 4.9, + "grad_norm": 1.8789399862289429, + "learning_rate": 2.376301317037682e-05, + "loss": 0.1137, + "step": 235 + }, + { + "epoch": 5.0, + "grad_norm": 0.7913978099822998, + "learning_rate": 2.3287752906969286e-05, + "loss": 0.1204, + "step": 240 + }, + { + "epoch": 5.0, + "eval_f1": 0.7586206896551725, + "eval_loss": 1.0789520740509033, + "eval_runtime": 1.3527, + "eval_samples_per_second": 47.312, + "eval_steps_per_second": 5.914, + "step": 240 + }, + { + "epoch": 5.1, + "grad_norm": 0.31450164318084717, + "learning_rate": 2.281249264356175e-05, + "loss": 0.0266, + "step": 245 + }, + { + "epoch": 5.21, + "grad_norm": 1.4393788576126099, + "learning_rate": 2.2337232380154212e-05, + "loss": 0.0256, + "step": 250 + }, + { + "epoch": 5.31, + "grad_norm": 1.2491079568862915, + "learning_rate": 2.1861972116746675e-05, + "loss": 0.0326, + "step": 255 + }, + { + "epoch": 5.42, + "grad_norm": 8.679621696472168, + "learning_rate": 2.138671185333914e-05, + "loss": 0.0274, + "step": 260 + }, + { + "epoch": 5.52, + "grad_norm": 0.2720549404621124, + "learning_rate": 2.0911451589931604e-05, + "loss": 0.0474, + "step": 265 + }, + { + "epoch": 5.62, + "grad_norm": 0.5277475714683533, + "learning_rate": 2.0436191326524067e-05, + "loss": 0.0339, + "step": 270 + }, + { + "epoch": 5.73, + "grad_norm": 19.154306411743164, + "learning_rate": 1.996093106311653e-05, + "loss": 0.0294, + "step": 275 + }, + { + "epoch": 5.83, + "grad_norm": 0.20597286522388458, + "learning_rate": 1.9485670799708993e-05, + "loss": 0.2482, + "step": 280 + }, + { + "epoch": 5.94, + "grad_norm": 0.9011418223381042, + "learning_rate": 1.9010410536301456e-05, + "loss": 0.0123, + "step": 285 + }, + { + "epoch": 6.0, + "eval_f1": 0.7741935483870968, + "eval_loss": 1.4787760972976685, + "eval_runtime": 1.3653, + "eval_samples_per_second": 46.875, + "eval_steps_per_second": 5.859, + "step": 288 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 4461152812285392.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 4.1062486758411146e-05, + "per_device_train_batch_size": 4 + } +} diff --git a/run-11/checkpoint-288/training_args.bin b/run-11/checkpoint-288/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eee85774a254ee9284e54bbcaa537c00a3ec285b --- /dev/null +++ b/run-11/checkpoint-288/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711d11e21d598840c9a2dcfe02c910b7483cbe50927c465ca1b21773cad22965 +size 4920 diff --git a/run-11/checkpoint-336/config.json b/run-11/checkpoint-336/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-11/checkpoint-336/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-11/checkpoint-336/model.safetensors b/run-11/checkpoint-336/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2762aa31504401b0864ddf0fc29b56acf5d6ab4f --- /dev/null +++ b/run-11/checkpoint-336/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2f42dc5d50d2cdeb1ea19a246babea8d5ff81125ef576e6bdd53c43b11c8b3f +size 94763496 diff --git a/run-11/checkpoint-336/optimizer.pt b/run-11/checkpoint-336/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7bdc3dbc4819e4ca8afa38579f32019711e1c99 --- /dev/null +++ b/run-11/checkpoint-336/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7794020661bfdb798abf18876e17edc51a8d3ac3ca22879478ca9f6527cd20f3 +size 189552570 diff --git a/run-11/checkpoint-336/preprocessor_config.json b/run-11/checkpoint-336/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-11/checkpoint-336/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-11/checkpoint-336/rng_state.pth b/run-11/checkpoint-336/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6d09c593f640bea3364e14655046e2da93b3ebc1 --- /dev/null +++ b/run-11/checkpoint-336/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d12884ae20f0c926a355fda8650edc055a398d4c7c42545ccdb7d60bd202452 +size 14244 diff --git a/run-11/checkpoint-336/scheduler.pt b/run-11/checkpoint-336/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bc8e0ebf4cb569613cdb689af24b876b69b0728 --- /dev/null +++ b/run-11/checkpoint-336/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b319c5cef623e91959de09eed13e2f88010ede6a5636235e70495c0d5e1ca8e3 +size 1064 diff --git a/run-11/checkpoint-336/trainer_state.json b/run-11/checkpoint-336/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a7ec9d04af13f572c5376a945b0d0d058526782a --- /dev/null +++ b/run-11/checkpoint-336/trainer_state.json @@ -0,0 +1,556 @@ +{ + "best_metric": 0.7741935483870968, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-11/checkpoint-288", + "epoch": 7.0, + "eval_steps": 500, + "global_step": 336, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2641795873641968, + "learning_rate": 4.277342370667828e-06, + "loss": 0.702, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.1789261102676392, + "learning_rate": 8.554684741335655e-06, + "loss": 0.6864, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.3848576545715332, + "learning_rate": 1.2832027112003484e-05, + "loss": 0.6855, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 0.953545093536377, + "learning_rate": 1.710936948267131e-05, + "loss": 0.6668, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 0.9704298377037048, + "learning_rate": 2.138671185333914e-05, + "loss": 0.6426, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 0.7949383854866028, + "learning_rate": 2.5664054224006968e-05, + "loss": 0.6293, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 2.390880823135376, + "learning_rate": 2.994139659467479e-05, + "loss": 0.5631, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 1.7147374153137207, + "learning_rate": 3.421873896534262e-05, + "loss": 0.6562, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 3.123836040496826, + "learning_rate": 3.849608133601045e-05, + "loss": 0.6243, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7374954223632812, + "eval_runtime": 1.3701, + "eval_samples_per_second": 46.713, + "eval_steps_per_second": 5.839, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 4.445352077484131, + "learning_rate": 4.0872382653048134e-05, + "loss": 0.6965, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 0.8689994215965271, + "learning_rate": 4.03971223896406e-05, + "loss": 0.6945, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 1.744273066520691, + "learning_rate": 3.992186212623306e-05, + "loss": 0.4097, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 0.6568699479103088, + "learning_rate": 3.944660186282552e-05, + "loss": 0.5621, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 1.516032099723816, + "learning_rate": 3.8971341599417986e-05, + "loss": 0.4826, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": Infinity, + "learning_rate": 3.8591133388691955e-05, + "loss": 0.7666, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 1.788261890411377, + "learning_rate": 3.811587312528442e-05, + "loss": 0.6187, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 1.7383731603622437, + "learning_rate": 3.764061286187688e-05, + "loss": 0.5974, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 1.9014606475830078, + "learning_rate": 3.7165352598469344e-05, + "loss": 0.5979, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 2.1836445331573486, + "learning_rate": 3.669009233506181e-05, + "loss": 0.6104, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6798496246337891, + "eval_runtime": 1.3616, + "eval_samples_per_second": 47.003, + "eval_steps_per_second": 5.875, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 1.0497145652770996, + "learning_rate": 3.6214832071654276e-05, + "loss": 0.5777, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.1734980344772339, + "learning_rate": 3.573957180824674e-05, + "loss": 0.4793, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 1.6985560655593872, + "learning_rate": 3.52643115448392e-05, + "loss": 0.5708, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.110007882118225, + "learning_rate": 3.4789051281431665e-05, + "loss": 0.5446, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": Infinity, + "learning_rate": 3.4408843070705634e-05, + "loss": 0.5224, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 4.383241653442383, + "learning_rate": 3.39335828072981e-05, + "loss": 0.4492, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 20.106430053710938, + "learning_rate": 3.355337459657207e-05, + "loss": 0.6228, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 3.188138008117676, + "learning_rate": 3.3078114333164536e-05, + "loss": 0.3671, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 3.840233087539673, + "learning_rate": 3.2602854069757e-05, + "loss": 0.3217, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7474747474747475, + "eval_loss": 0.8114051818847656, + "eval_runtime": 1.4005, + "eval_samples_per_second": 45.699, + "eval_steps_per_second": 5.712, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 2.789161205291748, + "learning_rate": 3.212759380634946e-05, + "loss": 0.4985, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 20.845867156982422, + "learning_rate": 3.1652333542941924e-05, + "loss": 0.3366, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 3.9109976291656494, + "learning_rate": 3.117707327953439e-05, + "loss": 0.2834, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 5.4618916511535645, + "learning_rate": 3.070181301612686e-05, + "loss": 0.4844, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 4.305176258087158, + "learning_rate": 3.0226552752719317e-05, + "loss": 0.2045, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 15.64925765991211, + "learning_rate": 2.9751292489311783e-05, + "loss": 0.2373, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 10.34076976776123, + "learning_rate": 2.9276032225904242e-05, + "loss": 0.4372, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": Infinity, + "learning_rate": 2.8895824015178215e-05, + "loss": 0.2174, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 1.866976022720337, + "learning_rate": 2.8420563751770678e-05, + "loss": 0.2674, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 12.544903755187988, + "learning_rate": 2.794530348836314e-05, + "loss": 0.3761, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7500000000000001, + "eval_loss": 0.9909818172454834, + "eval_runtime": 1.3645, + "eval_samples_per_second": 46.904, + "eval_steps_per_second": 5.863, + "step": 192 + }, + { + "epoch": 4.06, + "grad_norm": 0.7645444869995117, + "learning_rate": 2.7470043224955607e-05, + "loss": 0.2581, + "step": 195 + }, + { + "epoch": 4.17, + "grad_norm": 3.4740447998046875, + "learning_rate": 2.699478296154807e-05, + "loss": 0.3123, + "step": 200 + }, + { + "epoch": 4.27, + "grad_norm": 6.953164100646973, + "learning_rate": 2.6519522698140533e-05, + "loss": 0.2039, + "step": 205 + }, + { + "epoch": 4.38, + "grad_norm": 7.376092433929443, + "learning_rate": 2.6044262434732996e-05, + "loss": 0.1341, + "step": 210 + }, + { + "epoch": 4.48, + "grad_norm": 1.5489851236343384, + "learning_rate": 2.5569002171325462e-05, + "loss": 0.1285, + "step": 215 + }, + { + "epoch": 4.58, + "grad_norm": 13.857163429260254, + "learning_rate": 2.5093741907917925e-05, + "loss": 0.0793, + "step": 220 + }, + { + "epoch": 4.69, + "grad_norm": 2.46573543548584, + "learning_rate": 2.4618481644510388e-05, + "loss": 0.1254, + "step": 225 + }, + { + "epoch": 4.79, + "grad_norm": 0.2868492305278778, + "learning_rate": 2.4143221381102847e-05, + "loss": 0.205, + "step": 230 + }, + { + "epoch": 4.9, + "grad_norm": 1.8789399862289429, + "learning_rate": 2.376301317037682e-05, + "loss": 0.1137, + "step": 235 + }, + { + "epoch": 5.0, + "grad_norm": 0.7913978099822998, + "learning_rate": 2.3287752906969286e-05, + "loss": 0.1204, + "step": 240 + }, + { + "epoch": 5.0, + "eval_f1": 0.7586206896551725, + "eval_loss": 1.0789520740509033, + "eval_runtime": 1.3527, + "eval_samples_per_second": 47.312, + "eval_steps_per_second": 5.914, + "step": 240 + }, + { + "epoch": 5.1, + "grad_norm": 0.31450164318084717, + "learning_rate": 2.281249264356175e-05, + "loss": 0.0266, + "step": 245 + }, + { + "epoch": 5.21, + "grad_norm": 1.4393788576126099, + "learning_rate": 2.2337232380154212e-05, + "loss": 0.0256, + "step": 250 + }, + { + "epoch": 5.31, + "grad_norm": 1.2491079568862915, + "learning_rate": 2.1861972116746675e-05, + "loss": 0.0326, + "step": 255 + }, + { + "epoch": 5.42, + "grad_norm": 8.679621696472168, + "learning_rate": 2.138671185333914e-05, + "loss": 0.0274, + "step": 260 + }, + { + "epoch": 5.52, + "grad_norm": 0.2720549404621124, + "learning_rate": 2.0911451589931604e-05, + "loss": 0.0474, + "step": 265 + }, + { + "epoch": 5.62, + "grad_norm": 0.5277475714683533, + "learning_rate": 2.0436191326524067e-05, + "loss": 0.0339, + "step": 270 + }, + { + "epoch": 5.73, + "grad_norm": 19.154306411743164, + "learning_rate": 1.996093106311653e-05, + "loss": 0.0294, + "step": 275 + }, + { + "epoch": 5.83, + "grad_norm": 0.20597286522388458, + "learning_rate": 1.9485670799708993e-05, + "loss": 0.2482, + "step": 280 + }, + { + "epoch": 5.94, + "grad_norm": 0.9011418223381042, + "learning_rate": 1.9010410536301456e-05, + "loss": 0.0123, + "step": 285 + }, + { + "epoch": 6.0, + "eval_f1": 0.7741935483870968, + "eval_loss": 1.4787760972976685, + "eval_runtime": 1.3653, + "eval_samples_per_second": 46.875, + "eval_steps_per_second": 5.859, + "step": 288 + }, + { + "epoch": 6.04, + "grad_norm": 0.22904255986213684, + "learning_rate": 1.853515027289392e-05, + "loss": 0.224, + "step": 290 + }, + { + "epoch": 6.15, + "grad_norm": 0.2534620463848114, + "learning_rate": 1.8059890009486385e-05, + "loss": 0.0117, + "step": 295 + }, + { + "epoch": 6.25, + "grad_norm": 0.1455022692680359, + "learning_rate": 1.7584629746078848e-05, + "loss": 0.012, + "step": 300 + }, + { + "epoch": 6.35, + "grad_norm": 7.899118423461914, + "learning_rate": 1.710936948267131e-05, + "loss": 0.0205, + "step": 305 + }, + { + "epoch": 6.46, + "grad_norm": 0.15333446860313416, + "learning_rate": 1.6634109219263774e-05, + "loss": 0.0124, + "step": 310 + }, + { + "epoch": 6.56, + "grad_norm": 2.1998233795166016, + "learning_rate": 1.615884895585624e-05, + "loss": 0.0127, + "step": 315 + }, + { + "epoch": 6.67, + "grad_norm": 4.678240776062012, + "learning_rate": 1.56835886924487e-05, + "loss": 0.0188, + "step": 320 + }, + { + "epoch": 6.77, + "grad_norm": 6.431846618652344, + "learning_rate": 1.5208328429041164e-05, + "loss": 0.0178, + "step": 325 + }, + { + "epoch": 6.88, + "grad_norm": 3.7818799018859863, + "learning_rate": 1.4733068165633629e-05, + "loss": 0.0091, + "step": 330 + }, + { + "epoch": 6.98, + "grad_norm": 0.11249573528766632, + "learning_rate": 1.4257807902226092e-05, + "loss": 0.2635, + "step": 335 + }, + { + "epoch": 7.0, + "eval_f1": 0.7500000000000001, + "eval_loss": 1.9278812408447266, + "eval_runtime": 1.3857, + "eval_samples_per_second": 46.187, + "eval_steps_per_second": 5.773, + "step": 336 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 5221000903695312.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 4.1062486758411146e-05, + "per_device_train_batch_size": 4 + } +} diff --git a/run-11/checkpoint-336/training_args.bin b/run-11/checkpoint-336/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eee85774a254ee9284e54bbcaa537c00a3ec285b --- /dev/null +++ b/run-11/checkpoint-336/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711d11e21d598840c9a2dcfe02c910b7483cbe50927c465ca1b21773cad22965 +size 4920 diff --git a/run-11/checkpoint-384/config.json b/run-11/checkpoint-384/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-11/checkpoint-384/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-11/checkpoint-384/model.safetensors b/run-11/checkpoint-384/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..94526d5d73dc70f1711edbc83f0f8e75d439cbbf --- /dev/null +++ b/run-11/checkpoint-384/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35111b25ffe91a7f9b131d0cbb0cdc07f7a6eb629ef098c529f96a6de71f99d7 +size 94763496 diff --git a/run-11/checkpoint-384/optimizer.pt b/run-11/checkpoint-384/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c91195b6e9795f8acb58e0c2c5a834be333c14b --- /dev/null +++ b/run-11/checkpoint-384/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19d1dc56abf506d5941575a1db6161bea8b01c005fd8ab4342fa3fac5909ede1 +size 189552570 diff --git a/run-11/checkpoint-384/preprocessor_config.json b/run-11/checkpoint-384/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-11/checkpoint-384/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-11/checkpoint-384/rng_state.pth b/run-11/checkpoint-384/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1a6bd57b8c17138ed3367d5ca6692d78d760bd47 --- /dev/null +++ b/run-11/checkpoint-384/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5326b9611b4fb9dc5dc0b29580e7e48abf50913e44071592799c052bebfbacd7 +size 14244 diff --git a/run-11/checkpoint-384/scheduler.pt b/run-11/checkpoint-384/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..88540abf0c5456fdb48d666c1ed1fc617d0c6395 --- /dev/null +++ b/run-11/checkpoint-384/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa169e207b2d3eb331f41fe92bab205efce79050a70cb02d19ba92ca4af2a686 +size 1064 diff --git a/run-11/checkpoint-384/trainer_state.json b/run-11/checkpoint-384/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..49563c4f981122c878d6560d3dd47e0700ed5e66 --- /dev/null +++ b/run-11/checkpoint-384/trainer_state.json @@ -0,0 +1,628 @@ +{ + "best_metric": 0.7912087912087912, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-11/checkpoint-384", + "epoch": 8.0, + "eval_steps": 500, + "global_step": 384, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2641795873641968, + "learning_rate": 4.277342370667828e-06, + "loss": 0.702, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.1789261102676392, + "learning_rate": 8.554684741335655e-06, + "loss": 0.6864, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.3848576545715332, + "learning_rate": 1.2832027112003484e-05, + "loss": 0.6855, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 0.953545093536377, + "learning_rate": 1.710936948267131e-05, + "loss": 0.6668, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 0.9704298377037048, + "learning_rate": 2.138671185333914e-05, + "loss": 0.6426, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 0.7949383854866028, + "learning_rate": 2.5664054224006968e-05, + "loss": 0.6293, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 2.390880823135376, + "learning_rate": 2.994139659467479e-05, + "loss": 0.5631, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 1.7147374153137207, + "learning_rate": 3.421873896534262e-05, + "loss": 0.6562, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 3.123836040496826, + "learning_rate": 3.849608133601045e-05, + "loss": 0.6243, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7374954223632812, + "eval_runtime": 1.3701, + "eval_samples_per_second": 46.713, + "eval_steps_per_second": 5.839, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 4.445352077484131, + "learning_rate": 4.0872382653048134e-05, + "loss": 0.6965, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 0.8689994215965271, + "learning_rate": 4.03971223896406e-05, + "loss": 0.6945, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 1.744273066520691, + "learning_rate": 3.992186212623306e-05, + "loss": 0.4097, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 0.6568699479103088, + "learning_rate": 3.944660186282552e-05, + "loss": 0.5621, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 1.516032099723816, + "learning_rate": 3.8971341599417986e-05, + "loss": 0.4826, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": Infinity, + "learning_rate": 3.8591133388691955e-05, + "loss": 0.7666, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 1.788261890411377, + "learning_rate": 3.811587312528442e-05, + "loss": 0.6187, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 1.7383731603622437, + "learning_rate": 3.764061286187688e-05, + "loss": 0.5974, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 1.9014606475830078, + "learning_rate": 3.7165352598469344e-05, + "loss": 0.5979, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 2.1836445331573486, + "learning_rate": 3.669009233506181e-05, + "loss": 0.6104, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6798496246337891, + "eval_runtime": 1.3616, + "eval_samples_per_second": 47.003, + "eval_steps_per_second": 5.875, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 1.0497145652770996, + "learning_rate": 3.6214832071654276e-05, + "loss": 0.5777, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.1734980344772339, + "learning_rate": 3.573957180824674e-05, + "loss": 0.4793, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 1.6985560655593872, + "learning_rate": 3.52643115448392e-05, + "loss": 0.5708, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.110007882118225, + "learning_rate": 3.4789051281431665e-05, + "loss": 0.5446, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": Infinity, + "learning_rate": 3.4408843070705634e-05, + "loss": 0.5224, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 4.383241653442383, + "learning_rate": 3.39335828072981e-05, + "loss": 0.4492, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 20.106430053710938, + "learning_rate": 3.355337459657207e-05, + "loss": 0.6228, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 3.188138008117676, + "learning_rate": 3.3078114333164536e-05, + "loss": 0.3671, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 3.840233087539673, + "learning_rate": 3.2602854069757e-05, + "loss": 0.3217, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7474747474747475, + "eval_loss": 0.8114051818847656, + "eval_runtime": 1.4005, + "eval_samples_per_second": 45.699, + "eval_steps_per_second": 5.712, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 2.789161205291748, + "learning_rate": 3.212759380634946e-05, + "loss": 0.4985, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 20.845867156982422, + "learning_rate": 3.1652333542941924e-05, + "loss": 0.3366, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 3.9109976291656494, + "learning_rate": 3.117707327953439e-05, + "loss": 0.2834, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 5.4618916511535645, + "learning_rate": 3.070181301612686e-05, + "loss": 0.4844, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 4.305176258087158, + "learning_rate": 3.0226552752719317e-05, + "loss": 0.2045, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 15.64925765991211, + "learning_rate": 2.9751292489311783e-05, + "loss": 0.2373, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 10.34076976776123, + "learning_rate": 2.9276032225904242e-05, + "loss": 0.4372, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": Infinity, + "learning_rate": 2.8895824015178215e-05, + "loss": 0.2174, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 1.866976022720337, + "learning_rate": 2.8420563751770678e-05, + "loss": 0.2674, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 12.544903755187988, + "learning_rate": 2.794530348836314e-05, + "loss": 0.3761, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7500000000000001, + "eval_loss": 0.9909818172454834, + "eval_runtime": 1.3645, + "eval_samples_per_second": 46.904, + "eval_steps_per_second": 5.863, + "step": 192 + }, + { + "epoch": 4.06, + "grad_norm": 0.7645444869995117, + "learning_rate": 2.7470043224955607e-05, + "loss": 0.2581, + "step": 195 + }, + { + "epoch": 4.17, + "grad_norm": 3.4740447998046875, + "learning_rate": 2.699478296154807e-05, + "loss": 0.3123, + "step": 200 + }, + { + "epoch": 4.27, + "grad_norm": 6.953164100646973, + "learning_rate": 2.6519522698140533e-05, + "loss": 0.2039, + "step": 205 + }, + { + "epoch": 4.38, + "grad_norm": 7.376092433929443, + "learning_rate": 2.6044262434732996e-05, + "loss": 0.1341, + "step": 210 + }, + { + "epoch": 4.48, + "grad_norm": 1.5489851236343384, + "learning_rate": 2.5569002171325462e-05, + "loss": 0.1285, + "step": 215 + }, + { + "epoch": 4.58, + "grad_norm": 13.857163429260254, + "learning_rate": 2.5093741907917925e-05, + "loss": 0.0793, + "step": 220 + }, + { + "epoch": 4.69, + "grad_norm": 2.46573543548584, + "learning_rate": 2.4618481644510388e-05, + "loss": 0.1254, + "step": 225 + }, + { + "epoch": 4.79, + "grad_norm": 0.2868492305278778, + "learning_rate": 2.4143221381102847e-05, + "loss": 0.205, + "step": 230 + }, + { + "epoch": 4.9, + "grad_norm": 1.8789399862289429, + "learning_rate": 2.376301317037682e-05, + "loss": 0.1137, + "step": 235 + }, + { + "epoch": 5.0, + "grad_norm": 0.7913978099822998, + "learning_rate": 2.3287752906969286e-05, + "loss": 0.1204, + "step": 240 + }, + { + "epoch": 5.0, + "eval_f1": 0.7586206896551725, + "eval_loss": 1.0789520740509033, + "eval_runtime": 1.3527, + "eval_samples_per_second": 47.312, + "eval_steps_per_second": 5.914, + "step": 240 + }, + { + "epoch": 5.1, + "grad_norm": 0.31450164318084717, + "learning_rate": 2.281249264356175e-05, + "loss": 0.0266, + "step": 245 + }, + { + "epoch": 5.21, + "grad_norm": 1.4393788576126099, + "learning_rate": 2.2337232380154212e-05, + "loss": 0.0256, + "step": 250 + }, + { + "epoch": 5.31, + "grad_norm": 1.2491079568862915, + "learning_rate": 2.1861972116746675e-05, + "loss": 0.0326, + "step": 255 + }, + { + "epoch": 5.42, + "grad_norm": 8.679621696472168, + "learning_rate": 2.138671185333914e-05, + "loss": 0.0274, + "step": 260 + }, + { + "epoch": 5.52, + "grad_norm": 0.2720549404621124, + "learning_rate": 2.0911451589931604e-05, + "loss": 0.0474, + "step": 265 + }, + { + "epoch": 5.62, + "grad_norm": 0.5277475714683533, + "learning_rate": 2.0436191326524067e-05, + "loss": 0.0339, + "step": 270 + }, + { + "epoch": 5.73, + "grad_norm": 19.154306411743164, + "learning_rate": 1.996093106311653e-05, + "loss": 0.0294, + "step": 275 + }, + { + "epoch": 5.83, + "grad_norm": 0.20597286522388458, + "learning_rate": 1.9485670799708993e-05, + "loss": 0.2482, + "step": 280 + }, + { + "epoch": 5.94, + "grad_norm": 0.9011418223381042, + "learning_rate": 1.9010410536301456e-05, + "loss": 0.0123, + "step": 285 + }, + { + "epoch": 6.0, + "eval_f1": 0.7741935483870968, + "eval_loss": 1.4787760972976685, + "eval_runtime": 1.3653, + "eval_samples_per_second": 46.875, + "eval_steps_per_second": 5.859, + "step": 288 + }, + { + "epoch": 6.04, + "grad_norm": 0.22904255986213684, + "learning_rate": 1.853515027289392e-05, + "loss": 0.224, + "step": 290 + }, + { + "epoch": 6.15, + "grad_norm": 0.2534620463848114, + "learning_rate": 1.8059890009486385e-05, + "loss": 0.0117, + "step": 295 + }, + { + "epoch": 6.25, + "grad_norm": 0.1455022692680359, + "learning_rate": 1.7584629746078848e-05, + "loss": 0.012, + "step": 300 + }, + { + "epoch": 6.35, + "grad_norm": 7.899118423461914, + "learning_rate": 1.710936948267131e-05, + "loss": 0.0205, + "step": 305 + }, + { + "epoch": 6.46, + "grad_norm": 0.15333446860313416, + "learning_rate": 1.6634109219263774e-05, + "loss": 0.0124, + "step": 310 + }, + { + "epoch": 6.56, + "grad_norm": 2.1998233795166016, + "learning_rate": 1.615884895585624e-05, + "loss": 0.0127, + "step": 315 + }, + { + "epoch": 6.67, + "grad_norm": 4.678240776062012, + "learning_rate": 1.56835886924487e-05, + "loss": 0.0188, + "step": 320 + }, + { + "epoch": 6.77, + "grad_norm": 6.431846618652344, + "learning_rate": 1.5208328429041164e-05, + "loss": 0.0178, + "step": 325 + }, + { + "epoch": 6.88, + "grad_norm": 3.7818799018859863, + "learning_rate": 1.4733068165633629e-05, + "loss": 0.0091, + "step": 330 + }, + { + "epoch": 6.98, + "grad_norm": 0.11249573528766632, + "learning_rate": 1.4257807902226092e-05, + "loss": 0.2635, + "step": 335 + }, + { + "epoch": 7.0, + "eval_f1": 0.7500000000000001, + "eval_loss": 1.9278812408447266, + "eval_runtime": 1.3857, + "eval_samples_per_second": 46.187, + "eval_steps_per_second": 5.773, + "step": 336 + }, + { + "epoch": 7.08, + "grad_norm": Infinity, + "learning_rate": 1.3877599691500063e-05, + "loss": 0.2288, + "step": 340 + }, + { + "epoch": 7.19, + "grad_norm": 0.09949830174446106, + "learning_rate": 1.3402339428092527e-05, + "loss": 0.0357, + "step": 345 + }, + { + "epoch": 7.29, + "grad_norm": 0.29120293259620667, + "learning_rate": 1.292707916468499e-05, + "loss": 0.0071, + "step": 350 + }, + { + "epoch": 7.4, + "grad_norm": 0.3897027373313904, + "learning_rate": 1.2451818901277455e-05, + "loss": 0.0071, + "step": 355 + }, + { + "epoch": 7.5, + "grad_norm": 0.10337533801794052, + "learning_rate": 1.1976558637869918e-05, + "loss": 0.0076, + "step": 360 + }, + { + "epoch": 7.6, + "grad_norm": 0.14363917708396912, + "learning_rate": 1.1501298374462382e-05, + "loss": 0.0064, + "step": 365 + }, + { + "epoch": 7.71, + "grad_norm": 0.2638760805130005, + "learning_rate": 1.1026038111054845e-05, + "loss": 0.0715, + "step": 370 + }, + { + "epoch": 7.81, + "grad_norm": 0.08668874204158783, + "learning_rate": 1.0550777847647308e-05, + "loss": 0.2678, + "step": 375 + }, + { + "epoch": 7.92, + "grad_norm": 0.11121731251478195, + "learning_rate": 1.0075517584239773e-05, + "loss": 0.0059, + "step": 380 + }, + { + "epoch": 8.0, + "eval_f1": 0.7912087912087912, + "eval_loss": 1.495563268661499, + "eval_runtime": 1.4139, + "eval_samples_per_second": 45.264, + "eval_steps_per_second": 5.658, + "step": 384 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 5913396330334368.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 4.1062486758411146e-05, + "per_device_train_batch_size": 4 + } +} diff --git a/run-11/checkpoint-384/training_args.bin b/run-11/checkpoint-384/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eee85774a254ee9284e54bbcaa537c00a3ec285b --- /dev/null +++ b/run-11/checkpoint-384/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711d11e21d598840c9a2dcfe02c910b7483cbe50927c465ca1b21773cad22965 +size 4920 diff --git a/run-11/checkpoint-432/config.json b/run-11/checkpoint-432/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-11/checkpoint-432/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-11/checkpoint-432/model.safetensors b/run-11/checkpoint-432/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..699b379c8518cb912504b083509841a1b97c4582 --- /dev/null +++ b/run-11/checkpoint-432/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:684d46e7fab8aea36a0ed15cdcbdc17169436f8c84ffa14dfdc6b0e9f46bb8a3 +size 94763496 diff --git a/run-11/checkpoint-432/optimizer.pt b/run-11/checkpoint-432/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c40d9c2e38bd37ae1c8605b6147267dcdb5ef141 --- /dev/null +++ b/run-11/checkpoint-432/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9207201a77de54828f38a99ea07d0faf470c09378a752ad2af0d0ac902b65bf8 +size 189552570 diff --git a/run-11/checkpoint-432/preprocessor_config.json b/run-11/checkpoint-432/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-11/checkpoint-432/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-11/checkpoint-432/rng_state.pth b/run-11/checkpoint-432/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bcd8fc524b275ac25e742ea920b25e885f1074e5 --- /dev/null +++ b/run-11/checkpoint-432/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03d982e5865661a49a8729a4369776f96011f643bb66895e82e3a7651ff4f807 +size 14244 diff --git a/run-11/checkpoint-432/scheduler.pt b/run-11/checkpoint-432/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5d8d453c883051ed1063318a20828262cdef0f0 --- /dev/null +++ b/run-11/checkpoint-432/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d42bf3405dd838c48c84ebf36720a3cab546fc79d90cd7ba5c43a525ed049efb +size 1064 diff --git a/run-11/checkpoint-432/trainer_state.json b/run-11/checkpoint-432/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ea44a745fd22d37c718e9f1f7c2e6abe5b584057 --- /dev/null +++ b/run-11/checkpoint-432/trainer_state.json @@ -0,0 +1,707 @@ +{ + "best_metric": 0.7912087912087912, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-11/checkpoint-384", + "epoch": 9.0, + "eval_steps": 500, + "global_step": 432, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2641795873641968, + "learning_rate": 4.277342370667828e-06, + "loss": 0.702, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.1789261102676392, + "learning_rate": 8.554684741335655e-06, + "loss": 0.6864, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.3848576545715332, + "learning_rate": 1.2832027112003484e-05, + "loss": 0.6855, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 0.953545093536377, + "learning_rate": 1.710936948267131e-05, + "loss": 0.6668, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 0.9704298377037048, + "learning_rate": 2.138671185333914e-05, + "loss": 0.6426, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 0.7949383854866028, + "learning_rate": 2.5664054224006968e-05, + "loss": 0.6293, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 2.390880823135376, + "learning_rate": 2.994139659467479e-05, + "loss": 0.5631, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 1.7147374153137207, + "learning_rate": 3.421873896534262e-05, + "loss": 0.6562, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 3.123836040496826, + "learning_rate": 3.849608133601045e-05, + "loss": 0.6243, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7374954223632812, + "eval_runtime": 1.3701, + "eval_samples_per_second": 46.713, + "eval_steps_per_second": 5.839, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 4.445352077484131, + "learning_rate": 4.0872382653048134e-05, + "loss": 0.6965, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 0.8689994215965271, + "learning_rate": 4.03971223896406e-05, + "loss": 0.6945, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 1.744273066520691, + "learning_rate": 3.992186212623306e-05, + "loss": 0.4097, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 0.6568699479103088, + "learning_rate": 3.944660186282552e-05, + "loss": 0.5621, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 1.516032099723816, + "learning_rate": 3.8971341599417986e-05, + "loss": 0.4826, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": Infinity, + "learning_rate": 3.8591133388691955e-05, + "loss": 0.7666, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 1.788261890411377, + "learning_rate": 3.811587312528442e-05, + "loss": 0.6187, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 1.7383731603622437, + "learning_rate": 3.764061286187688e-05, + "loss": 0.5974, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 1.9014606475830078, + "learning_rate": 3.7165352598469344e-05, + "loss": 0.5979, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 2.1836445331573486, + "learning_rate": 3.669009233506181e-05, + "loss": 0.6104, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6798496246337891, + "eval_runtime": 1.3616, + "eval_samples_per_second": 47.003, + "eval_steps_per_second": 5.875, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 1.0497145652770996, + "learning_rate": 3.6214832071654276e-05, + "loss": 0.5777, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.1734980344772339, + "learning_rate": 3.573957180824674e-05, + "loss": 0.4793, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 1.6985560655593872, + "learning_rate": 3.52643115448392e-05, + "loss": 0.5708, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.110007882118225, + "learning_rate": 3.4789051281431665e-05, + "loss": 0.5446, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": Infinity, + "learning_rate": 3.4408843070705634e-05, + "loss": 0.5224, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 4.383241653442383, + "learning_rate": 3.39335828072981e-05, + "loss": 0.4492, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 20.106430053710938, + "learning_rate": 3.355337459657207e-05, + "loss": 0.6228, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 3.188138008117676, + "learning_rate": 3.3078114333164536e-05, + "loss": 0.3671, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 3.840233087539673, + "learning_rate": 3.2602854069757e-05, + "loss": 0.3217, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7474747474747475, + "eval_loss": 0.8114051818847656, + "eval_runtime": 1.4005, + "eval_samples_per_second": 45.699, + "eval_steps_per_second": 5.712, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 2.789161205291748, + "learning_rate": 3.212759380634946e-05, + "loss": 0.4985, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 20.845867156982422, + "learning_rate": 3.1652333542941924e-05, + "loss": 0.3366, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 3.9109976291656494, + "learning_rate": 3.117707327953439e-05, + "loss": 0.2834, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 5.4618916511535645, + "learning_rate": 3.070181301612686e-05, + "loss": 0.4844, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 4.305176258087158, + "learning_rate": 3.0226552752719317e-05, + "loss": 0.2045, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 15.64925765991211, + "learning_rate": 2.9751292489311783e-05, + "loss": 0.2373, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 10.34076976776123, + "learning_rate": 2.9276032225904242e-05, + "loss": 0.4372, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": Infinity, + "learning_rate": 2.8895824015178215e-05, + "loss": 0.2174, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 1.866976022720337, + "learning_rate": 2.8420563751770678e-05, + "loss": 0.2674, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 12.544903755187988, + "learning_rate": 2.794530348836314e-05, + "loss": 0.3761, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7500000000000001, + "eval_loss": 0.9909818172454834, + "eval_runtime": 1.3645, + "eval_samples_per_second": 46.904, + "eval_steps_per_second": 5.863, + "step": 192 + }, + { + "epoch": 4.06, + "grad_norm": 0.7645444869995117, + "learning_rate": 2.7470043224955607e-05, + "loss": 0.2581, + "step": 195 + }, + { + "epoch": 4.17, + "grad_norm": 3.4740447998046875, + "learning_rate": 2.699478296154807e-05, + "loss": 0.3123, + "step": 200 + }, + { + "epoch": 4.27, + "grad_norm": 6.953164100646973, + "learning_rate": 2.6519522698140533e-05, + "loss": 0.2039, + "step": 205 + }, + { + "epoch": 4.38, + "grad_norm": 7.376092433929443, + "learning_rate": 2.6044262434732996e-05, + "loss": 0.1341, + "step": 210 + }, + { + "epoch": 4.48, + "grad_norm": 1.5489851236343384, + "learning_rate": 2.5569002171325462e-05, + "loss": 0.1285, + "step": 215 + }, + { + "epoch": 4.58, + "grad_norm": 13.857163429260254, + "learning_rate": 2.5093741907917925e-05, + "loss": 0.0793, + "step": 220 + }, + { + "epoch": 4.69, + "grad_norm": 2.46573543548584, + "learning_rate": 2.4618481644510388e-05, + "loss": 0.1254, + "step": 225 + }, + { + "epoch": 4.79, + "grad_norm": 0.2868492305278778, + "learning_rate": 2.4143221381102847e-05, + "loss": 0.205, + "step": 230 + }, + { + "epoch": 4.9, + "grad_norm": 1.8789399862289429, + "learning_rate": 2.376301317037682e-05, + "loss": 0.1137, + "step": 235 + }, + { + "epoch": 5.0, + "grad_norm": 0.7913978099822998, + "learning_rate": 2.3287752906969286e-05, + "loss": 0.1204, + "step": 240 + }, + { + "epoch": 5.0, + "eval_f1": 0.7586206896551725, + "eval_loss": 1.0789520740509033, + "eval_runtime": 1.3527, + "eval_samples_per_second": 47.312, + "eval_steps_per_second": 5.914, + "step": 240 + }, + { + "epoch": 5.1, + "grad_norm": 0.31450164318084717, + "learning_rate": 2.281249264356175e-05, + "loss": 0.0266, + "step": 245 + }, + { + "epoch": 5.21, + "grad_norm": 1.4393788576126099, + "learning_rate": 2.2337232380154212e-05, + "loss": 0.0256, + "step": 250 + }, + { + "epoch": 5.31, + "grad_norm": 1.2491079568862915, + "learning_rate": 2.1861972116746675e-05, + "loss": 0.0326, + "step": 255 + }, + { + "epoch": 5.42, + "grad_norm": 8.679621696472168, + "learning_rate": 2.138671185333914e-05, + "loss": 0.0274, + "step": 260 + }, + { + "epoch": 5.52, + "grad_norm": 0.2720549404621124, + "learning_rate": 2.0911451589931604e-05, + "loss": 0.0474, + "step": 265 + }, + { + "epoch": 5.62, + "grad_norm": 0.5277475714683533, + "learning_rate": 2.0436191326524067e-05, + "loss": 0.0339, + "step": 270 + }, + { + "epoch": 5.73, + "grad_norm": 19.154306411743164, + "learning_rate": 1.996093106311653e-05, + "loss": 0.0294, + "step": 275 + }, + { + "epoch": 5.83, + "grad_norm": 0.20597286522388458, + "learning_rate": 1.9485670799708993e-05, + "loss": 0.2482, + "step": 280 + }, + { + "epoch": 5.94, + "grad_norm": 0.9011418223381042, + "learning_rate": 1.9010410536301456e-05, + "loss": 0.0123, + "step": 285 + }, + { + "epoch": 6.0, + "eval_f1": 0.7741935483870968, + "eval_loss": 1.4787760972976685, + "eval_runtime": 1.3653, + "eval_samples_per_second": 46.875, + "eval_steps_per_second": 5.859, + "step": 288 + }, + { + "epoch": 6.04, + "grad_norm": 0.22904255986213684, + "learning_rate": 1.853515027289392e-05, + "loss": 0.224, + "step": 290 + }, + { + "epoch": 6.15, + "grad_norm": 0.2534620463848114, + "learning_rate": 1.8059890009486385e-05, + "loss": 0.0117, + "step": 295 + }, + { + "epoch": 6.25, + "grad_norm": 0.1455022692680359, + "learning_rate": 1.7584629746078848e-05, + "loss": 0.012, + "step": 300 + }, + { + "epoch": 6.35, + "grad_norm": 7.899118423461914, + "learning_rate": 1.710936948267131e-05, + "loss": 0.0205, + "step": 305 + }, + { + "epoch": 6.46, + "grad_norm": 0.15333446860313416, + "learning_rate": 1.6634109219263774e-05, + "loss": 0.0124, + "step": 310 + }, + { + "epoch": 6.56, + "grad_norm": 2.1998233795166016, + "learning_rate": 1.615884895585624e-05, + "loss": 0.0127, + "step": 315 + }, + { + "epoch": 6.67, + "grad_norm": 4.678240776062012, + "learning_rate": 1.56835886924487e-05, + "loss": 0.0188, + "step": 320 + }, + { + "epoch": 6.77, + "grad_norm": 6.431846618652344, + "learning_rate": 1.5208328429041164e-05, + "loss": 0.0178, + "step": 325 + }, + { + "epoch": 6.88, + "grad_norm": 3.7818799018859863, + "learning_rate": 1.4733068165633629e-05, + "loss": 0.0091, + "step": 330 + }, + { + "epoch": 6.98, + "grad_norm": 0.11249573528766632, + "learning_rate": 1.4257807902226092e-05, + "loss": 0.2635, + "step": 335 + }, + { + "epoch": 7.0, + "eval_f1": 0.7500000000000001, + "eval_loss": 1.9278812408447266, + "eval_runtime": 1.3857, + "eval_samples_per_second": 46.187, + "eval_steps_per_second": 5.773, + "step": 336 + }, + { + "epoch": 7.08, + "grad_norm": Infinity, + "learning_rate": 1.3877599691500063e-05, + "loss": 0.2288, + "step": 340 + }, + { + "epoch": 7.19, + "grad_norm": 0.09949830174446106, + "learning_rate": 1.3402339428092527e-05, + "loss": 0.0357, + "step": 345 + }, + { + "epoch": 7.29, + "grad_norm": 0.29120293259620667, + "learning_rate": 1.292707916468499e-05, + "loss": 0.0071, + "step": 350 + }, + { + "epoch": 7.4, + "grad_norm": 0.3897027373313904, + "learning_rate": 1.2451818901277455e-05, + "loss": 0.0071, + "step": 355 + }, + { + "epoch": 7.5, + "grad_norm": 0.10337533801794052, + "learning_rate": 1.1976558637869918e-05, + "loss": 0.0076, + "step": 360 + }, + { + "epoch": 7.6, + "grad_norm": 0.14363917708396912, + "learning_rate": 1.1501298374462382e-05, + "loss": 0.0064, + "step": 365 + }, + { + "epoch": 7.71, + "grad_norm": 0.2638760805130005, + "learning_rate": 1.1026038111054845e-05, + "loss": 0.0715, + "step": 370 + }, + { + "epoch": 7.81, + "grad_norm": 0.08668874204158783, + "learning_rate": 1.0550777847647308e-05, + "loss": 0.2678, + "step": 375 + }, + { + "epoch": 7.92, + "grad_norm": 0.11121731251478195, + "learning_rate": 1.0075517584239773e-05, + "loss": 0.0059, + "step": 380 + }, + { + "epoch": 8.0, + "eval_f1": 0.7912087912087912, + "eval_loss": 1.495563268661499, + "eval_runtime": 1.4139, + "eval_samples_per_second": 45.264, + "eval_steps_per_second": 5.658, + "step": 384 + }, + { + "epoch": 8.02, + "grad_norm": 0.1238911896944046, + "learning_rate": 9.600257320832236e-06, + "loss": 0.0065, + "step": 385 + }, + { + "epoch": 8.12, + "grad_norm": 0.09688282012939453, + "learning_rate": 9.124997057424699e-06, + "loss": 0.0054, + "step": 390 + }, + { + "epoch": 8.23, + "grad_norm": 0.07482036203145981, + "learning_rate": 8.649736794017163e-06, + "loss": 0.0078, + "step": 395 + }, + { + "epoch": 8.33, + "grad_norm": 0.10461320728063583, + "learning_rate": 8.174476530609626e-06, + "loss": 0.2455, + "step": 400 + }, + { + "epoch": 8.44, + "grad_norm": 0.12439440935850143, + "learning_rate": 7.699216267202089e-06, + "loss": 0.0053, + "step": 405 + }, + { + "epoch": 8.54, + "grad_norm": 0.12789662182331085, + "learning_rate": 7.223956003794554e-06, + "loss": 0.0054, + "step": 410 + }, + { + "epoch": 8.65, + "grad_norm": 0.07380508631467819, + "learning_rate": 6.7486957403870175e-06, + "loss": 0.006, + "step": 415 + }, + { + "epoch": 8.75, + "grad_norm": 0.08458781242370605, + "learning_rate": 6.273435476979481e-06, + "loss": 0.0056, + "step": 420 + }, + { + "epoch": 8.85, + "grad_norm": 0.09543807804584503, + "learning_rate": 5.798175213571944e-06, + "loss": 0.0063, + "step": 425 + }, + { + "epoch": 8.96, + "grad_norm": 0.10101006925106049, + "learning_rate": 5.322914950164408e-06, + "loss": 0.0064, + "step": 430 + }, + { + "epoch": 9.0, + "eval_f1": 0.7586206896551725, + "eval_loss": 1.48129403591156, + "eval_runtime": 1.4314, + "eval_samples_per_second": 44.71, + "eval_steps_per_second": 5.589, + "step": 432 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 6653641014142368.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 4.1062486758411146e-05, + "per_device_train_batch_size": 4 + } +} diff --git a/run-11/checkpoint-432/training_args.bin b/run-11/checkpoint-432/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eee85774a254ee9284e54bbcaa537c00a3ec285b --- /dev/null +++ b/run-11/checkpoint-432/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711d11e21d598840c9a2dcfe02c910b7483cbe50927c465ca1b21773cad22965 +size 4920 diff --git a/run-11/checkpoint-48/config.json b/run-11/checkpoint-48/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-11/checkpoint-48/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-11/checkpoint-48/model.safetensors b/run-11/checkpoint-48/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b43b2e2f0f646c687006b75243af3d256654e2fd --- /dev/null +++ b/run-11/checkpoint-48/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd6251700fe0d2871bca8ab759ea7cb2397e0214e408372c0654f56fe5c5650c +size 94763496 diff --git a/run-11/checkpoint-48/optimizer.pt b/run-11/checkpoint-48/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f24daa329601eaee31bc34d47f95cb78f2ff01ff --- /dev/null +++ b/run-11/checkpoint-48/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:590b295485e45e9b1915da4051653a54a30b9e1dda7560e5f5249799e6839a47 +size 189552570 diff --git a/run-11/checkpoint-48/preprocessor_config.json b/run-11/checkpoint-48/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-11/checkpoint-48/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-11/checkpoint-48/rng_state.pth b/run-11/checkpoint-48/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f69ac2b3cc24a2d23f1e99dfab26d0a1d84a680 --- /dev/null +++ b/run-11/checkpoint-48/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7251f0e64bf9e5675ed89b468a7ff74c1c3fd6457742f84db0e5e361db11f13 +size 14244 diff --git a/run-11/checkpoint-48/scheduler.pt b/run-11/checkpoint-48/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee374bf254d7fc85aa5ec7a36099b8e24d0bc9ab --- /dev/null +++ b/run-11/checkpoint-48/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0cc4e003f4cfb061c3f09ebd5bce49992c7e4b5229df4e901a3820fd76442e +size 1064 diff --git a/run-11/checkpoint-48/trainer_state.json b/run-11/checkpoint-48/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..34ceeb646d3f2c1705488f1286d7496b5cb551bb --- /dev/null +++ b/run-11/checkpoint-48/trainer_state.json @@ -0,0 +1,96 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-11/checkpoint-48", + "epoch": 1.0, + "eval_steps": 500, + "global_step": 48, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2641795873641968, + "learning_rate": 4.277342370667828e-06, + "loss": 0.702, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.1789261102676392, + "learning_rate": 8.554684741335655e-06, + "loss": 0.6864, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.3848576545715332, + "learning_rate": 1.2832027112003484e-05, + "loss": 0.6855, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 0.953545093536377, + "learning_rate": 1.710936948267131e-05, + "loss": 0.6668, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 0.9704298377037048, + "learning_rate": 2.138671185333914e-05, + "loss": 0.6426, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 0.7949383854866028, + "learning_rate": 2.5664054224006968e-05, + "loss": 0.6293, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 2.390880823135376, + "learning_rate": 2.994139659467479e-05, + "loss": 0.5631, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 1.7147374153137207, + "learning_rate": 3.421873896534262e-05, + "loss": 0.6562, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 3.123836040496826, + "learning_rate": 3.849608133601045e-05, + "loss": 0.6243, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7374954223632812, + "eval_runtime": 1.3701, + "eval_samples_per_second": 46.713, + "eval_steps_per_second": 5.839, + "step": 48 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 775299419959728.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 4.1062486758411146e-05, + "per_device_train_batch_size": 4 + } +} diff --git a/run-11/checkpoint-48/training_args.bin b/run-11/checkpoint-48/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eee85774a254ee9284e54bbcaa537c00a3ec285b --- /dev/null +++ b/run-11/checkpoint-48/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711d11e21d598840c9a2dcfe02c910b7483cbe50927c465ca1b21773cad22965 +size 4920 diff --git a/run-11/checkpoint-480/config.json b/run-11/checkpoint-480/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-11/checkpoint-480/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-11/checkpoint-480/model.safetensors b/run-11/checkpoint-480/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a40d4733bec6618e68503231304039a5412f527d --- /dev/null +++ b/run-11/checkpoint-480/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0f6e97c52125261f6ff5fa08f94c7808e7a17a2219e818d47f0bd5dc6fb09c4 +size 94763496 diff --git a/run-11/checkpoint-480/optimizer.pt b/run-11/checkpoint-480/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bba4e618e9df0ebeade24bb22d0afeba4b1b631c --- /dev/null +++ b/run-11/checkpoint-480/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21f010c21bad06e3a15eacdf968656b52ef9eeabcdd45f6f0d2261b6ced00330 +size 189552570 diff --git a/run-11/checkpoint-480/preprocessor_config.json b/run-11/checkpoint-480/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-11/checkpoint-480/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-11/checkpoint-480/rng_state.pth b/run-11/checkpoint-480/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8ce0733889ec145e042b57c109b1b3747a5e4b8 --- /dev/null +++ b/run-11/checkpoint-480/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50237159e3e933ca82ae35db92d0c845d9cc1581f3410598daa2edb356446877 +size 14244 diff --git a/run-11/checkpoint-480/scheduler.pt b/run-11/checkpoint-480/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c6152d072d71d961b68ad25ae13f2ce0ebfb31f --- /dev/null +++ b/run-11/checkpoint-480/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78e0c0e3a6c30e22a3ac24ba4b08a5e0605530d58952bee634a1e55e98bb51d4 +size 1064 diff --git a/run-11/checkpoint-480/trainer_state.json b/run-11/checkpoint-480/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..41b814d20445825117c72e7686c56af054c384ec --- /dev/null +++ b/run-11/checkpoint-480/trainer_state.json @@ -0,0 +1,786 @@ +{ + "best_metric": 0.7912087912087912, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-11/checkpoint-384", + "epoch": 10.0, + "eval_steps": 500, + "global_step": 480, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2641795873641968, + "learning_rate": 4.277342370667828e-06, + "loss": 0.702, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.1789261102676392, + "learning_rate": 8.554684741335655e-06, + "loss": 0.6864, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.3848576545715332, + "learning_rate": 1.2832027112003484e-05, + "loss": 0.6855, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 0.953545093536377, + "learning_rate": 1.710936948267131e-05, + "loss": 0.6668, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 0.9704298377037048, + "learning_rate": 2.138671185333914e-05, + "loss": 0.6426, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 0.7949383854866028, + "learning_rate": 2.5664054224006968e-05, + "loss": 0.6293, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 2.390880823135376, + "learning_rate": 2.994139659467479e-05, + "loss": 0.5631, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 1.7147374153137207, + "learning_rate": 3.421873896534262e-05, + "loss": 0.6562, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 3.123836040496826, + "learning_rate": 3.849608133601045e-05, + "loss": 0.6243, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7374954223632812, + "eval_runtime": 1.3701, + "eval_samples_per_second": 46.713, + "eval_steps_per_second": 5.839, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 4.445352077484131, + "learning_rate": 4.0872382653048134e-05, + "loss": 0.6965, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 0.8689994215965271, + "learning_rate": 4.03971223896406e-05, + "loss": 0.6945, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 1.744273066520691, + "learning_rate": 3.992186212623306e-05, + "loss": 0.4097, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 0.6568699479103088, + "learning_rate": 3.944660186282552e-05, + "loss": 0.5621, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 1.516032099723816, + "learning_rate": 3.8971341599417986e-05, + "loss": 0.4826, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": Infinity, + "learning_rate": 3.8591133388691955e-05, + "loss": 0.7666, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 1.788261890411377, + "learning_rate": 3.811587312528442e-05, + "loss": 0.6187, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 1.7383731603622437, + "learning_rate": 3.764061286187688e-05, + "loss": 0.5974, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 1.9014606475830078, + "learning_rate": 3.7165352598469344e-05, + "loss": 0.5979, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 2.1836445331573486, + "learning_rate": 3.669009233506181e-05, + "loss": 0.6104, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6798496246337891, + "eval_runtime": 1.3616, + "eval_samples_per_second": 47.003, + "eval_steps_per_second": 5.875, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 1.0497145652770996, + "learning_rate": 3.6214832071654276e-05, + "loss": 0.5777, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.1734980344772339, + "learning_rate": 3.573957180824674e-05, + "loss": 0.4793, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 1.6985560655593872, + "learning_rate": 3.52643115448392e-05, + "loss": 0.5708, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.110007882118225, + "learning_rate": 3.4789051281431665e-05, + "loss": 0.5446, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": Infinity, + "learning_rate": 3.4408843070705634e-05, + "loss": 0.5224, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 4.383241653442383, + "learning_rate": 3.39335828072981e-05, + "loss": 0.4492, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 20.106430053710938, + "learning_rate": 3.355337459657207e-05, + "loss": 0.6228, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 3.188138008117676, + "learning_rate": 3.3078114333164536e-05, + "loss": 0.3671, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 3.840233087539673, + "learning_rate": 3.2602854069757e-05, + "loss": 0.3217, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7474747474747475, + "eval_loss": 0.8114051818847656, + "eval_runtime": 1.4005, + "eval_samples_per_second": 45.699, + "eval_steps_per_second": 5.712, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 2.789161205291748, + "learning_rate": 3.212759380634946e-05, + "loss": 0.4985, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 20.845867156982422, + "learning_rate": 3.1652333542941924e-05, + "loss": 0.3366, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 3.9109976291656494, + "learning_rate": 3.117707327953439e-05, + "loss": 0.2834, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 5.4618916511535645, + "learning_rate": 3.070181301612686e-05, + "loss": 0.4844, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 4.305176258087158, + "learning_rate": 3.0226552752719317e-05, + "loss": 0.2045, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 15.64925765991211, + "learning_rate": 2.9751292489311783e-05, + "loss": 0.2373, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 10.34076976776123, + "learning_rate": 2.9276032225904242e-05, + "loss": 0.4372, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": Infinity, + "learning_rate": 2.8895824015178215e-05, + "loss": 0.2174, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 1.866976022720337, + "learning_rate": 2.8420563751770678e-05, + "loss": 0.2674, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 12.544903755187988, + "learning_rate": 2.794530348836314e-05, + "loss": 0.3761, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7500000000000001, + "eval_loss": 0.9909818172454834, + "eval_runtime": 1.3645, + "eval_samples_per_second": 46.904, + "eval_steps_per_second": 5.863, + "step": 192 + }, + { + "epoch": 4.06, + "grad_norm": 0.7645444869995117, + "learning_rate": 2.7470043224955607e-05, + "loss": 0.2581, + "step": 195 + }, + { + "epoch": 4.17, + "grad_norm": 3.4740447998046875, + "learning_rate": 2.699478296154807e-05, + "loss": 0.3123, + "step": 200 + }, + { + "epoch": 4.27, + "grad_norm": 6.953164100646973, + "learning_rate": 2.6519522698140533e-05, + "loss": 0.2039, + "step": 205 + }, + { + "epoch": 4.38, + "grad_norm": 7.376092433929443, + "learning_rate": 2.6044262434732996e-05, + "loss": 0.1341, + "step": 210 + }, + { + "epoch": 4.48, + "grad_norm": 1.5489851236343384, + "learning_rate": 2.5569002171325462e-05, + "loss": 0.1285, + "step": 215 + }, + { + "epoch": 4.58, + "grad_norm": 13.857163429260254, + "learning_rate": 2.5093741907917925e-05, + "loss": 0.0793, + "step": 220 + }, + { + "epoch": 4.69, + "grad_norm": 2.46573543548584, + "learning_rate": 2.4618481644510388e-05, + "loss": 0.1254, + "step": 225 + }, + { + "epoch": 4.79, + "grad_norm": 0.2868492305278778, + "learning_rate": 2.4143221381102847e-05, + "loss": 0.205, + "step": 230 + }, + { + "epoch": 4.9, + "grad_norm": 1.8789399862289429, + "learning_rate": 2.376301317037682e-05, + "loss": 0.1137, + "step": 235 + }, + { + "epoch": 5.0, + "grad_norm": 0.7913978099822998, + "learning_rate": 2.3287752906969286e-05, + "loss": 0.1204, + "step": 240 + }, + { + "epoch": 5.0, + "eval_f1": 0.7586206896551725, + "eval_loss": 1.0789520740509033, + "eval_runtime": 1.3527, + "eval_samples_per_second": 47.312, + "eval_steps_per_second": 5.914, + "step": 240 + }, + { + "epoch": 5.1, + "grad_norm": 0.31450164318084717, + "learning_rate": 2.281249264356175e-05, + "loss": 0.0266, + "step": 245 + }, + { + "epoch": 5.21, + "grad_norm": 1.4393788576126099, + "learning_rate": 2.2337232380154212e-05, + "loss": 0.0256, + "step": 250 + }, + { + "epoch": 5.31, + "grad_norm": 1.2491079568862915, + "learning_rate": 2.1861972116746675e-05, + "loss": 0.0326, + "step": 255 + }, + { + "epoch": 5.42, + "grad_norm": 8.679621696472168, + "learning_rate": 2.138671185333914e-05, + "loss": 0.0274, + "step": 260 + }, + { + "epoch": 5.52, + "grad_norm": 0.2720549404621124, + "learning_rate": 2.0911451589931604e-05, + "loss": 0.0474, + "step": 265 + }, + { + "epoch": 5.62, + "grad_norm": 0.5277475714683533, + "learning_rate": 2.0436191326524067e-05, + "loss": 0.0339, + "step": 270 + }, + { + "epoch": 5.73, + "grad_norm": 19.154306411743164, + "learning_rate": 1.996093106311653e-05, + "loss": 0.0294, + "step": 275 + }, + { + "epoch": 5.83, + "grad_norm": 0.20597286522388458, + "learning_rate": 1.9485670799708993e-05, + "loss": 0.2482, + "step": 280 + }, + { + "epoch": 5.94, + "grad_norm": 0.9011418223381042, + "learning_rate": 1.9010410536301456e-05, + "loss": 0.0123, + "step": 285 + }, + { + "epoch": 6.0, + "eval_f1": 0.7741935483870968, + "eval_loss": 1.4787760972976685, + "eval_runtime": 1.3653, + "eval_samples_per_second": 46.875, + "eval_steps_per_second": 5.859, + "step": 288 + }, + { + "epoch": 6.04, + "grad_norm": 0.22904255986213684, + "learning_rate": 1.853515027289392e-05, + "loss": 0.224, + "step": 290 + }, + { + "epoch": 6.15, + "grad_norm": 0.2534620463848114, + "learning_rate": 1.8059890009486385e-05, + "loss": 0.0117, + "step": 295 + }, + { + "epoch": 6.25, + "grad_norm": 0.1455022692680359, + "learning_rate": 1.7584629746078848e-05, + "loss": 0.012, + "step": 300 + }, + { + "epoch": 6.35, + "grad_norm": 7.899118423461914, + "learning_rate": 1.710936948267131e-05, + "loss": 0.0205, + "step": 305 + }, + { + "epoch": 6.46, + "grad_norm": 0.15333446860313416, + "learning_rate": 1.6634109219263774e-05, + "loss": 0.0124, + "step": 310 + }, + { + "epoch": 6.56, + "grad_norm": 2.1998233795166016, + "learning_rate": 1.615884895585624e-05, + "loss": 0.0127, + "step": 315 + }, + { + "epoch": 6.67, + "grad_norm": 4.678240776062012, + "learning_rate": 1.56835886924487e-05, + "loss": 0.0188, + "step": 320 + }, + { + "epoch": 6.77, + "grad_norm": 6.431846618652344, + "learning_rate": 1.5208328429041164e-05, + "loss": 0.0178, + "step": 325 + }, + { + "epoch": 6.88, + "grad_norm": 3.7818799018859863, + "learning_rate": 1.4733068165633629e-05, + "loss": 0.0091, + "step": 330 + }, + { + "epoch": 6.98, + "grad_norm": 0.11249573528766632, + "learning_rate": 1.4257807902226092e-05, + "loss": 0.2635, + "step": 335 + }, + { + "epoch": 7.0, + "eval_f1": 0.7500000000000001, + "eval_loss": 1.9278812408447266, + "eval_runtime": 1.3857, + "eval_samples_per_second": 46.187, + "eval_steps_per_second": 5.773, + "step": 336 + }, + { + "epoch": 7.08, + "grad_norm": Infinity, + "learning_rate": 1.3877599691500063e-05, + "loss": 0.2288, + "step": 340 + }, + { + "epoch": 7.19, + "grad_norm": 0.09949830174446106, + "learning_rate": 1.3402339428092527e-05, + "loss": 0.0357, + "step": 345 + }, + { + "epoch": 7.29, + "grad_norm": 0.29120293259620667, + "learning_rate": 1.292707916468499e-05, + "loss": 0.0071, + "step": 350 + }, + { + "epoch": 7.4, + "grad_norm": 0.3897027373313904, + "learning_rate": 1.2451818901277455e-05, + "loss": 0.0071, + "step": 355 + }, + { + "epoch": 7.5, + "grad_norm": 0.10337533801794052, + "learning_rate": 1.1976558637869918e-05, + "loss": 0.0076, + "step": 360 + }, + { + "epoch": 7.6, + "grad_norm": 0.14363917708396912, + "learning_rate": 1.1501298374462382e-05, + "loss": 0.0064, + "step": 365 + }, + { + "epoch": 7.71, + "grad_norm": 0.2638760805130005, + "learning_rate": 1.1026038111054845e-05, + "loss": 0.0715, + "step": 370 + }, + { + "epoch": 7.81, + "grad_norm": 0.08668874204158783, + "learning_rate": 1.0550777847647308e-05, + "loss": 0.2678, + "step": 375 + }, + { + "epoch": 7.92, + "grad_norm": 0.11121731251478195, + "learning_rate": 1.0075517584239773e-05, + "loss": 0.0059, + "step": 380 + }, + { + "epoch": 8.0, + "eval_f1": 0.7912087912087912, + "eval_loss": 1.495563268661499, + "eval_runtime": 1.4139, + "eval_samples_per_second": 45.264, + "eval_steps_per_second": 5.658, + "step": 384 + }, + { + "epoch": 8.02, + "grad_norm": 0.1238911896944046, + "learning_rate": 9.600257320832236e-06, + "loss": 0.0065, + "step": 385 + }, + { + "epoch": 8.12, + "grad_norm": 0.09688282012939453, + "learning_rate": 9.124997057424699e-06, + "loss": 0.0054, + "step": 390 + }, + { + "epoch": 8.23, + "grad_norm": 0.07482036203145981, + "learning_rate": 8.649736794017163e-06, + "loss": 0.0078, + "step": 395 + }, + { + "epoch": 8.33, + "grad_norm": 0.10461320728063583, + "learning_rate": 8.174476530609626e-06, + "loss": 0.2455, + "step": 400 + }, + { + "epoch": 8.44, + "grad_norm": 0.12439440935850143, + "learning_rate": 7.699216267202089e-06, + "loss": 0.0053, + "step": 405 + }, + { + "epoch": 8.54, + "grad_norm": 0.12789662182331085, + "learning_rate": 7.223956003794554e-06, + "loss": 0.0054, + "step": 410 + }, + { + "epoch": 8.65, + "grad_norm": 0.07380508631467819, + "learning_rate": 6.7486957403870175e-06, + "loss": 0.006, + "step": 415 + }, + { + "epoch": 8.75, + "grad_norm": 0.08458781242370605, + "learning_rate": 6.273435476979481e-06, + "loss": 0.0056, + "step": 420 + }, + { + "epoch": 8.85, + "grad_norm": 0.09543807804584503, + "learning_rate": 5.798175213571944e-06, + "loss": 0.0063, + "step": 425 + }, + { + "epoch": 8.96, + "grad_norm": 0.10101006925106049, + "learning_rate": 5.322914950164408e-06, + "loss": 0.0064, + "step": 430 + }, + { + "epoch": 9.0, + "eval_f1": 0.7586206896551725, + "eval_loss": 1.48129403591156, + "eval_runtime": 1.4314, + "eval_samples_per_second": 44.71, + "eval_steps_per_second": 5.589, + "step": 432 + }, + { + "epoch": 9.06, + "grad_norm": 0.10252533107995987, + "learning_rate": 4.847654686756872e-06, + "loss": 0.0053, + "step": 435 + }, + { + "epoch": 9.17, + "grad_norm": 0.08117670565843582, + "learning_rate": 4.372394423349335e-06, + "loss": 0.0042, + "step": 440 + }, + { + "epoch": 9.27, + "grad_norm": 0.9447091817855835, + "learning_rate": 3.897134159941798e-06, + "loss": 0.0052, + "step": 445 + }, + { + "epoch": 9.38, + "grad_norm": 1.359684944152832, + "learning_rate": 3.421873896534262e-06, + "loss": 0.0058, + "step": 450 + }, + { + "epoch": 9.48, + "grad_norm": 0.09509690850973129, + "learning_rate": 2.946613633126726e-06, + "loss": 0.0055, + "step": 455 + }, + { + "epoch": 9.58, + "grad_norm": 0.09477395564317703, + "learning_rate": 2.4713533697191893e-06, + "loss": 0.0059, + "step": 460 + }, + { + "epoch": 9.69, + "grad_norm": 0.12153127789497375, + "learning_rate": 1.996093106311653e-06, + "loss": 0.0046, + "step": 465 + }, + { + "epoch": 9.79, + "grad_norm": 0.12337913364171982, + "learning_rate": 1.5208328429041164e-06, + "loss": 0.0045, + "step": 470 + }, + { + "epoch": 9.9, + "grad_norm": 0.0738518089056015, + "learning_rate": 1.04557257949658e-06, + "loss": 0.07, + "step": 475 + }, + { + "epoch": 10.0, + "grad_norm": 0.11413593590259552, + "learning_rate": 5.703123160890437e-07, + "loss": 0.0049, + "step": 480 + }, + { + "epoch": 10.0, + "eval_f1": 0.7586206896551725, + "eval_loss": 1.53921377658844, + "eval_runtime": 1.3572, + "eval_samples_per_second": 47.156, + "eval_steps_per_second": 5.894, + "step": 480 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 7411132489412208.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 4.1062486758411146e-05, + "per_device_train_batch_size": 4 + } +} diff --git a/run-11/checkpoint-480/training_args.bin b/run-11/checkpoint-480/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eee85774a254ee9284e54bbcaa537c00a3ec285b --- /dev/null +++ b/run-11/checkpoint-480/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711d11e21d598840c9a2dcfe02c910b7483cbe50927c465ca1b21773cad22965 +size 4920 diff --git a/run-11/checkpoint-96/config.json b/run-11/checkpoint-96/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-11/checkpoint-96/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-11/checkpoint-96/model.safetensors b/run-11/checkpoint-96/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a494d728f2dcbff013d8670459038a66f0a210a7 --- /dev/null +++ b/run-11/checkpoint-96/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22c2792ea525a46253277ce91eade424daf4ea871a994c9c923a8d707eec744b +size 94763496 diff --git a/run-11/checkpoint-96/optimizer.pt b/run-11/checkpoint-96/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb7bf6e9a07655cd6b2f5f92ab7ac98151f37c5f --- /dev/null +++ b/run-11/checkpoint-96/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8012385934fc6cb5a552b3a467d6241f73003f36a028c51efc6b09dcb662cda +size 189552570 diff --git a/run-11/checkpoint-96/preprocessor_config.json b/run-11/checkpoint-96/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-11/checkpoint-96/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-11/checkpoint-96/rng_state.pth b/run-11/checkpoint-96/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5df9532d48eec28233ca1958234673b2505309f1 --- /dev/null +++ b/run-11/checkpoint-96/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39dbf03bf644af79257aec95c925042cb81a469bfcc7a839a95d68f1d0425513 +size 14244 diff --git a/run-11/checkpoint-96/scheduler.pt b/run-11/checkpoint-96/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..77e8d8698eb6cf278c22d49084fc1893c00d534b --- /dev/null +++ b/run-11/checkpoint-96/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bae1d77ddf16eeb434d17489ad61c506dd9e15171480353c64175bf1ec243ac +size 1064 diff --git a/run-11/checkpoint-96/trainer_state.json b/run-11/checkpoint-96/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a394c9bded5dd7f484076c1f5da74367e333b191 --- /dev/null +++ b/run-11/checkpoint-96/trainer_state.json @@ -0,0 +1,175 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-11/checkpoint-48", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 96, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2641795873641968, + "learning_rate": 4.277342370667828e-06, + "loss": 0.702, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.1789261102676392, + "learning_rate": 8.554684741335655e-06, + "loss": 0.6864, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.3848576545715332, + "learning_rate": 1.2832027112003484e-05, + "loss": 0.6855, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 0.953545093536377, + "learning_rate": 1.710936948267131e-05, + "loss": 0.6668, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 0.9704298377037048, + "learning_rate": 2.138671185333914e-05, + "loss": 0.6426, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 0.7949383854866028, + "learning_rate": 2.5664054224006968e-05, + "loss": 0.6293, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 2.390880823135376, + "learning_rate": 2.994139659467479e-05, + "loss": 0.5631, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 1.7147374153137207, + "learning_rate": 3.421873896534262e-05, + "loss": 0.6562, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 3.123836040496826, + "learning_rate": 3.849608133601045e-05, + "loss": 0.6243, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7374954223632812, + "eval_runtime": 1.3701, + "eval_samples_per_second": 46.713, + "eval_steps_per_second": 5.839, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 4.445352077484131, + "learning_rate": 4.0872382653048134e-05, + "loss": 0.6965, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 0.8689994215965271, + "learning_rate": 4.03971223896406e-05, + "loss": 0.6945, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 1.744273066520691, + "learning_rate": 3.992186212623306e-05, + "loss": 0.4097, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 0.6568699479103088, + "learning_rate": 3.944660186282552e-05, + "loss": 0.5621, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 1.516032099723816, + "learning_rate": 3.8971341599417986e-05, + "loss": 0.4826, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": Infinity, + "learning_rate": 3.8591133388691955e-05, + "loss": 0.7666, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 1.788261890411377, + "learning_rate": 3.811587312528442e-05, + "loss": 0.6187, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 1.7383731603622437, + "learning_rate": 3.764061286187688e-05, + "loss": 0.5974, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 1.9014606475830078, + "learning_rate": 3.7165352598469344e-05, + "loss": 0.5979, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 2.1836445331573486, + "learning_rate": 3.669009233506181e-05, + "loss": 0.6104, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6798496246337891, + "eval_runtime": 1.3616, + "eval_samples_per_second": 47.003, + "eval_steps_per_second": 5.875, + "step": 96 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 1547180751563808.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 4.1062486758411146e-05, + "per_device_train_batch_size": 4 + } +} diff --git a/run-11/checkpoint-96/training_args.bin b/run-11/checkpoint-96/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..eee85774a254ee9284e54bbcaa537c00a3ec285b --- /dev/null +++ b/run-11/checkpoint-96/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:711d11e21d598840c9a2dcfe02c910b7483cbe50927c465ca1b21773cad22965 +size 4920 diff --git a/run-12/checkpoint-48/config.json b/run-12/checkpoint-48/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-12/checkpoint-48/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-12/checkpoint-48/model.safetensors b/run-12/checkpoint-48/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2bd0c74bd3b4926b50964a7bb09c6aa8baae4f50 --- /dev/null +++ b/run-12/checkpoint-48/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0ba6fbcbb9cb83150dc1524c9934db5438ee17d96fb72054875ac1e12dab680 +size 94763496 diff --git a/run-12/checkpoint-48/optimizer.pt b/run-12/checkpoint-48/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..41517c295c97e5fd41ddcb1a2a4b8425a995d083 --- /dev/null +++ b/run-12/checkpoint-48/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abb631a5f411032b31d0baaa2ccc7fea0420d7b1d9b999fc0e42a33075cde1dd +size 189552570 diff --git a/run-12/checkpoint-48/preprocessor_config.json b/run-12/checkpoint-48/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-12/checkpoint-48/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-12/checkpoint-48/rng_state.pth b/run-12/checkpoint-48/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f69ac2b3cc24a2d23f1e99dfab26d0a1d84a680 --- /dev/null +++ b/run-12/checkpoint-48/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7251f0e64bf9e5675ed89b468a7ff74c1c3fd6457742f84db0e5e361db11f13 +size 14244 diff --git a/run-12/checkpoint-48/scheduler.pt b/run-12/checkpoint-48/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cf12f94ee94a3fce005d946f7aee485429574a0 --- /dev/null +++ b/run-12/checkpoint-48/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2b0440527e9e91711783dc9c7a5b958531191908f696aaf97a30075d5c2cd1f +size 1064 diff --git a/run-12/checkpoint-48/trainer_state.json b/run-12/checkpoint-48/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4f223b5899e75d9dfcc03a0ffe606d8145cf15e6 --- /dev/null +++ b/run-12/checkpoint-48/trainer_state.json @@ -0,0 +1,96 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-12/checkpoint-48", + "epoch": 1.0, + "eval_steps": 500, + "global_step": 48, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2605946063995361, + "learning_rate": 5.868670588038625e-06, + "loss": 0.7019, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.1554573774337769, + "learning_rate": 1.173734117607725e-05, + "loss": 0.6822, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.2631292343139648, + "learning_rate": 1.7606011764115876e-05, + "loss": 0.6823, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 0.8986643552780151, + "learning_rate": 2.34746823521545e-05, + "loss": 0.6573, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 0.8833038210868835, + "learning_rate": 2.934335294019313e-05, + "loss": 0.6302, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 0.6974115967750549, + "learning_rate": 3.521202352823175e-05, + "loss": 0.6181, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 2.8611502647399902, + "learning_rate": 4.1080694116270374e-05, + "loss": 0.5431, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 1.6229726076126099, + "learning_rate": 4.6949364704309e-05, + "loss": 0.6722, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 3.6481218338012695, + "learning_rate": 5.281803529234763e-05, + "loss": 0.6342, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7596015930175781, + "eval_runtime": 1.3819, + "eval_samples_per_second": 46.313, + "eval_steps_per_second": 5.789, + "step": 48 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 670686130935120.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 5.6339237645170805e-05, + "per_device_train_batch_size": 4 + } +} diff --git a/run-12/checkpoint-48/training_args.bin b/run-12/checkpoint-48/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0ee97abe4643adaac3cd80fa21facc591cbc7e9f --- /dev/null +++ b/run-12/checkpoint-48/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a13ddf2852b749b2ed775695ad5ddfecc242a8e8d8b1546d5e7357434a3ed37 +size 4920 diff --git a/run-8/checkpoint-120/config.json b/run-8/checkpoint-120/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-8/checkpoint-120/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-8/checkpoint-120/model.safetensors b/run-8/checkpoint-120/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..10f065283be3d227d5faed6ad9e5f39e06173484 --- /dev/null +++ b/run-8/checkpoint-120/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92edd0e79583d23b18d51b42f5f45dcae72ce4267081f1c09b8a165b9f2d2354 +size 94763496 diff --git a/run-8/checkpoint-120/optimizer.pt b/run-8/checkpoint-120/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..378ef9a4ce0e8b9e83e88954fc76d1ab76939c0d --- /dev/null +++ b/run-8/checkpoint-120/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:accecfdde0946925eaa672ee3117d9ae527548f71c17e473544d3e40e895c1f4 +size 189552570 diff --git a/run-8/checkpoint-120/preprocessor_config.json b/run-8/checkpoint-120/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-8/checkpoint-120/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-8/checkpoint-120/rng_state.pth b/run-8/checkpoint-120/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e7f7f63718a6f43937c9752e330bbe6b66b5ed44 --- /dev/null +++ b/run-8/checkpoint-120/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea9b93979db9d187112d0877ca456edd569c23f080722f2fbbe337c28c1a6935 +size 14244 diff --git a/run-8/checkpoint-120/scheduler.pt b/run-8/checkpoint-120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..198082e73e6a719d5d51cb68f2c8b58c695db539 --- /dev/null +++ b/run-8/checkpoint-120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18a19275ee8b60a1c380f213b92308a9a456d89107405291ffd751e5ea397e36 +size 1064 diff --git a/run-8/checkpoint-120/trainer_state.json b/run-8/checkpoint-120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..135f58e5424da6f8f07b9d3676103819cb58a51b --- /dev/null +++ b/run-8/checkpoint-120/trainer_state.json @@ -0,0 +1,237 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-8/checkpoint-24", + "epoch": 5.0, + "eval_steps": 500, + "global_step": 120, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.21, + "grad_norm": 1.6496813297271729, + "learning_rate": 2.9441483262927863e-06, + "loss": 0.6992, + "step": 5 + }, + { + "epoch": 0.42, + "grad_norm": 0.620004415512085, + "learning_rate": 5.888296652585573e-06, + "loss": 0.6939, + "step": 10 + }, + { + "epoch": 0.62, + "grad_norm": 0.5726878046989441, + "learning_rate": 8.832444978878358e-06, + "loss": 0.6835, + "step": 15 + }, + { + "epoch": 0.83, + "grad_norm": 1.4239176511764526, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6673, + "step": 20 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6805419921875, + "eval_runtime": 1.3716, + "eval_samples_per_second": 46.661, + "eval_steps_per_second": 5.833, + "step": 24 + }, + { + "epoch": 1.04, + "grad_norm": 0.9195191264152527, + "learning_rate": 1.406648644784331e-05, + "loss": 0.6673, + "step": 25 + }, + { + "epoch": 1.25, + "grad_norm": 1.8524231910705566, + "learning_rate": 1.3739358856033002e-05, + "loss": 0.6155, + "step": 30 + }, + { + "epoch": 1.46, + "grad_norm": 1.8213531970977783, + "learning_rate": 1.3412231264222692e-05, + "loss": 0.5895, + "step": 35 + }, + { + "epoch": 1.67, + "grad_norm": 0.4818130433559418, + "learning_rate": 1.3085103672412383e-05, + "loss": 0.6468, + "step": 40 + }, + { + "epoch": 1.88, + "grad_norm": 0.6597484946250916, + "learning_rate": 1.2757976080602073e-05, + "loss": 0.6173, + "step": 45 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6970634460449219, + "eval_runtime": 1.3632, + "eval_samples_per_second": 46.949, + "eval_steps_per_second": 5.869, + "step": 48 + }, + { + "epoch": 2.08, + "grad_norm": 0.48903289437294006, + "learning_rate": 1.2430848488791764e-05, + "loss": 0.6302, + "step": 50 + }, + { + "epoch": 2.29, + "grad_norm": 0.6064260601997375, + "learning_rate": 1.2103720896981454e-05, + "loss": 0.5867, + "step": 55 + }, + { + "epoch": 2.5, + "grad_norm": 0.6802453398704529, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6321, + "step": 60 + }, + { + "epoch": 2.71, + "grad_norm": 1.2592875957489014, + "learning_rate": 1.1449465713360835e-05, + "loss": 0.6223, + "step": 65 + }, + { + "epoch": 2.92, + "grad_norm": 1.1591824293136597, + "learning_rate": 1.1122338121550526e-05, + "loss": 0.4922, + "step": 70 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7079887390136719, + "eval_runtime": 1.3669, + "eval_samples_per_second": 46.821, + "eval_steps_per_second": 5.853, + "step": 72 + }, + { + "epoch": 3.12, + "grad_norm": 0.5332023501396179, + "learning_rate": 1.0795210529740214e-05, + "loss": 0.5989, + "step": 75 + }, + { + "epoch": 3.33, + "grad_norm": 0.5555600523948669, + "learning_rate": 1.0468082937929906e-05, + "loss": 0.6104, + "step": 80 + }, + { + "epoch": 3.54, + "grad_norm": 1.2928024530410767, + "learning_rate": 1.0140955346119596e-05, + "loss": 0.4936, + "step": 85 + }, + { + "epoch": 3.75, + "grad_norm": 1.1424989700317383, + "learning_rate": 9.813827754309287e-06, + "loss": 0.6191, + "step": 90 + }, + { + "epoch": 3.96, + "grad_norm": 1.119732141494751, + "learning_rate": 9.486700162498977e-06, + "loss": 0.6004, + "step": 95 + }, + { + "epoch": 4.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7053489685058594, + "eval_runtime": 1.3556, + "eval_samples_per_second": 47.211, + "eval_steps_per_second": 5.901, + "step": 96 + }, + { + "epoch": 4.17, + "grad_norm": 0.8135461211204529, + "learning_rate": 9.159572570688668e-06, + "loss": 0.5154, + "step": 100 + }, + { + "epoch": 4.38, + "grad_norm": 1.8034342527389526, + "learning_rate": 8.832444978878358e-06, + "loss": 0.6067, + "step": 105 + }, + { + "epoch": 4.58, + "grad_norm": 0.8029685020446777, + "learning_rate": 8.505317387068049e-06, + "loss": 0.5499, + "step": 110 + }, + { + "epoch": 4.79, + "grad_norm": 1.019626259803772, + "learning_rate": 8.178189795257739e-06, + "loss": 0.5542, + "step": 115 + }, + { + "epoch": 5.0, + "grad_norm": 1.861674427986145, + "learning_rate": 7.85106220344743e-06, + "loss": 0.5545, + "step": 120 + }, + { + "epoch": 5.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6832504272460938, + "eval_runtime": 1.3811, + "eval_samples_per_second": 46.341, + "eval_steps_per_second": 5.793, + "step": 120 + } + ], + "logging_steps": 5, + "max_steps": 240, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 4193401989215328.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": { + "learning_rate": 1.4131911966205373e-05, + "per_device_train_batch_size": 8 + } +} diff --git a/run-8/checkpoint-120/training_args.bin b/run-8/checkpoint-120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d5c1c69e2334e136936b2e49c08c4956c68cf869 --- /dev/null +++ b/run-8/checkpoint-120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5bd287ce4223bd729031c331a786d123069a13358bbb44dde2321174ac948e +size 4920 diff --git a/run-8/checkpoint-144/config.json b/run-8/checkpoint-144/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-8/checkpoint-144/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-8/checkpoint-144/model.safetensors b/run-8/checkpoint-144/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3e6bf199d251b6ecde8c914387378b6fb0f01b91 --- /dev/null +++ b/run-8/checkpoint-144/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fca88fd1581ddef6c4c089d9ab34cbba4b8b761884404dc7a5a3de25f787d54 +size 94763496 diff --git a/run-8/checkpoint-144/optimizer.pt b/run-8/checkpoint-144/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..71e85ca5d2d487100a2c3bd627984961dde77b60 --- /dev/null +++ b/run-8/checkpoint-144/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1809aea0ce1164d399aa33cdd60f3f515d6b003cbf972e319f0411b0cfe17b6a +size 189552570 diff --git a/run-8/checkpoint-144/preprocessor_config.json b/run-8/checkpoint-144/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-8/checkpoint-144/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-8/checkpoint-144/rng_state.pth b/run-8/checkpoint-144/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0dcad447546559109c3c49122d38655653738eb1 --- /dev/null +++ b/run-8/checkpoint-144/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eddddd854f935bdc6bf7e33f19c011df64d161505c6be9aa447288219fa3010a +size 14244 diff --git a/run-8/checkpoint-144/scheduler.pt b/run-8/checkpoint-144/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7609c683a55ef280fe8b896f7fc260e43ed1942 --- /dev/null +++ b/run-8/checkpoint-144/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:667e860f1f0e6cf98aaf11ea7eb56937a3c19f175aad57494d4a13bda196eca7 +size 1064 diff --git a/run-8/checkpoint-144/trainer_state.json b/run-8/checkpoint-144/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bf91e53950889090346bc7c2589981315e68487c --- /dev/null +++ b/run-8/checkpoint-144/trainer_state.json @@ -0,0 +1,274 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-8/checkpoint-24", + "epoch": 6.0, + "eval_steps": 500, + "global_step": 144, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.21, + "grad_norm": 1.6496813297271729, + "learning_rate": 2.9441483262927863e-06, + "loss": 0.6992, + "step": 5 + }, + { + "epoch": 0.42, + "grad_norm": 0.620004415512085, + "learning_rate": 5.888296652585573e-06, + "loss": 0.6939, + "step": 10 + }, + { + "epoch": 0.62, + "grad_norm": 0.5726878046989441, + "learning_rate": 8.832444978878358e-06, + "loss": 0.6835, + "step": 15 + }, + { + "epoch": 0.83, + "grad_norm": 1.4239176511764526, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6673, + "step": 20 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6805419921875, + "eval_runtime": 1.3716, + "eval_samples_per_second": 46.661, + "eval_steps_per_second": 5.833, + "step": 24 + }, + { + "epoch": 1.04, + "grad_norm": 0.9195191264152527, + "learning_rate": 1.406648644784331e-05, + "loss": 0.6673, + "step": 25 + }, + { + "epoch": 1.25, + "grad_norm": 1.8524231910705566, + "learning_rate": 1.3739358856033002e-05, + "loss": 0.6155, + "step": 30 + }, + { + "epoch": 1.46, + "grad_norm": 1.8213531970977783, + "learning_rate": 1.3412231264222692e-05, + "loss": 0.5895, + "step": 35 + }, + { + "epoch": 1.67, + "grad_norm": 0.4818130433559418, + "learning_rate": 1.3085103672412383e-05, + "loss": 0.6468, + "step": 40 + }, + { + "epoch": 1.88, + "grad_norm": 0.6597484946250916, + "learning_rate": 1.2757976080602073e-05, + "loss": 0.6173, + "step": 45 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6970634460449219, + "eval_runtime": 1.3632, + "eval_samples_per_second": 46.949, + "eval_steps_per_second": 5.869, + "step": 48 + }, + { + "epoch": 2.08, + "grad_norm": 0.48903289437294006, + "learning_rate": 1.2430848488791764e-05, + "loss": 0.6302, + "step": 50 + }, + { + "epoch": 2.29, + "grad_norm": 0.6064260601997375, + "learning_rate": 1.2103720896981454e-05, + "loss": 0.5867, + "step": 55 + }, + { + "epoch": 2.5, + "grad_norm": 0.6802453398704529, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6321, + "step": 60 + }, + { + "epoch": 2.71, + "grad_norm": 1.2592875957489014, + "learning_rate": 1.1449465713360835e-05, + "loss": 0.6223, + "step": 65 + }, + { + "epoch": 2.92, + "grad_norm": 1.1591824293136597, + "learning_rate": 1.1122338121550526e-05, + "loss": 0.4922, + "step": 70 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7079887390136719, + "eval_runtime": 1.3669, + "eval_samples_per_second": 46.821, + "eval_steps_per_second": 5.853, + "step": 72 + }, + { + "epoch": 3.12, + "grad_norm": 0.5332023501396179, + "learning_rate": 1.0795210529740214e-05, + "loss": 0.5989, + "step": 75 + }, + { + "epoch": 3.33, + "grad_norm": 0.5555600523948669, + "learning_rate": 1.0468082937929906e-05, + "loss": 0.6104, + "step": 80 + }, + { + "epoch": 3.54, + "grad_norm": 1.2928024530410767, + "learning_rate": 1.0140955346119596e-05, + "loss": 0.4936, + "step": 85 + }, + { + "epoch": 3.75, + "grad_norm": 1.1424989700317383, + "learning_rate": 9.813827754309287e-06, + "loss": 0.6191, + "step": 90 + }, + { + "epoch": 3.96, + "grad_norm": 1.119732141494751, + "learning_rate": 9.486700162498977e-06, + "loss": 0.6004, + "step": 95 + }, + { + "epoch": 4.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7053489685058594, + "eval_runtime": 1.3556, + "eval_samples_per_second": 47.211, + "eval_steps_per_second": 5.901, + "step": 96 + }, + { + "epoch": 4.17, + "grad_norm": 0.8135461211204529, + "learning_rate": 9.159572570688668e-06, + "loss": 0.5154, + "step": 100 + }, + { + "epoch": 4.38, + "grad_norm": 1.8034342527389526, + "learning_rate": 8.832444978878358e-06, + "loss": 0.6067, + "step": 105 + }, + { + "epoch": 4.58, + "grad_norm": 0.8029685020446777, + "learning_rate": 8.505317387068049e-06, + "loss": 0.5499, + "step": 110 + }, + { + "epoch": 4.79, + "grad_norm": 1.019626259803772, + "learning_rate": 8.178189795257739e-06, + "loss": 0.5542, + "step": 115 + }, + { + "epoch": 5.0, + "grad_norm": 1.861674427986145, + "learning_rate": 7.85106220344743e-06, + "loss": 0.5545, + "step": 120 + }, + { + "epoch": 5.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6832504272460938, + "eval_runtime": 1.3811, + "eval_samples_per_second": 46.341, + "eval_steps_per_second": 5.793, + "step": 120 + }, + { + "epoch": 5.21, + "grad_norm": 1.5949212312698364, + "learning_rate": 7.523934611637121e-06, + "loss": 0.4806, + "step": 125 + }, + { + "epoch": 5.42, + "grad_norm": 3.002861738204956, + "learning_rate": 7.196807019826811e-06, + "loss": 0.5832, + "step": 130 + }, + { + "epoch": 5.62, + "grad_norm": 1.4606820344924927, + "learning_rate": 6.9351049463785626e-06, + "loss": 0.5481, + "step": 135 + }, + { + "epoch": 5.83, + "grad_norm": 1.6088628768920898, + "learning_rate": 6.607977354568253e-06, + "loss": 0.5333, + "step": 140 + }, + { + "epoch": 6.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6599597930908203, + "eval_runtime": 1.3747, + "eval_samples_per_second": 46.555, + "eval_steps_per_second": 5.819, + "step": 144 + } + ], + "logging_steps": 5, + "max_steps": 240, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 4911381340990080.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": { + "learning_rate": 1.4131911966205373e-05, + "per_device_train_batch_size": 8 + } +} diff --git a/run-8/checkpoint-144/training_args.bin b/run-8/checkpoint-144/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d5c1c69e2334e136936b2e49c08c4956c68cf869 --- /dev/null +++ b/run-8/checkpoint-144/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5bd287ce4223bd729031c331a786d123069a13358bbb44dde2321174ac948e +size 4920 diff --git a/run-8/checkpoint-168/config.json b/run-8/checkpoint-168/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-8/checkpoint-168/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-8/checkpoint-168/model.safetensors b/run-8/checkpoint-168/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8a48959d68d2d412f8c45f995806d3b2e0dae815 --- /dev/null +++ b/run-8/checkpoint-168/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58ebd72680d4004f3ec2c41b2fb8fa49ab1c9bad1f0c3ee10cc9570dfa27474d +size 94763496 diff --git a/run-8/checkpoint-168/optimizer.pt b/run-8/checkpoint-168/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f239b792b74c401567262fb227b6e1c6f4824ab7 --- /dev/null +++ b/run-8/checkpoint-168/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34f21cb461acac8cfff0a49b7932e25b035c993404605f096e7cae503cbf87b9 +size 189552570 diff --git a/run-8/checkpoint-168/preprocessor_config.json b/run-8/checkpoint-168/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-8/checkpoint-168/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-8/checkpoint-168/rng_state.pth b/run-8/checkpoint-168/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b5968c6a0d63ac9dece0214117bab8d5185d7c76 --- /dev/null +++ b/run-8/checkpoint-168/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3d1e8300999dd1e32b96ea9da5218bf1661f524b97023fae45e0a2d78d5309f +size 14244 diff --git a/run-8/checkpoint-168/scheduler.pt b/run-8/checkpoint-168/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..92abe9aadfd3917e8356d77a30917ab7ef6c0664 --- /dev/null +++ b/run-8/checkpoint-168/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9c27a94964552b07f40152e74cbbce1f4535883b0e349e2074af268e2c51c43 +size 1064 diff --git a/run-8/checkpoint-168/trainer_state.json b/run-8/checkpoint-168/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..16111a5e5130e87b53787da974727d9073d98a46 --- /dev/null +++ b/run-8/checkpoint-168/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 0.74, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-8/checkpoint-168", + "epoch": 7.0, + "eval_steps": 500, + "global_step": 168, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.21, + "grad_norm": 1.6496813297271729, + "learning_rate": 2.9441483262927863e-06, + "loss": 0.6992, + "step": 5 + }, + { + "epoch": 0.42, + "grad_norm": 0.620004415512085, + "learning_rate": 5.888296652585573e-06, + "loss": 0.6939, + "step": 10 + }, + { + "epoch": 0.62, + "grad_norm": 0.5726878046989441, + "learning_rate": 8.832444978878358e-06, + "loss": 0.6835, + "step": 15 + }, + { + "epoch": 0.83, + "grad_norm": 1.4239176511764526, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6673, + "step": 20 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6805419921875, + "eval_runtime": 1.3716, + "eval_samples_per_second": 46.661, + "eval_steps_per_second": 5.833, + "step": 24 + }, + { + "epoch": 1.04, + "grad_norm": 0.9195191264152527, + "learning_rate": 1.406648644784331e-05, + "loss": 0.6673, + "step": 25 + }, + { + "epoch": 1.25, + "grad_norm": 1.8524231910705566, + "learning_rate": 1.3739358856033002e-05, + "loss": 0.6155, + "step": 30 + }, + { + "epoch": 1.46, + "grad_norm": 1.8213531970977783, + "learning_rate": 1.3412231264222692e-05, + "loss": 0.5895, + "step": 35 + }, + { + "epoch": 1.67, + "grad_norm": 0.4818130433559418, + "learning_rate": 1.3085103672412383e-05, + "loss": 0.6468, + "step": 40 + }, + { + "epoch": 1.88, + "grad_norm": 0.6597484946250916, + "learning_rate": 1.2757976080602073e-05, + "loss": 0.6173, + "step": 45 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6970634460449219, + "eval_runtime": 1.3632, + "eval_samples_per_second": 46.949, + "eval_steps_per_second": 5.869, + "step": 48 + }, + { + "epoch": 2.08, + "grad_norm": 0.48903289437294006, + "learning_rate": 1.2430848488791764e-05, + "loss": 0.6302, + "step": 50 + }, + { + "epoch": 2.29, + "grad_norm": 0.6064260601997375, + "learning_rate": 1.2103720896981454e-05, + "loss": 0.5867, + "step": 55 + }, + { + "epoch": 2.5, + "grad_norm": 0.6802453398704529, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6321, + "step": 60 + }, + { + "epoch": 2.71, + "grad_norm": 1.2592875957489014, + "learning_rate": 1.1449465713360835e-05, + "loss": 0.6223, + "step": 65 + }, + { + "epoch": 2.92, + "grad_norm": 1.1591824293136597, + "learning_rate": 1.1122338121550526e-05, + "loss": 0.4922, + "step": 70 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7079887390136719, + "eval_runtime": 1.3669, + "eval_samples_per_second": 46.821, + "eval_steps_per_second": 5.853, + "step": 72 + }, + { + "epoch": 3.12, + "grad_norm": 0.5332023501396179, + "learning_rate": 1.0795210529740214e-05, + "loss": 0.5989, + "step": 75 + }, + { + "epoch": 3.33, + "grad_norm": 0.5555600523948669, + "learning_rate": 1.0468082937929906e-05, + "loss": 0.6104, + "step": 80 + }, + { + "epoch": 3.54, + "grad_norm": 1.2928024530410767, + "learning_rate": 1.0140955346119596e-05, + "loss": 0.4936, + "step": 85 + }, + { + "epoch": 3.75, + "grad_norm": 1.1424989700317383, + "learning_rate": 9.813827754309287e-06, + "loss": 0.6191, + "step": 90 + }, + { + "epoch": 3.96, + "grad_norm": 1.119732141494751, + "learning_rate": 9.486700162498977e-06, + "loss": 0.6004, + "step": 95 + }, + { + "epoch": 4.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7053489685058594, + "eval_runtime": 1.3556, + "eval_samples_per_second": 47.211, + "eval_steps_per_second": 5.901, + "step": 96 + }, + { + "epoch": 4.17, + "grad_norm": 0.8135461211204529, + "learning_rate": 9.159572570688668e-06, + "loss": 0.5154, + "step": 100 + }, + { + "epoch": 4.38, + "grad_norm": 1.8034342527389526, + "learning_rate": 8.832444978878358e-06, + "loss": 0.6067, + "step": 105 + }, + { + "epoch": 4.58, + "grad_norm": 0.8029685020446777, + "learning_rate": 8.505317387068049e-06, + "loss": 0.5499, + "step": 110 + }, + { + "epoch": 4.79, + "grad_norm": 1.019626259803772, + "learning_rate": 8.178189795257739e-06, + "loss": 0.5542, + "step": 115 + }, + { + "epoch": 5.0, + "grad_norm": 1.861674427986145, + "learning_rate": 7.85106220344743e-06, + "loss": 0.5545, + "step": 120 + }, + { + "epoch": 5.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6832504272460938, + "eval_runtime": 1.3811, + "eval_samples_per_second": 46.341, + "eval_steps_per_second": 5.793, + "step": 120 + }, + { + "epoch": 5.21, + "grad_norm": 1.5949212312698364, + "learning_rate": 7.523934611637121e-06, + "loss": 0.4806, + "step": 125 + }, + { + "epoch": 5.42, + "grad_norm": 3.002861738204956, + "learning_rate": 7.196807019826811e-06, + "loss": 0.5832, + "step": 130 + }, + { + "epoch": 5.62, + "grad_norm": 1.4606820344924927, + "learning_rate": 6.9351049463785626e-06, + "loss": 0.5481, + "step": 135 + }, + { + "epoch": 5.83, + "grad_norm": 1.6088628768920898, + "learning_rate": 6.607977354568253e-06, + "loss": 0.5333, + "step": 140 + }, + { + "epoch": 6.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6599597930908203, + "eval_runtime": 1.3747, + "eval_samples_per_second": 46.555, + "eval_steps_per_second": 5.819, + "step": 144 + }, + { + "epoch": 6.04, + "grad_norm": 2.2051286697387695, + "learning_rate": 6.280849762757943e-06, + "loss": 0.4805, + "step": 145 + }, + { + "epoch": 6.25, + "grad_norm": 1.6964988708496094, + "learning_rate": 5.953722170947633e-06, + "loss": 0.4573, + "step": 150 + }, + { + "epoch": 6.46, + "grad_norm": 2.1374056339263916, + "learning_rate": 5.626594579137324e-06, + "loss": 0.59, + "step": 155 + }, + { + "epoch": 6.67, + "grad_norm": 1.8037084341049194, + "learning_rate": 5.299466987327015e-06, + "loss": 0.494, + "step": 160 + }, + { + "epoch": 6.88, + "grad_norm": 1.7916295528411865, + "learning_rate": 4.972339395516706e-06, + "loss": 0.4997, + "step": 165 + }, + { + "epoch": 7.0, + "eval_f1": 0.74, + "eval_loss": 0.6653976440429688, + "eval_runtime": 1.3593, + "eval_samples_per_second": 47.083, + "eval_steps_per_second": 5.885, + "step": 168 + } + ], + "logging_steps": 5, + "max_steps": 240, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 5784921236870880.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": { + "learning_rate": 1.4131911966205373e-05, + "per_device_train_batch_size": 8 + } +} diff --git a/run-8/checkpoint-168/training_args.bin b/run-8/checkpoint-168/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d5c1c69e2334e136936b2e49c08c4956c68cf869 --- /dev/null +++ b/run-8/checkpoint-168/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5bd287ce4223bd729031c331a786d123069a13358bbb44dde2321174ac948e +size 4920 diff --git a/run-8/checkpoint-192/config.json b/run-8/checkpoint-192/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-8/checkpoint-192/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-8/checkpoint-192/model.safetensors b/run-8/checkpoint-192/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..15045f934267f1c25058d7cd5cd1969f80fe9920 --- /dev/null +++ b/run-8/checkpoint-192/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d2018924d161a471f3d1abfb7a4f8e13ff6cc8f2aa4861ec9885f78eba64332 +size 94763496 diff --git a/run-8/checkpoint-192/optimizer.pt b/run-8/checkpoint-192/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..23596696765e3a58de00d3a436fde45120270910 --- /dev/null +++ b/run-8/checkpoint-192/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7fb7451f009f26519dc85c2a260ad506485469f091236769cc59432d3b04b95 +size 189552570 diff --git a/run-8/checkpoint-192/preprocessor_config.json b/run-8/checkpoint-192/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-8/checkpoint-192/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-8/checkpoint-192/rng_state.pth b/run-8/checkpoint-192/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..875f12d85ee4fe0a19729b6294d1d2cdf10f1089 --- /dev/null +++ b/run-8/checkpoint-192/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7d36d02b8fccc787134862aa01292d9aac8446548e5817f3421eb9f95a8666e +size 14244 diff --git a/run-8/checkpoint-192/scheduler.pt b/run-8/checkpoint-192/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..adb130fb794ecd82108a0d813a09dbe72dbbab6f --- /dev/null +++ b/run-8/checkpoint-192/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e0c38af311b4caf48546e0d7f65fed32b091d933c37cf558aedfcc135eb2b04 +size 1064 diff --git a/run-8/checkpoint-192/trainer_state.json b/run-8/checkpoint-192/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9ad7a441b567013df990c7c1e57ed2c46d0f7b60 --- /dev/null +++ b/run-8/checkpoint-192/trainer_state.json @@ -0,0 +1,362 @@ +{ + "best_metric": 0.74, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-8/checkpoint-168", + "epoch": 8.0, + "eval_steps": 500, + "global_step": 192, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.21, + "grad_norm": 1.6496813297271729, + "learning_rate": 2.9441483262927863e-06, + "loss": 0.6992, + "step": 5 + }, + { + "epoch": 0.42, + "grad_norm": 0.620004415512085, + "learning_rate": 5.888296652585573e-06, + "loss": 0.6939, + "step": 10 + }, + { + "epoch": 0.62, + "grad_norm": 0.5726878046989441, + "learning_rate": 8.832444978878358e-06, + "loss": 0.6835, + "step": 15 + }, + { + "epoch": 0.83, + "grad_norm": 1.4239176511764526, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6673, + "step": 20 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6805419921875, + "eval_runtime": 1.3716, + "eval_samples_per_second": 46.661, + "eval_steps_per_second": 5.833, + "step": 24 + }, + { + "epoch": 1.04, + "grad_norm": 0.9195191264152527, + "learning_rate": 1.406648644784331e-05, + "loss": 0.6673, + "step": 25 + }, + { + "epoch": 1.25, + "grad_norm": 1.8524231910705566, + "learning_rate": 1.3739358856033002e-05, + "loss": 0.6155, + "step": 30 + }, + { + "epoch": 1.46, + "grad_norm": 1.8213531970977783, + "learning_rate": 1.3412231264222692e-05, + "loss": 0.5895, + "step": 35 + }, + { + "epoch": 1.67, + "grad_norm": 0.4818130433559418, + "learning_rate": 1.3085103672412383e-05, + "loss": 0.6468, + "step": 40 + }, + { + "epoch": 1.88, + "grad_norm": 0.6597484946250916, + "learning_rate": 1.2757976080602073e-05, + "loss": 0.6173, + "step": 45 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6970634460449219, + "eval_runtime": 1.3632, + "eval_samples_per_second": 46.949, + "eval_steps_per_second": 5.869, + "step": 48 + }, + { + "epoch": 2.08, + "grad_norm": 0.48903289437294006, + "learning_rate": 1.2430848488791764e-05, + "loss": 0.6302, + "step": 50 + }, + { + "epoch": 2.29, + "grad_norm": 0.6064260601997375, + "learning_rate": 1.2103720896981454e-05, + "loss": 0.5867, + "step": 55 + }, + { + "epoch": 2.5, + "grad_norm": 0.6802453398704529, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6321, + "step": 60 + }, + { + "epoch": 2.71, + "grad_norm": 1.2592875957489014, + "learning_rate": 1.1449465713360835e-05, + "loss": 0.6223, + "step": 65 + }, + { + "epoch": 2.92, + "grad_norm": 1.1591824293136597, + "learning_rate": 1.1122338121550526e-05, + "loss": 0.4922, + "step": 70 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7079887390136719, + "eval_runtime": 1.3669, + "eval_samples_per_second": 46.821, + "eval_steps_per_second": 5.853, + "step": 72 + }, + { + "epoch": 3.12, + "grad_norm": 0.5332023501396179, + "learning_rate": 1.0795210529740214e-05, + "loss": 0.5989, + "step": 75 + }, + { + "epoch": 3.33, + "grad_norm": 0.5555600523948669, + "learning_rate": 1.0468082937929906e-05, + "loss": 0.6104, + "step": 80 + }, + { + "epoch": 3.54, + "grad_norm": 1.2928024530410767, + "learning_rate": 1.0140955346119596e-05, + "loss": 0.4936, + "step": 85 + }, + { + "epoch": 3.75, + "grad_norm": 1.1424989700317383, + "learning_rate": 9.813827754309287e-06, + "loss": 0.6191, + "step": 90 + }, + { + "epoch": 3.96, + "grad_norm": 1.119732141494751, + "learning_rate": 9.486700162498977e-06, + "loss": 0.6004, + "step": 95 + }, + { + "epoch": 4.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7053489685058594, + "eval_runtime": 1.3556, + "eval_samples_per_second": 47.211, + "eval_steps_per_second": 5.901, + "step": 96 + }, + { + "epoch": 4.17, + "grad_norm": 0.8135461211204529, + "learning_rate": 9.159572570688668e-06, + "loss": 0.5154, + "step": 100 + }, + { + "epoch": 4.38, + "grad_norm": 1.8034342527389526, + "learning_rate": 8.832444978878358e-06, + "loss": 0.6067, + "step": 105 + }, + { + "epoch": 4.58, + "grad_norm": 0.8029685020446777, + "learning_rate": 8.505317387068049e-06, + "loss": 0.5499, + "step": 110 + }, + { + "epoch": 4.79, + "grad_norm": 1.019626259803772, + "learning_rate": 8.178189795257739e-06, + "loss": 0.5542, + "step": 115 + }, + { + "epoch": 5.0, + "grad_norm": 1.861674427986145, + "learning_rate": 7.85106220344743e-06, + "loss": 0.5545, + "step": 120 + }, + { + "epoch": 5.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6832504272460938, + "eval_runtime": 1.3811, + "eval_samples_per_second": 46.341, + "eval_steps_per_second": 5.793, + "step": 120 + }, + { + "epoch": 5.21, + "grad_norm": 1.5949212312698364, + "learning_rate": 7.523934611637121e-06, + "loss": 0.4806, + "step": 125 + }, + { + "epoch": 5.42, + "grad_norm": 3.002861738204956, + "learning_rate": 7.196807019826811e-06, + "loss": 0.5832, + "step": 130 + }, + { + "epoch": 5.62, + "grad_norm": 1.4606820344924927, + "learning_rate": 6.9351049463785626e-06, + "loss": 0.5481, + "step": 135 + }, + { + "epoch": 5.83, + "grad_norm": 1.6088628768920898, + "learning_rate": 6.607977354568253e-06, + "loss": 0.5333, + "step": 140 + }, + { + "epoch": 6.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6599597930908203, + "eval_runtime": 1.3747, + "eval_samples_per_second": 46.555, + "eval_steps_per_second": 5.819, + "step": 144 + }, + { + "epoch": 6.04, + "grad_norm": 2.2051286697387695, + "learning_rate": 6.280849762757943e-06, + "loss": 0.4805, + "step": 145 + }, + { + "epoch": 6.25, + "grad_norm": 1.6964988708496094, + "learning_rate": 5.953722170947633e-06, + "loss": 0.4573, + "step": 150 + }, + { + "epoch": 6.46, + "grad_norm": 2.1374056339263916, + "learning_rate": 5.626594579137324e-06, + "loss": 0.59, + "step": 155 + }, + { + "epoch": 6.67, + "grad_norm": 1.8037084341049194, + "learning_rate": 5.299466987327015e-06, + "loss": 0.494, + "step": 160 + }, + { + "epoch": 6.88, + "grad_norm": 1.7916295528411865, + "learning_rate": 4.972339395516706e-06, + "loss": 0.4997, + "step": 165 + }, + { + "epoch": 7.0, + "eval_f1": 0.74, + "eval_loss": 0.6653976440429688, + "eval_runtime": 1.3593, + "eval_samples_per_second": 47.083, + "eval_steps_per_second": 5.885, + "step": 168 + }, + { + "epoch": 7.08, + "grad_norm": 2.2235190868377686, + "learning_rate": 4.645211803706396e-06, + "loss": 0.3887, + "step": 170 + }, + { + "epoch": 7.29, + "grad_norm": 1.239268183708191, + "learning_rate": 4.318084211896087e-06, + "loss": 0.4519, + "step": 175 + }, + { + "epoch": 7.5, + "grad_norm": 1.8677798509597778, + "learning_rate": 3.990956620085777e-06, + "loss": 0.5146, + "step": 180 + }, + { + "epoch": 7.71, + "grad_norm": 3.8495407104492188, + "learning_rate": 3.6638290282754668e-06, + "loss": 0.4237, + "step": 185 + }, + { + "epoch": 7.92, + "grad_norm": 1.8828785419464111, + "learning_rate": 3.3367014364651573e-06, + "loss": 0.5033, + "step": 190 + }, + { + "epoch": 8.0, + "eval_f1": 0.74, + "eval_loss": 0.6842975616455078, + "eval_runtime": 1.3685, + "eval_samples_per_second": 46.767, + "eval_steps_per_second": 5.846, + "step": 192 + } + ], + "logging_steps": 5, + "max_steps": 240, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 6664467234779136.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": { + "learning_rate": 1.4131911966205373e-05, + "per_device_train_batch_size": 8 + } +} diff --git a/run-8/checkpoint-192/training_args.bin b/run-8/checkpoint-192/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d5c1c69e2334e136936b2e49c08c4956c68cf869 --- /dev/null +++ b/run-8/checkpoint-192/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5bd287ce4223bd729031c331a786d123069a13358bbb44dde2321174ac948e +size 4920 diff --git a/run-8/checkpoint-216/config.json b/run-8/checkpoint-216/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-8/checkpoint-216/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-8/checkpoint-216/model.safetensors b/run-8/checkpoint-216/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..93e13f55d0fe3d6a4cee343de317f40904165277 --- /dev/null +++ b/run-8/checkpoint-216/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:164b096ba23e3a566da47990317c9311a5a21bace072e2e8c58094f64913b6f7 +size 94763496 diff --git a/run-8/checkpoint-216/optimizer.pt b/run-8/checkpoint-216/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a8f2aad53fc55cb503244453b8b19badeae0df4 --- /dev/null +++ b/run-8/checkpoint-216/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f73a7dc542044d538c730b515108455f97d6f1ef2d8f9317fa141b437383652 +size 189552570 diff --git a/run-8/checkpoint-216/preprocessor_config.json b/run-8/checkpoint-216/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-8/checkpoint-216/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-8/checkpoint-216/rng_state.pth b/run-8/checkpoint-216/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a35bf835a565c3dfd906b63d1cc231830b4f10b7 --- /dev/null +++ b/run-8/checkpoint-216/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1e038616a7e873dfcb3b9bb9a5826772d5fe0c6e6bb356ff5fde20c8c8c5dbf +size 14244 diff --git a/run-8/checkpoint-216/scheduler.pt b/run-8/checkpoint-216/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6bee9e0642c8bd5ec222bf36ea31775a8b918dbb --- /dev/null +++ b/run-8/checkpoint-216/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66686a10171fe9e40f1499f634416ffe7983aa0887a26361286f921e4efc1920 +size 1064 diff --git a/run-8/checkpoint-216/trainer_state.json b/run-8/checkpoint-216/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c3e25e7d2675d522c20fd8249bc721a800ee8e35 --- /dev/null +++ b/run-8/checkpoint-216/trainer_state.json @@ -0,0 +1,406 @@ +{ + "best_metric": 0.74, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-8/checkpoint-168", + "epoch": 9.0, + "eval_steps": 500, + "global_step": 216, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.21, + "grad_norm": 1.6496813297271729, + "learning_rate": 2.9441483262927863e-06, + "loss": 0.6992, + "step": 5 + }, + { + "epoch": 0.42, + "grad_norm": 0.620004415512085, + "learning_rate": 5.888296652585573e-06, + "loss": 0.6939, + "step": 10 + }, + { + "epoch": 0.62, + "grad_norm": 0.5726878046989441, + "learning_rate": 8.832444978878358e-06, + "loss": 0.6835, + "step": 15 + }, + { + "epoch": 0.83, + "grad_norm": 1.4239176511764526, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6673, + "step": 20 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6805419921875, + "eval_runtime": 1.3716, + "eval_samples_per_second": 46.661, + "eval_steps_per_second": 5.833, + "step": 24 + }, + { + "epoch": 1.04, + "grad_norm": 0.9195191264152527, + "learning_rate": 1.406648644784331e-05, + "loss": 0.6673, + "step": 25 + }, + { + "epoch": 1.25, + "grad_norm": 1.8524231910705566, + "learning_rate": 1.3739358856033002e-05, + "loss": 0.6155, + "step": 30 + }, + { + "epoch": 1.46, + "grad_norm": 1.8213531970977783, + "learning_rate": 1.3412231264222692e-05, + "loss": 0.5895, + "step": 35 + }, + { + "epoch": 1.67, + "grad_norm": 0.4818130433559418, + "learning_rate": 1.3085103672412383e-05, + "loss": 0.6468, + "step": 40 + }, + { + "epoch": 1.88, + "grad_norm": 0.6597484946250916, + "learning_rate": 1.2757976080602073e-05, + "loss": 0.6173, + "step": 45 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6970634460449219, + "eval_runtime": 1.3632, + "eval_samples_per_second": 46.949, + "eval_steps_per_second": 5.869, + "step": 48 + }, + { + "epoch": 2.08, + "grad_norm": 0.48903289437294006, + "learning_rate": 1.2430848488791764e-05, + "loss": 0.6302, + "step": 50 + }, + { + "epoch": 2.29, + "grad_norm": 0.6064260601997375, + "learning_rate": 1.2103720896981454e-05, + "loss": 0.5867, + "step": 55 + }, + { + "epoch": 2.5, + "grad_norm": 0.6802453398704529, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6321, + "step": 60 + }, + { + "epoch": 2.71, + "grad_norm": 1.2592875957489014, + "learning_rate": 1.1449465713360835e-05, + "loss": 0.6223, + "step": 65 + }, + { + "epoch": 2.92, + "grad_norm": 1.1591824293136597, + "learning_rate": 1.1122338121550526e-05, + "loss": 0.4922, + "step": 70 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7079887390136719, + "eval_runtime": 1.3669, + "eval_samples_per_second": 46.821, + "eval_steps_per_second": 5.853, + "step": 72 + }, + { + "epoch": 3.12, + "grad_norm": 0.5332023501396179, + "learning_rate": 1.0795210529740214e-05, + "loss": 0.5989, + "step": 75 + }, + { + "epoch": 3.33, + "grad_norm": 0.5555600523948669, + "learning_rate": 1.0468082937929906e-05, + "loss": 0.6104, + "step": 80 + }, + { + "epoch": 3.54, + "grad_norm": 1.2928024530410767, + "learning_rate": 1.0140955346119596e-05, + "loss": 0.4936, + "step": 85 + }, + { + "epoch": 3.75, + "grad_norm": 1.1424989700317383, + "learning_rate": 9.813827754309287e-06, + "loss": 0.6191, + "step": 90 + }, + { + "epoch": 3.96, + "grad_norm": 1.119732141494751, + "learning_rate": 9.486700162498977e-06, + "loss": 0.6004, + "step": 95 + }, + { + "epoch": 4.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7053489685058594, + "eval_runtime": 1.3556, + "eval_samples_per_second": 47.211, + "eval_steps_per_second": 5.901, + "step": 96 + }, + { + "epoch": 4.17, + "grad_norm": 0.8135461211204529, + "learning_rate": 9.159572570688668e-06, + "loss": 0.5154, + "step": 100 + }, + { + "epoch": 4.38, + "grad_norm": 1.8034342527389526, + "learning_rate": 8.832444978878358e-06, + "loss": 0.6067, + "step": 105 + }, + { + "epoch": 4.58, + "grad_norm": 0.8029685020446777, + "learning_rate": 8.505317387068049e-06, + "loss": 0.5499, + "step": 110 + }, + { + "epoch": 4.79, + "grad_norm": 1.019626259803772, + "learning_rate": 8.178189795257739e-06, + "loss": 0.5542, + "step": 115 + }, + { + "epoch": 5.0, + "grad_norm": 1.861674427986145, + "learning_rate": 7.85106220344743e-06, + "loss": 0.5545, + "step": 120 + }, + { + "epoch": 5.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6832504272460938, + "eval_runtime": 1.3811, + "eval_samples_per_second": 46.341, + "eval_steps_per_second": 5.793, + "step": 120 + }, + { + "epoch": 5.21, + "grad_norm": 1.5949212312698364, + "learning_rate": 7.523934611637121e-06, + "loss": 0.4806, + "step": 125 + }, + { + "epoch": 5.42, + "grad_norm": 3.002861738204956, + "learning_rate": 7.196807019826811e-06, + "loss": 0.5832, + "step": 130 + }, + { + "epoch": 5.62, + "grad_norm": 1.4606820344924927, + "learning_rate": 6.9351049463785626e-06, + "loss": 0.5481, + "step": 135 + }, + { + "epoch": 5.83, + "grad_norm": 1.6088628768920898, + "learning_rate": 6.607977354568253e-06, + "loss": 0.5333, + "step": 140 + }, + { + "epoch": 6.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6599597930908203, + "eval_runtime": 1.3747, + "eval_samples_per_second": 46.555, + "eval_steps_per_second": 5.819, + "step": 144 + }, + { + "epoch": 6.04, + "grad_norm": 2.2051286697387695, + "learning_rate": 6.280849762757943e-06, + "loss": 0.4805, + "step": 145 + }, + { + "epoch": 6.25, + "grad_norm": 1.6964988708496094, + "learning_rate": 5.953722170947633e-06, + "loss": 0.4573, + "step": 150 + }, + { + "epoch": 6.46, + "grad_norm": 2.1374056339263916, + "learning_rate": 5.626594579137324e-06, + "loss": 0.59, + "step": 155 + }, + { + "epoch": 6.67, + "grad_norm": 1.8037084341049194, + "learning_rate": 5.299466987327015e-06, + "loss": 0.494, + "step": 160 + }, + { + "epoch": 6.88, + "grad_norm": 1.7916295528411865, + "learning_rate": 4.972339395516706e-06, + "loss": 0.4997, + "step": 165 + }, + { + "epoch": 7.0, + "eval_f1": 0.74, + "eval_loss": 0.6653976440429688, + "eval_runtime": 1.3593, + "eval_samples_per_second": 47.083, + "eval_steps_per_second": 5.885, + "step": 168 + }, + { + "epoch": 7.08, + "grad_norm": 2.2235190868377686, + "learning_rate": 4.645211803706396e-06, + "loss": 0.3887, + "step": 170 + }, + { + "epoch": 7.29, + "grad_norm": 1.239268183708191, + "learning_rate": 4.318084211896087e-06, + "loss": 0.4519, + "step": 175 + }, + { + "epoch": 7.5, + "grad_norm": 1.8677798509597778, + "learning_rate": 3.990956620085777e-06, + "loss": 0.5146, + "step": 180 + }, + { + "epoch": 7.71, + "grad_norm": 3.8495407104492188, + "learning_rate": 3.6638290282754668e-06, + "loss": 0.4237, + "step": 185 + }, + { + "epoch": 7.92, + "grad_norm": 1.8828785419464111, + "learning_rate": 3.3367014364651573e-06, + "loss": 0.5033, + "step": 190 + }, + { + "epoch": 8.0, + "eval_f1": 0.74, + "eval_loss": 0.6842975616455078, + "eval_runtime": 1.3685, + "eval_samples_per_second": 46.767, + "eval_steps_per_second": 5.846, + "step": 192 + }, + { + "epoch": 8.12, + "grad_norm": 4.3374714851379395, + "learning_rate": 3.009573844654848e-06, + "loss": 0.4588, + "step": 195 + }, + { + "epoch": 8.33, + "grad_norm": 2.9799509048461914, + "learning_rate": 2.6824462528445384e-06, + "loss": 0.3783, + "step": 200 + }, + { + "epoch": 8.54, + "grad_norm": 3.3768601417541504, + "learning_rate": 2.3553186610342286e-06, + "loss": 0.4366, + "step": 205 + }, + { + "epoch": 8.75, + "grad_norm": 2.2495288848876953, + "learning_rate": 2.028191069223919e-06, + "loss": 0.4545, + "step": 210 + }, + { + "epoch": 8.96, + "grad_norm": 2.078002691268921, + "learning_rate": 1.7010634774136097e-06, + "loss": 0.6012, + "step": 215 + }, + { + "epoch": 9.0, + "eval_f1": 0.74, + "eval_loss": 0.6836881637573242, + "eval_runtime": 1.3545, + "eval_samples_per_second": 47.25, + "eval_steps_per_second": 5.906, + "step": 216 + } + ], + "logging_steps": 5, + "max_steps": 240, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 7545814381042464.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": { + "learning_rate": 1.4131911966205373e-05, + "per_device_train_batch_size": 8 + } +} diff --git a/run-8/checkpoint-216/training_args.bin b/run-8/checkpoint-216/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d5c1c69e2334e136936b2e49c08c4956c68cf869 --- /dev/null +++ b/run-8/checkpoint-216/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5bd287ce4223bd729031c331a786d123069a13358bbb44dde2321174ac948e +size 4920 diff --git a/run-8/checkpoint-48/config.json b/run-8/checkpoint-48/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-8/checkpoint-48/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-8/checkpoint-48/model.safetensors b/run-8/checkpoint-48/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6024525554b49ab2d02e282a8fec91473dbeb75a --- /dev/null +++ b/run-8/checkpoint-48/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d6adb929c041d1e846b345d80027c1afb32ec1324f1a311ad5c6f7b3e58d7c +size 94763496 diff --git a/run-8/checkpoint-48/optimizer.pt b/run-8/checkpoint-48/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..26495a7d3e09935d65622b226be1bb84f8232e90 --- /dev/null +++ b/run-8/checkpoint-48/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61d28f9b4eab8d97d23b64fdeb9ebec7d2987516567ef4bf3287ce8410d34c99 +size 189552570 diff --git a/run-8/checkpoint-48/preprocessor_config.json b/run-8/checkpoint-48/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-8/checkpoint-48/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-8/checkpoint-48/rng_state.pth b/run-8/checkpoint-48/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f747b3a07143a0952937715136c3c6f3d385714b --- /dev/null +++ b/run-8/checkpoint-48/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ea0f2253490889e9b5dbea96976236c3c1f025b5a24179f4bcaa44eec621be7 +size 14244 diff --git a/run-8/checkpoint-48/scheduler.pt b/run-8/checkpoint-48/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..680d94f7bbf4eb0ce0fe1183af22ae521a64ef87 --- /dev/null +++ b/run-8/checkpoint-48/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14d5cc59d71ed6e0e16b832ea2fa9c185108846d4853b2c806129c5900ff19ed +size 1064 diff --git a/run-8/checkpoint-48/trainer_state.json b/run-8/checkpoint-48/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..52e43f0ae88300e1cff34b644d09650659e016b4 --- /dev/null +++ b/run-8/checkpoint-48/trainer_state.json @@ -0,0 +1,105 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-8/checkpoint-24", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 48, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.21, + "grad_norm": 1.6496813297271729, + "learning_rate": 2.9441483262927863e-06, + "loss": 0.6992, + "step": 5 + }, + { + "epoch": 0.42, + "grad_norm": 0.620004415512085, + "learning_rate": 5.888296652585573e-06, + "loss": 0.6939, + "step": 10 + }, + { + "epoch": 0.62, + "grad_norm": 0.5726878046989441, + "learning_rate": 8.832444978878358e-06, + "loss": 0.6835, + "step": 15 + }, + { + "epoch": 0.83, + "grad_norm": 1.4239176511764526, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6673, + "step": 20 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6805419921875, + "eval_runtime": 1.3716, + "eval_samples_per_second": 46.661, + "eval_steps_per_second": 5.833, + "step": 24 + }, + { + "epoch": 1.04, + "grad_norm": 0.9195191264152527, + "learning_rate": 1.406648644784331e-05, + "loss": 0.6673, + "step": 25 + }, + { + "epoch": 1.25, + "grad_norm": 1.8524231910705566, + "learning_rate": 1.3739358856033002e-05, + "loss": 0.6155, + "step": 30 + }, + { + "epoch": 1.46, + "grad_norm": 1.8213531970977783, + "learning_rate": 1.3412231264222692e-05, + "loss": 0.5895, + "step": 35 + }, + { + "epoch": 1.67, + "grad_norm": 0.4818130433559418, + "learning_rate": 1.3085103672412383e-05, + "loss": 0.6468, + "step": 40 + }, + { + "epoch": 1.88, + "grad_norm": 0.6597484946250916, + "learning_rate": 1.2757976080602073e-05, + "loss": 0.6173, + "step": 45 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6970634460449219, + "eval_runtime": 1.3632, + "eval_samples_per_second": 46.949, + "eval_steps_per_second": 5.869, + "step": 48 + } + ], + "logging_steps": 5, + "max_steps": 240, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 1570339835242944.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": { + "learning_rate": 1.4131911966205373e-05, + "per_device_train_batch_size": 8 + } +} diff --git a/run-8/checkpoint-48/training_args.bin b/run-8/checkpoint-48/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d5c1c69e2334e136936b2e49c08c4956c68cf869 --- /dev/null +++ b/run-8/checkpoint-48/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5bd287ce4223bd729031c331a786d123069a13358bbb44dde2321174ac948e +size 4920 diff --git a/run-8/checkpoint-72/config.json b/run-8/checkpoint-72/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-8/checkpoint-72/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-8/checkpoint-72/model.safetensors b/run-8/checkpoint-72/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..81ba874eb48d71c9323397f77bc1130c0558ed1f --- /dev/null +++ b/run-8/checkpoint-72/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dac58a33b2e0859921f47ac61afce5b0af9a272cebe5a56da094781ec8c143ca +size 94763496 diff --git a/run-8/checkpoint-72/optimizer.pt b/run-8/checkpoint-72/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c86130b355bc9db5996e434362ddba7021ee0a65 --- /dev/null +++ b/run-8/checkpoint-72/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae11a66cf49f6db873b9de4def8c455270ee92ad55e8fb7bce600f604b74e0dc +size 189552570 diff --git a/run-8/checkpoint-72/preprocessor_config.json b/run-8/checkpoint-72/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-8/checkpoint-72/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-8/checkpoint-72/rng_state.pth b/run-8/checkpoint-72/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2a0931c97507f28c46325d60d49a5e7e573321a4 --- /dev/null +++ b/run-8/checkpoint-72/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ee3b4c1fd10f5b7dc9cd3892663bc52a1bc7290b86a11225b56750a1f3c0adf +size 14244 diff --git a/run-8/checkpoint-72/scheduler.pt b/run-8/checkpoint-72/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..635b7d7f57dd9fb738469b1afc9a39220903a8ad --- /dev/null +++ b/run-8/checkpoint-72/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d71a46296bcf3a24653766c571b4c6cdb8469ec648d8c0e134792cd573505fa6 +size 1064 diff --git a/run-8/checkpoint-72/trainer_state.json b/run-8/checkpoint-72/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7ee04833971f7996b113e45766b1ac689675e109 --- /dev/null +++ b/run-8/checkpoint-72/trainer_state.json @@ -0,0 +1,149 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-8/checkpoint-24", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 72, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.21, + "grad_norm": 1.6496813297271729, + "learning_rate": 2.9441483262927863e-06, + "loss": 0.6992, + "step": 5 + }, + { + "epoch": 0.42, + "grad_norm": 0.620004415512085, + "learning_rate": 5.888296652585573e-06, + "loss": 0.6939, + "step": 10 + }, + { + "epoch": 0.62, + "grad_norm": 0.5726878046989441, + "learning_rate": 8.832444978878358e-06, + "loss": 0.6835, + "step": 15 + }, + { + "epoch": 0.83, + "grad_norm": 1.4239176511764526, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6673, + "step": 20 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6805419921875, + "eval_runtime": 1.3716, + "eval_samples_per_second": 46.661, + "eval_steps_per_second": 5.833, + "step": 24 + }, + { + "epoch": 1.04, + "grad_norm": 0.9195191264152527, + "learning_rate": 1.406648644784331e-05, + "loss": 0.6673, + "step": 25 + }, + { + "epoch": 1.25, + "grad_norm": 1.8524231910705566, + "learning_rate": 1.3739358856033002e-05, + "loss": 0.6155, + "step": 30 + }, + { + "epoch": 1.46, + "grad_norm": 1.8213531970977783, + "learning_rate": 1.3412231264222692e-05, + "loss": 0.5895, + "step": 35 + }, + { + "epoch": 1.67, + "grad_norm": 0.4818130433559418, + "learning_rate": 1.3085103672412383e-05, + "loss": 0.6468, + "step": 40 + }, + { + "epoch": 1.88, + "grad_norm": 0.6597484946250916, + "learning_rate": 1.2757976080602073e-05, + "loss": 0.6173, + "step": 45 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6970634460449219, + "eval_runtime": 1.3632, + "eval_samples_per_second": 46.949, + "eval_steps_per_second": 5.869, + "step": 48 + }, + { + "epoch": 2.08, + "grad_norm": 0.48903289437294006, + "learning_rate": 1.2430848488791764e-05, + "loss": 0.6302, + "step": 50 + }, + { + "epoch": 2.29, + "grad_norm": 0.6064260601997375, + "learning_rate": 1.2103720896981454e-05, + "loss": 0.5867, + "step": 55 + }, + { + "epoch": 2.5, + "grad_norm": 0.6802453398704529, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6321, + "step": 60 + }, + { + "epoch": 2.71, + "grad_norm": 1.2592875957489014, + "learning_rate": 1.1449465713360835e-05, + "loss": 0.6223, + "step": 65 + }, + { + "epoch": 2.92, + "grad_norm": 1.1591824293136597, + "learning_rate": 1.1122338121550526e-05, + "loss": 0.4922, + "step": 70 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7079887390136719, + "eval_runtime": 1.3669, + "eval_samples_per_second": 46.821, + "eval_steps_per_second": 5.853, + "step": 72 + } + ], + "logging_steps": 5, + "max_steps": 240, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 2449725503657472.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": { + "learning_rate": 1.4131911966205373e-05, + "per_device_train_batch_size": 8 + } +} diff --git a/run-8/checkpoint-72/training_args.bin b/run-8/checkpoint-72/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d5c1c69e2334e136936b2e49c08c4956c68cf869 --- /dev/null +++ b/run-8/checkpoint-72/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5bd287ce4223bd729031c331a786d123069a13358bbb44dde2321174ac948e +size 4920 diff --git a/run-8/checkpoint-96/config.json b/run-8/checkpoint-96/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-8/checkpoint-96/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-8/checkpoint-96/model.safetensors b/run-8/checkpoint-96/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7093484428313f25faaff16a565854ad8f67cc43 --- /dev/null +++ b/run-8/checkpoint-96/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508a6fca9dadcdd3e45675b8d59173a8b8ed5479f0012fb1517ec63c0ee2839c +size 94763496 diff --git a/run-8/checkpoint-96/optimizer.pt b/run-8/checkpoint-96/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a259e56c2910ec7c9cd7de16831f52eb36ec5a0 --- /dev/null +++ b/run-8/checkpoint-96/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f5d90083ca92e1026fa318b39a57fe92b815eb6f1edc1e16c61ff39e7b7a1ab +size 189552570 diff --git a/run-8/checkpoint-96/preprocessor_config.json b/run-8/checkpoint-96/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-8/checkpoint-96/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-8/checkpoint-96/rng_state.pth b/run-8/checkpoint-96/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bde550517cedc68a3ae49137c6221ff74725b994 --- /dev/null +++ b/run-8/checkpoint-96/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b456b0645d03f4fc8a479113b5caa8bb02734d9c9788d3ded452d4ca10da7ef +size 14244 diff --git a/run-8/checkpoint-96/scheduler.pt b/run-8/checkpoint-96/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6750e3560f9b476f60c95c1febec359af342294a --- /dev/null +++ b/run-8/checkpoint-96/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7765e1353b62bd1d07ccc6dd592956dd317e1885f6642cd31603011e8106f0b4 +size 1064 diff --git a/run-8/checkpoint-96/trainer_state.json b/run-8/checkpoint-96/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..35db0fc418f8b55e278e6103ceb77a765d63ff7d --- /dev/null +++ b/run-8/checkpoint-96/trainer_state.json @@ -0,0 +1,193 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-8/checkpoint-24", + "epoch": 4.0, + "eval_steps": 500, + "global_step": 96, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.21, + "grad_norm": 1.6496813297271729, + "learning_rate": 2.9441483262927863e-06, + "loss": 0.6992, + "step": 5 + }, + { + "epoch": 0.42, + "grad_norm": 0.620004415512085, + "learning_rate": 5.888296652585573e-06, + "loss": 0.6939, + "step": 10 + }, + { + "epoch": 0.62, + "grad_norm": 0.5726878046989441, + "learning_rate": 8.832444978878358e-06, + "loss": 0.6835, + "step": 15 + }, + { + "epoch": 0.83, + "grad_norm": 1.4239176511764526, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6673, + "step": 20 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6805419921875, + "eval_runtime": 1.3716, + "eval_samples_per_second": 46.661, + "eval_steps_per_second": 5.833, + "step": 24 + }, + { + "epoch": 1.04, + "grad_norm": 0.9195191264152527, + "learning_rate": 1.406648644784331e-05, + "loss": 0.6673, + "step": 25 + }, + { + "epoch": 1.25, + "grad_norm": 1.8524231910705566, + "learning_rate": 1.3739358856033002e-05, + "loss": 0.6155, + "step": 30 + }, + { + "epoch": 1.46, + "grad_norm": 1.8213531970977783, + "learning_rate": 1.3412231264222692e-05, + "loss": 0.5895, + "step": 35 + }, + { + "epoch": 1.67, + "grad_norm": 0.4818130433559418, + "learning_rate": 1.3085103672412383e-05, + "loss": 0.6468, + "step": 40 + }, + { + "epoch": 1.88, + "grad_norm": 0.6597484946250916, + "learning_rate": 1.2757976080602073e-05, + "loss": 0.6173, + "step": 45 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6970634460449219, + "eval_runtime": 1.3632, + "eval_samples_per_second": 46.949, + "eval_steps_per_second": 5.869, + "step": 48 + }, + { + "epoch": 2.08, + "grad_norm": 0.48903289437294006, + "learning_rate": 1.2430848488791764e-05, + "loss": 0.6302, + "step": 50 + }, + { + "epoch": 2.29, + "grad_norm": 0.6064260601997375, + "learning_rate": 1.2103720896981454e-05, + "loss": 0.5867, + "step": 55 + }, + { + "epoch": 2.5, + "grad_norm": 0.6802453398704529, + "learning_rate": 1.1776593305171145e-05, + "loss": 0.6321, + "step": 60 + }, + { + "epoch": 2.71, + "grad_norm": 1.2592875957489014, + "learning_rate": 1.1449465713360835e-05, + "loss": 0.6223, + "step": 65 + }, + { + "epoch": 2.92, + "grad_norm": 1.1591824293136597, + "learning_rate": 1.1122338121550526e-05, + "loss": 0.4922, + "step": 70 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7079887390136719, + "eval_runtime": 1.3669, + "eval_samples_per_second": 46.821, + "eval_steps_per_second": 5.853, + "step": 72 + }, + { + "epoch": 3.12, + "grad_norm": 0.5332023501396179, + "learning_rate": 1.0795210529740214e-05, + "loss": 0.5989, + "step": 75 + }, + { + "epoch": 3.33, + "grad_norm": 0.5555600523948669, + "learning_rate": 1.0468082937929906e-05, + "loss": 0.6104, + "step": 80 + }, + { + "epoch": 3.54, + "grad_norm": 1.2928024530410767, + "learning_rate": 1.0140955346119596e-05, + "loss": 0.4936, + "step": 85 + }, + { + "epoch": 3.75, + "grad_norm": 1.1424989700317383, + "learning_rate": 9.813827754309287e-06, + "loss": 0.6191, + "step": 90 + }, + { + "epoch": 3.96, + "grad_norm": 1.119732141494751, + "learning_rate": 9.486700162498977e-06, + "loss": 0.6004, + "step": 95 + }, + { + "epoch": 4.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.7053489685058594, + "eval_runtime": 1.3556, + "eval_samples_per_second": 47.211, + "eval_steps_per_second": 5.901, + "step": 96 + } + ], + "logging_steps": 5, + "max_steps": 240, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 3314382463332576.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": { + "learning_rate": 1.4131911966205373e-05, + "per_device_train_batch_size": 8 + } +} diff --git a/run-8/checkpoint-96/training_args.bin b/run-8/checkpoint-96/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d5c1c69e2334e136936b2e49c08c4956c68cf869 --- /dev/null +++ b/run-8/checkpoint-96/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5bd287ce4223bd729031c331a786d123069a13358bbb44dde2321174ac948e +size 4920 diff --git a/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492437.ca56ea9bc35e.3883.10 b/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492437.ca56ea9bc35e.3883.10 new file mode 100644 index 0000000000000000000000000000000000000000..106a16cfad69bea640897036ea830ff684003a37 --- /dev/null +++ b/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492437.ca56ea9bc35e.3883.10 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92f9a73f0e99d1c7eb284643236d1eff01e3a177f442ae089f7c3865089210c4 +size 6038 diff --git a/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492443.ca56ea9bc35e.3883.11 b/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492443.ca56ea9bc35e.3883.11 new file mode 100644 index 0000000000000000000000000000000000000000..430eacc4a570237869d85b86b357d7fa9bf972f0 --- /dev/null +++ b/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492443.ca56ea9bc35e.3883.11 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:053781900c13d153af4cb47d9633545860c91246df81172cfbcbd24397a4f1a8 +size 12251 diff --git a/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492488.ca56ea9bc35e.3883.12 b/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492488.ca56ea9bc35e.3883.12 new file mode 100644 index 0000000000000000000000000000000000000000..8e90a8d8ba1e803e76e66e65490c867036a0a44c --- /dev/null +++ b/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492488.ca56ea9bc35e.3883.12 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71ce9f36f79b90cfaf6daa0446c2373eac3a306b322946ff9540987ab05f8b67 +size 29188 diff --git a/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492652.ca56ea9bc35e.3883.13 b/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492652.ca56ea9bc35e.3883.13 new file mode 100644 index 0000000000000000000000000000000000000000..0e1a2099ce2e3702ebab28127a35d03f0f1e43a9 --- /dev/null +++ b/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492652.ca56ea9bc35e.3883.13 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5dd56240f65d3a34bbefebf47d3f6eabf955f2db7e22468f9c20f49022a63be +size 29188 diff --git a/training_args.bin b/training_args.bin index d5c1c69e2334e136936b2e49c08c4956c68cf869..0ee97abe4643adaac3cd80fa21facc591cbc7e9f 100644 --- a/training_args.bin +++ b/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd5bd287ce4223bd729031c331a786d123069a13358bbb44dde2321174ac948e +oid sha256:3a13ddf2852b749b2ed775695ad5ddfecc242a8e8d8b1546d5e7357434a3ed37 size 4920