diff --git a/model.safetensors b/model.safetensors index 8b19dae92ab08f0f14f4192717a30fa580c4704c..9877a03ed7ffd01a15db65fc288d2a8b522a91f9 100644 --- a/model.safetensors +++ b/model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:018f138040ad7aa88bf94f4b8deb9d4f75616181b7d07008cdde50dd03503a11 +oid sha256:e1ed6ffdb684cc537f47f54766448db627495bfcb076b58f5bd92f9f75833010 size 94763496 diff --git a/run-4/checkpoint-144/config.json b/run-4/checkpoint-144/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-4/checkpoint-144/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-4/checkpoint-144/model.safetensors b/run-4/checkpoint-144/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..02970724369a6440d75b3102c5b46858fcabac36 --- /dev/null +++ b/run-4/checkpoint-144/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84be90551bec8e32bb108dda4349a49a0a709d76c0e31826f05686036125618e +size 94763496 diff --git a/run-4/checkpoint-144/optimizer.pt b/run-4/checkpoint-144/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0f3613f0ea3c285f54dc4210948818b0cbc3b80 --- /dev/null +++ b/run-4/checkpoint-144/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8185b0c73404b8fffa8583fd719233443888c59c9660c9da10c6e75c9be8a36e +size 189552570 diff --git a/run-4/checkpoint-144/preprocessor_config.json b/run-4/checkpoint-144/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-4/checkpoint-144/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-4/checkpoint-144/rng_state.pth b/run-4/checkpoint-144/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..35674c44088a3585f8cd11a1eb144d356856a804 --- /dev/null +++ b/run-4/checkpoint-144/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87b8bdd7b4355fd23f0b8256efb0158e4240e11263e992a13d50944c37692500 +size 14244 diff --git a/run-4/checkpoint-144/scheduler.pt b/run-4/checkpoint-144/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1181899c7c61177556aa551eb900aaa714c34d31 --- /dev/null +++ b/run-4/checkpoint-144/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab5c28f22d5574a7076ae76732817bd92c67e71ad78a2880a292f813c3e7a8d1 +size 1064 diff --git a/run-4/checkpoint-144/trainer_state.json b/run-4/checkpoint-144/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2624ef4c1c6ab1a3ba5b9d0bc7c24929181c3135 --- /dev/null +++ b/run-4/checkpoint-144/trainer_state.json @@ -0,0 +1,247 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 144, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2733114957809448, + "learning_rate": 2.2702186710865246e-07, + "loss": 0.7025, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.243804931640625, + "learning_rate": 4.5404373421730493e-07, + "loss": 0.6974, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.7711552381515503, + "learning_rate": 6.810656013259573e-07, + "loss": 0.696, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 1.1453403234481812, + "learning_rate": 9.080874684346099e-07, + "loss": 0.6989, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 1.2729355096817017, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6968, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 1.1592165231704712, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6959, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 1.1798148155212402, + "learning_rate": 1.589153069760567e-06, + "loss": 0.6952, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 2.1216671466827393, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6886, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 1.3416370153427124, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6864, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.72, + "eval_loss": 0.688262939453125, + "eval_runtime": 1.3468, + "eval_samples_per_second": 47.521, + "eval_steps_per_second": 5.94, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 2.1856281757354736, + "learning_rate": 2.169320063482679e-06, + "loss": 0.6917, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 1.4077153205871582, + "learning_rate": 2.1440954115817176e-06, + "loss": 0.6884, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 2.1792664527893066, + "learning_rate": 2.1188707596807562e-06, + "loss": 0.6668, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 1.0386197566986084, + "learning_rate": 2.093646107779795e-06, + "loss": 0.6694, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 2.0565919876098633, + "learning_rate": 2.0684214558788335e-06, + "loss": 0.6561, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": 1.2978509664535522, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6789, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 2.058328628540039, + "learning_rate": 2.0179721520769108e-06, + "loss": 0.6633, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 0.6023226976394653, + "learning_rate": 1.9927475001759494e-06, + "loss": 0.6655, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 0.5510762929916382, + "learning_rate": 1.967522848274988e-06, + "loss": 0.6622, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 1.098602533340454, + "learning_rate": 1.9422981963740267e-06, + "loss": 0.6633, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6816024780273438, + "eval_runtime": 1.3765, + "eval_samples_per_second": 46.493, + "eval_steps_per_second": 5.812, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 0.9589098691940308, + "learning_rate": 1.9170735444730654e-06, + "loss": 0.659, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.070695161819458, + "learning_rate": 1.8918488925721038e-06, + "loss": 0.6313, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 0.9913639426231384, + "learning_rate": 1.8666242406711424e-06, + "loss": 0.6652, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.0632878541946411, + "learning_rate": 1.841399588770181e-06, + "loss": 0.673, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": 2.1036579608917236, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6451, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 1.08384108543396, + "learning_rate": 1.7909502849682583e-06, + "loss": 0.6322, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 0.9407000541687012, + "learning_rate": 1.765725633067297e-06, + "loss": 0.6755, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 0.9016568660736084, + "learning_rate": 1.7405009811663356e-06, + "loss": 0.5985, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 1.1134448051452637, + "learning_rate": 1.7152763292653743e-06, + "loss": 0.603, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6800689697265625, + "eval_runtime": 1.3861, + "eval_samples_per_second": 46.173, + "eval_steps_per_second": 5.772, + "step": 144 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 2121874430755872.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 2.1794099242430636e-06, + "per_device_train_batch_size": 4 + } +} diff --git a/run-4/checkpoint-144/training_args.bin b/run-4/checkpoint-144/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2bb1c33e4434a3bb752764ab093170996801fe72 --- /dev/null +++ b/run-4/checkpoint-144/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3 +size 4920 diff --git a/run-4/checkpoint-192/config.json b/run-4/checkpoint-192/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-4/checkpoint-192/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-4/checkpoint-192/model.safetensors b/run-4/checkpoint-192/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f967735d87c14c80d6c200b17e359165d1ee7fcf --- /dev/null +++ b/run-4/checkpoint-192/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04a662be668d43e34b3ede26defff9905cfe78fa0158e7f3bca9b7849a684f21 +size 94763496 diff --git a/run-4/checkpoint-192/optimizer.pt b/run-4/checkpoint-192/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd4284d9e34f4d671e9767eed9b04c4d5ff52fb0 --- /dev/null +++ b/run-4/checkpoint-192/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:989d42785f68e982f9cc1de749a83d539a71c120933d2aae451e2785a66e335b +size 189552570 diff --git a/run-4/checkpoint-192/preprocessor_config.json b/run-4/checkpoint-192/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-4/checkpoint-192/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-4/checkpoint-192/rng_state.pth b/run-4/checkpoint-192/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4d76370ee36700e6a498a1fbfff621aca7984a77 --- /dev/null +++ b/run-4/checkpoint-192/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5aef020ca2df517540ac9ff4e195e1c41a7b85939e93195d118078f119bc949 +size 14244 diff --git a/run-4/checkpoint-192/scheduler.pt b/run-4/checkpoint-192/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e37067620756c0018ec976c23548188e74a45508 --- /dev/null +++ b/run-4/checkpoint-192/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2463c4896d69602722ffa95084240326443a5f0698cb239a50fde88c55bcd421 +size 1064 diff --git a/run-4/checkpoint-192/trainer_state.json b/run-4/checkpoint-192/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..087300605c072aa45ba893457fcfe0f90f8088b8 --- /dev/null +++ b/run-4/checkpoint-192/trainer_state.json @@ -0,0 +1,326 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96", + "epoch": 4.0, + "eval_steps": 500, + "global_step": 192, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2733114957809448, + "learning_rate": 2.2702186710865246e-07, + "loss": 0.7025, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.243804931640625, + "learning_rate": 4.5404373421730493e-07, + "loss": 0.6974, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.7711552381515503, + "learning_rate": 6.810656013259573e-07, + "loss": 0.696, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 1.1453403234481812, + "learning_rate": 9.080874684346099e-07, + "loss": 0.6989, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 1.2729355096817017, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6968, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 1.1592165231704712, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6959, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 1.1798148155212402, + "learning_rate": 1.589153069760567e-06, + "loss": 0.6952, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 2.1216671466827393, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6886, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 1.3416370153427124, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6864, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.72, + "eval_loss": 0.688262939453125, + "eval_runtime": 1.3468, + "eval_samples_per_second": 47.521, + "eval_steps_per_second": 5.94, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 2.1856281757354736, + "learning_rate": 2.169320063482679e-06, + "loss": 0.6917, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 1.4077153205871582, + "learning_rate": 2.1440954115817176e-06, + "loss": 0.6884, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 2.1792664527893066, + "learning_rate": 2.1188707596807562e-06, + "loss": 0.6668, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 1.0386197566986084, + "learning_rate": 2.093646107779795e-06, + "loss": 0.6694, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 2.0565919876098633, + "learning_rate": 2.0684214558788335e-06, + "loss": 0.6561, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": 1.2978509664535522, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6789, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 2.058328628540039, + "learning_rate": 2.0179721520769108e-06, + "loss": 0.6633, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 0.6023226976394653, + "learning_rate": 1.9927475001759494e-06, + "loss": 0.6655, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 0.5510762929916382, + "learning_rate": 1.967522848274988e-06, + "loss": 0.6622, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 1.098602533340454, + "learning_rate": 1.9422981963740267e-06, + "loss": 0.6633, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6816024780273438, + "eval_runtime": 1.3765, + "eval_samples_per_second": 46.493, + "eval_steps_per_second": 5.812, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 0.9589098691940308, + "learning_rate": 1.9170735444730654e-06, + "loss": 0.659, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.070695161819458, + "learning_rate": 1.8918488925721038e-06, + "loss": 0.6313, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 0.9913639426231384, + "learning_rate": 1.8666242406711424e-06, + "loss": 0.6652, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.0632878541946411, + "learning_rate": 1.841399588770181e-06, + "loss": 0.673, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": 2.1036579608917236, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6451, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 1.08384108543396, + "learning_rate": 1.7909502849682583e-06, + "loss": 0.6322, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 0.9407000541687012, + "learning_rate": 1.765725633067297e-06, + "loss": 0.6755, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 0.9016568660736084, + "learning_rate": 1.7405009811663356e-06, + "loss": 0.5985, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 1.1134448051452637, + "learning_rate": 1.7152763292653743e-06, + "loss": 0.603, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6800689697265625, + "eval_runtime": 1.3861, + "eval_samples_per_second": 46.173, + "eval_steps_per_second": 5.772, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 0.7627719640731812, + "learning_rate": 1.6900516773644127e-06, + "loss": 0.6557, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 0.9291415214538574, + "learning_rate": 1.6648270254634511e-06, + "loss": 0.6219, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 0.9248765707015991, + "learning_rate": 1.6396023735624898e-06, + "loss": 0.6325, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 0.9842573404312134, + "learning_rate": 1.6143777216615284e-06, + "loss": 0.6521, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 0.8689214587211609, + "learning_rate": 1.589153069760567e-06, + "loss": 0.5929, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 1.0012000799179077, + "learning_rate": 1.5639284178596057e-06, + "loss": 0.584, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 0.7438368797302246, + "learning_rate": 1.5387037659586443e-06, + "loss": 0.6813, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": 1.8603870868682861, + "learning_rate": 1.513479114057683e-06, + "loss": 0.6099, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 0.9918416738510132, + "learning_rate": 1.4882544621567216e-06, + "loss": 0.6192, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 1.9146322011947632, + "learning_rate": 1.4630298102557603e-06, + "loss": 0.6472, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6818161010742188, + "eval_runtime": 1.3841, + "eval_samples_per_second": 46.239, + "eval_steps_per_second": 5.78, + "step": 192 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 2891755054954176.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 2.1794099242430636e-06, + "per_device_train_batch_size": 4 + } +} diff --git a/run-4/checkpoint-192/training_args.bin b/run-4/checkpoint-192/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2bb1c33e4434a3bb752764ab093170996801fe72 --- /dev/null +++ b/run-4/checkpoint-192/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3 +size 4920 diff --git a/run-4/checkpoint-240/config.json b/run-4/checkpoint-240/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-4/checkpoint-240/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-4/checkpoint-240/model.safetensors b/run-4/checkpoint-240/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f46799f852aa99ab283c4bfc91042e609a82e317 --- /dev/null +++ b/run-4/checkpoint-240/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:336f5153e14e700e4c4e5e048754a4434d79bdee3267bc3a53cbb003b875ea7e +size 94763496 diff --git a/run-4/checkpoint-240/optimizer.pt b/run-4/checkpoint-240/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..83db7a8b61d3fc7039daf232a5009008430172df --- /dev/null +++ b/run-4/checkpoint-240/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22b1916abc98dec4fd4f1436e9d8dd462ba5c4401dddd0941a36031f3bc679ff +size 189552570 diff --git a/run-4/checkpoint-240/preprocessor_config.json b/run-4/checkpoint-240/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-4/checkpoint-240/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-4/checkpoint-240/rng_state.pth b/run-4/checkpoint-240/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6935c8ee6dffa468628cd166bcbf40c96bd4b606 --- /dev/null +++ b/run-4/checkpoint-240/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8eb071c49709b6f4047e7f48105f0dd51daaf73e0a11fd742255aa4c3526f42 +size 14244 diff --git a/run-4/checkpoint-240/scheduler.pt b/run-4/checkpoint-240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3a7f570f424f9ee9cf0ee5605323cd6692ee9fb --- /dev/null +++ b/run-4/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2e35d3649cf053f4cc488647dacb8d6e20774271115c5998276bc7752ca7e23 +size 1064 diff --git a/run-4/checkpoint-240/trainer_state.json b/run-4/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0de518afa0f88b7cf6d85f3d85993ce4b689c09c --- /dev/null +++ b/run-4/checkpoint-240/trainer_state.json @@ -0,0 +1,405 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96", + "epoch": 5.0, + "eval_steps": 500, + "global_step": 240, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2733114957809448, + "learning_rate": 2.2702186710865246e-07, + "loss": 0.7025, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.243804931640625, + "learning_rate": 4.5404373421730493e-07, + "loss": 0.6974, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.7711552381515503, + "learning_rate": 6.810656013259573e-07, + "loss": 0.696, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 1.1453403234481812, + "learning_rate": 9.080874684346099e-07, + "loss": 0.6989, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 1.2729355096817017, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6968, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 1.1592165231704712, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6959, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 1.1798148155212402, + "learning_rate": 1.589153069760567e-06, + "loss": 0.6952, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 2.1216671466827393, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6886, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 1.3416370153427124, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6864, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.72, + "eval_loss": 0.688262939453125, + "eval_runtime": 1.3468, + "eval_samples_per_second": 47.521, + "eval_steps_per_second": 5.94, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 2.1856281757354736, + "learning_rate": 2.169320063482679e-06, + "loss": 0.6917, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 1.4077153205871582, + "learning_rate": 2.1440954115817176e-06, + "loss": 0.6884, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 2.1792664527893066, + "learning_rate": 2.1188707596807562e-06, + "loss": 0.6668, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 1.0386197566986084, + "learning_rate": 2.093646107779795e-06, + "loss": 0.6694, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 2.0565919876098633, + "learning_rate": 2.0684214558788335e-06, + "loss": 0.6561, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": 1.2978509664535522, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6789, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 2.058328628540039, + "learning_rate": 2.0179721520769108e-06, + "loss": 0.6633, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 0.6023226976394653, + "learning_rate": 1.9927475001759494e-06, + "loss": 0.6655, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 0.5510762929916382, + "learning_rate": 1.967522848274988e-06, + "loss": 0.6622, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 1.098602533340454, + "learning_rate": 1.9422981963740267e-06, + "loss": 0.6633, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6816024780273438, + "eval_runtime": 1.3765, + "eval_samples_per_second": 46.493, + "eval_steps_per_second": 5.812, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 0.9589098691940308, + "learning_rate": 1.9170735444730654e-06, + "loss": 0.659, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.070695161819458, + "learning_rate": 1.8918488925721038e-06, + "loss": 0.6313, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 0.9913639426231384, + "learning_rate": 1.8666242406711424e-06, + "loss": 0.6652, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.0632878541946411, + "learning_rate": 1.841399588770181e-06, + "loss": 0.673, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": 2.1036579608917236, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6451, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 1.08384108543396, + "learning_rate": 1.7909502849682583e-06, + "loss": 0.6322, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 0.9407000541687012, + "learning_rate": 1.765725633067297e-06, + "loss": 0.6755, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 0.9016568660736084, + "learning_rate": 1.7405009811663356e-06, + "loss": 0.5985, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 1.1134448051452637, + "learning_rate": 1.7152763292653743e-06, + "loss": 0.603, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6800689697265625, + "eval_runtime": 1.3861, + "eval_samples_per_second": 46.173, + "eval_steps_per_second": 5.772, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 0.7627719640731812, + "learning_rate": 1.6900516773644127e-06, + "loss": 0.6557, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 0.9291415214538574, + "learning_rate": 1.6648270254634511e-06, + "loss": 0.6219, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 0.9248765707015991, + "learning_rate": 1.6396023735624898e-06, + "loss": 0.6325, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 0.9842573404312134, + "learning_rate": 1.6143777216615284e-06, + "loss": 0.6521, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 0.8689214587211609, + "learning_rate": 1.589153069760567e-06, + "loss": 0.5929, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 1.0012000799179077, + "learning_rate": 1.5639284178596057e-06, + "loss": 0.584, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 0.7438368797302246, + "learning_rate": 1.5387037659586443e-06, + "loss": 0.6813, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": 1.8603870868682861, + "learning_rate": 1.513479114057683e-06, + "loss": 0.6099, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 0.9918416738510132, + "learning_rate": 1.4882544621567216e-06, + "loss": 0.6192, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 1.9146322011947632, + "learning_rate": 1.4630298102557603e-06, + "loss": 0.6472, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6818161010742188, + "eval_runtime": 1.3841, + "eval_samples_per_second": 46.239, + "eval_steps_per_second": 5.78, + "step": 192 + }, + { + "epoch": 4.06, + "grad_norm": 0.9502781629562378, + "learning_rate": 1.437805158354799e-06, + "loss": 0.6447, + "step": 195 + }, + { + "epoch": 4.17, + "grad_norm": 0.8570067286491394, + "learning_rate": 1.4125805064538375e-06, + "loss": 0.5306, + "step": 200 + }, + { + "epoch": 4.27, + "grad_norm": 0.8097484111785889, + "learning_rate": 1.3873558545528762e-06, + "loss": 0.6202, + "step": 205 + }, + { + "epoch": 4.38, + "grad_norm": 2.0106472969055176, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6705, + "step": 210 + }, + { + "epoch": 4.48, + "grad_norm": 1.090775489807129, + "learning_rate": 1.3369065507509533e-06, + "loss": 0.6297, + "step": 215 + }, + { + "epoch": 4.58, + "grad_norm": 0.8988145589828491, + "learning_rate": 1.311681898849992e-06, + "loss": 0.5896, + "step": 220 + }, + { + "epoch": 4.69, + "grad_norm": 0.9149978756904602, + "learning_rate": 1.2864572469490305e-06, + "loss": 0.6156, + "step": 225 + }, + { + "epoch": 4.79, + "grad_norm": 1.9398412704467773, + "learning_rate": 1.2612325950480692e-06, + "loss": 0.6305, + "step": 230 + }, + { + "epoch": 4.9, + "grad_norm": 0.9217966794967651, + "learning_rate": 1.2360079431471078e-06, + "loss": 0.5943, + "step": 235 + }, + { + "epoch": 5.0, + "grad_norm": 0.9083653688430786, + "learning_rate": 1.2107832912461465e-06, + "loss": 0.6386, + "step": 240 + }, + { + "epoch": 5.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6846389770507812, + "eval_runtime": 1.4094, + "eval_samples_per_second": 45.409, + "eval_steps_per_second": 5.676, + "step": 240 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 3654362860415712.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 2.1794099242430636e-06, + "per_device_train_batch_size": 4 + } +} diff --git a/run-4/checkpoint-240/training_args.bin b/run-4/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2bb1c33e4434a3bb752764ab093170996801fe72 --- /dev/null +++ b/run-4/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3 +size 4920 diff --git a/run-4/checkpoint-288/config.json b/run-4/checkpoint-288/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-4/checkpoint-288/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-4/checkpoint-288/model.safetensors b/run-4/checkpoint-288/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fc85bf7f38d1766783ae5d1b85efc2ca57754b9f --- /dev/null +++ b/run-4/checkpoint-288/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be27592b57346e1ba7fb5ac856d5e685d4d7d316009088a8a4a6b512cb22d9a4 +size 94763496 diff --git a/run-4/checkpoint-288/optimizer.pt b/run-4/checkpoint-288/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..35f0ee76bc4ee18f54cad6ef384eeb770ab0dcb5 --- /dev/null +++ b/run-4/checkpoint-288/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c627abefd26f649e146eb3f7adc646b23cae944e9576351a11327cb5a485e7d +size 189552570 diff --git a/run-4/checkpoint-288/preprocessor_config.json b/run-4/checkpoint-288/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-4/checkpoint-288/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-4/checkpoint-288/rng_state.pth b/run-4/checkpoint-288/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..de638b27cad47a23bcba70558a870ddb08a0f7e8 --- /dev/null +++ b/run-4/checkpoint-288/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9903236b654011babeaee26ea70e1c6278fa670549b900c6df1d64732428a642 +size 14244 diff --git a/run-4/checkpoint-288/scheduler.pt b/run-4/checkpoint-288/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7c6307ac0e9e5069d4b2165d75011a0e18cc193 --- /dev/null +++ b/run-4/checkpoint-288/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2163f47f6ce948a3a821c2032fa9ddfd17ee0992318acd25664f5287b23dd105 +size 1064 diff --git a/run-4/checkpoint-288/trainer_state.json b/run-4/checkpoint-288/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..165eed28f06d59addaefe9e083ee4bf2c717e731 --- /dev/null +++ b/run-4/checkpoint-288/trainer_state.json @@ -0,0 +1,477 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96", + "epoch": 6.0, + "eval_steps": 500, + "global_step": 288, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2733114957809448, + "learning_rate": 2.2702186710865246e-07, + "loss": 0.7025, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.243804931640625, + "learning_rate": 4.5404373421730493e-07, + "loss": 0.6974, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.7711552381515503, + "learning_rate": 6.810656013259573e-07, + "loss": 0.696, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 1.1453403234481812, + "learning_rate": 9.080874684346099e-07, + "loss": 0.6989, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 1.2729355096817017, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6968, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 1.1592165231704712, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6959, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 1.1798148155212402, + "learning_rate": 1.589153069760567e-06, + "loss": 0.6952, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 2.1216671466827393, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6886, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 1.3416370153427124, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6864, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.72, + "eval_loss": 0.688262939453125, + "eval_runtime": 1.3468, + "eval_samples_per_second": 47.521, + "eval_steps_per_second": 5.94, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 2.1856281757354736, + "learning_rate": 2.169320063482679e-06, + "loss": 0.6917, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 1.4077153205871582, + "learning_rate": 2.1440954115817176e-06, + "loss": 0.6884, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 2.1792664527893066, + "learning_rate": 2.1188707596807562e-06, + "loss": 0.6668, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 1.0386197566986084, + "learning_rate": 2.093646107779795e-06, + "loss": 0.6694, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 2.0565919876098633, + "learning_rate": 2.0684214558788335e-06, + "loss": 0.6561, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": 1.2978509664535522, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6789, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 2.058328628540039, + "learning_rate": 2.0179721520769108e-06, + "loss": 0.6633, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 0.6023226976394653, + "learning_rate": 1.9927475001759494e-06, + "loss": 0.6655, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 0.5510762929916382, + "learning_rate": 1.967522848274988e-06, + "loss": 0.6622, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 1.098602533340454, + "learning_rate": 1.9422981963740267e-06, + "loss": 0.6633, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6816024780273438, + "eval_runtime": 1.3765, + "eval_samples_per_second": 46.493, + "eval_steps_per_second": 5.812, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 0.9589098691940308, + "learning_rate": 1.9170735444730654e-06, + "loss": 0.659, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.070695161819458, + "learning_rate": 1.8918488925721038e-06, + "loss": 0.6313, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 0.9913639426231384, + "learning_rate": 1.8666242406711424e-06, + "loss": 0.6652, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.0632878541946411, + "learning_rate": 1.841399588770181e-06, + "loss": 0.673, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": 2.1036579608917236, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6451, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 1.08384108543396, + "learning_rate": 1.7909502849682583e-06, + "loss": 0.6322, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 0.9407000541687012, + "learning_rate": 1.765725633067297e-06, + "loss": 0.6755, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 0.9016568660736084, + "learning_rate": 1.7405009811663356e-06, + "loss": 0.5985, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 1.1134448051452637, + "learning_rate": 1.7152763292653743e-06, + "loss": 0.603, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6800689697265625, + "eval_runtime": 1.3861, + "eval_samples_per_second": 46.173, + "eval_steps_per_second": 5.772, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 0.7627719640731812, + "learning_rate": 1.6900516773644127e-06, + "loss": 0.6557, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 0.9291415214538574, + "learning_rate": 1.6648270254634511e-06, + "loss": 0.6219, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 0.9248765707015991, + "learning_rate": 1.6396023735624898e-06, + "loss": 0.6325, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 0.9842573404312134, + "learning_rate": 1.6143777216615284e-06, + "loss": 0.6521, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 0.8689214587211609, + "learning_rate": 1.589153069760567e-06, + "loss": 0.5929, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 1.0012000799179077, + "learning_rate": 1.5639284178596057e-06, + "loss": 0.584, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 0.7438368797302246, + "learning_rate": 1.5387037659586443e-06, + "loss": 0.6813, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": 1.8603870868682861, + "learning_rate": 1.513479114057683e-06, + "loss": 0.6099, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 0.9918416738510132, + "learning_rate": 1.4882544621567216e-06, + "loss": 0.6192, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 1.9146322011947632, + "learning_rate": 1.4630298102557603e-06, + "loss": 0.6472, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6818161010742188, + "eval_runtime": 1.3841, + "eval_samples_per_second": 46.239, + "eval_steps_per_second": 5.78, + "step": 192 + }, + { + "epoch": 4.06, + "grad_norm": 0.9502781629562378, + "learning_rate": 1.437805158354799e-06, + "loss": 0.6447, + "step": 195 + }, + { + "epoch": 4.17, + "grad_norm": 0.8570067286491394, + "learning_rate": 1.4125805064538375e-06, + "loss": 0.5306, + "step": 200 + }, + { + "epoch": 4.27, + "grad_norm": 0.8097484111785889, + "learning_rate": 1.3873558545528762e-06, + "loss": 0.6202, + "step": 205 + }, + { + "epoch": 4.38, + "grad_norm": 2.0106472969055176, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6705, + "step": 210 + }, + { + "epoch": 4.48, + "grad_norm": 1.090775489807129, + "learning_rate": 1.3369065507509533e-06, + "loss": 0.6297, + "step": 215 + }, + { + "epoch": 4.58, + "grad_norm": 0.8988145589828491, + "learning_rate": 1.311681898849992e-06, + "loss": 0.5896, + "step": 220 + }, + { + "epoch": 4.69, + "grad_norm": 0.9149978756904602, + "learning_rate": 1.2864572469490305e-06, + "loss": 0.6156, + "step": 225 + }, + { + "epoch": 4.79, + "grad_norm": 1.9398412704467773, + "learning_rate": 1.2612325950480692e-06, + "loss": 0.6305, + "step": 230 + }, + { + "epoch": 4.9, + "grad_norm": 0.9217966794967651, + "learning_rate": 1.2360079431471078e-06, + "loss": 0.5943, + "step": 235 + }, + { + "epoch": 5.0, + "grad_norm": 0.9083653688430786, + "learning_rate": 1.2107832912461465e-06, + "loss": 0.6386, + "step": 240 + }, + { + "epoch": 5.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6846389770507812, + "eval_runtime": 1.4094, + "eval_samples_per_second": 45.409, + "eval_steps_per_second": 5.676, + "step": 240 + }, + { + "epoch": 5.1, + "grad_norm": 0.9323675036430359, + "learning_rate": 1.1855586393451851e-06, + "loss": 0.5779, + "step": 245 + }, + { + "epoch": 5.21, + "grad_norm": 0.7549787163734436, + "learning_rate": 1.1603339874442238e-06, + "loss": 0.5948, + "step": 250 + }, + { + "epoch": 5.31, + "grad_norm": 0.8535837531089783, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6928, + "step": 255 + }, + { + "epoch": 5.42, + "grad_norm": 1.2038137912750244, + "learning_rate": 1.109884683642301e-06, + "loss": 0.5887, + "step": 260 + }, + { + "epoch": 5.52, + "grad_norm": 0.9501279592514038, + "learning_rate": 1.0846600317413395e-06, + "loss": 0.5776, + "step": 265 + }, + { + "epoch": 5.62, + "grad_norm": 0.7421719431877136, + "learning_rate": 1.0594353798403781e-06, + "loss": 0.6734, + "step": 270 + }, + { + "epoch": 5.73, + "grad_norm": 0.8555863499641418, + "learning_rate": 1.0342107279394168e-06, + "loss": 0.6399, + "step": 275 + }, + { + "epoch": 5.83, + "grad_norm": 0.8841156363487244, + "learning_rate": 1.0089860760384554e-06, + "loss": 0.6173, + "step": 280 + }, + { + "epoch": 5.94, + "grad_norm": 0.8565478324890137, + "learning_rate": 9.83761424137494e-07, + "loss": 0.5537, + "step": 285 + }, + { + "epoch": 6.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6864242553710938, + "eval_runtime": 1.3622, + "eval_samples_per_second": 46.983, + "eval_steps_per_second": 5.873, + "step": 288 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 4356539523260784.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 2.1794099242430636e-06, + "per_device_train_batch_size": 4 + } +} diff --git a/run-4/checkpoint-288/training_args.bin b/run-4/checkpoint-288/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2bb1c33e4434a3bb752764ab093170996801fe72 --- /dev/null +++ b/run-4/checkpoint-288/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3 +size 4920 diff --git a/run-4/checkpoint-336/config.json b/run-4/checkpoint-336/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-4/checkpoint-336/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-4/checkpoint-336/model.safetensors b/run-4/checkpoint-336/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..876c6014be610044702a5be12037488ccbe891a8 --- /dev/null +++ b/run-4/checkpoint-336/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3425a160cf28659111e87b6ab601bd4f64cd88b8baa924505a0ed42513beb151 +size 94763496 diff --git a/run-4/checkpoint-336/optimizer.pt b/run-4/checkpoint-336/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3efb9b3ae30ada0736b8d3c6c81434f1e68ad042 --- /dev/null +++ b/run-4/checkpoint-336/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a80e51af62d25c253503b816ec7b98c85cc9ba062954c975249337e8ce646afe +size 189552570 diff --git a/run-4/checkpoint-336/preprocessor_config.json b/run-4/checkpoint-336/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-4/checkpoint-336/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-4/checkpoint-336/rng_state.pth b/run-4/checkpoint-336/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6d09c593f640bea3364e14655046e2da93b3ebc1 --- /dev/null +++ b/run-4/checkpoint-336/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d12884ae20f0c926a355fda8650edc055a398d4c7c42545ccdb7d60bd202452 +size 14244 diff --git a/run-4/checkpoint-336/scheduler.pt b/run-4/checkpoint-336/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..95f570b84d9abf1d70d960b1382d530250a4db38 --- /dev/null +++ b/run-4/checkpoint-336/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0d819fb0c1d431c458fec913aa5469b8a7ed1313a0add2ae71b76c0b9a0a219 +size 1064 diff --git a/run-4/checkpoint-336/trainer_state.json b/run-4/checkpoint-336/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8f3bfd2b43d222ac75675222f00525e23a76b26d --- /dev/null +++ b/run-4/checkpoint-336/trainer_state.json @@ -0,0 +1,556 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96", + "epoch": 7.0, + "eval_steps": 500, + "global_step": 336, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2733114957809448, + "learning_rate": 2.2702186710865246e-07, + "loss": 0.7025, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.243804931640625, + "learning_rate": 4.5404373421730493e-07, + "loss": 0.6974, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.7711552381515503, + "learning_rate": 6.810656013259573e-07, + "loss": 0.696, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 1.1453403234481812, + "learning_rate": 9.080874684346099e-07, + "loss": 0.6989, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 1.2729355096817017, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6968, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 1.1592165231704712, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6959, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 1.1798148155212402, + "learning_rate": 1.589153069760567e-06, + "loss": 0.6952, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 2.1216671466827393, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6886, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 1.3416370153427124, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6864, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.72, + "eval_loss": 0.688262939453125, + "eval_runtime": 1.3468, + "eval_samples_per_second": 47.521, + "eval_steps_per_second": 5.94, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 2.1856281757354736, + "learning_rate": 2.169320063482679e-06, + "loss": 0.6917, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 1.4077153205871582, + "learning_rate": 2.1440954115817176e-06, + "loss": 0.6884, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 2.1792664527893066, + "learning_rate": 2.1188707596807562e-06, + "loss": 0.6668, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 1.0386197566986084, + "learning_rate": 2.093646107779795e-06, + "loss": 0.6694, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 2.0565919876098633, + "learning_rate": 2.0684214558788335e-06, + "loss": 0.6561, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": 1.2978509664535522, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6789, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 2.058328628540039, + "learning_rate": 2.0179721520769108e-06, + "loss": 0.6633, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 0.6023226976394653, + "learning_rate": 1.9927475001759494e-06, + "loss": 0.6655, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 0.5510762929916382, + "learning_rate": 1.967522848274988e-06, + "loss": 0.6622, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 1.098602533340454, + "learning_rate": 1.9422981963740267e-06, + "loss": 0.6633, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6816024780273438, + "eval_runtime": 1.3765, + "eval_samples_per_second": 46.493, + "eval_steps_per_second": 5.812, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 0.9589098691940308, + "learning_rate": 1.9170735444730654e-06, + "loss": 0.659, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.070695161819458, + "learning_rate": 1.8918488925721038e-06, + "loss": 0.6313, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 0.9913639426231384, + "learning_rate": 1.8666242406711424e-06, + "loss": 0.6652, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.0632878541946411, + "learning_rate": 1.841399588770181e-06, + "loss": 0.673, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": 2.1036579608917236, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6451, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 1.08384108543396, + "learning_rate": 1.7909502849682583e-06, + "loss": 0.6322, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 0.9407000541687012, + "learning_rate": 1.765725633067297e-06, + "loss": 0.6755, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 0.9016568660736084, + "learning_rate": 1.7405009811663356e-06, + "loss": 0.5985, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 1.1134448051452637, + "learning_rate": 1.7152763292653743e-06, + "loss": 0.603, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6800689697265625, + "eval_runtime": 1.3861, + "eval_samples_per_second": 46.173, + "eval_steps_per_second": 5.772, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 0.7627719640731812, + "learning_rate": 1.6900516773644127e-06, + "loss": 0.6557, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 0.9291415214538574, + "learning_rate": 1.6648270254634511e-06, + "loss": 0.6219, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 0.9248765707015991, + "learning_rate": 1.6396023735624898e-06, + "loss": 0.6325, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 0.9842573404312134, + "learning_rate": 1.6143777216615284e-06, + "loss": 0.6521, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 0.8689214587211609, + "learning_rate": 1.589153069760567e-06, + "loss": 0.5929, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 1.0012000799179077, + "learning_rate": 1.5639284178596057e-06, + "loss": 0.584, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 0.7438368797302246, + "learning_rate": 1.5387037659586443e-06, + "loss": 0.6813, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": 1.8603870868682861, + "learning_rate": 1.513479114057683e-06, + "loss": 0.6099, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 0.9918416738510132, + "learning_rate": 1.4882544621567216e-06, + "loss": 0.6192, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 1.9146322011947632, + "learning_rate": 1.4630298102557603e-06, + "loss": 0.6472, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6818161010742188, + "eval_runtime": 1.3841, + "eval_samples_per_second": 46.239, + "eval_steps_per_second": 5.78, + "step": 192 + }, + { + "epoch": 4.06, + "grad_norm": 0.9502781629562378, + "learning_rate": 1.437805158354799e-06, + "loss": 0.6447, + "step": 195 + }, + { + "epoch": 4.17, + "grad_norm": 0.8570067286491394, + "learning_rate": 1.4125805064538375e-06, + "loss": 0.5306, + "step": 200 + }, + { + "epoch": 4.27, + "grad_norm": 0.8097484111785889, + "learning_rate": 1.3873558545528762e-06, + "loss": 0.6202, + "step": 205 + }, + { + "epoch": 4.38, + "grad_norm": 2.0106472969055176, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6705, + "step": 210 + }, + { + "epoch": 4.48, + "grad_norm": 1.090775489807129, + "learning_rate": 1.3369065507509533e-06, + "loss": 0.6297, + "step": 215 + }, + { + "epoch": 4.58, + "grad_norm": 0.8988145589828491, + "learning_rate": 1.311681898849992e-06, + "loss": 0.5896, + "step": 220 + }, + { + "epoch": 4.69, + "grad_norm": 0.9149978756904602, + "learning_rate": 1.2864572469490305e-06, + "loss": 0.6156, + "step": 225 + }, + { + "epoch": 4.79, + "grad_norm": 1.9398412704467773, + "learning_rate": 1.2612325950480692e-06, + "loss": 0.6305, + "step": 230 + }, + { + "epoch": 4.9, + "grad_norm": 0.9217966794967651, + "learning_rate": 1.2360079431471078e-06, + "loss": 0.5943, + "step": 235 + }, + { + "epoch": 5.0, + "grad_norm": 0.9083653688430786, + "learning_rate": 1.2107832912461465e-06, + "loss": 0.6386, + "step": 240 + }, + { + "epoch": 5.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6846389770507812, + "eval_runtime": 1.4094, + "eval_samples_per_second": 45.409, + "eval_steps_per_second": 5.676, + "step": 240 + }, + { + "epoch": 5.1, + "grad_norm": 0.9323675036430359, + "learning_rate": 1.1855586393451851e-06, + "loss": 0.5779, + "step": 245 + }, + { + "epoch": 5.21, + "grad_norm": 0.7549787163734436, + "learning_rate": 1.1603339874442238e-06, + "loss": 0.5948, + "step": 250 + }, + { + "epoch": 5.31, + "grad_norm": 0.8535837531089783, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6928, + "step": 255 + }, + { + "epoch": 5.42, + "grad_norm": 1.2038137912750244, + "learning_rate": 1.109884683642301e-06, + "loss": 0.5887, + "step": 260 + }, + { + "epoch": 5.52, + "grad_norm": 0.9501279592514038, + "learning_rate": 1.0846600317413395e-06, + "loss": 0.5776, + "step": 265 + }, + { + "epoch": 5.62, + "grad_norm": 0.7421719431877136, + "learning_rate": 1.0594353798403781e-06, + "loss": 0.6734, + "step": 270 + }, + { + "epoch": 5.73, + "grad_norm": 0.8555863499641418, + "learning_rate": 1.0342107279394168e-06, + "loss": 0.6399, + "step": 275 + }, + { + "epoch": 5.83, + "grad_norm": 0.8841156363487244, + "learning_rate": 1.0089860760384554e-06, + "loss": 0.6173, + "step": 280 + }, + { + "epoch": 5.94, + "grad_norm": 0.8565478324890137, + "learning_rate": 9.83761424137494e-07, + "loss": 0.5537, + "step": 285 + }, + { + "epoch": 6.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6864242553710938, + "eval_runtime": 1.3622, + "eval_samples_per_second": 46.983, + "eval_steps_per_second": 5.873, + "step": 288 + }, + { + "epoch": 6.04, + "grad_norm": 0.8750139474868774, + "learning_rate": 9.585367722365327e-07, + "loss": 0.5531, + "step": 290 + }, + { + "epoch": 6.15, + "grad_norm": 1.0445302724838257, + "learning_rate": 9.333121203355712e-07, + "loss": 0.638, + "step": 295 + }, + { + "epoch": 6.25, + "grad_norm": 0.7958914637565613, + "learning_rate": 9.080874684346099e-07, + "loss": 0.547, + "step": 300 + }, + { + "epoch": 6.35, + "grad_norm": 0.9992254376411438, + "learning_rate": 8.828628165336485e-07, + "loss": 0.6425, + "step": 305 + }, + { + "epoch": 6.46, + "grad_norm": 0.8400682806968689, + "learning_rate": 8.576381646326871e-07, + "loss": 0.6955, + "step": 310 + }, + { + "epoch": 6.56, + "grad_norm": 0.742438793182373, + "learning_rate": 8.324135127317256e-07, + "loss": 0.6473, + "step": 315 + }, + { + "epoch": 6.67, + "grad_norm": 0.6693254113197327, + "learning_rate": 8.071888608307642e-07, + "loss": 0.603, + "step": 320 + }, + { + "epoch": 6.77, + "grad_norm": 1.0816401243209839, + "learning_rate": 7.819642089298028e-07, + "loss": 0.6053, + "step": 325 + }, + { + "epoch": 6.88, + "grad_norm": 0.7275277376174927, + "learning_rate": 7.567395570288415e-07, + "loss": 0.612, + "step": 330 + }, + { + "epoch": 6.98, + "grad_norm": 0.7834873795509338, + "learning_rate": 7.315149051278801e-07, + "loss": 0.55, + "step": 335 + }, + { + "epoch": 7.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6889228820800781, + "eval_runtime": 1.3769, + "eval_samples_per_second": 46.483, + "eval_steps_per_second": 5.81, + "step": 336 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 5116387614670704.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 2.1794099242430636e-06, + "per_device_train_batch_size": 4 + } +} diff --git a/run-4/checkpoint-336/training_args.bin b/run-4/checkpoint-336/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2bb1c33e4434a3bb752764ab093170996801fe72 --- /dev/null +++ b/run-4/checkpoint-336/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3 +size 4920 diff --git a/run-4/checkpoint-384/config.json b/run-4/checkpoint-384/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-4/checkpoint-384/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-4/checkpoint-384/model.safetensors b/run-4/checkpoint-384/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1cdd6f2d6ff77ef2ae34c8662cbfdd2f5ebd2f9d --- /dev/null +++ b/run-4/checkpoint-384/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f03572b2dcb34be269d7a8392a1db64aa78ab3c991d84591fe98b5b2f300eea6 +size 94763496 diff --git a/run-4/checkpoint-384/optimizer.pt b/run-4/checkpoint-384/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..13ba5b4b42505df1a17ac66ef621677b1df33ae2 --- /dev/null +++ b/run-4/checkpoint-384/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:370faca0c4d8022d7391f5e21d5c7cf5baa3cc1575da59ece8f1da8e073be1d6 +size 189552570 diff --git a/run-4/checkpoint-384/preprocessor_config.json b/run-4/checkpoint-384/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-4/checkpoint-384/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-4/checkpoint-384/rng_state.pth b/run-4/checkpoint-384/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1a6bd57b8c17138ed3367d5ca6692d78d760bd47 --- /dev/null +++ b/run-4/checkpoint-384/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5326b9611b4fb9dc5dc0b29580e7e48abf50913e44071592799c052bebfbacd7 +size 14244 diff --git a/run-4/checkpoint-384/scheduler.pt b/run-4/checkpoint-384/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..16960a7b598201f496455d1ada88abffcc5005ae --- /dev/null +++ b/run-4/checkpoint-384/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53cf1db083d766e2eeb0b68e04895788e327c7d113d516f642c4fc0792596377 +size 1064 diff --git a/run-4/checkpoint-384/trainer_state.json b/run-4/checkpoint-384/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9dc0c0315bdc6944e67fdb9019c5a34d2ee96136 --- /dev/null +++ b/run-4/checkpoint-384/trainer_state.json @@ -0,0 +1,628 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96", + "epoch": 8.0, + "eval_steps": 500, + "global_step": 384, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2733114957809448, + "learning_rate": 2.2702186710865246e-07, + "loss": 0.7025, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.243804931640625, + "learning_rate": 4.5404373421730493e-07, + "loss": 0.6974, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.7711552381515503, + "learning_rate": 6.810656013259573e-07, + "loss": 0.696, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 1.1453403234481812, + "learning_rate": 9.080874684346099e-07, + "loss": 0.6989, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 1.2729355096817017, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6968, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 1.1592165231704712, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6959, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 1.1798148155212402, + "learning_rate": 1.589153069760567e-06, + "loss": 0.6952, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 2.1216671466827393, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6886, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 1.3416370153427124, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6864, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.72, + "eval_loss": 0.688262939453125, + "eval_runtime": 1.3468, + "eval_samples_per_second": 47.521, + "eval_steps_per_second": 5.94, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 2.1856281757354736, + "learning_rate": 2.169320063482679e-06, + "loss": 0.6917, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 1.4077153205871582, + "learning_rate": 2.1440954115817176e-06, + "loss": 0.6884, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 2.1792664527893066, + "learning_rate": 2.1188707596807562e-06, + "loss": 0.6668, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 1.0386197566986084, + "learning_rate": 2.093646107779795e-06, + "loss": 0.6694, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 2.0565919876098633, + "learning_rate": 2.0684214558788335e-06, + "loss": 0.6561, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": 1.2978509664535522, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6789, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 2.058328628540039, + "learning_rate": 2.0179721520769108e-06, + "loss": 0.6633, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 0.6023226976394653, + "learning_rate": 1.9927475001759494e-06, + "loss": 0.6655, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 0.5510762929916382, + "learning_rate": 1.967522848274988e-06, + "loss": 0.6622, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 1.098602533340454, + "learning_rate": 1.9422981963740267e-06, + "loss": 0.6633, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6816024780273438, + "eval_runtime": 1.3765, + "eval_samples_per_second": 46.493, + "eval_steps_per_second": 5.812, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 0.9589098691940308, + "learning_rate": 1.9170735444730654e-06, + "loss": 0.659, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.070695161819458, + "learning_rate": 1.8918488925721038e-06, + "loss": 0.6313, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 0.9913639426231384, + "learning_rate": 1.8666242406711424e-06, + "loss": 0.6652, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.0632878541946411, + "learning_rate": 1.841399588770181e-06, + "loss": 0.673, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": 2.1036579608917236, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6451, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 1.08384108543396, + "learning_rate": 1.7909502849682583e-06, + "loss": 0.6322, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 0.9407000541687012, + "learning_rate": 1.765725633067297e-06, + "loss": 0.6755, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 0.9016568660736084, + "learning_rate": 1.7405009811663356e-06, + "loss": 0.5985, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 1.1134448051452637, + "learning_rate": 1.7152763292653743e-06, + "loss": 0.603, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6800689697265625, + "eval_runtime": 1.3861, + "eval_samples_per_second": 46.173, + "eval_steps_per_second": 5.772, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 0.7627719640731812, + "learning_rate": 1.6900516773644127e-06, + "loss": 0.6557, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 0.9291415214538574, + "learning_rate": 1.6648270254634511e-06, + "loss": 0.6219, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 0.9248765707015991, + "learning_rate": 1.6396023735624898e-06, + "loss": 0.6325, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 0.9842573404312134, + "learning_rate": 1.6143777216615284e-06, + "loss": 0.6521, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 0.8689214587211609, + "learning_rate": 1.589153069760567e-06, + "loss": 0.5929, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 1.0012000799179077, + "learning_rate": 1.5639284178596057e-06, + "loss": 0.584, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 0.7438368797302246, + "learning_rate": 1.5387037659586443e-06, + "loss": 0.6813, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": 1.8603870868682861, + "learning_rate": 1.513479114057683e-06, + "loss": 0.6099, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 0.9918416738510132, + "learning_rate": 1.4882544621567216e-06, + "loss": 0.6192, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 1.9146322011947632, + "learning_rate": 1.4630298102557603e-06, + "loss": 0.6472, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6818161010742188, + "eval_runtime": 1.3841, + "eval_samples_per_second": 46.239, + "eval_steps_per_second": 5.78, + "step": 192 + }, + { + "epoch": 4.06, + "grad_norm": 0.9502781629562378, + "learning_rate": 1.437805158354799e-06, + "loss": 0.6447, + "step": 195 + }, + { + "epoch": 4.17, + "grad_norm": 0.8570067286491394, + "learning_rate": 1.4125805064538375e-06, + "loss": 0.5306, + "step": 200 + }, + { + "epoch": 4.27, + "grad_norm": 0.8097484111785889, + "learning_rate": 1.3873558545528762e-06, + "loss": 0.6202, + "step": 205 + }, + { + "epoch": 4.38, + "grad_norm": 2.0106472969055176, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6705, + "step": 210 + }, + { + "epoch": 4.48, + "grad_norm": 1.090775489807129, + "learning_rate": 1.3369065507509533e-06, + "loss": 0.6297, + "step": 215 + }, + { + "epoch": 4.58, + "grad_norm": 0.8988145589828491, + "learning_rate": 1.311681898849992e-06, + "loss": 0.5896, + "step": 220 + }, + { + "epoch": 4.69, + "grad_norm": 0.9149978756904602, + "learning_rate": 1.2864572469490305e-06, + "loss": 0.6156, + "step": 225 + }, + { + "epoch": 4.79, + "grad_norm": 1.9398412704467773, + "learning_rate": 1.2612325950480692e-06, + "loss": 0.6305, + "step": 230 + }, + { + "epoch": 4.9, + "grad_norm": 0.9217966794967651, + "learning_rate": 1.2360079431471078e-06, + "loss": 0.5943, + "step": 235 + }, + { + "epoch": 5.0, + "grad_norm": 0.9083653688430786, + "learning_rate": 1.2107832912461465e-06, + "loss": 0.6386, + "step": 240 + }, + { + "epoch": 5.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6846389770507812, + "eval_runtime": 1.4094, + "eval_samples_per_second": 45.409, + "eval_steps_per_second": 5.676, + "step": 240 + }, + { + "epoch": 5.1, + "grad_norm": 0.9323675036430359, + "learning_rate": 1.1855586393451851e-06, + "loss": 0.5779, + "step": 245 + }, + { + "epoch": 5.21, + "grad_norm": 0.7549787163734436, + "learning_rate": 1.1603339874442238e-06, + "loss": 0.5948, + "step": 250 + }, + { + "epoch": 5.31, + "grad_norm": 0.8535837531089783, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6928, + "step": 255 + }, + { + "epoch": 5.42, + "grad_norm": 1.2038137912750244, + "learning_rate": 1.109884683642301e-06, + "loss": 0.5887, + "step": 260 + }, + { + "epoch": 5.52, + "grad_norm": 0.9501279592514038, + "learning_rate": 1.0846600317413395e-06, + "loss": 0.5776, + "step": 265 + }, + { + "epoch": 5.62, + "grad_norm": 0.7421719431877136, + "learning_rate": 1.0594353798403781e-06, + "loss": 0.6734, + "step": 270 + }, + { + "epoch": 5.73, + "grad_norm": 0.8555863499641418, + "learning_rate": 1.0342107279394168e-06, + "loss": 0.6399, + "step": 275 + }, + { + "epoch": 5.83, + "grad_norm": 0.8841156363487244, + "learning_rate": 1.0089860760384554e-06, + "loss": 0.6173, + "step": 280 + }, + { + "epoch": 5.94, + "grad_norm": 0.8565478324890137, + "learning_rate": 9.83761424137494e-07, + "loss": 0.5537, + "step": 285 + }, + { + "epoch": 6.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6864242553710938, + "eval_runtime": 1.3622, + "eval_samples_per_second": 46.983, + "eval_steps_per_second": 5.873, + "step": 288 + }, + { + "epoch": 6.04, + "grad_norm": 0.8750139474868774, + "learning_rate": 9.585367722365327e-07, + "loss": 0.5531, + "step": 290 + }, + { + "epoch": 6.15, + "grad_norm": 1.0445302724838257, + "learning_rate": 9.333121203355712e-07, + "loss": 0.638, + "step": 295 + }, + { + "epoch": 6.25, + "grad_norm": 0.7958914637565613, + "learning_rate": 9.080874684346099e-07, + "loss": 0.547, + "step": 300 + }, + { + "epoch": 6.35, + "grad_norm": 0.9992254376411438, + "learning_rate": 8.828628165336485e-07, + "loss": 0.6425, + "step": 305 + }, + { + "epoch": 6.46, + "grad_norm": 0.8400682806968689, + "learning_rate": 8.576381646326871e-07, + "loss": 0.6955, + "step": 310 + }, + { + "epoch": 6.56, + "grad_norm": 0.742438793182373, + "learning_rate": 8.324135127317256e-07, + "loss": 0.6473, + "step": 315 + }, + { + "epoch": 6.67, + "grad_norm": 0.6693254113197327, + "learning_rate": 8.071888608307642e-07, + "loss": 0.603, + "step": 320 + }, + { + "epoch": 6.77, + "grad_norm": 1.0816401243209839, + "learning_rate": 7.819642089298028e-07, + "loss": 0.6053, + "step": 325 + }, + { + "epoch": 6.88, + "grad_norm": 0.7275277376174927, + "learning_rate": 7.567395570288415e-07, + "loss": 0.612, + "step": 330 + }, + { + "epoch": 6.98, + "grad_norm": 0.7834873795509338, + "learning_rate": 7.315149051278801e-07, + "loss": 0.55, + "step": 335 + }, + { + "epoch": 7.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6889228820800781, + "eval_runtime": 1.3769, + "eval_samples_per_second": 46.483, + "eval_steps_per_second": 5.81, + "step": 336 + }, + { + "epoch": 7.08, + "grad_norm": 1.24147367477417, + "learning_rate": 7.062902532269188e-07, + "loss": 0.508, + "step": 340 + }, + { + "epoch": 7.19, + "grad_norm": 1.8932181596755981, + "learning_rate": 6.810656013259573e-07, + "loss": 0.6358, + "step": 345 + }, + { + "epoch": 7.29, + "grad_norm": 1.861436128616333, + "learning_rate": 6.55840949424996e-07, + "loss": 0.5741, + "step": 350 + }, + { + "epoch": 7.4, + "grad_norm": 0.8429200053215027, + "learning_rate": 6.306162975240346e-07, + "loss": 0.5717, + "step": 355 + }, + { + "epoch": 7.5, + "grad_norm": 1.8665741682052612, + "learning_rate": 6.053916456230732e-07, + "loss": 0.6992, + "step": 360 + }, + { + "epoch": 7.6, + "grad_norm": 2.312748908996582, + "learning_rate": 5.801669937221119e-07, + "loss": 0.6151, + "step": 365 + }, + { + "epoch": 7.71, + "grad_norm": 1.1628329753875732, + "learning_rate": 5.549423418211505e-07, + "loss": 0.5354, + "step": 370 + }, + { + "epoch": 7.81, + "grad_norm": 1.8674992322921753, + "learning_rate": 5.297176899201891e-07, + "loss": 0.6411, + "step": 375 + }, + { + "epoch": 7.92, + "grad_norm": 0.7112137675285339, + "learning_rate": 5.044930380192277e-07, + "loss": 0.6063, + "step": 380 + }, + { + "epoch": 8.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6909217834472656, + "eval_runtime": 1.3913, + "eval_samples_per_second": 45.999, + "eval_steps_per_second": 5.75, + "step": 384 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 5808783041309760.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 2.1794099242430636e-06, + "per_device_train_batch_size": 4 + } +} diff --git a/run-4/checkpoint-384/training_args.bin b/run-4/checkpoint-384/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2bb1c33e4434a3bb752764ab093170996801fe72 --- /dev/null +++ b/run-4/checkpoint-384/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3 +size 4920 diff --git a/run-4/checkpoint-432/config.json b/run-4/checkpoint-432/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-4/checkpoint-432/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-4/checkpoint-432/model.safetensors b/run-4/checkpoint-432/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..97176a6f48fb3cb7dbb18101ce7a7ceab09dcb72 --- /dev/null +++ b/run-4/checkpoint-432/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a940b184aaa16faaf6fcf01d2d4dfd7546ac26b4bd8291bbde86439f173fb8ac +size 94763496 diff --git a/run-4/checkpoint-432/optimizer.pt b/run-4/checkpoint-432/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fba18ddaca41b61ddfcb581a3fce9719abfaf177 --- /dev/null +++ b/run-4/checkpoint-432/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd08546733235b5bd2316ed0d65411a0e29b5b57469e49d2f7bd6a3290f04a39 +size 189552570 diff --git a/run-4/checkpoint-432/preprocessor_config.json b/run-4/checkpoint-432/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-4/checkpoint-432/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-4/checkpoint-432/rng_state.pth b/run-4/checkpoint-432/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bcd8fc524b275ac25e742ea920b25e885f1074e5 --- /dev/null +++ b/run-4/checkpoint-432/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03d982e5865661a49a8729a4369776f96011f643bb66895e82e3a7651ff4f807 +size 14244 diff --git a/run-4/checkpoint-432/scheduler.pt b/run-4/checkpoint-432/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d4709faa9fdb86bc89728eab1ec5c81f83340cc --- /dev/null +++ b/run-4/checkpoint-432/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1e2af6627ff1e61e58b05fec1c3d1058baf7476a56eccacb0b8faf5fb64ad61 +size 1064 diff --git a/run-4/checkpoint-432/trainer_state.json b/run-4/checkpoint-432/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fa26a63282c815db043aa96a60c33f371bdd3c89 --- /dev/null +++ b/run-4/checkpoint-432/trainer_state.json @@ -0,0 +1,707 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96", + "epoch": 9.0, + "eval_steps": 500, + "global_step": 432, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2733114957809448, + "learning_rate": 2.2702186710865246e-07, + "loss": 0.7025, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.243804931640625, + "learning_rate": 4.5404373421730493e-07, + "loss": 0.6974, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.7711552381515503, + "learning_rate": 6.810656013259573e-07, + "loss": 0.696, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 1.1453403234481812, + "learning_rate": 9.080874684346099e-07, + "loss": 0.6989, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 1.2729355096817017, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6968, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 1.1592165231704712, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6959, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 1.1798148155212402, + "learning_rate": 1.589153069760567e-06, + "loss": 0.6952, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 2.1216671466827393, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6886, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 1.3416370153427124, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6864, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.72, + "eval_loss": 0.688262939453125, + "eval_runtime": 1.3468, + "eval_samples_per_second": 47.521, + "eval_steps_per_second": 5.94, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 2.1856281757354736, + "learning_rate": 2.169320063482679e-06, + "loss": 0.6917, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 1.4077153205871582, + "learning_rate": 2.1440954115817176e-06, + "loss": 0.6884, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 2.1792664527893066, + "learning_rate": 2.1188707596807562e-06, + "loss": 0.6668, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 1.0386197566986084, + "learning_rate": 2.093646107779795e-06, + "loss": 0.6694, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 2.0565919876098633, + "learning_rate": 2.0684214558788335e-06, + "loss": 0.6561, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": 1.2978509664535522, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6789, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 2.058328628540039, + "learning_rate": 2.0179721520769108e-06, + "loss": 0.6633, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 0.6023226976394653, + "learning_rate": 1.9927475001759494e-06, + "loss": 0.6655, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 0.5510762929916382, + "learning_rate": 1.967522848274988e-06, + "loss": 0.6622, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 1.098602533340454, + "learning_rate": 1.9422981963740267e-06, + "loss": 0.6633, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6816024780273438, + "eval_runtime": 1.3765, + "eval_samples_per_second": 46.493, + "eval_steps_per_second": 5.812, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 0.9589098691940308, + "learning_rate": 1.9170735444730654e-06, + "loss": 0.659, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.070695161819458, + "learning_rate": 1.8918488925721038e-06, + "loss": 0.6313, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 0.9913639426231384, + "learning_rate": 1.8666242406711424e-06, + "loss": 0.6652, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.0632878541946411, + "learning_rate": 1.841399588770181e-06, + "loss": 0.673, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": 2.1036579608917236, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6451, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 1.08384108543396, + "learning_rate": 1.7909502849682583e-06, + "loss": 0.6322, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 0.9407000541687012, + "learning_rate": 1.765725633067297e-06, + "loss": 0.6755, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 0.9016568660736084, + "learning_rate": 1.7405009811663356e-06, + "loss": 0.5985, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 1.1134448051452637, + "learning_rate": 1.7152763292653743e-06, + "loss": 0.603, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6800689697265625, + "eval_runtime": 1.3861, + "eval_samples_per_second": 46.173, + "eval_steps_per_second": 5.772, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 0.7627719640731812, + "learning_rate": 1.6900516773644127e-06, + "loss": 0.6557, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 0.9291415214538574, + "learning_rate": 1.6648270254634511e-06, + "loss": 0.6219, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 0.9248765707015991, + "learning_rate": 1.6396023735624898e-06, + "loss": 0.6325, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 0.9842573404312134, + "learning_rate": 1.6143777216615284e-06, + "loss": 0.6521, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 0.8689214587211609, + "learning_rate": 1.589153069760567e-06, + "loss": 0.5929, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 1.0012000799179077, + "learning_rate": 1.5639284178596057e-06, + "loss": 0.584, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 0.7438368797302246, + "learning_rate": 1.5387037659586443e-06, + "loss": 0.6813, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": 1.8603870868682861, + "learning_rate": 1.513479114057683e-06, + "loss": 0.6099, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 0.9918416738510132, + "learning_rate": 1.4882544621567216e-06, + "loss": 0.6192, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 1.9146322011947632, + "learning_rate": 1.4630298102557603e-06, + "loss": 0.6472, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6818161010742188, + "eval_runtime": 1.3841, + "eval_samples_per_second": 46.239, + "eval_steps_per_second": 5.78, + "step": 192 + }, + { + "epoch": 4.06, + "grad_norm": 0.9502781629562378, + "learning_rate": 1.437805158354799e-06, + "loss": 0.6447, + "step": 195 + }, + { + "epoch": 4.17, + "grad_norm": 0.8570067286491394, + "learning_rate": 1.4125805064538375e-06, + "loss": 0.5306, + "step": 200 + }, + { + "epoch": 4.27, + "grad_norm": 0.8097484111785889, + "learning_rate": 1.3873558545528762e-06, + "loss": 0.6202, + "step": 205 + }, + { + "epoch": 4.38, + "grad_norm": 2.0106472969055176, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6705, + "step": 210 + }, + { + "epoch": 4.48, + "grad_norm": 1.090775489807129, + "learning_rate": 1.3369065507509533e-06, + "loss": 0.6297, + "step": 215 + }, + { + "epoch": 4.58, + "grad_norm": 0.8988145589828491, + "learning_rate": 1.311681898849992e-06, + "loss": 0.5896, + "step": 220 + }, + { + "epoch": 4.69, + "grad_norm": 0.9149978756904602, + "learning_rate": 1.2864572469490305e-06, + "loss": 0.6156, + "step": 225 + }, + { + "epoch": 4.79, + "grad_norm": 1.9398412704467773, + "learning_rate": 1.2612325950480692e-06, + "loss": 0.6305, + "step": 230 + }, + { + "epoch": 4.9, + "grad_norm": 0.9217966794967651, + "learning_rate": 1.2360079431471078e-06, + "loss": 0.5943, + "step": 235 + }, + { + "epoch": 5.0, + "grad_norm": 0.9083653688430786, + "learning_rate": 1.2107832912461465e-06, + "loss": 0.6386, + "step": 240 + }, + { + "epoch": 5.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6846389770507812, + "eval_runtime": 1.4094, + "eval_samples_per_second": 45.409, + "eval_steps_per_second": 5.676, + "step": 240 + }, + { + "epoch": 5.1, + "grad_norm": 0.9323675036430359, + "learning_rate": 1.1855586393451851e-06, + "loss": 0.5779, + "step": 245 + }, + { + "epoch": 5.21, + "grad_norm": 0.7549787163734436, + "learning_rate": 1.1603339874442238e-06, + "loss": 0.5948, + "step": 250 + }, + { + "epoch": 5.31, + "grad_norm": 0.8535837531089783, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6928, + "step": 255 + }, + { + "epoch": 5.42, + "grad_norm": 1.2038137912750244, + "learning_rate": 1.109884683642301e-06, + "loss": 0.5887, + "step": 260 + }, + { + "epoch": 5.52, + "grad_norm": 0.9501279592514038, + "learning_rate": 1.0846600317413395e-06, + "loss": 0.5776, + "step": 265 + }, + { + "epoch": 5.62, + "grad_norm": 0.7421719431877136, + "learning_rate": 1.0594353798403781e-06, + "loss": 0.6734, + "step": 270 + }, + { + "epoch": 5.73, + "grad_norm": 0.8555863499641418, + "learning_rate": 1.0342107279394168e-06, + "loss": 0.6399, + "step": 275 + }, + { + "epoch": 5.83, + "grad_norm": 0.8841156363487244, + "learning_rate": 1.0089860760384554e-06, + "loss": 0.6173, + "step": 280 + }, + { + "epoch": 5.94, + "grad_norm": 0.8565478324890137, + "learning_rate": 9.83761424137494e-07, + "loss": 0.5537, + "step": 285 + }, + { + "epoch": 6.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6864242553710938, + "eval_runtime": 1.3622, + "eval_samples_per_second": 46.983, + "eval_steps_per_second": 5.873, + "step": 288 + }, + { + "epoch": 6.04, + "grad_norm": 0.8750139474868774, + "learning_rate": 9.585367722365327e-07, + "loss": 0.5531, + "step": 290 + }, + { + "epoch": 6.15, + "grad_norm": 1.0445302724838257, + "learning_rate": 9.333121203355712e-07, + "loss": 0.638, + "step": 295 + }, + { + "epoch": 6.25, + "grad_norm": 0.7958914637565613, + "learning_rate": 9.080874684346099e-07, + "loss": 0.547, + "step": 300 + }, + { + "epoch": 6.35, + "grad_norm": 0.9992254376411438, + "learning_rate": 8.828628165336485e-07, + "loss": 0.6425, + "step": 305 + }, + { + "epoch": 6.46, + "grad_norm": 0.8400682806968689, + "learning_rate": 8.576381646326871e-07, + "loss": 0.6955, + "step": 310 + }, + { + "epoch": 6.56, + "grad_norm": 0.742438793182373, + "learning_rate": 8.324135127317256e-07, + "loss": 0.6473, + "step": 315 + }, + { + "epoch": 6.67, + "grad_norm": 0.6693254113197327, + "learning_rate": 8.071888608307642e-07, + "loss": 0.603, + "step": 320 + }, + { + "epoch": 6.77, + "grad_norm": 1.0816401243209839, + "learning_rate": 7.819642089298028e-07, + "loss": 0.6053, + "step": 325 + }, + { + "epoch": 6.88, + "grad_norm": 0.7275277376174927, + "learning_rate": 7.567395570288415e-07, + "loss": 0.612, + "step": 330 + }, + { + "epoch": 6.98, + "grad_norm": 0.7834873795509338, + "learning_rate": 7.315149051278801e-07, + "loss": 0.55, + "step": 335 + }, + { + "epoch": 7.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6889228820800781, + "eval_runtime": 1.3769, + "eval_samples_per_second": 46.483, + "eval_steps_per_second": 5.81, + "step": 336 + }, + { + "epoch": 7.08, + "grad_norm": 1.24147367477417, + "learning_rate": 7.062902532269188e-07, + "loss": 0.508, + "step": 340 + }, + { + "epoch": 7.19, + "grad_norm": 1.8932181596755981, + "learning_rate": 6.810656013259573e-07, + "loss": 0.6358, + "step": 345 + }, + { + "epoch": 7.29, + "grad_norm": 1.861436128616333, + "learning_rate": 6.55840949424996e-07, + "loss": 0.5741, + "step": 350 + }, + { + "epoch": 7.4, + "grad_norm": 0.8429200053215027, + "learning_rate": 6.306162975240346e-07, + "loss": 0.5717, + "step": 355 + }, + { + "epoch": 7.5, + "grad_norm": 1.8665741682052612, + "learning_rate": 6.053916456230732e-07, + "loss": 0.6992, + "step": 360 + }, + { + "epoch": 7.6, + "grad_norm": 2.312748908996582, + "learning_rate": 5.801669937221119e-07, + "loss": 0.6151, + "step": 365 + }, + { + "epoch": 7.71, + "grad_norm": 1.1628329753875732, + "learning_rate": 5.549423418211505e-07, + "loss": 0.5354, + "step": 370 + }, + { + "epoch": 7.81, + "grad_norm": 1.8674992322921753, + "learning_rate": 5.297176899201891e-07, + "loss": 0.6411, + "step": 375 + }, + { + "epoch": 7.92, + "grad_norm": 0.7112137675285339, + "learning_rate": 5.044930380192277e-07, + "loss": 0.6063, + "step": 380 + }, + { + "epoch": 8.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6909217834472656, + "eval_runtime": 1.3913, + "eval_samples_per_second": 45.999, + "eval_steps_per_second": 5.75, + "step": 384 + }, + { + "epoch": 8.02, + "grad_norm": 1.201416015625, + "learning_rate": 4.792683861182663e-07, + "loss": 0.647, + "step": 385 + }, + { + "epoch": 8.12, + "grad_norm": 0.9348795413970947, + "learning_rate": 4.5404373421730493e-07, + "loss": 0.5356, + "step": 390 + }, + { + "epoch": 8.23, + "grad_norm": 1.895739197731018, + "learning_rate": 4.2881908231634357e-07, + "loss": 0.5384, + "step": 395 + }, + { + "epoch": 8.33, + "grad_norm": 1.8363467454910278, + "learning_rate": 4.035944304153821e-07, + "loss": 0.541, + "step": 400 + }, + { + "epoch": 8.44, + "grad_norm": 2.475804090499878, + "learning_rate": 3.7836977851442075e-07, + "loss": 0.5503, + "step": 405 + }, + { + "epoch": 8.54, + "grad_norm": 1.314663290977478, + "learning_rate": 3.531451266134594e-07, + "loss": 0.575, + "step": 410 + }, + { + "epoch": 8.65, + "grad_norm": 1.850918173789978, + "learning_rate": 3.27920474712498e-07, + "loss": 0.6658, + "step": 415 + }, + { + "epoch": 8.75, + "grad_norm": 0.7412477135658264, + "learning_rate": 3.026958228115366e-07, + "loss": 0.5632, + "step": 420 + }, + { + "epoch": 8.85, + "grad_norm": 1.3282320499420166, + "learning_rate": 2.7747117091057526e-07, + "loss": 0.7303, + "step": 425 + }, + { + "epoch": 8.96, + "grad_norm": 1.019906759262085, + "learning_rate": 2.5224651900961385e-07, + "loss": 0.7438, + "step": 430 + }, + { + "epoch": 9.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6923065185546875, + "eval_runtime": 1.3679, + "eval_samples_per_second": 46.786, + "eval_steps_per_second": 5.848, + "step": 432 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 6549027725117760.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 2.1794099242430636e-06, + "per_device_train_batch_size": 4 + } +} diff --git a/run-4/checkpoint-432/training_args.bin b/run-4/checkpoint-432/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2bb1c33e4434a3bb752764ab093170996801fe72 --- /dev/null +++ b/run-4/checkpoint-432/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3 +size 4920 diff --git a/run-4/checkpoint-480/config.json b/run-4/checkpoint-480/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-4/checkpoint-480/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-4/checkpoint-480/model.safetensors b/run-4/checkpoint-480/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c78334adaacd76c552a0cc5616995effd21cb1a0 --- /dev/null +++ b/run-4/checkpoint-480/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0034f714216219c2c08cf8435919acdc657c4bd1a5b084ca61061fd82523431a +size 94763496 diff --git a/run-4/checkpoint-480/optimizer.pt b/run-4/checkpoint-480/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb2a8985b30963b4fe066162788032f40416c624 --- /dev/null +++ b/run-4/checkpoint-480/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed6eac6a26c2e56cd661232df4db8890244e1ed88b68f059b29731ded390bfd +size 189552570 diff --git a/run-4/checkpoint-480/preprocessor_config.json b/run-4/checkpoint-480/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-4/checkpoint-480/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-4/checkpoint-480/rng_state.pth b/run-4/checkpoint-480/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d8ce0733889ec145e042b57c109b1b3747a5e4b8 --- /dev/null +++ b/run-4/checkpoint-480/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50237159e3e933ca82ae35db92d0c845d9cc1581f3410598daa2edb356446877 +size 14244 diff --git a/run-4/checkpoint-480/scheduler.pt b/run-4/checkpoint-480/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..287f3449025e4c09b76c6995cd3d5f40d3a7a593 --- /dev/null +++ b/run-4/checkpoint-480/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfa1d55f6c03b261c742a005c910343e21fa2efbbe26a1ef6cc46d0a0e368330 +size 1064 diff --git a/run-4/checkpoint-480/trainer_state.json b/run-4/checkpoint-480/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..35504d10605f981bd0f3b51011aa9bbfa4e3198f --- /dev/null +++ b/run-4/checkpoint-480/trainer_state.json @@ -0,0 +1,786 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96", + "epoch": 10.0, + "eval_steps": 500, + "global_step": 480, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2733114957809448, + "learning_rate": 2.2702186710865246e-07, + "loss": 0.7025, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.243804931640625, + "learning_rate": 4.5404373421730493e-07, + "loss": 0.6974, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.7711552381515503, + "learning_rate": 6.810656013259573e-07, + "loss": 0.696, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 1.1453403234481812, + "learning_rate": 9.080874684346099e-07, + "loss": 0.6989, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 1.2729355096817017, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6968, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 1.1592165231704712, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6959, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 1.1798148155212402, + "learning_rate": 1.589153069760567e-06, + "loss": 0.6952, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 2.1216671466827393, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6886, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 1.3416370153427124, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6864, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.72, + "eval_loss": 0.688262939453125, + "eval_runtime": 1.3468, + "eval_samples_per_second": 47.521, + "eval_steps_per_second": 5.94, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 2.1856281757354736, + "learning_rate": 2.169320063482679e-06, + "loss": 0.6917, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 1.4077153205871582, + "learning_rate": 2.1440954115817176e-06, + "loss": 0.6884, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 2.1792664527893066, + "learning_rate": 2.1188707596807562e-06, + "loss": 0.6668, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 1.0386197566986084, + "learning_rate": 2.093646107779795e-06, + "loss": 0.6694, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 2.0565919876098633, + "learning_rate": 2.0684214558788335e-06, + "loss": 0.6561, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": 1.2978509664535522, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6789, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 2.058328628540039, + "learning_rate": 2.0179721520769108e-06, + "loss": 0.6633, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 0.6023226976394653, + "learning_rate": 1.9927475001759494e-06, + "loss": 0.6655, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 0.5510762929916382, + "learning_rate": 1.967522848274988e-06, + "loss": 0.6622, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 1.098602533340454, + "learning_rate": 1.9422981963740267e-06, + "loss": 0.6633, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6816024780273438, + "eval_runtime": 1.3765, + "eval_samples_per_second": 46.493, + "eval_steps_per_second": 5.812, + "step": 96 + }, + { + "epoch": 2.08, + "grad_norm": 0.9589098691940308, + "learning_rate": 1.9170735444730654e-06, + "loss": 0.659, + "step": 100 + }, + { + "epoch": 2.19, + "grad_norm": 1.070695161819458, + "learning_rate": 1.8918488925721038e-06, + "loss": 0.6313, + "step": 105 + }, + { + "epoch": 2.29, + "grad_norm": 0.9913639426231384, + "learning_rate": 1.8666242406711424e-06, + "loss": 0.6652, + "step": 110 + }, + { + "epoch": 2.4, + "grad_norm": 1.0632878541946411, + "learning_rate": 1.841399588770181e-06, + "loss": 0.673, + "step": 115 + }, + { + "epoch": 2.5, + "grad_norm": 2.1036579608917236, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6451, + "step": 120 + }, + { + "epoch": 2.6, + "grad_norm": 1.08384108543396, + "learning_rate": 1.7909502849682583e-06, + "loss": 0.6322, + "step": 125 + }, + { + "epoch": 2.71, + "grad_norm": 0.9407000541687012, + "learning_rate": 1.765725633067297e-06, + "loss": 0.6755, + "step": 130 + }, + { + "epoch": 2.81, + "grad_norm": 0.9016568660736084, + "learning_rate": 1.7405009811663356e-06, + "loss": 0.5985, + "step": 135 + }, + { + "epoch": 2.92, + "grad_norm": 1.1134448051452637, + "learning_rate": 1.7152763292653743e-06, + "loss": 0.603, + "step": 140 + }, + { + "epoch": 3.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6800689697265625, + "eval_runtime": 1.3861, + "eval_samples_per_second": 46.173, + "eval_steps_per_second": 5.772, + "step": 144 + }, + { + "epoch": 3.02, + "grad_norm": 0.7627719640731812, + "learning_rate": 1.6900516773644127e-06, + "loss": 0.6557, + "step": 145 + }, + { + "epoch": 3.12, + "grad_norm": 0.9291415214538574, + "learning_rate": 1.6648270254634511e-06, + "loss": 0.6219, + "step": 150 + }, + { + "epoch": 3.23, + "grad_norm": 0.9248765707015991, + "learning_rate": 1.6396023735624898e-06, + "loss": 0.6325, + "step": 155 + }, + { + "epoch": 3.33, + "grad_norm": 0.9842573404312134, + "learning_rate": 1.6143777216615284e-06, + "loss": 0.6521, + "step": 160 + }, + { + "epoch": 3.44, + "grad_norm": 0.8689214587211609, + "learning_rate": 1.589153069760567e-06, + "loss": 0.5929, + "step": 165 + }, + { + "epoch": 3.54, + "grad_norm": 1.0012000799179077, + "learning_rate": 1.5639284178596057e-06, + "loss": 0.584, + "step": 170 + }, + { + "epoch": 3.65, + "grad_norm": 0.7438368797302246, + "learning_rate": 1.5387037659586443e-06, + "loss": 0.6813, + "step": 175 + }, + { + "epoch": 3.75, + "grad_norm": 1.8603870868682861, + "learning_rate": 1.513479114057683e-06, + "loss": 0.6099, + "step": 180 + }, + { + "epoch": 3.85, + "grad_norm": 0.9918416738510132, + "learning_rate": 1.4882544621567216e-06, + "loss": 0.6192, + "step": 185 + }, + { + "epoch": 3.96, + "grad_norm": 1.9146322011947632, + "learning_rate": 1.4630298102557603e-06, + "loss": 0.6472, + "step": 190 + }, + { + "epoch": 4.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6818161010742188, + "eval_runtime": 1.3841, + "eval_samples_per_second": 46.239, + "eval_steps_per_second": 5.78, + "step": 192 + }, + { + "epoch": 4.06, + "grad_norm": 0.9502781629562378, + "learning_rate": 1.437805158354799e-06, + "loss": 0.6447, + "step": 195 + }, + { + "epoch": 4.17, + "grad_norm": 0.8570067286491394, + "learning_rate": 1.4125805064538375e-06, + "loss": 0.5306, + "step": 200 + }, + { + "epoch": 4.27, + "grad_norm": 0.8097484111785889, + "learning_rate": 1.3873558545528762e-06, + "loss": 0.6202, + "step": 205 + }, + { + "epoch": 4.38, + "grad_norm": 2.0106472969055176, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6705, + "step": 210 + }, + { + "epoch": 4.48, + "grad_norm": 1.090775489807129, + "learning_rate": 1.3369065507509533e-06, + "loss": 0.6297, + "step": 215 + }, + { + "epoch": 4.58, + "grad_norm": 0.8988145589828491, + "learning_rate": 1.311681898849992e-06, + "loss": 0.5896, + "step": 220 + }, + { + "epoch": 4.69, + "grad_norm": 0.9149978756904602, + "learning_rate": 1.2864572469490305e-06, + "loss": 0.6156, + "step": 225 + }, + { + "epoch": 4.79, + "grad_norm": 1.9398412704467773, + "learning_rate": 1.2612325950480692e-06, + "loss": 0.6305, + "step": 230 + }, + { + "epoch": 4.9, + "grad_norm": 0.9217966794967651, + "learning_rate": 1.2360079431471078e-06, + "loss": 0.5943, + "step": 235 + }, + { + "epoch": 5.0, + "grad_norm": 0.9083653688430786, + "learning_rate": 1.2107832912461465e-06, + "loss": 0.6386, + "step": 240 + }, + { + "epoch": 5.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6846389770507812, + "eval_runtime": 1.4094, + "eval_samples_per_second": 45.409, + "eval_steps_per_second": 5.676, + "step": 240 + }, + { + "epoch": 5.1, + "grad_norm": 0.9323675036430359, + "learning_rate": 1.1855586393451851e-06, + "loss": 0.5779, + "step": 245 + }, + { + "epoch": 5.21, + "grad_norm": 0.7549787163734436, + "learning_rate": 1.1603339874442238e-06, + "loss": 0.5948, + "step": 250 + }, + { + "epoch": 5.31, + "grad_norm": 0.8535837531089783, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6928, + "step": 255 + }, + { + "epoch": 5.42, + "grad_norm": 1.2038137912750244, + "learning_rate": 1.109884683642301e-06, + "loss": 0.5887, + "step": 260 + }, + { + "epoch": 5.52, + "grad_norm": 0.9501279592514038, + "learning_rate": 1.0846600317413395e-06, + "loss": 0.5776, + "step": 265 + }, + { + "epoch": 5.62, + "grad_norm": 0.7421719431877136, + "learning_rate": 1.0594353798403781e-06, + "loss": 0.6734, + "step": 270 + }, + { + "epoch": 5.73, + "grad_norm": 0.8555863499641418, + "learning_rate": 1.0342107279394168e-06, + "loss": 0.6399, + "step": 275 + }, + { + "epoch": 5.83, + "grad_norm": 0.8841156363487244, + "learning_rate": 1.0089860760384554e-06, + "loss": 0.6173, + "step": 280 + }, + { + "epoch": 5.94, + "grad_norm": 0.8565478324890137, + "learning_rate": 9.83761424137494e-07, + "loss": 0.5537, + "step": 285 + }, + { + "epoch": 6.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6864242553710938, + "eval_runtime": 1.3622, + "eval_samples_per_second": 46.983, + "eval_steps_per_second": 5.873, + "step": 288 + }, + { + "epoch": 6.04, + "grad_norm": 0.8750139474868774, + "learning_rate": 9.585367722365327e-07, + "loss": 0.5531, + "step": 290 + }, + { + "epoch": 6.15, + "grad_norm": 1.0445302724838257, + "learning_rate": 9.333121203355712e-07, + "loss": 0.638, + "step": 295 + }, + { + "epoch": 6.25, + "grad_norm": 0.7958914637565613, + "learning_rate": 9.080874684346099e-07, + "loss": 0.547, + "step": 300 + }, + { + "epoch": 6.35, + "grad_norm": 0.9992254376411438, + "learning_rate": 8.828628165336485e-07, + "loss": 0.6425, + "step": 305 + }, + { + "epoch": 6.46, + "grad_norm": 0.8400682806968689, + "learning_rate": 8.576381646326871e-07, + "loss": 0.6955, + "step": 310 + }, + { + "epoch": 6.56, + "grad_norm": 0.742438793182373, + "learning_rate": 8.324135127317256e-07, + "loss": 0.6473, + "step": 315 + }, + { + "epoch": 6.67, + "grad_norm": 0.6693254113197327, + "learning_rate": 8.071888608307642e-07, + "loss": 0.603, + "step": 320 + }, + { + "epoch": 6.77, + "grad_norm": 1.0816401243209839, + "learning_rate": 7.819642089298028e-07, + "loss": 0.6053, + "step": 325 + }, + { + "epoch": 6.88, + "grad_norm": 0.7275277376174927, + "learning_rate": 7.567395570288415e-07, + "loss": 0.612, + "step": 330 + }, + { + "epoch": 6.98, + "grad_norm": 0.7834873795509338, + "learning_rate": 7.315149051278801e-07, + "loss": 0.55, + "step": 335 + }, + { + "epoch": 7.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6889228820800781, + "eval_runtime": 1.3769, + "eval_samples_per_second": 46.483, + "eval_steps_per_second": 5.81, + "step": 336 + }, + { + "epoch": 7.08, + "grad_norm": 1.24147367477417, + "learning_rate": 7.062902532269188e-07, + "loss": 0.508, + "step": 340 + }, + { + "epoch": 7.19, + "grad_norm": 1.8932181596755981, + "learning_rate": 6.810656013259573e-07, + "loss": 0.6358, + "step": 345 + }, + { + "epoch": 7.29, + "grad_norm": 1.861436128616333, + "learning_rate": 6.55840949424996e-07, + "loss": 0.5741, + "step": 350 + }, + { + "epoch": 7.4, + "grad_norm": 0.8429200053215027, + "learning_rate": 6.306162975240346e-07, + "loss": 0.5717, + "step": 355 + }, + { + "epoch": 7.5, + "grad_norm": 1.8665741682052612, + "learning_rate": 6.053916456230732e-07, + "loss": 0.6992, + "step": 360 + }, + { + "epoch": 7.6, + "grad_norm": 2.312748908996582, + "learning_rate": 5.801669937221119e-07, + "loss": 0.6151, + "step": 365 + }, + { + "epoch": 7.71, + "grad_norm": 1.1628329753875732, + "learning_rate": 5.549423418211505e-07, + "loss": 0.5354, + "step": 370 + }, + { + "epoch": 7.81, + "grad_norm": 1.8674992322921753, + "learning_rate": 5.297176899201891e-07, + "loss": 0.6411, + "step": 375 + }, + { + "epoch": 7.92, + "grad_norm": 0.7112137675285339, + "learning_rate": 5.044930380192277e-07, + "loss": 0.6063, + "step": 380 + }, + { + "epoch": 8.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6909217834472656, + "eval_runtime": 1.3913, + "eval_samples_per_second": 45.999, + "eval_steps_per_second": 5.75, + "step": 384 + }, + { + "epoch": 8.02, + "grad_norm": 1.201416015625, + "learning_rate": 4.792683861182663e-07, + "loss": 0.647, + "step": 385 + }, + { + "epoch": 8.12, + "grad_norm": 0.9348795413970947, + "learning_rate": 4.5404373421730493e-07, + "loss": 0.5356, + "step": 390 + }, + { + "epoch": 8.23, + "grad_norm": 1.895739197731018, + "learning_rate": 4.2881908231634357e-07, + "loss": 0.5384, + "step": 395 + }, + { + "epoch": 8.33, + "grad_norm": 1.8363467454910278, + "learning_rate": 4.035944304153821e-07, + "loss": 0.541, + "step": 400 + }, + { + "epoch": 8.44, + "grad_norm": 2.475804090499878, + "learning_rate": 3.7836977851442075e-07, + "loss": 0.5503, + "step": 405 + }, + { + "epoch": 8.54, + "grad_norm": 1.314663290977478, + "learning_rate": 3.531451266134594e-07, + "loss": 0.575, + "step": 410 + }, + { + "epoch": 8.65, + "grad_norm": 1.850918173789978, + "learning_rate": 3.27920474712498e-07, + "loss": 0.6658, + "step": 415 + }, + { + "epoch": 8.75, + "grad_norm": 0.7412477135658264, + "learning_rate": 3.026958228115366e-07, + "loss": 0.5632, + "step": 420 + }, + { + "epoch": 8.85, + "grad_norm": 1.3282320499420166, + "learning_rate": 2.7747117091057526e-07, + "loss": 0.7303, + "step": 425 + }, + { + "epoch": 8.96, + "grad_norm": 1.019906759262085, + "learning_rate": 2.5224651900961385e-07, + "loss": 0.7438, + "step": 430 + }, + { + "epoch": 9.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6923065185546875, + "eval_runtime": 1.3679, + "eval_samples_per_second": 46.786, + "eval_steps_per_second": 5.848, + "step": 432 + }, + { + "epoch": 9.06, + "grad_norm": 1.0898264646530151, + "learning_rate": 2.2702186710865246e-07, + "loss": 0.6015, + "step": 435 + }, + { + "epoch": 9.17, + "grad_norm": 0.8358483910560608, + "learning_rate": 2.0179721520769105e-07, + "loss": 0.4656, + "step": 440 + }, + { + "epoch": 9.27, + "grad_norm": 0.6350271105766296, + "learning_rate": 1.765725633067297e-07, + "loss": 0.573, + "step": 445 + }, + { + "epoch": 9.38, + "grad_norm": 1.2606990337371826, + "learning_rate": 1.513479114057683e-07, + "loss": 0.5949, + "step": 450 + }, + { + "epoch": 9.48, + "grad_norm": 0.7057228088378906, + "learning_rate": 1.2612325950480692e-07, + "loss": 0.6387, + "step": 455 + }, + { + "epoch": 9.58, + "grad_norm": 0.7172160744667053, + "learning_rate": 1.0089860760384553e-07, + "loss": 0.7169, + "step": 460 + }, + { + "epoch": 9.69, + "grad_norm": 0.844514787197113, + "learning_rate": 7.567395570288415e-08, + "loss": 0.5349, + "step": 465 + }, + { + "epoch": 9.79, + "grad_norm": 1.9568054676055908, + "learning_rate": 5.044930380192276e-08, + "loss": 0.5772, + "step": 470 + }, + { + "epoch": 9.9, + "grad_norm": 0.8768876791000366, + "learning_rate": 2.522465190096138e-08, + "loss": 0.6347, + "step": 475 + }, + { + "epoch": 10.0, + "grad_norm": 2.576140880584717, + "learning_rate": 0.0, + "loss": 0.633, + "step": 480 + }, + { + "epoch": 10.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6924400329589844, + "eval_runtime": 1.3726, + "eval_samples_per_second": 46.626, + "eval_steps_per_second": 5.828, + "step": 480 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 7306519200387600.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 2.1794099242430636e-06, + "per_device_train_batch_size": 4 + } +} diff --git a/run-4/checkpoint-480/training_args.bin b/run-4/checkpoint-480/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2bb1c33e4434a3bb752764ab093170996801fe72 --- /dev/null +++ b/run-4/checkpoint-480/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3 +size 4920 diff --git a/run-4/checkpoint-96/config.json b/run-4/checkpoint-96/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-4/checkpoint-96/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-4/checkpoint-96/model.safetensors b/run-4/checkpoint-96/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c2b8cd562ea6dcbe871747b14186312ae20d1c74 --- /dev/null +++ b/run-4/checkpoint-96/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e59874c674ef1e88ec3d04520746ac141416e5dadff4a6fbe708b712d19d5b78 +size 94763496 diff --git a/run-4/checkpoint-96/optimizer.pt b/run-4/checkpoint-96/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bbb8637443fcf4e610a24c501c4c39179aacb2d8 --- /dev/null +++ b/run-4/checkpoint-96/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89b3770155bbe1e8df4ab2ec0bfbd16f490e7a3a4e5b6fe431a8389dbe5dbc04 +size 189552570 diff --git a/run-4/checkpoint-96/preprocessor_config.json b/run-4/checkpoint-96/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-4/checkpoint-96/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-4/checkpoint-96/rng_state.pth b/run-4/checkpoint-96/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5df9532d48eec28233ca1958234673b2505309f1 --- /dev/null +++ b/run-4/checkpoint-96/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39dbf03bf644af79257aec95c925042cb81a469bfcc7a839a95d68f1d0425513 +size 14244 diff --git a/run-4/checkpoint-96/scheduler.pt b/run-4/checkpoint-96/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d511fb6872f477f712ce1180da13d5b7c8e7c58 --- /dev/null +++ b/run-4/checkpoint-96/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dadd42b117ea47d6ed1be5b0a501ded55edf4cbe33128cd9dc2dc2f9fa2b2576 +size 1064 diff --git a/run-4/checkpoint-96/trainer_state.json b/run-4/checkpoint-96/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d1aa4e9fd3ab35dec0a379f529b315c00a6a6d2a --- /dev/null +++ b/run-4/checkpoint-96/trainer_state.json @@ -0,0 +1,175 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-4/checkpoint-96", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 96, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1, + "grad_norm": 1.2733114957809448, + "learning_rate": 2.2702186710865246e-07, + "loss": 0.7025, + "step": 5 + }, + { + "epoch": 0.21, + "grad_norm": 1.243804931640625, + "learning_rate": 4.5404373421730493e-07, + "loss": 0.6974, + "step": 10 + }, + { + "epoch": 0.31, + "grad_norm": 1.7711552381515503, + "learning_rate": 6.810656013259573e-07, + "loss": 0.696, + "step": 15 + }, + { + "epoch": 0.42, + "grad_norm": 1.1453403234481812, + "learning_rate": 9.080874684346099e-07, + "loss": 0.6989, + "step": 20 + }, + { + "epoch": 0.52, + "grad_norm": 1.2729355096817017, + "learning_rate": 1.1351093355432624e-06, + "loss": 0.6968, + "step": 25 + }, + { + "epoch": 0.62, + "grad_norm": 1.1592165231704712, + "learning_rate": 1.3621312026519146e-06, + "loss": 0.6959, + "step": 30 + }, + { + "epoch": 0.73, + "grad_norm": 1.1798148155212402, + "learning_rate": 1.589153069760567e-06, + "loss": 0.6952, + "step": 35 + }, + { + "epoch": 0.83, + "grad_norm": 2.1216671466827393, + "learning_rate": 1.8161749368692197e-06, + "loss": 0.6886, + "step": 40 + }, + { + "epoch": 0.94, + "grad_norm": 1.3416370153427124, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6864, + "step": 45 + }, + { + "epoch": 1.0, + "eval_f1": 0.72, + "eval_loss": 0.688262939453125, + "eval_runtime": 1.3468, + "eval_samples_per_second": 47.521, + "eval_steps_per_second": 5.94, + "step": 48 + }, + { + "epoch": 1.04, + "grad_norm": 2.1856281757354736, + "learning_rate": 2.169320063482679e-06, + "loss": 0.6917, + "step": 50 + }, + { + "epoch": 1.15, + "grad_norm": 1.4077153205871582, + "learning_rate": 2.1440954115817176e-06, + "loss": 0.6884, + "step": 55 + }, + { + "epoch": 1.25, + "grad_norm": 2.1792664527893066, + "learning_rate": 2.1188707596807562e-06, + "loss": 0.6668, + "step": 60 + }, + { + "epoch": 1.35, + "grad_norm": 1.0386197566986084, + "learning_rate": 2.093646107779795e-06, + "loss": 0.6694, + "step": 65 + }, + { + "epoch": 1.46, + "grad_norm": 2.0565919876098633, + "learning_rate": 2.0684214558788335e-06, + "loss": 0.6561, + "step": 70 + }, + { + "epoch": 1.56, + "grad_norm": 1.2978509664535522, + "learning_rate": 2.043196803977872e-06, + "loss": 0.6789, + "step": 75 + }, + { + "epoch": 1.67, + "grad_norm": 2.058328628540039, + "learning_rate": 2.0179721520769108e-06, + "loss": 0.6633, + "step": 80 + }, + { + "epoch": 1.77, + "grad_norm": 0.6023226976394653, + "learning_rate": 1.9927475001759494e-06, + "loss": 0.6655, + "step": 85 + }, + { + "epoch": 1.88, + "grad_norm": 0.5510762929916382, + "learning_rate": 1.967522848274988e-06, + "loss": 0.6622, + "step": 90 + }, + { + "epoch": 1.98, + "grad_norm": 1.098602533340454, + "learning_rate": 1.9422981963740267e-06, + "loss": 0.6633, + "step": 95 + }, + { + "epoch": 2.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6816024780273438, + "eval_runtime": 1.3765, + "eval_samples_per_second": 46.493, + "eval_steps_per_second": 5.812, + "step": 96 + } + ], + "logging_steps": 5, + "max_steps": 480, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 1442567462539200.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": { + "learning_rate": 2.1794099242430636e-06, + "per_device_train_batch_size": 4 + } +} diff --git a/run-4/checkpoint-96/training_args.bin b/run-4/checkpoint-96/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2bb1c33e4434a3bb752764ab093170996801fe72 --- /dev/null +++ b/run-4/checkpoint-96/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3 +size 4920 diff --git a/run-5/checkpoint-24/config.json b/run-5/checkpoint-24/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8a488f827577f57ac81420fa1b836ae1d5ceaeb --- /dev/null +++ b/run-5/checkpoint-24/config.json @@ -0,0 +1,80 @@ +{ + "_name_or_path": "ntu-spml/distilhubert", + "activation_dropout": 0.1, + "apply_spec_augment": false, + "architectures": [ + "HubertForSequenceClassification" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], + "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": false, + "final_dropout": 0.0, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "id2label": { + "0": "NOT_WORD", + "1": "WORD" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "NOT_WORD": "0", + "WORD": "1" + }, + "layer_norm_eps": 1e-05, + "layerdrop": 0.0, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 2, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.38.1", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/run-5/checkpoint-24/model.safetensors b/run-5/checkpoint-24/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9877a03ed7ffd01a15db65fc288d2a8b522a91f9 --- /dev/null +++ b/run-5/checkpoint-24/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1ed6ffdb684cc537f47f54766448db627495bfcb076b58f5bd92f9f75833010 +size 94763496 diff --git a/run-5/checkpoint-24/optimizer.pt b/run-5/checkpoint-24/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a96c24d03052270a79066b7eed85b6e55b7a202f --- /dev/null +++ b/run-5/checkpoint-24/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46cfd96302677a11e6dbf252455843d85d90ae5ce79e6886537d090129a18f23 +size 189552570 diff --git a/run-5/checkpoint-24/preprocessor_config.json b/run-5/checkpoint-24/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..36ebe8b7c1cc967b3059f0494ae8a1069dd67655 --- /dev/null +++ b/run-5/checkpoint-24/preprocessor_config.json @@ -0,0 +1,9 @@ +{ + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0, + "return_attention_mask": true, + "sampling_rate": 16000 +} diff --git a/run-5/checkpoint-24/rng_state.pth b/run-5/checkpoint-24/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0dfccad4abff54de4d7ebc9ca77c59ef24efaf5b --- /dev/null +++ b/run-5/checkpoint-24/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5fd464cfdc34ccc4a78e84a563ff84847a7f2ca68f8d1be703c5be378ce9c86 +size 14244 diff --git a/run-5/checkpoint-24/scheduler.pt b/run-5/checkpoint-24/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..16c1a4ef8bf76abfb35c6b546fed922c5ac7ab96 --- /dev/null +++ b/run-5/checkpoint-24/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:297682112b80eec04bb767686fcb39dbc89dc637c30fbeb9e914038917acc34e +size 1064 diff --git a/run-5/checkpoint-24/trainer_state.json b/run-5/checkpoint-24/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..62b4675fd4f5761ed06eee0c86520665019ecd15 --- /dev/null +++ b/run-5/checkpoint-24/trainer_state.json @@ -0,0 +1,61 @@ +{ + "best_metric": 0.7326732673267327, + "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-5/checkpoint-24", + "epoch": 1.0, + "eval_steps": 500, + "global_step": 24, + "is_hyper_param_search": true, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.21, + "grad_norm": 1.6560618877410889, + "learning_rate": 9.928194518007172e-07, + "loss": 0.6997, + "step": 5 + }, + { + "epoch": 0.42, + "grad_norm": 0.6409028768539429, + "learning_rate": 1.9856389036014343e-06, + "loss": 0.6971, + "step": 10 + }, + { + "epoch": 0.62, + "grad_norm": 0.6359347105026245, + "learning_rate": 2.978458355402151e-06, + "loss": 0.6953, + "step": 15 + }, + { + "epoch": 0.83, + "grad_norm": 1.56245756149292, + "learning_rate": 3.971277807202869e-06, + "loss": 0.6889, + "step": 20 + }, + { + "epoch": 1.0, + "eval_f1": 0.7326732673267327, + "eval_loss": 0.6865768432617188, + "eval_runtime": 1.3652, + "eval_samples_per_second": 46.879, + "eval_steps_per_second": 5.86, + "step": 24 + } + ], + "logging_steps": 5, + "max_steps": 240, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "total_flos": 670768569859200.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": { + "learning_rate": 4.765533368643442e-06, + "per_device_train_batch_size": 8 + } +} diff --git a/run-5/checkpoint-24/training_args.bin b/run-5/checkpoint-24/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..35fa0af60e4dca57930333ada64bcb1b3d5e32fe --- /dev/null +++ b/run-5/checkpoint-24/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:064d1923d7ff14a10cb0625f0bc3dd479a67056d28f933cdfbfdecd320e675db +size 4920 diff --git a/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492181.ca56ea9bc35e.3883.6 b/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492181.ca56ea9bc35e.3883.6 new file mode 100644 index 0000000000000000000000000000000000000000..16fb912160797863673185fe3c6ca4aa91ca7c30 --- /dev/null +++ b/runs/Mar03_18-24-09_ca56ea9bc35e/events.out.tfevents.1709492181.ca56ea9bc35e.3883.6 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ed63f7eaaa83f1a69757533ce7d70676a1ffcedee223fdf2ff2fd988f12782a +size 15943 diff --git a/training_args.bin b/training_args.bin index 2bb1c33e4434a3bb752764ab093170996801fe72..35fa0af60e4dca57930333ada64bcb1b3d5e32fe 100644 --- a/training_args.bin +++ b/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6eadd8415a707c8e4de7440233358b89448cac114a2ada36aef1d9186553c0c3 +oid sha256:064d1923d7ff14a10cb0625f0bc3dd479a67056d28f933cdfbfdecd320e675db size 4920