{ "best_metric": 0.74, "best_model_checkpoint": "distilhubert-finetuned-not-a-word2/run-8/checkpoint-168", "epoch": 9.0, "eval_steps": 500, "global_step": 216, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.21, "grad_norm": 1.6496813297271729, "learning_rate": 2.9441483262927863e-06, "loss": 0.6992, "step": 5 }, { "epoch": 0.42, "grad_norm": 0.620004415512085, "learning_rate": 5.888296652585573e-06, "loss": 0.6939, "step": 10 }, { "epoch": 0.62, "grad_norm": 0.5726878046989441, "learning_rate": 8.832444978878358e-06, "loss": 0.6835, "step": 15 }, { "epoch": 0.83, "grad_norm": 1.4239176511764526, "learning_rate": 1.1776593305171145e-05, "loss": 0.6673, "step": 20 }, { "epoch": 1.0, "eval_f1": 0.7326732673267327, "eval_loss": 0.6805419921875, "eval_runtime": 1.3716, "eval_samples_per_second": 46.661, "eval_steps_per_second": 5.833, "step": 24 }, { "epoch": 1.04, "grad_norm": 0.9195191264152527, "learning_rate": 1.406648644784331e-05, "loss": 0.6673, "step": 25 }, { "epoch": 1.25, "grad_norm": 1.8524231910705566, "learning_rate": 1.3739358856033002e-05, "loss": 0.6155, "step": 30 }, { "epoch": 1.46, "grad_norm": 1.8213531970977783, "learning_rate": 1.3412231264222692e-05, "loss": 0.5895, "step": 35 }, { "epoch": 1.67, "grad_norm": 0.4818130433559418, "learning_rate": 1.3085103672412383e-05, "loss": 0.6468, "step": 40 }, { "epoch": 1.88, "grad_norm": 0.6597484946250916, "learning_rate": 1.2757976080602073e-05, "loss": 0.6173, "step": 45 }, { "epoch": 2.0, "eval_f1": 0.7326732673267327, "eval_loss": 0.6970634460449219, "eval_runtime": 1.3632, "eval_samples_per_second": 46.949, "eval_steps_per_second": 5.869, "step": 48 }, { "epoch": 2.08, "grad_norm": 0.48903289437294006, "learning_rate": 1.2430848488791764e-05, "loss": 0.6302, "step": 50 }, { "epoch": 2.29, "grad_norm": 0.6064260601997375, "learning_rate": 1.2103720896981454e-05, "loss": 0.5867, "step": 55 }, { "epoch": 2.5, "grad_norm": 0.6802453398704529, "learning_rate": 1.1776593305171145e-05, "loss": 0.6321, "step": 60 }, { "epoch": 2.71, "grad_norm": 1.2592875957489014, "learning_rate": 1.1449465713360835e-05, "loss": 0.6223, "step": 65 }, { "epoch": 2.92, "grad_norm": 1.1591824293136597, "learning_rate": 1.1122338121550526e-05, "loss": 0.4922, "step": 70 }, { "epoch": 3.0, "eval_f1": 0.7326732673267327, "eval_loss": 0.7079887390136719, "eval_runtime": 1.3669, "eval_samples_per_second": 46.821, "eval_steps_per_second": 5.853, "step": 72 }, { "epoch": 3.12, "grad_norm": 0.5332023501396179, "learning_rate": 1.0795210529740214e-05, "loss": 0.5989, "step": 75 }, { "epoch": 3.33, "grad_norm": 0.5555600523948669, "learning_rate": 1.0468082937929906e-05, "loss": 0.6104, "step": 80 }, { "epoch": 3.54, "grad_norm": 1.2928024530410767, "learning_rate": 1.0140955346119596e-05, "loss": 0.4936, "step": 85 }, { "epoch": 3.75, "grad_norm": 1.1424989700317383, "learning_rate": 9.813827754309287e-06, "loss": 0.6191, "step": 90 }, { "epoch": 3.96, "grad_norm": 1.119732141494751, "learning_rate": 9.486700162498977e-06, "loss": 0.6004, "step": 95 }, { "epoch": 4.0, "eval_f1": 0.7326732673267327, "eval_loss": 0.7053489685058594, "eval_runtime": 1.3556, "eval_samples_per_second": 47.211, "eval_steps_per_second": 5.901, "step": 96 }, { "epoch": 4.17, "grad_norm": 0.8135461211204529, "learning_rate": 9.159572570688668e-06, "loss": 0.5154, "step": 100 }, { "epoch": 4.38, "grad_norm": 1.8034342527389526, "learning_rate": 8.832444978878358e-06, "loss": 0.6067, "step": 105 }, { "epoch": 
4.58, "grad_norm": 0.8029685020446777, "learning_rate": 8.505317387068049e-06, "loss": 0.5499, "step": 110 }, { "epoch": 4.79, "grad_norm": 1.019626259803772, "learning_rate": 8.178189795257739e-06, "loss": 0.5542, "step": 115 }, { "epoch": 5.0, "grad_norm": 1.861674427986145, "learning_rate": 7.85106220344743e-06, "loss": 0.5545, "step": 120 }, { "epoch": 5.0, "eval_f1": 0.7326732673267327, "eval_loss": 0.6832504272460938, "eval_runtime": 1.3811, "eval_samples_per_second": 46.341, "eval_steps_per_second": 5.793, "step": 120 }, { "epoch": 5.21, "grad_norm": 1.5949212312698364, "learning_rate": 7.523934611637121e-06, "loss": 0.4806, "step": 125 }, { "epoch": 5.42, "grad_norm": 3.002861738204956, "learning_rate": 7.196807019826811e-06, "loss": 0.5832, "step": 130 }, { "epoch": 5.62, "grad_norm": 1.4606820344924927, "learning_rate": 6.9351049463785626e-06, "loss": 0.5481, "step": 135 }, { "epoch": 5.83, "grad_norm": 1.6088628768920898, "learning_rate": 6.607977354568253e-06, "loss": 0.5333, "step": 140 }, { "epoch": 6.0, "eval_f1": 0.7326732673267327, "eval_loss": 0.6599597930908203, "eval_runtime": 1.3747, "eval_samples_per_second": 46.555, "eval_steps_per_second": 5.819, "step": 144 }, { "epoch": 6.04, "grad_norm": 2.2051286697387695, "learning_rate": 6.280849762757943e-06, "loss": 0.4805, "step": 145 }, { "epoch": 6.25, "grad_norm": 1.6964988708496094, "learning_rate": 5.953722170947633e-06, "loss": 0.4573, "step": 150 }, { "epoch": 6.46, "grad_norm": 2.1374056339263916, "learning_rate": 5.626594579137324e-06, "loss": 0.59, "step": 155 }, { "epoch": 6.67, "grad_norm": 1.8037084341049194, "learning_rate": 5.299466987327015e-06, "loss": 0.494, "step": 160 }, { "epoch": 6.88, "grad_norm": 1.7916295528411865, "learning_rate": 4.972339395516706e-06, "loss": 0.4997, "step": 165 }, { "epoch": 7.0, "eval_f1": 0.74, "eval_loss": 0.6653976440429688, "eval_runtime": 1.3593, "eval_samples_per_second": 47.083, "eval_steps_per_second": 5.885, "step": 168 }, { "epoch": 7.08, "grad_norm": 2.2235190868377686, "learning_rate": 4.645211803706396e-06, "loss": 0.3887, "step": 170 }, { "epoch": 7.29, "grad_norm": 1.239268183708191, "learning_rate": 4.318084211896087e-06, "loss": 0.4519, "step": 175 }, { "epoch": 7.5, "grad_norm": 1.8677798509597778, "learning_rate": 3.990956620085777e-06, "loss": 0.5146, "step": 180 }, { "epoch": 7.71, "grad_norm": 3.8495407104492188, "learning_rate": 3.6638290282754668e-06, "loss": 0.4237, "step": 185 }, { "epoch": 7.92, "grad_norm": 1.8828785419464111, "learning_rate": 3.3367014364651573e-06, "loss": 0.5033, "step": 190 }, { "epoch": 8.0, "eval_f1": 0.74, "eval_loss": 0.6842975616455078, "eval_runtime": 1.3685, "eval_samples_per_second": 46.767, "eval_steps_per_second": 5.846, "step": 192 }, { "epoch": 8.12, "grad_norm": 4.3374714851379395, "learning_rate": 3.009573844654848e-06, "loss": 0.4588, "step": 195 }, { "epoch": 8.33, "grad_norm": 2.9799509048461914, "learning_rate": 2.6824462528445384e-06, "loss": 0.3783, "step": 200 }, { "epoch": 8.54, "grad_norm": 3.3768601417541504, "learning_rate": 2.3553186610342286e-06, "loss": 0.4366, "step": 205 }, { "epoch": 8.75, "grad_norm": 2.2495288848876953, "learning_rate": 2.028191069223919e-06, "loss": 0.4545, "step": 210 }, { "epoch": 8.96, "grad_norm": 2.078002691268921, "learning_rate": 1.7010634774136097e-06, "loss": 0.6012, "step": 215 }, { "epoch": 9.0, "eval_f1": 0.74, "eval_loss": 0.6836881637573242, "eval_runtime": 1.3545, "eval_samples_per_second": 47.25, "eval_steps_per_second": 5.906, "step": 216 } ], "logging_steps": 5, 
"max_steps": 240, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 7545814381042464.0, "train_batch_size": 8, "trial_name": null, "trial_params": { "learning_rate": 1.4131911966205373e-05, "per_device_train_batch_size": 8 } }