{ "best_metric": 0.00016613505431450903, "best_model_checkpoint": "./results/checkpoint-500", "epoch": 4.62962962962963, "eval_steps": 20, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18518518518518517, "grad_norm": 15.828068733215332, "learning_rate": 1.925925925925926e-05, "loss": 0.3468, "step": 20 }, { "epoch": 0.18518518518518517, "eval_accuracy": 0.9019607843137255, "eval_loss": 0.20517832040786743, "eval_runtime": 0.628, "eval_samples_per_second": 81.206, "eval_steps_per_second": 20.699, "step": 20 }, { "epoch": 0.37037037037037035, "grad_norm": 9.892717361450195, "learning_rate": 1.851851851851852e-05, "loss": 0.2089, "step": 40 }, { "epoch": 0.37037037037037035, "eval_accuracy": 0.9215686274509803, "eval_loss": 0.12760986387729645, "eval_runtime": 0.6405, "eval_samples_per_second": 79.628, "eval_steps_per_second": 20.297, "step": 40 }, { "epoch": 0.5555555555555556, "grad_norm": 0.21783480048179626, "learning_rate": 1.7777777777777777e-05, "loss": 0.214, "step": 60 }, { "epoch": 0.5555555555555556, "eval_accuracy": 0.9803921568627451, "eval_loss": 0.02514837123453617, "eval_runtime": 0.632, "eval_samples_per_second": 80.694, "eval_steps_per_second": 20.569, "step": 60 }, { "epoch": 0.7407407407407407, "grad_norm": 0.05077819526195526, "learning_rate": 1.7037037037037038e-05, "loss": 0.0807, "step": 80 }, { "epoch": 0.7407407407407407, "eval_accuracy": 0.9803921568627451, "eval_loss": 0.016518110409379005, "eval_runtime": 0.629, "eval_samples_per_second": 81.079, "eval_steps_per_second": 20.667, "step": 80 }, { "epoch": 0.9259259259259259, "grad_norm": 0.04394271969795227, "learning_rate": 1.6296296296296297e-05, "loss": 0.1064, "step": 100 }, { "epoch": 0.9259259259259259, "eval_accuracy": 1.0, "eval_loss": 0.0030651080887764692, "eval_runtime": 0.6365, "eval_samples_per_second": 80.124, "eval_steps_per_second": 20.424, "step": 100 }, { "epoch": 1.1111111111111112, "grad_norm": 0.08775356411933899, "learning_rate": 1.555555555555556e-05, "loss": 0.0134, "step": 120 }, { "epoch": 1.1111111111111112, "eval_accuracy": 1.0, "eval_loss": 0.002504199743270874, "eval_runtime": 0.6271, "eval_samples_per_second": 81.321, "eval_steps_per_second": 20.729, "step": 120 }, { "epoch": 1.2962962962962963, "grad_norm": 0.028400583192706108, "learning_rate": 1.4814814814814815e-05, "loss": 0.0349, "step": 140 }, { "epoch": 1.2962962962962963, "eval_accuracy": 1.0, "eval_loss": 0.0006583676440641284, "eval_runtime": 0.6233, "eval_samples_per_second": 81.817, "eval_steps_per_second": 20.855, "step": 140 }, { "epoch": 1.4814814814814814, "grad_norm": 0.00876949355006218, "learning_rate": 1.4074074074074075e-05, "loss": 0.0819, "step": 160 }, { "epoch": 1.4814814814814814, "eval_accuracy": 0.9607843137254902, "eval_loss": 0.07063630223274231, "eval_runtime": 0.6218, "eval_samples_per_second": 82.021, "eval_steps_per_second": 20.907, "step": 160 }, { "epoch": 1.6666666666666665, "grad_norm": 0.014950844459235668, "learning_rate": 1.3333333333333333e-05, "loss": 0.0586, "step": 180 }, { "epoch": 1.6666666666666665, "eval_accuracy": 1.0, "eval_loss": 0.0005163149326108396, "eval_runtime": 0.6311, "eval_samples_per_second": 80.809, "eval_steps_per_second": 20.598, "step": 180 }, { "epoch": 1.8518518518518519, "grad_norm": 0.01813225820660591, "learning_rate": 1.2592592592592593e-05, "loss": 0.0368, "step": 200 }, { "epoch": 1.8518518518518519, "eval_accuracy": 1.0, "eval_loss": 0.0004809926904272288, "eval_runtime": 0.6213, "eval_samples_per_second": 82.09, "eval_steps_per_second": 20.925, "step": 200 }, { "epoch": 2.037037037037037, "grad_norm": 0.018220530822873116, "learning_rate": 1.1851851851851852e-05, "loss": 0.0485, "step": 220 }, { "epoch": 2.037037037037037, "eval_accuracy": 1.0, "eval_loss": 0.003055064007639885, "eval_runtime": 0.6259, "eval_samples_per_second": 81.483, "eval_steps_per_second": 20.77, "step": 220 }, { "epoch": 2.2222222222222223, "grad_norm": 0.012761042453348637, "learning_rate": 1.1111111111111113e-05, "loss": 0.017, "step": 240 }, { "epoch": 2.2222222222222223, "eval_accuracy": 1.0, "eval_loss": 0.0003475048579275608, "eval_runtime": 0.6263, "eval_samples_per_second": 81.436, "eval_steps_per_second": 20.758, "step": 240 }, { "epoch": 2.4074074074074074, "grad_norm": 0.012553339824080467, "learning_rate": 1.037037037037037e-05, "loss": 0.0017, "step": 260 }, { "epoch": 2.4074074074074074, "eval_accuracy": 1.0, "eval_loss": 0.0020730593241751194, "eval_runtime": 0.623, "eval_samples_per_second": 81.864, "eval_steps_per_second": 20.867, "step": 260 }, { "epoch": 2.5925925925925926, "grad_norm": 0.009718580171465874, "learning_rate": 9.62962962962963e-06, "loss": 0.0004, "step": 280 }, { "epoch": 2.5925925925925926, "eval_accuracy": 1.0, "eval_loss": 0.0004216369998175651, "eval_runtime": 0.6294, "eval_samples_per_second": 81.028, "eval_steps_per_second": 20.654, "step": 280 }, { "epoch": 2.7777777777777777, "grad_norm": 0.009635565802454948, "learning_rate": 8.888888888888888e-06, "loss": 0.0444, "step": 300 }, { "epoch": 2.7777777777777777, "eval_accuracy": 1.0, "eval_loss": 0.00025334919337183237, "eval_runtime": 0.6299, "eval_samples_per_second": 80.96, "eval_steps_per_second": 20.637, "step": 300 }, { "epoch": 2.962962962962963, "grad_norm": 0.005175964906811714, "learning_rate": 8.148148148148148e-06, "loss": 0.0005, "step": 320 }, { "epoch": 2.962962962962963, "eval_accuracy": 1.0, "eval_loss": 0.00028179946821182966, "eval_runtime": 0.6242, "eval_samples_per_second": 81.709, "eval_steps_per_second": 20.828, "step": 320 }, { "epoch": 3.148148148148148, "grad_norm": 0.008158649317920208, "learning_rate": 7.4074074074074075e-06, "loss": 0.0003, "step": 340 }, { "epoch": 3.148148148148148, "eval_accuracy": 1.0, "eval_loss": 0.0002456614456605166, "eval_runtime": 0.6258, "eval_samples_per_second": 81.502, "eval_steps_per_second": 20.775, "step": 340 }, { "epoch": 3.3333333333333335, "grad_norm": 0.004232426173985004, "learning_rate": 6.666666666666667e-06, "loss": 0.0238, "step": 360 }, { "epoch": 3.3333333333333335, "eval_accuracy": 1.0, "eval_loss": 0.00026214588433504105, "eval_runtime": 0.6234, "eval_samples_per_second": 81.814, "eval_steps_per_second": 20.855, "step": 360 }, { "epoch": 3.5185185185185186, "grad_norm": 0.004335971549153328, "learning_rate": 5.925925925925926e-06, "loss": 0.0003, "step": 380 }, { "epoch": 3.5185185185185186, "eval_accuracy": 1.0, "eval_loss": 0.0002457990194670856, "eval_runtime": 0.6226, "eval_samples_per_second": 81.91, "eval_steps_per_second": 20.879, "step": 380 }, { "epoch": 3.7037037037037037, "grad_norm": 0.007228133734315634, "learning_rate": 5.185185185185185e-06, "loss": 0.0003, "step": 400 }, { "epoch": 3.7037037037037037, "eval_accuracy": 1.0, "eval_loss": 0.0002311759744770825, "eval_runtime": 0.6205, "eval_samples_per_second": 82.185, "eval_steps_per_second": 20.949, "step": 400 }, { "epoch": 3.888888888888889, "grad_norm": 0.004542021080851555, "learning_rate": 4.444444444444444e-06, "loss": 0.0003, "step": 420 }, { "epoch": 3.888888888888889, "eval_accuracy": 1.0, "eval_loss": 0.0002200859016738832, "eval_runtime": 0.6236, "eval_samples_per_second": 81.782, "eval_steps_per_second": 20.846, "step": 420 }, { "epoch": 4.074074074074074, "grad_norm": 0.004315598402172327, "learning_rate": 3.7037037037037037e-06, "loss": 0.0007, "step": 440 }, { "epoch": 4.074074074074074, "eval_accuracy": 1.0, "eval_loss": 0.0001800174795789644, "eval_runtime": 0.6201, "eval_samples_per_second": 82.247, "eval_steps_per_second": 20.965, "step": 440 }, { "epoch": 4.2592592592592595, "grad_norm": 0.014702328480780125, "learning_rate": 2.962962962962963e-06, "loss": 0.0002, "step": 460 }, { "epoch": 4.2592592592592595, "eval_accuracy": 1.0, "eval_loss": 0.00017176676192320883, "eval_runtime": 0.6211, "eval_samples_per_second": 82.115, "eval_steps_per_second": 20.931, "step": 460 }, { "epoch": 4.444444444444445, "grad_norm": 0.005943182855844498, "learning_rate": 2.222222222222222e-06, "loss": 0.0003, "step": 480 }, { "epoch": 4.444444444444445, "eval_accuracy": 1.0, "eval_loss": 0.00016821030294522643, "eval_runtime": 0.6229, "eval_samples_per_second": 81.877, "eval_steps_per_second": 20.871, "step": 480 }, { "epoch": 4.62962962962963, "grad_norm": 0.036252692341804504, "learning_rate": 1.4814814814814815e-06, "loss": 0.0002, "step": 500 }, { "epoch": 4.62962962962963, "eval_accuracy": 1.0, "eval_loss": 0.00016613505431450903, "eval_runtime": 0.627, "eval_samples_per_second": 81.342, "eval_steps_per_second": 20.734, "step": 500 } ], "logging_steps": 20, "max_steps": 540, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 411116162887800.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }