{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5195263290501386, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "learning_rate": 0.000975, "loss": 1.0238, "step": 500 }, { "epoch": 0.06, "eval_loss": 0.8929525017738342, "eval_runtime": 4.0089, "eval_samples_per_second": 74.833, "eval_steps_per_second": 37.417, "step": 500 }, { "epoch": 0.13, "learning_rate": 0.00095, "loss": 0.9971, "step": 1000 }, { "epoch": 0.13, "eval_loss": 0.8510345220565796, "eval_runtime": 4.0089, "eval_samples_per_second": 74.833, "eval_steps_per_second": 37.417, "step": 1000 }, { "epoch": 0.19, "learning_rate": 0.000925, "loss": 0.9522, "step": 1500 }, { "epoch": 0.19, "eval_loss": 0.8565580248832703, "eval_runtime": 4.0278, "eval_samples_per_second": 74.483, "eval_steps_per_second": 37.241, "step": 1500 }, { "epoch": 0.25, "learning_rate": 0.0009000000000000001, "loss": 0.9143, "step": 2000 }, { "epoch": 0.25, "eval_loss": 0.8810315132141113, "eval_runtime": 4.0097, "eval_samples_per_second": 74.819, "eval_steps_per_second": 37.409, "step": 2000 }, { "epoch": 0.31, "learning_rate": 0.000875, "loss": 0.9151, "step": 2500 }, { "epoch": 0.31, "eval_loss": 0.8461073040962219, "eval_runtime": 4.0108, "eval_samples_per_second": 74.798, "eval_steps_per_second": 37.399, "step": 2500 }, { "epoch": 0.38, "learning_rate": 0.00085, "loss": 0.9263, "step": 3000 }, { "epoch": 0.38, "eval_loss": 0.8521466255187988, "eval_runtime": 4.0059, "eval_samples_per_second": 74.89, "eval_steps_per_second": 37.445, "step": 3000 }, { "epoch": 0.44, "learning_rate": 0.000825, "loss": 0.9102, "step": 3500 }, { "epoch": 0.44, "eval_loss": 0.8521981835365295, "eval_runtime": 4.0413, "eval_samples_per_second": 74.234, "eval_steps_per_second": 37.117, "step": 3500 }, { "epoch": 0.5, "learning_rate": 0.0008, "loss": 0.8866, "step": 4000 }, { "epoch": 0.5, "eval_loss": 0.8259674906730652, "eval_runtime": 4.0316, "eval_samples_per_second": 74.412, "eval_steps_per_second": 37.206, "step": 4000 }, { "epoch": 0.57, "learning_rate": 0.0007750000000000001, "loss": 0.911, "step": 4500 }, { "epoch": 0.57, "eval_loss": 0.8008900284767151, "eval_runtime": 4.0224, "eval_samples_per_second": 74.583, "eval_steps_per_second": 37.292, "step": 4500 }, { "epoch": 0.63, "learning_rate": 0.00075, "loss": 0.8536, "step": 5000 }, { "epoch": 0.63, "eval_loss": 0.808663547039032, "eval_runtime": 5.0615, "eval_samples_per_second": 59.271, "eval_steps_per_second": 29.635, "step": 5000 }, { "epoch": 0.69, "learning_rate": 0.000725, "loss": 0.8277, "step": 5500 }, { "epoch": 0.69, "eval_loss": 0.7903586626052856, "eval_runtime": 4.0081, "eval_samples_per_second": 74.849, "eval_steps_per_second": 37.425, "step": 5500 }, { "epoch": 0.76, "learning_rate": 0.0007, "loss": 0.8413, "step": 6000 }, { "epoch": 0.76, "eval_loss": 0.8013813495635986, "eval_runtime": 3.994, "eval_samples_per_second": 75.113, "eval_steps_per_second": 37.556, "step": 6000 }, { "epoch": 0.82, "learning_rate": 0.000675, "loss": 0.8491, "step": 6500 }, { "epoch": 0.82, "eval_loss": 0.7867017984390259, "eval_runtime": 4.0072, "eval_samples_per_second": 74.864, "eval_steps_per_second": 37.432, "step": 6500 }, { "epoch": 0.88, "learning_rate": 0.0006500000000000001, "loss": 0.8077, "step": 7000 }, { "epoch": 0.88, "eval_loss": 0.759165346622467, "eval_runtime": 4.0166, "eval_samples_per_second": 74.691, "eval_steps_per_second": 37.345, "step": 7000 }, { "epoch": 0.94, "learning_rate": 0.000625, "loss": 0.796, "step": 7500 }, { "epoch": 0.94, "eval_loss": 0.7527943849563599, "eval_runtime": 3.998, "eval_samples_per_second": 75.037, "eval_steps_per_second": 37.518, "step": 7500 }, { "epoch": 1.01, "learning_rate": 0.0006, "loss": 0.7952, "step": 8000 }, { "epoch": 1.01, "eval_loss": 0.7407393455505371, "eval_runtime": 3.9421, "eval_samples_per_second": 76.102, "eval_steps_per_second": 38.051, "step": 8000 }, { "epoch": 1.07, "learning_rate": 0.000575, "loss": 0.5761, "step": 8500 }, { "epoch": 1.07, "eval_loss": 0.7574229836463928, "eval_runtime": 3.9855, "eval_samples_per_second": 75.273, "eval_steps_per_second": 37.636, "step": 8500 }, { "epoch": 1.13, "learning_rate": 0.00055, "loss": 0.6084, "step": 9000 }, { "epoch": 1.13, "eval_loss": 0.7349967956542969, "eval_runtime": 3.995, "eval_samples_per_second": 75.093, "eval_steps_per_second": 37.547, "step": 9000 }, { "epoch": 1.2, "learning_rate": 0.0005250000000000001, "loss": 0.5863, "step": 9500 }, { "epoch": 1.2, "eval_loss": 0.7407773733139038, "eval_runtime": 3.9874, "eval_samples_per_second": 75.238, "eval_steps_per_second": 37.619, "step": 9500 }, { "epoch": 1.26, "learning_rate": 0.0005, "loss": 0.5835, "step": 10000 }, { "epoch": 1.26, "eval_loss": 0.719527006149292, "eval_runtime": 3.9471, "eval_samples_per_second": 76.006, "eval_steps_per_second": 38.003, "step": 10000 }, { "epoch": 1.32, "learning_rate": 0.000475, "loss": 0.5751, "step": 10500 }, { "epoch": 1.32, "eval_loss": 0.7423205971717834, "eval_runtime": 3.9978, "eval_samples_per_second": 75.041, "eval_steps_per_second": 37.52, "step": 10500 }, { "epoch": 1.39, "learning_rate": 0.00045000000000000004, "loss": 0.5746, "step": 11000 }, { "epoch": 1.39, "eval_loss": 0.7284040451049805, "eval_runtime": 3.9657, "eval_samples_per_second": 75.649, "eval_steps_per_second": 37.824, "step": 11000 }, { "epoch": 1.45, "learning_rate": 0.000425, "loss": 0.5847, "step": 11500 }, { "epoch": 1.45, "eval_loss": 0.7247716188430786, "eval_runtime": 3.962, "eval_samples_per_second": 75.719, "eval_steps_per_second": 37.86, "step": 11500 }, { "epoch": 1.51, "learning_rate": 0.0004, "loss": 0.5759, "step": 12000 }, { "epoch": 1.51, "eval_loss": 0.7151039838790894, "eval_runtime": 3.9922, "eval_samples_per_second": 75.147, "eval_steps_per_second": 37.573, "step": 12000 }, { "epoch": 1.57, "learning_rate": 0.000375, "loss": 0.5922, "step": 12500 }, { "epoch": 1.57, "eval_loss": 0.7031030058860779, "eval_runtime": 4.0386, "eval_samples_per_second": 74.282, "eval_steps_per_second": 37.141, "step": 12500 }, { "epoch": 1.64, "learning_rate": 0.00035, "loss": 0.5678, "step": 13000 }, { "epoch": 1.64, "eval_loss": 0.710035502910614, "eval_runtime": 4.0808, "eval_samples_per_second": 73.515, "eval_steps_per_second": 36.758, "step": 13000 }, { "epoch": 1.7, "learning_rate": 0.00032500000000000004, "loss": 0.5627, "step": 13500 }, { "epoch": 1.7, "eval_loss": 0.7093074321746826, "eval_runtime": 4.0326, "eval_samples_per_second": 74.393, "eval_steps_per_second": 37.197, "step": 13500 }, { "epoch": 1.76, "learning_rate": 0.0003, "loss": 0.5568, "step": 14000 }, { "epoch": 1.76, "eval_loss": 0.6942310333251953, "eval_runtime": 4.0875, "eval_samples_per_second": 73.395, "eval_steps_per_second": 36.697, "step": 14000 }, { "epoch": 1.83, "learning_rate": 0.000275, "loss": 0.5589, "step": 14500 }, { "epoch": 1.83, "eval_loss": 0.6947352290153503, "eval_runtime": 3.9856, "eval_samples_per_second": 75.27, "eval_steps_per_second": 37.635, "step": 14500 }, { "epoch": 1.89, "learning_rate": 0.00025, "loss": 0.5675, "step": 15000 }, { "epoch": 1.89, "eval_loss": 0.6902616620063782, "eval_runtime": 4.0066, "eval_samples_per_second": 74.876, "eval_steps_per_second": 37.438, "step": 15000 }, { "epoch": 1.95, "learning_rate": 0.00022500000000000002, "loss": 0.5422, "step": 15500 }, { "epoch": 1.95, "eval_loss": 0.6929380297660828, "eval_runtime": 4.107, "eval_samples_per_second": 73.047, "eval_steps_per_second": 36.523, "step": 15500 }, { "epoch": 2.02, "learning_rate": 0.0002, "loss": 0.5104, "step": 16000 }, { "epoch": 2.02, "eval_loss": 0.7098422050476074, "eval_runtime": 4.0323, "eval_samples_per_second": 74.4, "eval_steps_per_second": 37.2, "step": 16000 }, { "epoch": 2.08, "learning_rate": 0.000175, "loss": 0.3835, "step": 16500 }, { "epoch": 2.08, "eval_loss": 0.7105218768119812, "eval_runtime": 4.0594, "eval_samples_per_second": 73.903, "eval_steps_per_second": 36.952, "step": 16500 }, { "epoch": 2.14, "learning_rate": 0.00015, "loss": 0.3805, "step": 17000 }, { "epoch": 2.14, "eval_loss": 0.7144222855567932, "eval_runtime": 4.0853, "eval_samples_per_second": 73.434, "eval_steps_per_second": 36.717, "step": 17000 }, { "epoch": 2.2, "learning_rate": 0.000125, "loss": 0.3718, "step": 17500 }, { "epoch": 2.2, "eval_loss": 0.7210414409637451, "eval_runtime": 5.0511, "eval_samples_per_second": 59.393, "eval_steps_per_second": 29.697, "step": 17500 }, { "epoch": 2.27, "learning_rate": 0.0001, "loss": 0.3688, "step": 18000 }, { "epoch": 2.27, "eval_loss": 0.7145898342132568, "eval_runtime": 4.7793, "eval_samples_per_second": 62.77, "eval_steps_per_second": 31.385, "step": 18000 }, { "epoch": 2.33, "learning_rate": 7.5e-05, "loss": 0.3645, "step": 18500 }, { "epoch": 2.33, "eval_loss": 0.7136221528053284, "eval_runtime": 4.0171, "eval_samples_per_second": 74.681, "eval_steps_per_second": 37.34, "step": 18500 }, { "epoch": 2.39, "learning_rate": 5e-05, "loss": 0.3865, "step": 19000 }, { "epoch": 2.39, "eval_loss": 0.7075753808021545, "eval_runtime": 3.9658, "eval_samples_per_second": 75.646, "eval_steps_per_second": 37.823, "step": 19000 }, { "epoch": 2.46, "learning_rate": 2.5e-05, "loss": 0.3633, "step": 19500 }, { "epoch": 2.46, "eval_loss": 0.7097809314727783, "eval_runtime": 4.0163, "eval_samples_per_second": 74.696, "eval_steps_per_second": 37.348, "step": 19500 }, { "epoch": 2.52, "learning_rate": 0.0, "loss": 0.3674, "step": 20000 }, { "epoch": 2.52, "eval_loss": 0.7079904079437256, "eval_runtime": 4.0804, "eval_samples_per_second": 73.522, "eval_steps_per_second": 36.761, "step": 20000 } ], "logging_steps": 500, "max_steps": 20000, "num_train_epochs": 3, "save_steps": 5000, "total_flos": 3.549121832463667e+16, "trial_name": null, "trial_params": null }