{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 390, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002564102564102564, "grad_norm": 32.246810828841255, "learning_rate": 7.692307692307692e-06, "loss": 2.2691, "step": 1 }, { "epoch": 0.01282051282051282, "grad_norm": 29.537508150180734, "learning_rate": 3.8461538461538456e-05, "loss": 2.5015, "step": 5 }, { "epoch": 0.02564102564102564, "grad_norm": 7.166984360831344, "learning_rate": 7.692307692307691e-05, "loss": 1.3116, "step": 10 }, { "epoch": 0.038461538461538464, "grad_norm": 7.358331315050627, "learning_rate": 0.00011538461538461538, "loss": 1.2847, "step": 15 }, { "epoch": 0.05128205128205128, "grad_norm": 8.061303504403417, "learning_rate": 0.00015384615384615382, "loss": 1.424, "step": 20 }, { "epoch": 0.0641025641025641, "grad_norm": 30.923296803868965, "learning_rate": 0.0001923076923076923, "loss": 1.6455, "step": 25 }, { "epoch": 0.07692307692307693, "grad_norm": 35.007464110767536, "learning_rate": 0.00023076923076923076, "loss": 1.8041, "step": 30 }, { "epoch": 0.08974358974358974, "grad_norm": 10.894826428758686, "learning_rate": 0.0002692307692307692, "loss": 1.6061, "step": 35 }, { "epoch": 0.10256410256410256, "grad_norm": 32.31558367670942, "learning_rate": 0.0002999939918069778, "loss": 1.7147, "step": 40 }, { "epoch": 0.11538461538461539, "grad_norm": 408.39746177245365, "learning_rate": 0.0002997837555846212, "loss": 3.0391, "step": 45 }, { "epoch": 0.1282051282051282, "grad_norm": 585.894165849368, "learning_rate": 0.00029927359084964874, "loss": 7.5436, "step": 50 }, { "epoch": 0.14102564102564102, "grad_norm": 188.43190635567956, "learning_rate": 0.00029846451916110426, "loss": 6.3459, "step": 55 }, { "epoch": 0.15384615384615385, "grad_norm": 18.84113658139999, "learning_rate": 0.00029735816061234965, "loss": 4.155, "step": 60 }, { "epoch": 0.16666666666666666, "grad_norm": 83.76190854828539, "learning_rate": 0.00029595673058697357, "loss": 2.5163, "step": 65 }, { "epoch": 0.1794871794871795, "grad_norm": 8.013185246066367, "learning_rate": 0.00029426303532268435, "loss": 1.8248, "step": 70 }, { "epoch": 0.19230769230769232, "grad_norm": 7.809904167784529, "learning_rate": 0.00029228046629207175, "loss": 1.6297, "step": 75 }, { "epoch": 0.20512820512820512, "grad_norm": 6.240741545472446, "learning_rate": 0.00029001299341148754, "loss": 1.4935, "step": 80 }, { "epoch": 0.21794871794871795, "grad_norm": 56.36582267694748, "learning_rate": 0.0002874651570916444, "loss": 1.5049, "step": 85 }, { "epoch": 0.23076923076923078, "grad_norm": 244.98938960556112, "learning_rate": 0.0002846420591458521, "loss": 2.0434, "step": 90 }, { "epoch": 0.24358974358974358, "grad_norm": 15.923002750416765, "learning_rate": 0.00028154935257409397, "loss": 1.9592, "step": 95 }, { "epoch": 0.2564102564102564, "grad_norm": 6.887371068354224, "learning_rate": 0.000278193230243403, "loss": 1.52, "step": 100 }, { "epoch": 0.2692307692307692, "grad_norm": 2.1953075310025407, "learning_rate": 0.00027458041248720175, "loss": 1.4671, "step": 105 }, { "epoch": 0.28205128205128205, "grad_norm": 5.381841976743356, "learning_rate": 0.0002707181336484383, "loss": 1.7707, "step": 110 }, { "epoch": 0.2948717948717949, "grad_norm": 18.048940491315566, "learning_rate": 0.00026661412759346485, "loss": 1.4121, "step": 115 }, { "epoch": 0.3076923076923077, "grad_norm": 3.1798969853488384, "learning_rate": 0.00026227661222566516, "loss": 1.4291, "step": 120 }, { "epoch": 0.32051282051282054, "grad_norm": 43.65623626841003, "learning_rate": 0.00025771427302984107, "loss": 1.3341, "step": 125 }, { "epoch": 0.3333333333333333, "grad_norm": 26.333557314371472, "learning_rate": 0.00025293624568031, "loss": 1.7161, "step": 130 }, { "epoch": 0.34615384615384615, "grad_norm": 2.1036876520568257, "learning_rate": 0.0002479520977475377, "loss": 1.4303, "step": 135 }, { "epoch": 0.358974358974359, "grad_norm": 2.8179883662231773, "learning_rate": 0.0002427718095399382, "loss": 1.2882, "step": 140 }, { "epoch": 0.3717948717948718, "grad_norm": 2.0944018050914863, "learning_rate": 0.00023740575411920265, "loss": 1.3758, "step": 145 }, { "epoch": 0.38461538461538464, "grad_norm": 32.12093678906791, "learning_rate": 0.00023186467652917567, "loss": 1.3851, "step": 150 }, { "epoch": 0.3974358974358974, "grad_norm": 6.163138554388992, "learning_rate": 0.00022615967227987015, "loss": 1.3702, "step": 155 }, { "epoch": 0.41025641025641024, "grad_norm": 2.523613373490038, "learning_rate": 0.00022030216512970553, "loss": 1.278, "step": 160 }, { "epoch": 0.4230769230769231, "grad_norm": 4.836492744474631, "learning_rate": 0.0002143038842104581, "loss": 1.6826, "step": 165 }, { "epoch": 0.4358974358974359, "grad_norm": 35.30106381963297, "learning_rate": 0.00020817684054072824, "loss": 1.6584, "step": 170 }, { "epoch": 0.44871794871794873, "grad_norm": 5.821901622579396, "learning_rate": 0.0002019333029749549, "loss": 1.4078, "step": 175 }, { "epoch": 0.46153846153846156, "grad_norm": 1.8515370008819587, "learning_rate": 0.00019558577363613702, "loss": 1.1904, "step": 180 }, { "epoch": 0.47435897435897434, "grad_norm": 1.773287149962229, "learning_rate": 0.00018914696288145557, "loss": 1.1222, "step": 185 }, { "epoch": 0.48717948717948717, "grad_norm": 9.981307604755651, "learning_rate": 0.0001826297638509251, "loss": 1.1487, "step": 190 }, { "epoch": 0.5, "grad_norm": 3.292661747708301, "learning_rate": 0.00017604722665003956, "loss": 1.2224, "step": 195 }, { "epoch": 0.5128205128205128, "grad_norm": 1.5110397540810954, "learning_rate": 0.00016941253221810829, "loss": 1.164, "step": 200 }, { "epoch": 0.5256410256410257, "grad_norm": 1.809157946038207, "learning_rate": 0.0001627389659346097, "loss": 1.1763, "step": 205 }, { "epoch": 0.5384615384615384, "grad_norm": 1.3191427643036746, "learning_rate": 0.00015603989101641228, "loss": 1.2132, "step": 210 }, { "epoch": 0.5512820512820513, "grad_norm": 1.0948738027576763, "learning_rate": 0.00014932872175913348, "loss": 1.1136, "step": 215 }, { "epoch": 0.5641025641025641, "grad_norm": 39.38545507943537, "learning_rate": 0.00014261889667621826, "loss": 1.1053, "step": 220 }, { "epoch": 0.5769230769230769, "grad_norm": 6.565265116408364, "learning_rate": 0.0001359238515895231, "loss": 1.5669, "step": 225 }, { "epoch": 0.5897435897435898, "grad_norm": 4.60069460000724, "learning_rate": 0.00012925699272529006, "loss": 1.3127, "step": 230 }, { "epoch": 0.6025641025641025, "grad_norm": 2.5322952556318543, "learning_rate": 0.0001226316698693831, "loss": 1.189, "step": 235 }, { "epoch": 0.6153846153846154, "grad_norm": 1.4722453502945256, "learning_rate": 0.0001160611496355417, "loss": 1.1154, "step": 240 }, { "epoch": 0.6282051282051282, "grad_norm": 0.9179752397575098, "learning_rate": 0.00010955858890017846, "loss": 1.0788, "step": 245 }, { "epoch": 0.6410256410256411, "grad_norm": 1.6490754560884426, "learning_rate": 0.00010313700845691636, "loss": 1.5551, "step": 250 }, { "epoch": 0.6538461538461539, "grad_norm": 0.9038214719254013, "learning_rate": 9.680926694361964e-05, "loss": 1.0483, "step": 255 }, { "epoch": 0.6666666666666666, "grad_norm": 0.7506946296792257, "learning_rate": 9.058803509412646e-05, "loss": 1.0173, "step": 260 }, { "epoch": 0.6794871794871795, "grad_norm": 1.105603168554401, "learning_rate": 8.448577036624309e-05, "loss": 1.0209, "step": 265 }, { "epoch": 0.6923076923076923, "grad_norm": 0.7904976226318772, "learning_rate": 7.851469199680381e-05, "loss": 1.0113, "step": 270 }, { "epoch": 0.7051282051282052, "grad_norm": 0.8256317535886027, "learning_rate": 7.268675653374722e-05, "loss": 1.0168, "step": 275 }, { "epoch": 0.717948717948718, "grad_norm": 0.9927491874944325, "learning_rate": 6.701363389420296e-05, "loss": 0.999, "step": 280 }, { "epoch": 0.7307692307692307, "grad_norm": 0.73183193491982, "learning_rate": 6.15066839965316e-05, "loss": 0.9793, "step": 285 }, { "epoch": 0.7435897435897436, "grad_norm": 1.1245402624787457, "learning_rate": 5.6176934013108364e-05, "loss": 0.9756, "step": 290 }, { "epoch": 0.7564102564102564, "grad_norm": 0.6697033469064312, "learning_rate": 5.103505628940178e-05, "loss": 0.9658, "step": 295 }, { "epoch": 0.7692307692307693, "grad_norm": 0.7446212713719105, "learning_rate": 4.609134697356009e-05, "loss": 0.9734, "step": 300 }, { "epoch": 0.782051282051282, "grad_norm": 1.0552000330334352, "learning_rate": 4.135570539930026e-05, "loss": 0.9496, "step": 305 }, { "epoch": 0.7948717948717948, "grad_norm": 0.6168006421187829, "learning_rate": 3.683761426338148e-05, "loss": 0.9358, "step": 310 }, { "epoch": 0.8076923076923077, "grad_norm": 0.7790438783752558, "learning_rate": 3.254612063735667e-05, "loss": 1.0592, "step": 315 }, { "epoch": 0.8205128205128205, "grad_norm": 0.5966928897383734, "learning_rate": 2.8489817851625024e-05, "loss": 0.9039, "step": 320 }, { "epoch": 0.8333333333333334, "grad_norm": 11.644090734466477, "learning_rate": 2.4676828288059558e-05, "loss": 0.9481, "step": 325 }, { "epoch": 0.8461538461538461, "grad_norm": 0.676095723656071, "learning_rate": 2.1114787115667476e-05, "loss": 0.9089, "step": 330 }, { "epoch": 0.8589743589743589, "grad_norm": 0.7215500519599484, "learning_rate": 1.7810827001850187e-05, "loss": 0.9021, "step": 335 }, { "epoch": 0.8717948717948718, "grad_norm": 0.5589129407190109, "learning_rate": 1.4771563829877597e-05, "loss": 0.895, "step": 340 }, { "epoch": 0.8846153846153846, "grad_norm": 0.9293606746960693, "learning_rate": 1.2003083451176366e-05, "loss": 0.9188, "step": 345 }, { "epoch": 0.8974358974358975, "grad_norm": 0.5692953910303316, "learning_rate": 9.510929498959269e-06, "loss": 0.8814, "step": 350 }, { "epoch": 0.9102564102564102, "grad_norm": 0.6238210726565123, "learning_rate": 7.300092287597742e-06, "loss": 0.8627, "step": 355 }, { "epoch": 0.9230769230769231, "grad_norm": 0.5072463698927163, "learning_rate": 5.374998819965653e-06, "loss": 0.8626, "step": 360 }, { "epoch": 0.9358974358974359, "grad_norm": 0.6124784439233985, "learning_rate": 3.7395039227639644e-06, "loss": 0.8568, "step": 365 }, { "epoch": 0.9487179487179487, "grad_norm": 26.426978867070098, "learning_rate": 2.3968825275764768e-06, "loss": 0.8757, "step": 370 }, { "epoch": 0.9615384615384616, "grad_norm": 0.8103195703064432, "learning_rate": 1.3498231131137293e-06, "loss": 0.8811, "step": 375 }, { "epoch": 0.9743589743589743, "grad_norm": 0.557003321536524, "learning_rate": 6.004223217757509e-07, "loss": 0.8598, "step": 380 }, { "epoch": 0.9871794871794872, "grad_norm": 0.7470834443971104, "learning_rate": 1.50180761314167e-07, "loss": 0.8623, "step": 385 }, { "epoch": 1.0, "grad_norm": 0.6929955320341405, "learning_rate": 0.0, "loss": 0.9063, "step": 390 }, { "epoch": 1.0, "eval_loss": 3.8606326580047607, "eval_runtime": 2.3303, "eval_samples_per_second": 4.291, "eval_steps_per_second": 0.429, "step": 390 }, { "epoch": 1.0, "step": 390, "total_flos": 20388343971840.0, "train_loss": 1.4792061909651144, "train_runtime": 8464.1713, "train_samples_per_second": 1.474, "train_steps_per_second": 0.046 } ], "logging_steps": 5, "max_steps": 390, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 20388343971840.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }