{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 390,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002564102564102564,
      "grad_norm": 32.246810828841255,
      "learning_rate": 7.692307692307692e-06,
      "loss": 2.2691,
      "step": 1
    },
    {
      "epoch": 0.01282051282051282,
      "grad_norm": 29.537508150180734,
      "learning_rate": 3.8461538461538456e-05,
      "loss": 2.5015,
      "step": 5
    },
    {
      "epoch": 0.02564102564102564,
      "grad_norm": 7.166984360831344,
      "learning_rate": 7.692307692307691e-05,
      "loss": 1.3116,
      "step": 10
    },
    {
      "epoch": 0.038461538461538464,
      "grad_norm": 7.358331315050627,
      "learning_rate": 0.00011538461538461538,
      "loss": 1.2847,
      "step": 15
    },
    {
      "epoch": 0.05128205128205128,
      "grad_norm": 8.061303504403417,
      "learning_rate": 0.00015384615384615382,
      "loss": 1.424,
      "step": 20
    },
    {
      "epoch": 0.0641025641025641,
      "grad_norm": 30.923296803868965,
      "learning_rate": 0.0001923076923076923,
      "loss": 1.6455,
      "step": 25
    },
    {
      "epoch": 0.07692307692307693,
      "grad_norm": 35.007464110767536,
      "learning_rate": 0.00023076923076923076,
      "loss": 1.8041,
      "step": 30
    },
    {
      "epoch": 0.08974358974358974,
      "grad_norm": 10.894826428758686,
      "learning_rate": 0.0002692307692307692,
      "loss": 1.6061,
      "step": 35
    },
    {
      "epoch": 0.10256410256410256,
      "grad_norm": 32.31558367670942,
      "learning_rate": 0.0002999939918069778,
      "loss": 1.7147,
      "step": 40
    },
    {
      "epoch": 0.11538461538461539,
      "grad_norm": 408.39746177245365,
      "learning_rate": 0.0002997837555846212,
      "loss": 3.0391,
      "step": 45
    },
    {
      "epoch": 0.1282051282051282,
      "grad_norm": 585.894165849368,
      "learning_rate": 0.00029927359084964874,
      "loss": 7.5436,
      "step": 50
    },
    {
      "epoch": 0.14102564102564102,
      "grad_norm": 188.43190635567956,
      "learning_rate": 0.00029846451916110426,
      "loss": 6.3459,
      "step": 55
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 18.84113658139999,
      "learning_rate": 0.00029735816061234965,
      "loss": 4.155,
      "step": 60
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 83.76190854828539,
      "learning_rate": 0.00029595673058697357,
      "loss": 2.5163,
      "step": 65
    },
    {
      "epoch": 0.1794871794871795,
      "grad_norm": 8.013185246066367,
      "learning_rate": 0.00029426303532268435,
      "loss": 1.8248,
      "step": 70
    },
    {
      "epoch": 0.19230769230769232,
      "grad_norm": 7.809904167784529,
      "learning_rate": 0.00029228046629207175,
      "loss": 1.6297,
      "step": 75
    },
    {
      "epoch": 0.20512820512820512,
      "grad_norm": 6.240741545472446,
      "learning_rate": 0.00029001299341148754,
      "loss": 1.4935,
      "step": 80
    },
    {
      "epoch": 0.21794871794871795,
      "grad_norm": 56.36582267694748,
      "learning_rate": 0.0002874651570916444,
      "loss": 1.5049,
      "step": 85
    },
    {
      "epoch": 0.23076923076923078,
      "grad_norm": 244.98938960556112,
      "learning_rate": 0.0002846420591458521,
      "loss": 2.0434,
      "step": 90
    },
    {
      "epoch": 0.24358974358974358,
      "grad_norm": 15.923002750416765,
      "learning_rate": 0.00028154935257409397,
      "loss": 1.9592,
      "step": 95
    },
    {
      "epoch": 0.2564102564102564,
      "grad_norm": 6.887371068354224,
      "learning_rate": 0.000278193230243403,
      "loss": 1.52,
      "step": 100
    },
    {
      "epoch": 0.2692307692307692,
      "grad_norm": 2.1953075310025407,
      "learning_rate": 0.00027458041248720175,
      "loss": 1.4671,
      "step": 105
    },
    {
      "epoch": 0.28205128205128205,
      "grad_norm": 5.381841976743356,
      "learning_rate": 0.0002707181336484383,
      "loss": 1.7707,
      "step": 110
    },
    {
      "epoch": 0.2948717948717949,
      "grad_norm": 18.048940491315566,
      "learning_rate": 0.00026661412759346485,
      "loss": 1.4121,
      "step": 115
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 3.1798969853488384,
      "learning_rate": 0.00026227661222566516,
      "loss": 1.4291,
      "step": 120
    },
    {
      "epoch": 0.32051282051282054,
      "grad_norm": 43.65623626841003,
      "learning_rate": 0.00025771427302984107,
      "loss": 1.3341,
      "step": 125
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 26.333557314371472,
      "learning_rate": 0.00025293624568031,
      "loss": 1.7161,
      "step": 130
    },
    {
      "epoch": 0.34615384615384615,
      "grad_norm": 2.1036876520568257,
      "learning_rate": 0.0002479520977475377,
      "loss": 1.4303,
      "step": 135
    },
    {
      "epoch": 0.358974358974359,
      "grad_norm": 2.8179883662231773,
      "learning_rate": 0.0002427718095399382,
      "loss": 1.2882,
      "step": 140
    },
    {
      "epoch": 0.3717948717948718,
      "grad_norm": 2.0944018050914863,
      "learning_rate": 0.00023740575411920265,
      "loss": 1.3758,
      "step": 145
    },
    {
      "epoch": 0.38461538461538464,
      "grad_norm": 32.12093678906791,
      "learning_rate": 0.00023186467652917567,
      "loss": 1.3851,
      "step": 150
    },
    {
      "epoch": 0.3974358974358974,
      "grad_norm": 6.163138554388992,
      "learning_rate": 0.00022615967227987015,
      "loss": 1.3702,
      "step": 155
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 2.523613373490038,
      "learning_rate": 0.00022030216512970553,
      "loss": 1.278,
      "step": 160
    },
    {
      "epoch": 0.4230769230769231,
      "grad_norm": 4.836492744474631,
      "learning_rate": 0.0002143038842104581,
      "loss": 1.6826,
      "step": 165
    },
    {
      "epoch": 0.4358974358974359,
      "grad_norm": 35.30106381963297,
      "learning_rate": 0.00020817684054072824,
      "loss": 1.6584,
      "step": 170
    },
    {
      "epoch": 0.44871794871794873,
      "grad_norm": 5.821901622579396,
      "learning_rate": 0.0002019333029749549,
      "loss": 1.4078,
      "step": 175
    },
    {
      "epoch": 0.46153846153846156,
      "grad_norm": 1.8515370008819587,
      "learning_rate": 0.00019558577363613702,
      "loss": 1.1904,
      "step": 180
    },
    {
      "epoch": 0.47435897435897434,
      "grad_norm": 1.773287149962229,
      "learning_rate": 0.00018914696288145557,
      "loss": 1.1222,
      "step": 185
    },
    {
      "epoch": 0.48717948717948717,
      "grad_norm": 9.981307604755651,
      "learning_rate": 0.0001826297638509251,
      "loss": 1.1487,
      "step": 190
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.292661747708301,
      "learning_rate": 0.00017604722665003956,
      "loss": 1.2224,
      "step": 195
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 1.5110397540810954,
      "learning_rate": 0.00016941253221810829,
      "loss": 1.164,
      "step": 200
    },
    {
      "epoch": 0.5256410256410257,
      "grad_norm": 1.809157946038207,
      "learning_rate": 0.0001627389659346097,
      "loss": 1.1763,
      "step": 205
    },
    {
      "epoch": 0.5384615384615384,
      "grad_norm": 1.3191427643036746,
      "learning_rate": 0.00015603989101641228,
      "loss": 1.2132,
      "step": 210
    },
    {
      "epoch": 0.5512820512820513,
      "grad_norm": 1.0948738027576763,
      "learning_rate": 0.00014932872175913348,
      "loss": 1.1136,
      "step": 215
    },
    {
      "epoch": 0.5641025641025641,
      "grad_norm": 39.38545507943537,
      "learning_rate": 0.00014261889667621826,
      "loss": 1.1053,
      "step": 220
    },
    {
      "epoch": 0.5769230769230769,
      "grad_norm": 6.565265116408364,
      "learning_rate": 0.0001359238515895231,
      "loss": 1.5669,
      "step": 225
    },
    {
      "epoch": 0.5897435897435898,
      "grad_norm": 4.60069460000724,
      "learning_rate": 0.00012925699272529006,
      "loss": 1.3127,
      "step": 230
    },
    {
      "epoch": 0.6025641025641025,
      "grad_norm": 2.5322952556318543,
      "learning_rate": 0.0001226316698693831,
      "loss": 1.189,
      "step": 235
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 1.4722453502945256,
      "learning_rate": 0.0001160611496355417,
      "loss": 1.1154,
      "step": 240
    },
    {
      "epoch": 0.6282051282051282,
      "grad_norm": 0.9179752397575098,
      "learning_rate": 0.00010955858890017846,
      "loss": 1.0788,
      "step": 245
    },
    {
      "epoch": 0.6410256410256411,
      "grad_norm": 1.6490754560884426,
      "learning_rate": 0.00010313700845691636,
      "loss": 1.5551,
      "step": 250
    },
    {
      "epoch": 0.6538461538461539,
      "grad_norm": 0.9038214719254013,
      "learning_rate": 9.680926694361964e-05,
      "loss": 1.0483,
      "step": 255
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.7506946296792257,
      "learning_rate": 9.058803509412646e-05,
      "loss": 1.0173,
      "step": 260
    },
    {
      "epoch": 0.6794871794871795,
      "grad_norm": 1.105603168554401,
      "learning_rate": 8.448577036624309e-05,
      "loss": 1.0209,
      "step": 265
    },
    {
      "epoch": 0.6923076923076923,
      "grad_norm": 0.7904976226318772,
      "learning_rate": 7.851469199680381e-05,
      "loss": 1.0113,
      "step": 270
    },
    {
      "epoch": 0.7051282051282052,
      "grad_norm": 0.8256317535886027,
      "learning_rate": 7.268675653374722e-05,
      "loss": 1.0168,
      "step": 275
    },
    {
      "epoch": 0.717948717948718,
      "grad_norm": 0.9927491874944325,
      "learning_rate": 6.701363389420296e-05,
      "loss": 0.999,
      "step": 280
    },
    {
      "epoch": 0.7307692307692307,
      "grad_norm": 0.73183193491982,
      "learning_rate": 6.15066839965316e-05,
      "loss": 0.9793,
      "step": 285
    },
    {
      "epoch": 0.7435897435897436,
      "grad_norm": 1.1245402624787457,
      "learning_rate": 5.6176934013108364e-05,
      "loss": 0.9756,
      "step": 290
    },
    {
      "epoch": 0.7564102564102564,
      "grad_norm": 0.6697033469064312,
      "learning_rate": 5.103505628940178e-05,
      "loss": 0.9658,
      "step": 295
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.7446212713719105,
      "learning_rate": 4.609134697356009e-05,
      "loss": 0.9734,
      "step": 300
    },
    {
      "epoch": 0.782051282051282,
      "grad_norm": 1.0552000330334352,
      "learning_rate": 4.135570539930026e-05,
      "loss": 0.9496,
      "step": 305
    },
    {
      "epoch": 0.7948717948717948,
      "grad_norm": 0.6168006421187829,
      "learning_rate": 3.683761426338148e-05,
      "loss": 0.9358,
      "step": 310
    },
    {
      "epoch": 0.8076923076923077,
      "grad_norm": 0.7790438783752558,
      "learning_rate": 3.254612063735667e-05,
      "loss": 1.0592,
      "step": 315
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 0.5966928897383734,
      "learning_rate": 2.8489817851625024e-05,
      "loss": 0.9039,
      "step": 320
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 11.644090734466477,
      "learning_rate": 2.4676828288059558e-05,
      "loss": 0.9481,
      "step": 325
    },
    {
      "epoch": 0.8461538461538461,
      "grad_norm": 0.676095723656071,
      "learning_rate": 2.1114787115667476e-05,
      "loss": 0.9089,
      "step": 330
    },
    {
      "epoch": 0.8589743589743589,
      "grad_norm": 0.7215500519599484,
      "learning_rate": 1.7810827001850187e-05,
      "loss": 0.9021,
      "step": 335
    },
    {
      "epoch": 0.8717948717948718,
      "grad_norm": 0.5589129407190109,
      "learning_rate": 1.4771563829877597e-05,
      "loss": 0.895,
      "step": 340
    },
    {
      "epoch": 0.8846153846153846,
      "grad_norm": 0.9293606746960693,
      "learning_rate": 1.2003083451176366e-05,
      "loss": 0.9188,
      "step": 345
    },
    {
      "epoch": 0.8974358974358975,
      "grad_norm": 0.5692953910303316,
      "learning_rate": 9.510929498959269e-06,
      "loss": 0.8814,
      "step": 350
    },
    {
      "epoch": 0.9102564102564102,
      "grad_norm": 0.6238210726565123,
      "learning_rate": 7.300092287597742e-06,
      "loss": 0.8627,
      "step": 355
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 0.5072463698927163,
      "learning_rate": 5.374998819965653e-06,
      "loss": 0.8626,
      "step": 360
    },
    {
      "epoch": 0.9358974358974359,
      "grad_norm": 0.6124784439233985,
      "learning_rate": 3.7395039227639644e-06,
      "loss": 0.8568,
      "step": 365
    },
    {
      "epoch": 0.9487179487179487,
      "grad_norm": 26.426978867070098,
      "learning_rate": 2.3968825275764768e-06,
      "loss": 0.8757,
      "step": 370
    },
    {
      "epoch": 0.9615384615384616,
      "grad_norm": 0.8103195703064432,
      "learning_rate": 1.3498231131137293e-06,
      "loss": 0.8811,
      "step": 375
    },
    {
      "epoch": 0.9743589743589743,
      "grad_norm": 0.557003321536524,
      "learning_rate": 6.004223217757509e-07,
      "loss": 0.8598,
      "step": 380
    },
    {
      "epoch": 0.9871794871794872,
      "grad_norm": 0.7470834443971104,
      "learning_rate": 1.50180761314167e-07,
      "loss": 0.8623,
      "step": 385
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.6929955320341405,
      "learning_rate": 0.0,
      "loss": 0.9063,
      "step": 390
    },
    {
      "epoch": 1.0,
      "eval_loss": 3.8606326580047607,
      "eval_runtime": 2.3303,
      "eval_samples_per_second": 4.291,
      "eval_steps_per_second": 0.429,
      "step": 390
    },
    {
      "epoch": 1.0,
      "step": 390,
      "total_flos": 20388343971840.0,
      "train_loss": 1.4792061909651144,
      "train_runtime": 8464.1713,
      "train_samples_per_second": 1.474,
      "train_steps_per_second": 0.046
    }
  ],
  "logging_steps": 5,
  "max_steps": 390,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 20388343971840.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}