|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.06255082254331644, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.8341, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 2e-05, |
|
"loss": 1.9527, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 4e-05, |
|
"loss": 1.9158, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 6e-05, |
|
"loss": 1.8797, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 8e-05, |
|
"loss": 1.7949, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0001, |
|
"loss": 1.7219, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00012, |
|
"loss": 1.623, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00014, |
|
"loss": 1.5828, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00016, |
|
"loss": 1.5208, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00018, |
|
"loss": 1.4697, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4292, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0001999390827019096, |
|
"loss": 1.4127, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00019975640502598244, |
|
"loss": 1.3752, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00019945218953682734, |
|
"loss": 1.338, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00019902680687415705, |
|
"loss": 1.325, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00019848077530122083, |
|
"loss": 1.3627, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00019781476007338058, |
|
"loss": 1.3026, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00019702957262759965, |
|
"loss": 1.3302, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0001961261695938319, |
|
"loss": 1.3032, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 1.3538, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00019396926207859084, |
|
"loss": 1.3524, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00019271838545667876, |
|
"loss": 1.2811, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0001913545457642601, |
|
"loss": 1.2991, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0001898794046299167, |
|
"loss": 1.314, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00018829475928589271, |
|
"loss": 1.2834, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 1.3266, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0001848048096156426, |
|
"loss": 1.294, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00018290375725550417, |
|
"loss": 1.3133, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 1.3191, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00017880107536067218, |
|
"loss": 1.2459, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0001766044443118978, |
|
"loss": 1.3149, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00017431448254773944, |
|
"loss": 1.2173, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0001719339800338651, |
|
"loss": 1.277, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00016946583704589973, |
|
"loss": 1.2873, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00016691306063588583, |
|
"loss": 1.2535, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00016427876096865394, |
|
"loss": 1.2991, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0001615661475325658, |
|
"loss": 1.2486, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 1.3577, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0001559192903470747, |
|
"loss": 1.2981, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0001529919264233205, |
|
"loss": 1.3271, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 1.3087, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00014694715627858908, |
|
"loss": 1.2858, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00014383711467890774, |
|
"loss": 1.306, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00014067366430758004, |
|
"loss": 1.279, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00013746065934159123, |
|
"loss": 1.3175, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00013420201433256689, |
|
"loss": 1.2362, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 1.2573, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0001275637355816999, |
|
"loss": 1.2666, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00012419218955996676, |
|
"loss": 1.2854, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00012079116908177593, |
|
"loss": 1.3342, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 1.2907, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00011391731009600654, |
|
"loss": 1.2878, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00011045284632676536, |
|
"loss": 1.2695, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00010697564737441252, |
|
"loss": 1.2497, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00010348994967025012, |
|
"loss": 1.2663, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0001, |
|
"loss": 1.2748, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 9.651005032974994e-05, |
|
"loss": 1.2526, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 9.302435262558747e-05, |
|
"loss": 1.3002, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 8.954715367323468e-05, |
|
"loss": 1.2367, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 8.608268990399349e-05, |
|
"loss": 1.2935, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 8.263518223330697e-05, |
|
"loss": 1.3007, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 7.920883091822408e-05, |
|
"loss": 1.3359, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 7.580781044003324e-05, |
|
"loss": 1.2743, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 7.243626441830009e-05, |
|
"loss": 1.2772, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 1.2885, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 6.579798566743314e-05, |
|
"loss": 1.2912, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 6.25393406584088e-05, |
|
"loss": 1.2306, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 5.9326335692419995e-05, |
|
"loss": 1.3064, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 5.616288532109225e-05, |
|
"loss": 1.297, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 5.305284372141095e-05, |
|
"loss": 1.2715, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 1.2466, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 4.700807357667952e-05, |
|
"loss": 1.2669, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 4.4080709652925336e-05, |
|
"loss": 1.249, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 4.12214747707527e-05, |
|
"loss": 1.235, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 3.843385246743417e-05, |
|
"loss": 1.2891, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 3.5721239031346066e-05, |
|
"loss": 1.2281, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 3.308693936411421e-05, |
|
"loss": 1.2338, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 3.053416295410026e-05, |
|
"loss": 1.2941, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 2.8066019966134904e-05, |
|
"loss": 1.2671, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 2.5685517452260567e-05, |
|
"loss": 1.3195, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 2.339555568810221e-05, |
|
"loss": 1.2021, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 2.119892463932781e-05, |
|
"loss": 1.2715, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.9098300562505266e-05, |
|
"loss": 1.2557, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.7096242744495837e-05, |
|
"loss": 1.3001, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.5195190384357404e-05, |
|
"loss": 1.2931, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.339745962155613e-05, |
|
"loss": 1.2374, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.1705240714107302e-05, |
|
"loss": 1.2925, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 1.0120595370083318e-05, |
|
"loss": 1.2766, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 8.645454235739903e-06, |
|
"loss": 1.3244, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 7.281614543321269e-06, |
|
"loss": 1.3404, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 6.030737921409169e-06, |
|
"loss": 1.2886, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 4.8943483704846475e-06, |
|
"loss": 1.3172, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 3.873830406168111e-06, |
|
"loss": 1.2714, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 2.970427372400353e-06, |
|
"loss": 1.2419, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 2.1852399266194314e-06, |
|
"loss": 1.2816, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 1.5192246987791981e-06, |
|
"loss": 1.252, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 9.731931258429638e-07, |
|
"loss": 1.2246, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 5.478104631726711e-07, |
|
"loss": 1.2728, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 2.4359497401758024e-07, |
|
"loss": 1.2357, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 6.09172980904238e-08, |
|
"loss": 1.2624, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0, |
|
"loss": 1.2922, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 1.249782919883728, |
|
"eval_runtime": 1717.6564, |
|
"eval_samples_per_second": 16.48, |
|
"eval_steps_per_second": 1.03, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"step": 500, |
|
"total_flos": 1123799500062720.0, |
|
"train_loss": 1.3271520497798919, |
|
"train_runtime": 4460.9425, |
|
"train_samples_per_second": 3.587, |
|
"train_steps_per_second": 0.112 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"total_flos": 1123799500062720.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|