{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 536, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018656716417910447, "grad_norm": 2.57972321339612, "learning_rate": 1.8518518518518518e-07, "loss": 2.092, "step": 1 }, { "epoch": 0.009328358208955223, "grad_norm": 2.3735119503905793, "learning_rate": 9.259259259259259e-07, "loss": 2.0863, "step": 5 }, { "epoch": 0.018656716417910446, "grad_norm": 2.569828354184601, "learning_rate": 1.8518518518518519e-06, "loss": 2.0955, "step": 10 }, { "epoch": 0.027985074626865673, "grad_norm": 2.3365878728458847, "learning_rate": 2.7777777777777783e-06, "loss": 2.078, "step": 15 }, { "epoch": 0.03731343283582089, "grad_norm": 2.258292749629048, "learning_rate": 3.7037037037037037e-06, "loss": 2.0535, "step": 20 }, { "epoch": 0.04664179104477612, "grad_norm": 2.046409624248939, "learning_rate": 4.62962962962963e-06, "loss": 1.9936, "step": 25 }, { "epoch": 0.055970149253731345, "grad_norm": 1.7763941617915107, "learning_rate": 5.555555555555557e-06, "loss": 1.9172, "step": 30 }, { "epoch": 0.06529850746268656, "grad_norm": 1.5289482279435747, "learning_rate": 6.481481481481482e-06, "loss": 1.8216, "step": 35 }, { "epoch": 0.07462686567164178, "grad_norm": 1.3374159627357958, "learning_rate": 7.4074074074074075e-06, "loss": 1.7237, "step": 40 }, { "epoch": 0.08395522388059702, "grad_norm": 1.038988897758925, "learning_rate": 8.333333333333334e-06, "loss": 1.6205, "step": 45 }, { "epoch": 0.09328358208955224, "grad_norm": 0.5713243982808286, "learning_rate": 9.25925925925926e-06, "loss": 1.5364, "step": 50 }, { "epoch": 0.10261194029850747, "grad_norm": 0.3049992132098635, "learning_rate": 9.999893795201304e-06, "loss": 1.491, "step": 55 }, { "epoch": 0.11194029850746269, "grad_norm": 0.36148501744582134, "learning_rate": 9.996177100962714e-06, "loss": 1.473, "step": 60 }, { "epoch": 0.12126865671641791, "grad_norm": 0.37327333184682066, "learning_rate": 9.987154677711482e-06, "loss": 1.4594, "step": 65 }, { "epoch": 0.13059701492537312, "grad_norm": 0.31133363324217617, "learning_rate": 9.972836106879936e-06, "loss": 1.4442, "step": 70 }, { "epoch": 0.13992537313432835, "grad_norm": 0.2375327029162037, "learning_rate": 9.953236594185396e-06, "loss": 1.4258, "step": 75 }, { "epoch": 0.14925373134328357, "grad_norm": 0.19998776915472055, "learning_rate": 9.928376953482343e-06, "loss": 1.4093, "step": 80 }, { "epoch": 0.15858208955223882, "grad_norm": 0.19069793139067803, "learning_rate": 9.898283584658988e-06, "loss": 1.4029, "step": 85 }, { "epoch": 0.16791044776119404, "grad_norm": 0.17979433709822593, "learning_rate": 9.86298844560169e-06, "loss": 1.3876, "step": 90 }, { "epoch": 0.17723880597014927, "grad_norm": 0.1750060584120198, "learning_rate": 9.822529018257049e-06, "loss": 1.3842, "step": 95 }, { "epoch": 0.1865671641791045, "grad_norm": 0.17054790759610952, "learning_rate": 9.776948268827658e-06, "loss": 1.3756, "step": 100 }, { "epoch": 0.1958955223880597, "grad_norm": 0.17397761641849066, "learning_rate": 9.726294602143807e-06, "loss": 1.3544, "step": 105 }, { "epoch": 0.20522388059701493, "grad_norm": 0.17494592787959154, "learning_rate": 9.670621810259596e-06, "loss": 1.338, "step": 110 }, { "epoch": 0.21455223880597016, "grad_norm": 0.17874287198232588, "learning_rate": 9.609989015328052e-06, "loss": 1.3321, "step": 115 }, { "epoch": 0.22388059701492538, "grad_norm": 0.18177787677872814, "learning_rate": 
9.544460606815901e-06, "loss": 1.3231, "step": 120 }, { "epoch": 0.2332089552238806, "grad_norm": 0.17485139906929326, "learning_rate": 9.474106173124667e-06, "loss": 1.3029, "step": 125 }, { "epoch": 0.24253731343283583, "grad_norm": 0.16541538036070813, "learning_rate": 9.399000427690736e-06, "loss": 1.2914, "step": 130 }, { "epoch": 0.251865671641791, "grad_norm": 0.15475993597407106, "learning_rate": 9.31922312964284e-06, "loss": 1.2731, "step": 135 }, { "epoch": 0.26119402985074625, "grad_norm": 0.14876033685341322, "learning_rate": 9.234858999101232e-06, "loss": 1.2612, "step": 140 }, { "epoch": 0.27052238805970147, "grad_norm": 0.13102260537341828, "learning_rate": 9.1459976272085e-06, "loss": 1.2406, "step": 145 }, { "epoch": 0.2798507462686567, "grad_norm": 0.1231370916125911, "learning_rate": 9.052733380987555e-06, "loss": 1.2402, "step": 150 }, { "epoch": 0.2891791044776119, "grad_norm": 0.11617076992843688, "learning_rate": 8.955165303127841e-06, "loss": 1.2287, "step": 155 }, { "epoch": 0.29850746268656714, "grad_norm": 0.10711292754756604, "learning_rate": 8.853397006806183e-06, "loss": 1.2293, "step": 160 }, { "epoch": 0.30783582089552236, "grad_norm": 0.10583142709305077, "learning_rate": 8.747536565653966e-06, "loss": 1.2196, "step": 165 }, { "epoch": 0.31716417910447764, "grad_norm": 0.10071371091495551, "learning_rate": 8.637696398987517e-06, "loss": 1.2218, "step": 170 }, { "epoch": 0.32649253731343286, "grad_norm": 0.09786904166381197, "learning_rate": 8.523993152423522e-06, "loss": 1.2076, "step": 175 }, { "epoch": 0.3358208955223881, "grad_norm": 0.09308216883502402, "learning_rate": 8.406547574006326e-06, "loss": 1.2069, "step": 180 }, { "epoch": 0.3451492537313433, "grad_norm": 0.09403316911201794, "learning_rate": 8.285484385978598e-06, "loss": 1.2151, "step": 185 }, { "epoch": 0.35447761194029853, "grad_norm": 0.09276385606464647, "learning_rate": 8.160932152331587e-06, "loss": 1.2033, "step": 190 }, { "epoch": 0.36380597014925375, "grad_norm": 0.09044032546865181, "learning_rate": 8.03302314227559e-06, "loss": 1.2028, "step": 195 }, { "epoch": 0.373134328358209, "grad_norm": 0.08960581233300505, "learning_rate": 7.90189318977564e-06, "loss": 1.2036, "step": 200 }, { "epoch": 0.3824626865671642, "grad_norm": 0.08546838672930177, "learning_rate": 7.767681549301576e-06, "loss": 1.1932, "step": 205 }, { "epoch": 0.3917910447761194, "grad_norm": 0.08706831337193889, "learning_rate": 7.630530747945672e-06, "loss": 1.2016, "step": 210 }, { "epoch": 0.40111940298507465, "grad_norm": 0.08501828118277771, "learning_rate": 7.490586434064893e-06, "loss": 1.1984, "step": 215 }, { "epoch": 0.41044776119402987, "grad_norm": 0.08231749758944934, "learning_rate": 7.3479972226084925e-06, "loss": 1.1934, "step": 220 }, { "epoch": 0.4197761194029851, "grad_norm": 0.0846325939740857, "learning_rate": 7.202914537295211e-06, "loss": 1.1871, "step": 225 }, { "epoch": 0.4291044776119403, "grad_norm": 0.0839943062987978, "learning_rate": 7.055492449807684e-06, "loss": 1.1847, "step": 230 }, { "epoch": 0.43843283582089554, "grad_norm": 0.0842974194320567, "learning_rate": 6.905887516174827e-06, "loss": 1.1823, "step": 235 }, { "epoch": 0.44776119402985076, "grad_norm": 0.08246202494295994, "learning_rate": 6.754258610515949e-06, "loss": 1.1874, "step": 240 }, { "epoch": 0.457089552238806, "grad_norm": 0.08059952388104133, "learning_rate": 6.60076675632314e-06, "loss": 1.1768, "step": 245 }, { "epoch": 0.4664179104477612, "grad_norm": 0.08278287209635887, "learning_rate": 
6.445574955461134e-06, "loss": 1.1743, "step": 250 }, { "epoch": 0.47574626865671643, "grad_norm": 0.08064613485931339, "learning_rate": 6.288848015066211e-06, "loss": 1.1787, "step": 255 }, { "epoch": 0.48507462686567165, "grad_norm": 0.08402784054638568, "learning_rate": 6.130752372527981e-06, "loss": 1.1797, "step": 260 }, { "epoch": 0.4944029850746269, "grad_norm": 0.07813087594533498, "learning_rate": 5.9714559187399094e-06, "loss": 1.1814, "step": 265 }, { "epoch": 0.503731343283582, "grad_norm": 0.08253282031035683, "learning_rate": 5.811127819806277e-06, "loss": 1.1767, "step": 270 }, { "epoch": 0.5130597014925373, "grad_norm": 0.0795025665014561, "learning_rate": 5.649938337394932e-06, "loss": 1.1684, "step": 275 }, { "epoch": 0.5223880597014925, "grad_norm": 0.08201269451158993, "learning_rate": 5.4880586479265774e-06, "loss": 1.1704, "step": 280 }, { "epoch": 0.5317164179104478, "grad_norm": 0.08039664736174637, "learning_rate": 5.325660660792657e-06, "loss": 1.1704, "step": 285 }, { "epoch": 0.5410447761194029, "grad_norm": 0.08098088631592301, "learning_rate": 5.162916835794843e-06, "loss": 1.1722, "step": 290 }, { "epoch": 0.5503731343283582, "grad_norm": 0.08014102460089276, "learning_rate": 5e-06, "loss": 1.1748, "step": 295 }, { "epoch": 0.5597014925373134, "grad_norm": 0.08067154136929545, "learning_rate": 4.837083164205159e-06, "loss": 1.1741, "step": 300 }, { "epoch": 0.5690298507462687, "grad_norm": 0.07702944221391651, "learning_rate": 4.6743393392073435e-06, "loss": 1.1734, "step": 305 }, { "epoch": 0.5783582089552238, "grad_norm": 0.08366251841352572, "learning_rate": 4.511941352073424e-06, "loss": 1.1674, "step": 310 }, { "epoch": 0.5876865671641791, "grad_norm": 0.08401515307564614, "learning_rate": 4.3500616626050705e-06, "loss": 1.171, "step": 315 }, { "epoch": 0.5970149253731343, "grad_norm": 0.07880234388575032, "learning_rate": 4.188872180193723e-06, "loss": 1.1617, "step": 320 }, { "epoch": 0.6063432835820896, "grad_norm": 0.08075724129202204, "learning_rate": 4.028544081260093e-06, "loss": 1.1664, "step": 325 }, { "epoch": 0.6156716417910447, "grad_norm": 0.08145891878774718, "learning_rate": 3.869247627472021e-06, "loss": 1.1647, "step": 330 }, { "epoch": 0.625, "grad_norm": 0.07981030112455705, "learning_rate": 3.7111519849337908e-06, "loss": 1.1661, "step": 335 }, { "epoch": 0.6343283582089553, "grad_norm": 0.07719220748480114, "learning_rate": 3.554425044538868e-06, "loss": 1.164, "step": 340 }, { "epoch": 0.6436567164179104, "grad_norm": 0.07965079201392798, "learning_rate": 3.3992332436768615e-06, "loss": 1.1685, "step": 345 }, { "epoch": 0.6529850746268657, "grad_norm": 0.07929508580781615, "learning_rate": 3.2457413894840516e-06, "loss": 1.166, "step": 350 }, { "epoch": 0.6623134328358209, "grad_norm": 0.08260668169292941, "learning_rate": 3.0941124838251734e-06, "loss": 1.1641, "step": 355 }, { "epoch": 0.6716417910447762, "grad_norm": 0.07880705452928782, "learning_rate": 2.944507550192318e-06, "loss": 1.1697, "step": 360 }, { "epoch": 0.6809701492537313, "grad_norm": 0.08617336475255008, "learning_rate": 2.7970854627047893e-06, "loss": 1.1617, "step": 365 }, { "epoch": 0.6902985074626866, "grad_norm": 0.08504447838669933, "learning_rate": 2.6520027773915075e-06, "loss": 1.1694, "step": 370 }, { "epoch": 0.6996268656716418, "grad_norm": 0.08177432381677913, "learning_rate": 2.509413565935107e-06, "loss": 1.1663, "step": 375 }, { "epoch": 0.7089552238805971, "grad_norm": 0.08035338660262055, "learning_rate": 2.3694692520543293e-06, "loss": 1.1617, 
"step": 380 }, { "epoch": 0.7182835820895522, "grad_norm": 0.08099302263307083, "learning_rate": 2.2323184506984257e-06, "loss": 1.1601, "step": 385 }, { "epoch": 0.7276119402985075, "grad_norm": 0.0788199344055389, "learning_rate": 2.098106810224362e-06, "loss": 1.1678, "step": 390 }, { "epoch": 0.7369402985074627, "grad_norm": 0.0810220420611739, "learning_rate": 1.9669768577244107e-06, "loss": 1.1659, "step": 395 }, { "epoch": 0.746268656716418, "grad_norm": 0.07813643594984888, "learning_rate": 1.8390678476684143e-06, "loss": 1.1619, "step": 400 }, { "epoch": 0.7555970149253731, "grad_norm": 0.07787996158898178, "learning_rate": 1.7145156140214032e-06, "loss": 1.1647, "step": 405 }, { "epoch": 0.7649253731343284, "grad_norm": 0.08149237459503167, "learning_rate": 1.5934524259936757e-06, "loss": 1.1663, "step": 410 }, { "epoch": 0.7742537313432836, "grad_norm": 0.08133121141847001, "learning_rate": 1.4760068475764789e-06, "loss": 1.1481, "step": 415 }, { "epoch": 0.7835820895522388, "grad_norm": 0.07796965232583761, "learning_rate": 1.3623036010124845e-06, "loss": 1.1592, "step": 420 }, { "epoch": 0.792910447761194, "grad_norm": 0.07862074938919475, "learning_rate": 1.2524634343460335e-06, "loss": 1.1537, "step": 425 }, { "epoch": 0.8022388059701493, "grad_norm": 0.08047006906879658, "learning_rate": 1.1466029931938182e-06, "loss": 1.1575, "step": 430 }, { "epoch": 0.8115671641791045, "grad_norm": 0.07919282621278656, "learning_rate": 1.0448346968721596e-06, "loss": 1.1591, "step": 435 }, { "epoch": 0.8208955223880597, "grad_norm": 0.08069925871151505, "learning_rate": 9.472666190124457e-07, "loss": 1.1588, "step": 440 }, { "epoch": 0.8302238805970149, "grad_norm": 0.07925111301368296, "learning_rate": 8.540023727915015e-07, "loss": 1.1552, "step": 445 }, { "epoch": 0.8395522388059702, "grad_norm": 0.08107260891150957, "learning_rate": 7.651410008987698e-07, "loss": 1.1568, "step": 450 }, { "epoch": 0.8488805970149254, "grad_norm": 0.07855238487611285, "learning_rate": 6.807768703571616e-07, "loss": 1.1498, "step": 455 }, { "epoch": 0.8582089552238806, "grad_norm": 0.08067373717155321, "learning_rate": 6.009995723092655e-07, "loss": 1.163, "step": 460 }, { "epoch": 0.8675373134328358, "grad_norm": 0.083123312332487, "learning_rate": 5.258938268753344e-07, "loss": 1.155, "step": 465 }, { "epoch": 0.8768656716417911, "grad_norm": 0.07901297859441447, "learning_rate": 4.555393931841001e-07, "loss": 1.1615, "step": 470 }, { "epoch": 0.8861940298507462, "grad_norm": 0.07759270476319681, "learning_rate": 3.9001098467194907e-07, "loss": 1.1575, "step": 475 }, { "epoch": 0.8955223880597015, "grad_norm": 0.07822412070732043, "learning_rate": 3.2937818974040637e-07, "loss": 1.1597, "step": 480 }, { "epoch": 0.9048507462686567, "grad_norm": 0.0799398408165139, "learning_rate": 2.737053978561943e-07, "loss": 1.1602, "step": 485 }, { "epoch": 0.914179104477612, "grad_norm": 0.07777635865631954, "learning_rate": 2.2305173117234236e-07, "loss": 1.1568, "step": 490 }, { "epoch": 0.9235074626865671, "grad_norm": 0.08036060154131985, "learning_rate": 1.7747098174295208e-07, "loss": 1.1652, "step": 495 }, { "epoch": 0.9328358208955224, "grad_norm": 0.08306752546345667, "learning_rate": 1.3701155439831249e-07, "loss": 1.1558, "step": 500 }, { "epoch": 0.9421641791044776, "grad_norm": 0.07992649084819468, "learning_rate": 1.017164153410144e-07, "loss": 1.1566, "step": 505 }, { "epoch": 0.9514925373134329, "grad_norm": 0.07911970141612473, "learning_rate": 7.16230465176565e-08, "loss": 1.1559, "step": 510 }, 
{ "epoch": 0.960820895522388, "grad_norm": 0.0846383351709255, "learning_rate": 4.6763405814604926e-08, "loss": 1.1514, "step": 515 }, { "epoch": 0.9701492537313433, "grad_norm": 0.07934050493191654, "learning_rate": 2.7163893120066288e-08, "loss": 1.1542, "step": 520 }, { "epoch": 0.9794776119402985, "grad_norm": 0.08167448133397126, "learning_rate": 1.284532228851998e-08, "loss": 1.161, "step": 525 }, { "epoch": 0.9888059701492538, "grad_norm": 0.07857080161491098, "learning_rate": 3.822899037286276e-09, "loss": 1.1579, "step": 530 }, { "epoch": 0.9981343283582089, "grad_norm": 0.07847093836951612, "learning_rate": 1.0620479869771772e-10, "loss": 1.1582, "step": 535 }, { "epoch": 1.0, "eval_runtime": 3.2225, "eval_samples_per_second": 3.103, "eval_steps_per_second": 0.931, "step": 536 }, { "epoch": 1.0, "step": 536, "total_flos": 555957746663424.0, "train_loss": 1.2752919873194908, "train_runtime": 16895.3411, "train_samples_per_second": 2.029, "train_steps_per_second": 0.032 } ], "logging_steps": 5, "max_steps": 536, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 555957746663424.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }