{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.174311926605505, "eval_steps": 1000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11467889908256881, "grad_norm": 4.947055339813232, "learning_rate": 0.000199079754601227, "loss": 4.2757, "step": 25 }, { "epoch": 0.22935779816513763, "grad_norm": 3.0868945121765137, "learning_rate": 0.00019754601226993864, "loss": 2.4215, "step": 50 }, { "epoch": 0.3440366972477064, "grad_norm": 3.1504604816436768, "learning_rate": 0.0001960122699386503, "loss": 1.5603, "step": 75 }, { "epoch": 0.45871559633027525, "grad_norm": 5.1320624351501465, "learning_rate": 0.00019447852760736198, "loss": 1.5129, "step": 100 }, { "epoch": 0.573394495412844, "grad_norm": 3.3029208183288574, "learning_rate": 0.00019294478527607362, "loss": 1.4394, "step": 125 }, { "epoch": 0.6880733944954128, "grad_norm": 4.884591102600098, "learning_rate": 0.0001914110429447853, "loss": 1.3357, "step": 150 }, { "epoch": 0.8027522935779816, "grad_norm": 2.960696220397949, "learning_rate": 0.00018987730061349693, "loss": 1.3536, "step": 175 }, { "epoch": 0.9174311926605505, "grad_norm": 3.2423105239868164, "learning_rate": 0.0001883435582822086, "loss": 1.3502, "step": 200 }, { "epoch": 1.0321100917431192, "grad_norm": 3.236219644546509, "learning_rate": 0.00018680981595092027, "loss": 1.2705, "step": 225 }, { "epoch": 1.146788990825688, "grad_norm": 6.417654514312744, "learning_rate": 0.0001852760736196319, "loss": 1.1982, "step": 250 }, { "epoch": 1.261467889908257, "grad_norm": 2.224224805831909, "learning_rate": 0.00018374233128834358, "loss": 0.9572, "step": 275 }, { "epoch": 1.3761467889908257, "grad_norm": 2.035285711288452, "learning_rate": 0.00018220858895705522, "loss": 1.1071, "step": 300 }, { "epoch": 1.4908256880733946, "grad_norm": 3.7886247634887695, "learning_rate": 0.0001806748466257669, "loss": 1.0682, "step": 325 }, { "epoch": 1.6055045871559632, "grad_norm": 5.813726902008057, "learning_rate": 0.00017914110429447856, "loss": 0.9436, "step": 350 }, { "epoch": 1.7201834862385321, "grad_norm": 7.5725178718566895, "learning_rate": 0.0001776073619631902, "loss": 1.0397, "step": 375 }, { "epoch": 1.834862385321101, "grad_norm": 3.6317062377929688, "learning_rate": 0.00017607361963190187, "loss": 1.1421, "step": 400 }, { "epoch": 1.9495412844036697, "grad_norm": 2.53063702583313, "learning_rate": 0.0001745398773006135, "loss": 1.1339, "step": 425 }, { "epoch": 2.0642201834862384, "grad_norm": 6.015753269195557, "learning_rate": 0.00017300613496932518, "loss": 0.8271, "step": 450 }, { "epoch": 2.1788990825688073, "grad_norm": 2.1599044799804688, "learning_rate": 0.00017147239263803682, "loss": 0.8183, "step": 475 }, { "epoch": 2.293577981651376, "grad_norm": 3.4313087463378906, "learning_rate": 0.00016993865030674846, "loss": 0.7794, "step": 500 }, { "epoch": 2.408256880733945, "grad_norm": 5.856240272521973, "learning_rate": 0.00016840490797546013, "loss": 0.8029, "step": 525 }, { "epoch": 2.522935779816514, "grad_norm": 3.658007860183716, "learning_rate": 0.00016687116564417177, "loss": 0.7877, "step": 550 }, { "epoch": 2.6376146788990824, "grad_norm": 4.434767723083496, "learning_rate": 0.00016533742331288344, "loss": 0.7313, "step": 575 }, { "epoch": 2.7522935779816513, "grad_norm": 5.971595287322998, "learning_rate": 0.0001638036809815951, "loss": 0.6757, "step": 600 }, { "epoch": 2.86697247706422, "grad_norm": 3.449643850326538, "learning_rate": 0.00016226993865030675, "loss": 0.8652, "step": 625 }, { "epoch": 2.981651376146789, "grad_norm": 3.473947763442993, "learning_rate": 0.00016073619631901842, "loss": 0.8634, "step": 650 }, { "epoch": 3.096330275229358, "grad_norm": 2.539017677307129, "learning_rate": 0.00015920245398773006, "loss": 0.6115, "step": 675 }, { "epoch": 3.2110091743119265, "grad_norm": 1.9025028944015503, "learning_rate": 0.00015766871165644173, "loss": 0.6218, "step": 700 }, { "epoch": 3.3256880733944953, "grad_norm": 5.206181526184082, "learning_rate": 0.0001561349693251534, "loss": 0.6527, "step": 725 }, { "epoch": 3.4403669724770642, "grad_norm": 7.43388032913208, "learning_rate": 0.00015460122699386504, "loss": 0.5914, "step": 750 }, { "epoch": 3.555045871559633, "grad_norm": 2.8263156414031982, "learning_rate": 0.0001530674846625767, "loss": 0.6598, "step": 775 }, { "epoch": 3.669724770642202, "grad_norm": 4.634042263031006, "learning_rate": 0.00015153374233128835, "loss": 0.6755, "step": 800 }, { "epoch": 3.7844036697247705, "grad_norm": 9.874078750610352, "learning_rate": 0.00015000000000000001, "loss": 0.6494, "step": 825 }, { "epoch": 3.8990825688073394, "grad_norm": 6.18958854675293, "learning_rate": 0.00014846625766871168, "loss": 0.6396, "step": 850 }, { "epoch": 4.013761467889908, "grad_norm": 5.176502227783203, "learning_rate": 0.00014693251533742332, "loss": 0.6837, "step": 875 }, { "epoch": 4.128440366972477, "grad_norm": 5.866634368896484, "learning_rate": 0.000145398773006135, "loss": 0.5174, "step": 900 }, { "epoch": 4.243119266055046, "grad_norm": 5.940659999847412, "learning_rate": 0.00014386503067484663, "loss": 0.5614, "step": 925 }, { "epoch": 4.3577981651376145, "grad_norm": 2.2365148067474365, "learning_rate": 0.00014233128834355828, "loss": 0.5525, "step": 950 }, { "epoch": 4.472477064220183, "grad_norm": 2.6536717414855957, "learning_rate": 0.00014079754601226994, "loss": 0.6028, "step": 975 }, { "epoch": 4.587155963302752, "grad_norm": 6.802552223205566, "learning_rate": 0.00013926380368098159, "loss": 0.5658, "step": 1000 }, { "epoch": 4.587155963302752, "eval_loss": 0.46536171436309814, "eval_runtime": 12.9671, "eval_samples_per_second": 16.812, "eval_steps_per_second": 2.159, "step": 1000 }, { "epoch": 4.701834862385321, "grad_norm": 5.150993347167969, "learning_rate": 0.00013773006134969325, "loss": 0.5522, "step": 1025 }, { "epoch": 4.81651376146789, "grad_norm": 5.858363151550293, "learning_rate": 0.0001361963190184049, "loss": 0.4839, "step": 1050 }, { "epoch": 4.931192660550459, "grad_norm": 3.4022738933563232, "learning_rate": 0.00013466257668711656, "loss": 0.597, "step": 1075 }, { "epoch": 5.045871559633028, "grad_norm": 3.6270384788513184, "learning_rate": 0.00013312883435582823, "loss": 0.5217, "step": 1100 }, { "epoch": 5.160550458715596, "grad_norm": 2.8926446437835693, "learning_rate": 0.00013159509202453987, "loss": 0.5683, "step": 1125 }, { "epoch": 5.275229357798165, "grad_norm": 2.598726511001587, "learning_rate": 0.00013006134969325154, "loss": 0.4556, "step": 1150 }, { "epoch": 5.389908256880734, "grad_norm": 5.851846694946289, "learning_rate": 0.00012852760736196318, "loss": 0.4656, "step": 1175 }, { "epoch": 5.504587155963303, "grad_norm": 1.5751358270645142, "learning_rate": 0.00012699386503067485, "loss": 0.5022, "step": 1200 }, { "epoch": 5.6192660550458715, "grad_norm": 3.152641534805298, "learning_rate": 0.00012546012269938652, "loss": 0.4727, "step": 1225 }, { "epoch": 5.73394495412844, "grad_norm": 2.1421804428100586, "learning_rate": 0.00012392638036809816, "loss": 0.556, "step": 1250 }, { "epoch": 5.848623853211009, "grad_norm": 2.27579665184021, "learning_rate": 0.00012239263803680983, "loss": 0.5588, "step": 1275 }, { "epoch": 5.963302752293578, "grad_norm": 7.381156921386719, "learning_rate": 0.00012085889570552147, "loss": 0.4679, "step": 1300 }, { "epoch": 6.077981651376147, "grad_norm": 1.3406894207000732, "learning_rate": 0.00011932515337423313, "loss": 0.4235, "step": 1325 }, { "epoch": 6.192660550458716, "grad_norm": 3.048557758331299, "learning_rate": 0.0001177914110429448, "loss": 0.4295, "step": 1350 }, { "epoch": 6.307339449541284, "grad_norm": 5.965505123138428, "learning_rate": 0.00011625766871165644, "loss": 0.4265, "step": 1375 }, { "epoch": 6.422018348623853, "grad_norm": 3.253556728363037, "learning_rate": 0.0001147239263803681, "loss": 0.4953, "step": 1400 }, { "epoch": 6.536697247706422, "grad_norm": 4.381213665008545, "learning_rate": 0.00011319018404907975, "loss": 0.4989, "step": 1425 }, { "epoch": 6.651376146788991, "grad_norm": 5.1543803215026855, "learning_rate": 0.00011165644171779142, "loss": 0.4478, "step": 1450 }, { "epoch": 6.76605504587156, "grad_norm": 4.282744407653809, "learning_rate": 0.00011012269938650308, "loss": 0.5269, "step": 1475 }, { "epoch": 6.8807339449541285, "grad_norm": 1.405004858970642, "learning_rate": 0.00010858895705521473, "loss": 0.4448, "step": 1500 }, { "epoch": 6.995412844036697, "grad_norm": 3.463747024536133, "learning_rate": 0.0001070552147239264, "loss": 0.4897, "step": 1525 }, { "epoch": 7.110091743119266, "grad_norm": 2.876891851425171, "learning_rate": 0.00010552147239263804, "loss": 0.4073, "step": 1550 }, { "epoch": 7.224770642201835, "grad_norm": 1.9896137714385986, "learning_rate": 0.00010398773006134969, "loss": 0.4214, "step": 1575 }, { "epoch": 7.339449541284404, "grad_norm": 5.495143890380859, "learning_rate": 0.00010245398773006136, "loss": 0.4318, "step": 1600 }, { "epoch": 7.454128440366972, "grad_norm": 2.7708942890167236, "learning_rate": 0.000100920245398773, "loss": 0.4374, "step": 1625 }, { "epoch": 7.568807339449541, "grad_norm": 2.327313184738159, "learning_rate": 9.938650306748467e-05, "loss": 0.4393, "step": 1650 }, { "epoch": 7.68348623853211, "grad_norm": 2.895890951156616, "learning_rate": 9.785276073619632e-05, "loss": 0.457, "step": 1675 }, { "epoch": 7.798165137614679, "grad_norm": 3.2652461528778076, "learning_rate": 9.631901840490798e-05, "loss": 0.4833, "step": 1700 }, { "epoch": 7.912844036697248, "grad_norm": 2.947260618209839, "learning_rate": 9.478527607361963e-05, "loss": 0.4142, "step": 1725 }, { "epoch": 8.027522935779816, "grad_norm": 1.9098671674728394, "learning_rate": 9.325153374233129e-05, "loss": 0.4309, "step": 1750 }, { "epoch": 8.142201834862385, "grad_norm": 4.037384033203125, "learning_rate": 9.171779141104296e-05, "loss": 0.4019, "step": 1775 }, { "epoch": 8.256880733944953, "grad_norm": 3.7623095512390137, "learning_rate": 9.018404907975461e-05, "loss": 0.3649, "step": 1800 }, { "epoch": 8.371559633027523, "grad_norm": 2.049928903579712, "learning_rate": 8.865030674846625e-05, "loss": 0.3905, "step": 1825 }, { "epoch": 8.486238532110091, "grad_norm": 6.4500017166137695, "learning_rate": 8.711656441717791e-05, "loss": 0.4493, "step": 1850 }, { "epoch": 8.600917431192661, "grad_norm": 1.4894698858261108, "learning_rate": 8.558282208588958e-05, "loss": 0.4416, "step": 1875 }, { "epoch": 8.715596330275229, "grad_norm": 3.064404010772705, "learning_rate": 8.404907975460123e-05, "loss": 0.395, "step": 1900 }, { "epoch": 8.830275229357799, "grad_norm": 4.572951316833496, "learning_rate": 8.251533742331289e-05, "loss": 0.4606, "step": 1925 }, { "epoch": 8.944954128440367, "grad_norm": 3.9111504554748535, "learning_rate": 8.098159509202454e-05, "loss": 0.4291, "step": 1950 }, { "epoch": 9.059633027522937, "grad_norm": 2.4905645847320557, "learning_rate": 7.94478527607362e-05, "loss": 0.3797, "step": 1975 }, { "epoch": 9.174311926605505, "grad_norm": 2.3467962741851807, "learning_rate": 7.791411042944787e-05, "loss": 0.4023, "step": 2000 }, { "epoch": 9.174311926605505, "eval_loss": 0.3534790277481079, "eval_runtime": 12.9976, "eval_samples_per_second": 16.772, "eval_steps_per_second": 2.154, "step": 2000 } ], "logging_steps": 25, "max_steps": 3270, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "total_flos": 1448045812162560.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }