{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.174311926605505,
  "eval_steps": 1000,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11467889908256881,
      "grad_norm": 4.947055339813232,
      "learning_rate": 0.000199079754601227,
      "loss": 4.2757,
      "step": 25
    },
    {
      "epoch": 0.22935779816513763,
      "grad_norm": 3.0868945121765137,
      "learning_rate": 0.00019754601226993864,
      "loss": 2.4215,
      "step": 50
    },
    {
      "epoch": 0.3440366972477064,
      "grad_norm": 3.1504604816436768,
      "learning_rate": 0.0001960122699386503,
      "loss": 1.5603,
      "step": 75
    },
    {
      "epoch": 0.45871559633027525,
      "grad_norm": 5.1320624351501465,
      "learning_rate": 0.00019447852760736198,
      "loss": 1.5129,
      "step": 100
    },
    {
      "epoch": 0.573394495412844,
      "grad_norm": 3.3029208183288574,
      "learning_rate": 0.00019294478527607362,
      "loss": 1.4394,
      "step": 125
    },
    {
      "epoch": 0.6880733944954128,
      "grad_norm": 4.884591102600098,
      "learning_rate": 0.0001914110429447853,
      "loss": 1.3357,
      "step": 150
    },
    {
      "epoch": 0.8027522935779816,
      "grad_norm": 2.960696220397949,
      "learning_rate": 0.00018987730061349693,
      "loss": 1.3536,
      "step": 175
    },
    {
      "epoch": 0.9174311926605505,
      "grad_norm": 3.2423105239868164,
      "learning_rate": 0.0001883435582822086,
      "loss": 1.3502,
      "step": 200
    },
    {
      "epoch": 1.0321100917431192,
      "grad_norm": 3.236219644546509,
      "learning_rate": 0.00018680981595092027,
      "loss": 1.2705,
      "step": 225
    },
    {
      "epoch": 1.146788990825688,
      "grad_norm": 6.417654514312744,
      "learning_rate": 0.0001852760736196319,
      "loss": 1.1982,
      "step": 250
    },
    {
      "epoch": 1.261467889908257,
      "grad_norm": 2.224224805831909,
      "learning_rate": 0.00018374233128834358,
      "loss": 0.9572,
      "step": 275
    },
    {
      "epoch": 1.3761467889908257,
      "grad_norm": 2.035285711288452,
      "learning_rate": 0.00018220858895705522,
      "loss": 1.1071,
      "step": 300
    },
    {
      "epoch": 1.4908256880733946,
      "grad_norm": 3.7886247634887695,
      "learning_rate": 0.0001806748466257669,
      "loss": 1.0682,
      "step": 325
    },
    {
      "epoch": 1.6055045871559632,
      "grad_norm": 5.813726902008057,
      "learning_rate": 0.00017914110429447856,
      "loss": 0.9436,
      "step": 350
    },
    {
      "epoch": 1.7201834862385321,
      "grad_norm": 7.5725178718566895,
      "learning_rate": 0.0001776073619631902,
      "loss": 1.0397,
      "step": 375
    },
    {
      "epoch": 1.834862385321101,
      "grad_norm": 3.6317062377929688,
      "learning_rate": 0.00017607361963190187,
      "loss": 1.1421,
      "step": 400
    },
    {
      "epoch": 1.9495412844036697,
      "grad_norm": 2.53063702583313,
      "learning_rate": 0.0001745398773006135,
      "loss": 1.1339,
      "step": 425
    },
    {
      "epoch": 2.0642201834862384,
      "grad_norm": 6.015753269195557,
      "learning_rate": 0.00017300613496932518,
      "loss": 0.8271,
      "step": 450
    },
    {
      "epoch": 2.1788990825688073,
      "grad_norm": 2.1599044799804688,
      "learning_rate": 0.00017147239263803682,
      "loss": 0.8183,
      "step": 475
    },
    {
      "epoch": 2.293577981651376,
      "grad_norm": 3.4313087463378906,
      "learning_rate": 0.00016993865030674846,
      "loss": 0.7794,
      "step": 500
    },
    {
      "epoch": 2.408256880733945,
      "grad_norm": 5.856240272521973,
      "learning_rate": 0.00016840490797546013,
      "loss": 0.8029,
      "step": 525
    },
    {
      "epoch": 2.522935779816514,
      "grad_norm": 3.658007860183716,
      "learning_rate": 0.00016687116564417177,
      "loss": 0.7877,
      "step": 550
    },
    {
      "epoch": 2.6376146788990824,
      "grad_norm": 4.434767723083496,
      "learning_rate": 0.00016533742331288344,
      "loss": 0.7313,
      "step": 575
    },
    {
      "epoch": 2.7522935779816513,
      "grad_norm": 5.971595287322998,
      "learning_rate": 0.0001638036809815951,
      "loss": 0.6757,
      "step": 600
    },
    {
      "epoch": 2.86697247706422,
      "grad_norm": 3.449643850326538,
      "learning_rate": 0.00016226993865030675,
      "loss": 0.8652,
      "step": 625
    },
    {
      "epoch": 2.981651376146789,
      "grad_norm": 3.473947763442993,
      "learning_rate": 0.00016073619631901842,
      "loss": 0.8634,
      "step": 650
    },
    {
      "epoch": 3.096330275229358,
      "grad_norm": 2.539017677307129,
      "learning_rate": 0.00015920245398773006,
      "loss": 0.6115,
      "step": 675
    },
    {
      "epoch": 3.2110091743119265,
      "grad_norm": 1.9025028944015503,
      "learning_rate": 0.00015766871165644173,
      "loss": 0.6218,
      "step": 700
    },
    {
      "epoch": 3.3256880733944953,
      "grad_norm": 5.206181526184082,
      "learning_rate": 0.0001561349693251534,
      "loss": 0.6527,
      "step": 725
    },
    {
      "epoch": 3.4403669724770642,
      "grad_norm": 7.43388032913208,
      "learning_rate": 0.00015460122699386504,
      "loss": 0.5914,
      "step": 750
    },
    {
      "epoch": 3.555045871559633,
      "grad_norm": 2.8263156414031982,
      "learning_rate": 0.0001530674846625767,
      "loss": 0.6598,
      "step": 775
    },
    {
      "epoch": 3.669724770642202,
      "grad_norm": 4.634042263031006,
      "learning_rate": 0.00015153374233128835,
      "loss": 0.6755,
      "step": 800
    },
    {
      "epoch": 3.7844036697247705,
      "grad_norm": 9.874078750610352,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.6494,
      "step": 825
    },
    {
      "epoch": 3.8990825688073394,
      "grad_norm": 6.18958854675293,
      "learning_rate": 0.00014846625766871168,
      "loss": 0.6396,
      "step": 850
    },
    {
      "epoch": 4.013761467889908,
      "grad_norm": 5.176502227783203,
      "learning_rate": 0.00014693251533742332,
      "loss": 0.6837,
      "step": 875
    },
    {
      "epoch": 4.128440366972477,
      "grad_norm": 5.866634368896484,
      "learning_rate": 0.000145398773006135,
      "loss": 0.5174,
      "step": 900
    },
    {
      "epoch": 4.243119266055046,
      "grad_norm": 5.940659999847412,
      "learning_rate": 0.00014386503067484663,
      "loss": 0.5614,
      "step": 925
    },
    {
      "epoch": 4.3577981651376145,
      "grad_norm": 2.2365148067474365,
      "learning_rate": 0.00014233128834355828,
      "loss": 0.5525,
      "step": 950
    },
    {
      "epoch": 4.472477064220183,
      "grad_norm": 2.6536717414855957,
      "learning_rate": 0.00014079754601226994,
      "loss": 0.6028,
      "step": 975
    },
    {
      "epoch": 4.587155963302752,
      "grad_norm": 6.802552223205566,
      "learning_rate": 0.00013926380368098159,
      "loss": 0.5658,
      "step": 1000
    },
    {
      "epoch": 4.587155963302752,
      "eval_loss": 0.46536171436309814,
      "eval_runtime": 12.9671,
      "eval_samples_per_second": 16.812,
      "eval_steps_per_second": 2.159,
      "step": 1000
    },
    {
      "epoch": 4.701834862385321,
      "grad_norm": 5.150993347167969,
      "learning_rate": 0.00013773006134969325,
      "loss": 0.5522,
      "step": 1025
    },
    {
      "epoch": 4.81651376146789,
      "grad_norm": 5.858363151550293,
      "learning_rate": 0.0001361963190184049,
      "loss": 0.4839,
      "step": 1050
    },
    {
      "epoch": 4.931192660550459,
      "grad_norm": 3.4022738933563232,
      "learning_rate": 0.00013466257668711656,
      "loss": 0.597,
      "step": 1075
    },
    {
      "epoch": 5.045871559633028,
      "grad_norm": 3.6270384788513184,
      "learning_rate": 0.00013312883435582823,
      "loss": 0.5217,
      "step": 1100
    },
    {
      "epoch": 5.160550458715596,
      "grad_norm": 2.8926446437835693,
      "learning_rate": 0.00013159509202453987,
      "loss": 0.5683,
      "step": 1125
    },
    {
      "epoch": 5.275229357798165,
      "grad_norm": 2.598726511001587,
      "learning_rate": 0.00013006134969325154,
      "loss": 0.4556,
      "step": 1150
    },
    {
      "epoch": 5.389908256880734,
      "grad_norm": 5.851846694946289,
      "learning_rate": 0.00012852760736196318,
      "loss": 0.4656,
      "step": 1175
    },
    {
      "epoch": 5.504587155963303,
      "grad_norm": 1.5751358270645142,
      "learning_rate": 0.00012699386503067485,
      "loss": 0.5022,
      "step": 1200
    },
    {
      "epoch": 5.6192660550458715,
      "grad_norm": 3.152641534805298,
      "learning_rate": 0.00012546012269938652,
      "loss": 0.4727,
      "step": 1225
    },
    {
      "epoch": 5.73394495412844,
      "grad_norm": 2.1421804428100586,
      "learning_rate": 0.00012392638036809816,
      "loss": 0.556,
      "step": 1250
    },
    {
      "epoch": 5.848623853211009,
      "grad_norm": 2.27579665184021,
      "learning_rate": 0.00012239263803680983,
      "loss": 0.5588,
      "step": 1275
    },
    {
      "epoch": 5.963302752293578,
      "grad_norm": 7.381156921386719,
      "learning_rate": 0.00012085889570552147,
      "loss": 0.4679,
      "step": 1300
    },
    {
      "epoch": 6.077981651376147,
      "grad_norm": 1.3406894207000732,
      "learning_rate": 0.00011932515337423313,
      "loss": 0.4235,
      "step": 1325
    },
    {
      "epoch": 6.192660550458716,
      "grad_norm": 3.048557758331299,
      "learning_rate": 0.0001177914110429448,
      "loss": 0.4295,
      "step": 1350
    },
    {
      "epoch": 6.307339449541284,
      "grad_norm": 5.965505123138428,
      "learning_rate": 0.00011625766871165644,
      "loss": 0.4265,
      "step": 1375
    },
    {
      "epoch": 6.422018348623853,
      "grad_norm": 3.253556728363037,
      "learning_rate": 0.0001147239263803681,
      "loss": 0.4953,
      "step": 1400
    },
    {
      "epoch": 6.536697247706422,
      "grad_norm": 4.381213665008545,
      "learning_rate": 0.00011319018404907975,
      "loss": 0.4989,
      "step": 1425
    },
    {
      "epoch": 6.651376146788991,
      "grad_norm": 5.1543803215026855,
      "learning_rate": 0.00011165644171779142,
      "loss": 0.4478,
      "step": 1450
    },
    {
      "epoch": 6.76605504587156,
      "grad_norm": 4.282744407653809,
      "learning_rate": 0.00011012269938650308,
      "loss": 0.5269,
      "step": 1475
    },
    {
      "epoch": 6.8807339449541285,
      "grad_norm": 1.405004858970642,
      "learning_rate": 0.00010858895705521473,
      "loss": 0.4448,
      "step": 1500
    },
    {
      "epoch": 6.995412844036697,
      "grad_norm": 3.463747024536133,
      "learning_rate": 0.0001070552147239264,
      "loss": 0.4897,
      "step": 1525
    },
    {
      "epoch": 7.110091743119266,
      "grad_norm": 2.876891851425171,
      "learning_rate": 0.00010552147239263804,
      "loss": 0.4073,
      "step": 1550
    },
    {
      "epoch": 7.224770642201835,
      "grad_norm": 1.9896137714385986,
      "learning_rate": 0.00010398773006134969,
      "loss": 0.4214,
      "step": 1575
    },
    {
      "epoch": 7.339449541284404,
      "grad_norm": 5.495143890380859,
      "learning_rate": 0.00010245398773006136,
      "loss": 0.4318,
      "step": 1600
    },
    {
      "epoch": 7.454128440366972,
      "grad_norm": 2.7708942890167236,
      "learning_rate": 0.000100920245398773,
      "loss": 0.4374,
      "step": 1625
    },
    {
      "epoch": 7.568807339449541,
      "grad_norm": 2.327313184738159,
      "learning_rate": 9.938650306748467e-05,
      "loss": 0.4393,
      "step": 1650
    },
    {
      "epoch": 7.68348623853211,
      "grad_norm": 2.895890951156616,
      "learning_rate": 9.785276073619632e-05,
      "loss": 0.457,
      "step": 1675
    },
    {
      "epoch": 7.798165137614679,
      "grad_norm": 3.2652461528778076,
      "learning_rate": 9.631901840490798e-05,
      "loss": 0.4833,
      "step": 1700
    },
    {
      "epoch": 7.912844036697248,
      "grad_norm": 2.947260618209839,
      "learning_rate": 9.478527607361963e-05,
      "loss": 0.4142,
      "step": 1725
    },
    {
      "epoch": 8.027522935779816,
      "grad_norm": 1.9098671674728394,
      "learning_rate": 9.325153374233129e-05,
      "loss": 0.4309,
      "step": 1750
    },
    {
      "epoch": 8.142201834862385,
      "grad_norm": 4.037384033203125,
      "learning_rate": 9.171779141104296e-05,
      "loss": 0.4019,
      "step": 1775
    },
    {
      "epoch": 8.256880733944953,
      "grad_norm": 3.7623095512390137,
      "learning_rate": 9.018404907975461e-05,
      "loss": 0.3649,
      "step": 1800
    },
    {
      "epoch": 8.371559633027523,
      "grad_norm": 2.049928903579712,
      "learning_rate": 8.865030674846625e-05,
      "loss": 0.3905,
      "step": 1825
    },
    {
      "epoch": 8.486238532110091,
      "grad_norm": 6.4500017166137695,
      "learning_rate": 8.711656441717791e-05,
      "loss": 0.4493,
      "step": 1850
    },
    {
      "epoch": 8.600917431192661,
      "grad_norm": 1.4894698858261108,
      "learning_rate": 8.558282208588958e-05,
      "loss": 0.4416,
      "step": 1875
    },
    {
      "epoch": 8.715596330275229,
      "grad_norm": 3.064404010772705,
      "learning_rate": 8.404907975460123e-05,
      "loss": 0.395,
      "step": 1900
    },
    {
      "epoch": 8.830275229357799,
      "grad_norm": 4.572951316833496,
      "learning_rate": 8.251533742331289e-05,
      "loss": 0.4606,
      "step": 1925
    },
    {
      "epoch": 8.944954128440367,
      "grad_norm": 3.9111504554748535,
      "learning_rate": 8.098159509202454e-05,
      "loss": 0.4291,
      "step": 1950
    },
    {
      "epoch": 9.059633027522937,
      "grad_norm": 2.4905645847320557,
      "learning_rate": 7.94478527607362e-05,
      "loss": 0.3797,
      "step": 1975
    },
    {
      "epoch": 9.174311926605505,
      "grad_norm": 2.3467962741851807,
      "learning_rate": 7.791411042944787e-05,
      "loss": 0.4023,
      "step": 2000
    },
    {
      "epoch": 9.174311926605505,
      "eval_loss": 0.3534790277481079,
      "eval_runtime": 12.9976,
      "eval_samples_per_second": 16.772,
      "eval_steps_per_second": 2.154,
      "step": 2000
    }
  ],
  "logging_steps": 25,
  "max_steps": 3270,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 500,
  "total_flos": 1448045812162560.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}