{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.174311926605505,
"eval_steps": 1000,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11467889908256881,
"grad_norm": 4.947055339813232,
"learning_rate": 0.000199079754601227,
"loss": 4.2757,
"step": 25
},
{
"epoch": 0.22935779816513763,
"grad_norm": 3.0868945121765137,
"learning_rate": 0.00019754601226993864,
"loss": 2.4215,
"step": 50
},
{
"epoch": 0.3440366972477064,
"grad_norm": 3.1504604816436768,
"learning_rate": 0.0001960122699386503,
"loss": 1.5603,
"step": 75
},
{
"epoch": 0.45871559633027525,
"grad_norm": 5.1320624351501465,
"learning_rate": 0.00019447852760736198,
"loss": 1.5129,
"step": 100
},
{
"epoch": 0.573394495412844,
"grad_norm": 3.3029208183288574,
"learning_rate": 0.00019294478527607362,
"loss": 1.4394,
"step": 125
},
{
"epoch": 0.6880733944954128,
"grad_norm": 4.884591102600098,
"learning_rate": 0.0001914110429447853,
"loss": 1.3357,
"step": 150
},
{
"epoch": 0.8027522935779816,
"grad_norm": 2.960696220397949,
"learning_rate": 0.00018987730061349693,
"loss": 1.3536,
"step": 175
},
{
"epoch": 0.9174311926605505,
"grad_norm": 3.2423105239868164,
"learning_rate": 0.0001883435582822086,
"loss": 1.3502,
"step": 200
},
{
"epoch": 1.0321100917431192,
"grad_norm": 3.236219644546509,
"learning_rate": 0.00018680981595092027,
"loss": 1.2705,
"step": 225
},
{
"epoch": 1.146788990825688,
"grad_norm": 6.417654514312744,
"learning_rate": 0.0001852760736196319,
"loss": 1.1982,
"step": 250
},
{
"epoch": 1.261467889908257,
"grad_norm": 2.224224805831909,
"learning_rate": 0.00018374233128834358,
"loss": 0.9572,
"step": 275
},
{
"epoch": 1.3761467889908257,
"grad_norm": 2.035285711288452,
"learning_rate": 0.00018220858895705522,
"loss": 1.1071,
"step": 300
},
{
"epoch": 1.4908256880733946,
"grad_norm": 3.7886247634887695,
"learning_rate": 0.0001806748466257669,
"loss": 1.0682,
"step": 325
},
{
"epoch": 1.6055045871559632,
"grad_norm": 5.813726902008057,
"learning_rate": 0.00017914110429447856,
"loss": 0.9436,
"step": 350
},
{
"epoch": 1.7201834862385321,
"grad_norm": 7.5725178718566895,
"learning_rate": 0.0001776073619631902,
"loss": 1.0397,
"step": 375
},
{
"epoch": 1.834862385321101,
"grad_norm": 3.6317062377929688,
"learning_rate": 0.00017607361963190187,
"loss": 1.1421,
"step": 400
},
{
"epoch": 1.9495412844036697,
"grad_norm": 2.53063702583313,
"learning_rate": 0.0001745398773006135,
"loss": 1.1339,
"step": 425
},
{
"epoch": 2.0642201834862384,
"grad_norm": 6.015753269195557,
"learning_rate": 0.00017300613496932518,
"loss": 0.8271,
"step": 450
},
{
"epoch": 2.1788990825688073,
"grad_norm": 2.1599044799804688,
"learning_rate": 0.00017147239263803682,
"loss": 0.8183,
"step": 475
},
{
"epoch": 2.293577981651376,
"grad_norm": 3.4313087463378906,
"learning_rate": 0.00016993865030674846,
"loss": 0.7794,
"step": 500
},
{
"epoch": 2.408256880733945,
"grad_norm": 5.856240272521973,
"learning_rate": 0.00016840490797546013,
"loss": 0.8029,
"step": 525
},
{
"epoch": 2.522935779816514,
"grad_norm": 3.658007860183716,
"learning_rate": 0.00016687116564417177,
"loss": 0.7877,
"step": 550
},
{
"epoch": 2.6376146788990824,
"grad_norm": 4.434767723083496,
"learning_rate": 0.00016533742331288344,
"loss": 0.7313,
"step": 575
},
{
"epoch": 2.7522935779816513,
"grad_norm": 5.971595287322998,
"learning_rate": 0.0001638036809815951,
"loss": 0.6757,
"step": 600
},
{
"epoch": 2.86697247706422,
"grad_norm": 3.449643850326538,
"learning_rate": 0.00016226993865030675,
"loss": 0.8652,
"step": 625
},
{
"epoch": 2.981651376146789,
"grad_norm": 3.473947763442993,
"learning_rate": 0.00016073619631901842,
"loss": 0.8634,
"step": 650
},
{
"epoch": 3.096330275229358,
"grad_norm": 2.539017677307129,
"learning_rate": 0.00015920245398773006,
"loss": 0.6115,
"step": 675
},
{
"epoch": 3.2110091743119265,
"grad_norm": 1.9025028944015503,
"learning_rate": 0.00015766871165644173,
"loss": 0.6218,
"step": 700
},
{
"epoch": 3.3256880733944953,
"grad_norm": 5.206181526184082,
"learning_rate": 0.0001561349693251534,
"loss": 0.6527,
"step": 725
},
{
"epoch": 3.4403669724770642,
"grad_norm": 7.43388032913208,
"learning_rate": 0.00015460122699386504,
"loss": 0.5914,
"step": 750
},
{
"epoch": 3.555045871559633,
"grad_norm": 2.8263156414031982,
"learning_rate": 0.0001530674846625767,
"loss": 0.6598,
"step": 775
},
{
"epoch": 3.669724770642202,
"grad_norm": 4.634042263031006,
"learning_rate": 0.00015153374233128835,
"loss": 0.6755,
"step": 800
},
{
"epoch": 3.7844036697247705,
"grad_norm": 9.874078750610352,
"learning_rate": 0.00015000000000000001,
"loss": 0.6494,
"step": 825
},
{
"epoch": 3.8990825688073394,
"grad_norm": 6.18958854675293,
"learning_rate": 0.00014846625766871168,
"loss": 0.6396,
"step": 850
},
{
"epoch": 4.013761467889908,
"grad_norm": 5.176502227783203,
"learning_rate": 0.00014693251533742332,
"loss": 0.6837,
"step": 875
},
{
"epoch": 4.128440366972477,
"grad_norm": 5.866634368896484,
"learning_rate": 0.000145398773006135,
"loss": 0.5174,
"step": 900
},
{
"epoch": 4.243119266055046,
"grad_norm": 5.940659999847412,
"learning_rate": 0.00014386503067484663,
"loss": 0.5614,
"step": 925
},
{
"epoch": 4.3577981651376145,
"grad_norm": 2.2365148067474365,
"learning_rate": 0.00014233128834355828,
"loss": 0.5525,
"step": 950
},
{
"epoch": 4.472477064220183,
"grad_norm": 2.6536717414855957,
"learning_rate": 0.00014079754601226994,
"loss": 0.6028,
"step": 975
},
{
"epoch": 4.587155963302752,
"grad_norm": 6.802552223205566,
"learning_rate": 0.00013926380368098159,
"loss": 0.5658,
"step": 1000
},
{
"epoch": 4.587155963302752,
"eval_loss": 0.46536171436309814,
"eval_runtime": 12.9671,
"eval_samples_per_second": 16.812,
"eval_steps_per_second": 2.159,
"step": 1000
},
{
"epoch": 4.701834862385321,
"grad_norm": 5.150993347167969,
"learning_rate": 0.00013773006134969325,
"loss": 0.5522,
"step": 1025
},
{
"epoch": 4.81651376146789,
"grad_norm": 5.858363151550293,
"learning_rate": 0.0001361963190184049,
"loss": 0.4839,
"step": 1050
},
{
"epoch": 4.931192660550459,
"grad_norm": 3.4022738933563232,
"learning_rate": 0.00013466257668711656,
"loss": 0.597,
"step": 1075
},
{
"epoch": 5.045871559633028,
"grad_norm": 3.6270384788513184,
"learning_rate": 0.00013312883435582823,
"loss": 0.5217,
"step": 1100
},
{
"epoch": 5.160550458715596,
"grad_norm": 2.8926446437835693,
"learning_rate": 0.00013159509202453987,
"loss": 0.5683,
"step": 1125
},
{
"epoch": 5.275229357798165,
"grad_norm": 2.598726511001587,
"learning_rate": 0.00013006134969325154,
"loss": 0.4556,
"step": 1150
},
{
"epoch": 5.389908256880734,
"grad_norm": 5.851846694946289,
"learning_rate": 0.00012852760736196318,
"loss": 0.4656,
"step": 1175
},
{
"epoch": 5.504587155963303,
"grad_norm": 1.5751358270645142,
"learning_rate": 0.00012699386503067485,
"loss": 0.5022,
"step": 1200
},
{
"epoch": 5.6192660550458715,
"grad_norm": 3.152641534805298,
"learning_rate": 0.00012546012269938652,
"loss": 0.4727,
"step": 1225
},
{
"epoch": 5.73394495412844,
"grad_norm": 2.1421804428100586,
"learning_rate": 0.00012392638036809816,
"loss": 0.556,
"step": 1250
},
{
"epoch": 5.848623853211009,
"grad_norm": 2.27579665184021,
"learning_rate": 0.00012239263803680983,
"loss": 0.5588,
"step": 1275
},
{
"epoch": 5.963302752293578,
"grad_norm": 7.381156921386719,
"learning_rate": 0.00012085889570552147,
"loss": 0.4679,
"step": 1300
},
{
"epoch": 6.077981651376147,
"grad_norm": 1.3406894207000732,
"learning_rate": 0.00011932515337423313,
"loss": 0.4235,
"step": 1325
},
{
"epoch": 6.192660550458716,
"grad_norm": 3.048557758331299,
"learning_rate": 0.0001177914110429448,
"loss": 0.4295,
"step": 1350
},
{
"epoch": 6.307339449541284,
"grad_norm": 5.965505123138428,
"learning_rate": 0.00011625766871165644,
"loss": 0.4265,
"step": 1375
},
{
"epoch": 6.422018348623853,
"grad_norm": 3.253556728363037,
"learning_rate": 0.0001147239263803681,
"loss": 0.4953,
"step": 1400
},
{
"epoch": 6.536697247706422,
"grad_norm": 4.381213665008545,
"learning_rate": 0.00011319018404907975,
"loss": 0.4989,
"step": 1425
},
{
"epoch": 6.651376146788991,
"grad_norm": 5.1543803215026855,
"learning_rate": 0.00011165644171779142,
"loss": 0.4478,
"step": 1450
},
{
"epoch": 6.76605504587156,
"grad_norm": 4.282744407653809,
"learning_rate": 0.00011012269938650308,
"loss": 0.5269,
"step": 1475
},
{
"epoch": 6.8807339449541285,
"grad_norm": 1.405004858970642,
"learning_rate": 0.00010858895705521473,
"loss": 0.4448,
"step": 1500
},
{
"epoch": 6.995412844036697,
"grad_norm": 3.463747024536133,
"learning_rate": 0.0001070552147239264,
"loss": 0.4897,
"step": 1525
},
{
"epoch": 7.110091743119266,
"grad_norm": 2.876891851425171,
"learning_rate": 0.00010552147239263804,
"loss": 0.4073,
"step": 1550
},
{
"epoch": 7.224770642201835,
"grad_norm": 1.9896137714385986,
"learning_rate": 0.00010398773006134969,
"loss": 0.4214,
"step": 1575
},
{
"epoch": 7.339449541284404,
"grad_norm": 5.495143890380859,
"learning_rate": 0.00010245398773006136,
"loss": 0.4318,
"step": 1600
},
{
"epoch": 7.454128440366972,
"grad_norm": 2.7708942890167236,
"learning_rate": 0.000100920245398773,
"loss": 0.4374,
"step": 1625
},
{
"epoch": 7.568807339449541,
"grad_norm": 2.327313184738159,
"learning_rate": 9.938650306748467e-05,
"loss": 0.4393,
"step": 1650
},
{
"epoch": 7.68348623853211,
"grad_norm": 2.895890951156616,
"learning_rate": 9.785276073619632e-05,
"loss": 0.457,
"step": 1675
},
{
"epoch": 7.798165137614679,
"grad_norm": 3.2652461528778076,
"learning_rate": 9.631901840490798e-05,
"loss": 0.4833,
"step": 1700
},
{
"epoch": 7.912844036697248,
"grad_norm": 2.947260618209839,
"learning_rate": 9.478527607361963e-05,
"loss": 0.4142,
"step": 1725
},
{
"epoch": 8.027522935779816,
"grad_norm": 1.9098671674728394,
"learning_rate": 9.325153374233129e-05,
"loss": 0.4309,
"step": 1750
},
{
"epoch": 8.142201834862385,
"grad_norm": 4.037384033203125,
"learning_rate": 9.171779141104296e-05,
"loss": 0.4019,
"step": 1775
},
{
"epoch": 8.256880733944953,
"grad_norm": 3.7623095512390137,
"learning_rate": 9.018404907975461e-05,
"loss": 0.3649,
"step": 1800
},
{
"epoch": 8.371559633027523,
"grad_norm": 2.049928903579712,
"learning_rate": 8.865030674846625e-05,
"loss": 0.3905,
"step": 1825
},
{
"epoch": 8.486238532110091,
"grad_norm": 6.4500017166137695,
"learning_rate": 8.711656441717791e-05,
"loss": 0.4493,
"step": 1850
},
{
"epoch": 8.600917431192661,
"grad_norm": 1.4894698858261108,
"learning_rate": 8.558282208588958e-05,
"loss": 0.4416,
"step": 1875
},
{
"epoch": 8.715596330275229,
"grad_norm": 3.064404010772705,
"learning_rate": 8.404907975460123e-05,
"loss": 0.395,
"step": 1900
},
{
"epoch": 8.830275229357799,
"grad_norm": 4.572951316833496,
"learning_rate": 8.251533742331289e-05,
"loss": 0.4606,
"step": 1925
},
{
"epoch": 8.944954128440367,
"grad_norm": 3.9111504554748535,
"learning_rate": 8.098159509202454e-05,
"loss": 0.4291,
"step": 1950
},
{
"epoch": 9.059633027522937,
"grad_norm": 2.4905645847320557,
"learning_rate": 7.94478527607362e-05,
"loss": 0.3797,
"step": 1975
},
{
"epoch": 9.174311926605505,
"grad_norm": 2.3467962741851807,
"learning_rate": 7.791411042944787e-05,
"loss": 0.4023,
"step": 2000
},
{
"epoch": 9.174311926605505,
"eval_loss": 0.3534790277481079,
"eval_runtime": 12.9976,
"eval_samples_per_second": 16.772,
"eval_steps_per_second": 2.154,
"step": 2000
}
],
"logging_steps": 25,
"max_steps": 3270,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 500,
"total_flos": 1448045812162560.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}