Training in progress, step 280, checkpoint
fab6de2
{
"best_metric": 1.151181697845459,
"best_model_checkpoint": "miner_id_besimray/checkpoint-80",
"epoch": 5.894736842105263,
"eval_steps": 20,
"global_step": 280,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.021052631578947368,
"grad_norm": 0.25488582253456116,
"learning_rate": 2e-05,
"loss": 1.2983,
"step": 1
},
{
"epoch": 0.021052631578947368,
"eval_loss": 1.2585705518722534,
"eval_runtime": 14.9372,
"eval_samples_per_second": 6.695,
"eval_steps_per_second": 0.669,
"step": 1
},
{
"epoch": 0.042105263157894736,
"grad_norm": 0.2551250755786896,
"learning_rate": 4e-05,
"loss": 1.4576,
"step": 2
},
{
"epoch": 0.06315789473684211,
"grad_norm": 0.22965364158153534,
"learning_rate": 6e-05,
"loss": 1.2758,
"step": 3
},
{
"epoch": 0.08421052631578947,
"grad_norm": 0.25596627593040466,
"learning_rate": 8e-05,
"loss": 1.4291,
"step": 4
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.21169574558734894,
"learning_rate": 0.0001,
"loss": 1.2482,
"step": 5
},
{
"epoch": 0.12631578947368421,
"grad_norm": 0.2537442147731781,
"learning_rate": 0.00012,
"loss": 1.4111,
"step": 6
},
{
"epoch": 0.14736842105263157,
"grad_norm": 0.22837992012500763,
"learning_rate": 0.00014,
"loss": 1.338,
"step": 7
},
{
"epoch": 0.16842105263157894,
"grad_norm": 0.16500858962535858,
"learning_rate": 0.00016,
"loss": 1.2243,
"step": 8
},
{
"epoch": 0.18947368421052632,
"grad_norm": 0.2048870176076889,
"learning_rate": 0.00018,
"loss": 1.2117,
"step": 9
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.22877398133277893,
"learning_rate": 0.0002,
"loss": 1.3922,
"step": 10
},
{
"epoch": 0.23157894736842105,
"grad_norm": 0.25871542096138,
"learning_rate": 0.0001999999780359183,
"loss": 1.2073,
"step": 11
},
{
"epoch": 0.25263157894736843,
"grad_norm": 0.18334853649139404,
"learning_rate": 0.00019999991214368284,
"loss": 1.2268,
"step": 12
},
{
"epoch": 0.2736842105263158,
"grad_norm": 0.1385767012834549,
"learning_rate": 0.0001999998023233226,
"loss": 1.0728,
"step": 13
},
{
"epoch": 0.29473684210526313,
"grad_norm": 0.12759718298912048,
"learning_rate": 0.0001999996485748858,
"loss": 1.0786,
"step": 14
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.13563676178455353,
"learning_rate": 0.00019999945089843994,
"loss": 1.0021,
"step": 15
},
{
"epoch": 0.3368421052631579,
"grad_norm": 0.12118455767631531,
"learning_rate": 0.0001999992092940719,
"loss": 1.2168,
"step": 16
},
{
"epoch": 0.35789473684210527,
"grad_norm": 0.17731498181819916,
"learning_rate": 0.00019999892376188782,
"loss": 1.2883,
"step": 17
},
{
"epoch": 0.37894736842105264,
"grad_norm": 0.1495707631111145,
"learning_rate": 0.0001999985943020131,
"loss": 1.2307,
"step": 18
},
{
"epoch": 0.4,
"grad_norm": 0.14801433682441711,
"learning_rate": 0.00019999822091459248,
"loss": 1.2703,
"step": 19
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.15497420728206635,
"learning_rate": 0.00019999780359979,
"loss": 1.3601,
"step": 20
},
{
"epoch": 0.42105263157894735,
"eval_loss": 1.1756575107574463,
"eval_runtime": 15.1128,
"eval_samples_per_second": 6.617,
"eval_steps_per_second": 0.662,
"step": 20
},
{
"epoch": 0.4421052631578947,
"grad_norm": 0.154701828956604,
"learning_rate": 0.00019999734235778894,
"loss": 1.0972,
"step": 21
},
{
"epoch": 0.4631578947368421,
"grad_norm": 0.12257974594831467,
"learning_rate": 0.00019999683718879195,
"loss": 1.1799,
"step": 22
},
{
"epoch": 0.4842105263157895,
"grad_norm": 0.14035549759864807,
"learning_rate": 0.0001999962880930209,
"loss": 1.2342,
"step": 23
},
{
"epoch": 0.5052631578947369,
"grad_norm": 0.1992308348417282,
"learning_rate": 0.00019999569507071706,
"loss": 1.2253,
"step": 24
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.1479017287492752,
"learning_rate": 0.00019999505812214085,
"loss": 1.2039,
"step": 25
},
{
"epoch": 0.5473684210526316,
"grad_norm": 0.16983704268932343,
"learning_rate": 0.00019999437724757218,
"loss": 1.2489,
"step": 26
},
{
"epoch": 0.5684210526315789,
"grad_norm": 0.1630581021308899,
"learning_rate": 0.00019999365244731,
"loss": 1.2576,
"step": 27
},
{
"epoch": 0.5894736842105263,
"grad_norm": 0.15563088655471802,
"learning_rate": 0.00019999288372167287,
"loss": 1.231,
"step": 28
},
{
"epoch": 0.6105263157894737,
"grad_norm": 0.15619848668575287,
"learning_rate": 0.00019999207107099834,
"loss": 1.3292,
"step": 29
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.1286384016275406,
"learning_rate": 0.00019999121449564347,
"loss": 1.1393,
"step": 30
},
{
"epoch": 0.6526315789473685,
"grad_norm": 0.13140268623828888,
"learning_rate": 0.0001999903139959845,
"loss": 1.0837,
"step": 31
},
{
"epoch": 0.6736842105263158,
"grad_norm": 0.13603107631206512,
"learning_rate": 0.000199989369572417,
"loss": 1.1516,
"step": 32
},
{
"epoch": 0.6947368421052632,
"grad_norm": 0.1404484510421753,
"learning_rate": 0.00019998838122535585,
"loss": 1.235,
"step": 33
},
{
"epoch": 0.7157894736842105,
"grad_norm": 0.18797928094863892,
"learning_rate": 0.00019998734895523525,
"loss": 1.1864,
"step": 34
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.1484900712966919,
"learning_rate": 0.00019998627276250858,
"loss": 1.1262,
"step": 35
},
{
"epoch": 0.7578947368421053,
"grad_norm": 0.14105963706970215,
"learning_rate": 0.0001999851526476487,
"loss": 1.238,
"step": 36
},
{
"epoch": 0.7789473684210526,
"grad_norm": 0.1366693079471588,
"learning_rate": 0.00019998398861114752,
"loss": 1.2757,
"step": 37
},
{
"epoch": 0.8,
"grad_norm": 0.16491512954235077,
"learning_rate": 0.00019998278065351646,
"loss": 1.2666,
"step": 38
},
{
"epoch": 0.8210526315789474,
"grad_norm": 0.13820528984069824,
"learning_rate": 0.0001999815287752862,
"loss": 1.1497,
"step": 39
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.15959441661834717,
"learning_rate": 0.00019998023297700658,
"loss": 1.2034,
"step": 40
},
{
"epoch": 0.8421052631578947,
"eval_loss": 1.1566897630691528,
"eval_runtime": 15.1159,
"eval_samples_per_second": 6.616,
"eval_steps_per_second": 0.662,
"step": 40
},
{
"epoch": 0.8631578947368421,
"grad_norm": 0.18244534730911255,
"learning_rate": 0.00019997889325924683,
"loss": 1.2506,
"step": 41
},
{
"epoch": 0.8842105263157894,
"grad_norm": 0.13269871473312378,
"learning_rate": 0.0001999775096225955,
"loss": 1.1754,
"step": 42
},
{
"epoch": 0.9052631578947369,
"grad_norm": 0.14465025067329407,
"learning_rate": 0.00019997608206766038,
"loss": 1.1623,
"step": 43
},
{
"epoch": 0.9263157894736842,
"grad_norm": 0.14105112850666046,
"learning_rate": 0.00019997461059506857,
"loss": 1.15,
"step": 44
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.14860253036022186,
"learning_rate": 0.00019997309520546647,
"loss": 1.2272,
"step": 45
},
{
"epoch": 0.968421052631579,
"grad_norm": 0.14539220929145813,
"learning_rate": 0.00019997153589951973,
"loss": 1.1894,
"step": 46
},
{
"epoch": 0.9894736842105263,
"grad_norm": 0.1519959717988968,
"learning_rate": 0.00019996993267791337,
"loss": 1.0394,
"step": 47
},
{
"epoch": 1.0105263157894737,
"grad_norm": 0.14171597361564636,
"learning_rate": 0.00019996828554135162,
"loss": 1.3091,
"step": 48
},
{
"epoch": 1.0315789473684212,
"grad_norm": 0.1611461490392685,
"learning_rate": 0.0001999665944905581,
"loss": 1.1096,
"step": 49
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.18341802060604095,
"learning_rate": 0.00019996485952627552,
"loss": 1.1828,
"step": 50
},
{
"epoch": 1.0736842105263158,
"grad_norm": 0.15702421963214874,
"learning_rate": 0.00019996308064926615,
"loss": 1.1569,
"step": 51
},
{
"epoch": 1.0947368421052632,
"grad_norm": 0.16117393970489502,
"learning_rate": 0.00019996125786031138,
"loss": 1.1337,
"step": 52
},
{
"epoch": 1.1157894736842104,
"grad_norm": 0.16280895471572876,
"learning_rate": 0.00019995939116021193,
"loss": 1.2093,
"step": 53
},
{
"epoch": 1.1368421052631579,
"grad_norm": 0.16143617033958435,
"learning_rate": 0.00019995748054978777,
"loss": 1.1394,
"step": 54
},
{
"epoch": 1.1578947368421053,
"grad_norm": 0.1600114107131958,
"learning_rate": 0.00019995552602987827,
"loss": 1.0491,
"step": 55
},
{
"epoch": 1.1789473684210527,
"grad_norm": 0.148758202791214,
"learning_rate": 0.00019995352760134193,
"loss": 1.0947,
"step": 56
},
{
"epoch": 1.2,
"grad_norm": 0.18547968566417694,
"learning_rate": 0.00019995148526505665,
"loss": 1.0401,
"step": 57
},
{
"epoch": 1.2210526315789474,
"grad_norm": 0.16114592552185059,
"learning_rate": 0.00019994939902191964,
"loss": 1.0556,
"step": 58
},
{
"epoch": 1.2421052631578948,
"grad_norm": 0.21020422875881195,
"learning_rate": 0.0001999472688728473,
"loss": 1.1656,
"step": 59
},
{
"epoch": 1.263157894736842,
"grad_norm": 0.16495747864246368,
"learning_rate": 0.00019994509481877537,
"loss": 1.1302,
"step": 60
},
{
"epoch": 1.263157894736842,
"eval_loss": 1.1534416675567627,
"eval_runtime": 15.0862,
"eval_samples_per_second": 6.629,
"eval_steps_per_second": 0.663,
"step": 60
},
{
"epoch": 1.2842105263157895,
"grad_norm": 0.18857495486736298,
"learning_rate": 0.00019994287686065886,
"loss": 1.0208,
"step": 61
},
{
"epoch": 1.305263157894737,
"grad_norm": 0.15040083229541779,
"learning_rate": 0.00019994061499947212,
"loss": 1.0189,
"step": 62
},
{
"epoch": 1.3263157894736843,
"grad_norm": 0.18986788392066956,
"learning_rate": 0.00019993830923620872,
"loss": 1.0854,
"step": 63
},
{
"epoch": 1.3473684210526315,
"grad_norm": 0.18154074251651764,
"learning_rate": 0.00019993595957188152,
"loss": 1.1744,
"step": 64
},
{
"epoch": 1.368421052631579,
"grad_norm": 0.1574270874261856,
"learning_rate": 0.00019993356600752276,
"loss": 1.1356,
"step": 65
},
{
"epoch": 1.3894736842105262,
"grad_norm": 0.19574891030788422,
"learning_rate": 0.0001999311285441838,
"loss": 1.0907,
"step": 66
},
{
"epoch": 1.4105263157894736,
"grad_norm": 0.20714887976646423,
"learning_rate": 0.0001999286471829354,
"loss": 1.1834,
"step": 67
},
{
"epoch": 1.431578947368421,
"grad_norm": 0.21967269480228424,
"learning_rate": 0.0001999261219248676,
"loss": 1.2089,
"step": 68
},
{
"epoch": 1.4526315789473685,
"grad_norm": 0.21682047843933105,
"learning_rate": 0.00019992355277108966,
"loss": 1.1635,
"step": 69
},
{
"epoch": 1.4736842105263157,
"grad_norm": 0.17350642383098602,
"learning_rate": 0.00019992093972273018,
"loss": 1.015,
"step": 70
},
{
"epoch": 1.4947368421052631,
"grad_norm": 0.2019474357366562,
"learning_rate": 0.00019991828278093706,
"loss": 1.2032,
"step": 71
},
{
"epoch": 1.5157894736842106,
"grad_norm": 0.2160518765449524,
"learning_rate": 0.0001999155819468774,
"loss": 1.0798,
"step": 72
},
{
"epoch": 1.5368421052631578,
"grad_norm": 0.17687132954597473,
"learning_rate": 0.00019991283722173764,
"loss": 1.1356,
"step": 73
},
{
"epoch": 1.5578947368421052,
"grad_norm": 0.16519969701766968,
"learning_rate": 0.0001999100486067235,
"loss": 1.0129,
"step": 74
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.15248972177505493,
"learning_rate": 0.00019990721610305996,
"loss": 1.0204,
"step": 75
},
{
"epoch": 1.6,
"grad_norm": 0.1959000527858734,
"learning_rate": 0.00019990433971199125,
"loss": 1.026,
"step": 76
},
{
"epoch": 1.6210526315789475,
"grad_norm": 0.20230266451835632,
"learning_rate": 0.00019990141943478098,
"loss": 1.0012,
"step": 77
},
{
"epoch": 1.6421052631578947,
"grad_norm": 0.18247124552726746,
"learning_rate": 0.00019989845527271195,
"loss": 1.1552,
"step": 78
},
{
"epoch": 1.663157894736842,
"grad_norm": 0.21317291259765625,
"learning_rate": 0.00019989544722708618,
"loss": 1.1461,
"step": 79
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.2219182848930359,
"learning_rate": 0.0001998923952992252,
"loss": 1.0958,
"step": 80
},
{
"epoch": 1.6842105263157894,
"eval_loss": 1.151181697845459,
"eval_runtime": 15.09,
"eval_samples_per_second": 6.627,
"eval_steps_per_second": 0.663,
"step": 80
},
{
"epoch": 1.7052631578947368,
"grad_norm": 0.20883385837078094,
"learning_rate": 0.00019988929949046958,
"loss": 1.1361,
"step": 81
},
{
"epoch": 1.7263157894736842,
"grad_norm": 0.23448118567466736,
"learning_rate": 0.00019988615980217925,
"loss": 1.1583,
"step": 82
},
{
"epoch": 1.7473684210526317,
"grad_norm": 0.19605010747909546,
"learning_rate": 0.00019988297623573344,
"loss": 1.2165,
"step": 83
},
{
"epoch": 1.768421052631579,
"grad_norm": 0.2519710063934326,
"learning_rate": 0.0001998797487925306,
"loss": 1.188,
"step": 84
},
{
"epoch": 1.7894736842105263,
"grad_norm": 0.2063812017440796,
"learning_rate": 0.00019987647747398852,
"loss": 1.2146,
"step": 85
},
{
"epoch": 1.8105263157894735,
"grad_norm": 0.18549425899982452,
"learning_rate": 0.00019987316228154423,
"loss": 1.1634,
"step": 86
},
{
"epoch": 1.831578947368421,
"grad_norm": 0.22328101098537445,
"learning_rate": 0.00019986980321665403,
"loss": 1.4089,
"step": 87
},
{
"epoch": 1.8526315789473684,
"grad_norm": 0.23256567120552063,
"learning_rate": 0.00019986640028079347,
"loss": 0.9991,
"step": 88
},
{
"epoch": 1.8736842105263158,
"grad_norm": 0.23284031450748444,
"learning_rate": 0.0001998629534754574,
"loss": 1.1857,
"step": 89
},
{
"epoch": 1.8947368421052633,
"grad_norm": 0.25720322132110596,
"learning_rate": 0.00019985946280215994,
"loss": 1.1692,
"step": 90
},
{
"epoch": 1.9157894736842105,
"grad_norm": 0.25529375672340393,
"learning_rate": 0.00019985592826243453,
"loss": 1.1913,
"step": 91
},
{
"epoch": 1.936842105263158,
"grad_norm": 0.21553778648376465,
"learning_rate": 0.0001998523498578338,
"loss": 1.0827,
"step": 92
},
{
"epoch": 1.9578947368421051,
"grad_norm": 0.22560931742191315,
"learning_rate": 0.00019984872758992963,
"loss": 1.2614,
"step": 93
},
{
"epoch": 1.9789473684210526,
"grad_norm": 0.21169520914554596,
"learning_rate": 0.00019984506146031325,
"loss": 1.1164,
"step": 94
},
{
"epoch": 2.0,
"grad_norm": 0.22062988579273224,
"learning_rate": 0.00019984135147059514,
"loss": 1.1638,
"step": 95
},
{
"epoch": 2.0210526315789474,
"grad_norm": 0.21019504964351654,
"learning_rate": 0.00019983759762240503,
"loss": 1.0949,
"step": 96
},
{
"epoch": 2.042105263157895,
"grad_norm": 0.19634267687797546,
"learning_rate": 0.00019983379991739188,
"loss": 1.0106,
"step": 97
},
{
"epoch": 2.0631578947368423,
"grad_norm": 0.24126608669757843,
"learning_rate": 0.00019982995835722398,
"loss": 0.9002,
"step": 98
},
{
"epoch": 2.0842105263157893,
"grad_norm": 0.22683700919151306,
"learning_rate": 0.0001998260729435889,
"loss": 1.0105,
"step": 99
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.27286496758461,
"learning_rate": 0.00019982214367819328,
"loss": 1.0285,
"step": 100
},
{
"epoch": 2.1052631578947367,
"eval_loss": 1.1652569770812988,
"eval_runtime": 15.0915,
"eval_samples_per_second": 6.626,
"eval_steps_per_second": 0.663,
"step": 100
},
{
"epoch": 2.126315789473684,
"grad_norm": 0.24511495232582092,
"learning_rate": 0.00019981817056276337,
"loss": 1.0178,
"step": 101
},
{
"epoch": 2.1473684210526316,
"grad_norm": 0.23129835724830627,
"learning_rate": 0.00019981415359904435,
"loss": 0.9067,
"step": 102
},
{
"epoch": 2.168421052631579,
"grad_norm": 0.25192540884017944,
"learning_rate": 0.00019981009278880087,
"loss": 1.0198,
"step": 103
},
{
"epoch": 2.1894736842105265,
"grad_norm": 0.28074589371681213,
"learning_rate": 0.0001998059881338167,
"loss": 0.9983,
"step": 104
},
{
"epoch": 2.2105263157894735,
"grad_norm": 0.22504180669784546,
"learning_rate": 0.00019980183963589504,
"loss": 0.9203,
"step": 105
},
{
"epoch": 2.231578947368421,
"grad_norm": 0.25999030470848083,
"learning_rate": 0.00019979764729685813,
"loss": 1.1073,
"step": 106
},
{
"epoch": 2.2526315789473683,
"grad_norm": 0.2536907196044922,
"learning_rate": 0.00019979341111854768,
"loss": 1.1423,
"step": 107
},
{
"epoch": 2.2736842105263158,
"grad_norm": 0.24934524297714233,
"learning_rate": 0.0001997891311028245,
"loss": 0.854,
"step": 108
},
{
"epoch": 2.294736842105263,
"grad_norm": 0.24719196557998657,
"learning_rate": 0.0001997848072515688,
"loss": 1.0318,
"step": 109
},
{
"epoch": 2.3157894736842106,
"grad_norm": 0.2829945385456085,
"learning_rate": 0.0001997804395666799,
"loss": 0.9709,
"step": 110
},
{
"epoch": 2.336842105263158,
"grad_norm": 0.29261600971221924,
"learning_rate": 0.00019977602805007648,
"loss": 0.942,
"step": 111
},
{
"epoch": 2.3578947368421055,
"grad_norm": 0.26871809363365173,
"learning_rate": 0.0001997715727036964,
"loss": 1.1248,
"step": 112
},
{
"epoch": 2.3789473684210525,
"grad_norm": 0.2754141390323639,
"learning_rate": 0.00019976707352949684,
"loss": 1.1282,
"step": 113
},
{
"epoch": 2.4,
"grad_norm": 0.26692995429039,
"learning_rate": 0.00019976253052945425,
"loss": 0.8413,
"step": 114
},
{
"epoch": 2.4210526315789473,
"grad_norm": 0.30057746171951294,
"learning_rate": 0.00019975794370556417,
"loss": 1.118,
"step": 115
},
{
"epoch": 2.442105263157895,
"grad_norm": 0.2569466531276703,
"learning_rate": 0.0001997533130598416,
"loss": 1.0404,
"step": 116
},
{
"epoch": 2.463157894736842,
"grad_norm": 0.30639445781707764,
"learning_rate": 0.00019974863859432068,
"loss": 1.0001,
"step": 117
},
{
"epoch": 2.4842105263157896,
"grad_norm": 0.26039522886276245,
"learning_rate": 0.00019974392031105482,
"loss": 1.1001,
"step": 118
},
{
"epoch": 2.5052631578947366,
"grad_norm": 0.31316864490509033,
"learning_rate": 0.00019973915821211666,
"loss": 0.9833,
"step": 119
},
{
"epoch": 2.526315789473684,
"grad_norm": 0.28356632590293884,
"learning_rate": 0.00019973435229959813,
"loss": 1.1265,
"step": 120
},
{
"epoch": 2.526315789473684,
"eval_loss": 1.1785136461257935,
"eval_runtime": 15.0968,
"eval_samples_per_second": 6.624,
"eval_steps_per_second": 0.662,
"step": 120
},
{
"epoch": 2.5473684210526315,
"grad_norm": 0.3286098837852478,
"learning_rate": 0.0001997295025756103,
"loss": 0.9399,
"step": 121
},
{
"epoch": 2.568421052631579,
"grad_norm": 0.29396986961364746,
"learning_rate": 0.00019972460904228365,
"loss": 0.9961,
"step": 122
},
{
"epoch": 2.5894736842105264,
"grad_norm": 0.31531810760498047,
"learning_rate": 0.0001997196717017678,
"loss": 1.0899,
"step": 123
},
{
"epoch": 2.610526315789474,
"grad_norm": 0.2522255480289459,
"learning_rate": 0.00019971469055623162,
"loss": 0.94,
"step": 124
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.2962912917137146,
"learning_rate": 0.00019970966560786324,
"loss": 0.976,
"step": 125
},
{
"epoch": 2.6526315789473687,
"grad_norm": 0.2782577872276306,
"learning_rate": 0.00019970459685887,
"loss": 0.985,
"step": 126
},
{
"epoch": 2.6736842105263157,
"grad_norm": 0.31023719906806946,
"learning_rate": 0.00019969948431147858,
"loss": 0.9228,
"step": 127
},
{
"epoch": 2.694736842105263,
"grad_norm": 0.35043251514434814,
"learning_rate": 0.00019969432796793478,
"loss": 1.0626,
"step": 128
},
{
"epoch": 2.7157894736842105,
"grad_norm": 0.28269392251968384,
"learning_rate": 0.00019968912783050366,
"loss": 0.9771,
"step": 129
},
{
"epoch": 2.736842105263158,
"grad_norm": 0.3904775381088257,
"learning_rate": 0.0001996838839014696,
"loss": 1.0986,
"step": 130
},
{
"epoch": 2.7578947368421054,
"grad_norm": 0.3603789806365967,
"learning_rate": 0.00019967859618313612,
"loss": 0.8981,
"step": 131
},
{
"epoch": 2.7789473684210524,
"grad_norm": 0.3876775801181793,
"learning_rate": 0.00019967326467782605,
"loss": 0.999,
"step": 132
},
{
"epoch": 2.8,
"grad_norm": 0.33544307947158813,
"learning_rate": 0.0001996678893878814,
"loss": 0.9868,
"step": 133
},
{
"epoch": 2.8210526315789473,
"grad_norm": 0.27033519744873047,
"learning_rate": 0.00019966247031566345,
"loss": 0.969,
"step": 134
},
{
"epoch": 2.8421052631578947,
"grad_norm": 0.29576611518859863,
"learning_rate": 0.0001996570074635527,
"loss": 1.1384,
"step": 135
},
{
"epoch": 2.863157894736842,
"grad_norm": 0.2918352484703064,
"learning_rate": 0.00019965150083394885,
"loss": 0.9385,
"step": 136
},
{
"epoch": 2.8842105263157896,
"grad_norm": 0.26381438970565796,
"learning_rate": 0.00019964595042927088,
"loss": 0.9742,
"step": 137
},
{
"epoch": 2.905263157894737,
"grad_norm": 0.3624316155910492,
"learning_rate": 0.000199640356251957,
"loss": 1.1089,
"step": 138
},
{
"epoch": 2.9263157894736844,
"grad_norm": 0.3607443869113922,
"learning_rate": 0.00019963471830446462,
"loss": 1.0243,
"step": 139
},
{
"epoch": 2.9473684210526314,
"grad_norm": 0.3374713063240051,
"learning_rate": 0.00019962903658927037,
"loss": 1.0215,
"step": 140
},
{
"epoch": 2.9473684210526314,
"eval_loss": 1.1921439170837402,
"eval_runtime": 15.1014,
"eval_samples_per_second": 6.622,
"eval_steps_per_second": 0.662,
"step": 140
},
{
"epoch": 2.968421052631579,
"grad_norm": 0.2549107074737549,
"learning_rate": 0.00019962331110887015,
"loss": 0.9862,
"step": 141
},
{
"epoch": 2.9894736842105263,
"grad_norm": 0.28837814927101135,
"learning_rate": 0.00019961754186577902,
"loss": 1.0686,
"step": 142
},
{
"epoch": 3.0105263157894737,
"grad_norm": 0.2922016978263855,
"learning_rate": 0.00019961172886253135,
"loss": 0.9724,
"step": 143
},
{
"epoch": 3.031578947368421,
"grad_norm": 0.3212421238422394,
"learning_rate": 0.00019960587210168064,
"loss": 0.9277,
"step": 144
},
{
"epoch": 3.0526315789473686,
"grad_norm": 0.2664543390274048,
"learning_rate": 0.00019959997158579967,
"loss": 0.851,
"step": 145
},
{
"epoch": 3.0736842105263156,
"grad_norm": 0.25021764636039734,
"learning_rate": 0.00019959402731748046,
"loss": 0.8162,
"step": 146
},
{
"epoch": 3.094736842105263,
"grad_norm": 0.2933099567890167,
"learning_rate": 0.00019958803929933421,
"loss": 0.8726,
"step": 147
},
{
"epoch": 3.1157894736842104,
"grad_norm": 0.2778937518596649,
"learning_rate": 0.0001995820075339913,
"loss": 0.8583,
"step": 148
},
{
"epoch": 3.136842105263158,
"grad_norm": 0.3159140646457672,
"learning_rate": 0.0001995759320241014,
"loss": 0.9162,
"step": 149
},
{
"epoch": 3.1578947368421053,
"grad_norm": 0.3638457655906677,
"learning_rate": 0.0001995698127723334,
"loss": 0.8698,
"step": 150
},
{
"epoch": 3.1789473684210527,
"grad_norm": 0.3314703702926636,
"learning_rate": 0.00019956364978137534,
"loss": 0.7974,
"step": 151
},
{
"epoch": 3.2,
"grad_norm": 0.3441965878009796,
"learning_rate": 0.00019955744305393452,
"loss": 0.8227,
"step": 152
},
{
"epoch": 3.221052631578947,
"grad_norm": 0.3249785304069519,
"learning_rate": 0.00019955119259273743,
"loss": 1.0359,
"step": 153
},
{
"epoch": 3.2421052631578946,
"grad_norm": 0.3665757477283478,
"learning_rate": 0.0001995448984005298,
"loss": 0.8252,
"step": 154
},
{
"epoch": 3.263157894736842,
"grad_norm": 0.36660316586494446,
"learning_rate": 0.00019953856048007652,
"loss": 0.951,
"step": 155
},
{
"epoch": 3.2842105263157895,
"grad_norm": 0.3821450173854828,
"learning_rate": 0.0001995321788341618,
"loss": 0.8429,
"step": 156
},
{
"epoch": 3.305263157894737,
"grad_norm": 0.3389227092266083,
"learning_rate": 0.0001995257534655889,
"loss": 0.8443,
"step": 157
},
{
"epoch": 3.3263157894736843,
"grad_norm": 0.37333276867866516,
"learning_rate": 0.00019951928437718039,
"loss": 0.8121,
"step": 158
},
{
"epoch": 3.3473684210526318,
"grad_norm": 0.3941514194011688,
"learning_rate": 0.000199512771571778,
"loss": 0.7905,
"step": 159
},
{
"epoch": 3.3684210526315788,
"grad_norm": 0.36983492970466614,
"learning_rate": 0.00019950621505224273,
"loss": 0.8495,
"step": 160
},
{
"epoch": 3.3684210526315788,
"eval_loss": 1.2672936916351318,
"eval_runtime": 15.1025,
"eval_samples_per_second": 6.621,
"eval_steps_per_second": 0.662,
"step": 160
},
{
"epoch": 3.389473684210526,
"grad_norm": 0.3236539363861084,
"learning_rate": 0.00019949961482145474,
"loss": 0.7451,
"step": 161
},
{
"epoch": 3.4105263157894736,
"grad_norm": 0.3834153413772583,
"learning_rate": 0.00019949297088231335,
"loss": 0.8501,
"step": 162
},
{
"epoch": 3.431578947368421,
"grad_norm": 0.40946483612060547,
"learning_rate": 0.00019948628323773716,
"loss": 0.7928,
"step": 163
},
{
"epoch": 3.4526315789473685,
"grad_norm": 0.3998974859714508,
"learning_rate": 0.00019947955189066388,
"loss": 0.8213,
"step": 164
},
{
"epoch": 3.473684210526316,
"grad_norm": 0.3378157317638397,
"learning_rate": 0.00019947277684405056,
"loss": 0.8236,
"step": 165
},
{
"epoch": 3.4947368421052634,
"grad_norm": 0.3661404848098755,
"learning_rate": 0.00019946595810087323,
"loss": 0.8427,
"step": 166
},
{
"epoch": 3.515789473684211,
"grad_norm": 0.3834976255893707,
"learning_rate": 0.0001994590956641273,
"loss": 0.7935,
"step": 167
},
{
"epoch": 3.536842105263158,
"grad_norm": 0.38265261054039,
"learning_rate": 0.00019945218953682734,
"loss": 0.8425,
"step": 168
},
{
"epoch": 3.557894736842105,
"grad_norm": 0.3965548276901245,
"learning_rate": 0.00019944523972200705,
"loss": 0.7629,
"step": 169
},
{
"epoch": 3.5789473684210527,
"grad_norm": 0.33844998478889465,
"learning_rate": 0.00019943824622271935,
"loss": 0.8272,
"step": 170
},
{
"epoch": 3.6,
"grad_norm": 0.3963635563850403,
"learning_rate": 0.0001994312090420364,
"loss": 0.8769,
"step": 171
},
{
"epoch": 3.6210526315789475,
"grad_norm": 0.4465744197368622,
"learning_rate": 0.00019942412818304943,
"loss": 0.8587,
"step": 172
},
{
"epoch": 3.6421052631578945,
"grad_norm": 0.4285814166069031,
"learning_rate": 0.00019941700364886899,
"loss": 0.8129,
"step": 173
},
{
"epoch": 3.663157894736842,
"grad_norm": 0.4269882142543793,
"learning_rate": 0.00019940983544262472,
"loss": 1.0085,
"step": 174
},
{
"epoch": 3.6842105263157894,
"grad_norm": 0.4019848108291626,
"learning_rate": 0.00019940262356746554,
"loss": 0.8352,
"step": 175
},
{
"epoch": 3.705263157894737,
"grad_norm": 0.3888452351093292,
"learning_rate": 0.00019939536802655945,
"loss": 0.9718,
"step": 176
},
{
"epoch": 3.7263157894736842,
"grad_norm": 0.4387454688549042,
"learning_rate": 0.00019938806882309368,
"loss": 1.1492,
"step": 177
},
{
"epoch": 3.7473684210526317,
"grad_norm": 0.47144615650177,
"learning_rate": 0.00019938072596027462,
"loss": 0.8351,
"step": 178
},
{
"epoch": 3.768421052631579,
"grad_norm": 0.37782907485961914,
"learning_rate": 0.0001993733394413279,
"loss": 0.8221,
"step": 179
},
{
"epoch": 3.7894736842105265,
"grad_norm": 0.3969016969203949,
"learning_rate": 0.0001993659092694982,
"loss": 0.901,
"step": 180
},
{
"epoch": 3.7894736842105265,
"eval_loss": 1.2610888481140137,
"eval_runtime": 15.0998,
"eval_samples_per_second": 6.623,
"eval_steps_per_second": 0.662,
"step": 180
},
{
"epoch": 3.8105263157894735,
"grad_norm": 0.34853246808052063,
"learning_rate": 0.00019935843544804956,
"loss": 0.9159,
"step": 181
},
{
"epoch": 3.831578947368421,
"grad_norm": 0.3776082396507263,
"learning_rate": 0.00019935091798026507,
"loss": 0.9256,
"step": 182
},
{
"epoch": 3.8526315789473684,
"grad_norm": 0.42846596240997314,
"learning_rate": 0.00019934335686944694,
"loss": 0.9343,
"step": 183
},
{
"epoch": 3.873684210526316,
"grad_norm": 0.38637641072273254,
"learning_rate": 0.0001993357521189167,
"loss": 0.9058,
"step": 184
},
{
"epoch": 3.8947368421052633,
"grad_norm": 0.35184672474861145,
"learning_rate": 0.00019932810373201495,
"loss": 0.8512,
"step": 185
},
{
"epoch": 3.9157894736842103,
"grad_norm": 0.3937157690525055,
"learning_rate": 0.00019932041171210151,
"loss": 0.926,
"step": 186
},
{
"epoch": 3.9368421052631577,
"grad_norm": 0.4368656277656555,
"learning_rate": 0.0001993126760625553,
"loss": 1.0166,
"step": 187
},
{
"epoch": 3.957894736842105,
"grad_norm": 0.3807603418827057,
"learning_rate": 0.0001993048967867745,
"loss": 0.9002,
"step": 188
},
{
"epoch": 3.9789473684210526,
"grad_norm": 0.3753417432308197,
"learning_rate": 0.00019929707388817637,
"loss": 0.9097,
"step": 189
},
{
"epoch": 4.0,
"grad_norm": 0.4108405113220215,
"learning_rate": 0.00019928920737019733,
"loss": 0.9339,
"step": 190
},
{
"epoch": 4.021052631578947,
"grad_norm": 0.3922579288482666,
"learning_rate": 0.0001992812972362931,
"loss": 0.7981,
"step": 191
},
{
"epoch": 4.042105263157895,
"grad_norm": 0.377907931804657,
"learning_rate": 0.00019927334348993837,
"loss": 0.7216,
"step": 192
},
{
"epoch": 4.063157894736842,
"grad_norm": 0.3310043513774872,
"learning_rate": 0.00019926534613462707,
"loss": 0.6362,
"step": 193
},
{
"epoch": 4.08421052631579,
"grad_norm": 0.38383597135543823,
"learning_rate": 0.00019925730517387239,
"loss": 0.6881,
"step": 194
},
{
"epoch": 4.105263157894737,
"grad_norm": 0.4816322326660156,
"learning_rate": 0.00019924922061120644,
"loss": 0.6735,
"step": 195
},
{
"epoch": 4.126315789473685,
"grad_norm": 0.4419063329696655,
"learning_rate": 0.00019924109245018072,
"loss": 0.6995,
"step": 196
},
{
"epoch": 4.147368421052631,
"grad_norm": 0.4181762933731079,
"learning_rate": 0.00019923292069436578,
"loss": 0.7638,
"step": 197
},
{
"epoch": 4.168421052631579,
"grad_norm": 0.4450121521949768,
"learning_rate": 0.00019922470534735123,
"loss": 0.6745,
"step": 198
},
{
"epoch": 4.189473684210526,
"grad_norm": 0.4250476658344269,
"learning_rate": 0.000199216446412746,
"loss": 0.8314,
"step": 199
},
{
"epoch": 4.2105263157894735,
"grad_norm": 0.4420212507247925,
"learning_rate": 0.0001992081438941781,
"loss": 0.7058,
"step": 200
},
{
"epoch": 4.2105263157894735,
"eval_loss": 1.3737410306930542,
"eval_runtime": 15.0784,
"eval_samples_per_second": 6.632,
"eval_steps_per_second": 0.663,
"step": 200
},
{
"epoch": 4.231578947368421,
"grad_norm": 0.4359537363052368,
"learning_rate": 0.00019919979779529462,
"loss": 0.7697,
"step": 201
},
{
"epoch": 4.252631578947368,
"grad_norm": 0.44620540738105774,
"learning_rate": 0.0001991914081197619,
"loss": 0.6539,
"step": 202
},
{
"epoch": 4.273684210526316,
"grad_norm": 0.4683506488800049,
"learning_rate": 0.0001991829748712653,
"loss": 0.6729,
"step": 203
},
{
"epoch": 4.294736842105263,
"grad_norm": 0.45054468512535095,
"learning_rate": 0.00019917449805350947,
"loss": 0.7936,
"step": 204
},
{
"epoch": 4.315789473684211,
"grad_norm": 0.3688434958457947,
"learning_rate": 0.00019916597767021807,
"loss": 0.7257,
"step": 205
},
{
"epoch": 4.336842105263158,
"grad_norm": 0.44829288125038147,
"learning_rate": 0.00019915741372513398,
"loss": 0.8966,
"step": 206
},
{
"epoch": 4.3578947368421055,
"grad_norm": 0.5000961422920227,
"learning_rate": 0.00019914880622201912,
"loss": 0.6876,
"step": 207
},
{
"epoch": 4.378947368421053,
"grad_norm": 0.4218606948852539,
"learning_rate": 0.0001991401551646547,
"loss": 0.6577,
"step": 208
},
{
"epoch": 4.4,
"grad_norm": 0.41971078515052795,
"learning_rate": 0.00019913146055684092,
"loss": 0.6466,
"step": 209
},
{
"epoch": 4.421052631578947,
"grad_norm": 0.5077884793281555,
"learning_rate": 0.00019912272240239716,
"loss": 0.6927,
"step": 210
},
{
"epoch": 4.442105263157894,
"grad_norm": 0.4335264265537262,
"learning_rate": 0.00019911394070516194,
"loss": 0.6726,
"step": 211
},
{
"epoch": 4.463157894736842,
"grad_norm": 0.4315473139286041,
"learning_rate": 0.0001991051154689929,
"loss": 0.6704,
"step": 212
},
{
"epoch": 4.484210526315789,
"grad_norm": 0.41848331689834595,
"learning_rate": 0.0001990962466977668,
"loss": 0.7625,
"step": 213
},
{
"epoch": 4.505263157894737,
"grad_norm": 0.43444666266441345,
"learning_rate": 0.0001990873343953795,
"loss": 0.6806,
"step": 214
},
{
"epoch": 4.526315789473684,
"grad_norm": 0.45425352454185486,
"learning_rate": 0.00019907837856574607,
"loss": 0.6422,
"step": 215
},
{
"epoch": 4.5473684210526315,
"grad_norm": 0.39107388257980347,
"learning_rate": 0.0001990693792128006,
"loss": 0.6979,
"step": 216
},
{
"epoch": 4.568421052631579,
"grad_norm": 0.5015237927436829,
"learning_rate": 0.00019906033634049637,
"loss": 0.76,
"step": 217
},
{
"epoch": 4.589473684210526,
"grad_norm": 0.5905852317810059,
"learning_rate": 0.00019905124995280572,
"loss": 0.6172,
"step": 218
},
{
"epoch": 4.610526315789474,
"grad_norm": 0.4459174573421478,
"learning_rate": 0.0001990421200537201,
"loss": 0.6808,
"step": 219
},
{
"epoch": 4.631578947368421,
"grad_norm": 0.4470289647579193,
"learning_rate": 0.0001990329466472502,
"loss": 0.7428,
"step": 220
},
{
"epoch": 4.631578947368421,
"eval_loss": 1.3823609352111816,
"eval_runtime": 15.1002,
"eval_samples_per_second": 6.622,
"eval_steps_per_second": 0.662,
"step": 220
},
{
"epoch": 4.652631578947369,
"grad_norm": 0.5562801361083984,
"learning_rate": 0.00019902372973742565,
"loss": 0.6417,
"step": 221
},
{
"epoch": 4.673684210526316,
"grad_norm": 0.4418918788433075,
"learning_rate": 0.00019901446932829532,
"loss": 0.6833,
"step": 222
},
{
"epoch": 4.6947368421052635,
"grad_norm": 0.4580042064189911,
"learning_rate": 0.00019900516542392712,
"loss": 0.7247,
"step": 223
},
{
"epoch": 4.715789473684211,
"grad_norm": 0.4451601207256317,
"learning_rate": 0.00019899581802840802,
"loss": 0.6519,
"step": 224
},
{
"epoch": 4.7368421052631575,
"grad_norm": 0.46985530853271484,
"learning_rate": 0.00019898642714584428,
"loss": 0.6464,
"step": 225
},
{
"epoch": 4.757894736842105,
"grad_norm": 0.4769364893436432,
"learning_rate": 0.00019897699278036108,
"loss": 0.6946,
"step": 226
},
{
"epoch": 4.778947368421052,
"grad_norm": 0.4480087459087372,
"learning_rate": 0.00019896751493610272,
"loss": 0.5716,
"step": 227
},
{
"epoch": 4.8,
"grad_norm": 0.4228164851665497,
"learning_rate": 0.00019895799361723272,
"loss": 0.6828,
"step": 228
},
{
"epoch": 4.821052631578947,
"grad_norm": 0.488390177488327,
"learning_rate": 0.00019894842882793362,
"loss": 0.7358,
"step": 229
},
{
"epoch": 4.842105263157895,
"grad_norm": 0.4492754340171814,
"learning_rate": 0.000198938820572407,
"loss": 0.6468,
"step": 230
},
{
"epoch": 4.863157894736842,
"grad_norm": 0.48078247904777527,
"learning_rate": 0.00019892916885487356,
"loss": 0.7425,
"step": 231
},
{
"epoch": 4.88421052631579,
"grad_norm": 0.3910196125507355,
"learning_rate": 0.00019891947367957322,
"loss": 0.7077,
"step": 232
},
{
"epoch": 4.905263157894737,
"grad_norm": 0.5010913610458374,
"learning_rate": 0.00019890973505076485,
"loss": 0.791,
"step": 233
},
{
"epoch": 4.926315789473684,
"grad_norm": 0.3905860483646393,
"learning_rate": 0.00019889995297272647,
"loss": 0.6987,
"step": 234
},
{
"epoch": 4.947368421052632,
"grad_norm": 0.5714398622512817,
"learning_rate": 0.00019889012744975508,
"loss": 0.6656,
"step": 235
},
{
"epoch": 4.968421052631579,
"grad_norm": 0.47835731506347656,
"learning_rate": 0.00019888025848616695,
"loss": 0.6948,
"step": 236
},
{
"epoch": 4.989473684210527,
"grad_norm": 0.4666154086589813,
"learning_rate": 0.00019887034608629734,
"loss": 0.7628,
"step": 237
},
{
"epoch": 5.010526315789473,
"grad_norm": 0.470474511384964,
"learning_rate": 0.0001988603902545005,
"loss": 0.529,
"step": 238
},
{
"epoch": 5.031578947368421,
"grad_norm": 0.4203038513660431,
"learning_rate": 0.00019885039099514992,
"loss": 0.529,
"step": 239
},
{
"epoch": 5.052631578947368,
"grad_norm": 0.46336621046066284,
"learning_rate": 0.00019884034831263808,
"loss": 0.4866,
"step": 240
},
{
"epoch": 5.052631578947368,
"eval_loss": 1.4474551677703857,
"eval_runtime": 15.0991,
"eval_samples_per_second": 6.623,
"eval_steps_per_second": 0.662,
"step": 240
},
{
"epoch": 5.073684210526316,
"grad_norm": 0.4180259704589844,
"learning_rate": 0.00019883026221137652,
"loss": 0.526,
"step": 241
},
{
"epoch": 5.094736842105263,
"grad_norm": 0.42054420709609985,
"learning_rate": 0.00019882013269579584,
"loss": 0.4412,
"step": 242
},
{
"epoch": 5.11578947368421,
"grad_norm": 0.5846607089042664,
"learning_rate": 0.00019880995977034584,
"loss": 0.5306,
"step": 243
},
{
"epoch": 5.136842105263158,
"grad_norm": 0.6321561932563782,
"learning_rate": 0.00019879974343949526,
"loss": 0.575,
"step": 244
},
{
"epoch": 5.157894736842105,
"grad_norm": 0.48956233263015747,
"learning_rate": 0.00019878948370773193,
"loss": 0.4667,
"step": 245
},
{
"epoch": 5.178947368421053,
"grad_norm": 0.49197542667388916,
"learning_rate": 0.00019877918057956278,
"loss": 0.473,
"step": 246
},
{
"epoch": 5.2,
"grad_norm": 0.5268818736076355,
"learning_rate": 0.00019876883405951377,
"loss": 0.6249,
"step": 247
},
{
"epoch": 5.221052631578948,
"grad_norm": 0.4883573651313782,
"learning_rate": 0.00019875844415212997,
"loss": 0.5239,
"step": 248
},
{
"epoch": 5.242105263157895,
"grad_norm": 0.45860010385513306,
"learning_rate": 0.00019874801086197544,
"loss": 0.5462,
"step": 249
},
{
"epoch": 5.2631578947368425,
"grad_norm": 0.41302675008773804,
"learning_rate": 0.00019873753419363336,
"loss": 0.5144,
"step": 250
},
{
"epoch": 5.284210526315789,
"grad_norm": 0.550791323184967,
"learning_rate": 0.00019872701415170593,
"loss": 0.5071,
"step": 251
},
{
"epoch": 5.3052631578947365,
"grad_norm": 0.4419604539871216,
"learning_rate": 0.00019871645074081434,
"loss": 0.4598,
"step": 252
},
{
"epoch": 5.326315789473684,
"grad_norm": 0.5271047353744507,
"learning_rate": 0.00019870584396559902,
"loss": 0.5444,
"step": 253
},
{
"epoch": 5.347368421052631,
"grad_norm": 0.4978967308998108,
"learning_rate": 0.00019869519383071928,
"loss": 0.5829,
"step": 254
},
{
"epoch": 5.368421052631579,
"grad_norm": 0.5046519041061401,
"learning_rate": 0.00019868450034085352,
"loss": 0.5343,
"step": 255
},
{
"epoch": 5.389473684210526,
"grad_norm": 0.5924373865127563,
"learning_rate": 0.0001986737635006992,
"loss": 0.514,
"step": 256
},
{
"epoch": 5.410526315789474,
"grad_norm": 0.47235432267189026,
"learning_rate": 0.00019866298331497283,
"loss": 0.4899,
"step": 257
},
{
"epoch": 5.431578947368421,
"grad_norm": 0.49679791927337646,
"learning_rate": 0.0001986521597884099,
"loss": 0.5483,
"step": 258
},
{
"epoch": 5.4526315789473685,
"grad_norm": 0.4871433973312378,
"learning_rate": 0.00019864129292576505,
"loss": 0.5544,
"step": 259
},
{
"epoch": 5.473684210526316,
"grad_norm": 0.5678947567939758,
"learning_rate": 0.00019863038273181186,
"loss": 0.5298,
"step": 260
},
{
"epoch": 5.473684210526316,
"eval_loss": 1.5484461784362793,
"eval_runtime": 15.0951,
"eval_samples_per_second": 6.625,
"eval_steps_per_second": 0.662,
"step": 260
},
{
"epoch": 5.494736842105263,
"grad_norm": 0.541333019733429,
"learning_rate": 0.00019861942921134298,
"loss": 0.5321,
"step": 261
},
{
"epoch": 5.515789473684211,
"grad_norm": 0.45463690161705017,
"learning_rate": 0.0001986084323691701,
"loss": 0.5239,
"step": 262
},
{
"epoch": 5.536842105263158,
"grad_norm": 0.5732460618019104,
"learning_rate": 0.0001985973922101239,
"loss": 0.4861,
"step": 263
},
{
"epoch": 5.557894736842105,
"grad_norm": 0.4361143112182617,
"learning_rate": 0.00019858630873905418,
"loss": 0.5427,
"step": 264
},
{
"epoch": 5.578947368421053,
"grad_norm": 0.48954471945762634,
"learning_rate": 0.00019857518196082964,
"loss": 0.5614,
"step": 265
},
{
"epoch": 5.6,
"grad_norm": 0.5832586884498596,
"learning_rate": 0.0001985640118803381,
"loss": 0.4603,
"step": 266
},
{
"epoch": 5.621052631578947,
"grad_norm": 0.5026202201843262,
"learning_rate": 0.0001985527985024864,
"loss": 0.6399,
"step": 267
},
{
"epoch": 5.6421052631578945,
"grad_norm": 0.4579145908355713,
"learning_rate": 0.0001985415418322003,
"loss": 0.5354,
"step": 268
},
{
"epoch": 5.663157894736842,
"grad_norm": 0.545054018497467,
"learning_rate": 0.00019853024187442472,
"loss": 0.5158,
"step": 269
},
{
"epoch": 5.684210526315789,
"grad_norm": 0.48174452781677246,
"learning_rate": 0.00019851889863412345,
"loss": 0.5014,
"step": 270
},
{
"epoch": 5.705263157894737,
"grad_norm": 0.5417779684066772,
"learning_rate": 0.00019850751211627945,
"loss": 0.54,
"step": 271
},
{
"epoch": 5.726315789473684,
"grad_norm": 0.46869099140167236,
"learning_rate": 0.00019849608232589457,
"loss": 0.5416,
"step": 272
},
{
"epoch": 5.747368421052632,
"grad_norm": 0.6471317410469055,
"learning_rate": 0.00019848460926798968,
"loss": 0.5962,
"step": 273
},
{
"epoch": 5.768421052631579,
"grad_norm": 0.5855197310447693,
"learning_rate": 0.00019847309294760473,
"loss": 0.6314,
"step": 274
},
{
"epoch": 5.7894736842105265,
"grad_norm": 0.5380208492279053,
"learning_rate": 0.00019846153336979856,
"loss": 0.5651,
"step": 275
},
{
"epoch": 5.810526315789474,
"grad_norm": 0.46017733216285706,
"learning_rate": 0.00019844993053964917,
"loss": 0.5575,
"step": 276
},
{
"epoch": 5.831578947368421,
"grad_norm": 0.49735313653945923,
"learning_rate": 0.00019843828446225342,
"loss": 0.5628,
"step": 277
},
{
"epoch": 5.852631578947369,
"grad_norm": 0.5164270401000977,
"learning_rate": 0.0001984265951427272,
"loss": 0.5026,
"step": 278
},
{
"epoch": 5.873684210526315,
"grad_norm": 0.5263252258300781,
"learning_rate": 0.00019841486258620545,
"loss": 0.5588,
"step": 279
},
{
"epoch": 5.894736842105263,
"grad_norm": 0.47757405042648315,
"learning_rate": 0.00019840308679784207,
"loss": 0.5671,
"step": 280
},
{
"epoch": 5.894736842105263,
"eval_loss": 1.5009753704071045,
"eval_runtime": 15.1017,
"eval_samples_per_second": 6.622,
"eval_steps_per_second": 0.662,
"step": 280
}
],
"logging_steps": 1,
"max_steps": 4750,
"num_input_tokens_seen": 0,
"num_train_epochs": 102,
"save_steps": 20,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 10
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.740650038525952e+17,
"train_batch_size": 10,
"trial_name": null,
"trial_params": null
}
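
The JSON above is the `trainer_state.json` written by the Hugging Face Transformers `Trainer` for this checkpoint. It records the best eval loss (1.1512 at step 80, saved as `checkpoint-80`), an evaluation every 20 steps, and an `EarlyStoppingCallback` with patience 10 whose counter has reached 10, so `should_training_stop` is true at step 280 (80 + 10 × 20). Below is a minimal sketch, assuming the file has been downloaded locally as `trainer_state.json` and that `matplotlib` is optionally installed; the file path and plot output name are illustrative, not part of the checkpoint.

```python
# Sketch: inspect this trainer_state.json offline (assumed local path).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training entries ("loss") and evaluation entries ("eval_loss").
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best eval_loss {state['best_metric']:.4f} "
      f"at {state['best_model_checkpoint']}")

# Reproduce the early-stopping arithmetic from stateful_callbacks:
# best checkpoint at step 80, eval every 20 steps, patience 10
# -> training stops 10 evaluations later, at global_step 280.
best_step = int(state["best_model_checkpoint"].rsplit("-", 1)[-1])
patience = state["stateful_callbacks"]["EarlyStoppingCallback"]["args"][
    "early_stopping_patience"]
expected_stop = best_step + patience * state["eval_steps"]
print(f"expected stop step: {expected_stop}, "
      f"actual global_step: {state['global_step']}")

# Optional: plot train vs. eval loss if matplotlib is available.
try:
    import matplotlib.pyplot as plt

    plt.plot([e["step"] for e in train_log],
             [e["loss"] for e in train_log], label="train loss")
    plt.plot([e["step"] for e in eval_log],
             [e["eval_loss"] for e in eval_log], label="eval loss")
    plt.xlabel("global step")
    plt.ylabel("loss")
    plt.legend()
    plt.savefig("loss_curves.png")
except ImportError:
    pass
```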