{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999558635300348,
"eval_steps": 2000,
"global_step": 11328,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 8.827293993026438e-05,
"eval_accuracy": 0.31068875219818615,
"eval_loss": 5.8817362785339355,
"eval_runtime": 7.092,
"eval_samples_per_second": 44.839,
"eval_steps_per_second": 0.423,
"step": 1
},
{
"epoch": 0.0008827293993026437,
"grad_norm": 7.5625,
"learning_rate": 5e-05,
"loss": 6.1788,
"step": 10
},
{
"epoch": 0.0017654587986052875,
"grad_norm": 7.21875,
"learning_rate": 0.0001,
"loss": 5.9299,
"step": 20
},
{
"epoch": 0.0026481881979079315,
"grad_norm": 2.078125,
"learning_rate": 0.00015,
"loss": 5.2333,
"step": 30
},
{
"epoch": 0.003530917597210575,
"grad_norm": 1.2890625,
"learning_rate": 0.0002,
"loss": 4.6613,
"step": 40
},
{
"epoch": 0.0044136469965132185,
"grad_norm": 0.53515625,
"learning_rate": 0.00025,
"loss": 4.3038,
"step": 50
},
{
"epoch": 0.005296376395815863,
"grad_norm": 0.35546875,
"learning_rate": 0.0003,
"loss": 3.9929,
"step": 60
},
{
"epoch": 0.0061791057951185065,
"grad_norm": 0.298828125,
"learning_rate": 0.00035,
"loss": 3.7479,
"step": 70
},
{
"epoch": 0.00706183519442115,
"grad_norm": 0.267578125,
"learning_rate": 0.0004,
"loss": 3.5018,
"step": 80
},
{
"epoch": 0.007944564593723794,
"grad_norm": 4.59375,
"learning_rate": 0.00045000000000000004,
"loss": 3.3363,
"step": 90
},
{
"epoch": 0.008827293993026437,
"grad_norm": 0.28125,
"learning_rate": 0.0005,
"loss": 3.1974,
"step": 100
},
{
"epoch": 0.009710023392329082,
"grad_norm": 0.296875,
"learning_rate": 0.0004999990214012265,
"loss": 3.1016,
"step": 110
},
{
"epoch": 0.010592752791631726,
"grad_norm": 0.46484375,
"learning_rate": 0.000499996085612567,
"loss": 3.0369,
"step": 120
},
{
"epoch": 0.01147548219093437,
"grad_norm": 0.2890625,
"learning_rate": 0.0004999911926570055,
"loss": 2.9845,
"step": 130
},
{
"epoch": 0.012358211590237013,
"grad_norm": 0.3125,
"learning_rate": 0.0004999843425728476,
"loss": 2.9364,
"step": 140
},
{
"epoch": 0.013240940989539656,
"grad_norm": 0.34375,
"learning_rate": 0.0004999755354137212,
"loss": 2.899,
"step": 150
},
{
"epoch": 0.0141236703888423,
"grad_norm": 0.26953125,
"learning_rate": 0.000499964771248576,
"loss": 2.8838,
"step": 160
},
{
"epoch": 0.015006399788144944,
"grad_norm": 0.251953125,
"learning_rate": 0.000499952050161682,
"loss": 2.8561,
"step": 170
},
{
"epoch": 0.015889129187447587,
"grad_norm": 0.3046875,
"learning_rate": 0.0004999373722526303,
"loss": 2.8367,
"step": 180
},
{
"epoch": 0.016771858586750232,
"grad_norm": 0.298828125,
"learning_rate": 0.0004999207376363309,
"loss": 2.8232,
"step": 190
},
{
"epoch": 0.017654587986052874,
"grad_norm": 0.28515625,
"learning_rate": 0.0004999021464430128,
"loss": 2.811,
"step": 200
},
{
"epoch": 0.01853731738535552,
"grad_norm": 0.2890625,
"learning_rate": 0.0004998815988182225,
"loss": 2.8107,
"step": 210
},
{
"epoch": 0.019420046784658165,
"grad_norm": 0.455078125,
"learning_rate": 0.0004998590949228232,
"loss": 2.7771,
"step": 220
},
{
"epoch": 0.020302776183960806,
"grad_norm": 0.29296875,
"learning_rate": 0.000499834634932993,
"loss": 2.7739,
"step": 230
},
{
"epoch": 0.02118550558326345,
"grad_norm": 0.26171875,
"learning_rate": 0.0004998082190402241,
"loss": 2.7691,
"step": 240
},
{
"epoch": 0.022068234982566094,
"grad_norm": 0.26171875,
"learning_rate": 0.0004997798474513211,
"loss": 2.7592,
"step": 250
},
{
"epoch": 0.02295096438186874,
"grad_norm": 0.3203125,
"learning_rate": 0.000499749520388399,
"loss": 2.7538,
"step": 260
},
{
"epoch": 0.02383369378117138,
"grad_norm": 0.3125,
"learning_rate": 0.0004997172380888822,
"loss": 2.7447,
"step": 270
},
{
"epoch": 0.024716423180474026,
"grad_norm": 0.283203125,
"learning_rate": 0.0004996830008055017,
"loss": 2.729,
"step": 280
},
{
"epoch": 0.02559915257977667,
"grad_norm": 0.2412109375,
"learning_rate": 0.0004996468088062946,
"loss": 2.7356,
"step": 290
},
{
"epoch": 0.026481881979079313,
"grad_norm": 0.24609375,
"learning_rate": 0.0004996086623746,
"loss": 2.7239,
"step": 300
},
{
"epoch": 0.027364611378381958,
"grad_norm": 0.326171875,
"learning_rate": 0.0004995685618090584,
"loss": 2.7162,
"step": 310
},
{
"epoch": 0.0282473407776846,
"grad_norm": 0.28125,
"learning_rate": 0.0004995265074236088,
"loss": 2.7254,
"step": 320
},
{
"epoch": 0.029130070176987245,
"grad_norm": 0.2470703125,
"learning_rate": 0.0004994824995474863,
"loss": 2.7169,
"step": 330
},
{
"epoch": 0.030012799576289887,
"grad_norm": 0.271484375,
"learning_rate": 0.0004994365385252189,
"loss": 2.7328,
"step": 340
},
{
"epoch": 0.030895528975592532,
"grad_norm": 0.390625,
"learning_rate": 0.0004993886247166261,
"loss": 2.7327,
"step": 350
},
{
"epoch": 0.031778258374895174,
"grad_norm": 0.388671875,
"learning_rate": 0.000499338758496815,
"loss": 2.7076,
"step": 360
},
{
"epoch": 0.03266098777419782,
"grad_norm": 0.29296875,
"learning_rate": 0.000499286940256178,
"loss": 2.7214,
"step": 370
},
{
"epoch": 0.033543717173500465,
"grad_norm": 0.255859375,
"learning_rate": 0.0004992331704003889,
"loss": 2.7024,
"step": 380
},
{
"epoch": 0.03442644657280311,
"grad_norm": 0.267578125,
"learning_rate": 0.0004991774493504007,
"loss": 2.7097,
"step": 390
},
{
"epoch": 0.03530917597210575,
"grad_norm": 0.2890625,
"learning_rate": 0.0004991197775424418,
"loss": 2.6817,
"step": 400
},
{
"epoch": 0.036191905371408394,
"grad_norm": 0.2451171875,
"learning_rate": 0.0004990601554280128,
"loss": 2.7019,
"step": 410
},
{
"epoch": 0.03707463477071104,
"grad_norm": 0.267578125,
"learning_rate": 0.0004989985834738824,
"loss": 2.6888,
"step": 420
},
{
"epoch": 0.037957364170013684,
"grad_norm": 0.25,
"learning_rate": 0.0004989350621620851,
"loss": 2.6891,
"step": 430
},
{
"epoch": 0.03884009356931633,
"grad_norm": 0.220703125,
"learning_rate": 0.0004988695919899154,
"loss": 2.7029,
"step": 440
},
{
"epoch": 0.03972282296861897,
"grad_norm": 0.28515625,
"learning_rate": 0.0004988021734699258,
"loss": 2.6838,
"step": 450
},
{
"epoch": 0.04060555236792161,
"grad_norm": 0.255859375,
"learning_rate": 0.0004987328071299217,
"loss": 2.6746,
"step": 460
},
{
"epoch": 0.04148828176722426,
"grad_norm": 0.337890625,
"learning_rate": 0.0004986614935129576,
"loss": 2.6911,
"step": 470
},
{
"epoch": 0.0423710111665269,
"grad_norm": 0.265625,
"learning_rate": 0.0004985882331773328,
"loss": 2.6699,
"step": 480
},
{
"epoch": 0.04325374056582954,
"grad_norm": 0.298828125,
"learning_rate": 0.0004985130266965871,
"loss": 2.664,
"step": 490
},
{
"epoch": 0.04413646996513219,
"grad_norm": 0.3125,
"learning_rate": 0.0004984358746594964,
"loss": 2.6587,
"step": 500
},
{
"epoch": 0.04501919936443483,
"grad_norm": 0.294921875,
"learning_rate": 0.0004983567776700676,
"loss": 2.6734,
"step": 510
},
{
"epoch": 0.04590192876373748,
"grad_norm": 0.291015625,
"learning_rate": 0.0004982757363475346,
"loss": 2.6638,
"step": 520
},
{
"epoch": 0.04678465816304012,
"grad_norm": 0.25,
"learning_rate": 0.0004981927513263529,
"loss": 2.6759,
"step": 530
},
{
"epoch": 0.04766738756234276,
"grad_norm": 0.283203125,
"learning_rate": 0.0004981078232561947,
"loss": 2.6665,
"step": 540
},
{
"epoch": 0.048550116961645406,
"grad_norm": 0.328125,
"learning_rate": 0.0004980209528019441,
"loss": 2.6673,
"step": 550
},
{
"epoch": 0.04943284636094805,
"grad_norm": 0.263671875,
"learning_rate": 0.0004979321406436917,
"loss": 2.6545,
"step": 560
},
{
"epoch": 0.0503155757602507,
"grad_norm": 0.232421875,
"learning_rate": 0.0004978413874767291,
"loss": 2.6685,
"step": 570
},
{
"epoch": 0.05119830515955334,
"grad_norm": 0.341796875,
"learning_rate": 0.0004977486940115441,
"loss": 2.6715,
"step": 580
},
{
"epoch": 0.05208103455885598,
"grad_norm": 0.275390625,
"learning_rate": 0.0004976540609738143,
"loss": 2.6611,
"step": 590
},
{
"epoch": 0.052963763958158626,
"grad_norm": 0.2431640625,
"learning_rate": 0.0004975574891044017,
"loss": 2.6682,
"step": 600
},
{
"epoch": 0.05384649335746127,
"grad_norm": 0.27734375,
"learning_rate": 0.0004974589791593472,
"loss": 2.6512,
"step": 610
},
{
"epoch": 0.054729222756763916,
"grad_norm": 0.251953125,
"learning_rate": 0.0004973585319098648,
"loss": 2.6565,
"step": 620
},
{
"epoch": 0.055611952156066555,
"grad_norm": 0.5546875,
"learning_rate": 0.0004972561481423346,
"loss": 2.6673,
"step": 630
},
{
"epoch": 0.0564946815553692,
"grad_norm": 0.333984375,
"learning_rate": 0.0004971518286582979,
"loss": 2.6604,
"step": 640
},
{
"epoch": 0.057377410954671845,
"grad_norm": 0.2373046875,
"learning_rate": 0.0004970455742744499,
"loss": 2.6483,
"step": 650
},
{
"epoch": 0.05826014035397449,
"grad_norm": 0.333984375,
"learning_rate": 0.0004969373858226341,
"loss": 2.6532,
"step": 660
},
{
"epoch": 0.059142869753277136,
"grad_norm": 0.29296875,
"learning_rate": 0.0004968272641498349,
"loss": 2.6505,
"step": 670
},
{
"epoch": 0.060025599152579774,
"grad_norm": 0.2392578125,
"learning_rate": 0.0004967152101181717,
"loss": 2.6512,
"step": 680
},
{
"epoch": 0.06090832855188242,
"grad_norm": 0.271484375,
"learning_rate": 0.0004966012246048924,
"loss": 2.6483,
"step": 690
},
{
"epoch": 0.061791057951185065,
"grad_norm": 0.2255859375,
"learning_rate": 0.0004964853085023653,
"loss": 2.6397,
"step": 700
},
{
"epoch": 0.0626737873504877,
"grad_norm": 0.2412109375,
"learning_rate": 0.0004963674627180735,
"loss": 2.6535,
"step": 710
},
{
"epoch": 0.06355651674979035,
"grad_norm": 0.328125,
"learning_rate": 0.0004962476881746068,
"loss": 2.6369,
"step": 720
},
{
"epoch": 0.064439246149093,
"grad_norm": 0.2890625,
"learning_rate": 0.000496125985809655,
"loss": 2.6288,
"step": 730
},
{
"epoch": 0.06532197554839564,
"grad_norm": 0.3359375,
"learning_rate": 0.0004960023565760003,
"loss": 2.6421,
"step": 740
},
{
"epoch": 0.06620470494769828,
"grad_norm": 0.29296875,
"learning_rate": 0.0004958768014415103,
"loss": 2.6378,
"step": 750
},
{
"epoch": 0.06708743434700093,
"grad_norm": 0.2451171875,
"learning_rate": 0.0004957493213891295,
"loss": 2.6562,
"step": 760
},
{
"epoch": 0.06797016374630357,
"grad_norm": 0.236328125,
"learning_rate": 0.0004956199174168725,
"loss": 2.638,
"step": 770
},
{
"epoch": 0.06885289314560622,
"grad_norm": 0.25390625,
"learning_rate": 0.000495488590537816,
"loss": 2.6232,
"step": 780
},
{
"epoch": 0.06973562254490885,
"grad_norm": 0.26953125,
"learning_rate": 0.0004953553417800905,
"loss": 2.6335,
"step": 790
},
{
"epoch": 0.0706183519442115,
"grad_norm": 0.255859375,
"learning_rate": 0.0004952201721868726,
"loss": 2.636,
"step": 800
},
{
"epoch": 0.07150108134351414,
"grad_norm": 0.23046875,
"learning_rate": 0.0004950830828163767,
"loss": 2.641,
"step": 810
},
{
"epoch": 0.07238381074281679,
"grad_norm": 0.248046875,
"learning_rate": 0.0004949440747418467,
"loss": 2.6415,
"step": 820
},
{
"epoch": 0.07326654014211943,
"grad_norm": 0.23828125,
"learning_rate": 0.0004948031490515476,
"loss": 2.6356,
"step": 830
},
{
"epoch": 0.07414926954142208,
"grad_norm": 0.498046875,
"learning_rate": 0.0004946603068487572,
"loss": 2.6286,
"step": 840
},
{
"epoch": 0.07503199894072472,
"grad_norm": 0.357421875,
"learning_rate": 0.0004945155492517569,
"loss": 2.6308,
"step": 850
},
{
"epoch": 0.07591472834002737,
"grad_norm": 0.25,
"learning_rate": 0.0004943688773938237,
"loss": 2.6379,
"step": 860
},
{
"epoch": 0.07679745773933001,
"grad_norm": 0.28125,
"learning_rate": 0.000494220292423221,
"loss": 2.6308,
"step": 870
},
{
"epoch": 0.07768018713863266,
"grad_norm": 0.259765625,
"learning_rate": 0.000494069795503189,
"loss": 2.6325,
"step": 880
},
{
"epoch": 0.07856291653793529,
"grad_norm": 0.25390625,
"learning_rate": 0.0004939173878119366,
"loss": 2.626,
"step": 890
},
{
"epoch": 0.07944564593723794,
"grad_norm": 0.263671875,
"learning_rate": 0.0004937630705426318,
"loss": 2.6191,
"step": 900
},
{
"epoch": 0.08032837533654058,
"grad_norm": 0.251953125,
"learning_rate": 0.000493606844903392,
"loss": 2.6315,
"step": 910
},
{
"epoch": 0.08121110473584323,
"grad_norm": 0.24609375,
"learning_rate": 0.000493448712117275,
"loss": 2.6306,
"step": 920
},
{
"epoch": 0.08209383413514587,
"grad_norm": 0.29296875,
"learning_rate": 0.0004932886734222693,
"loss": 2.6096,
"step": 930
},
{
"epoch": 0.08297656353444852,
"grad_norm": 0.279296875,
"learning_rate": 0.000493126730071284,
"loss": 2.6182,
"step": 940
},
{
"epoch": 0.08385929293375116,
"grad_norm": 0.234375,
"learning_rate": 0.0004929628833321397,
"loss": 2.63,
"step": 950
},
{
"epoch": 0.0847420223330538,
"grad_norm": 0.28515625,
"learning_rate": 0.0004927971344875585,
"loss": 2.6271,
"step": 960
},
{
"epoch": 0.08562475173235645,
"grad_norm": 0.390625,
"learning_rate": 0.0004926294848351528,
"loss": 2.6246,
"step": 970
},
{
"epoch": 0.08650748113165908,
"grad_norm": 0.310546875,
"learning_rate": 0.0004924599356874169,
"loss": 2.6244,
"step": 980
},
{
"epoch": 0.08739021053096173,
"grad_norm": 0.259765625,
"learning_rate": 0.0004922884883717154,
"loss": 2.609,
"step": 990
},
{
"epoch": 0.08827293993026437,
"grad_norm": 0.328125,
"learning_rate": 0.0004921151442302732,
"loss": 2.6245,
"step": 1000
},
{
"epoch": 0.08915566932956702,
"grad_norm": 0.2177734375,
"learning_rate": 0.0004919399046201656,
"loss": 2.6195,
"step": 1010
},
{
"epoch": 0.09003839872886966,
"grad_norm": 0.3203125,
"learning_rate": 0.0004917627709133064,
"loss": 2.6149,
"step": 1020
},
{
"epoch": 0.09092112812817231,
"grad_norm": 0.2451171875,
"learning_rate": 0.0004915837444964383,
"loss": 2.6333,
"step": 1030
},
{
"epoch": 0.09180385752747496,
"grad_norm": 0.2197265625,
"learning_rate": 0.0004914028267711217,
"loss": 2.617,
"step": 1040
},
{
"epoch": 0.0926865869267776,
"grad_norm": 0.244140625,
"learning_rate": 0.0004912200191537233,
"loss": 2.6324,
"step": 1050
},
{
"epoch": 0.09356931632608025,
"grad_norm": 0.4453125,
"learning_rate": 0.0004910353230754057,
"loss": 2.619,
"step": 1060
},
{
"epoch": 0.09445204572538288,
"grad_norm": 0.28515625,
"learning_rate": 0.0004908487399821158,
"loss": 2.6247,
"step": 1070
},
{
"epoch": 0.09533477512468552,
"grad_norm": 0.30078125,
"learning_rate": 0.0004906602713345735,
"loss": 2.6194,
"step": 1080
},
{
"epoch": 0.09621750452398817,
"grad_norm": 0.2431640625,
"learning_rate": 0.0004904699186082602,
"loss": 2.6127,
"step": 1090
},
{
"epoch": 0.09710023392329081,
"grad_norm": 0.2451171875,
"learning_rate": 0.0004902776832934074,
"loss": 2.6178,
"step": 1100
},
{
"epoch": 0.09798296332259346,
"grad_norm": 0.408203125,
"learning_rate": 0.0004900835668949852,
"loss": 2.6088,
"step": 1110
},
{
"epoch": 0.0988656927218961,
"grad_norm": 0.298828125,
"learning_rate": 0.00048988757093269,
"loss": 2.612,
"step": 1120
},
{
"epoch": 0.09974842212119875,
"grad_norm": 0.2373046875,
"learning_rate": 0.0004896896969409332,
"loss": 2.6148,
"step": 1130
},
{
"epoch": 0.1006311515205014,
"grad_norm": 0.255859375,
"learning_rate": 0.0004894899464688287,
"loss": 2.6227,
"step": 1140
},
{
"epoch": 0.10151388091980404,
"grad_norm": 0.3046875,
"learning_rate": 0.000489288321080181,
"loss": 2.6195,
"step": 1150
},
{
"epoch": 0.10239661031910668,
"grad_norm": 0.23046875,
"learning_rate": 0.0004890848223534732,
"loss": 2.6363,
"step": 1160
},
{
"epoch": 0.10327933971840932,
"grad_norm": 0.2470703125,
"learning_rate": 0.0004888794518818538,
"loss": 2.6029,
"step": 1170
},
{
"epoch": 0.10416206911771196,
"grad_norm": 0.275390625,
"learning_rate": 0.0004886722112731253,
"loss": 2.6123,
"step": 1180
},
{
"epoch": 0.1050447985170146,
"grad_norm": 0.306640625,
"learning_rate": 0.000488463102149731,
"loss": 2.6176,
"step": 1190
},
{
"epoch": 0.10592752791631725,
"grad_norm": 0.291015625,
"learning_rate": 0.0004882521261487422,
"loss": 2.6269,
"step": 1200
},
{
"epoch": 0.1068102573156199,
"grad_norm": 0.25,
"learning_rate": 0.0004880392849218459,
"loss": 2.6292,
"step": 1210
},
{
"epoch": 0.10769298671492254,
"grad_norm": 0.33203125,
"learning_rate": 0.00048782458013533125,
"loss": 2.6148,
"step": 1220
},
{
"epoch": 0.10857571611422519,
"grad_norm": 0.255859375,
"learning_rate": 0.00048760801347007716,
"loss": 2.6057,
"step": 1230
},
{
"epoch": 0.10945844551352783,
"grad_norm": 0.2353515625,
"learning_rate": 0.0004873895866215385,
"loss": 2.6181,
"step": 1240
},
{
"epoch": 0.11034117491283048,
"grad_norm": 0.306640625,
"learning_rate": 0.00048716930129973323,
"loss": 2.6098,
"step": 1250
},
{
"epoch": 0.11122390431213311,
"grad_norm": 0.28125,
"learning_rate": 0.0004869471592292289,
"loss": 2.6201,
"step": 1260
},
{
"epoch": 0.11210663371143575,
"grad_norm": 0.27734375,
"learning_rate": 0.0004867231621491293,
"loss": 2.6141,
"step": 1270
},
{
"epoch": 0.1129893631107384,
"grad_norm": 0.2314453125,
"learning_rate": 0.00048649731181306047,
"loss": 2.6008,
"step": 1280
},
{
"epoch": 0.11387209251004105,
"grad_norm": 0.265625,
"learning_rate": 0.00048626960998915733,
"loss": 2.6134,
"step": 1290
},
{
"epoch": 0.11475482190934369,
"grad_norm": 0.248046875,
"learning_rate": 0.0004860400584600496,
"loss": 2.6197,
"step": 1300
},
{
"epoch": 0.11563755130864634,
"grad_norm": 0.283203125,
"learning_rate": 0.0004858086590228482,
"loss": 2.6045,
"step": 1310
},
{
"epoch": 0.11652028070794898,
"grad_norm": 0.328125,
"learning_rate": 0.0004855754134891307,
"loss": 2.6152,
"step": 1320
},
{
"epoch": 0.11740301010725163,
"grad_norm": 0.251953125,
"learning_rate": 0.0004853403236849274,
"loss": 2.6074,
"step": 1330
},
{
"epoch": 0.11828573950655427,
"grad_norm": 0.25390625,
"learning_rate": 0.0004851033914507071,
"loss": 2.6143,
"step": 1340
},
{
"epoch": 0.1191684689058569,
"grad_norm": 0.255859375,
"learning_rate": 0.00048486461864136253,
"loss": 2.6143,
"step": 1350
},
{
"epoch": 0.12005119830515955,
"grad_norm": 0.224609375,
"learning_rate": 0.0004846240071261959,
"loss": 2.5931,
"step": 1360
},
{
"epoch": 0.1209339277044622,
"grad_norm": 0.25,
"learning_rate": 0.00048438155878890434,
"loss": 2.594,
"step": 1370
},
{
"epoch": 0.12181665710376484,
"grad_norm": 0.2734375,
"learning_rate": 0.00048413727552756505,
"loss": 2.6069,
"step": 1380
},
{
"epoch": 0.12269938650306748,
"grad_norm": 0.251953125,
"learning_rate": 0.00048389115925462025,
"loss": 2.5968,
"step": 1390
},
{
"epoch": 0.12358211590237013,
"grad_norm": 0.310546875,
"learning_rate": 0.00048364321189686276,
"loss": 2.606,
"step": 1400
},
{
"epoch": 0.12446484530167277,
"grad_norm": 0.265625,
"learning_rate": 0.00048339343539542033,
"loss": 2.5955,
"step": 1410
},
{
"epoch": 0.1253475747009754,
"grad_norm": 0.2412109375,
"learning_rate": 0.0004831418317057409,
"loss": 2.5942,
"step": 1420
},
{
"epoch": 0.12623030410027805,
"grad_norm": 0.333984375,
"learning_rate": 0.0004828884027975768,
"loss": 2.587,
"step": 1430
},
{
"epoch": 0.1271130334995807,
"grad_norm": 0.23828125,
"learning_rate": 0.00048263315065497,
"loss": 2.6048,
"step": 1440
},
{
"epoch": 0.12799576289888334,
"grad_norm": 0.455078125,
"learning_rate": 0.0004823760772762358,
"loss": 2.5977,
"step": 1450
},
{
"epoch": 0.128878492298186,
"grad_norm": 0.32421875,
"learning_rate": 0.00048211718467394774,
"loss": 2.6055,
"step": 1460
},
{
"epoch": 0.12976122169748863,
"grad_norm": 0.2578125,
"learning_rate": 0.0004818564748749218,
"loss": 2.5919,
"step": 1470
},
{
"epoch": 0.13064395109679128,
"grad_norm": 0.255859375,
"learning_rate": 0.0004815939499202001,
"loss": 2.6066,
"step": 1480
},
{
"epoch": 0.13152668049609392,
"grad_norm": 0.28125,
"learning_rate": 0.0004813296118650357,
"loss": 2.6125,
"step": 1490
},
{
"epoch": 0.13240940989539657,
"grad_norm": 0.287109375,
"learning_rate": 0.0004810634627788756,
"loss": 2.5976,
"step": 1500
},
{
"epoch": 0.1332921392946992,
"grad_norm": 0.271484375,
"learning_rate": 0.0004807955047453452,
"loss": 2.6044,
"step": 1510
},
{
"epoch": 0.13417486869400186,
"grad_norm": 0.251953125,
"learning_rate": 0.0004805257398622317,
"loss": 2.6011,
"step": 1520
},
{
"epoch": 0.1350575980933045,
"grad_norm": 0.33984375,
"learning_rate": 0.0004802541702414678,
"loss": 2.6004,
"step": 1530
},
{
"epoch": 0.13594032749260715,
"grad_norm": 0.283203125,
"learning_rate": 0.000479980798009115,
"loss": 2.5994,
"step": 1540
},
{
"epoch": 0.1368230568919098,
"grad_norm": 0.26171875,
"learning_rate": 0.00047970562530534724,
"loss": 2.6054,
"step": 1550
},
{
"epoch": 0.13770578629121244,
"grad_norm": 0.2265625,
"learning_rate": 0.0004794286542844338,
"loss": 2.5978,
"step": 1560
},
{
"epoch": 0.13858851569051509,
"grad_norm": 0.251953125,
"learning_rate": 0.00047914988711472283,
"loss": 2.6025,
"step": 1570
},
{
"epoch": 0.1394712450898177,
"grad_norm": 0.22265625,
"learning_rate": 0.00047886932597862396,
"loss": 2.59,
"step": 1580
},
{
"epoch": 0.14035397448912035,
"grad_norm": 0.259765625,
"learning_rate": 0.0004785869730725914,
"loss": 2.6018,
"step": 1590
},
{
"epoch": 0.141236703888423,
"grad_norm": 0.259765625,
"learning_rate": 0.0004783028306071069,
"loss": 2.5972,
"step": 1600
},
{
"epoch": 0.14211943328772564,
"grad_norm": 0.2490234375,
"learning_rate": 0.00047801690080666206,
"loss": 2.5886,
"step": 1610
},
{
"epoch": 0.14300216268702828,
"grad_norm": 0.25390625,
"learning_rate": 0.00047772918590974136,
"loss": 2.5954,
"step": 1620
},
{
"epoch": 0.14388489208633093,
"grad_norm": 0.3515625,
"learning_rate": 0.00047743968816880446,
"loss": 2.6028,
"step": 1630
},
{
"epoch": 0.14476762148563357,
"grad_norm": 0.37890625,
"learning_rate": 0.0004771484098502683,
"loss": 2.5978,
"step": 1640
},
{
"epoch": 0.14565035088493622,
"grad_norm": 0.267578125,
"learning_rate": 0.0004768553532344899,
"loss": 2.5883,
"step": 1650
},
{
"epoch": 0.14653308028423886,
"grad_norm": 0.228515625,
"learning_rate": 0.0004765605206157478,
"loss": 2.5949,
"step": 1660
},
{
"epoch": 0.1474158096835415,
"grad_norm": 0.271484375,
"learning_rate": 0.0004762639143022248,
"loss": 2.6048,
"step": 1670
},
{
"epoch": 0.14829853908284416,
"grad_norm": 0.296875,
"learning_rate": 0.00047596553661598956,
"loss": 2.5817,
"step": 1680
},
{
"epoch": 0.1491812684821468,
"grad_norm": 0.306640625,
"learning_rate": 0.00047566538989297837,
"loss": 2.5987,
"step": 1690
},
{
"epoch": 0.15006399788144945,
"grad_norm": 0.2421875,
"learning_rate": 0.00047536347648297685,
"loss": 2.5991,
"step": 1700
},
{
"epoch": 0.1509467272807521,
"grad_norm": 0.330078125,
"learning_rate": 0.0004750597987496018,
"loss": 2.6001,
"step": 1710
},
{
"epoch": 0.15182945668005474,
"grad_norm": 0.259765625,
"learning_rate": 0.00047475435907028254,
"loss": 2.5968,
"step": 1720
},
{
"epoch": 0.15271218607935738,
"grad_norm": 0.302734375,
"learning_rate": 0.0004744471598362421,
"loss": 2.5941,
"step": 1730
},
{
"epoch": 0.15359491547866003,
"grad_norm": 0.2333984375,
"learning_rate": 0.0004741382034524789,
"loss": 2.5971,
"step": 1740
},
{
"epoch": 0.15447764487796267,
"grad_norm": 0.216796875,
"learning_rate": 0.0004738274923377478,
"loss": 2.5867,
"step": 1750
},
{
"epoch": 0.15536037427726532,
"grad_norm": 0.2451171875,
"learning_rate": 0.0004735150289245407,
"loss": 2.5883,
"step": 1760
},
{
"epoch": 0.15624310367656793,
"grad_norm": 0.236328125,
"learning_rate": 0.00047320081565906813,
"loss": 2.6041,
"step": 1770
},
{
"epoch": 0.15712583307587058,
"grad_norm": 0.29296875,
"learning_rate": 0.0004728848550012399,
"loss": 2.6029,
"step": 1780
},
{
"epoch": 0.15800856247517323,
"grad_norm": 0.2578125,
"learning_rate": 0.00047256714942464574,
"loss": 2.5912,
"step": 1790
},
{
"epoch": 0.15889129187447587,
"grad_norm": 0.263671875,
"learning_rate": 0.0004722477014165358,
"loss": 2.586,
"step": 1800
},
{
"epoch": 0.15977402127377852,
"grad_norm": 0.251953125,
"learning_rate": 0.0004719265134778017,
"loss": 2.5931,
"step": 1810
},
{
"epoch": 0.16065675067308116,
"grad_norm": 0.25,
"learning_rate": 0.00047160358812295633,
"loss": 2.5792,
"step": 1820
},
{
"epoch": 0.1615394800723838,
"grad_norm": 0.2451171875,
"learning_rate": 0.0004712789278801145,
"loss": 2.6021,
"step": 1830
},
{
"epoch": 0.16242220947168645,
"grad_norm": 0.25390625,
"learning_rate": 0.00047095253529097313,
"loss": 2.594,
"step": 1840
},
{
"epoch": 0.1633049388709891,
"grad_norm": 0.232421875,
"learning_rate": 0.0004706244129107914,
"loss": 2.588,
"step": 1850
},
{
"epoch": 0.16418766827029174,
"grad_norm": 0.234375,
"learning_rate": 0.00047029456330837055,
"loss": 2.5905,
"step": 1860
},
{
"epoch": 0.1650703976695944,
"grad_norm": 0.2431640625,
"learning_rate": 0.0004699629890660339,
"loss": 2.592,
"step": 1870
},
{
"epoch": 0.16595312706889703,
"grad_norm": 0.2373046875,
"learning_rate": 0.00046962969277960663,
"loss": 2.6002,
"step": 1880
},
{
"epoch": 0.16683585646819968,
"grad_norm": 0.32421875,
"learning_rate": 0.00046929467705839544,
"loss": 2.5983,
"step": 1890
},
{
"epoch": 0.16771858586750232,
"grad_norm": 0.259765625,
"learning_rate": 0.0004689579445251681,
"loss": 2.5974,
"step": 1900
},
{
"epoch": 0.16860131526680497,
"grad_norm": 0.248046875,
"learning_rate": 0.000468619497816133,
"loss": 2.6035,
"step": 1910
},
{
"epoch": 0.1694840446661076,
"grad_norm": 0.2470703125,
"learning_rate": 0.0004682793395809184,
"loss": 2.5968,
"step": 1920
},
{
"epoch": 0.17036677406541026,
"grad_norm": 0.2412109375,
"learning_rate": 0.0004679374724825517,
"loss": 2.5902,
"step": 1930
},
{
"epoch": 0.1712495034647129,
"grad_norm": 0.251953125,
"learning_rate": 0.00046759389919743876,
"loss": 2.5931,
"step": 1940
},
{
"epoch": 0.17213223286401555,
"grad_norm": 0.265625,
"learning_rate": 0.0004672486224153427,
"loss": 2.5937,
"step": 1950
},
{
"epoch": 0.17301496226331817,
"grad_norm": 0.224609375,
"learning_rate": 0.0004669016448393631,
"loss": 2.5863,
"step": 1960
},
{
"epoch": 0.1738976916626208,
"grad_norm": 0.5625,
"learning_rate": 0.0004665529691859144,
"loss": 2.5893,
"step": 1970
},
{
"epoch": 0.17478042106192346,
"grad_norm": 0.2216796875,
"learning_rate": 0.00046620259818470536,
"loss": 2.59,
"step": 1980
},
{
"epoch": 0.1756631504612261,
"grad_norm": 0.25390625,
"learning_rate": 0.0004658505345787169,
"loss": 2.5924,
"step": 1990
},
{
"epoch": 0.17654587986052875,
"grad_norm": 0.2275390625,
"learning_rate": 0.00046549678112418116,
"loss": 2.6109,
"step": 2000
},
{
"epoch": 0.17654587986052875,
"eval_accuracy": 0.4971187442885556,
"eval_loss": 2.480692148208618,
"eval_runtime": 7.0837,
"eval_samples_per_second": 44.892,
"eval_steps_per_second": 0.424,
"step": 2000
},
{
"epoch": 0.1774286092598314,
"grad_norm": 0.279296875,
"learning_rate": 0.0004651413405905597,
"loss": 2.5819,
"step": 2010
},
{
"epoch": 0.17831133865913404,
"grad_norm": 0.283203125,
"learning_rate": 0.00046478421576052196,
"loss": 2.5949,
"step": 2020
},
{
"epoch": 0.17919406805843668,
"grad_norm": 0.302734375,
"learning_rate": 0.00046442540942992315,
"loss": 2.588,
"step": 2030
},
{
"epoch": 0.18007679745773933,
"grad_norm": 0.2373046875,
"learning_rate": 0.00046406492440778294,
"loss": 2.577,
"step": 2040
},
{
"epoch": 0.18095952685704197,
"grad_norm": 0.2353515625,
"learning_rate": 0.0004637027635162627,
"loss": 2.5906,
"step": 2050
},
{
"epoch": 0.18184225625634462,
"grad_norm": 0.263671875,
"learning_rate": 0.00046333892959064425,
"loss": 2.5913,
"step": 2060
},
{
"epoch": 0.18272498565564727,
"grad_norm": 0.3671875,
"learning_rate": 0.0004629734254793071,
"loss": 2.5859,
"step": 2070
},
{
"epoch": 0.1836077150549499,
"grad_norm": 0.263671875,
"learning_rate": 0.00046260625404370606,
"loss": 2.6003,
"step": 2080
},
{
"epoch": 0.18449044445425256,
"grad_norm": 0.234375,
"learning_rate": 0.0004622374181583494,
"loss": 2.5759,
"step": 2090
},
{
"epoch": 0.1853731738535552,
"grad_norm": 0.26171875,
"learning_rate": 0.00046186692071077586,
"loss": 2.5745,
"step": 2100
},
{
"epoch": 0.18625590325285785,
"grad_norm": 0.2353515625,
"learning_rate": 0.00046149476460153216,
"loss": 2.586,
"step": 2110
},
{
"epoch": 0.1871386326521605,
"grad_norm": 0.328125,
"learning_rate": 0.0004611209527441504,
"loss": 2.5893,
"step": 2120
},
{
"epoch": 0.18802136205146314,
"grad_norm": 0.259765625,
"learning_rate": 0.0004607454880651253,
"loss": 2.5885,
"step": 2130
},
{
"epoch": 0.18890409145076575,
"grad_norm": 0.2333984375,
"learning_rate": 0.0004603683735038909,
"loss": 2.5912,
"step": 2140
},
{
"epoch": 0.1897868208500684,
"grad_norm": 0.267578125,
"learning_rate": 0.00045998961201279814,
"loss": 2.5746,
"step": 2150
},
{
"epoch": 0.19066955024937104,
"grad_norm": 0.240234375,
"learning_rate": 0.00045960920655709113,
"loss": 2.5771,
"step": 2160
},
{
"epoch": 0.1915522796486737,
"grad_norm": 0.6796875,
"learning_rate": 0.0004592271601148844,
"loss": 2.5671,
"step": 2170
},
{
"epoch": 0.19243500904797634,
"grad_norm": 0.3984375,
"learning_rate": 0.00045884347567713945,
"loss": 2.5778,
"step": 2180
},
{
"epoch": 0.19331773844727898,
"grad_norm": 0.263671875,
"learning_rate": 0.0004584581562476412,
"loss": 2.6024,
"step": 2190
},
{
"epoch": 0.19420046784658163,
"grad_norm": 0.25,
"learning_rate": 0.0004580712048429746,
"loss": 2.5891,
"step": 2200
},
{
"epoch": 0.19508319724588427,
"grad_norm": 0.40234375,
"learning_rate": 0.000457682624492501,
"loss": 2.573,
"step": 2210
},
{
"epoch": 0.19596592664518692,
"grad_norm": 0.2431640625,
"learning_rate": 0.0004572924182383346,
"loss": 2.5845,
"step": 2220
},
{
"epoch": 0.19684865604448956,
"grad_norm": 0.2275390625,
"learning_rate": 0.00045690058913531794,
"loss": 2.5873,
"step": 2230
},
{
"epoch": 0.1977313854437922,
"grad_norm": 0.28515625,
"learning_rate": 0.0004565071402509992,
"loss": 2.5757,
"step": 2240
},
{
"epoch": 0.19861411484309485,
"grad_norm": 0.21484375,
"learning_rate": 0.000456112074665607,
"loss": 2.5904,
"step": 2250
},
{
"epoch": 0.1994968442423975,
"grad_norm": 0.2275390625,
"learning_rate": 0.0004557153954720269,
"loss": 2.5777,
"step": 2260
},
{
"epoch": 0.20037957364170014,
"grad_norm": 0.201171875,
"learning_rate": 0.0004553171057757772,
"loss": 2.59,
"step": 2270
},
{
"epoch": 0.2012623030410028,
"grad_norm": 0.248046875,
"learning_rate": 0.0004549172086949842,
"loss": 2.5746,
"step": 2280
},
{
"epoch": 0.20214503244030543,
"grad_norm": 0.265625,
"learning_rate": 0.0004545157073603584,
"loss": 2.5907,
"step": 2290
},
{
"epoch": 0.20302776183960808,
"grad_norm": 0.25,
"learning_rate": 0.0004541126049151694,
"loss": 2.6017,
"step": 2300
},
{
"epoch": 0.20391049123891072,
"grad_norm": 0.267578125,
"learning_rate": 0.00045370790451522165,
"loss": 2.5727,
"step": 2310
},
{
"epoch": 0.20479322063821337,
"grad_norm": 0.259765625,
"learning_rate": 0.0004533016093288298,
"loss": 2.5668,
"step": 2320
},
{
"epoch": 0.205675950037516,
"grad_norm": 0.29296875,
"learning_rate": 0.0004528937225367935,
"loss": 2.5869,
"step": 2330
},
{
"epoch": 0.20655867943681863,
"grad_norm": 0.2294921875,
"learning_rate": 0.0004524842473323729,
"loss": 2.59,
"step": 2340
},
{
"epoch": 0.20744140883612128,
"grad_norm": 0.21875,
"learning_rate": 0.0004520731869212634,
"loss": 2.5767,
"step": 2350
},
{
"epoch": 0.20832413823542392,
"grad_norm": 0.31640625,
"learning_rate": 0.0004516605445215709,
"loss": 2.5774,
"step": 2360
},
{
"epoch": 0.20920686763472657,
"grad_norm": 0.283203125,
"learning_rate": 0.00045124632336378603,
"loss": 2.5753,
"step": 2370
},
{
"epoch": 0.2100895970340292,
"grad_norm": 0.2578125,
"learning_rate": 0.00045083052669075936,
"loss": 2.5835,
"step": 2380
},
{
"epoch": 0.21097232643333186,
"grad_norm": 0.31640625,
"learning_rate": 0.0004504131577576758,
"loss": 2.5853,
"step": 2390
},
{
"epoch": 0.2118550558326345,
"grad_norm": 0.29296875,
"learning_rate": 0.00044999421983202905,
"loss": 2.5831,
"step": 2400
},
{
"epoch": 0.21273778523193715,
"grad_norm": 0.2578125,
"learning_rate": 0.00044957371619359644,
"loss": 2.5935,
"step": 2410
},
{
"epoch": 0.2136205146312398,
"grad_norm": 0.279296875,
"learning_rate": 0.00044915165013441257,
"loss": 2.5853,
"step": 2420
},
{
"epoch": 0.21450324403054244,
"grad_norm": 0.251953125,
"learning_rate": 0.0004487280249587441,
"loss": 2.5908,
"step": 2430
},
{
"epoch": 0.21538597342984508,
"grad_norm": 0.25,
"learning_rate": 0.00044830284398306375,
"loss": 2.5873,
"step": 2440
},
{
"epoch": 0.21626870282914773,
"grad_norm": 0.2333984375,
"learning_rate": 0.000447876110536024,
"loss": 2.5863,
"step": 2450
},
{
"epoch": 0.21715143222845038,
"grad_norm": 0.3046875,
"learning_rate": 0.0004474478279584316,
"loss": 2.5858,
"step": 2460
},
{
"epoch": 0.21803416162775302,
"grad_norm": 0.24609375,
"learning_rate": 0.00044701799960322085,
"loss": 2.5832,
"step": 2470
},
{
"epoch": 0.21891689102705567,
"grad_norm": 0.287109375,
"learning_rate": 0.000446586628835428,
"loss": 2.5848,
"step": 2480
},
{
"epoch": 0.2197996204263583,
"grad_norm": 0.216796875,
"learning_rate": 0.00044615371903216407,
"loss": 2.5662,
"step": 2490
},
{
"epoch": 0.22068234982566096,
"grad_norm": 0.232421875,
"learning_rate": 0.00044571927358258917,
"loss": 2.5855,
"step": 2500
},
{
"epoch": 0.22156507922496357,
"grad_norm": 0.279296875,
"learning_rate": 0.0004452832958878856,
"loss": 2.5872,
"step": 2510
},
{
"epoch": 0.22244780862426622,
"grad_norm": 0.22265625,
"learning_rate": 0.0004448457893612311,
"loss": 2.584,
"step": 2520
},
{
"epoch": 0.22333053802356886,
"grad_norm": 0.2421875,
"learning_rate": 0.0004444067574277727,
"loss": 2.579,
"step": 2530
},
{
"epoch": 0.2242132674228715,
"grad_norm": 0.248046875,
"learning_rate": 0.00044396620352459915,
"loss": 2.5757,
"step": 2540
},
{
"epoch": 0.22509599682217415,
"grad_norm": 0.271484375,
"learning_rate": 0.00044352413110071453,
"loss": 2.5684,
"step": 2550
},
{
"epoch": 0.2259787262214768,
"grad_norm": 0.251953125,
"learning_rate": 0.0004430805436170111,
"loss": 2.5839,
"step": 2560
},
{
"epoch": 0.22686145562077945,
"grad_norm": 0.22265625,
"learning_rate": 0.00044263544454624224,
"loss": 2.5779,
"step": 2570
},
{
"epoch": 0.2277441850200821,
"grad_norm": 0.2373046875,
"learning_rate": 0.00044218883737299526,
"loss": 2.573,
"step": 2580
},
{
"epoch": 0.22862691441938474,
"grad_norm": 0.28125,
"learning_rate": 0.00044174072559366386,
"loss": 2.5703,
"step": 2590
},
{
"epoch": 0.22950964381868738,
"grad_norm": 0.236328125,
"learning_rate": 0.00044129111271642117,
"loss": 2.5853,
"step": 2600
},
{
"epoch": 0.23039237321799003,
"grad_norm": 0.232421875,
"learning_rate": 0.0004408400022611921,
"loss": 2.5679,
"step": 2610
},
{
"epoch": 0.23127510261729267,
"grad_norm": 0.20703125,
"learning_rate": 0.00044038739775962584,
"loss": 2.5662,
"step": 2620
},
{
"epoch": 0.23215783201659532,
"grad_norm": 0.2197265625,
"learning_rate": 0.0004399333027550679,
"loss": 2.5646,
"step": 2630
},
{
"epoch": 0.23304056141589796,
"grad_norm": 0.240234375,
"learning_rate": 0.000439477720802533,
"loss": 2.5806,
"step": 2640
},
{
"epoch": 0.2339232908152006,
"grad_norm": 0.2412109375,
"learning_rate": 0.00043902065546867655,
"loss": 2.5744,
"step": 2650
},
{
"epoch": 0.23480602021450325,
"grad_norm": 0.2275390625,
"learning_rate": 0.0004385621103317671,
"loss": 2.5689,
"step": 2660
},
{
"epoch": 0.2356887496138059,
"grad_norm": 0.2138671875,
"learning_rate": 0.00043810208898165836,
"loss": 2.5626,
"step": 2670
},
{
"epoch": 0.23657147901310854,
"grad_norm": 0.251953125,
"learning_rate": 0.000437640595019761,
"loss": 2.5837,
"step": 2680
},
{
"epoch": 0.2374542084124112,
"grad_norm": 0.33203125,
"learning_rate": 0.00043717763205901436,
"loss": 2.5777,
"step": 2690
},
{
"epoch": 0.2383369378117138,
"grad_norm": 0.251953125,
"learning_rate": 0.00043671320372385834,
"loss": 2.571,
"step": 2700
},
{
"epoch": 0.23921966721101645,
"grad_norm": 0.263671875,
"learning_rate": 0.00043624731365020505,
"loss": 2.5759,
"step": 2710
},
{
"epoch": 0.2401023966103191,
"grad_norm": 0.263671875,
"learning_rate": 0.00043577996548541,
"loss": 2.5723,
"step": 2720
},
{
"epoch": 0.24098512600962174,
"grad_norm": 0.26953125,
"learning_rate": 0.00043531116288824393,
"loss": 2.5803,
"step": 2730
},
{
"epoch": 0.2418678554089244,
"grad_norm": 0.21875,
"learning_rate": 0.00043484090952886404,
"loss": 2.5819,
"step": 2740
},
{
"epoch": 0.24275058480822703,
"grad_norm": 0.25,
"learning_rate": 0.0004343692090887852,
"loss": 2.5608,
"step": 2750
},
{
"epoch": 0.24363331420752968,
"grad_norm": 0.251953125,
"learning_rate": 0.0004338960652608511,
"loss": 2.5712,
"step": 2760
},
{
"epoch": 0.24451604360683232,
"grad_norm": 0.23828125,
"learning_rate": 0.0004334214817492057,
"loss": 2.5537,
"step": 2770
},
{
"epoch": 0.24539877300613497,
"grad_norm": 0.2490234375,
"learning_rate": 0.0004329454622692636,
"loss": 2.566,
"step": 2780
},
{
"epoch": 0.2462815024054376,
"grad_norm": 0.23828125,
"learning_rate": 0.00043246801054768147,
"loss": 2.5767,
"step": 2790
},
{
"epoch": 0.24716423180474026,
"grad_norm": 0.251953125,
"learning_rate": 0.0004319891303223287,
"loss": 2.5636,
"step": 2800
},
{
"epoch": 0.2480469612040429,
"grad_norm": 0.2890625,
"learning_rate": 0.000431508825342258,
"loss": 2.5796,
"step": 2810
},
{
"epoch": 0.24892969060334555,
"grad_norm": 0.251953125,
"learning_rate": 0.0004310270993676764,
"loss": 2.5804,
"step": 2820
},
{
"epoch": 0.2498124200026482,
"grad_norm": 0.2451171875,
"learning_rate": 0.00043054395616991535,
"loss": 2.5703,
"step": 2830
},
{
"epoch": 0.2506951494019508,
"grad_norm": 0.2314453125,
"learning_rate": 0.0004300593995314017,
"loss": 2.5692,
"step": 2840
},
{
"epoch": 0.2515778788012535,
"grad_norm": 0.248046875,
"learning_rate": 0.0004295734332456277,
"loss": 2.5508,
"step": 2850
},
{
"epoch": 0.2524606082005561,
"grad_norm": 0.28515625,
"learning_rate": 0.00042908606111712136,
"loss": 2.5691,
"step": 2860
},
{
"epoch": 0.2533433375998588,
"grad_norm": 0.248046875,
"learning_rate": 0.0004285972869614169,
"loss": 2.5741,
"step": 2870
},
{
"epoch": 0.2542260669991614,
"grad_norm": 0.228515625,
"learning_rate": 0.00042810711460502447,
"loss": 2.5651,
"step": 2880
},
{
"epoch": 0.25510879639846407,
"grad_norm": 0.25390625,
"learning_rate": 0.00042761554788540084,
"loss": 2.5944,
"step": 2890
},
{
"epoch": 0.2559915257977667,
"grad_norm": 0.22265625,
"learning_rate": 0.0004271225906509186,
"loss": 2.5719,
"step": 2900
},
{
"epoch": 0.25687425519706936,
"grad_norm": 0.265625,
"learning_rate": 0.0004266282467608365,
"loss": 2.5665,
"step": 2910
},
{
"epoch": 0.257756984596372,
"grad_norm": 0.2578125,
"learning_rate": 0.00042613252008526914,
"loss": 2.5864,
"step": 2920
},
{
"epoch": 0.25863971399567465,
"grad_norm": 0.244140625,
"learning_rate": 0.0004256354145051567,
"loss": 2.5584,
"step": 2930
},
{
"epoch": 0.25952244339497726,
"grad_norm": 0.2275390625,
"learning_rate": 0.0004251369339122344,
"loss": 2.5835,
"step": 2940
},
{
"epoch": 0.26040517279427994,
"grad_norm": 0.37109375,
"learning_rate": 0.00042463708220900225,
"loss": 2.5874,
"step": 2950
},
{
"epoch": 0.26128790219358256,
"grad_norm": 0.2314453125,
"learning_rate": 0.00042413586330869446,
"loss": 2.5944,
"step": 2960
},
{
"epoch": 0.26217063159288523,
"grad_norm": 0.220703125,
"learning_rate": 0.00042363328113524846,
"loss": 2.579,
"step": 2970
},
{
"epoch": 0.26305336099218785,
"grad_norm": 0.2265625,
"learning_rate": 0.0004231293396232747,
"loss": 2.5835,
"step": 2980
},
{
"epoch": 0.26393609039149046,
"grad_norm": 0.2294921875,
"learning_rate": 0.00042262404271802565,
"loss": 2.5732,
"step": 2990
},
{
"epoch": 0.26481881979079314,
"grad_norm": 0.236328125,
"learning_rate": 0.00042211739437536457,
"loss": 2.58,
"step": 3000
},
{
"epoch": 0.26570154919009575,
"grad_norm": 0.2109375,
"learning_rate": 0.0004216093985617352,
"loss": 2.5709,
"step": 3010
},
{
"epoch": 0.2665842785893984,
"grad_norm": 0.228515625,
"learning_rate": 0.0004211000592541301,
"loss": 2.5737,
"step": 3020
},
{
"epoch": 0.26746700798870104,
"grad_norm": 0.259765625,
"learning_rate": 0.0004205893804400599,
"loss": 2.57,
"step": 3030
},
{
"epoch": 0.2683497373880037,
"grad_norm": 0.2412109375,
"learning_rate": 0.0004200773661175219,
"loss": 2.5627,
"step": 3040
},
{
"epoch": 0.26923246678730633,
"grad_norm": 0.29296875,
"learning_rate": 0.0004195640202949687,
"loss": 2.559,
"step": 3050
},
{
"epoch": 0.270115196186609,
"grad_norm": 0.318359375,
"learning_rate": 0.00041904934699127713,
"loss": 2.5736,
"step": 3060
},
{
"epoch": 0.2709979255859116,
"grad_norm": 0.326171875,
"learning_rate": 0.0004185333502357164,
"loss": 2.5594,
"step": 3070
},
{
"epoch": 0.2718806549852143,
"grad_norm": 0.2119140625,
"learning_rate": 0.000418016034067917,
"loss": 2.5649,
"step": 3080
},
{
"epoch": 0.2727633843845169,
"grad_norm": 0.2236328125,
"learning_rate": 0.00041749740253783853,
"loss": 2.5689,
"step": 3090
},
{
"epoch": 0.2736461137838196,
"grad_norm": 0.2275390625,
"learning_rate": 0.00041697745970573855,
"loss": 2.5798,
"step": 3100
},
{
"epoch": 0.2745288431831222,
"grad_norm": 0.25,
"learning_rate": 0.00041645620964214023,
"loss": 2.572,
"step": 3110
},
{
"epoch": 0.2754115725824249,
"grad_norm": 0.228515625,
"learning_rate": 0.0004159336564278012,
"loss": 2.5933,
"step": 3120
},
{
"epoch": 0.2762943019817275,
"grad_norm": 0.265625,
"learning_rate": 0.0004154098041536807,
"loss": 2.5831,
"step": 3130
},
{
"epoch": 0.27717703138103017,
"grad_norm": 0.291015625,
"learning_rate": 0.00041488465692090837,
"loss": 2.5858,
"step": 3140
},
{
"epoch": 0.2780597607803328,
"grad_norm": 0.287109375,
"learning_rate": 0.00041435821884075176,
"loss": 2.5733,
"step": 3150
},
{
"epoch": 0.2789424901796354,
"grad_norm": 0.29296875,
"learning_rate": 0.00041383049403458403,
"loss": 2.5785,
"step": 3160
},
{
"epoch": 0.2798252195789381,
"grad_norm": 0.2119140625,
"learning_rate": 0.0004133014866338521,
"loss": 2.5804,
"step": 3170
},
{
"epoch": 0.2807079489782407,
"grad_norm": 0.2275390625,
"learning_rate": 0.00041277120078004383,
"loss": 2.5579,
"step": 3180
},
{
"epoch": 0.28159067837754337,
"grad_norm": 0.30078125,
"learning_rate": 0.0004122396406246559,
"loss": 2.5792,
"step": 3190
},
{
"epoch": 0.282473407776846,
"grad_norm": 0.24609375,
"learning_rate": 0.0004117068103291614,
"loss": 2.5744,
"step": 3200
},
{
"epoch": 0.28335613717614866,
"grad_norm": 0.2294921875,
"learning_rate": 0.00041117271406497665,
"loss": 2.5614,
"step": 3210
},
{
"epoch": 0.2842388665754513,
"grad_norm": 0.2294921875,
"learning_rate": 0.00041063735601342934,
"loss": 2.5693,
"step": 3220
},
{
"epoch": 0.28512159597475395,
"grad_norm": 0.30859375,
"learning_rate": 0.0004101007403657255,
"loss": 2.5743,
"step": 3230
},
{
"epoch": 0.28600432537405657,
"grad_norm": 0.25390625,
"learning_rate": 0.00040956287132291625,
"loss": 2.5592,
"step": 3240
},
{
"epoch": 0.28688705477335924,
"grad_norm": 0.26953125,
"learning_rate": 0.00040902375309586557,
"loss": 2.5735,
"step": 3250
},
{
"epoch": 0.28776978417266186,
"grad_norm": 0.236328125,
"learning_rate": 0.00040848338990521696,
"loss": 2.5728,
"step": 3260
},
{
"epoch": 0.28865251357196453,
"grad_norm": 0.2353515625,
"learning_rate": 0.00040794178598136033,
"loss": 2.5648,
"step": 3270
},
{
"epoch": 0.28953524297126715,
"grad_norm": 0.20703125,
"learning_rate": 0.0004073989455643994,
"loss": 2.5843,
"step": 3280
},
{
"epoch": 0.2904179723705698,
"grad_norm": 0.234375,
"learning_rate": 0.00040685487290411765,
"loss": 2.5756,
"step": 3290
},
{
"epoch": 0.29130070176987244,
"grad_norm": 0.25390625,
"learning_rate": 0.0004063095722599459,
"loss": 2.5676,
"step": 3300
},
{
"epoch": 0.2921834311691751,
"grad_norm": 0.251953125,
"learning_rate": 0.00040576304790092857,
"loss": 2.5653,
"step": 3310
},
{
"epoch": 0.29306616056847773,
"grad_norm": 0.267578125,
"learning_rate": 0.00040521530410569007,
"loss": 2.5877,
"step": 3320
},
{
"epoch": 0.2939488899677804,
"grad_norm": 0.265625,
"learning_rate": 0.0004046663451624016,
"loss": 2.5722,
"step": 3330
},
{
"epoch": 0.294831619367083,
"grad_norm": 0.2060546875,
"learning_rate": 0.0004041161753687478,
"loss": 2.5592,
"step": 3340
},
{
"epoch": 0.29571434876638564,
"grad_norm": 0.2265625,
"learning_rate": 0.00040356479903189233,
"loss": 2.5817,
"step": 3350
},
{
"epoch": 0.2965970781656883,
"grad_norm": 0.23046875,
"learning_rate": 0.0004030122204684449,
"loss": 2.5689,
"step": 3360
},
{
"epoch": 0.29747980756499093,
"grad_norm": 0.2333984375,
"learning_rate": 0.0004024584440044271,
"loss": 2.563,
"step": 3370
},
{
"epoch": 0.2983625369642936,
"grad_norm": 0.2216796875,
"learning_rate": 0.00040190347397523873,
"loss": 2.5695,
"step": 3380
},
{
"epoch": 0.2992452663635962,
"grad_norm": 0.21875,
"learning_rate": 0.0004013473147256238,
"loss": 2.5658,
"step": 3390
},
{
"epoch": 0.3001279957628989,
"grad_norm": 0.28515625,
"learning_rate": 0.0004007899706096363,
"loss": 2.5648,
"step": 3400
},
{
"epoch": 0.3010107251622015,
"grad_norm": 0.240234375,
"learning_rate": 0.00040023144599060623,
"loss": 2.5534,
"step": 3410
},
{
"epoch": 0.3018934545615042,
"grad_norm": 0.259765625,
"learning_rate": 0.00039967174524110596,
"loss": 2.585,
"step": 3420
},
{
"epoch": 0.3027761839608068,
"grad_norm": 0.205078125,
"learning_rate": 0.000399110872742915,
"loss": 2.5641,
"step": 3430
},
{
"epoch": 0.3036589133601095,
"grad_norm": 0.2080078125,
"learning_rate": 0.0003985488328869865,
"loss": 2.582,
"step": 3440
},
{
"epoch": 0.3045416427594121,
"grad_norm": 0.2265625,
"learning_rate": 0.0003979856300734126,
"loss": 2.5632,
"step": 3450
},
{
"epoch": 0.30542437215871476,
"grad_norm": 0.2294921875,
"learning_rate": 0.00039742126871138996,
"loss": 2.5696,
"step": 3460
},
{
"epoch": 0.3063071015580174,
"grad_norm": 0.2314453125,
"learning_rate": 0.0003968557532191852,
"loss": 2.5784,
"step": 3470
},
{
"epoch": 0.30718983095732005,
"grad_norm": 0.294921875,
"learning_rate": 0.00039628908802410057,
"loss": 2.5746,
"step": 3480
},
{
"epoch": 0.30807256035662267,
"grad_norm": 0.2314453125,
"learning_rate": 0.00039572127756243904,
"loss": 2.5684,
"step": 3490
},
{
"epoch": 0.30895528975592534,
"grad_norm": 0.236328125,
"learning_rate": 0.0003951523262794693,
"loss": 2.5684,
"step": 3500
},
{
"epoch": 0.30983801915522796,
"grad_norm": 0.2294921875,
"learning_rate": 0.00039458223862939184,
"loss": 2.5781,
"step": 3510
},
{
"epoch": 0.31072074855453063,
"grad_norm": 0.240234375,
"learning_rate": 0.00039401101907530323,
"loss": 2.571,
"step": 3520
},
{
"epoch": 0.31160347795383325,
"grad_norm": 0.2294921875,
"learning_rate": 0.0003934386720891614,
"loss": 2.569,
"step": 3530
},
{
"epoch": 0.31248620735313587,
"grad_norm": 0.2138671875,
"learning_rate": 0.00039286520215175085,
"loss": 2.5527,
"step": 3540
},
{
"epoch": 0.31336893675243854,
"grad_norm": 0.2080078125,
"learning_rate": 0.0003922906137526474,
"loss": 2.5774,
"step": 3550
},
{
"epoch": 0.31425166615174116,
"grad_norm": 0.2236328125,
"learning_rate": 0.00039171491139018325,
"loss": 2.572,
"step": 3560
},
{
"epoch": 0.31513439555104383,
"grad_norm": 0.2890625,
"learning_rate": 0.0003911380995714111,
"loss": 2.5883,
"step": 3570
},
{
"epoch": 0.31601712495034645,
"grad_norm": 0.30078125,
"learning_rate": 0.0003905601828120698,
"loss": 2.5614,
"step": 3580
},
{
"epoch": 0.3168998543496491,
"grad_norm": 0.2197265625,
"learning_rate": 0.0003899811656365485,
"loss": 2.574,
"step": 3590
},
{
"epoch": 0.31778258374895174,
"grad_norm": 0.2177734375,
"learning_rate": 0.0003894010525778511,
"loss": 2.5814,
"step": 3600
},
{
"epoch": 0.3186653131482544,
"grad_norm": 0.23046875,
"learning_rate": 0.000388819848177561,
"loss": 2.5756,
"step": 3610
},
{
"epoch": 0.31954804254755703,
"grad_norm": 0.265625,
"learning_rate": 0.00038823755698580545,
"loss": 2.5644,
"step": 3620
},
{
"epoch": 0.3204307719468597,
"grad_norm": 0.24609375,
"learning_rate": 0.0003876541835612202,
"loss": 2.5813,
"step": 3630
},
{
"epoch": 0.3213135013461623,
"grad_norm": 0.26953125,
"learning_rate": 0.0003870697324709132,
"loss": 2.5781,
"step": 3640
},
{
"epoch": 0.322196230745465,
"grad_norm": 0.22265625,
"learning_rate": 0.00038648420829042954,
"loss": 2.5774,
"step": 3650
},
{
"epoch": 0.3230789601447676,
"grad_norm": 0.22265625,
"learning_rate": 0.00038589761560371515,
"loss": 2.5752,
"step": 3660
},
{
"epoch": 0.3239616895440703,
"grad_norm": 0.26953125,
"learning_rate": 0.00038530995900308107,
"loss": 2.5554,
"step": 3670
},
{
"epoch": 0.3248444189433729,
"grad_norm": 0.26171875,
"learning_rate": 0.00038472124308916753,
"loss": 2.5661,
"step": 3680
},
{
"epoch": 0.3257271483426756,
"grad_norm": 0.2216796875,
"learning_rate": 0.00038413147247090795,
"loss": 2.5818,
"step": 3690
},
{
"epoch": 0.3266098777419782,
"grad_norm": 0.24609375,
"learning_rate": 0.00038354065176549274,
"loss": 2.583,
"step": 3700
},
{
"epoch": 0.32749260714128087,
"grad_norm": 0.240234375,
"learning_rate": 0.00038294878559833317,
"loss": 2.5657,
"step": 3710
},
{
"epoch": 0.3283753365405835,
"grad_norm": 0.259765625,
"learning_rate": 0.0003823558786030255,
"loss": 2.5704,
"step": 3720
},
{
"epoch": 0.3292580659398861,
"grad_norm": 0.2578125,
"learning_rate": 0.00038176193542131386,
"loss": 2.5747,
"step": 3730
},
{
"epoch": 0.3301407953391888,
"grad_norm": 0.208984375,
"learning_rate": 0.00038116696070305503,
"loss": 2.5803,
"step": 3740
},
{
"epoch": 0.3310235247384914,
"grad_norm": 0.287109375,
"learning_rate": 0.00038057095910618125,
"loss": 2.5665,
"step": 3750
},
{
"epoch": 0.33190625413779407,
"grad_norm": 0.265625,
"learning_rate": 0.00037997393529666393,
"loss": 2.5765,
"step": 3760
},
{
"epoch": 0.3327889835370967,
"grad_norm": 0.2060546875,
"learning_rate": 0.00037937589394847714,
"loss": 2.5569,
"step": 3770
},
{
"epoch": 0.33367171293639936,
"grad_norm": 0.228515625,
"learning_rate": 0.00037877683974356114,
"loss": 2.5679,
"step": 3780
},
{
"epoch": 0.334554442335702,
"grad_norm": 0.244140625,
"learning_rate": 0.0003781767773717857,
"loss": 2.5664,
"step": 3790
},
{
"epoch": 0.33543717173500465,
"grad_norm": 0.283203125,
"learning_rate": 0.00037757571153091324,
"loss": 2.5706,
"step": 3800
},
{
"epoch": 0.33631990113430726,
"grad_norm": 0.26171875,
"learning_rate": 0.000376973646926562,
"loss": 2.5694,
"step": 3810
},
{
"epoch": 0.33720263053360994,
"grad_norm": 0.2216796875,
"learning_rate": 0.00037637058827216964,
"loss": 2.5567,
"step": 3820
},
{
"epoch": 0.33808535993291255,
"grad_norm": 0.2041015625,
"learning_rate": 0.00037576654028895554,
"loss": 2.5725,
"step": 3830
},
{
"epoch": 0.3389680893322152,
"grad_norm": 0.2373046875,
"learning_rate": 0.00037516150770588487,
"loss": 2.5594,
"step": 3840
},
{
"epoch": 0.33985081873151785,
"grad_norm": 0.208984375,
"learning_rate": 0.00037455549525963066,
"loss": 2.5653,
"step": 3850
},
{
"epoch": 0.3407335481308205,
"grad_norm": 0.306640625,
"learning_rate": 0.0003739485076945373,
"loss": 2.5642,
"step": 3860
},
{
"epoch": 0.34161627753012314,
"grad_norm": 0.2216796875,
"learning_rate": 0.000373340549762583,
"loss": 2.5428,
"step": 3870
},
{
"epoch": 0.3424990069294258,
"grad_norm": 0.240234375,
"learning_rate": 0.0003727316262233429,
"loss": 2.5701,
"step": 3880
},
{
"epoch": 0.3433817363287284,
"grad_norm": 0.2197265625,
"learning_rate": 0.0003721217418439516,
"loss": 2.556,
"step": 3890
},
{
"epoch": 0.3442644657280311,
"grad_norm": 0.2314453125,
"learning_rate": 0.00037151090139906593,
"loss": 2.5647,
"step": 3900
},
{
"epoch": 0.3451471951273337,
"grad_norm": 0.216796875,
"learning_rate": 0.00037089910967082765,
"loss": 2.5705,
"step": 3910
},
{
"epoch": 0.34602992452663633,
"grad_norm": 0.2109375,
"learning_rate": 0.0003702863714488257,
"loss": 2.5759,
"step": 3920
},
{
"epoch": 0.346912653925939,
"grad_norm": 0.2373046875,
"learning_rate": 0.0003696726915300592,
"loss": 2.5727,
"step": 3930
},
{
"epoch": 0.3477953833252416,
"grad_norm": 0.2197265625,
"learning_rate": 0.0003690580747188995,
"loss": 2.5742,
"step": 3940
},
{
"epoch": 0.3486781127245443,
"grad_norm": 0.2275390625,
"learning_rate": 0.00036844252582705244,
"loss": 2.5529,
"step": 3950
},
{
"epoch": 0.3495608421238469,
"grad_norm": 0.21484375,
"learning_rate": 0.0003678260496735214,
"loss": 2.5697,
"step": 3960
},
{
"epoch": 0.3504435715231496,
"grad_norm": 0.251953125,
"learning_rate": 0.0003672086510845687,
"loss": 2.5643,
"step": 3970
},
{
"epoch": 0.3513263009224522,
"grad_norm": 0.2158203125,
"learning_rate": 0.00036659033489367835,
"loss": 2.5644,
"step": 3980
},
{
"epoch": 0.3522090303217549,
"grad_norm": 0.263671875,
"learning_rate": 0.0003659711059415182,
"loss": 2.5698,
"step": 3990
},
{
"epoch": 0.3530917597210575,
"grad_norm": 0.2119140625,
"learning_rate": 0.0003653509690759016,
"loss": 2.5789,
"step": 4000
},
{
"epoch": 0.3530917597210575,
"eval_accuracy": 0.5002887399113815,
"eval_loss": 2.4561643600463867,
"eval_runtime": 6.9947,
"eval_samples_per_second": 45.463,
"eval_steps_per_second": 0.429,
"step": 4000
},
{
"epoch": 0.35397448912036017,
"grad_norm": 0.2275390625,
"learning_rate": 0.00036472992915175017,
"loss": 2.5587,
"step": 4010
},
{
"epoch": 0.3548572185196628,
"grad_norm": 0.2373046875,
"learning_rate": 0.00036410799103105503,
"loss": 2.5827,
"step": 4020
},
{
"epoch": 0.35573994791896546,
"grad_norm": 0.2353515625,
"learning_rate": 0.0003634851595828393,
"loss": 2.5659,
"step": 4030
},
{
"epoch": 0.3566226773182681,
"grad_norm": 0.23046875,
"learning_rate": 0.00036286143968311963,
"loss": 2.5649,
"step": 4040
},
{
"epoch": 0.35750540671757075,
"grad_norm": 0.23046875,
"learning_rate": 0.00036223683621486845,
"loss": 2.5683,
"step": 4050
},
{
"epoch": 0.35838813611687337,
"grad_norm": 0.2080078125,
"learning_rate": 0.00036161135406797504,
"loss": 2.5568,
"step": 4060
},
{
"epoch": 0.35927086551617604,
"grad_norm": 0.2421875,
"learning_rate": 0.0003609849981392079,
"loss": 2.5601,
"step": 4070
},
{
"epoch": 0.36015359491547866,
"grad_norm": 0.251953125,
"learning_rate": 0.0003603577733321764,
"loss": 2.5553,
"step": 4080
},
{
"epoch": 0.3610363243147813,
"grad_norm": 0.228515625,
"learning_rate": 0.0003597296845572917,
"loss": 2.5573,
"step": 4090
},
{
"epoch": 0.36191905371408395,
"grad_norm": 0.2177734375,
"learning_rate": 0.00035910073673172933,
"loss": 2.5609,
"step": 4100
},
{
"epoch": 0.36280178311338657,
"grad_norm": 0.22265625,
"learning_rate": 0.00035847093477938953,
"loss": 2.5557,
"step": 4110
},
{
"epoch": 0.36368451251268924,
"grad_norm": 0.2099609375,
"learning_rate": 0.00035784028363085985,
"loss": 2.5553,
"step": 4120
},
{
"epoch": 0.36456724191199186,
"grad_norm": 0.2099609375,
"learning_rate": 0.00035720878822337576,
"loss": 2.5494,
"step": 4130
},
{
"epoch": 0.36544997131129453,
"grad_norm": 0.279296875,
"learning_rate": 0.00035657645350078233,
"loss": 2.5837,
"step": 4140
},
{
"epoch": 0.36633270071059715,
"grad_norm": 0.2421875,
"learning_rate": 0.0003559432844134954,
"loss": 2.5717,
"step": 4150
},
{
"epoch": 0.3672154301098998,
"grad_norm": 0.23828125,
"learning_rate": 0.0003553092859184629,
"loss": 2.5629,
"step": 4160
},
{
"epoch": 0.36809815950920244,
"grad_norm": 0.2373046875,
"learning_rate": 0.0003546744629791261,
"loss": 2.5562,
"step": 4170
},
{
"epoch": 0.3689808889085051,
"grad_norm": 0.53125,
"learning_rate": 0.00035403882056538044,
"loss": 2.5511,
"step": 4180
},
{
"epoch": 0.36986361830780773,
"grad_norm": 0.236328125,
"learning_rate": 0.00035340236365353724,
"loss": 2.5611,
"step": 4190
},
{
"epoch": 0.3707463477071104,
"grad_norm": 0.21875,
"learning_rate": 0.000352765097226284,
"loss": 2.5717,
"step": 4200
},
{
"epoch": 0.371629077106413,
"grad_norm": 0.2333984375,
"learning_rate": 0.0003521270262726458,
"loss": 2.5666,
"step": 4210
},
{
"epoch": 0.3725118065057157,
"grad_norm": 0.28515625,
"learning_rate": 0.00035148815578794635,
"loss": 2.5583,
"step": 4220
},
{
"epoch": 0.3733945359050183,
"grad_norm": 0.2353515625,
"learning_rate": 0.0003508484907737687,
"loss": 2.5552,
"step": 4230
},
{
"epoch": 0.374277265304321,
"grad_norm": 0.2373046875,
"learning_rate": 0.0003502080362379159,
"loss": 2.5708,
"step": 4240
},
{
"epoch": 0.3751599947036236,
"grad_norm": 0.2197265625,
"learning_rate": 0.00034956679719437225,
"loss": 2.5804,
"step": 4250
},
{
"epoch": 0.3760427241029263,
"grad_norm": 0.212890625,
"learning_rate": 0.00034892477866326356,
"loss": 2.5592,
"step": 4260
},
{
"epoch": 0.3769254535022289,
"grad_norm": 0.224609375,
"learning_rate": 0.0003482819856708183,
"loss": 2.5529,
"step": 4270
},
{
"epoch": 0.3778081829015315,
"grad_norm": 0.216796875,
"learning_rate": 0.00034763842324932794,
"loss": 2.579,
"step": 4280
},
{
"epoch": 0.3786909123008342,
"grad_norm": 0.244140625,
"learning_rate": 0.00034699409643710764,
"loss": 2.5711,
"step": 4290
},
{
"epoch": 0.3795736417001368,
"grad_norm": 0.263671875,
"learning_rate": 0.00034634901027845677,
"loss": 2.5626,
"step": 4300
},
{
"epoch": 0.38045637109943947,
"grad_norm": 0.232421875,
"learning_rate": 0.0003457031698236196,
"loss": 2.5598,
"step": 4310
},
{
"epoch": 0.3813391004987421,
"grad_norm": 0.2294921875,
"learning_rate": 0.00034505658012874544,
"loss": 2.5722,
"step": 4320
},
{
"epoch": 0.38222182989804476,
"grad_norm": 0.255859375,
"learning_rate": 0.00034440924625584954,
"loss": 2.5619,
"step": 4330
},
{
"epoch": 0.3831045592973474,
"grad_norm": 0.2138671875,
"learning_rate": 0.0003437611732727728,
"loss": 2.5547,
"step": 4340
},
{
"epoch": 0.38398728869665005,
"grad_norm": 0.2021484375,
"learning_rate": 0.0003431123662531427,
"loss": 2.5654,
"step": 4350
},
{
"epoch": 0.38487001809595267,
"grad_norm": 0.197265625,
"learning_rate": 0.0003424628302763332,
"loss": 2.5526,
"step": 4360
},
{
"epoch": 0.38575274749525534,
"grad_norm": 0.23828125,
"learning_rate": 0.0003418125704274252,
"loss": 2.5546,
"step": 4370
},
{
"epoch": 0.38663547689455796,
"grad_norm": 0.25390625,
"learning_rate": 0.00034116159179716675,
"loss": 2.585,
"step": 4380
},
{
"epoch": 0.38751820629386063,
"grad_norm": 0.23046875,
"learning_rate": 0.0003405098994819329,
"loss": 2.572,
"step": 4390
},
{
"epoch": 0.38840093569316325,
"grad_norm": 0.2197265625,
"learning_rate": 0.00033985749858368605,
"loss": 2.5571,
"step": 4400
},
{
"epoch": 0.3892836650924659,
"grad_norm": 0.21484375,
"learning_rate": 0.0003392043942099358,
"loss": 2.5717,
"step": 4410
},
{
"epoch": 0.39016639449176854,
"grad_norm": 0.2373046875,
"learning_rate": 0.0003385505914736994,
"loss": 2.5652,
"step": 4420
},
{
"epoch": 0.3910491238910712,
"grad_norm": 0.2255859375,
"learning_rate": 0.00033789609549346146,
"loss": 2.5583,
"step": 4430
},
{
"epoch": 0.39193185329037383,
"grad_norm": 0.3046875,
"learning_rate": 0.0003372409113931334,
"loss": 2.5538,
"step": 4440
},
{
"epoch": 0.3928145826896765,
"grad_norm": 0.216796875,
"learning_rate": 0.0003365850443020142,
"loss": 2.5522,
"step": 4450
},
{
"epoch": 0.3936973120889791,
"grad_norm": 0.18359375,
"learning_rate": 0.00033592849935474965,
"loss": 2.5695,
"step": 4460
},
{
"epoch": 0.39458004148828174,
"grad_norm": 0.232421875,
"learning_rate": 0.0003352712816912925,
"loss": 2.57,
"step": 4470
},
{
"epoch": 0.3954627708875844,
"grad_norm": 0.197265625,
"learning_rate": 0.00033461339645686196,
"loss": 2.5631,
"step": 4480
},
{
"epoch": 0.39634550028688703,
"grad_norm": 0.205078125,
"learning_rate": 0.0003339548488019033,
"loss": 2.558,
"step": 4490
},
{
"epoch": 0.3972282296861897,
"grad_norm": 0.2236328125,
"learning_rate": 0.00033329564388204816,
"loss": 2.5512,
"step": 4500
},
{
"epoch": 0.3981109590854923,
"grad_norm": 0.2490234375,
"learning_rate": 0.0003326357868580734,
"loss": 2.5622,
"step": 4510
},
{
"epoch": 0.398993688484795,
"grad_norm": 0.228515625,
"learning_rate": 0.0003319752828958613,
"loss": 2.5679,
"step": 4520
},
{
"epoch": 0.3998764178840976,
"grad_norm": 0.22265625,
"learning_rate": 0.0003313141371663587,
"loss": 2.5745,
"step": 4530
},
{
"epoch": 0.4007591472834003,
"grad_norm": 0.2734375,
"learning_rate": 0.000330652354845537,
"loss": 2.5843,
"step": 4540
},
{
"epoch": 0.4016418766827029,
"grad_norm": 0.19921875,
"learning_rate": 0.0003299899411143509,
"loss": 2.5639,
"step": 4550
},
{
"epoch": 0.4025246060820056,
"grad_norm": 0.2392578125,
"learning_rate": 0.0003293269011586986,
"loss": 2.5564,
"step": 4560
},
{
"epoch": 0.4034073354813082,
"grad_norm": 0.2138671875,
"learning_rate": 0.00032866324016938095,
"loss": 2.5443,
"step": 4570
},
{
"epoch": 0.40429006488061087,
"grad_norm": 0.2158203125,
"learning_rate": 0.00032799896334206045,
"loss": 2.5623,
"step": 4580
},
{
"epoch": 0.4051727942799135,
"grad_norm": 0.2041015625,
"learning_rate": 0.000327334075877221,
"loss": 2.5788,
"step": 4590
},
{
"epoch": 0.40605552367921616,
"grad_norm": 0.2236328125,
"learning_rate": 0.000326668582980127,
"loss": 2.5764,
"step": 4600
},
{
"epoch": 0.4069382530785188,
"grad_norm": 0.1923828125,
"learning_rate": 0.00032600248986078295,
"loss": 2.5626,
"step": 4610
},
{
"epoch": 0.40782098247782145,
"grad_norm": 0.265625,
"learning_rate": 0.00032533580173389195,
"loss": 2.5496,
"step": 4620
},
{
"epoch": 0.40870371187712407,
"grad_norm": 0.29296875,
"learning_rate": 0.0003246685238188154,
"loss": 2.5608,
"step": 4630
},
{
"epoch": 0.40958644127642674,
"grad_norm": 0.3046875,
"learning_rate": 0.00032400066133953225,
"loss": 2.5702,
"step": 4640
},
{
"epoch": 0.41046917067572936,
"grad_norm": 0.279296875,
"learning_rate": 0.0003233322195245977,
"loss": 2.567,
"step": 4650
},
{
"epoch": 0.411351900075032,
"grad_norm": 0.2265625,
"learning_rate": 0.00032266320360710237,
"loss": 2.5644,
"step": 4660
},
{
"epoch": 0.41223462947433465,
"grad_norm": 0.216796875,
"learning_rate": 0.0003219936188246317,
"loss": 2.5675,
"step": 4670
},
{
"epoch": 0.41311735887363726,
"grad_norm": 0.2001953125,
"learning_rate": 0.0003213234704192243,
"loss": 2.5619,
"step": 4680
},
{
"epoch": 0.41400008827293994,
"grad_norm": 0.1982421875,
"learning_rate": 0.00032065276363733137,
"loss": 2.5594,
"step": 4690
},
{
"epoch": 0.41488281767224255,
"grad_norm": 0.2392578125,
"learning_rate": 0.00031998150372977577,
"loss": 2.5668,
"step": 4700
},
{
"epoch": 0.4157655470715452,
"grad_norm": 0.2021484375,
"learning_rate": 0.0003193096959517103,
"loss": 2.5547,
"step": 4710
},
{
"epoch": 0.41664827647084784,
"grad_norm": 0.2421875,
"learning_rate": 0.0003186373455625774,
"loss": 2.5528,
"step": 4720
},
{
"epoch": 0.4175310058701505,
"grad_norm": 0.216796875,
"learning_rate": 0.0003179644578260669,
"loss": 2.5555,
"step": 4730
},
{
"epoch": 0.41841373526945314,
"grad_norm": 0.208984375,
"learning_rate": 0.00031729103801007575,
"loss": 2.5547,
"step": 4740
},
{
"epoch": 0.4192964646687558,
"grad_norm": 0.208984375,
"learning_rate": 0.0003166170913866665,
"loss": 2.5666,
"step": 4750
},
{
"epoch": 0.4201791940680584,
"grad_norm": 0.1845703125,
"learning_rate": 0.00031594262323202577,
"loss": 2.5587,
"step": 4760
},
{
"epoch": 0.4210619234673611,
"grad_norm": 0.208984375,
"learning_rate": 0.0003152676388264234,
"loss": 2.5577,
"step": 4770
},
{
"epoch": 0.4219446528666637,
"grad_norm": 0.2236328125,
"learning_rate": 0.00031459214345417046,
"loss": 2.5362,
"step": 4780
},
{
"epoch": 0.4228273822659664,
"grad_norm": 0.216796875,
"learning_rate": 0.00031391614240357864,
"loss": 2.5542,
"step": 4790
},
{
"epoch": 0.423710111665269,
"grad_norm": 0.2470703125,
"learning_rate": 0.00031323964096691825,
"loss": 2.565,
"step": 4800
},
{
"epoch": 0.4245928410645717,
"grad_norm": 0.2392578125,
"learning_rate": 0.0003125626444403772,
"loss": 2.5467,
"step": 4810
},
{
"epoch": 0.4254755704638743,
"grad_norm": 0.236328125,
"learning_rate": 0.00031188515812401917,
"loss": 2.5632,
"step": 4820
},
{
"epoch": 0.42635829986317697,
"grad_norm": 0.2158203125,
"learning_rate": 0.00031120718732174235,
"loss": 2.5587,
"step": 4830
},
{
"epoch": 0.4272410292624796,
"grad_norm": 0.23828125,
"learning_rate": 0.000310528737341238,
"loss": 2.5333,
"step": 4840
},
{
"epoch": 0.4281237586617822,
"grad_norm": 0.267578125,
"learning_rate": 0.00030984981349394864,
"loss": 2.561,
"step": 4850
},
{
"epoch": 0.4290064880610849,
"grad_norm": 0.2294921875,
"learning_rate": 0.00030917042109502663,
"loss": 2.5618,
"step": 4860
},
{
"epoch": 0.4298892174603875,
"grad_norm": 0.1982421875,
"learning_rate": 0.00030849056546329253,
"loss": 2.5497,
"step": 4870
},
{
"epoch": 0.43077194685969017,
"grad_norm": 0.2060546875,
"learning_rate": 0.0003078102519211933,
"loss": 2.5374,
"step": 4880
},
{
"epoch": 0.4316546762589928,
"grad_norm": 0.2109375,
"learning_rate": 0.0003071294857947612,
"loss": 2.5631,
"step": 4890
},
{
"epoch": 0.43253740565829546,
"grad_norm": 0.20703125,
"learning_rate": 0.0003064482724135711,
"loss": 2.575,
"step": 4900
},
{
"epoch": 0.4334201350575981,
"grad_norm": 0.2099609375,
"learning_rate": 0.00030576661711069985,
"loss": 2.5525,
"step": 4910
},
{
"epoch": 0.43430286445690075,
"grad_norm": 0.2119140625,
"learning_rate": 0.0003050845252226837,
"loss": 2.5718,
"step": 4920
},
{
"epoch": 0.43518559385620337,
"grad_norm": 0.2236328125,
"learning_rate": 0.0003044020020894769,
"loss": 2.5601,
"step": 4930
},
{
"epoch": 0.43606832325550604,
"grad_norm": 0.19140625,
"learning_rate": 0.00030371905305441,
"loss": 2.5612,
"step": 4940
},
{
"epoch": 0.43695105265480866,
"grad_norm": 0.2255859375,
"learning_rate": 0.0003030356834641476,
"loss": 2.5504,
"step": 4950
},
{
"epoch": 0.43783378205411133,
"grad_norm": 0.2294921875,
"learning_rate": 0.0003023518986686469,
"loss": 2.5584,
"step": 4960
},
{
"epoch": 0.43871651145341395,
"grad_norm": 0.2451171875,
"learning_rate": 0.0003016677040211154,
"loss": 2.5645,
"step": 4970
},
{
"epoch": 0.4395992408527166,
"grad_norm": 0.2021484375,
"learning_rate": 0.00030098310487796965,
"loss": 2.5536,
"step": 4980
},
{
"epoch": 0.44048197025201924,
"grad_norm": 0.2197265625,
"learning_rate": 0.00030029810659879273,
"loss": 2.5535,
"step": 4990
},
{
"epoch": 0.4413646996513219,
"grad_norm": 0.1982421875,
"learning_rate": 0.00029961271454629235,
"loss": 2.565,
"step": 5000
},
{
"epoch": 0.44224742905062453,
"grad_norm": 0.193359375,
"learning_rate": 0.0002989269340862591,
"loss": 2.5531,
"step": 5010
},
{
"epoch": 0.44313015844992715,
"grad_norm": 0.236328125,
"learning_rate": 0.0002982407705875243,
"loss": 2.5636,
"step": 5020
},
{
"epoch": 0.4440128878492298,
"grad_norm": 0.19140625,
"learning_rate": 0.00029755422942191805,
"loss": 2.5507,
"step": 5030
},
{
"epoch": 0.44489561724853244,
"grad_norm": 0.2109375,
"learning_rate": 0.0002968673159642271,
"loss": 2.5646,
"step": 5040
},
{
"epoch": 0.4457783466478351,
"grad_norm": 0.2109375,
"learning_rate": 0.00029618003559215276,
"loss": 2.5697,
"step": 5050
},
{
"epoch": 0.44666107604713773,
"grad_norm": 0.193359375,
"learning_rate": 0.0002954923936862689,
"loss": 2.5557,
"step": 5060
},
{
"epoch": 0.4475438054464404,
"grad_norm": 0.216796875,
"learning_rate": 0.00029480439562997964,
"loss": 2.5661,
"step": 5070
},
{
"epoch": 0.448426534845743,
"grad_norm": 0.1943359375,
"learning_rate": 0.00029411604680947755,
"loss": 2.5527,
"step": 5080
},
{
"epoch": 0.4493092642450457,
"grad_norm": 0.25,
"learning_rate": 0.00029342735261370095,
"loss": 2.5538,
"step": 5090
},
{
"epoch": 0.4501919936443483,
"grad_norm": 0.181640625,
"learning_rate": 0.0002927383184342924,
"loss": 2.5503,
"step": 5100
},
{
"epoch": 0.451074723043651,
"grad_norm": 0.203125,
"learning_rate": 0.00029204894966555577,
"loss": 2.5669,
"step": 5110
},
{
"epoch": 0.4519574524429536,
"grad_norm": 0.203125,
"learning_rate": 0.00029135925170441457,
"loss": 2.5698,
"step": 5120
},
{
"epoch": 0.4528401818422563,
"grad_norm": 0.2060546875,
"learning_rate": 0.0002906692299503694,
"loss": 2.567,
"step": 5130
},
{
"epoch": 0.4537229112415589,
"grad_norm": 0.2158203125,
"learning_rate": 0.00028997888980545586,
"loss": 2.5538,
"step": 5140
},
{
"epoch": 0.45460564064086156,
"grad_norm": 0.2138671875,
"learning_rate": 0.00028928823667420206,
"loss": 2.5495,
"step": 5150
},
{
"epoch": 0.4554883700401642,
"grad_norm": 0.2158203125,
"learning_rate": 0.00028859727596358643,
"loss": 2.5627,
"step": 5160
},
{
"epoch": 0.45637109943946685,
"grad_norm": 0.2255859375,
"learning_rate": 0.00028790601308299545,
"loss": 2.5567,
"step": 5170
},
{
"epoch": 0.45725382883876947,
"grad_norm": 0.21484375,
"learning_rate": 0.0002872144534441812,
"loss": 2.5561,
"step": 5180
},
{
"epoch": 0.45813655823807214,
"grad_norm": 0.2080078125,
"learning_rate": 0.0002865226024612189,
"loss": 2.5693,
"step": 5190
},
{
"epoch": 0.45901928763737476,
"grad_norm": 0.205078125,
"learning_rate": 0.00028583046555046487,
"loss": 2.5478,
"step": 5200
},
{
"epoch": 0.4599020170366774,
"grad_norm": 0.224609375,
"learning_rate": 0.0002851380481305136,
"loss": 2.5533,
"step": 5210
},
{
"epoch": 0.46078474643598005,
"grad_norm": 0.2021484375,
"learning_rate": 0.00028444535562215594,
"loss": 2.5529,
"step": 5220
},
{
"epoch": 0.46166747583528267,
"grad_norm": 0.220703125,
"learning_rate": 0.00028375239344833616,
"loss": 2.5532,
"step": 5230
},
{
"epoch": 0.46255020523458534,
"grad_norm": 0.2255859375,
"learning_rate": 0.00028305916703410974,
"loss": 2.566,
"step": 5240
},
{
"epoch": 0.46343293463388796,
"grad_norm": 0.298828125,
"learning_rate": 0.00028236568180660073,
"loss": 2.5478,
"step": 5250
},
{
"epoch": 0.46431566403319063,
"grad_norm": 0.2392578125,
"learning_rate": 0.0002816719431949596,
"loss": 2.5633,
"step": 5260
},
{
"epoch": 0.46519839343249325,
"grad_norm": 0.2119140625,
"learning_rate": 0.0002809779566303203,
"loss": 2.5704,
"step": 5270
},
{
"epoch": 0.4660811228317959,
"grad_norm": 0.224609375,
"learning_rate": 0.00028028372754575805,
"loss": 2.5681,
"step": 5280
},
{
"epoch": 0.46696385223109854,
"grad_norm": 0.2119140625,
"learning_rate": 0.0002795892613762467,
"loss": 2.5515,
"step": 5290
},
{
"epoch": 0.4678465816304012,
"grad_norm": 0.2275390625,
"learning_rate": 0.00027889456355861635,
"loss": 2.5681,
"step": 5300
},
{
"epoch": 0.46872931102970383,
"grad_norm": 0.21484375,
"learning_rate": 0.00027819963953151024,
"loss": 2.5487,
"step": 5310
},
{
"epoch": 0.4696120404290065,
"grad_norm": 0.197265625,
"learning_rate": 0.0002775044947353428,
"loss": 2.5672,
"step": 5320
},
{
"epoch": 0.4704947698283091,
"grad_norm": 0.2080078125,
"learning_rate": 0.0002768091346122569,
"loss": 2.562,
"step": 5330
},
{
"epoch": 0.4713774992276118,
"grad_norm": 0.2041015625,
"learning_rate": 0.000276113564606081,
"loss": 2.5542,
"step": 5340
},
{
"epoch": 0.4722602286269144,
"grad_norm": 0.2177734375,
"learning_rate": 0.00027541779016228664,
"loss": 2.5435,
"step": 5350
},
{
"epoch": 0.4731429580262171,
"grad_norm": 0.2119140625,
"learning_rate": 0.0002747218167279461,
"loss": 2.5631,
"step": 5360
},
{
"epoch": 0.4740256874255197,
"grad_norm": 0.2060546875,
"learning_rate": 0.00027402564975168925,
"loss": 2.5464,
"step": 5370
},
{
"epoch": 0.4749084168248224,
"grad_norm": 0.21484375,
"learning_rate": 0.0002733292946836615,
"loss": 2.5498,
"step": 5380
},
{
"epoch": 0.475791146224125,
"grad_norm": 0.189453125,
"learning_rate": 0.0002726327569754803,
"loss": 2.559,
"step": 5390
},
{
"epoch": 0.4766738756234276,
"grad_norm": 0.1943359375,
"learning_rate": 0.00027193604208019346,
"loss": 2.5666,
"step": 5400
},
{
"epoch": 0.4775566050227303,
"grad_norm": 0.2177734375,
"learning_rate": 0.0002712391554522355,
"loss": 2.556,
"step": 5410
},
{
"epoch": 0.4784393344220329,
"grad_norm": 0.1962890625,
"learning_rate": 0.0002705421025473857,
"loss": 2.559,
"step": 5420
},
{
"epoch": 0.4793220638213356,
"grad_norm": 0.30078125,
"learning_rate": 0.0002698448888227251,
"loss": 2.5503,
"step": 5430
},
{
"epoch": 0.4802047932206382,
"grad_norm": 0.2236328125,
"learning_rate": 0.0002691475197365936,
"loss": 2.5404,
"step": 5440
},
{
"epoch": 0.48108752261994087,
"grad_norm": 0.1943359375,
"learning_rate": 0.00026845000074854754,
"loss": 2.5667,
"step": 5450
},
{
"epoch": 0.4819702520192435,
"grad_norm": 0.2177734375,
"learning_rate": 0.0002677523373193165,
"loss": 2.559,
"step": 5460
},
{
"epoch": 0.48285298141854616,
"grad_norm": 0.1982421875,
"learning_rate": 0.00026705453491076127,
"loss": 2.5533,
"step": 5470
},
{
"epoch": 0.4837357108178488,
"grad_norm": 0.228515625,
"learning_rate": 0.00026635659898583043,
"loss": 2.5518,
"step": 5480
},
{
"epoch": 0.48461844021715145,
"grad_norm": 0.20703125,
"learning_rate": 0.000265658535008518,
"loss": 2.5682,
"step": 5490
},
{
"epoch": 0.48550116961645406,
"grad_norm": 0.2138671875,
"learning_rate": 0.00026496034844382036,
"loss": 2.5576,
"step": 5500
},
{
"epoch": 0.48638389901575674,
"grad_norm": 0.2041015625,
"learning_rate": 0.0002642620447576935,
"loss": 2.546,
"step": 5510
},
{
"epoch": 0.48726662841505936,
"grad_norm": 0.212890625,
"learning_rate": 0.0002635636294170106,
"loss": 2.5629,
"step": 5520
},
{
"epoch": 0.48814935781436203,
"grad_norm": 0.236328125,
"learning_rate": 0.00026286510788951886,
"loss": 2.5602,
"step": 5530
},
{
"epoch": 0.48903208721366465,
"grad_norm": 0.1923828125,
"learning_rate": 0.0002621664856437967,
"loss": 2.5532,
"step": 5540
},
{
"epoch": 0.4899148166129673,
"grad_norm": 0.2158203125,
"learning_rate": 0.00026146776814921105,
"loss": 2.5645,
"step": 5550
},
{
"epoch": 0.49079754601226994,
"grad_norm": 0.2138671875,
"learning_rate": 0.0002607689608758746,
"loss": 2.577,
"step": 5560
},
{
"epoch": 0.4916802754115726,
"grad_norm": 0.2001953125,
"learning_rate": 0.000260070069294603,
"loss": 2.5333,
"step": 5570
},
{
"epoch": 0.4925630048108752,
"grad_norm": 0.1943359375,
"learning_rate": 0.00025937109887687164,
"loss": 2.5584,
"step": 5580
},
{
"epoch": 0.49344573421017784,
"grad_norm": 0.2275390625,
"learning_rate": 0.00025867205509477335,
"loss": 2.5522,
"step": 5590
},
{
"epoch": 0.4943284636094805,
"grad_norm": 0.1865234375,
"learning_rate": 0.0002579729434209752,
"loss": 2.5581,
"step": 5600
},
{
"epoch": 0.49521119300878313,
"grad_norm": 0.181640625,
"learning_rate": 0.00025727376932867593,
"loss": 2.5625,
"step": 5610
},
{
"epoch": 0.4960939224080858,
"grad_norm": 0.2119140625,
"learning_rate": 0.00025657453829156256,
"loss": 2.5555,
"step": 5620
},
{
"epoch": 0.4969766518073884,
"grad_norm": 0.228515625,
"learning_rate": 0.00025587525578376843,
"loss": 2.5526,
"step": 5630
},
{
"epoch": 0.4978593812066911,
"grad_norm": 0.181640625,
"learning_rate": 0.0002551759272798295,
"loss": 2.5501,
"step": 5640
},
{
"epoch": 0.4987421106059937,
"grad_norm": 0.2099609375,
"learning_rate": 0.00025447655825464174,
"loss": 2.5728,
"step": 5650
},
{
"epoch": 0.4996248400052964,
"grad_norm": 0.189453125,
"learning_rate": 0.0002537771541834187,
"loss": 2.5491,
"step": 5660
},
{
"epoch": 0.5005075694045991,
"grad_norm": 0.1884765625,
"learning_rate": 0.00025307772054164804,
"loss": 2.5658,
"step": 5670
},
{
"epoch": 0.5013902988039016,
"grad_norm": 0.1875,
"learning_rate": 0.000252378262805049,
"loss": 2.5504,
"step": 5680
},
{
"epoch": 0.5022730282032043,
"grad_norm": 0.212890625,
"learning_rate": 0.0002516787864495294,
"loss": 2.5621,
"step": 5690
},
{
"epoch": 0.503155757602507,
"grad_norm": 0.20703125,
"learning_rate": 0.00025097929695114295,
"loss": 2.5526,
"step": 5700
},
{
"epoch": 0.5040384870018096,
"grad_norm": 0.2412109375,
"learning_rate": 0.00025027979978604615,
"loss": 2.5535,
"step": 5710
},
{
"epoch": 0.5049212164011122,
"grad_norm": 0.2099609375,
"learning_rate": 0.0002495803004304556,
"loss": 2.5489,
"step": 5720
},
{
"epoch": 0.5058039458004149,
"grad_norm": 0.203125,
"learning_rate": 0.0002488808043606048,
"loss": 2.5585,
"step": 5730
},
{
"epoch": 0.5066866751997176,
"grad_norm": 0.2001953125,
"learning_rate": 0.0002481813170527019,
"loss": 2.561,
"step": 5740
},
{
"epoch": 0.5075694045990202,
"grad_norm": 0.197265625,
"learning_rate": 0.0002474818439828862,
"loss": 2.5538,
"step": 5750
},
{
"epoch": 0.5084521339983228,
"grad_norm": 0.2216796875,
"learning_rate": 0.0002467823906271856,
"loss": 2.559,
"step": 5760
},
{
"epoch": 0.5093348633976255,
"grad_norm": 0.1962890625,
"learning_rate": 0.00024608296246147375,
"loss": 2.5583,
"step": 5770
},
{
"epoch": 0.5102175927969281,
"grad_norm": 0.2021484375,
"learning_rate": 0.00024538356496142693,
"loss": 2.5506,
"step": 5780
},
{
"epoch": 0.5111003221962307,
"grad_norm": 0.1962890625,
"learning_rate": 0.00024468420360248145,
"loss": 2.5589,
"step": 5790
},
{
"epoch": 0.5119830515955334,
"grad_norm": 0.2158203125,
"learning_rate": 0.00024398488385979055,
"loss": 2.5531,
"step": 5800
},
{
"epoch": 0.512865780994836,
"grad_norm": 0.1826171875,
"learning_rate": 0.00024328561120818195,
"loss": 2.5605,
"step": 5810
},
{
"epoch": 0.5137485103941387,
"grad_norm": 0.193359375,
"learning_rate": 0.00024258639112211453,
"loss": 2.5698,
"step": 5820
},
{
"epoch": 0.5146312397934413,
"grad_norm": 0.1962890625,
"learning_rate": 0.00024188722907563537,
"loss": 2.5531,
"step": 5830
},
{
"epoch": 0.515513969192744,
"grad_norm": 0.19140625,
"learning_rate": 0.00024118813054233774,
"loss": 2.547,
"step": 5840
},
{
"epoch": 0.5163966985920466,
"grad_norm": 0.20703125,
"learning_rate": 0.00024048910099531726,
"loss": 2.5631,
"step": 5850
},
{
"epoch": 0.5172794279913493,
"grad_norm": 0.2158203125,
"learning_rate": 0.00023979014590712962,
"loss": 2.5436,
"step": 5860
},
{
"epoch": 0.5181621573906519,
"grad_norm": 0.2060546875,
"learning_rate": 0.00023909127074974744,
"loss": 2.5586,
"step": 5870
},
{
"epoch": 0.5190448867899545,
"grad_norm": 0.2060546875,
"learning_rate": 0.00023839248099451782,
"loss": 2.5524,
"step": 5880
},
{
"epoch": 0.5199276161892572,
"grad_norm": 0.193359375,
"learning_rate": 0.00023769378211211916,
"loss": 2.5391,
"step": 5890
},
{
"epoch": 0.5208103455885599,
"grad_norm": 0.1767578125,
"learning_rate": 0.00023699517957251825,
"loss": 2.5464,
"step": 5900
},
{
"epoch": 0.5216930749878624,
"grad_norm": 0.1904296875,
"learning_rate": 0.00023629667884492799,
"loss": 2.556,
"step": 5910
},
{
"epoch": 0.5225758043871651,
"grad_norm": 0.2431640625,
"learning_rate": 0.00023559828539776394,
"loss": 2.5516,
"step": 5920
},
{
"epoch": 0.5234585337864678,
"grad_norm": 0.203125,
"learning_rate": 0.00023490000469860185,
"loss": 2.5518,
"step": 5930
},
{
"epoch": 0.5243412631857705,
"grad_norm": 0.1728515625,
"learning_rate": 0.0002342018422141347,
"loss": 2.5477,
"step": 5940
},
{
"epoch": 0.525223992585073,
"grad_norm": 0.201171875,
"learning_rate": 0.00023350380341013034,
"loss": 2.5656,
"step": 5950
},
{
"epoch": 0.5261067219843757,
"grad_norm": 0.1875,
"learning_rate": 0.000232805893751388,
"loss": 2.568,
"step": 5960
},
{
"epoch": 0.5269894513836784,
"grad_norm": 0.1923828125,
"learning_rate": 0.0002321081187016959,
"loss": 2.5531,
"step": 5970
},
{
"epoch": 0.5278721807829809,
"grad_norm": 0.2080078125,
"learning_rate": 0.00023141048372378863,
"loss": 2.5555,
"step": 5980
},
{
"epoch": 0.5287549101822836,
"grad_norm": 0.1875,
"learning_rate": 0.00023071299427930396,
"loss": 2.5531,
"step": 5990
},
{
"epoch": 0.5296376395815863,
"grad_norm": 0.1875,
"learning_rate": 0.00023001565582874046,
"loss": 2.555,
"step": 6000
},
{
"epoch": 0.5296376395815863,
"eval_accuracy": 0.5019328679706038,
"eval_loss": 2.4451804161071777,
"eval_runtime": 7.0082,
"eval_samples_per_second": 45.375,
"eval_steps_per_second": 0.428,
"step": 6000
},
{
"epoch": 0.530520368980889,
"grad_norm": 0.208984375,
"learning_rate": 0.00022931847383141446,
"loss": 2.5439,
"step": 6010
},
{
"epoch": 0.5314030983801915,
"grad_norm": 0.2119140625,
"learning_rate": 0.00022862145374541768,
"loss": 2.553,
"step": 6020
},
{
"epoch": 0.5322858277794942,
"grad_norm": 0.1826171875,
"learning_rate": 0.00022792460102757407,
"loss": 2.5539,
"step": 6030
},
{
"epoch": 0.5331685571787969,
"grad_norm": 0.1865234375,
"learning_rate": 0.00022722792113339722,
"loss": 2.5546,
"step": 6040
},
{
"epoch": 0.5340512865780995,
"grad_norm": 0.205078125,
"learning_rate": 0.0002265314195170481,
"loss": 2.5649,
"step": 6050
},
{
"epoch": 0.5349340159774021,
"grad_norm": 0.203125,
"learning_rate": 0.00022583510163129162,
"loss": 2.5396,
"step": 6060
},
{
"epoch": 0.5358167453767048,
"grad_norm": 0.181640625,
"learning_rate": 0.00022513897292745434,
"loss": 2.5698,
"step": 6070
},
{
"epoch": 0.5366994747760074,
"grad_norm": 0.1865234375,
"learning_rate": 0.00022444303885538178,
"loss": 2.5594,
"step": 6080
},
{
"epoch": 0.5375822041753101,
"grad_norm": 0.171875,
"learning_rate": 0.000223747304863396,
"loss": 2.5539,
"step": 6090
},
{
"epoch": 0.5384649335746127,
"grad_norm": 0.1767578125,
"learning_rate": 0.0002230517763982523,
"loss": 2.5658,
"step": 6100
},
{
"epoch": 0.5393476629739153,
"grad_norm": 0.1826171875,
"learning_rate": 0.0002223564589050971,
"loss": 2.5584,
"step": 6110
},
{
"epoch": 0.540230392373218,
"grad_norm": 0.17578125,
"learning_rate": 0.00022166135782742525,
"loss": 2.5497,
"step": 6120
},
{
"epoch": 0.5411131217725207,
"grad_norm": 0.1826171875,
"learning_rate": 0.0002209664786070372,
"loss": 2.5505,
"step": 6130
},
{
"epoch": 0.5419958511718233,
"grad_norm": 0.2080078125,
"learning_rate": 0.00022027182668399653,
"loss": 2.5513,
"step": 6140
},
{
"epoch": 0.5428785805711259,
"grad_norm": 0.1806640625,
"learning_rate": 0.0002195774074965874,
"loss": 2.5493,
"step": 6150
},
{
"epoch": 0.5437613099704286,
"grad_norm": 0.224609375,
"learning_rate": 0.00021888322648127206,
"loss": 2.5636,
"step": 6160
},
{
"epoch": 0.5446440393697312,
"grad_norm": 0.2060546875,
"learning_rate": 0.0002181892890726479,
"loss": 2.5583,
"step": 6170
},
{
"epoch": 0.5455267687690338,
"grad_norm": 0.2099609375,
"learning_rate": 0.00021749560070340534,
"loss": 2.5529,
"step": 6180
},
{
"epoch": 0.5464094981683365,
"grad_norm": 0.2109375,
"learning_rate": 0.000216802166804285,
"loss": 2.5515,
"step": 6190
},
{
"epoch": 0.5472922275676392,
"grad_norm": 0.185546875,
"learning_rate": 0.00021610899280403555,
"loss": 2.5585,
"step": 6200
},
{
"epoch": 0.5481749569669417,
"grad_norm": 0.1845703125,
"learning_rate": 0.00021541608412937075,
"loss": 2.5432,
"step": 6210
},
{
"epoch": 0.5490576863662444,
"grad_norm": 0.1943359375,
"learning_rate": 0.000214723446204927,
"loss": 2.5633,
"step": 6220
},
{
"epoch": 0.5499404157655471,
"grad_norm": 0.193359375,
"learning_rate": 0.00021403108445322168,
"loss": 2.5604,
"step": 6230
},
{
"epoch": 0.5508231451648498,
"grad_norm": 0.203125,
"learning_rate": 0.0002133390042946094,
"loss": 2.5477,
"step": 6240
},
{
"epoch": 0.5517058745641523,
"grad_norm": 0.1845703125,
"learning_rate": 0.00021264721114724064,
"loss": 2.5514,
"step": 6250
},
{
"epoch": 0.552588603963455,
"grad_norm": 0.1884765625,
"learning_rate": 0.0002119557104270187,
"loss": 2.5616,
"step": 6260
},
{
"epoch": 0.5534713333627577,
"grad_norm": 0.1875,
"learning_rate": 0.00021126450754755774,
"loss": 2.5491,
"step": 6270
},
{
"epoch": 0.5543540627620603,
"grad_norm": 0.1982421875,
"learning_rate": 0.00021057360792014004,
"loss": 2.5473,
"step": 6280
},
{
"epoch": 0.5552367921613629,
"grad_norm": 0.177734375,
"learning_rate": 0.0002098830169536738,
"loss": 2.5478,
"step": 6290
},
{
"epoch": 0.5561195215606656,
"grad_norm": 0.177734375,
"learning_rate": 0.00020919274005465083,
"loss": 2.552,
"step": 6300
},
{
"epoch": 0.5570022509599682,
"grad_norm": 0.185546875,
"learning_rate": 0.00020850278262710416,
"loss": 2.5571,
"step": 6310
},
{
"epoch": 0.5578849803592708,
"grad_norm": 0.17578125,
"learning_rate": 0.0002078131500725657,
"loss": 2.5556,
"step": 6320
},
{
"epoch": 0.5587677097585735,
"grad_norm": 0.2255859375,
"learning_rate": 0.00020712384779002392,
"loss": 2.552,
"step": 6330
},
{
"epoch": 0.5596504391578762,
"grad_norm": 0.19140625,
"learning_rate": 0.00020643488117588199,
"loss": 2.5512,
"step": 6340
},
{
"epoch": 0.5605331685571788,
"grad_norm": 0.1845703125,
"learning_rate": 0.00020574625562391494,
"loss": 2.5546,
"step": 6350
},
{
"epoch": 0.5614158979564814,
"grad_norm": 0.1728515625,
"learning_rate": 0.00020505797652522751,
"loss": 2.5543,
"step": 6360
},
{
"epoch": 0.5622986273557841,
"grad_norm": 0.1796875,
"learning_rate": 0.00020437004926821255,
"loss": 2.5575,
"step": 6370
},
{
"epoch": 0.5631813567550867,
"grad_norm": 0.216796875,
"learning_rate": 0.00020368247923850826,
"loss": 2.5547,
"step": 6380
},
{
"epoch": 0.5640640861543894,
"grad_norm": 0.17578125,
"learning_rate": 0.00020299527181895602,
"loss": 2.5412,
"step": 6390
},
{
"epoch": 0.564946815553692,
"grad_norm": 0.2001953125,
"learning_rate": 0.00020230843238955854,
"loss": 2.544,
"step": 6400
},
{
"epoch": 0.5658295449529946,
"grad_norm": 0.1884765625,
"learning_rate": 0.0002016219663274377,
"loss": 2.5603,
"step": 6410
},
{
"epoch": 0.5667122743522973,
"grad_norm": 0.189453125,
"learning_rate": 0.00020093587900679217,
"loss": 2.5474,
"step": 6420
},
{
"epoch": 0.5675950037516,
"grad_norm": 0.1748046875,
"learning_rate": 0.00020025017579885563,
"loss": 2.565,
"step": 6430
},
{
"epoch": 0.5684777331509026,
"grad_norm": 0.197265625,
"learning_rate": 0.00019956486207185477,
"loss": 2.5528,
"step": 6440
},
{
"epoch": 0.5693604625502052,
"grad_norm": 0.1728515625,
"learning_rate": 0.0001988799431909668,
"loss": 2.5615,
"step": 6450
},
{
"epoch": 0.5702431919495079,
"grad_norm": 0.19140625,
"learning_rate": 0.00019819542451827808,
"loss": 2.5547,
"step": 6460
},
{
"epoch": 0.5711259213488106,
"grad_norm": 0.166015625,
"learning_rate": 0.00019751131141274147,
"loss": 2.5488,
"step": 6470
},
{
"epoch": 0.5720086507481131,
"grad_norm": 0.220703125,
"learning_rate": 0.0001968276092301352,
"loss": 2.5499,
"step": 6480
},
{
"epoch": 0.5728913801474158,
"grad_norm": 0.193359375,
"learning_rate": 0.00019614432332302006,
"loss": 2.5489,
"step": 6490
},
{
"epoch": 0.5737741095467185,
"grad_norm": 0.1728515625,
"learning_rate": 0.00019546145904069808,
"loss": 2.5497,
"step": 6500
},
{
"epoch": 0.574656838946021,
"grad_norm": 0.1806640625,
"learning_rate": 0.00019477902172917045,
"loss": 2.5487,
"step": 6510
},
{
"epoch": 0.5755395683453237,
"grad_norm": 0.19140625,
"learning_rate": 0.0001940970167310957,
"loss": 2.5668,
"step": 6520
},
{
"epoch": 0.5764222977446264,
"grad_norm": 0.1787109375,
"learning_rate": 0.0001934154493857479,
"loss": 2.5521,
"step": 6530
},
{
"epoch": 0.5773050271439291,
"grad_norm": 0.189453125,
"learning_rate": 0.0001927343250289747,
"loss": 2.5676,
"step": 6540
},
{
"epoch": 0.5781877565432316,
"grad_norm": 0.2255859375,
"learning_rate": 0.00019205364899315593,
"loss": 2.5402,
"step": 6550
},
{
"epoch": 0.5790704859425343,
"grad_norm": 0.1796875,
"learning_rate": 0.00019137342660716133,
"loss": 2.5538,
"step": 6560
},
{
"epoch": 0.579953215341837,
"grad_norm": 0.234375,
"learning_rate": 0.00019069366319630923,
"loss": 2.5536,
"step": 6570
},
{
"epoch": 0.5808359447411396,
"grad_norm": 0.1962890625,
"learning_rate": 0.00019001436408232496,
"loss": 2.5481,
"step": 6580
},
{
"epoch": 0.5817186741404422,
"grad_norm": 0.173828125,
"learning_rate": 0.00018933553458329856,
"loss": 2.5494,
"step": 6590
},
{
"epoch": 0.5826014035397449,
"grad_norm": 0.18359375,
"learning_rate": 0.00018865718001364375,
"loss": 2.5421,
"step": 6600
},
{
"epoch": 0.5834841329390476,
"grad_norm": 0.1865234375,
"learning_rate": 0.00018797930568405612,
"loss": 2.5504,
"step": 6610
},
{
"epoch": 0.5843668623383502,
"grad_norm": 0.197265625,
"learning_rate": 0.00018730191690147176,
"loss": 2.5459,
"step": 6620
},
{
"epoch": 0.5852495917376528,
"grad_norm": 0.208984375,
"learning_rate": 0.00018662501896902519,
"loss": 2.5339,
"step": 6630
},
{
"epoch": 0.5861323211369555,
"grad_norm": 0.1806640625,
"learning_rate": 0.0001859486171860082,
"loss": 2.5401,
"step": 6640
},
{
"epoch": 0.5870150505362581,
"grad_norm": 0.1953125,
"learning_rate": 0.00018527271684782865,
"loss": 2.5508,
"step": 6650
},
{
"epoch": 0.5878977799355608,
"grad_norm": 0.2109375,
"learning_rate": 0.00018459732324596834,
"loss": 2.555,
"step": 6660
},
{
"epoch": 0.5887805093348634,
"grad_norm": 0.1787109375,
"learning_rate": 0.0001839224416679421,
"loss": 2.5675,
"step": 6670
},
{
"epoch": 0.589663238734166,
"grad_norm": 0.1787109375,
"learning_rate": 0.00018324807739725614,
"loss": 2.5473,
"step": 6680
},
{
"epoch": 0.5905459681334687,
"grad_norm": 0.1845703125,
"learning_rate": 0.000182574235713367,
"loss": 2.5612,
"step": 6690
},
{
"epoch": 0.5914286975327713,
"grad_norm": 0.1884765625,
"learning_rate": 0.00018190092189163974,
"loss": 2.5791,
"step": 6700
},
{
"epoch": 0.592311426932074,
"grad_norm": 0.1962890625,
"learning_rate": 0.00018122814120330688,
"loss": 2.5439,
"step": 6710
},
{
"epoch": 0.5931941563313766,
"grad_norm": 0.1669921875,
"learning_rate": 0.00018055589891542758,
"loss": 2.5517,
"step": 6720
},
{
"epoch": 0.5940768857306793,
"grad_norm": 0.1728515625,
"learning_rate": 0.00017988420029084551,
"loss": 2.5437,
"step": 6730
},
{
"epoch": 0.5949596151299819,
"grad_norm": 0.18359375,
"learning_rate": 0.00017921305058814818,
"loss": 2.5537,
"step": 6740
},
{
"epoch": 0.5958423445292845,
"grad_norm": 0.1796875,
"learning_rate": 0.00017854245506162582,
"loss": 2.544,
"step": 6750
},
{
"epoch": 0.5967250739285872,
"grad_norm": 0.177734375,
"learning_rate": 0.00017787241896123024,
"loss": 2.5581,
"step": 6760
},
{
"epoch": 0.5976078033278899,
"grad_norm": 0.1728515625,
"learning_rate": 0.00017720294753253345,
"loss": 2.5579,
"step": 6770
},
{
"epoch": 0.5984905327271924,
"grad_norm": 0.185546875,
"learning_rate": 0.00017653404601668666,
"loss": 2.5429,
"step": 6780
},
{
"epoch": 0.5993732621264951,
"grad_norm": 0.18359375,
"learning_rate": 0.00017586571965037966,
"loss": 2.5569,
"step": 6790
},
{
"epoch": 0.6002559915257978,
"grad_norm": 0.185546875,
"learning_rate": 0.0001751979736657993,
"loss": 2.545,
"step": 6800
},
{
"epoch": 0.6011387209251005,
"grad_norm": 0.1748046875,
"learning_rate": 0.00017453081329058882,
"loss": 2.5456,
"step": 6810
},
{
"epoch": 0.602021450324403,
"grad_norm": 0.1708984375,
"learning_rate": 0.0001738642437478067,
"loss": 2.5416,
"step": 6820
},
{
"epoch": 0.6029041797237057,
"grad_norm": 0.1806640625,
"learning_rate": 0.00017319827025588614,
"loss": 2.5233,
"step": 6830
},
{
"epoch": 0.6037869091230084,
"grad_norm": 0.1904296875,
"learning_rate": 0.0001725328980285939,
"loss": 2.5527,
"step": 6840
},
{
"epoch": 0.604669638522311,
"grad_norm": 0.1669921875,
"learning_rate": 0.00017186813227498937,
"loss": 2.55,
"step": 6850
},
{
"epoch": 0.6055523679216136,
"grad_norm": 0.1728515625,
"learning_rate": 0.0001712039781993844,
"loss": 2.5464,
"step": 6860
},
{
"epoch": 0.6064350973209163,
"grad_norm": 0.1904296875,
"learning_rate": 0.00017054044100130178,
"loss": 2.5457,
"step": 6870
},
{
"epoch": 0.607317826720219,
"grad_norm": 0.1806640625,
"learning_rate": 0.0001698775258754351,
"loss": 2.551,
"step": 6880
},
{
"epoch": 0.6082005561195215,
"grad_norm": 0.1943359375,
"learning_rate": 0.00016921523801160756,
"loss": 2.5549,
"step": 6890
},
{
"epoch": 0.6090832855188242,
"grad_norm": 0.224609375,
"learning_rate": 0.00016855358259473217,
"loss": 2.5485,
"step": 6900
},
{
"epoch": 0.6099660149181269,
"grad_norm": 0.20703125,
"learning_rate": 0.00016789256480477023,
"loss": 2.5402,
"step": 6910
},
{
"epoch": 0.6108487443174295,
"grad_norm": 0.1904296875,
"learning_rate": 0.00016723218981669127,
"loss": 2.5418,
"step": 6920
},
{
"epoch": 0.6117314737167321,
"grad_norm": 0.16015625,
"learning_rate": 0.00016657246280043266,
"loss": 2.5591,
"step": 6930
},
{
"epoch": 0.6126142031160348,
"grad_norm": 0.1748046875,
"learning_rate": 0.00016591338892085874,
"loss": 2.5536,
"step": 6940
},
{
"epoch": 0.6134969325153374,
"grad_norm": 0.169921875,
"learning_rate": 0.0001652549733377206,
"loss": 2.5456,
"step": 6950
},
{
"epoch": 0.6143796619146401,
"grad_norm": 0.1826171875,
"learning_rate": 0.00016459722120561567,
"loss": 2.5326,
"step": 6960
},
{
"epoch": 0.6152623913139427,
"grad_norm": 0.1708984375,
"learning_rate": 0.0001639401376739475,
"loss": 2.5623,
"step": 6970
},
{
"epoch": 0.6161451207132453,
"grad_norm": 0.177734375,
"learning_rate": 0.0001632837278868851,
"loss": 2.5383,
"step": 6980
},
{
"epoch": 0.617027850112548,
"grad_norm": 0.171875,
"learning_rate": 0.00016262799698332292,
"loss": 2.5386,
"step": 6990
},
{
"epoch": 0.6179105795118507,
"grad_norm": 0.1884765625,
"learning_rate": 0.00016197295009684077,
"loss": 2.5427,
"step": 7000
},
{
"epoch": 0.6187933089111533,
"grad_norm": 0.1650390625,
"learning_rate": 0.00016131859235566325,
"loss": 2.541,
"step": 7010
},
{
"epoch": 0.6196760383104559,
"grad_norm": 0.171875,
"learning_rate": 0.00016066492888261983,
"loss": 2.5609,
"step": 7020
},
{
"epoch": 0.6205587677097586,
"grad_norm": 0.181640625,
"learning_rate": 0.00016001196479510448,
"loss": 2.5601,
"step": 7030
},
{
"epoch": 0.6214414971090613,
"grad_norm": 0.1787109375,
"learning_rate": 0.00015935970520503638,
"loss": 2.5552,
"step": 7040
},
{
"epoch": 0.6223242265083638,
"grad_norm": 0.185546875,
"learning_rate": 0.0001587081552188188,
"loss": 2.5498,
"step": 7050
},
{
"epoch": 0.6232069559076665,
"grad_norm": 0.162109375,
"learning_rate": 0.0001580573199372999,
"loss": 2.5479,
"step": 7060
},
{
"epoch": 0.6240896853069692,
"grad_norm": 0.18359375,
"learning_rate": 0.00015740720445573262,
"loss": 2.5488,
"step": 7070
},
{
"epoch": 0.6249724147062717,
"grad_norm": 0.19140625,
"learning_rate": 0.00015675781386373462,
"loss": 2.5478,
"step": 7080
},
{
"epoch": 0.6258551441055744,
"grad_norm": 0.177734375,
"learning_rate": 0.0001561091532452486,
"loss": 2.5579,
"step": 7090
},
{
"epoch": 0.6267378735048771,
"grad_norm": 0.17578125,
"learning_rate": 0.00015546122767850232,
"loss": 2.5543,
"step": 7100
},
{
"epoch": 0.6276206029041798,
"grad_norm": 0.189453125,
"learning_rate": 0.00015481404223596939,
"loss": 2.559,
"step": 7110
},
{
"epoch": 0.6285033323034823,
"grad_norm": 0.1875,
"learning_rate": 0.0001541676019843286,
"loss": 2.549,
"step": 7120
},
{
"epoch": 0.629386061702785,
"grad_norm": 0.1875,
"learning_rate": 0.00015352191198442507,
"loss": 2.5372,
"step": 7130
},
{
"epoch": 0.6302687911020877,
"grad_norm": 0.1630859375,
"learning_rate": 0.00015287697729123045,
"loss": 2.5458,
"step": 7140
},
{
"epoch": 0.6311515205013903,
"grad_norm": 0.1650390625,
"learning_rate": 0.0001522328029538031,
"loss": 2.5545,
"step": 7150
},
{
"epoch": 0.6320342499006929,
"grad_norm": 0.1630859375,
"learning_rate": 0.00015158939401524877,
"loss": 2.5564,
"step": 7160
},
{
"epoch": 0.6329169792999956,
"grad_norm": 0.19921875,
"learning_rate": 0.00015094675551268096,
"loss": 2.5528,
"step": 7170
},
{
"epoch": 0.6337997086992982,
"grad_norm": 0.1806640625,
"learning_rate": 0.00015030489247718173,
"loss": 2.5414,
"step": 7180
},
{
"epoch": 0.6346824380986009,
"grad_norm": 0.173828125,
"learning_rate": 0.00014966380993376217,
"loss": 2.5522,
"step": 7190
},
{
"epoch": 0.6355651674979035,
"grad_norm": 0.21484375,
"learning_rate": 0.0001490235129013228,
"loss": 2.5521,
"step": 7200
},
{
"epoch": 0.6364478968972062,
"grad_norm": 0.1982421875,
"learning_rate": 0.00014838400639261503,
"loss": 2.5627,
"step": 7210
},
{
"epoch": 0.6373306262965088,
"grad_norm": 0.2138671875,
"learning_rate": 0.000147745295414201,
"loss": 2.5546,
"step": 7220
},
{
"epoch": 0.6382133556958115,
"grad_norm": 0.1943359375,
"learning_rate": 0.00014710738496641492,
"loss": 2.5284,
"step": 7230
},
{
"epoch": 0.6390960850951141,
"grad_norm": 0.1748046875,
"learning_rate": 0.0001464702800433238,
"loss": 2.5326,
"step": 7240
},
{
"epoch": 0.6399788144944167,
"grad_norm": 0.16796875,
"learning_rate": 0.00014583398563268858,
"loss": 2.5522,
"step": 7250
},
{
"epoch": 0.6408615438937194,
"grad_norm": 0.166015625,
"learning_rate": 0.00014519850671592467,
"loss": 2.5589,
"step": 7260
},
{
"epoch": 0.641744273293022,
"grad_norm": 0.1728515625,
"learning_rate": 0.000144563848268063,
"loss": 2.5653,
"step": 7270
},
{
"epoch": 0.6426270026923246,
"grad_norm": 0.171875,
"learning_rate": 0.00014393001525771153,
"loss": 2.55,
"step": 7280
},
{
"epoch": 0.6435097320916273,
"grad_norm": 0.201171875,
"learning_rate": 0.00014329701264701597,
"loss": 2.5498,
"step": 7290
},
{
"epoch": 0.64439246149093,
"grad_norm": 0.1689453125,
"learning_rate": 0.0001426648453916208,
"loss": 2.545,
"step": 7300
},
{
"epoch": 0.6452751908902326,
"grad_norm": 0.1806640625,
"learning_rate": 0.00014203351844063088,
"loss": 2.537,
"step": 7310
},
{
"epoch": 0.6461579202895352,
"grad_norm": 0.1904296875,
"learning_rate": 0.0001414030367365725,
"loss": 2.5452,
"step": 7320
},
{
"epoch": 0.6470406496888379,
"grad_norm": 0.2119140625,
"learning_rate": 0.00014077340521535472,
"loss": 2.5548,
"step": 7330
},
{
"epoch": 0.6479233790881406,
"grad_norm": 0.20703125,
"learning_rate": 0.00014014462880623042,
"loss": 2.5404,
"step": 7340
},
{
"epoch": 0.6488061084874431,
"grad_norm": 0.1708984375,
"learning_rate": 0.00013951671243175824,
"loss": 2.5443,
"step": 7350
},
{
"epoch": 0.6496888378867458,
"grad_norm": 0.166015625,
"learning_rate": 0.00013888966100776386,
"loss": 2.5506,
"step": 7360
},
{
"epoch": 0.6505715672860485,
"grad_norm": 0.2158203125,
"learning_rate": 0.00013826347944330116,
"loss": 2.5296,
"step": 7370
},
{
"epoch": 0.6514542966853512,
"grad_norm": 0.158203125,
"learning_rate": 0.00013763817264061425,
"loss": 2.5591,
"step": 7380
},
{
"epoch": 0.6523370260846537,
"grad_norm": 0.1826171875,
"learning_rate": 0.00013701374549509899,
"loss": 2.5541,
"step": 7390
},
{
"epoch": 0.6532197554839564,
"grad_norm": 0.185546875,
"learning_rate": 0.00013639020289526438,
"loss": 2.5624,
"step": 7400
},
{
"epoch": 0.6541024848832591,
"grad_norm": 0.171875,
"learning_rate": 0.00013576754972269463,
"loss": 2.5578,
"step": 7410
},
{
"epoch": 0.6549852142825617,
"grad_norm": 0.1884765625,
"learning_rate": 0.0001351457908520109,
"loss": 2.5454,
"step": 7420
},
{
"epoch": 0.6558679436818643,
"grad_norm": 0.1591796875,
"learning_rate": 0.0001345249311508328,
"loss": 2.5486,
"step": 7430
},
{
"epoch": 0.656750673081167,
"grad_norm": 0.1748046875,
"learning_rate": 0.00013390497547974078,
"loss": 2.5484,
"step": 7440
},
{
"epoch": 0.6576334024804696,
"grad_norm": 0.1572265625,
"learning_rate": 0.00013328592869223747,
"loss": 2.5486,
"step": 7450
},
{
"epoch": 0.6585161318797722,
"grad_norm": 0.166015625,
"learning_rate": 0.00013266779563471064,
"loss": 2.5437,
"step": 7460
},
{
"epoch": 0.6593988612790749,
"grad_norm": 0.1708984375,
"learning_rate": 0.00013205058114639407,
"loss": 2.5521,
"step": 7470
},
{
"epoch": 0.6602815906783776,
"grad_norm": 0.1982421875,
"learning_rate": 0.00013143429005933052,
"loss": 2.5482,
"step": 7480
},
{
"epoch": 0.6611643200776802,
"grad_norm": 0.16015625,
"learning_rate": 0.00013081892719833378,
"loss": 2.5343,
"step": 7490
},
{
"epoch": 0.6620470494769828,
"grad_norm": 0.1767578125,
"learning_rate": 0.0001302044973809503,
"loss": 2.5493,
"step": 7500
},
{
"epoch": 0.6629297788762855,
"grad_norm": 0.173828125,
"learning_rate": 0.00012959100541742248,
"loss": 2.5553,
"step": 7510
},
{
"epoch": 0.6638125082755881,
"grad_norm": 0.1865234375,
"learning_rate": 0.0001289784561106499,
"loss": 2.5531,
"step": 7520
},
{
"epoch": 0.6646952376748908,
"grad_norm": 0.1953125,
"learning_rate": 0.00012836685425615275,
"loss": 2.5634,
"step": 7530
},
{
"epoch": 0.6655779670741934,
"grad_norm": 0.1748046875,
"learning_rate": 0.00012775620464203365,
"loss": 2.547,
"step": 7540
},
{
"epoch": 0.666460696473496,
"grad_norm": 0.162109375,
"learning_rate": 0.0001271465120489401,
"loss": 2.54,
"step": 7550
},
{
"epoch": 0.6673434258727987,
"grad_norm": 0.1826171875,
"learning_rate": 0.0001265377812500278,
"loss": 2.548,
"step": 7560
},
{
"epoch": 0.6682261552721014,
"grad_norm": 0.166015625,
"learning_rate": 0.00012593001701092233,
"loss": 2.547,
"step": 7570
},
{
"epoch": 0.669108884671404,
"grad_norm": 0.1630859375,
"learning_rate": 0.00012532322408968221,
"loss": 2.5431,
"step": 7580
},
{
"epoch": 0.6699916140707066,
"grad_norm": 0.197265625,
"learning_rate": 0.00012471740723676213,
"loss": 2.5517,
"step": 7590
},
{
"epoch": 0.6708743434700093,
"grad_norm": 0.17578125,
"learning_rate": 0.000124112571194975,
"loss": 2.5473,
"step": 7600
},
{
"epoch": 0.671757072869312,
"grad_norm": 0.171875,
"learning_rate": 0.00012350872069945547,
"loss": 2.5503,
"step": 7610
},
{
"epoch": 0.6726398022686145,
"grad_norm": 0.16796875,
"learning_rate": 0.00012290586047762216,
"loss": 2.547,
"step": 7620
},
{
"epoch": 0.6735225316679172,
"grad_norm": 0.1640625,
"learning_rate": 0.00012230399524914136,
"loss": 2.5385,
"step": 7630
},
{
"epoch": 0.6744052610672199,
"grad_norm": 0.169921875,
"learning_rate": 0.00012170312972588974,
"loss": 2.5363,
"step": 7640
},
{
"epoch": 0.6752879904665224,
"grad_norm": 0.1640625,
"learning_rate": 0.00012110326861191722,
"loss": 2.5413,
"step": 7650
},
{
"epoch": 0.6761707198658251,
"grad_norm": 0.177734375,
"learning_rate": 0.00012050441660341074,
"loss": 2.5474,
"step": 7660
},
{
"epoch": 0.6770534492651278,
"grad_norm": 0.1728515625,
"learning_rate": 0.00011990657838865706,
"loss": 2.5413,
"step": 7670
},
{
"epoch": 0.6779361786644305,
"grad_norm": 0.1650390625,
"learning_rate": 0.00011930975864800603,
"loss": 2.5438,
"step": 7680
},
{
"epoch": 0.678818908063733,
"grad_norm": 0.1640625,
"learning_rate": 0.0001187139620538342,
"loss": 2.5575,
"step": 7690
},
{
"epoch": 0.6797016374630357,
"grad_norm": 0.1767578125,
"learning_rate": 0.0001181191932705081,
"loss": 2.5511,
"step": 7700
},
{
"epoch": 0.6805843668623384,
"grad_norm": 0.1669921875,
"learning_rate": 0.00011752545695434788,
"loss": 2.5575,
"step": 7710
},
{
"epoch": 0.681467096261641,
"grad_norm": 0.1767578125,
"learning_rate": 0.00011693275775359049,
"loss": 2.5661,
"step": 7720
},
{
"epoch": 0.6823498256609436,
"grad_norm": 0.1669921875,
"learning_rate": 0.00011634110030835341,
"loss": 2.5405,
"step": 7730
},
{
"epoch": 0.6832325550602463,
"grad_norm": 0.1708984375,
"learning_rate": 0.000115750489250599,
"loss": 2.5429,
"step": 7740
},
{
"epoch": 0.684115284459549,
"grad_norm": 0.169921875,
"learning_rate": 0.00011516092920409706,
"loss": 2.5527,
"step": 7750
},
{
"epoch": 0.6849980138588516,
"grad_norm": 0.154296875,
"learning_rate": 0.00011457242478438962,
"loss": 2.5431,
"step": 7760
},
{
"epoch": 0.6858807432581542,
"grad_norm": 0.158203125,
"learning_rate": 0.00011398498059875434,
"loss": 2.5475,
"step": 7770
},
{
"epoch": 0.6867634726574569,
"grad_norm": 0.1728515625,
"learning_rate": 0.00011339860124616833,
"loss": 2.5277,
"step": 7780
},
{
"epoch": 0.6876462020567595,
"grad_norm": 0.1689453125,
"learning_rate": 0.00011281329131727272,
"loss": 2.5447,
"step": 7790
},
{
"epoch": 0.6885289314560622,
"grad_norm": 0.1728515625,
"learning_rate": 0.00011222905539433593,
"loss": 2.5402,
"step": 7800
},
{
"epoch": 0.6894116608553648,
"grad_norm": 0.1572265625,
"learning_rate": 0.00011164589805121852,
"loss": 2.5401,
"step": 7810
},
{
"epoch": 0.6902943902546674,
"grad_norm": 0.1611328125,
"learning_rate": 0.00011106382385333708,
"loss": 2.5293,
"step": 7820
},
{
"epoch": 0.6911771196539701,
"grad_norm": 0.2001953125,
"learning_rate": 0.00011048283735762806,
"loss": 2.5591,
"step": 7830
},
{
"epoch": 0.6920598490532727,
"grad_norm": 0.1513671875,
"learning_rate": 0.00010990294311251328,
"loss": 2.5501,
"step": 7840
},
{
"epoch": 0.6929425784525753,
"grad_norm": 0.177734375,
"learning_rate": 0.00010932414565786286,
"loss": 2.5488,
"step": 7850
},
{
"epoch": 0.693825307851878,
"grad_norm": 0.1708984375,
"learning_rate": 0.0001087464495249606,
"loss": 2.5563,
"step": 7860
},
{
"epoch": 0.6947080372511807,
"grad_norm": 0.1572265625,
"learning_rate": 0.00010816985923646838,
"loss": 2.5468,
"step": 7870
},
{
"epoch": 0.6955907666504832,
"grad_norm": 0.16796875,
"learning_rate": 0.00010759437930639058,
"loss": 2.5426,
"step": 7880
},
{
"epoch": 0.6964734960497859,
"grad_norm": 0.1630859375,
"learning_rate": 0.00010702001424003896,
"loss": 2.5377,
"step": 7890
},
{
"epoch": 0.6973562254490886,
"grad_norm": 0.15234375,
"learning_rate": 0.00010644676853399688,
"loss": 2.5323,
"step": 7900
},
{
"epoch": 0.6982389548483913,
"grad_norm": 0.1708984375,
"learning_rate": 0.00010587464667608484,
"loss": 2.5584,
"step": 7910
},
{
"epoch": 0.6991216842476938,
"grad_norm": 0.181640625,
"learning_rate": 0.00010530365314532488,
"loss": 2.5627,
"step": 7920
},
{
"epoch": 0.7000044136469965,
"grad_norm": 0.18359375,
"learning_rate": 0.00010473379241190542,
"loss": 2.5529,
"step": 7930
},
{
"epoch": 0.7008871430462992,
"grad_norm": 0.162109375,
"learning_rate": 0.00010416506893714662,
"loss": 2.5464,
"step": 7940
},
{
"epoch": 0.7017698724456018,
"grad_norm": 0.1767578125,
"learning_rate": 0.00010359748717346534,
"loss": 2.54,
"step": 7950
},
{
"epoch": 0.7026526018449044,
"grad_norm": 0.1611328125,
"learning_rate": 0.00010303105156433998,
"loss": 2.5576,
"step": 7960
},
{
"epoch": 0.7035353312442071,
"grad_norm": 0.318359375,
"learning_rate": 0.00010246576654427611,
"loss": 2.5533,
"step": 7970
},
{
"epoch": 0.7044180606435098,
"grad_norm": 0.15625,
"learning_rate": 0.0001019016365387716,
"loss": 2.5419,
"step": 7980
},
{
"epoch": 0.7053007900428124,
"grad_norm": 0.1640625,
"learning_rate": 0.00010133866596428196,
"loss": 2.549,
"step": 7990
},
{
"epoch": 0.706183519442115,
"grad_norm": 0.1552734375,
"learning_rate": 0.0001007768592281856,
"loss": 2.5558,
"step": 8000
},
{
"epoch": 0.706183519442115,
"eval_accuracy": 0.5025226345981063,
"eval_loss": 2.4390876293182373,
"eval_runtime": 7.0517,
"eval_samples_per_second": 45.095,
"eval_steps_per_second": 0.425,
"step": 8000
},
{
"epoch": 0.7070662488414177,
"grad_norm": 0.185546875,
"learning_rate": 0.00010021622072874948,
"loss": 2.5533,
"step": 8010
},
{
"epoch": 0.7079489782407203,
"grad_norm": 0.1572265625,
"learning_rate": 9.965675485509504e-05,
"loss": 2.5469,
"step": 8020
},
{
"epoch": 0.7088317076400229,
"grad_norm": 0.1640625,
"learning_rate": 9.909846598716302e-05,
"loss": 2.5456,
"step": 8030
},
{
"epoch": 0.7097144370393256,
"grad_norm": 0.158203125,
"learning_rate": 9.854135849567988e-05,
"loss": 2.5486,
"step": 8040
},
{
"epoch": 0.7105971664386282,
"grad_norm": 0.162109375,
"learning_rate": 9.79854367421234e-05,
"loss": 2.5466,
"step": 8050
},
{
"epoch": 0.7114798958379309,
"grad_norm": 0.15625,
"learning_rate": 9.743070507868818e-05,
"loss": 2.5508,
"step": 8060
},
{
"epoch": 0.7123626252372335,
"grad_norm": 0.1630859375,
"learning_rate": 9.687716784825218e-05,
"loss": 2.5515,
"step": 8070
},
{
"epoch": 0.7132453546365362,
"grad_norm": 0.1982421875,
"learning_rate": 9.632482938434197e-05,
"loss": 2.5433,
"step": 8080
},
{
"epoch": 0.7141280840358388,
"grad_norm": 0.17578125,
"learning_rate": 9.577369401109987e-05,
"loss": 2.5499,
"step": 8090
},
{
"epoch": 0.7150108134351415,
"grad_norm": 0.1484375,
"learning_rate": 9.522376604324889e-05,
"loss": 2.5531,
"step": 8100
},
{
"epoch": 0.7158935428344441,
"grad_norm": 0.166015625,
"learning_rate": 9.467504978605956e-05,
"loss": 2.5524,
"step": 8110
},
{
"epoch": 0.7167762722337467,
"grad_norm": 0.1533203125,
"learning_rate": 9.412754953531663e-05,
"loss": 2.5444,
"step": 8120
},
{
"epoch": 0.7176590016330494,
"grad_norm": 0.15234375,
"learning_rate": 9.35812695772845e-05,
"loss": 2.5384,
"step": 8130
},
{
"epoch": 0.7185417310323521,
"grad_norm": 0.1591796875,
"learning_rate": 9.303621418867444e-05,
"loss": 2.5473,
"step": 8140
},
{
"epoch": 0.7194244604316546,
"grad_norm": 0.1640625,
"learning_rate": 9.24923876366106e-05,
"loss": 2.5543,
"step": 8150
},
{
"epoch": 0.7203071898309573,
"grad_norm": 0.1513671875,
"learning_rate": 9.194979417859705e-05,
"loss": 2.5362,
"step": 8160
},
{
"epoch": 0.72118991923026,
"grad_norm": 0.150390625,
"learning_rate": 9.14084380624842e-05,
"loss": 2.5362,
"step": 8170
},
{
"epoch": 0.7220726486295626,
"grad_norm": 0.15234375,
"learning_rate": 9.086832352643535e-05,
"loss": 2.5472,
"step": 8180
},
{
"epoch": 0.7229553780288652,
"grad_norm": 0.1591796875,
"learning_rate": 9.032945479889391e-05,
"loss": 2.5464,
"step": 8190
},
{
"epoch": 0.7238381074281679,
"grad_norm": 0.1533203125,
"learning_rate": 8.979183609855024e-05,
"loss": 2.5572,
"step": 8200
},
{
"epoch": 0.7247208368274706,
"grad_norm": 0.15625,
"learning_rate": 8.925547163430812e-05,
"loss": 2.5419,
"step": 8210
},
{
"epoch": 0.7256035662267731,
"grad_norm": 0.158203125,
"learning_rate": 8.872036560525254e-05,
"loss": 2.5313,
"step": 8220
},
{
"epoch": 0.7264862956260758,
"grad_norm": 0.162109375,
"learning_rate": 8.818652220061638e-05,
"loss": 2.5315,
"step": 8230
},
{
"epoch": 0.7273690250253785,
"grad_norm": 0.1669921875,
"learning_rate": 8.76539455997475e-05,
"loss": 2.549,
"step": 8240
},
{
"epoch": 0.7282517544246812,
"grad_norm": 0.1572265625,
"learning_rate": 8.71226399720764e-05,
"loss": 2.5549,
"step": 8250
},
{
"epoch": 0.7291344838239837,
"grad_norm": 0.15625,
"learning_rate": 8.659260947708344e-05,
"loss": 2.5558,
"step": 8260
},
{
"epoch": 0.7300172132232864,
"grad_norm": 0.1591796875,
"learning_rate": 8.606385826426621e-05,
"loss": 2.5501,
"step": 8270
},
{
"epoch": 0.7308999426225891,
"grad_norm": 0.1630859375,
"learning_rate": 8.553639047310685e-05,
"loss": 2.5546,
"step": 8280
},
{
"epoch": 0.7317826720218917,
"grad_norm": 0.15234375,
"learning_rate": 8.50102102330401e-05,
"loss": 2.5545,
"step": 8290
},
{
"epoch": 0.7326654014211943,
"grad_norm": 0.171875,
"learning_rate": 8.448532166342077e-05,
"loss": 2.5349,
"step": 8300
},
{
"epoch": 0.733548130820497,
"grad_norm": 0.1494140625,
"learning_rate": 8.396172887349115e-05,
"loss": 2.5466,
"step": 8310
},
{
"epoch": 0.7344308602197996,
"grad_norm": 0.15625,
"learning_rate": 8.343943596234943e-05,
"loss": 2.5521,
"step": 8320
},
{
"epoch": 0.7353135896191023,
"grad_norm": 0.1689453125,
"learning_rate": 8.291844701891732e-05,
"loss": 2.5412,
"step": 8330
},
{
"epoch": 0.7361963190184049,
"grad_norm": 0.146484375,
"learning_rate": 8.239876612190778e-05,
"loss": 2.5424,
"step": 8340
},
{
"epoch": 0.7370790484177075,
"grad_norm": 0.1533203125,
"learning_rate": 8.188039733979366e-05,
"loss": 2.5543,
"step": 8350
},
{
"epoch": 0.7379617778170102,
"grad_norm": 0.1591796875,
"learning_rate": 8.136334473077519e-05,
"loss": 2.5527,
"step": 8360
},
{
"epoch": 0.7388445072163128,
"grad_norm": 0.15625,
"learning_rate": 8.084761234274906e-05,
"loss": 2.5302,
"step": 8370
},
{
"epoch": 0.7397272366156155,
"grad_norm": 0.17578125,
"learning_rate": 8.033320421327578e-05,
"loss": 2.5411,
"step": 8380
},
{
"epoch": 0.7406099660149181,
"grad_norm": 0.1591796875,
"learning_rate": 7.982012436954849e-05,
"loss": 2.5302,
"step": 8390
},
{
"epoch": 0.7414926954142208,
"grad_norm": 0.1689453125,
"learning_rate": 7.930837682836195e-05,
"loss": 2.549,
"step": 8400
},
{
"epoch": 0.7423754248135234,
"grad_norm": 0.1572265625,
"learning_rate": 7.87979655960801e-05,
"loss": 2.5501,
"step": 8410
},
{
"epoch": 0.743258154212826,
"grad_norm": 0.1591796875,
"learning_rate": 7.828889466860551e-05,
"loss": 2.5477,
"step": 8420
},
{
"epoch": 0.7441408836121287,
"grad_norm": 0.158203125,
"learning_rate": 7.77811680313475e-05,
"loss": 2.5561,
"step": 8430
},
{
"epoch": 0.7450236130114314,
"grad_norm": 0.1474609375,
"learning_rate": 7.727478965919144e-05,
"loss": 2.5498,
"step": 8440
},
{
"epoch": 0.745906342410734,
"grad_norm": 0.1669921875,
"learning_rate": 7.67697635164675e-05,
"loss": 2.5422,
"step": 8450
},
{
"epoch": 0.7467890718100366,
"grad_norm": 0.171875,
"learning_rate": 7.626609355691922e-05,
"loss": 2.5452,
"step": 8460
},
{
"epoch": 0.7476718012093393,
"grad_norm": 0.1513671875,
"learning_rate": 7.576378372367306e-05,
"loss": 2.5422,
"step": 8470
},
{
"epoch": 0.748554530608642,
"grad_norm": 0.1533203125,
"learning_rate": 7.52628379492075e-05,
"loss": 2.5423,
"step": 8480
},
{
"epoch": 0.7494372600079445,
"grad_norm": 0.150390625,
"learning_rate": 7.476326015532162e-05,
"loss": 2.5439,
"step": 8490
},
{
"epoch": 0.7503199894072472,
"grad_norm": 0.1591796875,
"learning_rate": 7.426505425310531e-05,
"loss": 2.5584,
"step": 8500
},
{
"epoch": 0.7512027188065499,
"grad_norm": 0.150390625,
"learning_rate": 7.376822414290804e-05,
"loss": 2.5494,
"step": 8510
},
{
"epoch": 0.7520854482058525,
"grad_norm": 0.16015625,
"learning_rate": 7.327277371430858e-05,
"loss": 2.5476,
"step": 8520
},
{
"epoch": 0.7529681776051551,
"grad_norm": 0.154296875,
"learning_rate": 7.27787068460842e-05,
"loss": 2.5534,
"step": 8530
},
{
"epoch": 0.7538509070044578,
"grad_norm": 0.1494140625,
"learning_rate": 7.228602740618085e-05,
"loss": 2.5516,
"step": 8540
},
{
"epoch": 0.7547336364037605,
"grad_norm": 0.1552734375,
"learning_rate": 7.179473925168256e-05,
"loss": 2.5482,
"step": 8550
},
{
"epoch": 0.755616365803063,
"grad_norm": 0.1630859375,
"learning_rate": 7.130484622878108e-05,
"loss": 2.5597,
"step": 8560
},
{
"epoch": 0.7564990952023657,
"grad_norm": 0.1611328125,
"learning_rate": 7.081635217274617e-05,
"loss": 2.5501,
"step": 8570
},
{
"epoch": 0.7573818246016684,
"grad_norm": 0.166015625,
"learning_rate": 7.032926090789537e-05,
"loss": 2.5453,
"step": 8580
},
{
"epoch": 0.758264554000971,
"grad_norm": 0.1494140625,
"learning_rate": 6.984357624756388e-05,
"loss": 2.5454,
"step": 8590
},
{
"epoch": 0.7591472834002736,
"grad_norm": 0.146484375,
"learning_rate": 6.935930199407501e-05,
"loss": 2.5486,
"step": 8600
},
{
"epoch": 0.7600300127995763,
"grad_norm": 0.16796875,
"learning_rate": 6.887644193871042e-05,
"loss": 2.5446,
"step": 8610
},
{
"epoch": 0.7609127421988789,
"grad_norm": 0.1494140625,
"learning_rate": 6.839499986167999e-05,
"loss": 2.5639,
"step": 8620
},
{
"epoch": 0.7617954715981816,
"grad_norm": 0.1494140625,
"learning_rate": 6.791497953209289e-05,
"loss": 2.5376,
"step": 8630
},
{
"epoch": 0.7626782009974842,
"grad_norm": 0.146484375,
"learning_rate": 6.743638470792735e-05,
"loss": 2.5355,
"step": 8640
},
{
"epoch": 0.7635609303967869,
"grad_norm": 0.15234375,
"learning_rate": 6.695921913600212e-05,
"loss": 2.5469,
"step": 8650
},
{
"epoch": 0.7644436597960895,
"grad_norm": 0.150390625,
"learning_rate": 6.648348655194613e-05,
"loss": 2.5516,
"step": 8660
},
{
"epoch": 0.7653263891953922,
"grad_norm": 0.14453125,
"learning_rate": 6.600919068017006e-05,
"loss": 2.538,
"step": 8670
},
{
"epoch": 0.7662091185946948,
"grad_norm": 0.1474609375,
"learning_rate": 6.553633523383682e-05,
"loss": 2.5491,
"step": 8680
},
{
"epoch": 0.7670918479939974,
"grad_norm": 0.1474609375,
"learning_rate": 6.506492391483232e-05,
"loss": 2.5383,
"step": 8690
},
{
"epoch": 0.7679745773933001,
"grad_norm": 0.150390625,
"learning_rate": 6.459496041373708e-05,
"loss": 2.5425,
"step": 8700
},
{
"epoch": 0.7688573067926028,
"grad_norm": 0.1474609375,
"learning_rate": 6.412644840979656e-05,
"loss": 2.5525,
"step": 8710
},
{
"epoch": 0.7697400361919053,
"grad_norm": 0.15234375,
"learning_rate": 6.365939157089304e-05,
"loss": 2.5425,
"step": 8720
},
{
"epoch": 0.770622765591208,
"grad_norm": 0.1533203125,
"learning_rate": 6.319379355351653e-05,
"loss": 2.5293,
"step": 8730
},
{
"epoch": 0.7715054949905107,
"grad_norm": 0.15234375,
"learning_rate": 6.272965800273608e-05,
"loss": 2.5375,
"step": 8740
},
{
"epoch": 0.7723882243898132,
"grad_norm": 0.1552734375,
"learning_rate": 6.226698855217178e-05,
"loss": 2.5502,
"step": 8750
},
{
"epoch": 0.7732709537891159,
"grad_norm": 0.169921875,
"learning_rate": 6.180578882396556e-05,
"loss": 2.5518,
"step": 8760
},
{
"epoch": 0.7741536831884186,
"grad_norm": 0.1533203125,
"learning_rate": 6.134606242875324e-05,
"loss": 2.5396,
"step": 8770
},
{
"epoch": 0.7750364125877213,
"grad_norm": 0.150390625,
"learning_rate": 6.088781296563636e-05,
"loss": 2.5522,
"step": 8780
},
{
"epoch": 0.7759191419870238,
"grad_norm": 0.1650390625,
"learning_rate": 6.043104402215388e-05,
"loss": 2.5597,
"step": 8790
},
{
"epoch": 0.7768018713863265,
"grad_norm": 0.158203125,
"learning_rate": 5.9975759174254075e-05,
"loss": 2.5519,
"step": 8800
},
{
"epoch": 0.7776846007856292,
"grad_norm": 0.166015625,
"learning_rate": 5.952196198626633e-05,
"loss": 2.5654,
"step": 8810
},
{
"epoch": 0.7785673301849318,
"grad_norm": 0.1533203125,
"learning_rate": 5.906965601087369e-05,
"loss": 2.5543,
"step": 8820
},
{
"epoch": 0.7794500595842344,
"grad_norm": 0.150390625,
"learning_rate": 5.861884478908483e-05,
"loss": 2.5422,
"step": 8830
},
{
"epoch": 0.7803327889835371,
"grad_norm": 0.1533203125,
"learning_rate": 5.816953185020607e-05,
"loss": 2.5479,
"step": 8840
},
{
"epoch": 0.7812155183828398,
"grad_norm": 0.154296875,
"learning_rate": 5.7721720711814195e-05,
"loss": 2.5471,
"step": 8850
},
{
"epoch": 0.7820982477821424,
"grad_norm": 0.146484375,
"learning_rate": 5.727541487972876e-05,
"loss": 2.5383,
"step": 8860
},
{
"epoch": 0.782980977181445,
"grad_norm": 0.154296875,
"learning_rate": 5.68306178479843e-05,
"loss": 2.54,
"step": 8870
},
{
"epoch": 0.7838637065807477,
"grad_norm": 0.16015625,
"learning_rate": 5.638733309880353e-05,
"loss": 2.5504,
"step": 8880
},
{
"epoch": 0.7847464359800503,
"grad_norm": 0.1591796875,
"learning_rate": 5.5945564102569764e-05,
"loss": 2.5533,
"step": 8890
},
{
"epoch": 0.785629165379353,
"grad_norm": 0.1513671875,
"learning_rate": 5.550531431779984e-05,
"loss": 2.5376,
"step": 8900
},
{
"epoch": 0.7865118947786556,
"grad_norm": 0.162109375,
"learning_rate": 5.50665871911169e-05,
"loss": 2.5491,
"step": 8910
},
{
"epoch": 0.7873946241779582,
"grad_norm": 0.162109375,
"learning_rate": 5.4629386157223434e-05,
"loss": 2.533,
"step": 8920
},
{
"epoch": 0.7882773535772609,
"grad_norm": 0.15625,
"learning_rate": 5.4193714638874845e-05,
"loss": 2.5541,
"step": 8930
},
{
"epoch": 0.7891600829765635,
"grad_norm": 0.16015625,
"learning_rate": 5.375957604685186e-05,
"loss": 2.5261,
"step": 8940
},
{
"epoch": 0.7900428123758662,
"grad_norm": 0.1494140625,
"learning_rate": 5.3326973779934506e-05,
"loss": 2.5527,
"step": 8950
},
{
"epoch": 0.7909255417751688,
"grad_norm": 0.1552734375,
"learning_rate": 5.289591122487522e-05,
"loss": 2.5499,
"step": 8960
},
{
"epoch": 0.7918082711744715,
"grad_norm": 0.166015625,
"learning_rate": 5.246639175637216e-05,
"loss": 2.5553,
"step": 8970
},
{
"epoch": 0.7926910005737741,
"grad_norm": 0.166015625,
"learning_rate": 5.203841873704329e-05,
"loss": 2.5535,
"step": 8980
},
{
"epoch": 0.7935737299730767,
"grad_norm": 0.146484375,
"learning_rate": 5.161199551739942e-05,
"loss": 2.5253,
"step": 8990
},
{
"epoch": 0.7944564593723794,
"grad_norm": 0.1474609375,
"learning_rate": 5.1187125435818575e-05,
"loss": 2.5568,
"step": 9000
},
{
"epoch": 0.7953391887716821,
"grad_norm": 0.1572265625,
"learning_rate": 5.0763811818519494e-05,
"loss": 2.5483,
"step": 9010
},
{
"epoch": 0.7962219181709846,
"grad_norm": 0.146484375,
"learning_rate": 5.0342057979535507e-05,
"loss": 2.5541,
"step": 9020
},
{
"epoch": 0.7971046475702873,
"grad_norm": 0.1572265625,
"learning_rate": 4.99218672206892e-05,
"loss": 2.5512,
"step": 9030
},
{
"epoch": 0.79798737696959,
"grad_norm": 0.150390625,
"learning_rate": 4.950324283156562e-05,
"loss": 2.5524,
"step": 9040
},
{
"epoch": 0.7988701063688927,
"grad_norm": 0.150390625,
"learning_rate": 4.908618808948748e-05,
"loss": 2.5388,
"step": 9050
},
{
"epoch": 0.7997528357681952,
"grad_norm": 0.1435546875,
"learning_rate": 4.867070625948866e-05,
"loss": 2.5634,
"step": 9060
},
{
"epoch": 0.8006355651674979,
"grad_norm": 0.15625,
"learning_rate": 4.825680059428933e-05,
"loss": 2.5374,
"step": 9070
},
{
"epoch": 0.8015182945668006,
"grad_norm": 0.146484375,
"learning_rate": 4.784447433427016e-05,
"loss": 2.5457,
"step": 9080
},
{
"epoch": 0.8024010239661032,
"grad_norm": 0.1572265625,
"learning_rate": 4.7433730707446805e-05,
"loss": 2.5496,
"step": 9090
},
{
"epoch": 0.8032837533654058,
"grad_norm": 0.1513671875,
"learning_rate": 4.702457292944498e-05,
"loss": 2.546,
"step": 9100
},
{
"epoch": 0.8041664827647085,
"grad_norm": 0.162109375,
"learning_rate": 4.661700420347517e-05,
"loss": 2.5403,
"step": 9110
},
{
"epoch": 0.8050492121640112,
"grad_norm": 0.162109375,
"learning_rate": 4.62110277203073e-05,
"loss": 2.5484,
"step": 9120
},
{
"epoch": 0.8059319415633137,
"grad_norm": 0.150390625,
"learning_rate": 4.5806646658246104e-05,
"loss": 2.5572,
"step": 9130
},
{
"epoch": 0.8068146709626164,
"grad_norm": 0.150390625,
"learning_rate": 4.5403864183106184e-05,
"loss": 2.555,
"step": 9140
},
{
"epoch": 0.8076974003619191,
"grad_norm": 0.154296875,
"learning_rate": 4.5002683448186866e-05,
"loss": 2.5622,
"step": 9150
},
{
"epoch": 0.8085801297612217,
"grad_norm": 0.1513671875,
"learning_rate": 4.460310759424802e-05,
"loss": 2.5454,
"step": 9160
},
{
"epoch": 0.8094628591605243,
"grad_norm": 0.158203125,
"learning_rate": 4.420513974948517e-05,
"loss": 2.5404,
"step": 9170
},
{
"epoch": 0.810345588559827,
"grad_norm": 0.15625,
"learning_rate": 4.3808783029505166e-05,
"loss": 2.5385,
"step": 9180
},
{
"epoch": 0.8112283179591296,
"grad_norm": 0.1572265625,
"learning_rate": 4.341404053730147e-05,
"loss": 2.5515,
"step": 9190
},
{
"epoch": 0.8121110473584323,
"grad_norm": 0.1494140625,
"learning_rate": 4.3020915363230274e-05,
"loss": 2.5482,
"step": 9200
},
{
"epoch": 0.8129937767577349,
"grad_norm": 0.1552734375,
"learning_rate": 4.262941058498615e-05,
"loss": 2.5382,
"step": 9210
},
{
"epoch": 0.8138765061570375,
"grad_norm": 0.154296875,
"learning_rate": 4.2239529267577736e-05,
"loss": 2.5462,
"step": 9220
},
{
"epoch": 0.8147592355563402,
"grad_norm": 0.1484375,
"learning_rate": 4.1851274463304165e-05,
"loss": 2.551,
"step": 9230
},
{
"epoch": 0.8156419649556429,
"grad_norm": 0.1474609375,
"learning_rate": 4.146464921173088e-05,
"loss": 2.542,
"step": 9240
},
{
"epoch": 0.8165246943549455,
"grad_norm": 0.146484375,
"learning_rate": 4.1079656539665696e-05,
"loss": 2.5525,
"step": 9250
},
{
"epoch": 0.8174074237542481,
"grad_norm": 0.146484375,
"learning_rate": 4.069629946113565e-05,
"loss": 2.5403,
"step": 9260
},
{
"epoch": 0.8182901531535508,
"grad_norm": 0.1494140625,
"learning_rate": 4.0314580977362655e-05,
"loss": 2.5468,
"step": 9270
},
{
"epoch": 0.8191728825528535,
"grad_norm": 0.1455078125,
"learning_rate": 3.99345040767409e-05,
"loss": 2.5448,
"step": 9280
},
{
"epoch": 0.820055611952156,
"grad_norm": 0.146484375,
"learning_rate": 3.955607173481254e-05,
"loss": 2.5475,
"step": 9290
},
{
"epoch": 0.8209383413514587,
"grad_norm": 0.1484375,
"learning_rate": 3.9179286914244884e-05,
"loss": 2.5421,
"step": 9300
},
{
"epoch": 0.8218210707507614,
"grad_norm": 0.1494140625,
"learning_rate": 3.880415256480749e-05,
"loss": 2.5562,
"step": 9310
},
{
"epoch": 0.822703800150064,
"grad_norm": 0.1494140625,
"learning_rate": 3.843067162334826e-05,
"loss": 2.5252,
"step": 9320
},
{
"epoch": 0.8235865295493666,
"grad_norm": 0.15234375,
"learning_rate": 3.805884701377127e-05,
"loss": 2.5409,
"step": 9330
},
{
"epoch": 0.8244692589486693,
"grad_norm": 0.1640625,
"learning_rate": 3.768868164701325e-05,
"loss": 2.5449,
"step": 9340
},
{
"epoch": 0.825351988347972,
"grad_norm": 0.162109375,
"learning_rate": 3.732017842102126e-05,
"loss": 2.5703,
"step": 9350
},
{
"epoch": 0.8262347177472745,
"grad_norm": 0.1455078125,
"learning_rate": 3.695334022072977e-05,
"loss": 2.5449,
"step": 9360
},
{
"epoch": 0.8271174471465772,
"grad_norm": 0.15625,
"learning_rate": 3.658816991803798e-05,
"loss": 2.5508,
"step": 9370
},
{
"epoch": 0.8280001765458799,
"grad_norm": 0.146484375,
"learning_rate": 3.622467037178765e-05,
"loss": 2.5448,
"step": 9380
},
{
"epoch": 0.8288829059451825,
"grad_norm": 0.1533203125,
"learning_rate": 3.586284442774049e-05,
"loss": 2.5299,
"step": 9390
},
{
"epoch": 0.8297656353444851,
"grad_norm": 0.142578125,
"learning_rate": 3.550269491855579e-05,
"loss": 2.5425,
"step": 9400
},
{
"epoch": 0.8306483647437878,
"grad_norm": 0.146484375,
"learning_rate": 3.514422466376857e-05,
"loss": 2.5504,
"step": 9410
},
{
"epoch": 0.8315310941430905,
"grad_norm": 0.14453125,
"learning_rate": 3.478743646976726e-05,
"loss": 2.551,
"step": 9420
},
{
"epoch": 0.8324138235423931,
"grad_norm": 0.1513671875,
"learning_rate": 3.443233312977176e-05,
"loss": 2.5484,
"step": 9430
},
{
"epoch": 0.8332965529416957,
"grad_norm": 0.14453125,
"learning_rate": 3.4078917423811556e-05,
"loss": 2.5335,
"step": 9440
},
{
"epoch": 0.8341792823409984,
"grad_norm": 0.15234375,
"learning_rate": 3.372719211870412e-05,
"loss": 2.5315,
"step": 9450
},
{
"epoch": 0.835062011740301,
"grad_norm": 0.14453125,
"learning_rate": 3.3377159968033085e-05,
"loss": 2.5582,
"step": 9460
},
{
"epoch": 0.8359447411396037,
"grad_norm": 0.169921875,
"learning_rate": 3.302882371212665e-05,
"loss": 2.5467,
"step": 9470
},
{
"epoch": 0.8368274705389063,
"grad_norm": 0.1474609375,
"learning_rate": 3.2682186078036304e-05,
"loss": 2.5539,
"step": 9480
},
{
"epoch": 0.8377101999382089,
"grad_norm": 0.14453125,
"learning_rate": 3.2337249779515436e-05,
"loss": 2.5506,
"step": 9490
},
{
"epoch": 0.8385929293375116,
"grad_norm": 0.1494140625,
"learning_rate": 3.199401751699782e-05,
"loss": 2.5415,
"step": 9500
},
{
"epoch": 0.8394756587368142,
"grad_norm": 0.1513671875,
"learning_rate": 3.1652491977576883e-05,
"loss": 2.5471,
"step": 9510
},
{
"epoch": 0.8403583881361169,
"grad_norm": 0.15625,
"learning_rate": 3.131267583498448e-05,
"loss": 2.552,
"step": 9520
},
{
"epoch": 0.8412411175354195,
"grad_norm": 0.146484375,
"learning_rate": 3.097457174956977e-05,
"loss": 2.5561,
"step": 9530
},
{
"epoch": 0.8421238469347222,
"grad_norm": 0.1513671875,
"learning_rate": 3.063818236827884e-05,
"loss": 2.5502,
"step": 9540
},
{
"epoch": 0.8430065763340248,
"grad_norm": 0.1474609375,
"learning_rate": 3.030351032463341e-05,
"loss": 2.5575,
"step": 9550
},
{
"epoch": 0.8438893057333274,
"grad_norm": 0.1513671875,
"learning_rate": 2.9970558238710865e-05,
"loss": 2.5531,
"step": 9560
},
{
"epoch": 0.8447720351326301,
"grad_norm": 0.14453125,
"learning_rate": 2.9639328717123104e-05,
"loss": 2.5366,
"step": 9570
},
{
"epoch": 0.8456547645319328,
"grad_norm": 0.1484375,
"learning_rate": 2.9309824352996618e-05,
"loss": 2.5446,
"step": 9580
},
{
"epoch": 0.8465374939312353,
"grad_norm": 0.1484375,
"learning_rate": 2.898204772595195e-05,
"loss": 2.5454,
"step": 9590
},
{
"epoch": 0.847420223330538,
"grad_norm": 0.15625,
"learning_rate": 2.865600140208349e-05,
"loss": 2.5283,
"step": 9600
},
{
"epoch": 0.8483029527298407,
"grad_norm": 0.158203125,
"learning_rate": 2.833168793393956e-05,
"loss": 2.5519,
"step": 9610
},
{
"epoch": 0.8491856821291434,
"grad_norm": 0.154296875,
"learning_rate": 2.8009109860502174e-05,
"loss": 2.5443,
"step": 9620
},
{
"epoch": 0.8500684115284459,
"grad_norm": 0.1494140625,
"learning_rate": 2.768826970716745e-05,
"loss": 2.55,
"step": 9630
},
{
"epoch": 0.8509511409277486,
"grad_norm": 0.1552734375,
"learning_rate": 2.736916998572567e-05,
"loss": 2.5536,
"step": 9640
},
{
"epoch": 0.8518338703270513,
"grad_norm": 0.1396484375,
"learning_rate": 2.705181319434144e-05,
"loss": 2.554,
"step": 9650
},
{
"epoch": 0.8527165997263539,
"grad_norm": 0.1455078125,
"learning_rate": 2.6736201817534696e-05,
"loss": 2.5469,
"step": 9660
},
{
"epoch": 0.8535993291256565,
"grad_norm": 0.1513671875,
"learning_rate": 2.6422338326160618e-05,
"loss": 2.5496,
"step": 9670
},
{
"epoch": 0.8544820585249592,
"grad_norm": 0.1435546875,
"learning_rate": 2.6110225177390534e-05,
"loss": 2.5509,
"step": 9680
},
{
"epoch": 0.8553647879242618,
"grad_norm": 0.1552734375,
"learning_rate": 2.5799864814692902e-05,
"loss": 2.5452,
"step": 9690
},
{
"epoch": 0.8562475173235644,
"grad_norm": 0.146484375,
"learning_rate": 2.549125966781385e-05,
"loss": 2.5413,
"step": 9700
},
{
"epoch": 0.8571302467228671,
"grad_norm": 0.1435546875,
"learning_rate": 2.518441215275838e-05,
"loss": 2.5428,
"step": 9710
},
{
"epoch": 0.8580129761221698,
"grad_norm": 0.1494140625,
"learning_rate": 2.48793246717712e-05,
"loss": 2.545,
"step": 9720
},
{
"epoch": 0.8588957055214724,
"grad_norm": 0.1513671875,
"learning_rate": 2.4575999613318245e-05,
"loss": 2.5541,
"step": 9730
},
{
"epoch": 0.859778434920775,
"grad_norm": 0.1591796875,
"learning_rate": 2.4274439352067828e-05,
"loss": 2.5458,
"step": 9740
},
{
"epoch": 0.8606611643200777,
"grad_norm": 0.14453125,
"learning_rate": 2.3974646248871827e-05,
"loss": 2.547,
"step": 9750
},
{
"epoch": 0.8615438937193803,
"grad_norm": 0.14453125,
"learning_rate": 2.3676622650747603e-05,
"loss": 2.5407,
"step": 9760
},
{
"epoch": 0.862426623118683,
"grad_norm": 0.1474609375,
"learning_rate": 2.3380370890859454e-05,
"loss": 2.5465,
"step": 9770
},
{
"epoch": 0.8633093525179856,
"grad_norm": 0.15234375,
"learning_rate": 2.3085893288500136e-05,
"loss": 2.5445,
"step": 9780
},
{
"epoch": 0.8641920819172882,
"grad_norm": 0.1513671875,
"learning_rate": 2.279319214907305e-05,
"loss": 2.5268,
"step": 9790
},
{
"epoch": 0.8650748113165909,
"grad_norm": 0.1455078125,
"learning_rate": 2.2502269764074017e-05,
"loss": 2.5262,
"step": 9800
},
{
"epoch": 0.8659575407158936,
"grad_norm": 0.154296875,
"learning_rate": 2.2213128411073396e-05,
"loss": 2.5578,
"step": 9810
},
{
"epoch": 0.8668402701151962,
"grad_norm": 0.1484375,
"learning_rate": 2.1925770353698137e-05,
"loss": 2.5533,
"step": 9820
},
{
"epoch": 0.8677229995144988,
"grad_norm": 0.14453125,
"learning_rate": 2.1640197841614083e-05,
"loss": 2.5468,
"step": 9830
},
{
"epoch": 0.8686057289138015,
"grad_norm": 0.142578125,
"learning_rate": 2.1356413110508675e-05,
"loss": 2.5399,
"step": 9840
},
{
"epoch": 0.8694884583131042,
"grad_norm": 0.142578125,
"learning_rate": 2.1074418382072912e-05,
"loss": 2.5452,
"step": 9850
},
{
"epoch": 0.8703711877124067,
"grad_norm": 0.1513671875,
"learning_rate": 2.0794215863984417e-05,
"loss": 2.5361,
"step": 9860
},
{
"epoch": 0.8712539171117094,
"grad_norm": 0.1572265625,
"learning_rate": 2.0515807749889954e-05,
"loss": 2.5424,
"step": 9870
},
{
"epoch": 0.8721366465110121,
"grad_norm": 0.1484375,
"learning_rate": 2.0239196219388133e-05,
"loss": 2.5568,
"step": 9880
},
{
"epoch": 0.8730193759103146,
"grad_norm": 0.1474609375,
"learning_rate": 1.9964383438012685e-05,
"loss": 2.5599,
"step": 9890
},
{
"epoch": 0.8739021053096173,
"grad_norm": 0.1416015625,
"learning_rate": 1.969137155721509e-05,
"loss": 2.5448,
"step": 9900
},
{
"epoch": 0.87478483470892,
"grad_norm": 0.1455078125,
"learning_rate": 1.942016271434821e-05,
"loss": 2.5507,
"step": 9910
},
{
"epoch": 0.8756675641082227,
"grad_norm": 0.1474609375,
"learning_rate": 1.915075903264915e-05,
"loss": 2.5443,
"step": 9920
},
{
"epoch": 0.8765502935075252,
"grad_norm": 0.146484375,
"learning_rate": 1.8883162621222693e-05,
"loss": 2.5618,
"step": 9930
},
{
"epoch": 0.8774330229068279,
"grad_norm": 0.1513671875,
"learning_rate": 1.8617375575025186e-05,
"loss": 2.5591,
"step": 9940
},
{
"epoch": 0.8783157523061306,
"grad_norm": 0.1474609375,
"learning_rate": 1.835339997484753e-05,
"loss": 2.5593,
"step": 9950
},
{
"epoch": 0.8791984817054332,
"grad_norm": 0.1513671875,
"learning_rate": 1.8091237887299357e-05,
"loss": 2.5468,
"step": 9960
},
{
"epoch": 0.8800812111047358,
"grad_norm": 0.1416015625,
"learning_rate": 1.783089136479257e-05,
"loss": 2.5537,
"step": 9970
},
{
"epoch": 0.8809639405040385,
"grad_norm": 0.1572265625,
"learning_rate": 1.757236244552557e-05,
"loss": 2.5536,
"step": 9980
},
{
"epoch": 0.8818466699033412,
"grad_norm": 0.1435546875,
"learning_rate": 1.7315653153466977e-05,
"loss": 2.5452,
"step": 9990
},
{
"epoch": 0.8827293993026438,
"grad_norm": 0.1513671875,
"learning_rate": 1.7060765498339958e-05,
"loss": 2.5535,
"step": 10000
},
{
"epoch": 0.8827293993026438,
"eval_accuracy": 0.5028574500272613,
"eval_loss": 2.4378483295440674,
"eval_runtime": 7.0626,
"eval_samples_per_second": 45.026,
"eval_steps_per_second": 0.425,
"step": 10000
},
{
"epoch": 0.8836121287019464,
"grad_norm": 0.1416015625,
"learning_rate": 1.6807701475606534e-05,
"loss": 2.5573,
"step": 10010
},
{
"epoch": 0.8844948581012491,
"grad_norm": 0.1494140625,
"learning_rate": 1.6556463066451837e-05,
"loss": 2.5438,
"step": 10020
},
{
"epoch": 0.8853775875005517,
"grad_norm": 0.1484375,
"learning_rate": 1.63070522377686e-05,
"loss": 2.5571,
"step": 10030
},
{
"epoch": 0.8862603168998543,
"grad_norm": 0.1455078125,
"learning_rate": 1.6059470942141912e-05,
"loss": 2.5412,
"step": 10040
},
{
"epoch": 0.887143046299157,
"grad_norm": 0.1435546875,
"learning_rate": 1.5813721117833828e-05,
"loss": 2.5566,
"step": 10050
},
{
"epoch": 0.8880257756984596,
"grad_norm": 0.146484375,
"learning_rate": 1.5569804688768092e-05,
"loss": 2.5315,
"step": 10060
},
{
"epoch": 0.8889085050977623,
"grad_norm": 0.1650390625,
"learning_rate": 1.532772356451531e-05,
"loss": 2.542,
"step": 10070
},
{
"epoch": 0.8897912344970649,
"grad_norm": 0.15234375,
"learning_rate": 1.5087479640277763e-05,
"loss": 2.5465,
"step": 10080
},
{
"epoch": 0.8906739638963675,
"grad_norm": 0.140625,
"learning_rate": 1.4849074796874779e-05,
"loss": 2.5593,
"step": 10090
},
{
"epoch": 0.8915566932956702,
"grad_norm": 0.14453125,
"learning_rate": 1.4612510900727794e-05,
"loss": 2.5438,
"step": 10100
},
{
"epoch": 0.8924394226949729,
"grad_norm": 0.1513671875,
"learning_rate": 1.4377789803845964e-05,
"loss": 2.5491,
"step": 10110
},
{
"epoch": 0.8933221520942755,
"grad_norm": 0.1416015625,
"learning_rate": 1.4144913343811544e-05,
"loss": 2.5414,
"step": 10120
},
{
"epoch": 0.8942048814935781,
"grad_norm": 0.1396484375,
"learning_rate": 1.3913883343765394e-05,
"loss": 2.5444,
"step": 10130
},
{
"epoch": 0.8950876108928808,
"grad_norm": 0.154296875,
"learning_rate": 1.3684701612392963e-05,
"loss": 2.5444,
"step": 10140
},
{
"epoch": 0.8959703402921835,
"grad_norm": 0.1474609375,
"learning_rate": 1.345736994390992e-05,
"loss": 2.5356,
"step": 10150
},
{
"epoch": 0.896853069691486,
"grad_norm": 0.1474609375,
"learning_rate": 1.3231890118048179e-05,
"loss": 2.5487,
"step": 10160
},
{
"epoch": 0.8977357990907887,
"grad_norm": 0.138671875,
"learning_rate": 1.300826390004209e-05,
"loss": 2.5567,
"step": 10170
},
{
"epoch": 0.8986185284900914,
"grad_norm": 0.14453125,
"learning_rate": 1.2786493040614245e-05,
"loss": 2.5631,
"step": 10180
},
{
"epoch": 0.8995012578893941,
"grad_norm": 0.146484375,
"learning_rate": 1.2566579275962303e-05,
"loss": 2.5384,
"step": 10190
},
{
"epoch": 0.9003839872886966,
"grad_norm": 0.146484375,
"learning_rate": 1.2348524327744943e-05,
"loss": 2.5369,
"step": 10200
},
{
"epoch": 0.9012667166879993,
"grad_norm": 0.142578125,
"learning_rate": 1.2132329903068563e-05,
"loss": 2.5445,
"step": 10210
},
{
"epoch": 0.902149446087302,
"grad_norm": 0.1484375,
"learning_rate": 1.1917997694473992e-05,
"loss": 2.549,
"step": 10220
},
{
"epoch": 0.9030321754866045,
"grad_norm": 0.1533203125,
"learning_rate": 1.1705529379923085e-05,
"loss": 2.5339,
"step": 10230
},
{
"epoch": 0.9039149048859072,
"grad_norm": 0.15234375,
"learning_rate": 1.1494926622785811e-05,
"loss": 2.5437,
"step": 10240
},
{
"epoch": 0.9047976342852099,
"grad_norm": 0.14453125,
"learning_rate": 1.1286191071826823e-05,
"loss": 2.5387,
"step": 10250
},
{
"epoch": 0.9056803636845125,
"grad_norm": 0.1513671875,
"learning_rate": 1.1079324361193022e-05,
"loss": 2.5676,
"step": 10260
},
{
"epoch": 0.9065630930838151,
"grad_norm": 0.1484375,
"learning_rate": 1.0874328110400511e-05,
"loss": 2.5503,
"step": 10270
},
{
"epoch": 0.9074458224831178,
"grad_norm": 0.1474609375,
"learning_rate": 1.0671203924321887e-05,
"loss": 2.5516,
"step": 10280
},
{
"epoch": 0.9083285518824205,
"grad_norm": 0.142578125,
"learning_rate": 1.0469953393173776e-05,
"loss": 2.5399,
"step": 10290
},
{
"epoch": 0.9092112812817231,
"grad_norm": 0.14453125,
"learning_rate": 1.0270578092504396e-05,
"loss": 2.5427,
"step": 10300
},
{
"epoch": 0.9100940106810257,
"grad_norm": 0.142578125,
"learning_rate": 1.0073079583181126e-05,
"loss": 2.5459,
"step": 10310
},
{
"epoch": 0.9109767400803284,
"grad_norm": 0.1474609375,
"learning_rate": 9.877459411378325e-06,
"loss": 2.552,
"step": 10320
},
{
"epoch": 0.911859469479631,
"grad_norm": 0.1572265625,
"learning_rate": 9.683719108565331e-06,
"loss": 2.5469,
"step": 10330
},
{
"epoch": 0.9127421988789337,
"grad_norm": 0.1435546875,
"learning_rate": 9.49186019149434e-06,
"loss": 2.5547,
"step": 10340
},
{
"epoch": 0.9136249282782363,
"grad_norm": 0.1494140625,
"learning_rate": 9.301884162188496e-06,
"loss": 2.5461,
"step": 10350
},
{
"epoch": 0.9145076576775389,
"grad_norm": 0.1484375,
"learning_rate": 9.113792507930263e-06,
"loss": 2.5475,
"step": 10360
},
{
"epoch": 0.9153903870768416,
"grad_norm": 0.1533203125,
"learning_rate": 8.927586701249852e-06,
"loss": 2.5437,
"step": 10370
},
{
"epoch": 0.9162731164761443,
"grad_norm": 0.15234375,
"learning_rate": 8.743268199913307e-06,
"loss": 2.5339,
"step": 10380
},
{
"epoch": 0.9171558458754469,
"grad_norm": 0.150390625,
"learning_rate": 8.560838446911607e-06,
"loss": 2.539,
"step": 10390
},
{
"epoch": 0.9180385752747495,
"grad_norm": 0.14453125,
"learning_rate": 8.380298870449e-06,
"loss": 2.5314,
"step": 10400
},
{
"epoch": 0.9189213046740522,
"grad_norm": 0.1484375,
"learning_rate": 8.201650883931904e-06,
"loss": 2.5467,
"step": 10410
},
{
"epoch": 0.9198040340733548,
"grad_norm": 0.146484375,
"learning_rate": 8.024895885957978e-06,
"loss": 2.533,
"step": 10420
},
{
"epoch": 0.9206867634726574,
"grad_norm": 0.146484375,
"learning_rate": 7.85003526030495e-06,
"loss": 2.5422,
"step": 10430
},
{
"epoch": 0.9215694928719601,
"grad_norm": 0.1484375,
"learning_rate": 7.677070375920026e-06,
"loss": 2.5415,
"step": 10440
},
{
"epoch": 0.9224522222712628,
"grad_norm": 0.1435546875,
"learning_rate": 7.506002586909006e-06,
"loss": 2.5579,
"step": 10450
},
{
"epoch": 0.9233349516705653,
"grad_norm": 0.1630859375,
"learning_rate": 7.336833232525625e-06,
"loss": 2.5422,
"step": 10460
},
{
"epoch": 0.924217681069868,
"grad_norm": 0.1484375,
"learning_rate": 7.169563637161397e-06,
"loss": 2.55,
"step": 10470
},
{
"epoch": 0.9251004104691707,
"grad_norm": 0.14453125,
"learning_rate": 7.004195110334788e-06,
"loss": 2.5397,
"step": 10480
},
{
"epoch": 0.9259831398684734,
"grad_norm": 0.14453125,
"learning_rate": 6.840728946681363e-06,
"loss": 2.5606,
"step": 10490
},
{
"epoch": 0.9268658692677759,
"grad_norm": 0.1484375,
"learning_rate": 6.679166425943351e-06,
"loss": 2.5403,
"step": 10500
},
{
"epoch": 0.9277485986670786,
"grad_norm": 0.1474609375,
"learning_rate": 6.519508812959873e-06,
"loss": 2.5464,
"step": 10510
},
{
"epoch": 0.9286313280663813,
"grad_norm": 0.1513671875,
"learning_rate": 6.3617573576569274e-06,
"loss": 2.546,
"step": 10520
},
{
"epoch": 0.9295140574656839,
"grad_norm": 0.1474609375,
"learning_rate": 6.205913295037474e-06,
"loss": 2.5394,
"step": 10530
},
{
"epoch": 0.9303967868649865,
"grad_norm": 0.1435546875,
"learning_rate": 6.051977845172002e-06,
"loss": 2.5584,
"step": 10540
},
{
"epoch": 0.9312795162642892,
"grad_norm": 0.15234375,
"learning_rate": 5.899952213188897e-06,
"loss": 2.5341,
"step": 10550
},
{
"epoch": 0.9321622456635918,
"grad_norm": 0.1474609375,
"learning_rate": 5.749837589264895e-06,
"loss": 2.5478,
"step": 10560
},
{
"epoch": 0.9330449750628945,
"grad_norm": 0.1484375,
"learning_rate": 5.601635148615891e-06,
"loss": 2.5387,
"step": 10570
},
{
"epoch": 0.9339277044621971,
"grad_norm": 0.1474609375,
"learning_rate": 5.4553460514877304e-06,
"loss": 2.5579,
"step": 10580
},
{
"epoch": 0.9348104338614998,
"grad_norm": 0.14453125,
"learning_rate": 5.3109714431470165e-06,
"loss": 2.5602,
"step": 10590
},
{
"epoch": 0.9356931632608024,
"grad_norm": 0.140625,
"learning_rate": 5.168512453872287e-06,
"loss": 2.5453,
"step": 10600
},
{
"epoch": 0.936575892660105,
"grad_norm": 0.146484375,
"learning_rate": 5.027970198945076e-06,
"loss": 2.5461,
"step": 10610
},
{
"epoch": 0.9374586220594077,
"grad_norm": 0.146484375,
"learning_rate": 4.889345778641252e-06,
"loss": 2.5422,
"step": 10620
},
{
"epoch": 0.9383413514587103,
"grad_norm": 0.1494140625,
"learning_rate": 4.752640278222254e-06,
"loss": 2.5523,
"step": 10630
},
{
"epoch": 0.939224080858013,
"grad_norm": 0.1416015625,
"learning_rate": 4.617854767926782e-06,
"loss": 2.5384,
"step": 10640
},
{
"epoch": 0.9401068102573156,
"grad_norm": 0.1474609375,
"learning_rate": 4.484990302962344e-06,
"loss": 2.564,
"step": 10650
},
{
"epoch": 0.9409895396566182,
"grad_norm": 0.14453125,
"learning_rate": 4.354047923496917e-06,
"loss": 2.5429,
"step": 10660
},
{
"epoch": 0.9418722690559209,
"grad_norm": 0.1416015625,
"learning_rate": 4.2250286546509365e-06,
"loss": 2.5365,
"step": 10670
},
{
"epoch": 0.9427549984552236,
"grad_norm": 0.1474609375,
"learning_rate": 4.09793350648921e-06,
"loss": 2.543,
"step": 10680
},
{
"epoch": 0.9436377278545262,
"grad_norm": 0.140625,
"learning_rate": 3.9727634740129585e-06,
"loss": 2.5527,
"step": 10690
},
{
"epoch": 0.9445204572538288,
"grad_norm": 0.14453125,
"learning_rate": 3.849519537152124e-06,
"loss": 2.5534,
"step": 10700
},
{
"epoch": 0.9454031866531315,
"grad_norm": 0.142578125,
"learning_rate": 3.7282026607576016e-06,
"loss": 2.5467,
"step": 10710
},
{
"epoch": 0.9462859160524342,
"grad_norm": 0.1494140625,
"learning_rate": 3.608813794593796e-06,
"loss": 2.5537,
"step": 10720
},
{
"epoch": 0.9471686454517367,
"grad_norm": 0.1455078125,
"learning_rate": 3.491353873331077e-06,
"loss": 2.5443,
"step": 10730
},
{
"epoch": 0.9480513748510394,
"grad_norm": 0.1513671875,
"learning_rate": 3.3758238165384757e-06,
"loss": 2.5409,
"step": 10740
},
{
"epoch": 0.9489341042503421,
"grad_norm": 0.14453125,
"learning_rate": 3.262224528676666e-06,
"loss": 2.5294,
"step": 10750
},
{
"epoch": 0.9498168336496448,
"grad_norm": 0.1484375,
"learning_rate": 3.1505568990905787e-06,
"loss": 2.5535,
"step": 10760
},
{
"epoch": 0.9506995630489473,
"grad_norm": 0.146484375,
"learning_rate": 3.040821802002658e-06,
"loss": 2.534,
"step": 10770
},
{
"epoch": 0.95158229244825,
"grad_norm": 0.142578125,
"learning_rate": 2.9330200965059507e-06,
"loss": 2.5347,
"step": 10780
},
{
"epoch": 0.9524650218475527,
"grad_norm": 0.1435546875,
"learning_rate": 2.827152626557389e-06,
"loss": 2.5541,
"step": 10790
},
{
"epoch": 0.9533477512468552,
"grad_norm": 0.14453125,
"learning_rate": 2.72322022097124e-06,
"loss": 2.5358,
"step": 10800
},
{
"epoch": 0.9542304806461579,
"grad_norm": 0.14453125,
"learning_rate": 2.621223693412417e-06,
"loss": 2.5485,
"step": 10810
},
{
"epoch": 0.9551132100454606,
"grad_norm": 0.1455078125,
"learning_rate": 2.5211638423903725e-06,
"loss": 2.5523,
"step": 10820
},
{
"epoch": 0.9559959394447632,
"grad_norm": 0.1513671875,
"learning_rate": 2.4230414512527166e-06,
"loss": 2.5485,
"step": 10830
},
{
"epoch": 0.9568786688440658,
"grad_norm": 0.140625,
"learning_rate": 2.326857288178996e-06,
"loss": 2.5437,
"step": 10840
},
{
"epoch": 0.9577613982433685,
"grad_norm": 0.158203125,
"learning_rate": 2.232612106174897e-06,
"loss": 2.5459,
"step": 10850
},
{
"epoch": 0.9586441276426712,
"grad_norm": 0.1513671875,
"learning_rate": 2.1403066430661644e-06,
"loss": 2.5504,
"step": 10860
},
{
"epoch": 0.9595268570419738,
"grad_norm": 0.1494140625,
"learning_rate": 2.0499416214928844e-06,
"loss": 2.5543,
"step": 10870
},
{
"epoch": 0.9604095864412764,
"grad_norm": 0.1474609375,
"learning_rate": 1.9615177489038792e-06,
"loss": 2.5351,
"step": 10880
},
{
"epoch": 0.9612923158405791,
"grad_norm": 0.146484375,
"learning_rate": 1.8750357175510435e-06,
"loss": 2.5447,
"step": 10890
},
{
"epoch": 0.9621750452398817,
"grad_norm": 0.146484375,
"learning_rate": 1.7904962044841266e-06,
"loss": 2.5591,
"step": 10900
},
{
"epoch": 0.9630577746391844,
"grad_norm": 0.1455078125,
"learning_rate": 1.70789987154521e-06,
"loss": 2.5395,
"step": 10910
},
{
"epoch": 0.963940504038487,
"grad_norm": 0.1455078125,
"learning_rate": 1.6272473653636266e-06,
"loss": 2.5443,
"step": 10920
},
{
"epoch": 0.9648232334377896,
"grad_norm": 0.1494140625,
"learning_rate": 1.5485393173509388e-06,
"loss": 2.5364,
"step": 10930
},
{
"epoch": 0.9657059628370923,
"grad_norm": 0.1474609375,
"learning_rate": 1.4717763436959685e-06,
"loss": 2.55,
"step": 10940
},
{
"epoch": 0.966588692236395,
"grad_norm": 0.142578125,
"learning_rate": 1.3969590453598858e-06,
"loss": 2.5337,
"step": 10950
},
{
"epoch": 0.9674714216356975,
"grad_norm": 0.15234375,
"learning_rate": 1.3240880080716832e-06,
"loss": 2.5396,
"step": 10960
},
{
"epoch": 0.9683541510350002,
"grad_norm": 0.1435546875,
"learning_rate": 1.2531638023233761e-06,
"loss": 2.5398,
"step": 10970
},
{
"epoch": 0.9692368804343029,
"grad_norm": 0.14453125,
"learning_rate": 1.1841869833656981e-06,
"loss": 2.5688,
"step": 10980
},
{
"epoch": 0.9701196098336055,
"grad_norm": 0.1435546875,
"learning_rate": 1.1171580912036627e-06,
"loss": 2.5305,
"step": 10990
},
{
"epoch": 0.9710023392329081,
"grad_norm": 0.1455078125,
"learning_rate": 1.0520776505924812e-06,
"loss": 2.5474,
"step": 11000
},
{
"epoch": 0.9718850686322108,
"grad_norm": 0.1552734375,
"learning_rate": 9.889461710332059e-07,
"loss": 2.5524,
"step": 11010
},
{
"epoch": 0.9727677980315135,
"grad_norm": 0.14453125,
"learning_rate": 9.277641467689279e-07,
"loss": 2.5433,
"step": 11020
},
{
"epoch": 0.973650527430816,
"grad_norm": 0.1513671875,
"learning_rate": 8.685320567809741e-07,
"loss": 2.5445,
"step": 11030
},
{
"epoch": 0.9745332568301187,
"grad_norm": 0.1513671875,
"learning_rate": 8.112503647848546e-07,
"loss": 2.5276,
"step": 11040
},
{
"epoch": 0.9754159862294214,
"grad_norm": 0.1435546875,
"learning_rate": 7.559195192269608e-07,
"loss": 2.5454,
"step": 11050
},
{
"epoch": 0.9762987156287241,
"grad_norm": 0.1474609375,
"learning_rate": 7.025399532808452e-07,
"loss": 2.5486,
"step": 11060
},
{
"epoch": 0.9771814450280266,
"grad_norm": 0.1484375,
"learning_rate": 6.511120848439467e-07,
"loss": 2.5565,
"step": 11070
},
{
"epoch": 0.9780641744273293,
"grad_norm": 0.1474609375,
"learning_rate": 6.016363165342875e-07,
"loss": 2.5388,
"step": 11080
},
{
"epoch": 0.978946903826632,
"grad_norm": 0.1455078125,
"learning_rate": 5.54113035687226e-07,
"loss": 2.5419,
"step": 11090
},
{
"epoch": 0.9798296332259346,
"grad_norm": 0.150390625,
"learning_rate": 5.085426143525695e-07,
"loss": 2.5327,
"step": 11100
},
{
"epoch": 0.9807123626252372,
"grad_norm": 0.1455078125,
"learning_rate": 4.649254092916333e-07,
"loss": 2.5482,
"step": 11110
},
{
"epoch": 0.9815950920245399,
"grad_norm": 0.166015625,
"learning_rate": 4.2326176197429735e-07,
"loss": 2.5524,
"step": 11120
},
{
"epoch": 0.9824778214238425,
"grad_norm": 0.1494140625,
"learning_rate": 3.835519985765368e-07,
"loss": 2.5317,
"step": 11130
},
{
"epoch": 0.9833605508231452,
"grad_norm": 0.15234375,
"learning_rate": 3.457964299777849e-07,
"loss": 2.5451,
"step": 11140
},
{
"epoch": 0.9842432802224478,
"grad_norm": 0.1591796875,
"learning_rate": 3.099953517584353e-07,
"loss": 2.5406,
"step": 11150
},
{
"epoch": 0.9851260096217505,
"grad_norm": 0.162109375,
"learning_rate": 2.761490441976211e-07,
"loss": 2.5455,
"step": 11160
},
{
"epoch": 0.9860087390210531,
"grad_norm": 0.14453125,
"learning_rate": 2.4425777227102265e-07,
"loss": 2.5483,
"step": 11170
},
{
"epoch": 0.9868914684203557,
"grad_norm": 0.1474609375,
"learning_rate": 2.1432178564867455e-07,
"loss": 2.5509,
"step": 11180
},
{
"epoch": 0.9877741978196584,
"grad_norm": 0.146484375,
"learning_rate": 1.8634131869313397e-07,
"loss": 2.5409,
"step": 11190
},
{
"epoch": 0.988656927218961,
"grad_norm": 0.1474609375,
"learning_rate": 1.6031659045759318e-07,
"loss": 2.537,
"step": 11200
},
{
"epoch": 0.9895396566182637,
"grad_norm": 0.142578125,
"learning_rate": 1.3624780468424192e-07,
"loss": 2.5476,
"step": 11210
},
{
"epoch": 0.9904223860175663,
"grad_norm": 0.1533203125,
"learning_rate": 1.1413514980254669e-07,
"loss": 2.5474,
"step": 11220
},
{
"epoch": 0.9913051154168689,
"grad_norm": 0.14453125,
"learning_rate": 9.397879892777961e-08,
"loss": 2.5472,
"step": 11230
},
{
"epoch": 0.9921878448161716,
"grad_norm": 0.150390625,
"learning_rate": 7.577890985985269e-08,
"loss": 2.5441,
"step": 11240
},
{
"epoch": 0.9930705742154743,
"grad_norm": 0.150390625,
"learning_rate": 5.953562508184684e-08,
"loss": 2.5474,
"step": 11250
},
{
"epoch": 0.9939533036147769,
"grad_norm": 0.146484375,
"learning_rate": 4.524907175904036e-08,
"loss": 2.5428,
"step": 11260
},
{
"epoch": 0.9948360330140795,
"grad_norm": 0.1455078125,
"learning_rate": 3.2919361737854256e-08,
"loss": 2.5553,
"step": 11270
},
{
"epoch": 0.9957187624133822,
"grad_norm": 0.1474609375,
"learning_rate": 2.2546591544991833e-08,
"loss": 2.5346,
"step": 11280
},
{
"epoch": 0.9966014918126849,
"grad_norm": 0.1474609375,
"learning_rate": 1.4130842386717025e-08,
"loss": 2.548,
"step": 11290
},
{
"epoch": 0.9974842212119874,
"grad_norm": 0.146484375,
"learning_rate": 7.672180148132757e-09,
"loss": 2.5376,
"step": 11300
},
{
"epoch": 0.9983669506112901,
"grad_norm": 0.1513671875,
"learning_rate": 3.1706553927923763e-09,
"loss": 2.5424,
"step": 11310
},
{
"epoch": 0.9992496800105928,
"grad_norm": 0.1435546875,
"learning_rate": 6.263033621722869e-10,
"loss": 2.5439,
"step": 11320
},
{
"epoch": 0.9999558635300348,
"step": 11328,
"total_flos": 1.9775705361382638e+20,
"train_loss": 2.5878285095516573,
"train_runtime": 23032.3399,
"train_samples_per_second": 125.914,
"train_steps_per_second": 0.492
}
],
"logging_steps": 10,
"max_steps": 11328,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.9775705361382638e+20,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}