diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,142142 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0641072591145835, + "eval_steps": 12288, + "global_step": 101455, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.0345052083333332e-05, + "eval_loss": 13.501585006713867, + "eval_runtime": 105.7668, + "eval_samples_per_second": 18.976, + "eval_steps_per_second": 9.493, + "step": 1 + }, + { + "epoch": 0.00010172526041666667, + "grad_norm": 187.2008514404297, + "learning_rate": 5.000000000000001e-07, + "loss": 13.5952, + "step": 5 + }, + { + "epoch": 0.00020345052083333334, + "grad_norm": 234.9751739501953, + "learning_rate": 1.0000000000000002e-06, + "loss": 13.3781, + "step": 10 + }, + { + "epoch": 0.00030517578125, + "grad_norm": 207.3023681640625, + "learning_rate": 1.5e-06, + "loss": 13.3302, + "step": 15 + }, + { + "epoch": 0.0004069010416666667, + "grad_norm": 217.7012939453125, + "learning_rate": 2.0000000000000003e-06, + "loss": 13.5236, + "step": 20 + }, + { + "epoch": 0.0005086263020833334, + "grad_norm": 240.89230346679688, + "learning_rate": 2.5e-06, + "loss": 13.3795, + "step": 25 + }, + { + "epoch": 0.0006103515625, + "grad_norm": 227.74560546875, + "learning_rate": 3e-06, + "loss": 12.4315, + "step": 30 + }, + { + "epoch": 0.0007120768229166666, + "grad_norm": 229.02545166015625, + "learning_rate": 3.5e-06, + "loss": 12.8162, + "step": 35 + }, + { + "epoch": 0.0008138020833333334, + "grad_norm": 192.57810974121094, + "learning_rate": 4.000000000000001e-06, + "loss": 12.5698, + "step": 40 + }, + { + "epoch": 0.00091552734375, + "grad_norm": 158.721923828125, + "learning_rate": 4.5e-06, + "loss": 12.0444, + "step": 45 + }, + { + "epoch": 0.0010172526041666667, + "grad_norm": 169.90931701660156, + "learning_rate": 5e-06, + "loss": 12.0289, + "step": 50 + }, + { + "epoch": 0.0011189778645833333, + "grad_norm": 152.55885314941406, + "learning_rate": 5.500000000000001e-06, + "loss": 11.5522, + "step": 55 + }, + { + "epoch": 0.001220703125, + "grad_norm": 157.96636962890625, + "learning_rate": 6e-06, + "loss": 11.1354, + "step": 60 + }, + { + "epoch": 0.0013224283854166667, + "grad_norm": 88.49183654785156, + "learning_rate": 6.5000000000000004e-06, + "loss": 10.3339, + "step": 65 + }, + { + "epoch": 0.0014241536458333333, + "grad_norm": 79.22685241699219, + "learning_rate": 7e-06, + "loss": 10.2831, + "step": 70 + }, + { + "epoch": 0.00152587890625, + "grad_norm": 78.410400390625, + "learning_rate": 7.500000000000001e-06, + "loss": 9.29, + "step": 75 + }, + { + "epoch": 0.0016276041666666667, + "grad_norm": 53.37043380737305, + "learning_rate": 8.000000000000001e-06, + "loss": 9.4067, + "step": 80 + }, + { + "epoch": 0.0017293294270833333, + "grad_norm": 41.43476867675781, + "learning_rate": 8.5e-06, + "loss": 8.6693, + "step": 85 + }, + { + "epoch": 0.0018310546875, + "grad_norm": 34.20158004760742, + "learning_rate": 9e-06, + "loss": 8.4403, + "step": 90 + }, + { + "epoch": 0.0019327799479166667, + "grad_norm": 32.26168441772461, + "learning_rate": 9.5e-06, + "loss": 8.4257, + "step": 95 + }, + { + "epoch": 0.0020345052083333335, + "grad_norm": 25.387107849121094, + "learning_rate": 1e-05, + "loss": 8.1384, + "step": 100 + }, + { + "epoch": 0.00213623046875, + "grad_norm": 24.035205841064453, + "learning_rate": 9.999999984025795e-06, + "loss": 7.8455, + "step": 105 + }, + { + "epoch": 0.0022379557291666665, + "grad_norm": 17.602067947387695, + "learning_rate": 9.999999936103176e-06, + "loss": 7.8294, + "step": 110 + }, + { + "epoch": 0.0023396809895833335, + "grad_norm": 23.5384464263916, + "learning_rate": 9.999999856232143e-06, + "loss": 7.7578, + "step": 115 + }, + { + "epoch": 0.00244140625, + "grad_norm": 30.8400821685791, + "learning_rate": 9.999999744412701e-06, + "loss": 7.8559, + "step": 120 + }, + { + "epoch": 0.0025431315104166665, + "grad_norm": 17.0482120513916, + "learning_rate": 9.999999600644846e-06, + "loss": 7.2873, + "step": 125 + }, + { + "epoch": 0.0026448567708333335, + "grad_norm": 16.93459129333496, + "learning_rate": 9.99999942492858e-06, + "loss": 7.8431, + "step": 130 + }, + { + "epoch": 0.00274658203125, + "grad_norm": 28.9517879486084, + "learning_rate": 9.999999217263907e-06, + "loss": 7.5152, + "step": 135 + }, + { + "epoch": 0.0028483072916666665, + "grad_norm": 27.741579055786133, + "learning_rate": 9.999998977650827e-06, + "loss": 7.6284, + "step": 140 + }, + { + "epoch": 0.0029500325520833335, + "grad_norm": 15.774077415466309, + "learning_rate": 9.999998706089338e-06, + "loss": 7.5583, + "step": 145 + }, + { + "epoch": 0.0030517578125, + "grad_norm": 21.22496223449707, + "learning_rate": 9.999998402579446e-06, + "loss": 7.3046, + "step": 150 + }, + { + "epoch": 0.0031534830729166665, + "grad_norm": 14.374629020690918, + "learning_rate": 9.999998067121151e-06, + "loss": 7.3582, + "step": 155 + }, + { + "epoch": 0.0032552083333333335, + "grad_norm": 18.121116638183594, + "learning_rate": 9.999997699714456e-06, + "loss": 7.2362, + "step": 160 + }, + { + "epoch": 0.00335693359375, + "grad_norm": 20.033939361572266, + "learning_rate": 9.999997300359361e-06, + "loss": 7.3692, + "step": 165 + }, + { + "epoch": 0.0034586588541666665, + "grad_norm": 16.032236099243164, + "learning_rate": 9.999996869055873e-06, + "loss": 7.3623, + "step": 170 + }, + { + "epoch": 0.0035603841145833335, + "grad_norm": 21.473352432250977, + "learning_rate": 9.999996405803991e-06, + "loss": 7.3341, + "step": 175 + }, + { + "epoch": 0.003662109375, + "grad_norm": 23.298728942871094, + "learning_rate": 9.999995910603719e-06, + "loss": 7.1938, + "step": 180 + }, + { + "epoch": 0.0037638346354166665, + "grad_norm": 15.726787567138672, + "learning_rate": 9.999995383455062e-06, + "loss": 7.0991, + "step": 185 + }, + { + "epoch": 0.0038655598958333335, + "grad_norm": 26.825517654418945, + "learning_rate": 9.999994824358019e-06, + "loss": 7.1554, + "step": 190 + }, + { + "epoch": 0.00396728515625, + "grad_norm": 22.702795028686523, + "learning_rate": 9.999994233312597e-06, + "loss": 7.2226, + "step": 195 + }, + { + "epoch": 0.004069010416666667, + "grad_norm": 14.919492721557617, + "learning_rate": 9.9999936103188e-06, + "loss": 7.0152, + "step": 200 + }, + { + "epoch": 0.004170735677083333, + "grad_norm": 17.504310607910156, + "learning_rate": 9.999992955376632e-06, + "loss": 6.926, + "step": 205 + }, + { + "epoch": 0.0042724609375, + "grad_norm": 19.972328186035156, + "learning_rate": 9.999992268486096e-06, + "loss": 6.7351, + "step": 210 + }, + { + "epoch": 0.004374186197916667, + "grad_norm": 18.042076110839844, + "learning_rate": 9.999991549647194e-06, + "loss": 7.1731, + "step": 215 + }, + { + "epoch": 0.004475911458333333, + "grad_norm": 16.70429801940918, + "learning_rate": 9.999990798859936e-06, + "loss": 6.8756, + "step": 220 + }, + { + "epoch": 0.00457763671875, + "grad_norm": 17.5783748626709, + "learning_rate": 9.999990016124323e-06, + "loss": 6.9326, + "step": 225 + }, + { + "epoch": 0.004679361979166667, + "grad_norm": 15.982688903808594, + "learning_rate": 9.99998920144036e-06, + "loss": 6.7762, + "step": 230 + }, + { + "epoch": 0.004781087239583333, + "grad_norm": 16.940990447998047, + "learning_rate": 9.999988354808054e-06, + "loss": 6.4696, + "step": 235 + }, + { + "epoch": 0.0048828125, + "grad_norm": 18.594928741455078, + "learning_rate": 9.99998747622741e-06, + "loss": 6.6879, + "step": 240 + }, + { + "epoch": 0.004984537760416667, + "grad_norm": 19.32349967956543, + "learning_rate": 9.999986565698432e-06, + "loss": 6.8833, + "step": 245 + }, + { + "epoch": 0.005086263020833333, + "grad_norm": 16.72201156616211, + "learning_rate": 9.99998562322113e-06, + "loss": 6.966, + "step": 250 + }, + { + "epoch": 0.00518798828125, + "grad_norm": 20.089780807495117, + "learning_rate": 9.999984648795505e-06, + "loss": 6.8645, + "step": 255 + }, + { + "epoch": 0.005289713541666667, + "grad_norm": 17.048059463500977, + "learning_rate": 9.999983642421564e-06, + "loss": 6.9131, + "step": 260 + }, + { + "epoch": 0.005391438802083333, + "grad_norm": 12.952704429626465, + "learning_rate": 9.999982604099317e-06, + "loss": 6.905, + "step": 265 + }, + { + "epoch": 0.0054931640625, + "grad_norm": 21.715478897094727, + "learning_rate": 9.999981533828766e-06, + "loss": 6.8938, + "step": 270 + }, + { + "epoch": 0.005594889322916667, + "grad_norm": 21.554119110107422, + "learning_rate": 9.999980431609922e-06, + "loss": 6.7301, + "step": 275 + }, + { + "epoch": 0.005696614583333333, + "grad_norm": 19.184682846069336, + "learning_rate": 9.99997929744279e-06, + "loss": 6.8108, + "step": 280 + }, + { + "epoch": 0.00579833984375, + "grad_norm": 12.472123146057129, + "learning_rate": 9.999978131327379e-06, + "loss": 6.7023, + "step": 285 + }, + { + "epoch": 0.005900065104166667, + "grad_norm": 11.925370216369629, + "learning_rate": 9.999976933263693e-06, + "loss": 6.5, + "step": 290 + }, + { + "epoch": 0.006001790364583333, + "grad_norm": 15.944580078125, + "learning_rate": 9.999975703251741e-06, + "loss": 6.7326, + "step": 295 + }, + { + "epoch": 0.006103515625, + "grad_norm": 14.497954368591309, + "learning_rate": 9.999974441291533e-06, + "loss": 6.9048, + "step": 300 + }, + { + "epoch": 0.006205240885416667, + "grad_norm": 14.118562698364258, + "learning_rate": 9.999973147383076e-06, + "loss": 6.5547, + "step": 305 + }, + { + "epoch": 0.006306966145833333, + "grad_norm": 19.02632713317871, + "learning_rate": 9.999971821526376e-06, + "loss": 6.6384, + "step": 310 + }, + { + "epoch": 0.00640869140625, + "grad_norm": 18.794261932373047, + "learning_rate": 9.999970463721443e-06, + "loss": 6.4575, + "step": 315 + }, + { + "epoch": 0.006510416666666667, + "grad_norm": 14.502382278442383, + "learning_rate": 9.999969073968288e-06, + "loss": 6.4677, + "step": 320 + }, + { + "epoch": 0.006612141927083333, + "grad_norm": 16.55919647216797, + "learning_rate": 9.999967652266917e-06, + "loss": 6.6987, + "step": 325 + }, + { + "epoch": 0.0067138671875, + "grad_norm": 15.21981143951416, + "learning_rate": 9.999966198617338e-06, + "loss": 6.8427, + "step": 330 + }, + { + "epoch": 0.006815592447916667, + "grad_norm": 20.04963493347168, + "learning_rate": 9.999964713019566e-06, + "loss": 6.7188, + "step": 335 + }, + { + "epoch": 0.006917317708333333, + "grad_norm": 13.341812133789062, + "learning_rate": 9.999963195473603e-06, + "loss": 6.5026, + "step": 340 + }, + { + "epoch": 0.00701904296875, + "grad_norm": 16.2296142578125, + "learning_rate": 9.999961645979465e-06, + "loss": 6.6436, + "step": 345 + }, + { + "epoch": 0.007120768229166667, + "grad_norm": 21.553808212280273, + "learning_rate": 9.999960064537158e-06, + "loss": 6.4794, + "step": 350 + }, + { + "epoch": 0.007222493489583333, + "grad_norm": 19.853740692138672, + "learning_rate": 9.999958451146694e-06, + "loss": 6.6756, + "step": 355 + }, + { + "epoch": 0.00732421875, + "grad_norm": 15.604897499084473, + "learning_rate": 9.999956805808083e-06, + "loss": 6.5316, + "step": 360 + }, + { + "epoch": 0.007425944010416667, + "grad_norm": 22.304800033569336, + "learning_rate": 9.999955128521333e-06, + "loss": 6.9055, + "step": 365 + }, + { + "epoch": 0.007527669270833333, + "grad_norm": 16.207674026489258, + "learning_rate": 9.99995341928646e-06, + "loss": 6.7588, + "step": 370 + }, + { + "epoch": 0.00762939453125, + "grad_norm": 16.2166805267334, + "learning_rate": 9.99995167810347e-06, + "loss": 6.3506, + "step": 375 + }, + { + "epoch": 0.007731119791666667, + "grad_norm": 17.854827880859375, + "learning_rate": 9.999949904972376e-06, + "loss": 6.5435, + "step": 380 + }, + { + "epoch": 0.007832845052083334, + "grad_norm": 20.180625915527344, + "learning_rate": 9.99994809989319e-06, + "loss": 6.695, + "step": 385 + }, + { + "epoch": 0.0079345703125, + "grad_norm": 18.95235824584961, + "learning_rate": 9.999946262865921e-06, + "loss": 6.5806, + "step": 390 + }, + { + "epoch": 0.008036295572916666, + "grad_norm": 20.12158966064453, + "learning_rate": 9.999944393890584e-06, + "loss": 6.6697, + "step": 395 + }, + { + "epoch": 0.008138020833333334, + "grad_norm": 15.898601531982422, + "learning_rate": 9.999942492967189e-06, + "loss": 6.8135, + "step": 400 + }, + { + "epoch": 0.00823974609375, + "grad_norm": 20.538938522338867, + "learning_rate": 9.99994056009575e-06, + "loss": 7.0004, + "step": 405 + }, + { + "epoch": 0.008341471354166666, + "grad_norm": 16.172489166259766, + "learning_rate": 9.999938595276278e-06, + "loss": 6.5813, + "step": 410 + }, + { + "epoch": 0.008443196614583334, + "grad_norm": 13.720282554626465, + "learning_rate": 9.999936598508784e-06, + "loss": 6.4192, + "step": 415 + }, + { + "epoch": 0.008544921875, + "grad_norm": 14.712517738342285, + "learning_rate": 9.999934569793284e-06, + "loss": 6.3489, + "step": 420 + }, + { + "epoch": 0.008646647135416666, + "grad_norm": 30.682470321655273, + "learning_rate": 9.999932509129786e-06, + "loss": 6.4758, + "step": 425 + }, + { + "epoch": 0.008748372395833334, + "grad_norm": 16.026031494140625, + "learning_rate": 9.99993041651831e-06, + "loss": 6.5035, + "step": 430 + }, + { + "epoch": 0.00885009765625, + "grad_norm": 16.791662216186523, + "learning_rate": 9.999928291958864e-06, + "loss": 6.5171, + "step": 435 + }, + { + "epoch": 0.008951822916666666, + "grad_norm": 17.777681350708008, + "learning_rate": 9.999926135451466e-06, + "loss": 6.2574, + "step": 440 + }, + { + "epoch": 0.009053548177083334, + "grad_norm": 21.421998977661133, + "learning_rate": 9.999923946996123e-06, + "loss": 6.6252, + "step": 445 + }, + { + "epoch": 0.0091552734375, + "grad_norm": 19.111989974975586, + "learning_rate": 9.999921726592856e-06, + "loss": 6.2462, + "step": 450 + }, + { + "epoch": 0.009256998697916666, + "grad_norm": 16.37298011779785, + "learning_rate": 9.999919474241678e-06, + "loss": 6.3921, + "step": 455 + }, + { + "epoch": 0.009358723958333334, + "grad_norm": 13.511612892150879, + "learning_rate": 9.999917189942599e-06, + "loss": 6.3154, + "step": 460 + }, + { + "epoch": 0.00946044921875, + "grad_norm": 17.39017677307129, + "learning_rate": 9.999914873695636e-06, + "loss": 6.5523, + "step": 465 + }, + { + "epoch": 0.009562174479166666, + "grad_norm": 14.11750316619873, + "learning_rate": 9.999912525500806e-06, + "loss": 6.4333, + "step": 470 + }, + { + "epoch": 0.009663899739583334, + "grad_norm": 31.548959732055664, + "learning_rate": 9.999910145358121e-06, + "loss": 6.5185, + "step": 475 + }, + { + "epoch": 0.009765625, + "grad_norm": 17.383716583251953, + "learning_rate": 9.999907733267596e-06, + "loss": 6.3272, + "step": 480 + }, + { + "epoch": 0.009867350260416666, + "grad_norm": 15.274198532104492, + "learning_rate": 9.99990528922925e-06, + "loss": 6.2926, + "step": 485 + }, + { + "epoch": 0.009969075520833334, + "grad_norm": 15.915552139282227, + "learning_rate": 9.999902813243096e-06, + "loss": 6.4007, + "step": 490 + }, + { + "epoch": 0.01007080078125, + "grad_norm": 14.228730201721191, + "learning_rate": 9.999900305309149e-06, + "loss": 6.3022, + "step": 495 + }, + { + "epoch": 0.010172526041666666, + "grad_norm": 13.360937118530273, + "learning_rate": 9.999897765427427e-06, + "loss": 6.0019, + "step": 500 + }, + { + "epoch": 0.010274251302083334, + "grad_norm": 18.921798706054688, + "learning_rate": 9.999895193597946e-06, + "loss": 6.3684, + "step": 505 + }, + { + "epoch": 0.0103759765625, + "grad_norm": 20.283798217773438, + "learning_rate": 9.99989258982072e-06, + "loss": 6.6517, + "step": 510 + }, + { + "epoch": 0.010477701822916666, + "grad_norm": 17.526382446289062, + "learning_rate": 9.999889954095771e-06, + "loss": 6.3969, + "step": 515 + }, + { + "epoch": 0.010579427083333334, + "grad_norm": 15.044032096862793, + "learning_rate": 9.99988728642311e-06, + "loss": 6.4389, + "step": 520 + }, + { + "epoch": 0.01068115234375, + "grad_norm": 16.624927520751953, + "learning_rate": 9.999884586802757e-06, + "loss": 6.5768, + "step": 525 + }, + { + "epoch": 0.010782877604166666, + "grad_norm": 14.993972778320312, + "learning_rate": 9.999881855234727e-06, + "loss": 6.4086, + "step": 530 + }, + { + "epoch": 0.010884602864583334, + "grad_norm": 15.304669380187988, + "learning_rate": 9.999879091719043e-06, + "loss": 6.4951, + "step": 535 + }, + { + "epoch": 0.010986328125, + "grad_norm": 17.68439483642578, + "learning_rate": 9.999876296255714e-06, + "loss": 6.6295, + "step": 540 + }, + { + "epoch": 0.011088053385416666, + "grad_norm": 16.611164093017578, + "learning_rate": 9.999873468844766e-06, + "loss": 6.39, + "step": 545 + }, + { + "epoch": 0.011189778645833334, + "grad_norm": 17.16684913635254, + "learning_rate": 9.999870609486213e-06, + "loss": 6.3262, + "step": 550 + }, + { + "epoch": 0.01129150390625, + "grad_norm": 13.845763206481934, + "learning_rate": 9.999867718180075e-06, + "loss": 6.4096, + "step": 555 + }, + { + "epoch": 0.011393229166666666, + "grad_norm": 15.110551834106445, + "learning_rate": 9.999864794926366e-06, + "loss": 6.4421, + "step": 560 + }, + { + "epoch": 0.011494954427083334, + "grad_norm": 14.663053512573242, + "learning_rate": 9.999861839725112e-06, + "loss": 6.6888, + "step": 565 + }, + { + "epoch": 0.0115966796875, + "grad_norm": 22.608501434326172, + "learning_rate": 9.999858852576325e-06, + "loss": 6.1774, + "step": 570 + }, + { + "epoch": 0.011698404947916666, + "grad_norm": 15.32004451751709, + "learning_rate": 9.99985583348003e-06, + "loss": 6.6978, + "step": 575 + }, + { + "epoch": 0.011800130208333334, + "grad_norm": 15.750069618225098, + "learning_rate": 9.999852782436242e-06, + "loss": 6.4021, + "step": 580 + }, + { + "epoch": 0.01190185546875, + "grad_norm": 17.4362735748291, + "learning_rate": 9.99984969944498e-06, + "loss": 6.4015, + "step": 585 + }, + { + "epoch": 0.012003580729166666, + "grad_norm": 19.504657745361328, + "learning_rate": 9.999846584506267e-06, + "loss": 6.2446, + "step": 590 + }, + { + "epoch": 0.012105305989583334, + "grad_norm": 15.031131744384766, + "learning_rate": 9.999843437620122e-06, + "loss": 6.5576, + "step": 595 + }, + { + "epoch": 0.01220703125, + "grad_norm": 20.078384399414062, + "learning_rate": 9.999840258786565e-06, + "loss": 6.4641, + "step": 600 + }, + { + "epoch": 0.012308756510416666, + "grad_norm": 20.882177352905273, + "learning_rate": 9.999837048005614e-06, + "loss": 6.3432, + "step": 605 + }, + { + "epoch": 0.012410481770833334, + "grad_norm": 29.77153205871582, + "learning_rate": 9.999833805277293e-06, + "loss": 6.2118, + "step": 610 + }, + { + "epoch": 0.01251220703125, + "grad_norm": 17.335845947265625, + "learning_rate": 9.999830530601622e-06, + "loss": 6.3612, + "step": 615 + }, + { + "epoch": 0.012613932291666666, + "grad_norm": 20.453691482543945, + "learning_rate": 9.99982722397862e-06, + "loss": 6.1359, + "step": 620 + }, + { + "epoch": 0.012715657552083334, + "grad_norm": 15.832781791687012, + "learning_rate": 9.999823885408309e-06, + "loss": 6.1994, + "step": 625 + }, + { + "epoch": 0.0128173828125, + "grad_norm": 14.899689674377441, + "learning_rate": 9.99982051489071e-06, + "loss": 5.9086, + "step": 630 + }, + { + "epoch": 0.012919108072916666, + "grad_norm": 17.79781150817871, + "learning_rate": 9.999817112425845e-06, + "loss": 6.0511, + "step": 635 + }, + { + "epoch": 0.013020833333333334, + "grad_norm": 21.759098052978516, + "learning_rate": 9.999813678013737e-06, + "loss": 6.182, + "step": 640 + }, + { + "epoch": 0.01312255859375, + "grad_norm": 14.50014877319336, + "learning_rate": 9.999810211654405e-06, + "loss": 6.3663, + "step": 645 + }, + { + "epoch": 0.013224283854166666, + "grad_norm": 22.91578483581543, + "learning_rate": 9.999806713347875e-06, + "loss": 6.1381, + "step": 650 + }, + { + "epoch": 0.013326009114583334, + "grad_norm": 15.962817192077637, + "learning_rate": 9.999803183094166e-06, + "loss": 6.3089, + "step": 655 + }, + { + "epoch": 0.013427734375, + "grad_norm": 18.82858657836914, + "learning_rate": 9.999799620893302e-06, + "loss": 6.2528, + "step": 660 + }, + { + "epoch": 0.013529459635416666, + "grad_norm": 16.326257705688477, + "learning_rate": 9.999796026745307e-06, + "loss": 5.9086, + "step": 665 + }, + { + "epoch": 0.013631184895833334, + "grad_norm": 16.926494598388672, + "learning_rate": 9.999792400650201e-06, + "loss": 6.3107, + "step": 670 + }, + { + "epoch": 0.01373291015625, + "grad_norm": 21.313989639282227, + "learning_rate": 9.999788742608009e-06, + "loss": 6.4905, + "step": 675 + }, + { + "epoch": 0.013834635416666666, + "grad_norm": 15.717494010925293, + "learning_rate": 9.999785052618753e-06, + "loss": 6.1815, + "step": 680 + }, + { + "epoch": 0.013936360677083334, + "grad_norm": 16.92892074584961, + "learning_rate": 9.99978133068246e-06, + "loss": 6.4027, + "step": 685 + }, + { + "epoch": 0.0140380859375, + "grad_norm": 16.64455795288086, + "learning_rate": 9.99977757679915e-06, + "loss": 6.155, + "step": 690 + }, + { + "epoch": 0.014139811197916666, + "grad_norm": 17.005037307739258, + "learning_rate": 9.99977379096885e-06, + "loss": 6.3129, + "step": 695 + }, + { + "epoch": 0.014241536458333334, + "grad_norm": 15.138684272766113, + "learning_rate": 9.99976997319158e-06, + "loss": 6.1038, + "step": 700 + }, + { + "epoch": 0.01434326171875, + "grad_norm": 15.471761703491211, + "learning_rate": 9.999766123467369e-06, + "loss": 6.2372, + "step": 705 + }, + { + "epoch": 0.014444986979166666, + "grad_norm": 19.317546844482422, + "learning_rate": 9.999762241796237e-06, + "loss": 6.2086, + "step": 710 + }, + { + "epoch": 0.014546712239583334, + "grad_norm": 12.711938858032227, + "learning_rate": 9.999758328178213e-06, + "loss": 6.3789, + "step": 715 + }, + { + "epoch": 0.0146484375, + "grad_norm": 15.725773811340332, + "learning_rate": 9.999754382613323e-06, + "loss": 6.1679, + "step": 720 + }, + { + "epoch": 0.014750162760416666, + "grad_norm": 13.663782119750977, + "learning_rate": 9.999750405101586e-06, + "loss": 6.2403, + "step": 725 + }, + { + "epoch": 0.014851888020833334, + "grad_norm": 18.41815185546875, + "learning_rate": 9.999746395643033e-06, + "loss": 6.0896, + "step": 730 + }, + { + "epoch": 0.01495361328125, + "grad_norm": 14.879762649536133, + "learning_rate": 9.999742354237686e-06, + "loss": 6.1011, + "step": 735 + }, + { + "epoch": 0.015055338541666666, + "grad_norm": 22.801156997680664, + "learning_rate": 9.999738280885574e-06, + "loss": 6.317, + "step": 740 + }, + { + "epoch": 0.015157063802083334, + "grad_norm": 19.429927825927734, + "learning_rate": 9.999734175586721e-06, + "loss": 6.3397, + "step": 745 + }, + { + "epoch": 0.0152587890625, + "grad_norm": 17.059511184692383, + "learning_rate": 9.999730038341154e-06, + "loss": 6.23, + "step": 750 + }, + { + "epoch": 0.015360514322916666, + "grad_norm": 21.643585205078125, + "learning_rate": 9.9997258691489e-06, + "loss": 6.4206, + "step": 755 + }, + { + "epoch": 0.015462239583333334, + "grad_norm": 16.705398559570312, + "learning_rate": 9.999721668009984e-06, + "loss": 6.3106, + "step": 760 + }, + { + "epoch": 0.01556396484375, + "grad_norm": 22.098966598510742, + "learning_rate": 9.999717434924432e-06, + "loss": 5.8572, + "step": 765 + }, + { + "epoch": 0.015665690104166668, + "grad_norm": 15.450386047363281, + "learning_rate": 9.999713169892274e-06, + "loss": 6.2335, + "step": 770 + }, + { + "epoch": 0.015767415364583332, + "grad_norm": 15.822566032409668, + "learning_rate": 9.999708872913536e-06, + "loss": 6.0241, + "step": 775 + }, + { + "epoch": 0.015869140625, + "grad_norm": 15.075798034667969, + "learning_rate": 9.999704543988246e-06, + "loss": 5.9697, + "step": 780 + }, + { + "epoch": 0.015970865885416668, + "grad_norm": 20.23349952697754, + "learning_rate": 9.999700183116431e-06, + "loss": 6.1399, + "step": 785 + }, + { + "epoch": 0.016072591145833332, + "grad_norm": 14.411809921264648, + "learning_rate": 9.999695790298118e-06, + "loss": 6.1458, + "step": 790 + }, + { + "epoch": 0.01617431640625, + "grad_norm": 19.720853805541992, + "learning_rate": 9.999691365533337e-06, + "loss": 6.1918, + "step": 795 + }, + { + "epoch": 0.016276041666666668, + "grad_norm": 12.635454177856445, + "learning_rate": 9.999686908822115e-06, + "loss": 6.3008, + "step": 800 + }, + { + "epoch": 0.016377766927083332, + "grad_norm": 15.821192741394043, + "learning_rate": 9.99968242016448e-06, + "loss": 6.1527, + "step": 805 + }, + { + "epoch": 0.0164794921875, + "grad_norm": 14.938788414001465, + "learning_rate": 9.999677899560463e-06, + "loss": 6.3417, + "step": 810 + }, + { + "epoch": 0.016581217447916668, + "grad_norm": 13.912782669067383, + "learning_rate": 9.999673347010091e-06, + "loss": 6.212, + "step": 815 + }, + { + "epoch": 0.016682942708333332, + "grad_norm": 14.625290870666504, + "learning_rate": 9.999668762513392e-06, + "loss": 6.0772, + "step": 820 + }, + { + "epoch": 0.01678466796875, + "grad_norm": 19.16244125366211, + "learning_rate": 9.999664146070398e-06, + "loss": 5.8604, + "step": 825 + }, + { + "epoch": 0.016886393229166668, + "grad_norm": 17.346038818359375, + "learning_rate": 9.999659497681135e-06, + "loss": 6.222, + "step": 830 + }, + { + "epoch": 0.016988118489583332, + "grad_norm": 21.519062042236328, + "learning_rate": 9.99965481734564e-06, + "loss": 6.3716, + "step": 835 + }, + { + "epoch": 0.01708984375, + "grad_norm": 16.477502822875977, + "learning_rate": 9.999650105063932e-06, + "loss": 5.7144, + "step": 840 + }, + { + "epoch": 0.017191569010416668, + "grad_norm": 17.213186264038086, + "learning_rate": 9.99964536083605e-06, + "loss": 6.3174, + "step": 845 + }, + { + "epoch": 0.017293294270833332, + "grad_norm": 20.125654220581055, + "learning_rate": 9.99964058466202e-06, + "loss": 6.1869, + "step": 850 + }, + { + "epoch": 0.01739501953125, + "grad_norm": 20.213478088378906, + "learning_rate": 9.999635776541875e-06, + "loss": 6.2209, + "step": 855 + }, + { + "epoch": 0.017496744791666668, + "grad_norm": 14.639833450317383, + "learning_rate": 9.999630936475644e-06, + "loss": 6.1745, + "step": 860 + }, + { + "epoch": 0.017598470052083332, + "grad_norm": 16.572776794433594, + "learning_rate": 9.999626064463359e-06, + "loss": 6.4877, + "step": 865 + }, + { + "epoch": 0.0177001953125, + "grad_norm": 20.911428451538086, + "learning_rate": 9.999621160505048e-06, + "loss": 6.0936, + "step": 870 + }, + { + "epoch": 0.017801920572916668, + "grad_norm": 21.864120483398438, + "learning_rate": 9.999616224600747e-06, + "loss": 5.9036, + "step": 875 + }, + { + "epoch": 0.017903645833333332, + "grad_norm": 16.322891235351562, + "learning_rate": 9.999611256750487e-06, + "loss": 6.1985, + "step": 880 + }, + { + "epoch": 0.01800537109375, + "grad_norm": 19.20863914489746, + "learning_rate": 9.999606256954295e-06, + "loss": 6.1189, + "step": 885 + }, + { + "epoch": 0.018107096354166668, + "grad_norm": 16.85309410095215, + "learning_rate": 9.999601225212208e-06, + "loss": 5.8693, + "step": 890 + }, + { + "epoch": 0.018208821614583332, + "grad_norm": 18.39799690246582, + "learning_rate": 9.999596161524256e-06, + "loss": 5.9533, + "step": 895 + }, + { + "epoch": 0.018310546875, + "grad_norm": 16.06209373474121, + "learning_rate": 9.99959106589047e-06, + "loss": 5.8551, + "step": 900 + }, + { + "epoch": 0.018412272135416668, + "grad_norm": 13.610026359558105, + "learning_rate": 9.999585938310887e-06, + "loss": 5.8387, + "step": 905 + }, + { + "epoch": 0.018513997395833332, + "grad_norm": 14.547969818115234, + "learning_rate": 9.999580778785536e-06, + "loss": 6.3802, + "step": 910 + }, + { + "epoch": 0.01861572265625, + "grad_norm": 17.78799057006836, + "learning_rate": 9.99957558731445e-06, + "loss": 5.9853, + "step": 915 + }, + { + "epoch": 0.018717447916666668, + "grad_norm": 14.51143741607666, + "learning_rate": 9.999570363897664e-06, + "loss": 6.2408, + "step": 920 + }, + { + "epoch": 0.018819173177083332, + "grad_norm": 14.90191650390625, + "learning_rate": 9.99956510853521e-06, + "loss": 6.1126, + "step": 925 + }, + { + "epoch": 0.0189208984375, + "grad_norm": 14.731975555419922, + "learning_rate": 9.99955982122712e-06, + "loss": 6.5161, + "step": 930 + }, + { + "epoch": 0.019022623697916668, + "grad_norm": 23.994226455688477, + "learning_rate": 9.999554501973431e-06, + "loss": 6.1387, + "step": 935 + }, + { + "epoch": 0.019124348958333332, + "grad_norm": 20.015348434448242, + "learning_rate": 9.999549150774177e-06, + "loss": 5.9758, + "step": 940 + }, + { + "epoch": 0.01922607421875, + "grad_norm": 16.70435333251953, + "learning_rate": 9.999543767629391e-06, + "loss": 6.0314, + "step": 945 + }, + { + "epoch": 0.019327799479166668, + "grad_norm": 13.971758842468262, + "learning_rate": 9.999538352539108e-06, + "loss": 6.1443, + "step": 950 + }, + { + "epoch": 0.019429524739583332, + "grad_norm": 14.015809059143066, + "learning_rate": 9.99953290550336e-06, + "loss": 6.3802, + "step": 955 + }, + { + "epoch": 0.01953125, + "grad_norm": 26.053682327270508, + "learning_rate": 9.999527426522186e-06, + "loss": 6.0565, + "step": 960 + }, + { + "epoch": 0.019632975260416668, + "grad_norm": 19.186412811279297, + "learning_rate": 9.999521915595616e-06, + "loss": 6.1678, + "step": 965 + }, + { + "epoch": 0.019734700520833332, + "grad_norm": 16.644746780395508, + "learning_rate": 9.999516372723689e-06, + "loss": 6.1748, + "step": 970 + }, + { + "epoch": 0.01983642578125, + "grad_norm": 15.482766151428223, + "learning_rate": 9.999510797906441e-06, + "loss": 6.1224, + "step": 975 + }, + { + "epoch": 0.019938151041666668, + "grad_norm": 17.60553550720215, + "learning_rate": 9.999505191143906e-06, + "loss": 6.0821, + "step": 980 + }, + { + "epoch": 0.020039876302083332, + "grad_norm": 20.658655166625977, + "learning_rate": 9.999499552436118e-06, + "loss": 6.2336, + "step": 985 + }, + { + "epoch": 0.0201416015625, + "grad_norm": 38.21339416503906, + "learning_rate": 9.999493881783115e-06, + "loss": 6.1883, + "step": 990 + }, + { + "epoch": 0.020243326822916668, + "grad_norm": 17.47433853149414, + "learning_rate": 9.999488179184935e-06, + "loss": 6.0696, + "step": 995 + }, + { + "epoch": 0.020345052083333332, + "grad_norm": 15.066445350646973, + "learning_rate": 9.999482444641611e-06, + "loss": 5.8153, + "step": 1000 + }, + { + "epoch": 0.02044677734375, + "grad_norm": 18.611103057861328, + "learning_rate": 9.999476678153181e-06, + "loss": 6.2162, + "step": 1005 + }, + { + "epoch": 0.020548502604166668, + "grad_norm": 21.6483154296875, + "learning_rate": 9.999470879719684e-06, + "loss": 5.8706, + "step": 1010 + }, + { + "epoch": 0.020650227864583332, + "grad_norm": 16.52705955505371, + "learning_rate": 9.999465049341156e-06, + "loss": 6.0938, + "step": 1015 + }, + { + "epoch": 0.020751953125, + "grad_norm": 14.821548461914062, + "learning_rate": 9.99945918701763e-06, + "loss": 5.9386, + "step": 1020 + }, + { + "epoch": 0.020853678385416668, + "grad_norm": 13.421024322509766, + "learning_rate": 9.999453292749148e-06, + "loss": 5.8982, + "step": 1025 + }, + { + "epoch": 0.020955403645833332, + "grad_norm": 15.458352088928223, + "learning_rate": 9.999447366535745e-06, + "loss": 5.8866, + "step": 1030 + }, + { + "epoch": 0.02105712890625, + "grad_norm": 12.253440856933594, + "learning_rate": 9.999441408377464e-06, + "loss": 6.0773, + "step": 1035 + }, + { + "epoch": 0.021158854166666668, + "grad_norm": 15.976445198059082, + "learning_rate": 9.999435418274338e-06, + "loss": 6.0333, + "step": 1040 + }, + { + "epoch": 0.021260579427083332, + "grad_norm": 15.615504264831543, + "learning_rate": 9.999429396226406e-06, + "loss": 6.1585, + "step": 1045 + }, + { + "epoch": 0.0213623046875, + "grad_norm": 19.31055450439453, + "learning_rate": 9.999423342233707e-06, + "loss": 5.9945, + "step": 1050 + }, + { + "epoch": 0.021464029947916668, + "grad_norm": 17.401948928833008, + "learning_rate": 9.99941725629628e-06, + "loss": 5.7312, + "step": 1055 + }, + { + "epoch": 0.021565755208333332, + "grad_norm": 16.580659866333008, + "learning_rate": 9.999411138414163e-06, + "loss": 5.9958, + "step": 1060 + }, + { + "epoch": 0.02166748046875, + "grad_norm": 18.02301025390625, + "learning_rate": 9.999404988587398e-06, + "loss": 6.4218, + "step": 1065 + }, + { + "epoch": 0.021769205729166668, + "grad_norm": 18.302724838256836, + "learning_rate": 9.99939880681602e-06, + "loss": 5.9143, + "step": 1070 + }, + { + "epoch": 0.021870930989583332, + "grad_norm": 14.602520942687988, + "learning_rate": 9.999392593100072e-06, + "loss": 6.1716, + "step": 1075 + }, + { + "epoch": 0.02197265625, + "grad_norm": 16.554738998413086, + "learning_rate": 9.999386347439592e-06, + "loss": 6.1264, + "step": 1080 + }, + { + "epoch": 0.022074381510416668, + "grad_norm": 17.32607078552246, + "learning_rate": 9.99938006983462e-06, + "loss": 6.0934, + "step": 1085 + }, + { + "epoch": 0.022176106770833332, + "grad_norm": 17.626110076904297, + "learning_rate": 9.999373760285199e-06, + "loss": 5.8736, + "step": 1090 + }, + { + "epoch": 0.02227783203125, + "grad_norm": 17.10244369506836, + "learning_rate": 9.999367418791364e-06, + "loss": 5.7691, + "step": 1095 + }, + { + "epoch": 0.022379557291666668, + "grad_norm": 21.617374420166016, + "learning_rate": 9.999361045353157e-06, + "loss": 6.1331, + "step": 1100 + }, + { + "epoch": 0.022481282552083332, + "grad_norm": 12.846638679504395, + "learning_rate": 9.999354639970622e-06, + "loss": 6.2107, + "step": 1105 + }, + { + "epoch": 0.0225830078125, + "grad_norm": 15.556273460388184, + "learning_rate": 9.999348202643797e-06, + "loss": 6.0038, + "step": 1110 + }, + { + "epoch": 0.022684733072916668, + "grad_norm": 17.417882919311523, + "learning_rate": 9.999341733372725e-06, + "loss": 5.8668, + "step": 1115 + }, + { + "epoch": 0.022786458333333332, + "grad_norm": 18.81239128112793, + "learning_rate": 9.999335232157445e-06, + "loss": 6.1866, + "step": 1120 + }, + { + "epoch": 0.02288818359375, + "grad_norm": 17.713916778564453, + "learning_rate": 9.999328698998001e-06, + "loss": 5.9185, + "step": 1125 + }, + { + "epoch": 0.022989908854166668, + "grad_norm": 18.960283279418945, + "learning_rate": 9.999322133894433e-06, + "loss": 6.3169, + "step": 1130 + }, + { + "epoch": 0.023091634114583332, + "grad_norm": 16.75794219970703, + "learning_rate": 9.999315536846784e-06, + "loss": 6.0566, + "step": 1135 + }, + { + "epoch": 0.023193359375, + "grad_norm": 16.383392333984375, + "learning_rate": 9.999308907855095e-06, + "loss": 5.9156, + "step": 1140 + }, + { + "epoch": 0.023295084635416668, + "grad_norm": 13.930533409118652, + "learning_rate": 9.99930224691941e-06, + "loss": 6.0214, + "step": 1145 + }, + { + "epoch": 0.023396809895833332, + "grad_norm": 16.63954734802246, + "learning_rate": 9.999295554039769e-06, + "loss": 5.8897, + "step": 1150 + }, + { + "epoch": 0.02349853515625, + "grad_norm": 14.844721794128418, + "learning_rate": 9.999288829216219e-06, + "loss": 5.9586, + "step": 1155 + }, + { + "epoch": 0.023600260416666668, + "grad_norm": 15.012185096740723, + "learning_rate": 9.999282072448798e-06, + "loss": 5.9742, + "step": 1160 + }, + { + "epoch": 0.023701985677083332, + "grad_norm": 16.14036750793457, + "learning_rate": 9.999275283737553e-06, + "loss": 6.1716, + "step": 1165 + }, + { + "epoch": 0.0238037109375, + "grad_norm": 20.422229766845703, + "learning_rate": 9.999268463082526e-06, + "loss": 6.2924, + "step": 1170 + }, + { + "epoch": 0.023905436197916668, + "grad_norm": 18.413963317871094, + "learning_rate": 9.99926161048376e-06, + "loss": 5.9577, + "step": 1175 + }, + { + "epoch": 0.024007161458333332, + "grad_norm": 16.484912872314453, + "learning_rate": 9.999254725941299e-06, + "loss": 6.0114, + "step": 1180 + }, + { + "epoch": 0.02410888671875, + "grad_norm": 17.431936264038086, + "learning_rate": 9.999247809455185e-06, + "loss": 6.3848, + "step": 1185 + }, + { + "epoch": 0.024210611979166668, + "grad_norm": 16.671167373657227, + "learning_rate": 9.99924086102547e-06, + "loss": 5.9693, + "step": 1190 + }, + { + "epoch": 0.024312337239583332, + "grad_norm": 21.112850189208984, + "learning_rate": 9.999233880652187e-06, + "loss": 6.0038, + "step": 1195 + }, + { + "epoch": 0.0244140625, + "grad_norm": 16.35606575012207, + "learning_rate": 9.99922686833539e-06, + "loss": 5.9396, + "step": 1200 + }, + { + "epoch": 0.024515787760416668, + "grad_norm": 15.174341201782227, + "learning_rate": 9.99921982407512e-06, + "loss": 5.905, + "step": 1205 + }, + { + "epoch": 0.024617513020833332, + "grad_norm": 14.521608352661133, + "learning_rate": 9.999212747871422e-06, + "loss": 5.8563, + "step": 1210 + }, + { + "epoch": 0.02471923828125, + "grad_norm": 15.280231475830078, + "learning_rate": 9.999205639724342e-06, + "loss": 6.3178, + "step": 1215 + }, + { + "epoch": 0.024820963541666668, + "grad_norm": 18.43303871154785, + "learning_rate": 9.999198499633924e-06, + "loss": 6.2973, + "step": 1220 + }, + { + "epoch": 0.024922688802083332, + "grad_norm": 14.526780128479004, + "learning_rate": 9.999191327600215e-06, + "loss": 6.2462, + "step": 1225 + }, + { + "epoch": 0.0250244140625, + "grad_norm": 15.445826530456543, + "learning_rate": 9.99918412362326e-06, + "loss": 5.8172, + "step": 1230 + }, + { + "epoch": 0.025126139322916668, + "grad_norm": 16.775989532470703, + "learning_rate": 9.999176887703106e-06, + "loss": 6.171, + "step": 1235 + }, + { + "epoch": 0.025227864583333332, + "grad_norm": 15.464823722839355, + "learning_rate": 9.999169619839798e-06, + "loss": 6.1699, + "step": 1240 + }, + { + "epoch": 0.02532958984375, + "grad_norm": 20.61041831970215, + "learning_rate": 9.999162320033383e-06, + "loss": 6.0408, + "step": 1245 + }, + { + "epoch": 0.025431315104166668, + "grad_norm": 16.18393325805664, + "learning_rate": 9.999154988283907e-06, + "loss": 5.9139, + "step": 1250 + }, + { + "epoch": 0.025533040364583332, + "grad_norm": 30.69072914123535, + "learning_rate": 9.999147624591418e-06, + "loss": 5.8728, + "step": 1255 + }, + { + "epoch": 0.025634765625, + "grad_norm": 16.119518280029297, + "learning_rate": 9.999140228955965e-06, + "loss": 6.0996, + "step": 1260 + }, + { + "epoch": 0.025736490885416668, + "grad_norm": 13.239355087280273, + "learning_rate": 9.999132801377592e-06, + "loss": 5.9002, + "step": 1265 + }, + { + "epoch": 0.025838216145833332, + "grad_norm": 12.750971794128418, + "learning_rate": 9.999125341856345e-06, + "loss": 6.0066, + "step": 1270 + }, + { + "epoch": 0.02593994140625, + "grad_norm": 16.84983253479004, + "learning_rate": 9.999117850392275e-06, + "loss": 6.1884, + "step": 1275 + }, + { + "epoch": 0.026041666666666668, + "grad_norm": 16.963289260864258, + "learning_rate": 9.999110326985429e-06, + "loss": 5.8593, + "step": 1280 + }, + { + "epoch": 0.026143391927083332, + "grad_norm": 18.115386962890625, + "learning_rate": 9.999102771635855e-06, + "loss": 6.1149, + "step": 1285 + }, + { + "epoch": 0.0262451171875, + "grad_norm": 20.03508186340332, + "learning_rate": 9.999095184343602e-06, + "loss": 5.9639, + "step": 1290 + }, + { + "epoch": 0.026346842447916668, + "grad_norm": 14.980121612548828, + "learning_rate": 9.999087565108719e-06, + "loss": 6.3128, + "step": 1295 + }, + { + "epoch": 0.026448567708333332, + "grad_norm": 19.49420738220215, + "learning_rate": 9.99907991393125e-06, + "loss": 6.197, + "step": 1300 + }, + { + "epoch": 0.02655029296875, + "grad_norm": 28.621410369873047, + "learning_rate": 9.99907223081125e-06, + "loss": 6.2056, + "step": 1305 + }, + { + "epoch": 0.026652018229166668, + "grad_norm": 14.60916805267334, + "learning_rate": 9.999064515748763e-06, + "loss": 5.9169, + "step": 1310 + }, + { + "epoch": 0.026753743489583332, + "grad_norm": 14.234273910522461, + "learning_rate": 9.999056768743842e-06, + "loss": 6.1318, + "step": 1315 + }, + { + "epoch": 0.02685546875, + "grad_norm": 22.264732360839844, + "learning_rate": 9.999048989796535e-06, + "loss": 5.9674, + "step": 1320 + }, + { + "epoch": 0.026957194010416668, + "grad_norm": 13.790782928466797, + "learning_rate": 9.999041178906891e-06, + "loss": 5.7764, + "step": 1325 + }, + { + "epoch": 0.027058919270833332, + "grad_norm": 22.134410858154297, + "learning_rate": 9.999033336074963e-06, + "loss": 6.1886, + "step": 1330 + }, + { + "epoch": 0.02716064453125, + "grad_norm": 17.55847930908203, + "learning_rate": 9.999025461300798e-06, + "loss": 6.1673, + "step": 1335 + }, + { + "epoch": 0.027262369791666668, + "grad_norm": 13.48686695098877, + "learning_rate": 9.999017554584445e-06, + "loss": 5.886, + "step": 1340 + }, + { + "epoch": 0.027364095052083332, + "grad_norm": 19.61215591430664, + "learning_rate": 9.999009615925959e-06, + "loss": 5.8703, + "step": 1345 + }, + { + "epoch": 0.0274658203125, + "grad_norm": 13.262721061706543, + "learning_rate": 9.999001645325388e-06, + "loss": 6.2337, + "step": 1350 + }, + { + "epoch": 0.027567545572916668, + "grad_norm": 13.333744049072266, + "learning_rate": 9.998993642782785e-06, + "loss": 5.9605, + "step": 1355 + }, + { + "epoch": 0.027669270833333332, + "grad_norm": 11.15735149383545, + "learning_rate": 9.998985608298196e-06, + "loss": 6.251, + "step": 1360 + }, + { + "epoch": 0.02777099609375, + "grad_norm": 17.526168823242188, + "learning_rate": 9.998977541871677e-06, + "loss": 5.9318, + "step": 1365 + }, + { + "epoch": 0.027872721354166668, + "grad_norm": 18.90509605407715, + "learning_rate": 9.998969443503279e-06, + "loss": 5.9052, + "step": 1370 + }, + { + "epoch": 0.027974446614583332, + "grad_norm": 18.167953491210938, + "learning_rate": 9.998961313193053e-06, + "loss": 5.871, + "step": 1375 + }, + { + "epoch": 0.028076171875, + "grad_norm": 15.26009750366211, + "learning_rate": 9.998953150941053e-06, + "loss": 6.2639, + "step": 1380 + }, + { + "epoch": 0.028177897135416668, + "grad_norm": 18.214427947998047, + "learning_rate": 9.998944956747328e-06, + "loss": 6.0403, + "step": 1385 + }, + { + "epoch": 0.028279622395833332, + "grad_norm": 27.031484603881836, + "learning_rate": 9.99893673061193e-06, + "loss": 5.9385, + "step": 1390 + }, + { + "epoch": 0.02838134765625, + "grad_norm": 14.345611572265625, + "learning_rate": 9.998928472534914e-06, + "loss": 6.0399, + "step": 1395 + }, + { + "epoch": 0.028483072916666668, + "grad_norm": 18.153465270996094, + "learning_rate": 9.998920182516332e-06, + "loss": 6.1935, + "step": 1400 + }, + { + "epoch": 0.028584798177083332, + "grad_norm": 20.480792999267578, + "learning_rate": 9.998911860556237e-06, + "loss": 6.1623, + "step": 1405 + }, + { + "epoch": 0.0286865234375, + "grad_norm": 16.054752349853516, + "learning_rate": 9.998903506654682e-06, + "loss": 5.7629, + "step": 1410 + }, + { + "epoch": 0.028788248697916668, + "grad_norm": 15.317622184753418, + "learning_rate": 9.998895120811721e-06, + "loss": 5.7508, + "step": 1415 + }, + { + "epoch": 0.028889973958333332, + "grad_norm": 21.298158645629883, + "learning_rate": 9.998886703027407e-06, + "loss": 5.897, + "step": 1420 + }, + { + "epoch": 0.02899169921875, + "grad_norm": 16.57981300354004, + "learning_rate": 9.998878253301794e-06, + "loss": 5.9792, + "step": 1425 + }, + { + "epoch": 0.029093424479166668, + "grad_norm": 13.906112670898438, + "learning_rate": 9.998869771634937e-06, + "loss": 5.8832, + "step": 1430 + }, + { + "epoch": 0.029195149739583332, + "grad_norm": 21.147687911987305, + "learning_rate": 9.998861258026886e-06, + "loss": 5.9867, + "step": 1435 + }, + { + "epoch": 0.029296875, + "grad_norm": 17.143922805786133, + "learning_rate": 9.9988527124777e-06, + "loss": 5.7559, + "step": 1440 + }, + { + "epoch": 0.029398600260416668, + "grad_norm": 13.276544570922852, + "learning_rate": 9.998844134987431e-06, + "loss": 5.8972, + "step": 1445 + }, + { + "epoch": 0.029500325520833332, + "grad_norm": 12.775952339172363, + "learning_rate": 9.998835525556136e-06, + "loss": 5.9302, + "step": 1450 + }, + { + "epoch": 0.02960205078125, + "grad_norm": 14.441143035888672, + "learning_rate": 9.998826884183868e-06, + "loss": 5.7078, + "step": 1455 + }, + { + "epoch": 0.029703776041666668, + "grad_norm": 15.492350578308105, + "learning_rate": 9.998818210870684e-06, + "loss": 6.0574, + "step": 1460 + }, + { + "epoch": 0.029805501302083332, + "grad_norm": 17.265533447265625, + "learning_rate": 9.998809505616637e-06, + "loss": 5.7959, + "step": 1465 + }, + { + "epoch": 0.0299072265625, + "grad_norm": 17.465608596801758, + "learning_rate": 9.998800768421786e-06, + "loss": 6.0468, + "step": 1470 + }, + { + "epoch": 0.030008951822916668, + "grad_norm": 15.138232231140137, + "learning_rate": 9.998791999286183e-06, + "loss": 5.912, + "step": 1475 + }, + { + "epoch": 0.030110677083333332, + "grad_norm": 14.133085250854492, + "learning_rate": 9.998783198209887e-06, + "loss": 5.973, + "step": 1480 + }, + { + "epoch": 0.03021240234375, + "grad_norm": 15.911847114562988, + "learning_rate": 9.998774365192953e-06, + "loss": 6.0185, + "step": 1485 + }, + { + "epoch": 0.030314127604166668, + "grad_norm": 16.951932907104492, + "learning_rate": 9.998765500235438e-06, + "loss": 6.1561, + "step": 1490 + }, + { + "epoch": 0.030415852864583332, + "grad_norm": 14.356291770935059, + "learning_rate": 9.998756603337399e-06, + "loss": 6.192, + "step": 1495 + }, + { + "epoch": 0.030517578125, + "grad_norm": 20.593795776367188, + "learning_rate": 9.99874767449889e-06, + "loss": 5.9312, + "step": 1500 + }, + { + "epoch": 0.030619303385416668, + "grad_norm": 24.445053100585938, + "learning_rate": 9.998738713719974e-06, + "loss": 5.8045, + "step": 1505 + }, + { + "epoch": 0.030721028645833332, + "grad_norm": 19.065996170043945, + "learning_rate": 9.9987297210007e-06, + "loss": 6.0406, + "step": 1510 + }, + { + "epoch": 0.03082275390625, + "grad_norm": 14.275654792785645, + "learning_rate": 9.998720696341132e-06, + "loss": 5.6339, + "step": 1515 + }, + { + "epoch": 0.030924479166666668, + "grad_norm": 14.706521987915039, + "learning_rate": 9.998711639741325e-06, + "loss": 5.7789, + "step": 1520 + }, + { + "epoch": 0.031026204427083332, + "grad_norm": 16.471675872802734, + "learning_rate": 9.998702551201337e-06, + "loss": 6.0115, + "step": 1525 + }, + { + "epoch": 0.0311279296875, + "grad_norm": 16.330862045288086, + "learning_rate": 9.998693430721228e-06, + "loss": 5.8615, + "step": 1530 + }, + { + "epoch": 0.031229654947916668, + "grad_norm": 14.332329750061035, + "learning_rate": 9.998684278301055e-06, + "loss": 5.977, + "step": 1535 + }, + { + "epoch": 0.031331380208333336, + "grad_norm": 18.00435447692871, + "learning_rate": 9.998675093940875e-06, + "loss": 5.8445, + "step": 1540 + }, + { + "epoch": 0.03143310546875, + "grad_norm": 24.712127685546875, + "learning_rate": 9.998665877640748e-06, + "loss": 5.7904, + "step": 1545 + }, + { + "epoch": 0.031534830729166664, + "grad_norm": 21.074745178222656, + "learning_rate": 9.998656629400731e-06, + "loss": 6.2187, + "step": 1550 + }, + { + "epoch": 0.031636555989583336, + "grad_norm": 13.952010154724121, + "learning_rate": 9.998647349220888e-06, + "loss": 5.941, + "step": 1555 + }, + { + "epoch": 0.03173828125, + "grad_norm": 17.39000129699707, + "learning_rate": 9.998638037101273e-06, + "loss": 6.0845, + "step": 1560 + }, + { + "epoch": 0.031840006510416664, + "grad_norm": 11.351719856262207, + "learning_rate": 9.99862869304195e-06, + "loss": 5.8774, + "step": 1565 + }, + { + "epoch": 0.031941731770833336, + "grad_norm": 13.788360595703125, + "learning_rate": 9.998619317042975e-06, + "loss": 5.9659, + "step": 1570 + }, + { + "epoch": 0.03204345703125, + "grad_norm": 17.161029815673828, + "learning_rate": 9.99860990910441e-06, + "loss": 6.0288, + "step": 1575 + }, + { + "epoch": 0.032145182291666664, + "grad_norm": 22.317855834960938, + "learning_rate": 9.998600469226315e-06, + "loss": 5.8885, + "step": 1580 + }, + { + "epoch": 0.032246907552083336, + "grad_norm": 21.642566680908203, + "learning_rate": 9.998590997408748e-06, + "loss": 5.9475, + "step": 1585 + }, + { + "epoch": 0.0323486328125, + "grad_norm": 22.953271865844727, + "learning_rate": 9.998581493651771e-06, + "loss": 5.9559, + "step": 1590 + }, + { + "epoch": 0.032450358072916664, + "grad_norm": 18.098085403442383, + "learning_rate": 9.998571957955447e-06, + "loss": 6.0096, + "step": 1595 + }, + { + "epoch": 0.032552083333333336, + "grad_norm": 16.041545867919922, + "learning_rate": 9.998562390319835e-06, + "loss": 5.9652, + "step": 1600 + }, + { + "epoch": 0.03265380859375, + "grad_norm": 15.929896354675293, + "learning_rate": 9.998552790744995e-06, + "loss": 6.0827, + "step": 1605 + }, + { + "epoch": 0.032755533854166664, + "grad_norm": 14.19112491607666, + "learning_rate": 9.99854315923099e-06, + "loss": 6.2096, + "step": 1610 + }, + { + "epoch": 0.032857259114583336, + "grad_norm": 16.485393524169922, + "learning_rate": 9.99853349577788e-06, + "loss": 5.8476, + "step": 1615 + }, + { + "epoch": 0.032958984375, + "grad_norm": 20.552186965942383, + "learning_rate": 9.998523800385727e-06, + "loss": 6.0206, + "step": 1620 + }, + { + "epoch": 0.033060709635416664, + "grad_norm": 15.785804748535156, + "learning_rate": 9.998514073054595e-06, + "loss": 5.6012, + "step": 1625 + }, + { + "epoch": 0.033162434895833336, + "grad_norm": 18.969783782958984, + "learning_rate": 9.998504313784545e-06, + "loss": 5.643, + "step": 1630 + }, + { + "epoch": 0.03326416015625, + "grad_norm": 15.27829360961914, + "learning_rate": 9.99849452257564e-06, + "loss": 5.8633, + "step": 1635 + }, + { + "epoch": 0.033365885416666664, + "grad_norm": 14.593358039855957, + "learning_rate": 9.99848469942794e-06, + "loss": 5.8499, + "step": 1640 + }, + { + "epoch": 0.033467610677083336, + "grad_norm": 18.851600646972656, + "learning_rate": 9.99847484434151e-06, + "loss": 5.7298, + "step": 1645 + }, + { + "epoch": 0.0335693359375, + "grad_norm": 17.105985641479492, + "learning_rate": 9.998464957316412e-06, + "loss": 5.9334, + "step": 1650 + }, + { + "epoch": 0.033671061197916664, + "grad_norm": 15.112398147583008, + "learning_rate": 9.998455038352711e-06, + "loss": 5.8104, + "step": 1655 + }, + { + "epoch": 0.033772786458333336, + "grad_norm": 16.612445831298828, + "learning_rate": 9.998445087450469e-06, + "loss": 5.8161, + "step": 1660 + }, + { + "epoch": 0.03387451171875, + "grad_norm": 15.013223648071289, + "learning_rate": 9.998435104609748e-06, + "loss": 5.879, + "step": 1665 + }, + { + "epoch": 0.033976236979166664, + "grad_norm": 16.683223724365234, + "learning_rate": 9.998425089830615e-06, + "loss": 5.9379, + "step": 1670 + }, + { + "epoch": 0.034077962239583336, + "grad_norm": 16.499467849731445, + "learning_rate": 9.998415043113132e-06, + "loss": 5.7749, + "step": 1675 + }, + { + "epoch": 0.0341796875, + "grad_norm": 14.530900001525879, + "learning_rate": 9.998404964457362e-06, + "loss": 5.8671, + "step": 1680 + }, + { + "epoch": 0.034281412760416664, + "grad_norm": 15.496219635009766, + "learning_rate": 9.998394853863373e-06, + "loss": 5.8634, + "step": 1685 + }, + { + "epoch": 0.034383138020833336, + "grad_norm": 18.255422592163086, + "learning_rate": 9.998384711331227e-06, + "loss": 5.9932, + "step": 1690 + }, + { + "epoch": 0.03448486328125, + "grad_norm": 11.201310157775879, + "learning_rate": 9.99837453686099e-06, + "loss": 5.9352, + "step": 1695 + }, + { + "epoch": 0.034586588541666664, + "grad_norm": 21.083984375, + "learning_rate": 9.998364330452725e-06, + "loss": 5.9887, + "step": 1700 + }, + { + "epoch": 0.034688313802083336, + "grad_norm": 15.521758079528809, + "learning_rate": 9.9983540921065e-06, + "loss": 5.7597, + "step": 1705 + }, + { + "epoch": 0.0347900390625, + "grad_norm": 13.892156600952148, + "learning_rate": 9.998343821822378e-06, + "loss": 5.9385, + "step": 1710 + }, + { + "epoch": 0.034891764322916664, + "grad_norm": 17.92337417602539, + "learning_rate": 9.998333519600424e-06, + "loss": 6.0957, + "step": 1715 + }, + { + "epoch": 0.034993489583333336, + "grad_norm": 14.799049377441406, + "learning_rate": 9.99832318544071e-06, + "loss": 5.8125, + "step": 1720 + }, + { + "epoch": 0.03509521484375, + "grad_norm": 14.18670654296875, + "learning_rate": 9.998312819343294e-06, + "loss": 5.8854, + "step": 1725 + }, + { + "epoch": 0.035196940104166664, + "grad_norm": 19.32386589050293, + "learning_rate": 9.998302421308246e-06, + "loss": 6.1251, + "step": 1730 + }, + { + "epoch": 0.035298665364583336, + "grad_norm": 16.450395584106445, + "learning_rate": 9.998291991335634e-06, + "loss": 5.8333, + "step": 1735 + }, + { + "epoch": 0.035400390625, + "grad_norm": 18.05861473083496, + "learning_rate": 9.998281529425521e-06, + "loss": 5.7014, + "step": 1740 + }, + { + "epoch": 0.035502115885416664, + "grad_norm": 15.328421592712402, + "learning_rate": 9.998271035577979e-06, + "loss": 5.9626, + "step": 1745 + }, + { + "epoch": 0.035603841145833336, + "grad_norm": 17.081066131591797, + "learning_rate": 9.998260509793067e-06, + "loss": 6.1534, + "step": 1750 + }, + { + "epoch": 0.03570556640625, + "grad_norm": 15.841145515441895, + "learning_rate": 9.99824995207086e-06, + "loss": 5.7923, + "step": 1755 + }, + { + "epoch": 0.035807291666666664, + "grad_norm": 28.384925842285156, + "learning_rate": 9.998239362411422e-06, + "loss": 5.8879, + "step": 1760 + }, + { + "epoch": 0.035909016927083336, + "grad_norm": 17.67591094970703, + "learning_rate": 9.998228740814821e-06, + "loss": 5.5992, + "step": 1765 + }, + { + "epoch": 0.0360107421875, + "grad_norm": 14.921061515808105, + "learning_rate": 9.998218087281124e-06, + "loss": 5.5182, + "step": 1770 + }, + { + "epoch": 0.036112467447916664, + "grad_norm": 17.978429794311523, + "learning_rate": 9.998207401810402e-06, + "loss": 5.9395, + "step": 1775 + }, + { + "epoch": 0.036214192708333336, + "grad_norm": 14.31653118133545, + "learning_rate": 9.998196684402719e-06, + "loss": 5.9191, + "step": 1780 + }, + { + "epoch": 0.03631591796875, + "grad_norm": 13.341893196105957, + "learning_rate": 9.998185935058147e-06, + "loss": 5.9083, + "step": 1785 + }, + { + "epoch": 0.036417643229166664, + "grad_norm": 13.563104629516602, + "learning_rate": 9.998175153776755e-06, + "loss": 5.9277, + "step": 1790 + }, + { + "epoch": 0.036519368489583336, + "grad_norm": 19.35736656188965, + "learning_rate": 9.998164340558609e-06, + "loss": 6.1421, + "step": 1795 + }, + { + "epoch": 0.03662109375, + "grad_norm": 13.342545509338379, + "learning_rate": 9.998153495403779e-06, + "loss": 5.7615, + "step": 1800 + }, + { + "epoch": 0.036722819010416664, + "grad_norm": 16.893980026245117, + "learning_rate": 9.998142618312333e-06, + "loss": 5.9871, + "step": 1805 + }, + { + "epoch": 0.036824544270833336, + "grad_norm": 19.527698516845703, + "learning_rate": 9.998131709284346e-06, + "loss": 5.951, + "step": 1810 + }, + { + "epoch": 0.03692626953125, + "grad_norm": 17.660438537597656, + "learning_rate": 9.998120768319883e-06, + "loss": 5.8465, + "step": 1815 + }, + { + "epoch": 0.037027994791666664, + "grad_norm": 16.078311920166016, + "learning_rate": 9.998109795419015e-06, + "loss": 5.742, + "step": 1820 + }, + { + "epoch": 0.037129720052083336, + "grad_norm": 15.998230934143066, + "learning_rate": 9.998098790581811e-06, + "loss": 5.7781, + "step": 1825 + }, + { + "epoch": 0.0372314453125, + "grad_norm": 14.748312950134277, + "learning_rate": 9.998087753808343e-06, + "loss": 6.1244, + "step": 1830 + }, + { + "epoch": 0.037333170572916664, + "grad_norm": 18.672748565673828, + "learning_rate": 9.99807668509868e-06, + "loss": 5.6084, + "step": 1835 + }, + { + "epoch": 0.037434895833333336, + "grad_norm": 22.51088523864746, + "learning_rate": 9.998065584452896e-06, + "loss": 5.9929, + "step": 1840 + }, + { + "epoch": 0.03753662109375, + "grad_norm": 14.813888549804688, + "learning_rate": 9.998054451871058e-06, + "loss": 5.7603, + "step": 1845 + }, + { + "epoch": 0.037638346354166664, + "grad_norm": 17.332717895507812, + "learning_rate": 9.998043287353238e-06, + "loss": 5.6822, + "step": 1850 + }, + { + "epoch": 0.037740071614583336, + "grad_norm": 16.694185256958008, + "learning_rate": 9.998032090899507e-06, + "loss": 5.814, + "step": 1855 + }, + { + "epoch": 0.037841796875, + "grad_norm": 16.95908546447754, + "learning_rate": 9.998020862509941e-06, + "loss": 5.8484, + "step": 1860 + }, + { + "epoch": 0.037943522135416664, + "grad_norm": 16.984792709350586, + "learning_rate": 9.998009602184605e-06, + "loss": 6.153, + "step": 1865 + }, + { + "epoch": 0.038045247395833336, + "grad_norm": 14.221283912658691, + "learning_rate": 9.997998309923576e-06, + "loss": 5.7885, + "step": 1870 + }, + { + "epoch": 0.03814697265625, + "grad_norm": 27.035308837890625, + "learning_rate": 9.997986985726925e-06, + "loss": 5.8283, + "step": 1875 + }, + { + "epoch": 0.038248697916666664, + "grad_norm": 13.670437812805176, + "learning_rate": 9.99797562959472e-06, + "loss": 5.6042, + "step": 1880 + }, + { + "epoch": 0.038350423177083336, + "grad_norm": 12.596402168273926, + "learning_rate": 9.99796424152704e-06, + "loss": 5.81, + "step": 1885 + }, + { + "epoch": 0.0384521484375, + "grad_norm": 18.31295394897461, + "learning_rate": 9.997952821523956e-06, + "loss": 5.9392, + "step": 1890 + }, + { + "epoch": 0.038553873697916664, + "grad_norm": 20.888320922851562, + "learning_rate": 9.997941369585539e-06, + "loss": 5.8534, + "step": 1895 + }, + { + "epoch": 0.038655598958333336, + "grad_norm": 18.60557746887207, + "learning_rate": 9.997929885711864e-06, + "loss": 6.1012, + "step": 1900 + }, + { + "epoch": 0.03875732421875, + "grad_norm": 19.761333465576172, + "learning_rate": 9.997918369903002e-06, + "loss": 5.8631, + "step": 1905 + }, + { + "epoch": 0.038859049479166664, + "grad_norm": 15.781414985656738, + "learning_rate": 9.99790682215903e-06, + "loss": 5.7075, + "step": 1910 + }, + { + "epoch": 0.038960774739583336, + "grad_norm": 15.580400466918945, + "learning_rate": 9.997895242480018e-06, + "loss": 5.8955, + "step": 1915 + }, + { + "epoch": 0.0390625, + "grad_norm": 16.08568000793457, + "learning_rate": 9.997883630866043e-06, + "loss": 5.8149, + "step": 1920 + }, + { + "epoch": 0.039164225260416664, + "grad_norm": 16.374584197998047, + "learning_rate": 9.997871987317179e-06, + "loss": 5.8874, + "step": 1925 + }, + { + "epoch": 0.039265950520833336, + "grad_norm": 11.281521797180176, + "learning_rate": 9.997860311833498e-06, + "loss": 5.7966, + "step": 1930 + }, + { + "epoch": 0.03936767578125, + "grad_norm": 14.793662071228027, + "learning_rate": 9.997848604415077e-06, + "loss": 5.9551, + "step": 1935 + }, + { + "epoch": 0.039469401041666664, + "grad_norm": 13.656495094299316, + "learning_rate": 9.997836865061989e-06, + "loss": 5.9642, + "step": 1940 + }, + { + "epoch": 0.039571126302083336, + "grad_norm": 14.507530212402344, + "learning_rate": 9.997825093774309e-06, + "loss": 5.8291, + "step": 1945 + }, + { + "epoch": 0.0396728515625, + "grad_norm": 11.211280822753906, + "learning_rate": 9.997813290552116e-06, + "loss": 5.6532, + "step": 1950 + }, + { + "epoch": 0.039774576822916664, + "grad_norm": 22.774553298950195, + "learning_rate": 9.997801455395482e-06, + "loss": 6.3146, + "step": 1955 + }, + { + "epoch": 0.039876302083333336, + "grad_norm": 15.679645538330078, + "learning_rate": 9.997789588304481e-06, + "loss": 5.76, + "step": 1960 + }, + { + "epoch": 0.03997802734375, + "grad_norm": 16.479734420776367, + "learning_rate": 9.997777689279193e-06, + "loss": 5.5745, + "step": 1965 + }, + { + "epoch": 0.040079752604166664, + "grad_norm": 17.99978256225586, + "learning_rate": 9.997765758319692e-06, + "loss": 5.6615, + "step": 1970 + }, + { + "epoch": 0.040181477864583336, + "grad_norm": 16.095617294311523, + "learning_rate": 9.997753795426055e-06, + "loss": 5.8705, + "step": 1975 + }, + { + "epoch": 0.040283203125, + "grad_norm": 20.743240356445312, + "learning_rate": 9.997741800598355e-06, + "loss": 5.8386, + "step": 1980 + }, + { + "epoch": 0.040384928385416664, + "grad_norm": 11.897674560546875, + "learning_rate": 9.997729773836675e-06, + "loss": 5.9083, + "step": 1985 + }, + { + "epoch": 0.040486653645833336, + "grad_norm": 17.04975700378418, + "learning_rate": 9.997717715141085e-06, + "loss": 5.8058, + "step": 1990 + }, + { + "epoch": 0.04058837890625, + "grad_norm": 15.148179054260254, + "learning_rate": 9.997705624511667e-06, + "loss": 5.9677, + "step": 1995 + }, + { + "epoch": 0.040690104166666664, + "grad_norm": 17.47242546081543, + "learning_rate": 9.997693501948496e-06, + "loss": 5.9102, + "step": 2000 + }, + { + "epoch": 0.040791829427083336, + "grad_norm": 17.647449493408203, + "learning_rate": 9.997681347451652e-06, + "loss": 5.7622, + "step": 2005 + }, + { + "epoch": 0.0408935546875, + "grad_norm": 17.943206787109375, + "learning_rate": 9.997669161021207e-06, + "loss": 5.8401, + "step": 2010 + }, + { + "epoch": 0.040995279947916664, + "grad_norm": 20.34568214416504, + "learning_rate": 9.997656942657245e-06, + "loss": 5.6818, + "step": 2015 + }, + { + "epoch": 0.041097005208333336, + "grad_norm": 19.244522094726562, + "learning_rate": 9.99764469235984e-06, + "loss": 5.7194, + "step": 2020 + }, + { + "epoch": 0.04119873046875, + "grad_norm": 14.445345878601074, + "learning_rate": 9.997632410129074e-06, + "loss": 5.9867, + "step": 2025 + }, + { + "epoch": 0.041300455729166664, + "grad_norm": 17.204317092895508, + "learning_rate": 9.997620095965022e-06, + "loss": 5.94, + "step": 2030 + }, + { + "epoch": 0.041402180989583336, + "grad_norm": 14.822259902954102, + "learning_rate": 9.997607749867763e-06, + "loss": 5.7754, + "step": 2035 + }, + { + "epoch": 0.04150390625, + "grad_norm": 16.357872009277344, + "learning_rate": 9.99759537183738e-06, + "loss": 5.8043, + "step": 2040 + }, + { + "epoch": 0.041605631510416664, + "grad_norm": 11.29537296295166, + "learning_rate": 9.997582961873947e-06, + "loss": 5.6372, + "step": 2045 + }, + { + "epoch": 0.041707356770833336, + "grad_norm": 18.486286163330078, + "learning_rate": 9.997570519977545e-06, + "loss": 5.8201, + "step": 2050 + }, + { + "epoch": 0.04180908203125, + "grad_norm": 17.231403350830078, + "learning_rate": 9.997558046148255e-06, + "loss": 6.0616, + "step": 2055 + }, + { + "epoch": 0.041910807291666664, + "grad_norm": 18.054079055786133, + "learning_rate": 9.997545540386155e-06, + "loss": 5.8261, + "step": 2060 + }, + { + "epoch": 0.042012532552083336, + "grad_norm": 13.196372985839844, + "learning_rate": 9.997533002691324e-06, + "loss": 5.5322, + "step": 2065 + }, + { + "epoch": 0.0421142578125, + "grad_norm": 13.664402961730957, + "learning_rate": 9.997520433063845e-06, + "loss": 5.7679, + "step": 2070 + }, + { + "epoch": 0.042215983072916664, + "grad_norm": 15.411184310913086, + "learning_rate": 9.997507831503797e-06, + "loss": 5.8991, + "step": 2075 + }, + { + "epoch": 0.042317708333333336, + "grad_norm": 14.467016220092773, + "learning_rate": 9.99749519801126e-06, + "loss": 6.0415, + "step": 2080 + }, + { + "epoch": 0.04241943359375, + "grad_norm": 15.398569107055664, + "learning_rate": 9.997482532586316e-06, + "loss": 6.0095, + "step": 2085 + }, + { + "epoch": 0.042521158854166664, + "grad_norm": 29.4045352935791, + "learning_rate": 9.997469835229044e-06, + "loss": 5.8558, + "step": 2090 + }, + { + "epoch": 0.042622884114583336, + "grad_norm": 19.950077056884766, + "learning_rate": 9.997457105939527e-06, + "loss": 5.6805, + "step": 2095 + }, + { + "epoch": 0.042724609375, + "grad_norm": 27.64505958557129, + "learning_rate": 9.997444344717844e-06, + "loss": 5.8776, + "step": 2100 + }, + { + "epoch": 0.042826334635416664, + "grad_norm": 23.616737365722656, + "learning_rate": 9.997431551564079e-06, + "loss": 5.6492, + "step": 2105 + }, + { + "epoch": 0.042928059895833336, + "grad_norm": 17.247844696044922, + "learning_rate": 9.997418726478313e-06, + "loss": 5.9646, + "step": 2110 + }, + { + "epoch": 0.04302978515625, + "grad_norm": 13.81291389465332, + "learning_rate": 9.997405869460629e-06, + "loss": 5.6052, + "step": 2115 + }, + { + "epoch": 0.043131510416666664, + "grad_norm": 16.15083122253418, + "learning_rate": 9.997392980511104e-06, + "loss": 5.9454, + "step": 2120 + }, + { + "epoch": 0.043233235677083336, + "grad_norm": 22.077579498291016, + "learning_rate": 9.997380059629827e-06, + "loss": 5.8474, + "step": 2125 + }, + { + "epoch": 0.0433349609375, + "grad_norm": 13.69865608215332, + "learning_rate": 9.997367106816877e-06, + "loss": 5.8734, + "step": 2130 + }, + { + "epoch": 0.043436686197916664, + "grad_norm": 12.769457817077637, + "learning_rate": 9.997354122072338e-06, + "loss": 5.8566, + "step": 2135 + }, + { + "epoch": 0.043538411458333336, + "grad_norm": 15.643074035644531, + "learning_rate": 9.997341105396293e-06, + "loss": 5.754, + "step": 2140 + }, + { + "epoch": 0.04364013671875, + "grad_norm": 14.990761756896973, + "learning_rate": 9.997328056788824e-06, + "loss": 5.8796, + "step": 2145 + }, + { + "epoch": 0.043741861979166664, + "grad_norm": 16.625585556030273, + "learning_rate": 9.997314976250017e-06, + "loss": 5.8325, + "step": 2150 + }, + { + "epoch": 0.043843587239583336, + "grad_norm": 14.539824485778809, + "learning_rate": 9.997301863779952e-06, + "loss": 5.7903, + "step": 2155 + }, + { + "epoch": 0.0439453125, + "grad_norm": 14.834152221679688, + "learning_rate": 9.997288719378713e-06, + "loss": 5.9961, + "step": 2160 + }, + { + "epoch": 0.044047037760416664, + "grad_norm": 14.886181831359863, + "learning_rate": 9.997275543046387e-06, + "loss": 5.7756, + "step": 2165 + }, + { + "epoch": 0.044148763020833336, + "grad_norm": 15.35965347290039, + "learning_rate": 9.997262334783056e-06, + "loss": 6.3054, + "step": 2170 + }, + { + "epoch": 0.04425048828125, + "grad_norm": 14.605013847351074, + "learning_rate": 9.997249094588805e-06, + "loss": 6.0051, + "step": 2175 + }, + { + "epoch": 0.044352213541666664, + "grad_norm": 19.29071617126465, + "learning_rate": 9.99723582246372e-06, + "loss": 5.6629, + "step": 2180 + }, + { + "epoch": 0.044453938802083336, + "grad_norm": 13.829155921936035, + "learning_rate": 9.997222518407882e-06, + "loss": 5.9391, + "step": 2185 + }, + { + "epoch": 0.0445556640625, + "grad_norm": 16.468215942382812, + "learning_rate": 9.99720918242138e-06, + "loss": 5.751, + "step": 2190 + }, + { + "epoch": 0.044657389322916664, + "grad_norm": 24.723594665527344, + "learning_rate": 9.9971958145043e-06, + "loss": 6.0943, + "step": 2195 + }, + { + "epoch": 0.044759114583333336, + "grad_norm": 21.772815704345703, + "learning_rate": 9.997182414656721e-06, + "loss": 5.6062, + "step": 2200 + }, + { + "epoch": 0.04486083984375, + "grad_norm": 13.7568998336792, + "learning_rate": 9.997168982878735e-06, + "loss": 5.7538, + "step": 2205 + }, + { + "epoch": 0.044962565104166664, + "grad_norm": 19.56125831604004, + "learning_rate": 9.997155519170425e-06, + "loss": 5.9538, + "step": 2210 + }, + { + "epoch": 0.045064290364583336, + "grad_norm": 15.139726638793945, + "learning_rate": 9.997142023531879e-06, + "loss": 5.9353, + "step": 2215 + }, + { + "epoch": 0.045166015625, + "grad_norm": 16.453174591064453, + "learning_rate": 9.997128495963179e-06, + "loss": 5.87, + "step": 2220 + }, + { + "epoch": 0.045267740885416664, + "grad_norm": 23.067331314086914, + "learning_rate": 9.997114936464417e-06, + "loss": 6.0876, + "step": 2225 + }, + { + "epoch": 0.045369466145833336, + "grad_norm": 15.434601783752441, + "learning_rate": 9.997101345035673e-06, + "loss": 5.8211, + "step": 2230 + }, + { + "epoch": 0.04547119140625, + "grad_norm": 14.882776260375977, + "learning_rate": 9.99708772167704e-06, + "loss": 5.649, + "step": 2235 + }, + { + "epoch": 0.045572916666666664, + "grad_norm": 15.628950119018555, + "learning_rate": 9.997074066388603e-06, + "loss": 5.8371, + "step": 2240 + }, + { + "epoch": 0.045674641927083336, + "grad_norm": 14.62673568725586, + "learning_rate": 9.997060379170448e-06, + "loss": 5.7198, + "step": 2245 + }, + { + "epoch": 0.0457763671875, + "grad_norm": 12.457629203796387, + "learning_rate": 9.997046660022665e-06, + "loss": 5.6903, + "step": 2250 + }, + { + "epoch": 0.045878092447916664, + "grad_norm": 11.346028327941895, + "learning_rate": 9.997032908945338e-06, + "loss": 5.7456, + "step": 2255 + }, + { + "epoch": 0.045979817708333336, + "grad_norm": 15.728675842285156, + "learning_rate": 9.997019125938559e-06, + "loss": 5.7821, + "step": 2260 + }, + { + "epoch": 0.04608154296875, + "grad_norm": 14.210660934448242, + "learning_rate": 9.997005311002415e-06, + "loss": 5.6861, + "step": 2265 + }, + { + "epoch": 0.046183268229166664, + "grad_norm": 19.56140899658203, + "learning_rate": 9.996991464136992e-06, + "loss": 5.7972, + "step": 2270 + }, + { + "epoch": 0.046284993489583336, + "grad_norm": 14.704282760620117, + "learning_rate": 9.996977585342377e-06, + "loss": 5.7059, + "step": 2275 + }, + { + "epoch": 0.04638671875, + "grad_norm": 14.88856315612793, + "learning_rate": 9.996963674618666e-06, + "loss": 5.6617, + "step": 2280 + }, + { + "epoch": 0.046488444010416664, + "grad_norm": 18.278547286987305, + "learning_rate": 9.99694973196594e-06, + "loss": 5.8219, + "step": 2285 + }, + { + "epoch": 0.046590169270833336, + "grad_norm": 13.517939567565918, + "learning_rate": 9.996935757384295e-06, + "loss": 5.6014, + "step": 2290 + }, + { + "epoch": 0.04669189453125, + "grad_norm": 13.35275936126709, + "learning_rate": 9.996921750873813e-06, + "loss": 6.0498, + "step": 2295 + }, + { + "epoch": 0.046793619791666664, + "grad_norm": 15.373016357421875, + "learning_rate": 9.996907712434589e-06, + "loss": 5.6052, + "step": 2300 + }, + { + "epoch": 0.046895345052083336, + "grad_norm": 15.164552688598633, + "learning_rate": 9.99689364206671e-06, + "loss": 5.8072, + "step": 2305 + }, + { + "epoch": 0.0469970703125, + "grad_norm": 16.11260223388672, + "learning_rate": 9.996879539770268e-06, + "loss": 5.8483, + "step": 2310 + }, + { + "epoch": 0.047098795572916664, + "grad_norm": 18.389127731323242, + "learning_rate": 9.99686540554535e-06, + "loss": 5.8403, + "step": 2315 + }, + { + "epoch": 0.047200520833333336, + "grad_norm": 14.635401725769043, + "learning_rate": 9.996851239392052e-06, + "loss": 5.8534, + "step": 2320 + }, + { + "epoch": 0.04730224609375, + "grad_norm": 21.77419090270996, + "learning_rate": 9.996837041310458e-06, + "loss": 5.5921, + "step": 2325 + }, + { + "epoch": 0.047403971354166664, + "grad_norm": 13.518994331359863, + "learning_rate": 9.996822811300662e-06, + "loss": 5.8266, + "step": 2330 + }, + { + "epoch": 0.047505696614583336, + "grad_norm": 15.88668441772461, + "learning_rate": 9.996808549362755e-06, + "loss": 6.1089, + "step": 2335 + }, + { + "epoch": 0.047607421875, + "grad_norm": 17.17308235168457, + "learning_rate": 9.996794255496827e-06, + "loss": 5.727, + "step": 2340 + }, + { + "epoch": 0.047709147135416664, + "grad_norm": 17.022592544555664, + "learning_rate": 9.99677992970297e-06, + "loss": 5.858, + "step": 2345 + }, + { + "epoch": 0.047810872395833336, + "grad_norm": 18.09345817565918, + "learning_rate": 9.996765571981274e-06, + "loss": 5.7712, + "step": 2350 + }, + { + "epoch": 0.04791259765625, + "grad_norm": 19.891782760620117, + "learning_rate": 9.996751182331834e-06, + "loss": 5.7982, + "step": 2355 + }, + { + "epoch": 0.048014322916666664, + "grad_norm": 16.13992691040039, + "learning_rate": 9.996736760754741e-06, + "loss": 5.8631, + "step": 2360 + }, + { + "epoch": 0.048116048177083336, + "grad_norm": 14.947916030883789, + "learning_rate": 9.996722307250084e-06, + "loss": 5.9895, + "step": 2365 + }, + { + "epoch": 0.0482177734375, + "grad_norm": 15.50719165802002, + "learning_rate": 9.996707821817958e-06, + "loss": 5.838, + "step": 2370 + }, + { + "epoch": 0.048319498697916664, + "grad_norm": 14.999481201171875, + "learning_rate": 9.996693304458457e-06, + "loss": 5.8842, + "step": 2375 + }, + { + "epoch": 0.048421223958333336, + "grad_norm": 14.50729751586914, + "learning_rate": 9.996678755171672e-06, + "loss": 5.7899, + "step": 2380 + }, + { + "epoch": 0.04852294921875, + "grad_norm": 15.21367359161377, + "learning_rate": 9.996664173957694e-06, + "loss": 5.6679, + "step": 2385 + }, + { + "epoch": 0.048624674479166664, + "grad_norm": 13.982894897460938, + "learning_rate": 9.996649560816619e-06, + "loss": 5.6113, + "step": 2390 + }, + { + "epoch": 0.048726399739583336, + "grad_norm": 11.01335620880127, + "learning_rate": 9.99663491574854e-06, + "loss": 5.6741, + "step": 2395 + }, + { + "epoch": 0.048828125, + "grad_norm": 15.3165283203125, + "learning_rate": 9.996620238753548e-06, + "loss": 5.7259, + "step": 2400 + }, + { + "epoch": 0.048929850260416664, + "grad_norm": 12.577077865600586, + "learning_rate": 9.99660552983174e-06, + "loss": 6.0009, + "step": 2405 + }, + { + "epoch": 0.049031575520833336, + "grad_norm": 12.803409576416016, + "learning_rate": 9.996590788983209e-06, + "loss": 5.7304, + "step": 2410 + }, + { + "epoch": 0.04913330078125, + "grad_norm": 15.829156875610352, + "learning_rate": 9.99657601620805e-06, + "loss": 6.1825, + "step": 2415 + }, + { + "epoch": 0.049235026041666664, + "grad_norm": 18.349252700805664, + "learning_rate": 9.996561211506355e-06, + "loss": 5.6147, + "step": 2420 + }, + { + "epoch": 0.049336751302083336, + "grad_norm": 15.907801628112793, + "learning_rate": 9.99654637487822e-06, + "loss": 5.7597, + "step": 2425 + }, + { + "epoch": 0.0494384765625, + "grad_norm": 17.61026954650879, + "learning_rate": 9.99653150632374e-06, + "loss": 5.8054, + "step": 2430 + }, + { + "epoch": 0.049540201822916664, + "grad_norm": 15.96060848236084, + "learning_rate": 9.99651660584301e-06, + "loss": 5.4173, + "step": 2435 + }, + { + "epoch": 0.049641927083333336, + "grad_norm": 17.150787353515625, + "learning_rate": 9.996501673436126e-06, + "loss": 5.747, + "step": 2440 + }, + { + "epoch": 0.04974365234375, + "grad_norm": 17.872703552246094, + "learning_rate": 9.99648670910318e-06, + "loss": 5.7257, + "step": 2445 + }, + { + "epoch": 0.049845377604166664, + "grad_norm": 22.165237426757812, + "learning_rate": 9.99647171284427e-06, + "loss": 5.9877, + "step": 2450 + }, + { + "epoch": 0.049947102864583336, + "grad_norm": 19.590131759643555, + "learning_rate": 9.996456684659494e-06, + "loss": 5.9019, + "step": 2455 + }, + { + "epoch": 0.050048828125, + "grad_norm": 15.977219581604004, + "learning_rate": 9.996441624548944e-06, + "loss": 5.6072, + "step": 2460 + }, + { + "epoch": 0.050150553385416664, + "grad_norm": 15.896774291992188, + "learning_rate": 9.996426532512722e-06, + "loss": 5.5897, + "step": 2465 + }, + { + "epoch": 0.050252278645833336, + "grad_norm": 16.320350646972656, + "learning_rate": 9.996411408550917e-06, + "loss": 6.1549, + "step": 2470 + }, + { + "epoch": 0.05035400390625, + "grad_norm": 14.820196151733398, + "learning_rate": 9.99639625266363e-06, + "loss": 5.7383, + "step": 2475 + }, + { + "epoch": 0.050455729166666664, + "grad_norm": 13.985960960388184, + "learning_rate": 9.996381064850954e-06, + "loss": 5.7211, + "step": 2480 + }, + { + "epoch": 0.050557454427083336, + "grad_norm": 16.489643096923828, + "learning_rate": 9.996365845112994e-06, + "loss": 5.7739, + "step": 2485 + }, + { + "epoch": 0.0506591796875, + "grad_norm": 18.643049240112305, + "learning_rate": 9.99635059344984e-06, + "loss": 5.7568, + "step": 2490 + }, + { + "epoch": 0.050760904947916664, + "grad_norm": 16.585468292236328, + "learning_rate": 9.996335309861591e-06, + "loss": 5.7592, + "step": 2495 + }, + { + "epoch": 0.050862630208333336, + "grad_norm": 16.152862548828125, + "learning_rate": 9.996319994348346e-06, + "loss": 5.6142, + "step": 2500 + }, + { + "epoch": 0.05096435546875, + "grad_norm": 19.240642547607422, + "learning_rate": 9.996304646910204e-06, + "loss": 5.9917, + "step": 2505 + }, + { + "epoch": 0.051066080729166664, + "grad_norm": 20.872976303100586, + "learning_rate": 9.99628926754726e-06, + "loss": 5.9284, + "step": 2510 + }, + { + "epoch": 0.051167805989583336, + "grad_norm": 15.328634262084961, + "learning_rate": 9.996273856259613e-06, + "loss": 5.8219, + "step": 2515 + }, + { + "epoch": 0.05126953125, + "grad_norm": 18.13556671142578, + "learning_rate": 9.996258413047365e-06, + "loss": 5.9996, + "step": 2520 + }, + { + "epoch": 0.051371256510416664, + "grad_norm": 11.803972244262695, + "learning_rate": 9.996242937910608e-06, + "loss": 5.7223, + "step": 2525 + }, + { + "epoch": 0.051472981770833336, + "grad_norm": 16.976093292236328, + "learning_rate": 9.996227430849446e-06, + "loss": 5.8032, + "step": 2530 + }, + { + "epoch": 0.05157470703125, + "grad_norm": 25.81716537475586, + "learning_rate": 9.996211891863976e-06, + "loss": 5.9186, + "step": 2535 + }, + { + "epoch": 0.051676432291666664, + "grad_norm": 16.801651000976562, + "learning_rate": 9.9961963209543e-06, + "loss": 6.0649, + "step": 2540 + }, + { + "epoch": 0.051778157552083336, + "grad_norm": 15.76025104522705, + "learning_rate": 9.996180718120514e-06, + "loss": 5.8937, + "step": 2545 + }, + { + "epoch": 0.0518798828125, + "grad_norm": 14.650188446044922, + "learning_rate": 9.99616508336272e-06, + "loss": 5.8059, + "step": 2550 + }, + { + "epoch": 0.051981608072916664, + "grad_norm": 14.170364379882812, + "learning_rate": 9.996149416681016e-06, + "loss": 5.7045, + "step": 2555 + }, + { + "epoch": 0.052083333333333336, + "grad_norm": 18.371957778930664, + "learning_rate": 9.996133718075504e-06, + "loss": 5.7595, + "step": 2560 + }, + { + "epoch": 0.05218505859375, + "grad_norm": 15.457711219787598, + "learning_rate": 9.996117987546283e-06, + "loss": 5.72, + "step": 2565 + }, + { + "epoch": 0.052286783854166664, + "grad_norm": 16.166696548461914, + "learning_rate": 9.996102225093455e-06, + "loss": 5.9538, + "step": 2570 + }, + { + "epoch": 0.052388509114583336, + "grad_norm": 13.235213279724121, + "learning_rate": 9.99608643071712e-06, + "loss": 5.719, + "step": 2575 + }, + { + "epoch": 0.052490234375, + "grad_norm": 13.223714828491211, + "learning_rate": 9.996070604417378e-06, + "loss": 5.7369, + "step": 2580 + }, + { + "epoch": 0.052591959635416664, + "grad_norm": 20.482236862182617, + "learning_rate": 9.99605474619433e-06, + "loss": 5.7795, + "step": 2585 + }, + { + "epoch": 0.052693684895833336, + "grad_norm": 16.870145797729492, + "learning_rate": 9.99603885604808e-06, + "loss": 5.8239, + "step": 2590 + }, + { + "epoch": 0.05279541015625, + "grad_norm": 18.26272964477539, + "learning_rate": 9.996022933978727e-06, + "loss": 6.1422, + "step": 2595 + }, + { + "epoch": 0.052897135416666664, + "grad_norm": 13.998729705810547, + "learning_rate": 9.996006979986373e-06, + "loss": 5.9366, + "step": 2600 + }, + { + "epoch": 0.052998860677083336, + "grad_norm": 20.002986907958984, + "learning_rate": 9.99599099407112e-06, + "loss": 5.7147, + "step": 2605 + }, + { + "epoch": 0.0531005859375, + "grad_norm": 15.763337135314941, + "learning_rate": 9.995974976233069e-06, + "loss": 6.0033, + "step": 2610 + }, + { + "epoch": 0.053202311197916664, + "grad_norm": 17.27014923095703, + "learning_rate": 9.995958926472326e-06, + "loss": 5.6588, + "step": 2615 + }, + { + "epoch": 0.053304036458333336, + "grad_norm": 16.061233520507812, + "learning_rate": 9.995942844788991e-06, + "loss": 5.8256, + "step": 2620 + }, + { + "epoch": 0.05340576171875, + "grad_norm": 16.647367477416992, + "learning_rate": 9.995926731183168e-06, + "loss": 5.7821, + "step": 2625 + }, + { + "epoch": 0.053507486979166664, + "grad_norm": 12.441757202148438, + "learning_rate": 9.995910585654957e-06, + "loss": 5.6754, + "step": 2630 + }, + { + "epoch": 0.053609212239583336, + "grad_norm": 17.343935012817383, + "learning_rate": 9.995894408204465e-06, + "loss": 5.5684, + "step": 2635 + }, + { + "epoch": 0.0537109375, + "grad_norm": 14.482321739196777, + "learning_rate": 9.995878198831794e-06, + "loss": 5.6308, + "step": 2640 + }, + { + "epoch": 0.053812662760416664, + "grad_norm": 18.633420944213867, + "learning_rate": 9.995861957537047e-06, + "loss": 5.6819, + "step": 2645 + }, + { + "epoch": 0.053914388020833336, + "grad_norm": 13.030791282653809, + "learning_rate": 9.995845684320327e-06, + "loss": 5.4637, + "step": 2650 + }, + { + "epoch": 0.05401611328125, + "grad_norm": 28.413673400878906, + "learning_rate": 9.995829379181739e-06, + "loss": 5.8491, + "step": 2655 + }, + { + "epoch": 0.054117838541666664, + "grad_norm": 20.423786163330078, + "learning_rate": 9.995813042121387e-06, + "loss": 5.8396, + "step": 2660 + }, + { + "epoch": 0.054219563802083336, + "grad_norm": 13.132603645324707, + "learning_rate": 9.995796673139376e-06, + "loss": 5.5742, + "step": 2665 + }, + { + "epoch": 0.0543212890625, + "grad_norm": 17.115156173706055, + "learning_rate": 9.99578027223581e-06, + "loss": 5.6513, + "step": 2670 + }, + { + "epoch": 0.054423014322916664, + "grad_norm": 20.509994506835938, + "learning_rate": 9.995763839410795e-06, + "loss": 6.0509, + "step": 2675 + }, + { + "epoch": 0.054524739583333336, + "grad_norm": 17.372209548950195, + "learning_rate": 9.995747374664435e-06, + "loss": 5.7433, + "step": 2680 + }, + { + "epoch": 0.05462646484375, + "grad_norm": 13.453932762145996, + "learning_rate": 9.995730877996833e-06, + "loss": 5.5301, + "step": 2685 + }, + { + "epoch": 0.054728190104166664, + "grad_norm": 14.438817977905273, + "learning_rate": 9.995714349408097e-06, + "loss": 5.395, + "step": 2690 + }, + { + "epoch": 0.054829915364583336, + "grad_norm": 14.02118968963623, + "learning_rate": 9.995697788898333e-06, + "loss": 5.8289, + "step": 2695 + }, + { + "epoch": 0.054931640625, + "grad_norm": 17.169315338134766, + "learning_rate": 9.995681196467647e-06, + "loss": 5.7168, + "step": 2700 + }, + { + "epoch": 0.055033365885416664, + "grad_norm": 22.14277458190918, + "learning_rate": 9.995664572116142e-06, + "loss": 5.6048, + "step": 2705 + }, + { + "epoch": 0.055135091145833336, + "grad_norm": 12.981744766235352, + "learning_rate": 9.995647915843928e-06, + "loss": 5.7929, + "step": 2710 + }, + { + "epoch": 0.05523681640625, + "grad_norm": 14.055262565612793, + "learning_rate": 9.995631227651109e-06, + "loss": 5.488, + "step": 2715 + }, + { + "epoch": 0.055338541666666664, + "grad_norm": 15.945588111877441, + "learning_rate": 9.995614507537791e-06, + "loss": 6.0063, + "step": 2720 + }, + { + "epoch": 0.055440266927083336, + "grad_norm": 14.790645599365234, + "learning_rate": 9.995597755504085e-06, + "loss": 5.6378, + "step": 2725 + }, + { + "epoch": 0.0555419921875, + "grad_norm": 20.09528160095215, + "learning_rate": 9.995580971550092e-06, + "loss": 5.8243, + "step": 2730 + }, + { + "epoch": 0.055643717447916664, + "grad_norm": 18.029855728149414, + "learning_rate": 9.995564155675922e-06, + "loss": 5.796, + "step": 2735 + }, + { + "epoch": 0.055745442708333336, + "grad_norm": 14.369986534118652, + "learning_rate": 9.995547307881685e-06, + "loss": 5.5637, + "step": 2740 + }, + { + "epoch": 0.05584716796875, + "grad_norm": 14.279390335083008, + "learning_rate": 9.995530428167487e-06, + "loss": 5.6566, + "step": 2745 + }, + { + "epoch": 0.055948893229166664, + "grad_norm": 17.333528518676758, + "learning_rate": 9.995513516533436e-06, + "loss": 5.7036, + "step": 2750 + }, + { + "epoch": 0.056050618489583336, + "grad_norm": 17.63787841796875, + "learning_rate": 9.995496572979637e-06, + "loss": 5.6811, + "step": 2755 + }, + { + "epoch": 0.05615234375, + "grad_norm": 18.99125099182129, + "learning_rate": 9.995479597506201e-06, + "loss": 5.7527, + "step": 2760 + }, + { + "epoch": 0.056254069010416664, + "grad_norm": 19.664230346679688, + "learning_rate": 9.995462590113237e-06, + "loss": 5.6768, + "step": 2765 + }, + { + "epoch": 0.056355794270833336, + "grad_norm": 12.641407012939453, + "learning_rate": 9.995445550800852e-06, + "loss": 5.5588, + "step": 2770 + }, + { + "epoch": 0.05645751953125, + "grad_norm": 18.876291275024414, + "learning_rate": 9.995428479569156e-06, + "loss": 5.6864, + "step": 2775 + }, + { + "epoch": 0.056559244791666664, + "grad_norm": 14.688037872314453, + "learning_rate": 9.995411376418258e-06, + "loss": 5.5638, + "step": 2780 + }, + { + "epoch": 0.056660970052083336, + "grad_norm": 15.844191551208496, + "learning_rate": 9.995394241348267e-06, + "loss": 5.7403, + "step": 2785 + }, + { + "epoch": 0.0567626953125, + "grad_norm": 21.16891860961914, + "learning_rate": 9.995377074359294e-06, + "loss": 5.7159, + "step": 2790 + }, + { + "epoch": 0.056864420572916664, + "grad_norm": 25.63341522216797, + "learning_rate": 9.995359875451445e-06, + "loss": 5.8679, + "step": 2795 + }, + { + "epoch": 0.056966145833333336, + "grad_norm": 14.842373847961426, + "learning_rate": 9.995342644624833e-06, + "loss": 5.8859, + "step": 2800 + }, + { + "epoch": 0.05706787109375, + "grad_norm": 14.018280982971191, + "learning_rate": 9.995325381879567e-06, + "loss": 5.725, + "step": 2805 + }, + { + "epoch": 0.057169596354166664, + "grad_norm": 17.529541015625, + "learning_rate": 9.995308087215758e-06, + "loss": 5.7136, + "step": 2810 + }, + { + "epoch": 0.057271321614583336, + "grad_norm": 12.11523151397705, + "learning_rate": 9.995290760633516e-06, + "loss": 5.6948, + "step": 2815 + }, + { + "epoch": 0.057373046875, + "grad_norm": 18.07270622253418, + "learning_rate": 9.99527340213295e-06, + "loss": 5.9558, + "step": 2820 + }, + { + "epoch": 0.057474772135416664, + "grad_norm": 17.055496215820312, + "learning_rate": 9.995256011714176e-06, + "loss": 5.7501, + "step": 2825 + }, + { + "epoch": 0.057576497395833336, + "grad_norm": 13.075011253356934, + "learning_rate": 9.9952385893773e-06, + "loss": 5.6903, + "step": 2830 + }, + { + "epoch": 0.05767822265625, + "grad_norm": 14.730393409729004, + "learning_rate": 9.995221135122434e-06, + "loss": 5.5861, + "step": 2835 + }, + { + "epoch": 0.057779947916666664, + "grad_norm": 15.994260787963867, + "learning_rate": 9.995203648949692e-06, + "loss": 5.8515, + "step": 2840 + }, + { + "epoch": 0.057881673177083336, + "grad_norm": 19.641014099121094, + "learning_rate": 9.995186130859184e-06, + "loss": 5.7822, + "step": 2845 + }, + { + "epoch": 0.0579833984375, + "grad_norm": 10.764851570129395, + "learning_rate": 9.995168580851022e-06, + "loss": 5.548, + "step": 2850 + }, + { + "epoch": 0.058085123697916664, + "grad_norm": 12.756096839904785, + "learning_rate": 9.99515099892532e-06, + "loss": 5.7239, + "step": 2855 + }, + { + "epoch": 0.058186848958333336, + "grad_norm": 15.490705490112305, + "learning_rate": 9.995133385082188e-06, + "loss": 5.7277, + "step": 2860 + }, + { + "epoch": 0.05828857421875, + "grad_norm": 18.654191970825195, + "learning_rate": 9.995115739321739e-06, + "loss": 5.6086, + "step": 2865 + }, + { + "epoch": 0.058390299479166664, + "grad_norm": 16.6170711517334, + "learning_rate": 9.995098061644086e-06, + "loss": 5.8968, + "step": 2870 + }, + { + "epoch": 0.058492024739583336, + "grad_norm": 17.48712730407715, + "learning_rate": 9.995080352049341e-06, + "loss": 5.9883, + "step": 2875 + }, + { + "epoch": 0.05859375, + "grad_norm": 31.84381675720215, + "learning_rate": 9.995062610537621e-06, + "loss": 5.6568, + "step": 2880 + }, + { + "epoch": 0.058695475260416664, + "grad_norm": 11.970757484436035, + "learning_rate": 9.995044837109033e-06, + "loss": 5.7226, + "step": 2885 + }, + { + "epoch": 0.058797200520833336, + "grad_norm": 18.510433197021484, + "learning_rate": 9.995027031763697e-06, + "loss": 5.8855, + "step": 2890 + }, + { + "epoch": 0.05889892578125, + "grad_norm": 15.014832496643066, + "learning_rate": 9.995009194501724e-06, + "loss": 5.9244, + "step": 2895 + }, + { + "epoch": 0.059000651041666664, + "grad_norm": 14.760784149169922, + "learning_rate": 9.994991325323226e-06, + "loss": 5.8388, + "step": 2900 + }, + { + "epoch": 0.059102376302083336, + "grad_norm": 14.72612190246582, + "learning_rate": 9.99497342422832e-06, + "loss": 5.6346, + "step": 2905 + }, + { + "epoch": 0.0592041015625, + "grad_norm": 11.852595329284668, + "learning_rate": 9.99495549121712e-06, + "loss": 5.8797, + "step": 2910 + }, + { + "epoch": 0.059305826822916664, + "grad_norm": 17.12039566040039, + "learning_rate": 9.994937526289738e-06, + "loss": 5.5645, + "step": 2915 + }, + { + "epoch": 0.059407552083333336, + "grad_norm": 15.539984703063965, + "learning_rate": 9.994919529446294e-06, + "loss": 5.5675, + "step": 2920 + }, + { + "epoch": 0.05950927734375, + "grad_norm": 14.417523384094238, + "learning_rate": 9.994901500686897e-06, + "loss": 5.6552, + "step": 2925 + }, + { + "epoch": 0.059611002604166664, + "grad_norm": 22.501100540161133, + "learning_rate": 9.994883440011666e-06, + "loss": 5.6457, + "step": 2930 + }, + { + "epoch": 0.059712727864583336, + "grad_norm": 18.31247901916504, + "learning_rate": 9.994865347420715e-06, + "loss": 5.8815, + "step": 2935 + }, + { + "epoch": 0.059814453125, + "grad_norm": 17.698143005371094, + "learning_rate": 9.994847222914162e-06, + "loss": 5.4565, + "step": 2940 + }, + { + "epoch": 0.059916178385416664, + "grad_norm": 20.443588256835938, + "learning_rate": 9.994829066492117e-06, + "loss": 5.5698, + "step": 2945 + }, + { + "epoch": 0.060017903645833336, + "grad_norm": 13.576700210571289, + "learning_rate": 9.994810878154703e-06, + "loss": 5.5873, + "step": 2950 + }, + { + "epoch": 0.06011962890625, + "grad_norm": 17.373624801635742, + "learning_rate": 9.994792657902033e-06, + "loss": 5.8363, + "step": 2955 + }, + { + "epoch": 0.060221354166666664, + "grad_norm": 14.047901153564453, + "learning_rate": 9.994774405734222e-06, + "loss": 5.7445, + "step": 2960 + }, + { + "epoch": 0.060323079427083336, + "grad_norm": 19.167673110961914, + "learning_rate": 9.994756121651388e-06, + "loss": 5.7048, + "step": 2965 + }, + { + "epoch": 0.0604248046875, + "grad_norm": 12.773022651672363, + "learning_rate": 9.994737805653649e-06, + "loss": 5.9667, + "step": 2970 + }, + { + "epoch": 0.060526529947916664, + "grad_norm": 18.582748413085938, + "learning_rate": 9.99471945774112e-06, + "loss": 5.4681, + "step": 2975 + }, + { + "epoch": 0.060628255208333336, + "grad_norm": 12.4488525390625, + "learning_rate": 9.99470107791392e-06, + "loss": 5.7284, + "step": 2980 + }, + { + "epoch": 0.06072998046875, + "grad_norm": 17.34236717224121, + "learning_rate": 9.994682666172167e-06, + "loss": 5.4575, + "step": 2985 + }, + { + "epoch": 0.060831705729166664, + "grad_norm": 12.172924041748047, + "learning_rate": 9.994664222515975e-06, + "loss": 5.7635, + "step": 2990 + }, + { + "epoch": 0.060933430989583336, + "grad_norm": 17.777202606201172, + "learning_rate": 9.994645746945465e-06, + "loss": 5.6927, + "step": 2995 + }, + { + "epoch": 0.06103515625, + "grad_norm": 13.568960189819336, + "learning_rate": 9.994627239460754e-06, + "loss": 5.6477, + "step": 3000 + }, + { + "epoch": 0.061136881510416664, + "grad_norm": 18.516103744506836, + "learning_rate": 9.99460870006196e-06, + "loss": 5.7988, + "step": 3005 + }, + { + "epoch": 0.061238606770833336, + "grad_norm": 18.931724548339844, + "learning_rate": 9.994590128749201e-06, + "loss": 5.6103, + "step": 3010 + }, + { + "epoch": 0.06134033203125, + "grad_norm": 17.17027473449707, + "learning_rate": 9.9945715255226e-06, + "loss": 5.6378, + "step": 3015 + }, + { + "epoch": 0.061442057291666664, + "grad_norm": 26.34682273864746, + "learning_rate": 9.99455289038227e-06, + "loss": 5.7115, + "step": 3020 + }, + { + "epoch": 0.061543782552083336, + "grad_norm": 14.93790054321289, + "learning_rate": 9.994534223328333e-06, + "loss": 5.7794, + "step": 3025 + }, + { + "epoch": 0.0616455078125, + "grad_norm": 18.503334045410156, + "learning_rate": 9.994515524360908e-06, + "loss": 5.6769, + "step": 3030 + }, + { + "epoch": 0.061747233072916664, + "grad_norm": 11.464913368225098, + "learning_rate": 9.994496793480112e-06, + "loss": 5.5122, + "step": 3035 + }, + { + "epoch": 0.061848958333333336, + "grad_norm": 16.46463394165039, + "learning_rate": 9.99447803068607e-06, + "loss": 5.365, + "step": 3040 + }, + { + "epoch": 0.06195068359375, + "grad_norm": 15.740452766418457, + "learning_rate": 9.994459235978898e-06, + "loss": 5.9167, + "step": 3045 + }, + { + "epoch": 0.062052408854166664, + "grad_norm": 20.950214385986328, + "learning_rate": 9.994440409358716e-06, + "loss": 5.6398, + "step": 3050 + }, + { + "epoch": 0.062154134114583336, + "grad_norm": 17.207698822021484, + "learning_rate": 9.994421550825647e-06, + "loss": 5.7125, + "step": 3055 + }, + { + "epoch": 0.062255859375, + "grad_norm": 14.788219451904297, + "learning_rate": 9.994402660379809e-06, + "loss": 5.7614, + "step": 3060 + }, + { + "epoch": 0.062357584635416664, + "grad_norm": 13.497633934020996, + "learning_rate": 9.994383738021322e-06, + "loss": 5.4549, + "step": 3065 + }, + { + "epoch": 0.062459309895833336, + "grad_norm": 17.987503051757812, + "learning_rate": 9.99436478375031e-06, + "loss": 5.7754, + "step": 3070 + }, + { + "epoch": 0.06256103515625, + "grad_norm": 14.038566589355469, + "learning_rate": 9.994345797566892e-06, + "loss": 5.7106, + "step": 3075 + }, + { + "epoch": 0.06266276041666667, + "grad_norm": 14.654552459716797, + "learning_rate": 9.99432677947119e-06, + "loss": 5.7604, + "step": 3080 + }, + { + "epoch": 0.06276448567708333, + "grad_norm": 22.379453659057617, + "learning_rate": 9.994307729463323e-06, + "loss": 5.8748, + "step": 3085 + }, + { + "epoch": 0.0628662109375, + "grad_norm": 22.711963653564453, + "learning_rate": 9.99428864754342e-06, + "loss": 5.7547, + "step": 3090 + }, + { + "epoch": 0.06296793619791667, + "grad_norm": 15.584921836853027, + "learning_rate": 9.994269533711593e-06, + "loss": 5.6496, + "step": 3095 + }, + { + "epoch": 0.06306966145833333, + "grad_norm": 14.515154838562012, + "learning_rate": 9.994250387967971e-06, + "loss": 5.7558, + "step": 3100 + }, + { + "epoch": 0.06317138671875, + "grad_norm": 15.196714401245117, + "learning_rate": 9.994231210312675e-06, + "loss": 5.7071, + "step": 3105 + }, + { + "epoch": 0.06327311197916667, + "grad_norm": 30.60645294189453, + "learning_rate": 9.994212000745827e-06, + "loss": 5.6315, + "step": 3110 + }, + { + "epoch": 0.06337483723958333, + "grad_norm": 21.41769790649414, + "learning_rate": 9.994192759267549e-06, + "loss": 5.7617, + "step": 3115 + }, + { + "epoch": 0.0634765625, + "grad_norm": 15.299363136291504, + "learning_rate": 9.994173485877964e-06, + "loss": 5.5109, + "step": 3120 + }, + { + "epoch": 0.06357828776041667, + "grad_norm": 12.069134712219238, + "learning_rate": 9.994154180577196e-06, + "loss": 5.4975, + "step": 3125 + }, + { + "epoch": 0.06368001302083333, + "grad_norm": 13.40970516204834, + "learning_rate": 9.99413484336537e-06, + "loss": 5.7065, + "step": 3130 + }, + { + "epoch": 0.06378173828125, + "grad_norm": 11.88183879852295, + "learning_rate": 9.994115474242606e-06, + "loss": 5.8718, + "step": 3135 + }, + { + "epoch": 0.06388346354166667, + "grad_norm": 13.359987258911133, + "learning_rate": 9.99409607320903e-06, + "loss": 5.7036, + "step": 3140 + }, + { + "epoch": 0.06398518880208333, + "grad_norm": 13.231866836547852, + "learning_rate": 9.994076640264765e-06, + "loss": 5.6578, + "step": 3145 + }, + { + "epoch": 0.0640869140625, + "grad_norm": 16.55290412902832, + "learning_rate": 9.994057175409936e-06, + "loss": 5.6388, + "step": 3150 + }, + { + "epoch": 0.06418863932291667, + "grad_norm": 15.25450611114502, + "learning_rate": 9.994037678644667e-06, + "loss": 5.8736, + "step": 3155 + }, + { + "epoch": 0.06429036458333333, + "grad_norm": 15.898117065429688, + "learning_rate": 9.994018149969083e-06, + "loss": 5.9608, + "step": 3160 + }, + { + "epoch": 0.06439208984375, + "grad_norm": 12.63216781616211, + "learning_rate": 9.993998589383306e-06, + "loss": 5.5666, + "step": 3165 + }, + { + "epoch": 0.06449381510416667, + "grad_norm": 15.083229064941406, + "learning_rate": 9.993978996887466e-06, + "loss": 5.8547, + "step": 3170 + }, + { + "epoch": 0.06459554036458333, + "grad_norm": 15.309268951416016, + "learning_rate": 9.993959372481684e-06, + "loss": 5.926, + "step": 3175 + }, + { + "epoch": 0.064697265625, + "grad_norm": 20.166330337524414, + "learning_rate": 9.993939716166089e-06, + "loss": 5.6975, + "step": 3180 + }, + { + "epoch": 0.06479899088541667, + "grad_norm": 25.244525909423828, + "learning_rate": 9.993920027940802e-06, + "loss": 5.5598, + "step": 3185 + }, + { + "epoch": 0.06490071614583333, + "grad_norm": 18.462827682495117, + "learning_rate": 9.993900307805953e-06, + "loss": 5.8015, + "step": 3190 + }, + { + "epoch": 0.06500244140625, + "grad_norm": 22.435009002685547, + "learning_rate": 9.993880555761664e-06, + "loss": 5.8164, + "step": 3195 + }, + { + "epoch": 0.06510416666666667, + "grad_norm": 13.359432220458984, + "learning_rate": 9.993860771808066e-06, + "loss": 6.036, + "step": 3200 + }, + { + "epoch": 0.06520589192708333, + "grad_norm": 17.164249420166016, + "learning_rate": 9.993840955945282e-06, + "loss": 5.7477, + "step": 3205 + }, + { + "epoch": 0.0653076171875, + "grad_norm": 13.667597770690918, + "learning_rate": 9.99382110817344e-06, + "loss": 5.9048, + "step": 3210 + }, + { + "epoch": 0.06540934244791667, + "grad_norm": 13.1735258102417, + "learning_rate": 9.993801228492665e-06, + "loss": 5.7643, + "step": 3215 + }, + { + "epoch": 0.06551106770833333, + "grad_norm": 23.154888153076172, + "learning_rate": 9.993781316903087e-06, + "loss": 5.7094, + "step": 3220 + }, + { + "epoch": 0.06561279296875, + "grad_norm": 16.214889526367188, + "learning_rate": 9.993761373404831e-06, + "loss": 5.7533, + "step": 3225 + }, + { + "epoch": 0.06571451822916667, + "grad_norm": 21.512401580810547, + "learning_rate": 9.993741397998025e-06, + "loss": 5.6617, + "step": 3230 + }, + { + "epoch": 0.06581624348958333, + "grad_norm": 12.785857200622559, + "learning_rate": 9.993721390682796e-06, + "loss": 5.5152, + "step": 3235 + }, + { + "epoch": 0.06591796875, + "grad_norm": 16.219192504882812, + "learning_rate": 9.993701351459272e-06, + "loss": 5.8239, + "step": 3240 + }, + { + "epoch": 0.06601969401041667, + "grad_norm": 16.22003173828125, + "learning_rate": 9.993681280327585e-06, + "loss": 5.4527, + "step": 3245 + }, + { + "epoch": 0.06612141927083333, + "grad_norm": 18.68373680114746, + "learning_rate": 9.993661177287857e-06, + "loss": 5.624, + "step": 3250 + }, + { + "epoch": 0.06622314453125, + "grad_norm": 26.19914436340332, + "learning_rate": 9.99364104234022e-06, + "loss": 5.536, + "step": 3255 + }, + { + "epoch": 0.06632486979166667, + "grad_norm": 15.746367454528809, + "learning_rate": 9.993620875484803e-06, + "loss": 5.4907, + "step": 3260 + }, + { + "epoch": 0.06642659505208333, + "grad_norm": 18.941043853759766, + "learning_rate": 9.99360067672173e-06, + "loss": 5.7713, + "step": 3265 + }, + { + "epoch": 0.0665283203125, + "grad_norm": 15.097516059875488, + "learning_rate": 9.993580446051139e-06, + "loss": 5.855, + "step": 3270 + }, + { + "epoch": 0.06663004557291667, + "grad_norm": 17.312793731689453, + "learning_rate": 9.99356018347315e-06, + "loss": 5.6285, + "step": 3275 + }, + { + "epoch": 0.06673177083333333, + "grad_norm": 13.584954261779785, + "learning_rate": 9.993539888987899e-06, + "loss": 5.7798, + "step": 3280 + }, + { + "epoch": 0.06683349609375, + "grad_norm": 19.160978317260742, + "learning_rate": 9.993519562595513e-06, + "loss": 5.845, + "step": 3285 + }, + { + "epoch": 0.06693522135416667, + "grad_norm": 21.065378189086914, + "learning_rate": 9.993499204296121e-06, + "loss": 6.0915, + "step": 3290 + }, + { + "epoch": 0.06703694661458333, + "grad_norm": 16.311445236206055, + "learning_rate": 9.993478814089854e-06, + "loss": 5.6177, + "step": 3295 + }, + { + "epoch": 0.067138671875, + "grad_norm": 16.458982467651367, + "learning_rate": 9.993458391976845e-06, + "loss": 5.6747, + "step": 3300 + }, + { + "epoch": 0.06724039713541667, + "grad_norm": 13.889200210571289, + "learning_rate": 9.99343793795722e-06, + "loss": 5.7001, + "step": 3305 + }, + { + "epoch": 0.06734212239583333, + "grad_norm": 14.881014823913574, + "learning_rate": 9.993417452031112e-06, + "loss": 6.116, + "step": 3310 + }, + { + "epoch": 0.06744384765625, + "grad_norm": 16.67597770690918, + "learning_rate": 9.993396934198652e-06, + "loss": 5.6254, + "step": 3315 + }, + { + "epoch": 0.06754557291666667, + "grad_norm": 18.107507705688477, + "learning_rate": 9.99337638445997e-06, + "loss": 5.3732, + "step": 3320 + }, + { + "epoch": 0.06764729817708333, + "grad_norm": 40.19498825073242, + "learning_rate": 9.993355802815198e-06, + "loss": 6.1674, + "step": 3325 + }, + { + "epoch": 0.0677490234375, + "grad_norm": 20.098541259765625, + "learning_rate": 9.993335189264468e-06, + "loss": 5.8208, + "step": 3330 + }, + { + "epoch": 0.06785074869791667, + "grad_norm": 18.023733139038086, + "learning_rate": 9.993314543807913e-06, + "loss": 5.5537, + "step": 3335 + }, + { + "epoch": 0.06795247395833333, + "grad_norm": 14.456014633178711, + "learning_rate": 9.99329386644566e-06, + "loss": 5.5451, + "step": 3340 + }, + { + "epoch": 0.06805419921875, + "grad_norm": 20.077524185180664, + "learning_rate": 9.993273157177845e-06, + "loss": 6.2014, + "step": 3345 + }, + { + "epoch": 0.06815592447916667, + "grad_norm": 18.06685447692871, + "learning_rate": 9.993252416004603e-06, + "loss": 5.6105, + "step": 3350 + }, + { + "epoch": 0.06825764973958333, + "grad_norm": 20.9133358001709, + "learning_rate": 9.993231642926058e-06, + "loss": 5.742, + "step": 3355 + }, + { + "epoch": 0.068359375, + "grad_norm": 15.375894546508789, + "learning_rate": 9.993210837942351e-06, + "loss": 5.9184, + "step": 3360 + }, + { + "epoch": 0.06846110026041667, + "grad_norm": 19.78826141357422, + "learning_rate": 9.993190001053611e-06, + "loss": 5.7199, + "step": 3365 + }, + { + "epoch": 0.06856282552083333, + "grad_norm": 19.003822326660156, + "learning_rate": 9.993169132259973e-06, + "loss": 6.2923, + "step": 3370 + }, + { + "epoch": 0.06866455078125, + "grad_norm": 19.476451873779297, + "learning_rate": 9.993148231561566e-06, + "loss": 5.432, + "step": 3375 + }, + { + "epoch": 0.06876627604166667, + "grad_norm": 14.09756851196289, + "learning_rate": 9.99312729895853e-06, + "loss": 5.5983, + "step": 3380 + }, + { + "epoch": 0.06886800130208333, + "grad_norm": 15.839126586914062, + "learning_rate": 9.993106334450995e-06, + "loss": 5.6202, + "step": 3385 + }, + { + "epoch": 0.0689697265625, + "grad_norm": 14.705282211303711, + "learning_rate": 9.993085338039094e-06, + "loss": 5.5722, + "step": 3390 + }, + { + "epoch": 0.06907145182291667, + "grad_norm": 16.702863693237305, + "learning_rate": 9.993064309722965e-06, + "loss": 6.0448, + "step": 3395 + }, + { + "epoch": 0.06917317708333333, + "grad_norm": 15.585478782653809, + "learning_rate": 9.99304324950274e-06, + "loss": 5.764, + "step": 3400 + }, + { + "epoch": 0.06927490234375, + "grad_norm": 13.260176658630371, + "learning_rate": 9.993022157378551e-06, + "loss": 5.5302, + "step": 3405 + }, + { + "epoch": 0.06937662760416667, + "grad_norm": 16.192020416259766, + "learning_rate": 9.993001033350538e-06, + "loss": 5.5107, + "step": 3410 + }, + { + "epoch": 0.06947835286458333, + "grad_norm": 15.448719024658203, + "learning_rate": 9.992979877418833e-06, + "loss": 5.6724, + "step": 3415 + }, + { + "epoch": 0.069580078125, + "grad_norm": 14.095213890075684, + "learning_rate": 9.99295868958357e-06, + "loss": 5.7174, + "step": 3420 + }, + { + "epoch": 0.06968180338541667, + "grad_norm": 15.745182991027832, + "learning_rate": 9.992937469844888e-06, + "loss": 5.7602, + "step": 3425 + }, + { + "epoch": 0.06978352864583333, + "grad_norm": 17.92978858947754, + "learning_rate": 9.99291621820292e-06, + "loss": 5.3933, + "step": 3430 + }, + { + "epoch": 0.06988525390625, + "grad_norm": 21.03660774230957, + "learning_rate": 9.992894934657802e-06, + "loss": 5.7285, + "step": 3435 + }, + { + "epoch": 0.06998697916666667, + "grad_norm": 21.50324249267578, + "learning_rate": 9.992873619209668e-06, + "loss": 5.5095, + "step": 3440 + }, + { + "epoch": 0.07008870442708333, + "grad_norm": 16.907894134521484, + "learning_rate": 9.99285227185866e-06, + "loss": 5.6426, + "step": 3445 + }, + { + "epoch": 0.0701904296875, + "grad_norm": 11.66292667388916, + "learning_rate": 9.99283089260491e-06, + "loss": 5.6304, + "step": 3450 + }, + { + "epoch": 0.07029215494791667, + "grad_norm": 16.819149017333984, + "learning_rate": 9.992809481448555e-06, + "loss": 5.9132, + "step": 3455 + }, + { + "epoch": 0.07039388020833333, + "grad_norm": 13.198978424072266, + "learning_rate": 9.992788038389733e-06, + "loss": 5.7598, + "step": 3460 + }, + { + "epoch": 0.07049560546875, + "grad_norm": 16.451580047607422, + "learning_rate": 9.99276656342858e-06, + "loss": 5.7784, + "step": 3465 + }, + { + "epoch": 0.07059733072916667, + "grad_norm": 16.67352294921875, + "learning_rate": 9.992745056565233e-06, + "loss": 5.7732, + "step": 3470 + }, + { + "epoch": 0.07069905598958333, + "grad_norm": 13.356194496154785, + "learning_rate": 9.992723517799831e-06, + "loss": 5.654, + "step": 3475 + }, + { + "epoch": 0.07080078125, + "grad_norm": 14.685572624206543, + "learning_rate": 9.99270194713251e-06, + "loss": 5.6582, + "step": 3480 + }, + { + "epoch": 0.07090250651041667, + "grad_norm": 17.127084732055664, + "learning_rate": 9.992680344563408e-06, + "loss": 5.6932, + "step": 3485 + }, + { + "epoch": 0.07100423177083333, + "grad_norm": 19.33751106262207, + "learning_rate": 9.992658710092665e-06, + "loss": 5.6708, + "step": 3490 + }, + { + "epoch": 0.07110595703125, + "grad_norm": 13.083264350891113, + "learning_rate": 9.992637043720416e-06, + "loss": 5.5942, + "step": 3495 + }, + { + "epoch": 0.07120768229166667, + "grad_norm": 21.22812271118164, + "learning_rate": 9.992615345446801e-06, + "loss": 5.754, + "step": 3500 + }, + { + "epoch": 0.07130940755208333, + "grad_norm": 14.943262100219727, + "learning_rate": 9.99259361527196e-06, + "loss": 5.5041, + "step": 3505 + }, + { + "epoch": 0.0714111328125, + "grad_norm": 15.397842407226562, + "learning_rate": 9.99257185319603e-06, + "loss": 5.6191, + "step": 3510 + }, + { + "epoch": 0.07151285807291667, + "grad_norm": 12.162185668945312, + "learning_rate": 9.992550059219153e-06, + "loss": 5.6896, + "step": 3515 + }, + { + "epoch": 0.07161458333333333, + "grad_norm": 11.951550483703613, + "learning_rate": 9.992528233341463e-06, + "loss": 5.6006, + "step": 3520 + }, + { + "epoch": 0.07171630859375, + "grad_norm": 21.016151428222656, + "learning_rate": 9.992506375563104e-06, + "loss": 5.7039, + "step": 3525 + }, + { + "epoch": 0.07181803385416667, + "grad_norm": 12.688127517700195, + "learning_rate": 9.992484485884213e-06, + "loss": 5.6211, + "step": 3530 + }, + { + "epoch": 0.07191975911458333, + "grad_norm": 14.555366516113281, + "learning_rate": 9.992462564304932e-06, + "loss": 5.6643, + "step": 3535 + }, + { + "epoch": 0.072021484375, + "grad_norm": 12.18079948425293, + "learning_rate": 9.9924406108254e-06, + "loss": 5.7278, + "step": 3540 + }, + { + "epoch": 0.07212320963541667, + "grad_norm": 12.764280319213867, + "learning_rate": 9.992418625445757e-06, + "loss": 5.6253, + "step": 3545 + }, + { + "epoch": 0.07222493489583333, + "grad_norm": 17.25390625, + "learning_rate": 9.992396608166144e-06, + "loss": 5.5234, + "step": 3550 + }, + { + "epoch": 0.07232666015625, + "grad_norm": 13.60385799407959, + "learning_rate": 9.9923745589867e-06, + "loss": 5.8179, + "step": 3555 + }, + { + "epoch": 0.07242838541666667, + "grad_norm": 20.5423583984375, + "learning_rate": 9.99235247790757e-06, + "loss": 5.6818, + "step": 3560 + }, + { + "epoch": 0.07253011067708333, + "grad_norm": 16.143699645996094, + "learning_rate": 9.99233036492889e-06, + "loss": 5.7102, + "step": 3565 + }, + { + "epoch": 0.0726318359375, + "grad_norm": 14.185507774353027, + "learning_rate": 9.992308220050804e-06, + "loss": 5.5982, + "step": 3570 + }, + { + "epoch": 0.07273356119791667, + "grad_norm": 15.181941032409668, + "learning_rate": 9.992286043273456e-06, + "loss": 5.4114, + "step": 3575 + }, + { + "epoch": 0.07283528645833333, + "grad_norm": 13.966020584106445, + "learning_rate": 9.992263834596982e-06, + "loss": 5.3883, + "step": 3580 + }, + { + "epoch": 0.07293701171875, + "grad_norm": 13.988459587097168, + "learning_rate": 9.992241594021527e-06, + "loss": 5.5825, + "step": 3585 + }, + { + "epoch": 0.07303873697916667, + "grad_norm": 22.905550003051758, + "learning_rate": 9.992219321547235e-06, + "loss": 5.4545, + "step": 3590 + }, + { + "epoch": 0.07314046223958333, + "grad_norm": 22.324756622314453, + "learning_rate": 9.992197017174244e-06, + "loss": 5.5717, + "step": 3595 + }, + { + "epoch": 0.0732421875, + "grad_norm": 14.744086265563965, + "learning_rate": 9.9921746809027e-06, + "loss": 5.3607, + "step": 3600 + }, + { + "epoch": 0.07334391276041667, + "grad_norm": 13.760676383972168, + "learning_rate": 9.992152312732744e-06, + "loss": 5.6267, + "step": 3605 + }, + { + "epoch": 0.07344563802083333, + "grad_norm": 17.526702880859375, + "learning_rate": 9.99212991266452e-06, + "loss": 5.3918, + "step": 3610 + }, + { + "epoch": 0.07354736328125, + "grad_norm": 20.085227966308594, + "learning_rate": 9.99210748069817e-06, + "loss": 5.4907, + "step": 3615 + }, + { + "epoch": 0.07364908854166667, + "grad_norm": 15.037704467773438, + "learning_rate": 9.992085016833839e-06, + "loss": 5.519, + "step": 3620 + }, + { + "epoch": 0.07375081380208333, + "grad_norm": 18.753063201904297, + "learning_rate": 9.992062521071669e-06, + "loss": 5.8823, + "step": 3625 + }, + { + "epoch": 0.0738525390625, + "grad_norm": 16.151647567749023, + "learning_rate": 9.992039993411804e-06, + "loss": 5.3356, + "step": 3630 + }, + { + "epoch": 0.07395426432291667, + "grad_norm": 19.61491584777832, + "learning_rate": 9.992017433854387e-06, + "loss": 5.4548, + "step": 3635 + }, + { + "epoch": 0.07405598958333333, + "grad_norm": 13.386198043823242, + "learning_rate": 9.991994842399566e-06, + "loss": 5.6056, + "step": 3640 + }, + { + "epoch": 0.07415771484375, + "grad_norm": 15.04864501953125, + "learning_rate": 9.99197221904748e-06, + "loss": 5.7137, + "step": 3645 + }, + { + "epoch": 0.07425944010416667, + "grad_norm": 17.29755973815918, + "learning_rate": 9.991949563798278e-06, + "loss": 5.7492, + "step": 3650 + }, + { + "epoch": 0.07436116536458333, + "grad_norm": 18.11957359313965, + "learning_rate": 9.9919268766521e-06, + "loss": 6.0339, + "step": 3655 + }, + { + "epoch": 0.074462890625, + "grad_norm": 14.732203483581543, + "learning_rate": 9.991904157609097e-06, + "loss": 5.7948, + "step": 3660 + }, + { + "epoch": 0.07456461588541667, + "grad_norm": 19.462589263916016, + "learning_rate": 9.99188140666941e-06, + "loss": 5.7752, + "step": 3665 + }, + { + "epoch": 0.07466634114583333, + "grad_norm": 16.262208938598633, + "learning_rate": 9.991858623833186e-06, + "loss": 5.5989, + "step": 3670 + }, + { + "epoch": 0.07476806640625, + "grad_norm": 12.902961730957031, + "learning_rate": 9.99183580910057e-06, + "loss": 5.8408, + "step": 3675 + }, + { + "epoch": 0.07486979166666667, + "grad_norm": 20.194639205932617, + "learning_rate": 9.991812962471706e-06, + "loss": 5.4535, + "step": 3680 + }, + { + "epoch": 0.07497151692708333, + "grad_norm": 14.136159896850586, + "learning_rate": 9.991790083946744e-06, + "loss": 5.356, + "step": 3685 + }, + { + "epoch": 0.0750732421875, + "grad_norm": 21.490060806274414, + "learning_rate": 9.991767173525825e-06, + "loss": 5.5736, + "step": 3690 + }, + { + "epoch": 0.07517496744791667, + "grad_norm": 17.796281814575195, + "learning_rate": 9.991744231209102e-06, + "loss": 5.6632, + "step": 3695 + }, + { + "epoch": 0.07527669270833333, + "grad_norm": 21.41227912902832, + "learning_rate": 9.991721256996715e-06, + "loss": 5.8805, + "step": 3700 + }, + { + "epoch": 0.07537841796875, + "grad_norm": 18.11663818359375, + "learning_rate": 9.991698250888817e-06, + "loss": 5.3522, + "step": 3705 + }, + { + "epoch": 0.07548014322916667, + "grad_norm": 14.365523338317871, + "learning_rate": 9.991675212885549e-06, + "loss": 5.5042, + "step": 3710 + }, + { + "epoch": 0.07558186848958333, + "grad_norm": 19.356510162353516, + "learning_rate": 9.991652142987063e-06, + "loss": 5.6159, + "step": 3715 + }, + { + "epoch": 0.07568359375, + "grad_norm": 17.551382064819336, + "learning_rate": 9.991629041193503e-06, + "loss": 5.6922, + "step": 3720 + }, + { + "epoch": 0.07578531901041667, + "grad_norm": 17.498254776000977, + "learning_rate": 9.991605907505018e-06, + "loss": 5.5397, + "step": 3725 + }, + { + "epoch": 0.07588704427083333, + "grad_norm": 12.429972648620605, + "learning_rate": 9.991582741921756e-06, + "loss": 5.7234, + "step": 3730 + }, + { + "epoch": 0.07598876953125, + "grad_norm": 15.403250694274902, + "learning_rate": 9.991559544443865e-06, + "loss": 5.7151, + "step": 3735 + }, + { + "epoch": 0.07609049479166667, + "grad_norm": 16.9572811126709, + "learning_rate": 9.991536315071493e-06, + "loss": 5.5576, + "step": 3740 + }, + { + "epoch": 0.07619222005208333, + "grad_norm": 14.087833404541016, + "learning_rate": 9.991513053804788e-06, + "loss": 5.3468, + "step": 3745 + }, + { + "epoch": 0.0762939453125, + "grad_norm": 16.610572814941406, + "learning_rate": 9.9914897606439e-06, + "loss": 5.596, + "step": 3750 + }, + { + "epoch": 0.07639567057291667, + "grad_norm": 16.353437423706055, + "learning_rate": 9.991466435588976e-06, + "loss": 5.5454, + "step": 3755 + }, + { + "epoch": 0.07649739583333333, + "grad_norm": 13.357071876525879, + "learning_rate": 9.991443078640167e-06, + "loss": 5.6136, + "step": 3760 + }, + { + "epoch": 0.07659912109375, + "grad_norm": 19.992855072021484, + "learning_rate": 9.991419689797619e-06, + "loss": 5.5569, + "step": 3765 + }, + { + "epoch": 0.07670084635416667, + "grad_norm": 17.082693099975586, + "learning_rate": 9.991396269061486e-06, + "loss": 5.6151, + "step": 3770 + }, + { + "epoch": 0.07680257161458333, + "grad_norm": 11.539113998413086, + "learning_rate": 9.991372816431914e-06, + "loss": 5.5225, + "step": 3775 + }, + { + "epoch": 0.076904296875, + "grad_norm": 18.36899185180664, + "learning_rate": 9.991349331909055e-06, + "loss": 5.6519, + "step": 3780 + }, + { + "epoch": 0.07700602213541667, + "grad_norm": 18.98521614074707, + "learning_rate": 9.991325815493058e-06, + "loss": 5.5587, + "step": 3785 + }, + { + "epoch": 0.07710774739583333, + "grad_norm": 19.45314598083496, + "learning_rate": 9.991302267184072e-06, + "loss": 5.6461, + "step": 3790 + }, + { + "epoch": 0.07720947265625, + "grad_norm": 16.975543975830078, + "learning_rate": 9.991278686982254e-06, + "loss": 5.5315, + "step": 3795 + }, + { + "epoch": 0.07731119791666667, + "grad_norm": 18.802888870239258, + "learning_rate": 9.991255074887745e-06, + "loss": 5.6003, + "step": 3800 + }, + { + "epoch": 0.07741292317708333, + "grad_norm": 21.006961822509766, + "learning_rate": 9.9912314309007e-06, + "loss": 5.6444, + "step": 3805 + }, + { + "epoch": 0.0775146484375, + "grad_norm": 19.797279357910156, + "learning_rate": 9.991207755021272e-06, + "loss": 5.6039, + "step": 3810 + }, + { + "epoch": 0.07761637369791667, + "grad_norm": 16.317195892333984, + "learning_rate": 9.991184047249612e-06, + "loss": 5.5985, + "step": 3815 + }, + { + "epoch": 0.07771809895833333, + "grad_norm": 16.129213333129883, + "learning_rate": 9.99116030758587e-06, + "loss": 5.5956, + "step": 3820 + }, + { + "epoch": 0.07781982421875, + "grad_norm": 14.54155445098877, + "learning_rate": 9.991136536030198e-06, + "loss": 5.648, + "step": 3825 + }, + { + "epoch": 0.07792154947916667, + "grad_norm": 17.49031639099121, + "learning_rate": 9.991112732582746e-06, + "loss": 5.7036, + "step": 3830 + }, + { + "epoch": 0.07802327473958333, + "grad_norm": 14.745145797729492, + "learning_rate": 9.99108889724367e-06, + "loss": 5.5859, + "step": 3835 + }, + { + "epoch": 0.078125, + "grad_norm": 14.876293182373047, + "learning_rate": 9.991065030013121e-06, + "loss": 5.7074, + "step": 3840 + }, + { + "epoch": 0.07822672526041667, + "grad_norm": 13.54076862335205, + "learning_rate": 9.991041130891248e-06, + "loss": 5.3749, + "step": 3845 + }, + { + "epoch": 0.07832845052083333, + "grad_norm": 14.782487869262695, + "learning_rate": 9.991017199878208e-06, + "loss": 5.593, + "step": 3850 + }, + { + "epoch": 0.07843017578125, + "grad_norm": 30.088090896606445, + "learning_rate": 9.990993236974154e-06, + "loss": 6.0875, + "step": 3855 + }, + { + "epoch": 0.07853190104166667, + "grad_norm": 17.7982177734375, + "learning_rate": 9.990969242179235e-06, + "loss": 5.558, + "step": 3860 + }, + { + "epoch": 0.07863362630208333, + "grad_norm": 14.655909538269043, + "learning_rate": 9.990945215493609e-06, + "loss": 5.514, + "step": 3865 + }, + { + "epoch": 0.0787353515625, + "grad_norm": 13.807270050048828, + "learning_rate": 9.990921156917426e-06, + "loss": 5.8323, + "step": 3870 + }, + { + "epoch": 0.07883707682291667, + "grad_norm": 17.626304626464844, + "learning_rate": 9.990897066450842e-06, + "loss": 5.5105, + "step": 3875 + }, + { + "epoch": 0.07893880208333333, + "grad_norm": 15.067071914672852, + "learning_rate": 9.99087294409401e-06, + "loss": 5.5879, + "step": 3880 + }, + { + "epoch": 0.07904052734375, + "grad_norm": 14.102787017822266, + "learning_rate": 9.990848789847084e-06, + "loss": 5.6873, + "step": 3885 + }, + { + "epoch": 0.07914225260416667, + "grad_norm": 13.035388946533203, + "learning_rate": 9.990824603710217e-06, + "loss": 5.5663, + "step": 3890 + }, + { + "epoch": 0.07924397786458333, + "grad_norm": 14.296939849853516, + "learning_rate": 9.990800385683568e-06, + "loss": 5.6238, + "step": 3895 + }, + { + "epoch": 0.079345703125, + "grad_norm": 11.668535232543945, + "learning_rate": 9.990776135767286e-06, + "loss": 5.6074, + "step": 3900 + }, + { + "epoch": 0.07944742838541667, + "grad_norm": 18.606346130371094, + "learning_rate": 9.990751853961531e-06, + "loss": 5.6742, + "step": 3905 + }, + { + "epoch": 0.07954915364583333, + "grad_norm": 22.075462341308594, + "learning_rate": 9.990727540266453e-06, + "loss": 5.4558, + "step": 3910 + }, + { + "epoch": 0.07965087890625, + "grad_norm": 22.347639083862305, + "learning_rate": 9.990703194682214e-06, + "loss": 5.8987, + "step": 3915 + }, + { + "epoch": 0.07975260416666667, + "grad_norm": 17.195466995239258, + "learning_rate": 9.990678817208963e-06, + "loss": 5.6763, + "step": 3920 + }, + { + "epoch": 0.07985432942708333, + "grad_norm": 16.926132202148438, + "learning_rate": 9.99065440784686e-06, + "loss": 5.5527, + "step": 3925 + }, + { + "epoch": 0.0799560546875, + "grad_norm": 19.848102569580078, + "learning_rate": 9.99062996659606e-06, + "loss": 5.7213, + "step": 3930 + }, + { + "epoch": 0.08005777994791667, + "grad_norm": 15.301279067993164, + "learning_rate": 9.990605493456717e-06, + "loss": 5.6227, + "step": 3935 + }, + { + "epoch": 0.08015950520833333, + "grad_norm": 24.404170989990234, + "learning_rate": 9.990580988428991e-06, + "loss": 5.6465, + "step": 3940 + }, + { + "epoch": 0.08026123046875, + "grad_norm": 16.80660629272461, + "learning_rate": 9.990556451513035e-06, + "loss": 5.4868, + "step": 3945 + }, + { + "epoch": 0.08036295572916667, + "grad_norm": 15.071172714233398, + "learning_rate": 9.99053188270901e-06, + "loss": 5.543, + "step": 3950 + }, + { + "epoch": 0.08046468098958333, + "grad_norm": 11.686379432678223, + "learning_rate": 9.99050728201707e-06, + "loss": 5.8442, + "step": 3955 + }, + { + "epoch": 0.08056640625, + "grad_norm": 14.876461029052734, + "learning_rate": 9.99048264943737e-06, + "loss": 5.4852, + "step": 3960 + }, + { + "epoch": 0.08066813151041667, + "grad_norm": 12.678647994995117, + "learning_rate": 9.990457984970072e-06, + "loss": 5.7485, + "step": 3965 + }, + { + "epoch": 0.08076985677083333, + "grad_norm": 16.776098251342773, + "learning_rate": 9.99043328861533e-06, + "loss": 5.3856, + "step": 3970 + }, + { + "epoch": 0.08087158203125, + "grad_norm": 28.56300163269043, + "learning_rate": 9.990408560373307e-06, + "loss": 5.4186, + "step": 3975 + }, + { + "epoch": 0.08097330729166667, + "grad_norm": 14.247042655944824, + "learning_rate": 9.990383800244154e-06, + "loss": 5.7003, + "step": 3980 + }, + { + "epoch": 0.08107503255208333, + "grad_norm": 16.04862403869629, + "learning_rate": 9.990359008228034e-06, + "loss": 5.4918, + "step": 3985 + }, + { + "epoch": 0.0811767578125, + "grad_norm": 15.074546813964844, + "learning_rate": 9.990334184325104e-06, + "loss": 5.6631, + "step": 3990 + }, + { + "epoch": 0.08127848307291667, + "grad_norm": 15.562714576721191, + "learning_rate": 9.990309328535523e-06, + "loss": 5.5325, + "step": 3995 + }, + { + "epoch": 0.08138020833333333, + "grad_norm": 22.600017547607422, + "learning_rate": 9.99028444085945e-06, + "loss": 5.7178, + "step": 4000 + }, + { + "epoch": 0.08148193359375, + "grad_norm": 16.070743560791016, + "learning_rate": 9.990259521297042e-06, + "loss": 5.4819, + "step": 4005 + }, + { + "epoch": 0.08158365885416667, + "grad_norm": 17.64358139038086, + "learning_rate": 9.99023456984846e-06, + "loss": 5.7384, + "step": 4010 + }, + { + "epoch": 0.08168538411458333, + "grad_norm": 14.198810577392578, + "learning_rate": 9.990209586513866e-06, + "loss": 5.6118, + "step": 4015 + }, + { + "epoch": 0.081787109375, + "grad_norm": 13.75503921508789, + "learning_rate": 9.990184571293414e-06, + "loss": 5.4935, + "step": 4020 + }, + { + "epoch": 0.08188883463541667, + "grad_norm": 16.780900955200195, + "learning_rate": 9.990159524187267e-06, + "loss": 5.7327, + "step": 4025 + }, + { + "epoch": 0.08199055989583333, + "grad_norm": 13.935791969299316, + "learning_rate": 9.990134445195585e-06, + "loss": 5.814, + "step": 4030 + }, + { + "epoch": 0.08209228515625, + "grad_norm": 12.70767879486084, + "learning_rate": 9.990109334318528e-06, + "loss": 5.6325, + "step": 4035 + }, + { + "epoch": 0.08219401041666667, + "grad_norm": 17.445505142211914, + "learning_rate": 9.990084191556256e-06, + "loss": 5.6923, + "step": 4040 + }, + { + "epoch": 0.08229573567708333, + "grad_norm": 15.662973403930664, + "learning_rate": 9.990059016908931e-06, + "loss": 5.378, + "step": 4045 + }, + { + "epoch": 0.0823974609375, + "grad_norm": 16.54880142211914, + "learning_rate": 9.990033810376712e-06, + "loss": 5.4813, + "step": 4050 + }, + { + "epoch": 0.08249918619791667, + "grad_norm": 19.173009872436523, + "learning_rate": 9.990008571959761e-06, + "loss": 5.5757, + "step": 4055 + }, + { + "epoch": 0.08260091145833333, + "grad_norm": 21.3898868560791, + "learning_rate": 9.98998330165824e-06, + "loss": 5.9583, + "step": 4060 + }, + { + "epoch": 0.08270263671875, + "grad_norm": 16.474098205566406, + "learning_rate": 9.989957999472311e-06, + "loss": 5.5192, + "step": 4065 + }, + { + "epoch": 0.08280436197916667, + "grad_norm": 17.00994873046875, + "learning_rate": 9.98993266540213e-06, + "loss": 5.4783, + "step": 4070 + }, + { + "epoch": 0.08290608723958333, + "grad_norm": 17.459482192993164, + "learning_rate": 9.989907299447868e-06, + "loss": 5.6486, + "step": 4075 + }, + { + "epoch": 0.0830078125, + "grad_norm": 16.782203674316406, + "learning_rate": 9.989881901609681e-06, + "loss": 5.5303, + "step": 4080 + }, + { + "epoch": 0.08310953776041667, + "grad_norm": 15.367051124572754, + "learning_rate": 9.98985647188773e-06, + "loss": 5.4079, + "step": 4085 + }, + { + "epoch": 0.08321126302083333, + "grad_norm": 19.375995635986328, + "learning_rate": 9.989831010282184e-06, + "loss": 5.5241, + "step": 4090 + }, + { + "epoch": 0.08331298828125, + "grad_norm": 16.91722869873047, + "learning_rate": 9.989805516793201e-06, + "loss": 5.4528, + "step": 4095 + }, + { + "epoch": 0.08341471354166667, + "grad_norm": 25.450180053710938, + "learning_rate": 9.989779991420943e-06, + "loss": 5.2963, + "step": 4100 + }, + { + "epoch": 0.08351643880208333, + "grad_norm": 15.937856674194336, + "learning_rate": 9.989754434165575e-06, + "loss": 5.5915, + "step": 4105 + }, + { + "epoch": 0.0836181640625, + "grad_norm": 14.108912467956543, + "learning_rate": 9.989728845027262e-06, + "loss": 5.7279, + "step": 4110 + }, + { + "epoch": 0.08371988932291667, + "grad_norm": 11.435669898986816, + "learning_rate": 9.989703224006164e-06, + "loss": 5.4996, + "step": 4115 + }, + { + "epoch": 0.08382161458333333, + "grad_norm": 16.037294387817383, + "learning_rate": 9.989677571102448e-06, + "loss": 5.4643, + "step": 4120 + }, + { + "epoch": 0.08392333984375, + "grad_norm": 15.875090599060059, + "learning_rate": 9.989651886316273e-06, + "loss": 5.601, + "step": 4125 + }, + { + "epoch": 0.08402506510416667, + "grad_norm": 16.08260726928711, + "learning_rate": 9.989626169647809e-06, + "loss": 5.3109, + "step": 4130 + }, + { + "epoch": 0.08412679036458333, + "grad_norm": 27.219707489013672, + "learning_rate": 9.989600421097217e-06, + "loss": 5.6938, + "step": 4135 + }, + { + "epoch": 0.084228515625, + "grad_norm": 14.732483863830566, + "learning_rate": 9.989574640664663e-06, + "loss": 5.5229, + "step": 4140 + }, + { + "epoch": 0.08433024088541667, + "grad_norm": 20.995079040527344, + "learning_rate": 9.98954882835031e-06, + "loss": 5.4937, + "step": 4145 + }, + { + "epoch": 0.08443196614583333, + "grad_norm": 14.413102149963379, + "learning_rate": 9.989522984154325e-06, + "loss": 5.612, + "step": 4150 + }, + { + "epoch": 0.08453369140625, + "grad_norm": 21.876312255859375, + "learning_rate": 9.98949710807687e-06, + "loss": 5.782, + "step": 4155 + }, + { + "epoch": 0.08463541666666667, + "grad_norm": 17.119291305541992, + "learning_rate": 9.989471200118114e-06, + "loss": 5.6788, + "step": 4160 + }, + { + "epoch": 0.08473714192708333, + "grad_norm": 14.423016548156738, + "learning_rate": 9.989445260278221e-06, + "loss": 5.684, + "step": 4165 + }, + { + "epoch": 0.0848388671875, + "grad_norm": 17.28114891052246, + "learning_rate": 9.989419288557355e-06, + "loss": 5.534, + "step": 4170 + }, + { + "epoch": 0.08494059244791667, + "grad_norm": 22.49473762512207, + "learning_rate": 9.989393284955685e-06, + "loss": 5.5899, + "step": 4175 + }, + { + "epoch": 0.08504231770833333, + "grad_norm": 16.005613327026367, + "learning_rate": 9.989367249473376e-06, + "loss": 5.8055, + "step": 4180 + }, + { + "epoch": 0.08514404296875, + "grad_norm": 17.706296920776367, + "learning_rate": 9.989341182110593e-06, + "loss": 5.5576, + "step": 4185 + }, + { + "epoch": 0.08524576822916667, + "grad_norm": 15.911280632019043, + "learning_rate": 9.989315082867505e-06, + "loss": 5.6746, + "step": 4190 + }, + { + "epoch": 0.08534749348958333, + "grad_norm": 13.976616859436035, + "learning_rate": 9.989288951744276e-06, + "loss": 5.3198, + "step": 4195 + }, + { + "epoch": 0.08544921875, + "grad_norm": 19.4809627532959, + "learning_rate": 9.989262788741077e-06, + "loss": 5.6014, + "step": 4200 + }, + { + "epoch": 0.08555094401041667, + "grad_norm": 14.794751167297363, + "learning_rate": 9.98923659385807e-06, + "loss": 5.6891, + "step": 4205 + }, + { + "epoch": 0.08565266927083333, + "grad_norm": 15.02981185913086, + "learning_rate": 9.989210367095426e-06, + "loss": 5.6292, + "step": 4210 + }, + { + "epoch": 0.08575439453125, + "grad_norm": 11.203535079956055, + "learning_rate": 9.98918410845331e-06, + "loss": 5.8829, + "step": 4215 + }, + { + "epoch": 0.08585611979166667, + "grad_norm": 23.59090232849121, + "learning_rate": 9.989157817931892e-06, + "loss": 5.8183, + "step": 4220 + }, + { + "epoch": 0.08595784505208333, + "grad_norm": 16.435232162475586, + "learning_rate": 9.98913149553134e-06, + "loss": 5.4424, + "step": 4225 + }, + { + "epoch": 0.0860595703125, + "grad_norm": 13.457387924194336, + "learning_rate": 9.989105141251822e-06, + "loss": 5.5827, + "step": 4230 + }, + { + "epoch": 0.08616129557291667, + "grad_norm": 17.07048988342285, + "learning_rate": 9.989078755093505e-06, + "loss": 5.8617, + "step": 4235 + }, + { + "epoch": 0.08626302083333333, + "grad_norm": 18.609529495239258, + "learning_rate": 9.989052337056559e-06, + "loss": 5.8647, + "step": 4240 + }, + { + "epoch": 0.08636474609375, + "grad_norm": 15.203387260437012, + "learning_rate": 9.989025887141152e-06, + "loss": 5.5742, + "step": 4245 + }, + { + "epoch": 0.08646647135416667, + "grad_norm": 15.268631935119629, + "learning_rate": 9.988999405347454e-06, + "loss": 6.0883, + "step": 4250 + }, + { + "epoch": 0.08656819661458333, + "grad_norm": 16.128259658813477, + "learning_rate": 9.988972891675633e-06, + "loss": 5.5525, + "step": 4255 + }, + { + "epoch": 0.086669921875, + "grad_norm": 16.603557586669922, + "learning_rate": 9.988946346125858e-06, + "loss": 5.3496, + "step": 4260 + }, + { + "epoch": 0.08677164713541667, + "grad_norm": 18.58730125427246, + "learning_rate": 9.9889197686983e-06, + "loss": 6.0185, + "step": 4265 + }, + { + "epoch": 0.08687337239583333, + "grad_norm": 11.06638240814209, + "learning_rate": 9.98889315939313e-06, + "loss": 5.4803, + "step": 4270 + }, + { + "epoch": 0.08697509765625, + "grad_norm": 15.531336784362793, + "learning_rate": 9.988866518210514e-06, + "loss": 5.5846, + "step": 4275 + }, + { + "epoch": 0.08707682291666667, + "grad_norm": 16.51064682006836, + "learning_rate": 9.988839845150627e-06, + "loss": 5.6458, + "step": 4280 + }, + { + "epoch": 0.08717854817708333, + "grad_norm": 16.119773864746094, + "learning_rate": 9.988813140213635e-06, + "loss": 5.7376, + "step": 4285 + }, + { + "epoch": 0.0872802734375, + "grad_norm": 20.063817977905273, + "learning_rate": 9.988786403399712e-06, + "loss": 5.5552, + "step": 4290 + }, + { + "epoch": 0.08738199869791667, + "grad_norm": 19.206296920776367, + "learning_rate": 9.988759634709026e-06, + "loss": 5.6688, + "step": 4295 + }, + { + "epoch": 0.08748372395833333, + "grad_norm": 17.996116638183594, + "learning_rate": 9.988732834141752e-06, + "loss": 5.5822, + "step": 4300 + }, + { + "epoch": 0.08758544921875, + "grad_norm": 21.585866928100586, + "learning_rate": 9.988706001698057e-06, + "loss": 5.6677, + "step": 4305 + }, + { + "epoch": 0.08768717447916667, + "grad_norm": 15.39219856262207, + "learning_rate": 9.988679137378114e-06, + "loss": 5.4451, + "step": 4310 + }, + { + "epoch": 0.08778889973958333, + "grad_norm": 19.62917137145996, + "learning_rate": 9.988652241182096e-06, + "loss": 5.6489, + "step": 4315 + }, + { + "epoch": 0.087890625, + "grad_norm": 17.949249267578125, + "learning_rate": 9.988625313110174e-06, + "loss": 5.4842, + "step": 4320 + }, + { + "epoch": 0.08799235026041667, + "grad_norm": 15.886926651000977, + "learning_rate": 9.98859835316252e-06, + "loss": 5.803, + "step": 4325 + }, + { + "epoch": 0.08809407552083333, + "grad_norm": 15.324554443359375, + "learning_rate": 9.988571361339305e-06, + "loss": 5.5662, + "step": 4330 + }, + { + "epoch": 0.08819580078125, + "grad_norm": 18.459707260131836, + "learning_rate": 9.988544337640702e-06, + "loss": 5.4364, + "step": 4335 + }, + { + "epoch": 0.08829752604166667, + "grad_norm": 19.754961013793945, + "learning_rate": 9.988517282066887e-06, + "loss": 5.7015, + "step": 4340 + }, + { + "epoch": 0.08839925130208333, + "grad_norm": 14.381133079528809, + "learning_rate": 9.988490194618027e-06, + "loss": 5.6711, + "step": 4345 + }, + { + "epoch": 0.0885009765625, + "grad_norm": 12.59147834777832, + "learning_rate": 9.9884630752943e-06, + "loss": 5.3345, + "step": 4350 + }, + { + "epoch": 0.08860270182291667, + "grad_norm": 17.52713966369629, + "learning_rate": 9.988435924095878e-06, + "loss": 5.9298, + "step": 4355 + }, + { + "epoch": 0.08870442708333333, + "grad_norm": 14.633036613464355, + "learning_rate": 9.988408741022933e-06, + "loss": 5.5459, + "step": 4360 + }, + { + "epoch": 0.08880615234375, + "grad_norm": 39.110679626464844, + "learning_rate": 9.988381526075638e-06, + "loss": 5.5684, + "step": 4365 + }, + { + "epoch": 0.08890787760416667, + "grad_norm": 15.96020793914795, + "learning_rate": 9.98835427925417e-06, + "loss": 5.3737, + "step": 4370 + }, + { + "epoch": 0.08900960286458333, + "grad_norm": 13.2327241897583, + "learning_rate": 9.988327000558704e-06, + "loss": 5.7338, + "step": 4375 + }, + { + "epoch": 0.089111328125, + "grad_norm": 13.990145683288574, + "learning_rate": 9.98829968998941e-06, + "loss": 5.3952, + "step": 4380 + }, + { + "epoch": 0.08921305338541667, + "grad_norm": 18.496973037719727, + "learning_rate": 9.988272347546464e-06, + "loss": 5.5952, + "step": 4385 + }, + { + "epoch": 0.08931477864583333, + "grad_norm": 19.073495864868164, + "learning_rate": 9.98824497323004e-06, + "loss": 5.513, + "step": 4390 + }, + { + "epoch": 0.08941650390625, + "grad_norm": 20.852787017822266, + "learning_rate": 9.988217567040316e-06, + "loss": 5.6851, + "step": 4395 + }, + { + "epoch": 0.08951822916666667, + "grad_norm": 19.306093215942383, + "learning_rate": 9.988190128977466e-06, + "loss": 5.6465, + "step": 4400 + }, + { + "epoch": 0.08961995442708333, + "grad_norm": 14.573668479919434, + "learning_rate": 9.988162659041665e-06, + "loss": 5.5788, + "step": 4405 + }, + { + "epoch": 0.0897216796875, + "grad_norm": 14.651238441467285, + "learning_rate": 9.988135157233085e-06, + "loss": 5.6611, + "step": 4410 + }, + { + "epoch": 0.08982340494791667, + "grad_norm": 16.140708923339844, + "learning_rate": 9.988107623551908e-06, + "loss": 5.6901, + "step": 4415 + }, + { + "epoch": 0.08992513020833333, + "grad_norm": 18.6047306060791, + "learning_rate": 9.988080057998305e-06, + "loss": 5.7033, + "step": 4420 + }, + { + "epoch": 0.09002685546875, + "grad_norm": 15.632603645324707, + "learning_rate": 9.988052460572455e-06, + "loss": 5.5006, + "step": 4425 + }, + { + "epoch": 0.09012858072916667, + "grad_norm": 12.951349258422852, + "learning_rate": 9.988024831274532e-06, + "loss": 5.4831, + "step": 4430 + }, + { + "epoch": 0.09023030598958333, + "grad_norm": 17.084712982177734, + "learning_rate": 9.987997170104712e-06, + "loss": 5.7204, + "step": 4435 + }, + { + "epoch": 0.09033203125, + "grad_norm": 18.050716400146484, + "learning_rate": 9.987969477063176e-06, + "loss": 5.4527, + "step": 4440 + }, + { + "epoch": 0.09043375651041667, + "grad_norm": 17.922048568725586, + "learning_rate": 9.987941752150099e-06, + "loss": 5.9095, + "step": 4445 + }, + { + "epoch": 0.09053548177083333, + "grad_norm": 15.51566219329834, + "learning_rate": 9.987913995365655e-06, + "loss": 5.5258, + "step": 4450 + }, + { + "epoch": 0.09063720703125, + "grad_norm": 15.309165000915527, + "learning_rate": 9.987886206710025e-06, + "loss": 5.6928, + "step": 4455 + }, + { + "epoch": 0.09073893229166667, + "grad_norm": 15.824016571044922, + "learning_rate": 9.987858386183386e-06, + "loss": 5.6871, + "step": 4460 + }, + { + "epoch": 0.09084065755208333, + "grad_norm": 16.459182739257812, + "learning_rate": 9.987830533785916e-06, + "loss": 5.575, + "step": 4465 + }, + { + "epoch": 0.0909423828125, + "grad_norm": 17.366064071655273, + "learning_rate": 9.987802649517791e-06, + "loss": 5.564, + "step": 4470 + }, + { + "epoch": 0.09104410807291667, + "grad_norm": 14.337353706359863, + "learning_rate": 9.987774733379191e-06, + "loss": 5.6003, + "step": 4475 + }, + { + "epoch": 0.09114583333333333, + "grad_norm": 14.602683067321777, + "learning_rate": 9.987746785370294e-06, + "loss": 5.4616, + "step": 4480 + }, + { + "epoch": 0.09124755859375, + "grad_norm": 14.538405418395996, + "learning_rate": 9.987718805491277e-06, + "loss": 5.3826, + "step": 4485 + }, + { + "epoch": 0.09134928385416667, + "grad_norm": 20.41831398010254, + "learning_rate": 9.987690793742322e-06, + "loss": 5.5559, + "step": 4490 + }, + { + "epoch": 0.09145100911458333, + "grad_norm": 14.623827934265137, + "learning_rate": 9.987662750123605e-06, + "loss": 5.5264, + "step": 4495 + }, + { + "epoch": 0.091552734375, + "grad_norm": 27.413021087646484, + "learning_rate": 9.987634674635307e-06, + "loss": 5.8396, + "step": 4500 + }, + { + "epoch": 0.09165445963541667, + "grad_norm": 12.505378723144531, + "learning_rate": 9.987606567277606e-06, + "loss": 5.6049, + "step": 4505 + }, + { + "epoch": 0.09175618489583333, + "grad_norm": 18.824413299560547, + "learning_rate": 9.987578428050682e-06, + "loss": 5.571, + "step": 4510 + }, + { + "epoch": 0.09185791015625, + "grad_norm": 16.035520553588867, + "learning_rate": 9.987550256954716e-06, + "loss": 5.5657, + "step": 4515 + }, + { + "epoch": 0.09195963541666667, + "grad_norm": 13.759669303894043, + "learning_rate": 9.987522053989886e-06, + "loss": 5.3972, + "step": 4520 + }, + { + "epoch": 0.09206136067708333, + "grad_norm": 17.823102951049805, + "learning_rate": 9.987493819156376e-06, + "loss": 5.6817, + "step": 4525 + }, + { + "epoch": 0.0921630859375, + "grad_norm": 16.50334358215332, + "learning_rate": 9.987465552454361e-06, + "loss": 5.5733, + "step": 4530 + }, + { + "epoch": 0.09226481119791667, + "grad_norm": 33.02798843383789, + "learning_rate": 9.987437253884026e-06, + "loss": 5.5257, + "step": 4535 + }, + { + "epoch": 0.09236653645833333, + "grad_norm": 18.806385040283203, + "learning_rate": 9.98740892344555e-06, + "loss": 5.651, + "step": 4540 + }, + { + "epoch": 0.09246826171875, + "grad_norm": 12.297256469726562, + "learning_rate": 9.987380561139113e-06, + "loss": 5.7514, + "step": 4545 + }, + { + "epoch": 0.09256998697916667, + "grad_norm": 13.449804306030273, + "learning_rate": 9.987352166964897e-06, + "loss": 5.6757, + "step": 4550 + }, + { + "epoch": 0.09267171223958333, + "grad_norm": 48.02235794067383, + "learning_rate": 9.987323740923085e-06, + "loss": 5.3488, + "step": 4555 + }, + { + "epoch": 0.0927734375, + "grad_norm": 18.478254318237305, + "learning_rate": 9.987295283013858e-06, + "loss": 5.5652, + "step": 4560 + }, + { + "epoch": 0.09287516276041667, + "grad_norm": 14.933732032775879, + "learning_rate": 9.987266793237397e-06, + "loss": 5.6504, + "step": 4565 + }, + { + "epoch": 0.09297688802083333, + "grad_norm": 22.94769859313965, + "learning_rate": 9.987238271593885e-06, + "loss": 5.4933, + "step": 4570 + }, + { + "epoch": 0.09307861328125, + "grad_norm": 15.550332069396973, + "learning_rate": 9.987209718083502e-06, + "loss": 5.6284, + "step": 4575 + }, + { + "epoch": 0.09318033854166667, + "grad_norm": 16.336265563964844, + "learning_rate": 9.987181132706432e-06, + "loss": 5.7625, + "step": 4580 + }, + { + "epoch": 0.09328206380208333, + "grad_norm": 17.94223403930664, + "learning_rate": 9.987152515462859e-06, + "loss": 5.6887, + "step": 4585 + }, + { + "epoch": 0.0933837890625, + "grad_norm": 13.792035102844238, + "learning_rate": 9.987123866352963e-06, + "loss": 5.6228, + "step": 4590 + }, + { + "epoch": 0.09348551432291667, + "grad_norm": 14.285360336303711, + "learning_rate": 9.98709518537693e-06, + "loss": 5.4741, + "step": 4595 + }, + { + "epoch": 0.09358723958333333, + "grad_norm": 23.043302536010742, + "learning_rate": 9.987066472534943e-06, + "loss": 5.5087, + "step": 4600 + }, + { + "epoch": 0.09368896484375, + "grad_norm": 18.572467803955078, + "learning_rate": 9.987037727827182e-06, + "loss": 5.4238, + "step": 4605 + }, + { + "epoch": 0.09379069010416667, + "grad_norm": 13.760415077209473, + "learning_rate": 9.987008951253834e-06, + "loss": 5.572, + "step": 4610 + }, + { + "epoch": 0.09389241536458333, + "grad_norm": 17.726572036743164, + "learning_rate": 9.98698014281508e-06, + "loss": 5.6954, + "step": 4615 + }, + { + "epoch": 0.093994140625, + "grad_norm": 17.967201232910156, + "learning_rate": 9.986951302511109e-06, + "loss": 5.5656, + "step": 4620 + }, + { + "epoch": 0.09409586588541667, + "grad_norm": 19.494056701660156, + "learning_rate": 9.986922430342102e-06, + "loss": 5.4791, + "step": 4625 + }, + { + "epoch": 0.09419759114583333, + "grad_norm": 20.119651794433594, + "learning_rate": 9.986893526308242e-06, + "loss": 5.6645, + "step": 4630 + }, + { + "epoch": 0.09429931640625, + "grad_norm": 18.45551109313965, + "learning_rate": 9.986864590409715e-06, + "loss": 5.4475, + "step": 4635 + }, + { + "epoch": 0.09440104166666667, + "grad_norm": 16.207752227783203, + "learning_rate": 9.98683562264671e-06, + "loss": 5.657, + "step": 4640 + }, + { + "epoch": 0.09450276692708333, + "grad_norm": 13.537372589111328, + "learning_rate": 9.986806623019404e-06, + "loss": 5.6111, + "step": 4645 + }, + { + "epoch": 0.0946044921875, + "grad_norm": 20.58209228515625, + "learning_rate": 9.986777591527989e-06, + "loss": 5.6383, + "step": 4650 + }, + { + "epoch": 0.09470621744791667, + "grad_norm": 15.710925102233887, + "learning_rate": 9.986748528172648e-06, + "loss": 5.4054, + "step": 4655 + }, + { + "epoch": 0.09480794270833333, + "grad_norm": 18.534027099609375, + "learning_rate": 9.986719432953567e-06, + "loss": 5.7132, + "step": 4660 + }, + { + "epoch": 0.09490966796875, + "grad_norm": 20.695640563964844, + "learning_rate": 9.98669030587093e-06, + "loss": 5.4812, + "step": 4665 + }, + { + "epoch": 0.09501139322916667, + "grad_norm": 13.798317909240723, + "learning_rate": 9.986661146924928e-06, + "loss": 5.4685, + "step": 4670 + }, + { + "epoch": 0.09511311848958333, + "grad_norm": 14.020719528198242, + "learning_rate": 9.986631956115743e-06, + "loss": 5.509, + "step": 4675 + }, + { + "epoch": 0.09521484375, + "grad_norm": 15.413867950439453, + "learning_rate": 9.986602733443563e-06, + "loss": 5.9285, + "step": 4680 + }, + { + "epoch": 0.09531656901041667, + "grad_norm": 15.703710556030273, + "learning_rate": 9.986573478908572e-06, + "loss": 5.3624, + "step": 4685 + }, + { + "epoch": 0.09541829427083333, + "grad_norm": 11.922365188598633, + "learning_rate": 9.986544192510962e-06, + "loss": 5.5836, + "step": 4690 + }, + { + "epoch": 0.09552001953125, + "grad_norm": 13.040605545043945, + "learning_rate": 9.986514874250916e-06, + "loss": 5.5269, + "step": 4695 + }, + { + "epoch": 0.09562174479166667, + "grad_norm": 17.121845245361328, + "learning_rate": 9.986485524128623e-06, + "loss": 5.3835, + "step": 4700 + }, + { + "epoch": 0.09572347005208333, + "grad_norm": 15.769513130187988, + "learning_rate": 9.986456142144273e-06, + "loss": 5.4306, + "step": 4705 + }, + { + "epoch": 0.0958251953125, + "grad_norm": 14.983192443847656, + "learning_rate": 9.986426728298048e-06, + "loss": 5.7815, + "step": 4710 + }, + { + "epoch": 0.09592692057291667, + "grad_norm": 16.56182098388672, + "learning_rate": 9.98639728259014e-06, + "loss": 5.4647, + "step": 4715 + }, + { + "epoch": 0.09602864583333333, + "grad_norm": 15.121134757995605, + "learning_rate": 9.986367805020738e-06, + "loss": 5.5303, + "step": 4720 + }, + { + "epoch": 0.09613037109375, + "grad_norm": 18.78289222717285, + "learning_rate": 9.986338295590028e-06, + "loss": 5.5379, + "step": 4725 + }, + { + "epoch": 0.09623209635416667, + "grad_norm": 16.58477020263672, + "learning_rate": 9.986308754298198e-06, + "loss": 5.6354, + "step": 4730 + }, + { + "epoch": 0.09633382161458333, + "grad_norm": 13.661042213439941, + "learning_rate": 9.986279181145438e-06, + "loss": 5.4239, + "step": 4735 + }, + { + "epoch": 0.096435546875, + "grad_norm": 17.93897247314453, + "learning_rate": 9.98624957613194e-06, + "loss": 5.4961, + "step": 4740 + }, + { + "epoch": 0.09653727213541667, + "grad_norm": 20.341426849365234, + "learning_rate": 9.986219939257886e-06, + "loss": 5.6577, + "step": 4745 + }, + { + "epoch": 0.09663899739583333, + "grad_norm": 18.03200912475586, + "learning_rate": 9.98619027052347e-06, + "loss": 5.5127, + "step": 4750 + }, + { + "epoch": 0.09674072265625, + "grad_norm": 13.341665267944336, + "learning_rate": 9.986160569928882e-06, + "loss": 5.631, + "step": 4755 + }, + { + "epoch": 0.09684244791666667, + "grad_norm": 20.069355010986328, + "learning_rate": 9.98613083747431e-06, + "loss": 5.6181, + "step": 4760 + }, + { + "epoch": 0.09694417317708333, + "grad_norm": 16.274816513061523, + "learning_rate": 9.986101073159947e-06, + "loss": 5.4209, + "step": 4765 + }, + { + "epoch": 0.0970458984375, + "grad_norm": 12.893757820129395, + "learning_rate": 9.986071276985977e-06, + "loss": 5.7227, + "step": 4770 + }, + { + "epoch": 0.09714762369791667, + "grad_norm": 13.710043907165527, + "learning_rate": 9.986041448952598e-06, + "loss": 5.8448, + "step": 4775 + }, + { + "epoch": 0.09724934895833333, + "grad_norm": 18.309247970581055, + "learning_rate": 9.986011589059996e-06, + "loss": 5.4901, + "step": 4780 + }, + { + "epoch": 0.09735107421875, + "grad_norm": 16.957304000854492, + "learning_rate": 9.985981697308363e-06, + "loss": 5.5553, + "step": 4785 + }, + { + "epoch": 0.09745279947916667, + "grad_norm": 14.586840629577637, + "learning_rate": 9.985951773697888e-06, + "loss": 5.7519, + "step": 4790 + }, + { + "epoch": 0.09755452473958333, + "grad_norm": 14.635825157165527, + "learning_rate": 9.985921818228765e-06, + "loss": 5.6066, + "step": 4795 + }, + { + "epoch": 0.09765625, + "grad_norm": 19.3857479095459, + "learning_rate": 9.985891830901184e-06, + "loss": 5.7319, + "step": 4800 + }, + { + "epoch": 0.09775797526041667, + "grad_norm": 15.582842826843262, + "learning_rate": 9.985861811715338e-06, + "loss": 5.3971, + "step": 4805 + }, + { + "epoch": 0.09785970052083333, + "grad_norm": 14.99433708190918, + "learning_rate": 9.985831760671416e-06, + "loss": 5.795, + "step": 4810 + }, + { + "epoch": 0.09796142578125, + "grad_norm": 27.29306411743164, + "learning_rate": 9.985801677769613e-06, + "loss": 5.6553, + "step": 4815 + }, + { + "epoch": 0.09806315104166667, + "grad_norm": 12.136336326599121, + "learning_rate": 9.98577156301012e-06, + "loss": 5.4497, + "step": 4820 + }, + { + "epoch": 0.09816487630208333, + "grad_norm": 20.006664276123047, + "learning_rate": 9.985741416393129e-06, + "loss": 5.5919, + "step": 4825 + }, + { + "epoch": 0.0982666015625, + "grad_norm": 20.711597442626953, + "learning_rate": 9.985711237918834e-06, + "loss": 5.3484, + "step": 4830 + }, + { + "epoch": 0.09836832682291667, + "grad_norm": 14.656824111938477, + "learning_rate": 9.985681027587426e-06, + "loss": 5.5819, + "step": 4835 + }, + { + "epoch": 0.09847005208333333, + "grad_norm": 14.682312965393066, + "learning_rate": 9.985650785399098e-06, + "loss": 5.8691, + "step": 4840 + }, + { + "epoch": 0.09857177734375, + "grad_norm": 18.58519172668457, + "learning_rate": 9.985620511354046e-06, + "loss": 5.4622, + "step": 4845 + }, + { + "epoch": 0.09867350260416667, + "grad_norm": 16.377925872802734, + "learning_rate": 9.98559020545246e-06, + "loss": 5.5429, + "step": 4850 + }, + { + "epoch": 0.09877522786458333, + "grad_norm": 12.732099533081055, + "learning_rate": 9.985559867694535e-06, + "loss": 5.7873, + "step": 4855 + }, + { + "epoch": 0.098876953125, + "grad_norm": 14.73703384399414, + "learning_rate": 9.985529498080466e-06, + "loss": 5.5673, + "step": 4860 + }, + { + "epoch": 0.09897867838541667, + "grad_norm": 13.428210258483887, + "learning_rate": 9.985499096610447e-06, + "loss": 5.6975, + "step": 4865 + }, + { + "epoch": 0.09908040364583333, + "grad_norm": 16.933019638061523, + "learning_rate": 9.985468663284669e-06, + "loss": 5.5303, + "step": 4870 + }, + { + "epoch": 0.09918212890625, + "grad_norm": 24.888715744018555, + "learning_rate": 9.98543819810333e-06, + "loss": 5.639, + "step": 4875 + }, + { + "epoch": 0.09928385416666667, + "grad_norm": 17.966646194458008, + "learning_rate": 9.985407701066624e-06, + "loss": 5.3507, + "step": 4880 + }, + { + "epoch": 0.09938557942708333, + "grad_norm": 15.912296295166016, + "learning_rate": 9.985377172174745e-06, + "loss": 5.535, + "step": 4885 + }, + { + "epoch": 0.0994873046875, + "grad_norm": 15.622234344482422, + "learning_rate": 9.98534661142789e-06, + "loss": 5.5579, + "step": 4890 + }, + { + "epoch": 0.09958902994791667, + "grad_norm": 16.39847755432129, + "learning_rate": 9.98531601882625e-06, + "loss": 5.4311, + "step": 4895 + }, + { + "epoch": 0.09969075520833333, + "grad_norm": 14.173028945922852, + "learning_rate": 9.985285394370026e-06, + "loss": 5.6062, + "step": 4900 + }, + { + "epoch": 0.09979248046875, + "grad_norm": 18.390710830688477, + "learning_rate": 9.985254738059409e-06, + "loss": 5.6336, + "step": 4905 + }, + { + "epoch": 0.09989420572916667, + "grad_norm": 16.143016815185547, + "learning_rate": 9.985224049894597e-06, + "loss": 5.3807, + "step": 4910 + }, + { + "epoch": 0.09999593098958333, + "grad_norm": 16.890270233154297, + "learning_rate": 9.985193329875787e-06, + "loss": 5.6561, + "step": 4915 + }, + { + "epoch": 0.10009765625, + "grad_norm": 17.520221710205078, + "learning_rate": 9.985162578003172e-06, + "loss": 5.6899, + "step": 4920 + }, + { + "epoch": 0.10019938151041667, + "grad_norm": 15.099200248718262, + "learning_rate": 9.985131794276952e-06, + "loss": 5.5169, + "step": 4925 + }, + { + "epoch": 0.10030110677083333, + "grad_norm": 14.284749031066895, + "learning_rate": 9.985100978697323e-06, + "loss": 5.5975, + "step": 4930 + }, + { + "epoch": 0.10040283203125, + "grad_norm": 15.264209747314453, + "learning_rate": 9.985070131264479e-06, + "loss": 5.711, + "step": 4935 + }, + { + "epoch": 0.10050455729166667, + "grad_norm": 15.490504264831543, + "learning_rate": 9.98503925197862e-06, + "loss": 5.8259, + "step": 4940 + }, + { + "epoch": 0.10060628255208333, + "grad_norm": 18.0833683013916, + "learning_rate": 9.985008340839943e-06, + "loss": 5.5317, + "step": 4945 + }, + { + "epoch": 0.1007080078125, + "grad_norm": 14.458824157714844, + "learning_rate": 9.984977397848647e-06, + "loss": 5.6655, + "step": 4950 + }, + { + "epoch": 0.10080973307291667, + "grad_norm": 16.603322982788086, + "learning_rate": 9.984946423004926e-06, + "loss": 5.7533, + "step": 4955 + }, + { + "epoch": 0.10091145833333333, + "grad_norm": 17.101688385009766, + "learning_rate": 9.98491541630898e-06, + "loss": 5.4953, + "step": 4960 + }, + { + "epoch": 0.10101318359375, + "grad_norm": 16.907854080200195, + "learning_rate": 9.984884377761007e-06, + "loss": 5.6751, + "step": 4965 + }, + { + "epoch": 0.10111490885416667, + "grad_norm": 13.8328275680542, + "learning_rate": 9.984853307361205e-06, + "loss": 5.7318, + "step": 4970 + }, + { + "epoch": 0.10121663411458333, + "grad_norm": 14.737936019897461, + "learning_rate": 9.984822205109773e-06, + "loss": 5.7519, + "step": 4975 + }, + { + "epoch": 0.101318359375, + "grad_norm": 19.43643569946289, + "learning_rate": 9.98479107100691e-06, + "loss": 5.5044, + "step": 4980 + }, + { + "epoch": 0.10142008463541667, + "grad_norm": 17.130868911743164, + "learning_rate": 9.984759905052814e-06, + "loss": 5.5952, + "step": 4985 + }, + { + "epoch": 0.10152180989583333, + "grad_norm": 23.274538040161133, + "learning_rate": 9.984728707247684e-06, + "loss": 5.4231, + "step": 4990 + }, + { + "epoch": 0.10162353515625, + "grad_norm": 17.013160705566406, + "learning_rate": 9.98469747759172e-06, + "loss": 5.8514, + "step": 4995 + }, + { + "epoch": 0.10172526041666667, + "grad_norm": 16.906240463256836, + "learning_rate": 9.984666216085123e-06, + "loss": 5.7046, + "step": 5000 + }, + { + "epoch": 0.10182698567708333, + "grad_norm": 15.425639152526855, + "learning_rate": 9.98463492272809e-06, + "loss": 5.5468, + "step": 5005 + }, + { + "epoch": 0.1019287109375, + "grad_norm": 16.494733810424805, + "learning_rate": 9.984603597520821e-06, + "loss": 5.6465, + "step": 5010 + }, + { + "epoch": 0.10203043619791667, + "grad_norm": 18.390609741210938, + "learning_rate": 9.98457224046352e-06, + "loss": 5.5785, + "step": 5015 + }, + { + "epoch": 0.10213216145833333, + "grad_norm": 30.75277328491211, + "learning_rate": 9.984540851556383e-06, + "loss": 5.4857, + "step": 5020 + }, + { + "epoch": 0.10223388671875, + "grad_norm": 17.045501708984375, + "learning_rate": 9.984509430799614e-06, + "loss": 5.7026, + "step": 5025 + }, + { + "epoch": 0.10233561197916667, + "grad_norm": 14.499194145202637, + "learning_rate": 9.98447797819341e-06, + "loss": 5.4672, + "step": 5030 + }, + { + "epoch": 0.10243733723958333, + "grad_norm": 14.624445915222168, + "learning_rate": 9.984446493737975e-06, + "loss": 5.428, + "step": 5035 + }, + { + "epoch": 0.1025390625, + "grad_norm": 16.897308349609375, + "learning_rate": 9.984414977433509e-06, + "loss": 5.3458, + "step": 5040 + }, + { + "epoch": 0.10264078776041667, + "grad_norm": 24.25098991394043, + "learning_rate": 9.984383429280213e-06, + "loss": 5.6674, + "step": 5045 + }, + { + "epoch": 0.10274251302083333, + "grad_norm": 18.828588485717773, + "learning_rate": 9.984351849278289e-06, + "loss": 5.4146, + "step": 5050 + }, + { + "epoch": 0.10284423828125, + "grad_norm": 14.8408842086792, + "learning_rate": 9.98432023742794e-06, + "loss": 5.4941, + "step": 5055 + }, + { + "epoch": 0.10294596354166667, + "grad_norm": 16.05213165283203, + "learning_rate": 9.984288593729366e-06, + "loss": 5.5963, + "step": 5060 + }, + { + "epoch": 0.10304768880208333, + "grad_norm": 12.718965530395508, + "learning_rate": 9.984256918182771e-06, + "loss": 5.7074, + "step": 5065 + }, + { + "epoch": 0.1031494140625, + "grad_norm": 20.80908966064453, + "learning_rate": 9.984225210788357e-06, + "loss": 5.3994, + "step": 5070 + }, + { + "epoch": 0.10325113932291667, + "grad_norm": 22.530677795410156, + "learning_rate": 9.984193471546324e-06, + "loss": 5.562, + "step": 5075 + }, + { + "epoch": 0.10335286458333333, + "grad_norm": 17.209287643432617, + "learning_rate": 9.984161700456877e-06, + "loss": 5.7063, + "step": 5080 + }, + { + "epoch": 0.10345458984375, + "grad_norm": 14.544754981994629, + "learning_rate": 9.98412989752022e-06, + "loss": 5.523, + "step": 5085 + }, + { + "epoch": 0.10355631510416667, + "grad_norm": 13.476971626281738, + "learning_rate": 9.984098062736555e-06, + "loss": 5.5676, + "step": 5090 + }, + { + "epoch": 0.10365804036458333, + "grad_norm": 22.372255325317383, + "learning_rate": 9.984066196106086e-06, + "loss": 5.629, + "step": 5095 + }, + { + "epoch": 0.103759765625, + "grad_norm": 20.16147232055664, + "learning_rate": 9.984034297629017e-06, + "loss": 5.7272, + "step": 5100 + }, + { + "epoch": 0.10386149088541667, + "grad_norm": 14.540892601013184, + "learning_rate": 9.98400236730555e-06, + "loss": 5.6942, + "step": 5105 + }, + { + "epoch": 0.10396321614583333, + "grad_norm": 35.084327697753906, + "learning_rate": 9.98397040513589e-06, + "loss": 5.7143, + "step": 5110 + }, + { + "epoch": 0.10406494140625, + "grad_norm": 22.663501739501953, + "learning_rate": 9.98393841112024e-06, + "loss": 5.4507, + "step": 5115 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 42.657814025878906, + "learning_rate": 9.983906385258808e-06, + "loss": 5.5878, + "step": 5120 + }, + { + "epoch": 0.10426839192708333, + "grad_norm": 28.3552303314209, + "learning_rate": 9.983874327551795e-06, + "loss": 5.4346, + "step": 5125 + }, + { + "epoch": 0.1043701171875, + "grad_norm": 13.148078918457031, + "learning_rate": 9.983842237999406e-06, + "loss": 5.6513, + "step": 5130 + }, + { + "epoch": 0.10447184244791667, + "grad_norm": 13.102762222290039, + "learning_rate": 9.983810116601849e-06, + "loss": 5.3241, + "step": 5135 + }, + { + "epoch": 0.10457356770833333, + "grad_norm": 13.465314865112305, + "learning_rate": 9.983777963359327e-06, + "loss": 5.6334, + "step": 5140 + }, + { + "epoch": 0.10467529296875, + "grad_norm": 12.45387077331543, + "learning_rate": 9.983745778272045e-06, + "loss": 5.8314, + "step": 5145 + }, + { + "epoch": 0.10477701822916667, + "grad_norm": 13.725682258605957, + "learning_rate": 9.983713561340211e-06, + "loss": 5.6846, + "step": 5150 + }, + { + "epoch": 0.10487874348958333, + "grad_norm": 20.45978546142578, + "learning_rate": 9.983681312564028e-06, + "loss": 5.3016, + "step": 5155 + }, + { + "epoch": 0.10498046875, + "grad_norm": 19.570064544677734, + "learning_rate": 9.983649031943704e-06, + "loss": 5.6563, + "step": 5160 + }, + { + "epoch": 0.10508219401041667, + "grad_norm": 18.400236129760742, + "learning_rate": 9.983616719479446e-06, + "loss": 5.6753, + "step": 5165 + }, + { + "epoch": 0.10518391927083333, + "grad_norm": 18.477479934692383, + "learning_rate": 9.983584375171457e-06, + "loss": 5.3222, + "step": 5170 + }, + { + "epoch": 0.10528564453125, + "grad_norm": 17.753345489501953, + "learning_rate": 9.983551999019946e-06, + "loss": 5.7206, + "step": 5175 + }, + { + "epoch": 0.10538736979166667, + "grad_norm": 12.851256370544434, + "learning_rate": 9.983519591025122e-06, + "loss": 5.6309, + "step": 5180 + }, + { + "epoch": 0.10548909505208333, + "grad_norm": 16.627155303955078, + "learning_rate": 9.983487151187188e-06, + "loss": 5.5506, + "step": 5185 + }, + { + "epoch": 0.1055908203125, + "grad_norm": 14.317872047424316, + "learning_rate": 9.983454679506352e-06, + "loss": 5.6113, + "step": 5190 + }, + { + "epoch": 0.10569254557291667, + "grad_norm": 15.059728622436523, + "learning_rate": 9.983422175982825e-06, + "loss": 5.8038, + "step": 5195 + }, + { + "epoch": 0.10579427083333333, + "grad_norm": 19.34741973876953, + "learning_rate": 9.98338964061681e-06, + "loss": 5.5664, + "step": 5200 + }, + { + "epoch": 0.10589599609375, + "grad_norm": 15.922147750854492, + "learning_rate": 9.98335707340852e-06, + "loss": 5.2847, + "step": 5205 + }, + { + "epoch": 0.10599772135416667, + "grad_norm": 16.95050811767578, + "learning_rate": 9.983324474358156e-06, + "loss": 5.6474, + "step": 5210 + }, + { + "epoch": 0.10609944661458333, + "grad_norm": 16.483888626098633, + "learning_rate": 9.983291843465934e-06, + "loss": 5.6273, + "step": 5215 + }, + { + "epoch": 0.106201171875, + "grad_norm": 17.03272819519043, + "learning_rate": 9.98325918073206e-06, + "loss": 5.3845, + "step": 5220 + }, + { + "epoch": 0.10630289713541667, + "grad_norm": 14.414079666137695, + "learning_rate": 9.98322648615674e-06, + "loss": 5.3051, + "step": 5225 + }, + { + "epoch": 0.10640462239583333, + "grad_norm": 20.686729431152344, + "learning_rate": 9.983193759740182e-06, + "loss": 5.7389, + "step": 5230 + }, + { + "epoch": 0.10650634765625, + "grad_norm": 18.611289978027344, + "learning_rate": 9.9831610014826e-06, + "loss": 5.5451, + "step": 5235 + }, + { + "epoch": 0.10660807291666667, + "grad_norm": 13.961222648620605, + "learning_rate": 9.983128211384202e-06, + "loss": 5.3854, + "step": 5240 + }, + { + "epoch": 0.10670979817708333, + "grad_norm": 15.284721374511719, + "learning_rate": 9.983095389445196e-06, + "loss": 5.599, + "step": 5245 + }, + { + "epoch": 0.1068115234375, + "grad_norm": 16.997055053710938, + "learning_rate": 9.983062535665793e-06, + "loss": 5.3834, + "step": 5250 + }, + { + "epoch": 0.10691324869791667, + "grad_norm": 16.03573989868164, + "learning_rate": 9.983029650046201e-06, + "loss": 5.5261, + "step": 5255 + }, + { + "epoch": 0.10701497395833333, + "grad_norm": 12.70956039428711, + "learning_rate": 9.982996732586633e-06, + "loss": 5.5749, + "step": 5260 + }, + { + "epoch": 0.10711669921875, + "grad_norm": 14.123883247375488, + "learning_rate": 9.982963783287294e-06, + "loss": 5.7852, + "step": 5265 + }, + { + "epoch": 0.10721842447916667, + "grad_norm": 32.650634765625, + "learning_rate": 9.982930802148401e-06, + "loss": 6.0561, + "step": 5270 + }, + { + "epoch": 0.10732014973958333, + "grad_norm": 17.711483001708984, + "learning_rate": 9.982897789170162e-06, + "loss": 5.7864, + "step": 5275 + }, + { + "epoch": 0.107421875, + "grad_norm": 12.571752548217773, + "learning_rate": 9.982864744352786e-06, + "loss": 5.498, + "step": 5280 + }, + { + "epoch": 0.10752360026041667, + "grad_norm": 17.714900970458984, + "learning_rate": 9.982831667696489e-06, + "loss": 5.6305, + "step": 5285 + }, + { + "epoch": 0.10762532552083333, + "grad_norm": 13.442083358764648, + "learning_rate": 9.982798559201479e-06, + "loss": 5.3831, + "step": 5290 + }, + { + "epoch": 0.10772705078125, + "grad_norm": 17.27506446838379, + "learning_rate": 9.982765418867965e-06, + "loss": 5.6097, + "step": 5295 + }, + { + "epoch": 0.10782877604166667, + "grad_norm": 17.112293243408203, + "learning_rate": 9.982732246696165e-06, + "loss": 5.7128, + "step": 5300 + }, + { + "epoch": 0.10793050130208333, + "grad_norm": 14.253948211669922, + "learning_rate": 9.982699042686285e-06, + "loss": 5.6802, + "step": 5305 + }, + { + "epoch": 0.1080322265625, + "grad_norm": 13.404081344604492, + "learning_rate": 9.982665806838543e-06, + "loss": 5.7312, + "step": 5310 + }, + { + "epoch": 0.10813395182291667, + "grad_norm": 14.009191513061523, + "learning_rate": 9.982632539153145e-06, + "loss": 5.6154, + "step": 5315 + }, + { + "epoch": 0.10823567708333333, + "grad_norm": 14.888713836669922, + "learning_rate": 9.982599239630309e-06, + "loss": 5.6425, + "step": 5320 + }, + { + "epoch": 0.10833740234375, + "grad_norm": 18.349557876586914, + "learning_rate": 9.982565908270245e-06, + "loss": 5.7542, + "step": 5325 + }, + { + "epoch": 0.10843912760416667, + "grad_norm": 15.508920669555664, + "learning_rate": 9.982532545073166e-06, + "loss": 5.7959, + "step": 5330 + }, + { + "epoch": 0.10854085286458333, + "grad_norm": 14.643509864807129, + "learning_rate": 9.982499150039286e-06, + "loss": 5.6418, + "step": 5335 + }, + { + "epoch": 0.108642578125, + "grad_norm": 15.334203720092773, + "learning_rate": 9.982465723168818e-06, + "loss": 5.3136, + "step": 5340 + }, + { + "epoch": 0.10874430338541667, + "grad_norm": 27.274593353271484, + "learning_rate": 9.982432264461976e-06, + "loss": 5.6257, + "step": 5345 + }, + { + "epoch": 0.10884602864583333, + "grad_norm": 15.818146705627441, + "learning_rate": 9.982398773918973e-06, + "loss": 5.4616, + "step": 5350 + }, + { + "epoch": 0.10894775390625, + "grad_norm": 17.546504974365234, + "learning_rate": 9.982365251540024e-06, + "loss": 5.7141, + "step": 5355 + }, + { + "epoch": 0.10904947916666667, + "grad_norm": 14.040081977844238, + "learning_rate": 9.982331697325343e-06, + "loss": 5.4178, + "step": 5360 + }, + { + "epoch": 0.10915120442708333, + "grad_norm": 16.72359275817871, + "learning_rate": 9.982298111275142e-06, + "loss": 5.3793, + "step": 5365 + }, + { + "epoch": 0.1092529296875, + "grad_norm": 23.02867317199707, + "learning_rate": 9.98226449338964e-06, + "loss": 5.3517, + "step": 5370 + }, + { + "epoch": 0.10935465494791667, + "grad_norm": 13.486559867858887, + "learning_rate": 9.982230843669047e-06, + "loss": 5.585, + "step": 5375 + }, + { + "epoch": 0.10945638020833333, + "grad_norm": 12.738748550415039, + "learning_rate": 9.982197162113582e-06, + "loss": 5.5543, + "step": 5380 + }, + { + "epoch": 0.10955810546875, + "grad_norm": 14.522045135498047, + "learning_rate": 9.98216344872346e-06, + "loss": 5.6234, + "step": 5385 + }, + { + "epoch": 0.10965983072916667, + "grad_norm": 29.12970733642578, + "learning_rate": 9.982129703498893e-06, + "loss": 5.3164, + "step": 5390 + }, + { + "epoch": 0.10976155598958333, + "grad_norm": 16.687612533569336, + "learning_rate": 9.982095926440101e-06, + "loss": 5.256, + "step": 5395 + }, + { + "epoch": 0.10986328125, + "grad_norm": 15.880518913269043, + "learning_rate": 9.982062117547295e-06, + "loss": 5.9739, + "step": 5400 + }, + { + "epoch": 0.10996500651041667, + "grad_norm": 13.569526672363281, + "learning_rate": 9.982028276820694e-06, + "loss": 5.5322, + "step": 5405 + }, + { + "epoch": 0.11006673177083333, + "grad_norm": 12.863286972045898, + "learning_rate": 9.981994404260515e-06, + "loss": 5.5881, + "step": 5410 + }, + { + "epoch": 0.11016845703125, + "grad_norm": 11.893025398254395, + "learning_rate": 9.981960499866974e-06, + "loss": 5.3458, + "step": 5415 + }, + { + "epoch": 0.11027018229166667, + "grad_norm": 16.14165687561035, + "learning_rate": 9.981926563640285e-06, + "loss": 5.4381, + "step": 5420 + }, + { + "epoch": 0.11037190755208333, + "grad_norm": 13.66281509399414, + "learning_rate": 9.981892595580666e-06, + "loss": 5.6439, + "step": 5425 + }, + { + "epoch": 0.1104736328125, + "grad_norm": 19.624658584594727, + "learning_rate": 9.981858595688338e-06, + "loss": 5.5746, + "step": 5430 + }, + { + "epoch": 0.11057535807291667, + "grad_norm": 26.155256271362305, + "learning_rate": 9.981824563963512e-06, + "loss": 5.5834, + "step": 5435 + }, + { + "epoch": 0.11067708333333333, + "grad_norm": 18.873008728027344, + "learning_rate": 9.98179050040641e-06, + "loss": 5.4881, + "step": 5440 + }, + { + "epoch": 0.11077880859375, + "grad_norm": 18.955059051513672, + "learning_rate": 9.981756405017248e-06, + "loss": 5.6776, + "step": 5445 + }, + { + "epoch": 0.11088053385416667, + "grad_norm": 13.982261657714844, + "learning_rate": 9.981722277796244e-06, + "loss": 5.4627, + "step": 5450 + }, + { + "epoch": 0.11098225911458333, + "grad_norm": 20.42593765258789, + "learning_rate": 9.981688118743616e-06, + "loss": 5.5873, + "step": 5455 + }, + { + "epoch": 0.111083984375, + "grad_norm": 17.626745223999023, + "learning_rate": 9.981653927859583e-06, + "loss": 5.3644, + "step": 5460 + }, + { + "epoch": 0.11118570963541667, + "grad_norm": 15.297684669494629, + "learning_rate": 9.98161970514436e-06, + "loss": 5.3376, + "step": 5465 + }, + { + "epoch": 0.11128743489583333, + "grad_norm": 30.35951042175293, + "learning_rate": 9.981585450598172e-06, + "loss": 5.7056, + "step": 5470 + }, + { + "epoch": 0.11138916015625, + "grad_norm": 16.599233627319336, + "learning_rate": 9.981551164221232e-06, + "loss": 5.6463, + "step": 5475 + }, + { + "epoch": 0.11149088541666667, + "grad_norm": 27.276199340820312, + "learning_rate": 9.981516846013763e-06, + "loss": 5.9055, + "step": 5480 + }, + { + "epoch": 0.11159261067708333, + "grad_norm": 17.46451759338379, + "learning_rate": 9.98148249597598e-06, + "loss": 5.4004, + "step": 5485 + }, + { + "epoch": 0.1116943359375, + "grad_norm": 11.558103561401367, + "learning_rate": 9.981448114108108e-06, + "loss": 5.6121, + "step": 5490 + }, + { + "epoch": 0.11179606119791667, + "grad_norm": 18.474925994873047, + "learning_rate": 9.981413700410363e-06, + "loss": 5.4235, + "step": 5495 + }, + { + "epoch": 0.11189778645833333, + "grad_norm": 13.78626823425293, + "learning_rate": 9.981379254882966e-06, + "loss": 5.6297, + "step": 5500 + }, + { + "epoch": 0.11199951171875, + "grad_norm": 17.609577178955078, + "learning_rate": 9.981344777526136e-06, + "loss": 5.6495, + "step": 5505 + }, + { + "epoch": 0.11210123697916667, + "grad_norm": 27.403850555419922, + "learning_rate": 9.981310268340095e-06, + "loss": 5.5149, + "step": 5510 + }, + { + "epoch": 0.11220296223958333, + "grad_norm": 12.901259422302246, + "learning_rate": 9.981275727325063e-06, + "loss": 5.3391, + "step": 5515 + }, + { + "epoch": 0.1123046875, + "grad_norm": 20.786754608154297, + "learning_rate": 9.98124115448126e-06, + "loss": 5.5261, + "step": 5520 + }, + { + "epoch": 0.11240641276041667, + "grad_norm": 20.737239837646484, + "learning_rate": 9.981206549808906e-06, + "loss": 5.4281, + "step": 5525 + }, + { + "epoch": 0.11250813802083333, + "grad_norm": 18.038654327392578, + "learning_rate": 9.981171913308225e-06, + "loss": 5.62, + "step": 5530 + }, + { + "epoch": 0.11260986328125, + "grad_norm": 20.48394775390625, + "learning_rate": 9.981137244979436e-06, + "loss": 5.5444, + "step": 5535 + }, + { + "epoch": 0.11271158854166667, + "grad_norm": 16.361276626586914, + "learning_rate": 9.981102544822761e-06, + "loss": 5.6497, + "step": 5540 + }, + { + "epoch": 0.11281331380208333, + "grad_norm": 12.59543514251709, + "learning_rate": 9.981067812838423e-06, + "loss": 5.7151, + "step": 5545 + }, + { + "epoch": 0.1129150390625, + "grad_norm": 13.310791969299316, + "learning_rate": 9.981033049026643e-06, + "loss": 5.4049, + "step": 5550 + }, + { + "epoch": 0.11301676432291667, + "grad_norm": 13.676026344299316, + "learning_rate": 9.980998253387642e-06, + "loss": 5.713, + "step": 5555 + }, + { + "epoch": 0.11311848958333333, + "grad_norm": 12.783082008361816, + "learning_rate": 9.980963425921644e-06, + "loss": 5.4829, + "step": 5560 + }, + { + "epoch": 0.11322021484375, + "grad_norm": 15.877127647399902, + "learning_rate": 9.98092856662887e-06, + "loss": 5.528, + "step": 5565 + }, + { + "epoch": 0.11332194010416667, + "grad_norm": 15.454524993896484, + "learning_rate": 9.980893675509545e-06, + "loss": 5.3184, + "step": 5570 + }, + { + "epoch": 0.11342366536458333, + "grad_norm": 12.635504722595215, + "learning_rate": 9.98085875256389e-06, + "loss": 5.6344, + "step": 5575 + }, + { + "epoch": 0.113525390625, + "grad_norm": 20.6888484954834, + "learning_rate": 9.980823797792126e-06, + "loss": 5.6385, + "step": 5580 + }, + { + "epoch": 0.11362711588541667, + "grad_norm": 16.932600021362305, + "learning_rate": 9.980788811194482e-06, + "loss": 5.8879, + "step": 5585 + }, + { + "epoch": 0.11372884114583333, + "grad_norm": 12.13128662109375, + "learning_rate": 9.980753792771179e-06, + "loss": 5.5195, + "step": 5590 + }, + { + "epoch": 0.11383056640625, + "grad_norm": 15.190686225891113, + "learning_rate": 9.980718742522438e-06, + "loss": 5.4339, + "step": 5595 + }, + { + "epoch": 0.11393229166666667, + "grad_norm": 18.671998977661133, + "learning_rate": 9.980683660448487e-06, + "loss": 5.6249, + "step": 5600 + }, + { + "epoch": 0.11403401692708333, + "grad_norm": 13.792531967163086, + "learning_rate": 9.980648546549548e-06, + "loss": 5.578, + "step": 5605 + }, + { + "epoch": 0.1141357421875, + "grad_norm": 21.272212982177734, + "learning_rate": 9.980613400825846e-06, + "loss": 5.5973, + "step": 5610 + }, + { + "epoch": 0.11423746744791667, + "grad_norm": 20.26042366027832, + "learning_rate": 9.980578223277606e-06, + "loss": 5.3198, + "step": 5615 + }, + { + "epoch": 0.11433919270833333, + "grad_norm": 23.226755142211914, + "learning_rate": 9.98054301390505e-06, + "loss": 5.2429, + "step": 5620 + }, + { + "epoch": 0.11444091796875, + "grad_norm": 16.263988494873047, + "learning_rate": 9.980507772708408e-06, + "loss": 5.4928, + "step": 5625 + }, + { + "epoch": 0.11454264322916667, + "grad_norm": 15.788424491882324, + "learning_rate": 9.9804724996879e-06, + "loss": 5.4034, + "step": 5630 + }, + { + "epoch": 0.11464436848958333, + "grad_norm": 17.93870735168457, + "learning_rate": 9.980437194843758e-06, + "loss": 5.2457, + "step": 5635 + }, + { + "epoch": 0.11474609375, + "grad_norm": 17.51276397705078, + "learning_rate": 9.980401858176199e-06, + "loss": 5.5301, + "step": 5640 + }, + { + "epoch": 0.11484781901041667, + "grad_norm": 15.711578369140625, + "learning_rate": 9.980366489685453e-06, + "loss": 5.6242, + "step": 5645 + }, + { + "epoch": 0.11494954427083333, + "grad_norm": 14.171538352966309, + "learning_rate": 9.980331089371749e-06, + "loss": 5.6175, + "step": 5650 + }, + { + "epoch": 0.11505126953125, + "grad_norm": 13.30618667602539, + "learning_rate": 9.980295657235308e-06, + "loss": 5.3455, + "step": 5655 + }, + { + "epoch": 0.11515299479166667, + "grad_norm": 17.71959686279297, + "learning_rate": 9.980260193276361e-06, + "loss": 5.5914, + "step": 5660 + }, + { + "epoch": 0.11525472005208333, + "grad_norm": 12.202285766601562, + "learning_rate": 9.98022469749513e-06, + "loss": 5.6339, + "step": 5665 + }, + { + "epoch": 0.1153564453125, + "grad_norm": 18.3709716796875, + "learning_rate": 9.980189169891846e-06, + "loss": 5.8888, + "step": 5670 + }, + { + "epoch": 0.11545817057291667, + "grad_norm": 17.270036697387695, + "learning_rate": 9.980153610466731e-06, + "loss": 5.5912, + "step": 5675 + }, + { + "epoch": 0.11555989583333333, + "grad_norm": 17.246625900268555, + "learning_rate": 9.980118019220018e-06, + "loss": 5.7364, + "step": 5680 + }, + { + "epoch": 0.11566162109375, + "grad_norm": 13.393970489501953, + "learning_rate": 9.980082396151931e-06, + "loss": 5.4699, + "step": 5685 + }, + { + "epoch": 0.11576334635416667, + "grad_norm": 18.553340911865234, + "learning_rate": 9.980046741262698e-06, + "loss": 5.9507, + "step": 5690 + }, + { + "epoch": 0.11586507161458333, + "grad_norm": 18.79754066467285, + "learning_rate": 9.98001105455255e-06, + "loss": 5.9081, + "step": 5695 + }, + { + "epoch": 0.115966796875, + "grad_norm": 17.753856658935547, + "learning_rate": 9.97997533602171e-06, + "loss": 5.5454, + "step": 5700 + }, + { + "epoch": 0.11606852213541667, + "grad_norm": 13.595898628234863, + "learning_rate": 9.979939585670409e-06, + "loss": 5.3697, + "step": 5705 + }, + { + "epoch": 0.11617024739583333, + "grad_norm": 19.91731834411621, + "learning_rate": 9.979903803498873e-06, + "loss": 5.5539, + "step": 5710 + }, + { + "epoch": 0.11627197265625, + "grad_norm": 12.854681015014648, + "learning_rate": 9.979867989507335e-06, + "loss": 5.4193, + "step": 5715 + }, + { + "epoch": 0.11637369791666667, + "grad_norm": 18.604637145996094, + "learning_rate": 9.97983214369602e-06, + "loss": 5.2726, + "step": 5720 + }, + { + "epoch": 0.11647542317708333, + "grad_norm": 16.35883331298828, + "learning_rate": 9.979796266065158e-06, + "loss": 5.3226, + "step": 5725 + }, + { + "epoch": 0.1165771484375, + "grad_norm": 21.447500228881836, + "learning_rate": 9.979760356614978e-06, + "loss": 5.6718, + "step": 5730 + }, + { + "epoch": 0.11667887369791667, + "grad_norm": 16.50398826599121, + "learning_rate": 9.979724415345714e-06, + "loss": 5.7282, + "step": 5735 + }, + { + "epoch": 0.11678059895833333, + "grad_norm": 14.621660232543945, + "learning_rate": 9.979688442257588e-06, + "loss": 5.4124, + "step": 5740 + }, + { + "epoch": 0.11688232421875, + "grad_norm": 17.84079360961914, + "learning_rate": 9.979652437350835e-06, + "loss": 5.5893, + "step": 5745 + }, + { + "epoch": 0.11698404947916667, + "grad_norm": 12.448996543884277, + "learning_rate": 9.979616400625683e-06, + "loss": 5.5487, + "step": 5750 + }, + { + "epoch": 0.11708577473958333, + "grad_norm": 17.938230514526367, + "learning_rate": 9.979580332082364e-06, + "loss": 5.5989, + "step": 5755 + }, + { + "epoch": 0.1171875, + "grad_norm": 14.208882331848145, + "learning_rate": 9.979544231721106e-06, + "loss": 5.7008, + "step": 5760 + }, + { + "epoch": 0.11728922526041667, + "grad_norm": 24.107589721679688, + "learning_rate": 9.979508099542141e-06, + "loss": 5.6212, + "step": 5765 + }, + { + "epoch": 0.11739095052083333, + "grad_norm": 28.280742645263672, + "learning_rate": 9.979471935545702e-06, + "loss": 5.6114, + "step": 5770 + }, + { + "epoch": 0.11749267578125, + "grad_norm": 16.03752326965332, + "learning_rate": 9.979435739732017e-06, + "loss": 5.4752, + "step": 5775 + }, + { + "epoch": 0.11759440104166667, + "grad_norm": 15.053032875061035, + "learning_rate": 9.97939951210132e-06, + "loss": 5.4764, + "step": 5780 + }, + { + "epoch": 0.11769612630208333, + "grad_norm": 16.416614532470703, + "learning_rate": 9.979363252653838e-06, + "loss": 5.5367, + "step": 5785 + }, + { + "epoch": 0.1177978515625, + "grad_norm": 20.632585525512695, + "learning_rate": 9.979326961389806e-06, + "loss": 5.9272, + "step": 5790 + }, + { + "epoch": 0.11789957682291667, + "grad_norm": 11.978500366210938, + "learning_rate": 9.979290638309455e-06, + "loss": 5.4847, + "step": 5795 + }, + { + "epoch": 0.11800130208333333, + "grad_norm": 15.6491117477417, + "learning_rate": 9.97925428341302e-06, + "loss": 5.6334, + "step": 5800 + }, + { + "epoch": 0.11810302734375, + "grad_norm": 15.280145645141602, + "learning_rate": 9.979217896700727e-06, + "loss": 5.6078, + "step": 5805 + }, + { + "epoch": 0.11820475260416667, + "grad_norm": 15.029557228088379, + "learning_rate": 9.979181478172815e-06, + "loss": 5.5486, + "step": 5810 + }, + { + "epoch": 0.11830647786458333, + "grad_norm": 16.99915885925293, + "learning_rate": 9.979145027829514e-06, + "loss": 5.67, + "step": 5815 + }, + { + "epoch": 0.118408203125, + "grad_norm": 15.411309242248535, + "learning_rate": 9.979108545671057e-06, + "loss": 5.4065, + "step": 5820 + }, + { + "epoch": 0.11850992838541667, + "grad_norm": 16.334487915039062, + "learning_rate": 9.979072031697675e-06, + "loss": 5.5063, + "step": 5825 + }, + { + "epoch": 0.11861165364583333, + "grad_norm": 12.960384368896484, + "learning_rate": 9.979035485909604e-06, + "loss": 5.5651, + "step": 5830 + }, + { + "epoch": 0.11871337890625, + "grad_norm": 14.225271224975586, + "learning_rate": 9.978998908307079e-06, + "loss": 5.4054, + "step": 5835 + }, + { + "epoch": 0.11881510416666667, + "grad_norm": 13.309423446655273, + "learning_rate": 9.978962298890329e-06, + "loss": 5.5878, + "step": 5840 + }, + { + "epoch": 0.11891682942708333, + "grad_norm": 16.017745971679688, + "learning_rate": 9.978925657659591e-06, + "loss": 5.5904, + "step": 5845 + }, + { + "epoch": 0.1190185546875, + "grad_norm": 15.2990140914917, + "learning_rate": 9.978888984615099e-06, + "loss": 5.6579, + "step": 5850 + }, + { + "epoch": 0.11912027994791667, + "grad_norm": 13.111958503723145, + "learning_rate": 9.978852279757087e-06, + "loss": 5.6608, + "step": 5855 + }, + { + "epoch": 0.11922200520833333, + "grad_norm": 16.02227783203125, + "learning_rate": 9.978815543085789e-06, + "loss": 5.5507, + "step": 5860 + }, + { + "epoch": 0.11932373046875, + "grad_norm": 23.138832092285156, + "learning_rate": 9.97877877460144e-06, + "loss": 5.4465, + "step": 5865 + }, + { + "epoch": 0.11942545572916667, + "grad_norm": 16.26433563232422, + "learning_rate": 9.978741974304276e-06, + "loss": 5.4833, + "step": 5870 + }, + { + "epoch": 0.11952718098958333, + "grad_norm": 18.8841495513916, + "learning_rate": 9.97870514219453e-06, + "loss": 5.2432, + "step": 5875 + }, + { + "epoch": 0.11962890625, + "grad_norm": 17.812355041503906, + "learning_rate": 9.978668278272438e-06, + "loss": 5.4893, + "step": 5880 + }, + { + "epoch": 0.11973063151041667, + "grad_norm": 22.67943000793457, + "learning_rate": 9.978631382538239e-06, + "loss": 5.9232, + "step": 5885 + }, + { + "epoch": 0.11983235677083333, + "grad_norm": 16.197946548461914, + "learning_rate": 9.978594454992164e-06, + "loss": 5.3804, + "step": 5890 + }, + { + "epoch": 0.11993408203125, + "grad_norm": 10.924442291259766, + "learning_rate": 9.978557495634452e-06, + "loss": 5.4151, + "step": 5895 + }, + { + "epoch": 0.12003580729166667, + "grad_norm": 19.129146575927734, + "learning_rate": 9.978520504465336e-06, + "loss": 5.3317, + "step": 5900 + }, + { + "epoch": 0.12013753255208333, + "grad_norm": 12.470024108886719, + "learning_rate": 9.978483481485055e-06, + "loss": 5.2454, + "step": 5905 + }, + { + "epoch": 0.1202392578125, + "grad_norm": 15.5589599609375, + "learning_rate": 9.978446426693847e-06, + "loss": 5.5, + "step": 5910 + }, + { + "epoch": 0.12034098307291667, + "grad_norm": 15.118534088134766, + "learning_rate": 9.978409340091943e-06, + "loss": 5.4139, + "step": 5915 + }, + { + "epoch": 0.12044270833333333, + "grad_norm": 17.323434829711914, + "learning_rate": 9.978372221679587e-06, + "loss": 5.6603, + "step": 5920 + }, + { + "epoch": 0.12054443359375, + "grad_norm": 17.35328483581543, + "learning_rate": 9.978335071457012e-06, + "loss": 5.5833, + "step": 5925 + }, + { + "epoch": 0.12064615885416667, + "grad_norm": 13.464364051818848, + "learning_rate": 9.978297889424455e-06, + "loss": 5.5046, + "step": 5930 + }, + { + "epoch": 0.12074788411458333, + "grad_norm": 18.162336349487305, + "learning_rate": 9.978260675582157e-06, + "loss": 5.7278, + "step": 5935 + }, + { + "epoch": 0.120849609375, + "grad_norm": 17.647350311279297, + "learning_rate": 9.978223429930351e-06, + "loss": 5.5773, + "step": 5940 + }, + { + "epoch": 0.12095133463541667, + "grad_norm": 12.606416702270508, + "learning_rate": 9.978186152469278e-06, + "loss": 5.6196, + "step": 5945 + }, + { + "epoch": 0.12105305989583333, + "grad_norm": 16.864665985107422, + "learning_rate": 9.978148843199178e-06, + "loss": 5.4995, + "step": 5950 + }, + { + "epoch": 0.12115478515625, + "grad_norm": 18.872222900390625, + "learning_rate": 9.978111502120286e-06, + "loss": 5.6361, + "step": 5955 + }, + { + "epoch": 0.12125651041666667, + "grad_norm": 16.452091217041016, + "learning_rate": 9.978074129232842e-06, + "loss": 5.403, + "step": 5960 + }, + { + "epoch": 0.12135823567708333, + "grad_norm": 16.063762664794922, + "learning_rate": 9.978036724537084e-06, + "loss": 5.5712, + "step": 5965 + }, + { + "epoch": 0.1214599609375, + "grad_norm": 17.580394744873047, + "learning_rate": 9.977999288033251e-06, + "loss": 5.653, + "step": 5970 + }, + { + "epoch": 0.12156168619791667, + "grad_norm": 13.635100364685059, + "learning_rate": 9.977961819721585e-06, + "loss": 5.4945, + "step": 5975 + }, + { + "epoch": 0.12166341145833333, + "grad_norm": 14.05410385131836, + "learning_rate": 9.97792431960232e-06, + "loss": 5.72, + "step": 5980 + }, + { + "epoch": 0.12176513671875, + "grad_norm": 18.205442428588867, + "learning_rate": 9.9778867876757e-06, + "loss": 5.4005, + "step": 5985 + }, + { + "epoch": 0.12186686197916667, + "grad_norm": 17.374042510986328, + "learning_rate": 9.977849223941964e-06, + "loss": 5.5933, + "step": 5990 + }, + { + "epoch": 0.12196858723958333, + "grad_norm": 18.638153076171875, + "learning_rate": 9.977811628401353e-06, + "loss": 5.4612, + "step": 5995 + }, + { + "epoch": 0.1220703125, + "grad_norm": 16.750097274780273, + "learning_rate": 9.977774001054104e-06, + "loss": 5.5681, + "step": 6000 + }, + { + "epoch": 0.12217203776041667, + "grad_norm": 23.11859893798828, + "learning_rate": 9.977736341900461e-06, + "loss": 5.3345, + "step": 6005 + }, + { + "epoch": 0.12227376302083333, + "grad_norm": 16.79775619506836, + "learning_rate": 9.977698650940662e-06, + "loss": 5.486, + "step": 6010 + }, + { + "epoch": 0.12237548828125, + "grad_norm": 18.603025436401367, + "learning_rate": 9.977660928174948e-06, + "loss": 5.3864, + "step": 6015 + }, + { + "epoch": 0.12247721354166667, + "grad_norm": 16.795148849487305, + "learning_rate": 9.977623173603562e-06, + "loss": 5.8918, + "step": 6020 + }, + { + "epoch": 0.12257893880208333, + "grad_norm": 13.252036094665527, + "learning_rate": 9.977585387226743e-06, + "loss": 5.4374, + "step": 6025 + }, + { + "epoch": 0.1226806640625, + "grad_norm": 16.60871696472168, + "learning_rate": 9.977547569044732e-06, + "loss": 5.6824, + "step": 6030 + }, + { + "epoch": 0.12278238932291667, + "grad_norm": 17.588054656982422, + "learning_rate": 9.977509719057775e-06, + "loss": 5.871, + "step": 6035 + }, + { + "epoch": 0.12288411458333333, + "grad_norm": 17.528627395629883, + "learning_rate": 9.97747183726611e-06, + "loss": 5.3309, + "step": 6040 + }, + { + "epoch": 0.12298583984375, + "grad_norm": 18.430023193359375, + "learning_rate": 9.97743392366998e-06, + "loss": 5.4813, + "step": 6045 + }, + { + "epoch": 0.12308756510416667, + "grad_norm": 15.118419647216797, + "learning_rate": 9.977395978269628e-06, + "loss": 5.6413, + "step": 6050 + }, + { + "epoch": 0.12318929036458333, + "grad_norm": 21.591899871826172, + "learning_rate": 9.977358001065295e-06, + "loss": 5.3878, + "step": 6055 + }, + { + "epoch": 0.123291015625, + "grad_norm": 23.28841209411621, + "learning_rate": 9.977319992057225e-06, + "loss": 5.4288, + "step": 6060 + }, + { + "epoch": 0.12339274088541667, + "grad_norm": 14.502372741699219, + "learning_rate": 9.977281951245659e-06, + "loss": 5.7333, + "step": 6065 + }, + { + "epoch": 0.12349446614583333, + "grad_norm": 16.227210998535156, + "learning_rate": 9.977243878630843e-06, + "loss": 5.5538, + "step": 6070 + }, + { + "epoch": 0.12359619140625, + "grad_norm": 18.696762084960938, + "learning_rate": 9.977205774213017e-06, + "loss": 5.703, + "step": 6075 + }, + { + "epoch": 0.12369791666666667, + "grad_norm": 19.37496566772461, + "learning_rate": 9.977167637992427e-06, + "loss": 5.5382, + "step": 6080 + }, + { + "epoch": 0.12379964192708333, + "grad_norm": 17.448726654052734, + "learning_rate": 9.977129469969315e-06, + "loss": 5.2802, + "step": 6085 + }, + { + "epoch": 0.1239013671875, + "grad_norm": 18.429819107055664, + "learning_rate": 9.977091270143927e-06, + "loss": 5.2604, + "step": 6090 + }, + { + "epoch": 0.12400309244791667, + "grad_norm": 34.148006439208984, + "learning_rate": 9.977053038516504e-06, + "loss": 5.8844, + "step": 6095 + }, + { + "epoch": 0.12410481770833333, + "grad_norm": 16.312471389770508, + "learning_rate": 9.977014775087293e-06, + "loss": 5.5745, + "step": 6100 + }, + { + "epoch": 0.12420654296875, + "grad_norm": 11.530475616455078, + "learning_rate": 9.976976479856537e-06, + "loss": 5.3883, + "step": 6105 + }, + { + "epoch": 0.12430826822916667, + "grad_norm": 16.53692626953125, + "learning_rate": 9.976938152824481e-06, + "loss": 5.5494, + "step": 6110 + }, + { + "epoch": 0.12440999348958333, + "grad_norm": 18.363943099975586, + "learning_rate": 9.97689979399137e-06, + "loss": 5.6602, + "step": 6115 + }, + { + "epoch": 0.12451171875, + "grad_norm": 16.130619049072266, + "learning_rate": 9.976861403357451e-06, + "loss": 5.4992, + "step": 6120 + }, + { + "epoch": 0.12461344401041667, + "grad_norm": 19.75748062133789, + "learning_rate": 9.976822980922967e-06, + "loss": 5.5587, + "step": 6125 + }, + { + "epoch": 0.12471516927083333, + "grad_norm": 15.258757591247559, + "learning_rate": 9.976784526688162e-06, + "loss": 5.4155, + "step": 6130 + }, + { + "epoch": 0.12481689453125, + "grad_norm": 16.33151626586914, + "learning_rate": 9.976746040653285e-06, + "loss": 5.2317, + "step": 6135 + }, + { + "epoch": 0.12491861979166667, + "grad_norm": 11.931876182556152, + "learning_rate": 9.97670752281858e-06, + "loss": 5.2824, + "step": 6140 + }, + { + "epoch": 0.12502034505208334, + "grad_norm": 15.404830932617188, + "learning_rate": 9.976668973184296e-06, + "loss": 5.4234, + "step": 6145 + }, + { + "epoch": 0.1251220703125, + "grad_norm": 18.813953399658203, + "learning_rate": 9.976630391750674e-06, + "loss": 5.4639, + "step": 6150 + }, + { + "epoch": 0.12522379557291666, + "grad_norm": 18.069244384765625, + "learning_rate": 9.976591778517966e-06, + "loss": 5.2581, + "step": 6155 + }, + { + "epoch": 0.12532552083333334, + "grad_norm": 13.048432350158691, + "learning_rate": 9.976553133486416e-06, + "loss": 5.4528, + "step": 6160 + }, + { + "epoch": 0.12542724609375, + "grad_norm": 24.70384407043457, + "learning_rate": 9.97651445665627e-06, + "loss": 5.5101, + "step": 6165 + }, + { + "epoch": 0.12552897135416666, + "grad_norm": 13.577106475830078, + "learning_rate": 9.976475748027776e-06, + "loss": 5.5991, + "step": 6170 + }, + { + "epoch": 0.12563069661458334, + "grad_norm": 13.11683464050293, + "learning_rate": 9.976437007601183e-06, + "loss": 5.5235, + "step": 6175 + }, + { + "epoch": 0.125732421875, + "grad_norm": 17.569801330566406, + "learning_rate": 9.976398235376739e-06, + "loss": 5.6246, + "step": 6180 + }, + { + "epoch": 0.12583414713541666, + "grad_norm": 15.668710708618164, + "learning_rate": 9.976359431354687e-06, + "loss": 5.5192, + "step": 6185 + }, + { + "epoch": 0.12593587239583334, + "grad_norm": 26.87118911743164, + "learning_rate": 9.976320595535278e-06, + "loss": 5.4919, + "step": 6190 + }, + { + "epoch": 0.12603759765625, + "grad_norm": 15.820136070251465, + "learning_rate": 9.976281727918762e-06, + "loss": 5.7551, + "step": 6195 + }, + { + "epoch": 0.12613932291666666, + "grad_norm": 14.209673881530762, + "learning_rate": 9.976242828505385e-06, + "loss": 5.5626, + "step": 6200 + }, + { + "epoch": 0.12624104817708334, + "grad_norm": 20.53704261779785, + "learning_rate": 9.976203897295394e-06, + "loss": 5.4363, + "step": 6205 + }, + { + "epoch": 0.1263427734375, + "grad_norm": 13.738995552062988, + "learning_rate": 9.976164934289043e-06, + "loss": 5.4766, + "step": 6210 + }, + { + "epoch": 0.12644449869791666, + "grad_norm": 11.809370040893555, + "learning_rate": 9.976125939486576e-06, + "loss": 5.0977, + "step": 6215 + }, + { + "epoch": 0.12654622395833334, + "grad_norm": 20.48621940612793, + "learning_rate": 9.976086912888244e-06, + "loss": 5.5355, + "step": 6220 + }, + { + "epoch": 0.12664794921875, + "grad_norm": 12.381156921386719, + "learning_rate": 9.976047854494295e-06, + "loss": 5.4684, + "step": 6225 + }, + { + "epoch": 0.12674967447916666, + "grad_norm": 21.476415634155273, + "learning_rate": 9.97600876430498e-06, + "loss": 5.3573, + "step": 6230 + }, + { + "epoch": 0.12685139973958334, + "grad_norm": 20.3490047454834, + "learning_rate": 9.975969642320552e-06, + "loss": 5.6986, + "step": 6235 + }, + { + "epoch": 0.126953125, + "grad_norm": 16.36519432067871, + "learning_rate": 9.975930488541254e-06, + "loss": 5.7751, + "step": 6240 + }, + { + "epoch": 0.12705485026041666, + "grad_norm": 19.26746940612793, + "learning_rate": 9.97589130296734e-06, + "loss": 5.4599, + "step": 6245 + }, + { + "epoch": 0.12715657552083334, + "grad_norm": 21.13620948791504, + "learning_rate": 9.97585208559906e-06, + "loss": 5.6982, + "step": 6250 + }, + { + "epoch": 0.12725830078125, + "grad_norm": 12.752982139587402, + "learning_rate": 9.975812836436666e-06, + "loss": 5.7226, + "step": 6255 + }, + { + "epoch": 0.12736002604166666, + "grad_norm": 17.468717575073242, + "learning_rate": 9.975773555480408e-06, + "loss": 5.4381, + "step": 6260 + }, + { + "epoch": 0.12746175130208334, + "grad_norm": 19.98785400390625, + "learning_rate": 9.975734242730535e-06, + "loss": 5.525, + "step": 6265 + }, + { + "epoch": 0.1275634765625, + "grad_norm": 18.575790405273438, + "learning_rate": 9.9756948981873e-06, + "loss": 5.7243, + "step": 6270 + }, + { + "epoch": 0.12766520182291666, + "grad_norm": 30.028776168823242, + "learning_rate": 9.975655521850955e-06, + "loss": 5.7105, + "step": 6275 + }, + { + "epoch": 0.12776692708333334, + "grad_norm": 16.51436424255371, + "learning_rate": 9.975616113721751e-06, + "loss": 5.4723, + "step": 6280 + }, + { + "epoch": 0.12786865234375, + "grad_norm": 16.02109718322754, + "learning_rate": 9.975576673799938e-06, + "loss": 5.6322, + "step": 6285 + }, + { + "epoch": 0.12797037760416666, + "grad_norm": 13.753850936889648, + "learning_rate": 9.97553720208577e-06, + "loss": 5.3301, + "step": 6290 + }, + { + "epoch": 0.12807210286458334, + "grad_norm": 25.21549415588379, + "learning_rate": 9.9754976985795e-06, + "loss": 5.3385, + "step": 6295 + }, + { + "epoch": 0.128173828125, + "grad_norm": 17.987503051757812, + "learning_rate": 9.975458163281377e-06, + "loss": 5.7756, + "step": 6300 + }, + { + "epoch": 0.12827555338541666, + "grad_norm": 17.510482788085938, + "learning_rate": 9.975418596191657e-06, + "loss": 5.2603, + "step": 6305 + }, + { + "epoch": 0.12837727864583334, + "grad_norm": 15.50986099243164, + "learning_rate": 9.975378997310591e-06, + "loss": 5.6123, + "step": 6310 + }, + { + "epoch": 0.12847900390625, + "grad_norm": 14.721126556396484, + "learning_rate": 9.975339366638434e-06, + "loss": 5.323, + "step": 6315 + }, + { + "epoch": 0.12858072916666666, + "grad_norm": 21.921064376831055, + "learning_rate": 9.975299704175436e-06, + "loss": 5.4012, + "step": 6320 + }, + { + "epoch": 0.12868245442708334, + "grad_norm": 16.753738403320312, + "learning_rate": 9.975260009921854e-06, + "loss": 5.237, + "step": 6325 + }, + { + "epoch": 0.1287841796875, + "grad_norm": 12.621625900268555, + "learning_rate": 9.97522028387794e-06, + "loss": 5.5392, + "step": 6330 + }, + { + "epoch": 0.12888590494791666, + "grad_norm": 21.298059463500977, + "learning_rate": 9.975180526043946e-06, + "loss": 5.3254, + "step": 6335 + }, + { + "epoch": 0.12898763020833334, + "grad_norm": 20.378755569458008, + "learning_rate": 9.975140736420127e-06, + "loss": 5.5337, + "step": 6340 + }, + { + "epoch": 0.12908935546875, + "grad_norm": 19.17125701904297, + "learning_rate": 9.975100915006742e-06, + "loss": 5.5341, + "step": 6345 + }, + { + "epoch": 0.12919108072916666, + "grad_norm": 14.850894927978516, + "learning_rate": 9.975061061804037e-06, + "loss": 5.5676, + "step": 6350 + }, + { + "epoch": 0.12929280598958334, + "grad_norm": 13.91081714630127, + "learning_rate": 9.975021176812274e-06, + "loss": 5.4025, + "step": 6355 + }, + { + "epoch": 0.12939453125, + "grad_norm": 17.212099075317383, + "learning_rate": 9.974981260031705e-06, + "loss": 5.347, + "step": 6360 + }, + { + "epoch": 0.12949625651041666, + "grad_norm": 19.634180068969727, + "learning_rate": 9.974941311462582e-06, + "loss": 5.4734, + "step": 6365 + }, + { + "epoch": 0.12959798177083334, + "grad_norm": 14.929391860961914, + "learning_rate": 9.974901331105167e-06, + "loss": 5.4497, + "step": 6370 + }, + { + "epoch": 0.12969970703125, + "grad_norm": 11.093545913696289, + "learning_rate": 9.974861318959709e-06, + "loss": 5.385, + "step": 6375 + }, + { + "epoch": 0.12980143229166666, + "grad_norm": 17.600996017456055, + "learning_rate": 9.974821275026467e-06, + "loss": 5.4398, + "step": 6380 + }, + { + "epoch": 0.12990315755208334, + "grad_norm": 14.654824256896973, + "learning_rate": 9.974781199305698e-06, + "loss": 5.4847, + "step": 6385 + }, + { + "epoch": 0.1300048828125, + "grad_norm": 17.38625144958496, + "learning_rate": 9.974741091797655e-06, + "loss": 5.5315, + "step": 6390 + }, + { + "epoch": 0.13010660807291666, + "grad_norm": 14.527276039123535, + "learning_rate": 9.974700952502594e-06, + "loss": 5.7764, + "step": 6395 + }, + { + "epoch": 0.13020833333333334, + "grad_norm": 15.738358497619629, + "learning_rate": 9.974660781420776e-06, + "loss": 5.446, + "step": 6400 + }, + { + "epoch": 0.13031005859375, + "grad_norm": 34.546363830566406, + "learning_rate": 9.974620578552454e-06, + "loss": 5.6036, + "step": 6405 + }, + { + "epoch": 0.13041178385416666, + "grad_norm": 16.194419860839844, + "learning_rate": 9.974580343897885e-06, + "loss": 5.4148, + "step": 6410 + }, + { + "epoch": 0.13051350911458334, + "grad_norm": 18.954099655151367, + "learning_rate": 9.974540077457326e-06, + "loss": 5.1926, + "step": 6415 + }, + { + "epoch": 0.130615234375, + "grad_norm": 13.780653953552246, + "learning_rate": 9.974499779231038e-06, + "loss": 5.5155, + "step": 6420 + }, + { + "epoch": 0.13071695963541666, + "grad_norm": 15.045720100402832, + "learning_rate": 9.974459449219272e-06, + "loss": 5.5341, + "step": 6425 + }, + { + "epoch": 0.13081868489583334, + "grad_norm": 13.26309585571289, + "learning_rate": 9.974419087422292e-06, + "loss": 5.4595, + "step": 6430 + }, + { + "epoch": 0.13092041015625, + "grad_norm": 20.83778953552246, + "learning_rate": 9.974378693840353e-06, + "loss": 5.655, + "step": 6435 + }, + { + "epoch": 0.13102213541666666, + "grad_norm": 14.684647560119629, + "learning_rate": 9.97433826847371e-06, + "loss": 5.4013, + "step": 6440 + }, + { + "epoch": 0.13112386067708334, + "grad_norm": 17.280065536499023, + "learning_rate": 9.974297811322628e-06, + "loss": 5.3775, + "step": 6445 + }, + { + "epoch": 0.1312255859375, + "grad_norm": 17.152990341186523, + "learning_rate": 9.97425732238736e-06, + "loss": 5.4855, + "step": 6450 + }, + { + "epoch": 0.13132731119791666, + "grad_norm": 16.762203216552734, + "learning_rate": 9.974216801668167e-06, + "loss": 5.404, + "step": 6455 + }, + { + "epoch": 0.13142903645833334, + "grad_norm": 18.323640823364258, + "learning_rate": 9.974176249165309e-06, + "loss": 5.2961, + "step": 6460 + }, + { + "epoch": 0.13153076171875, + "grad_norm": 13.218562126159668, + "learning_rate": 9.974135664879043e-06, + "loss": 5.4416, + "step": 6465 + }, + { + "epoch": 0.13163248697916666, + "grad_norm": 16.028209686279297, + "learning_rate": 9.97409504880963e-06, + "loss": 5.4699, + "step": 6470 + }, + { + "epoch": 0.13173421223958334, + "grad_norm": 21.183195114135742, + "learning_rate": 9.974054400957327e-06, + "loss": 5.938, + "step": 6475 + }, + { + "epoch": 0.1318359375, + "grad_norm": 17.12059783935547, + "learning_rate": 9.974013721322396e-06, + "loss": 5.6108, + "step": 6480 + }, + { + "epoch": 0.13193766276041666, + "grad_norm": 13.369504928588867, + "learning_rate": 9.973973009905096e-06, + "loss": 5.6305, + "step": 6485 + }, + { + "epoch": 0.13203938802083334, + "grad_norm": 16.861982345581055, + "learning_rate": 9.973932266705688e-06, + "loss": 5.8317, + "step": 6490 + }, + { + "epoch": 0.13214111328125, + "grad_norm": 13.924752235412598, + "learning_rate": 9.97389149172443e-06, + "loss": 5.4439, + "step": 6495 + }, + { + "epoch": 0.13224283854166666, + "grad_norm": 17.390562057495117, + "learning_rate": 9.973850684961586e-06, + "loss": 5.1212, + "step": 6500 + }, + { + "epoch": 0.13234456380208334, + "grad_norm": 18.3359317779541, + "learning_rate": 9.973809846417414e-06, + "loss": 5.5231, + "step": 6505 + }, + { + "epoch": 0.1324462890625, + "grad_norm": 20.00368881225586, + "learning_rate": 9.973768976092178e-06, + "loss": 5.5388, + "step": 6510 + }, + { + "epoch": 0.13254801432291666, + "grad_norm": 17.03586196899414, + "learning_rate": 9.973728073986136e-06, + "loss": 5.2627, + "step": 6515 + }, + { + "epoch": 0.13264973958333334, + "grad_norm": 26.31982421875, + "learning_rate": 9.973687140099551e-06, + "loss": 5.4936, + "step": 6520 + }, + { + "epoch": 0.13275146484375, + "grad_norm": 15.215396881103516, + "learning_rate": 9.973646174432683e-06, + "loss": 5.5095, + "step": 6525 + }, + { + "epoch": 0.13285319010416666, + "grad_norm": 10.917333602905273, + "learning_rate": 9.973605176985794e-06, + "loss": 5.6406, + "step": 6530 + }, + { + "epoch": 0.13295491536458334, + "grad_norm": 15.195916175842285, + "learning_rate": 9.97356414775915e-06, + "loss": 5.5221, + "step": 6535 + }, + { + "epoch": 0.133056640625, + "grad_norm": 16.625072479248047, + "learning_rate": 9.973523086753008e-06, + "loss": 5.6148, + "step": 6540 + }, + { + "epoch": 0.13315836588541666, + "grad_norm": 15.177531242370605, + "learning_rate": 9.97348199396763e-06, + "loss": 5.5707, + "step": 6545 + }, + { + "epoch": 0.13326009114583334, + "grad_norm": 15.967267036437988, + "learning_rate": 9.973440869403283e-06, + "loss": 5.3821, + "step": 6550 + }, + { + "epoch": 0.13336181640625, + "grad_norm": 12.055782318115234, + "learning_rate": 9.973399713060228e-06, + "loss": 5.1441, + "step": 6555 + }, + { + "epoch": 0.13346354166666666, + "grad_norm": 17.87104034423828, + "learning_rate": 9.973358524938728e-06, + "loss": 5.4469, + "step": 6560 + }, + { + "epoch": 0.13356526692708334, + "grad_norm": 16.035717010498047, + "learning_rate": 9.973317305039046e-06, + "loss": 5.3866, + "step": 6565 + }, + { + "epoch": 0.1336669921875, + "grad_norm": 14.022066116333008, + "learning_rate": 9.973276053361443e-06, + "loss": 5.5743, + "step": 6570 + }, + { + "epoch": 0.13376871744791666, + "grad_norm": 14.909185409545898, + "learning_rate": 9.973234769906185e-06, + "loss": 5.5593, + "step": 6575 + }, + { + "epoch": 0.13387044270833334, + "grad_norm": 16.03993034362793, + "learning_rate": 9.973193454673538e-06, + "loss": 5.7138, + "step": 6580 + }, + { + "epoch": 0.13397216796875, + "grad_norm": 14.691089630126953, + "learning_rate": 9.973152107663761e-06, + "loss": 5.5302, + "step": 6585 + }, + { + "epoch": 0.13407389322916666, + "grad_norm": 16.418058395385742, + "learning_rate": 9.973110728877121e-06, + "loss": 5.3896, + "step": 6590 + }, + { + "epoch": 0.13417561848958334, + "grad_norm": 16.08272933959961, + "learning_rate": 9.973069318313883e-06, + "loss": 5.65, + "step": 6595 + }, + { + "epoch": 0.13427734375, + "grad_norm": 16.405181884765625, + "learning_rate": 9.973027875974313e-06, + "loss": 5.655, + "step": 6600 + }, + { + "epoch": 0.13437906901041666, + "grad_norm": 20.592803955078125, + "learning_rate": 9.97298640185867e-06, + "loss": 5.3227, + "step": 6605 + }, + { + "epoch": 0.13448079427083334, + "grad_norm": 11.60227108001709, + "learning_rate": 9.972944895967224e-06, + "loss": 5.3994, + "step": 6610 + }, + { + "epoch": 0.13458251953125, + "grad_norm": 16.982913970947266, + "learning_rate": 9.972903358300238e-06, + "loss": 5.5774, + "step": 6615 + }, + { + "epoch": 0.13468424479166666, + "grad_norm": 13.736530303955078, + "learning_rate": 9.97286178885798e-06, + "loss": 5.8131, + "step": 6620 + }, + { + "epoch": 0.13478597005208334, + "grad_norm": 20.948110580444336, + "learning_rate": 9.97282018764071e-06, + "loss": 5.2845, + "step": 6625 + }, + { + "epoch": 0.1348876953125, + "grad_norm": 10.814974784851074, + "learning_rate": 9.9727785546487e-06, + "loss": 5.5532, + "step": 6630 + }, + { + "epoch": 0.13498942057291666, + "grad_norm": 16.796785354614258, + "learning_rate": 9.972736889882215e-06, + "loss": 5.5797, + "step": 6635 + }, + { + "epoch": 0.13509114583333334, + "grad_norm": 19.66461753845215, + "learning_rate": 9.972695193341518e-06, + "loss": 5.4676, + "step": 6640 + }, + { + "epoch": 0.13519287109375, + "grad_norm": 18.523765563964844, + "learning_rate": 9.972653465026878e-06, + "loss": 5.3902, + "step": 6645 + }, + { + "epoch": 0.13529459635416666, + "grad_norm": 18.61583137512207, + "learning_rate": 9.97261170493856e-06, + "loss": 5.9456, + "step": 6650 + }, + { + "epoch": 0.13539632161458334, + "grad_norm": 20.9200439453125, + "learning_rate": 9.972569913076833e-06, + "loss": 5.6307, + "step": 6655 + }, + { + "epoch": 0.135498046875, + "grad_norm": 18.09457778930664, + "learning_rate": 9.972528089441961e-06, + "loss": 5.4061, + "step": 6660 + }, + { + "epoch": 0.13559977213541666, + "grad_norm": 19.8204345703125, + "learning_rate": 9.972486234034214e-06, + "loss": 5.2148, + "step": 6665 + }, + { + "epoch": 0.13570149739583334, + "grad_norm": 14.230914115905762, + "learning_rate": 9.972444346853858e-06, + "loss": 5.4368, + "step": 6670 + }, + { + "epoch": 0.13580322265625, + "grad_norm": 21.32179069519043, + "learning_rate": 9.972402427901162e-06, + "loss": 5.4692, + "step": 6675 + }, + { + "epoch": 0.13590494791666666, + "grad_norm": 15.763835906982422, + "learning_rate": 9.972360477176392e-06, + "loss": 5.4607, + "step": 6680 + }, + { + "epoch": 0.13600667317708334, + "grad_norm": 16.644426345825195, + "learning_rate": 9.972318494679818e-06, + "loss": 5.5802, + "step": 6685 + }, + { + "epoch": 0.1361083984375, + "grad_norm": 18.177053451538086, + "learning_rate": 9.972276480411706e-06, + "loss": 5.3433, + "step": 6690 + }, + { + "epoch": 0.13621012369791666, + "grad_norm": 12.981131553649902, + "learning_rate": 9.972234434372328e-06, + "loss": 5.3145, + "step": 6695 + }, + { + "epoch": 0.13631184895833334, + "grad_norm": 12.766824722290039, + "learning_rate": 9.972192356561947e-06, + "loss": 5.5396, + "step": 6700 + }, + { + "epoch": 0.13641357421875, + "grad_norm": 13.67133617401123, + "learning_rate": 9.972150246980837e-06, + "loss": 5.5223, + "step": 6705 + }, + { + "epoch": 0.13651529947916666, + "grad_norm": 39.21326446533203, + "learning_rate": 9.972108105629265e-06, + "loss": 5.7727, + "step": 6710 + }, + { + "epoch": 0.13661702473958334, + "grad_norm": 17.62091064453125, + "learning_rate": 9.972065932507503e-06, + "loss": 5.5661, + "step": 6715 + }, + { + "epoch": 0.13671875, + "grad_norm": 15.070162773132324, + "learning_rate": 9.972023727615813e-06, + "loss": 5.5474, + "step": 6720 + }, + { + "epoch": 0.13682047526041666, + "grad_norm": 22.838829040527344, + "learning_rate": 9.971981490954474e-06, + "loss": 5.3472, + "step": 6725 + }, + { + "epoch": 0.13692220052083334, + "grad_norm": 17.958263397216797, + "learning_rate": 9.97193922252375e-06, + "loss": 5.3735, + "step": 6730 + }, + { + "epoch": 0.13702392578125, + "grad_norm": 12.440970420837402, + "learning_rate": 9.971896922323915e-06, + "loss": 5.3564, + "step": 6735 + }, + { + "epoch": 0.13712565104166666, + "grad_norm": 14.383737564086914, + "learning_rate": 9.971854590355233e-06, + "loss": 5.2803, + "step": 6740 + }, + { + "epoch": 0.13722737630208334, + "grad_norm": 21.683218002319336, + "learning_rate": 9.971812226617981e-06, + "loss": 5.4241, + "step": 6745 + }, + { + "epoch": 0.1373291015625, + "grad_norm": 12.777520179748535, + "learning_rate": 9.971769831112426e-06, + "loss": 5.4196, + "step": 6750 + }, + { + "epoch": 0.13743082682291666, + "grad_norm": 18.855588912963867, + "learning_rate": 9.97172740383884e-06, + "loss": 5.1996, + "step": 6755 + }, + { + "epoch": 0.13753255208333334, + "grad_norm": 16.42656707763672, + "learning_rate": 9.971684944797494e-06, + "loss": 5.345, + "step": 6760 + }, + { + "epoch": 0.13763427734375, + "grad_norm": 19.82832908630371, + "learning_rate": 9.97164245398866e-06, + "loss": 5.8368, + "step": 6765 + }, + { + "epoch": 0.13773600260416666, + "grad_norm": 17.472043991088867, + "learning_rate": 9.97159993141261e-06, + "loss": 5.8059, + "step": 6770 + }, + { + "epoch": 0.13783772786458334, + "grad_norm": 15.390020370483398, + "learning_rate": 9.971557377069614e-06, + "loss": 5.5148, + "step": 6775 + }, + { + "epoch": 0.137939453125, + "grad_norm": 19.57646942138672, + "learning_rate": 9.971514790959943e-06, + "loss": 5.3825, + "step": 6780 + }, + { + "epoch": 0.13804117838541666, + "grad_norm": 16.566064834594727, + "learning_rate": 9.971472173083872e-06, + "loss": 5.6098, + "step": 6785 + }, + { + "epoch": 0.13814290364583334, + "grad_norm": 18.700590133666992, + "learning_rate": 9.971429523441672e-06, + "loss": 5.3011, + "step": 6790 + }, + { + "epoch": 0.13824462890625, + "grad_norm": 12.520125389099121, + "learning_rate": 9.971386842033614e-06, + "loss": 5.3072, + "step": 6795 + }, + { + "epoch": 0.13834635416666666, + "grad_norm": 17.58144187927246, + "learning_rate": 9.971344128859974e-06, + "loss": 5.4354, + "step": 6800 + }, + { + "epoch": 0.13844807942708334, + "grad_norm": 20.350311279296875, + "learning_rate": 9.971301383921022e-06, + "loss": 5.5834, + "step": 6805 + }, + { + "epoch": 0.1385498046875, + "grad_norm": 18.999706268310547, + "learning_rate": 9.971258607217034e-06, + "loss": 5.5432, + "step": 6810 + }, + { + "epoch": 0.13865152994791666, + "grad_norm": 12.340110778808594, + "learning_rate": 9.97121579874828e-06, + "loss": 5.3969, + "step": 6815 + }, + { + "epoch": 0.13875325520833334, + "grad_norm": 17.6500301361084, + "learning_rate": 9.971172958515035e-06, + "loss": 5.5604, + "step": 6820 + }, + { + "epoch": 0.13885498046875, + "grad_norm": 16.53435707092285, + "learning_rate": 9.971130086517574e-06, + "loss": 5.5886, + "step": 6825 + }, + { + "epoch": 0.13895670572916666, + "grad_norm": 21.63037109375, + "learning_rate": 9.971087182756169e-06, + "loss": 5.5328, + "step": 6830 + }, + { + "epoch": 0.13905843098958334, + "grad_norm": 17.87527847290039, + "learning_rate": 9.971044247231094e-06, + "loss": 5.5101, + "step": 6835 + }, + { + "epoch": 0.13916015625, + "grad_norm": 15.030261993408203, + "learning_rate": 9.971001279942627e-06, + "loss": 5.5263, + "step": 6840 + }, + { + "epoch": 0.13926188151041666, + "grad_norm": 16.272798538208008, + "learning_rate": 9.970958280891038e-06, + "loss": 5.686, + "step": 6845 + }, + { + "epoch": 0.13936360677083334, + "grad_norm": 15.933757781982422, + "learning_rate": 9.970915250076602e-06, + "loss": 5.4105, + "step": 6850 + }, + { + "epoch": 0.13946533203125, + "grad_norm": 13.156631469726562, + "learning_rate": 9.9708721874996e-06, + "loss": 5.5079, + "step": 6855 + }, + { + "epoch": 0.13956705729166666, + "grad_norm": 14.787038803100586, + "learning_rate": 9.9708290931603e-06, + "loss": 5.4532, + "step": 6860 + }, + { + "epoch": 0.13966878255208334, + "grad_norm": 16.969831466674805, + "learning_rate": 9.970785967058978e-06, + "loss": 5.4488, + "step": 6865 + }, + { + "epoch": 0.1397705078125, + "grad_norm": 13.706640243530273, + "learning_rate": 9.970742809195915e-06, + "loss": 5.217, + "step": 6870 + }, + { + "epoch": 0.13987223307291666, + "grad_norm": 16.488004684448242, + "learning_rate": 9.970699619571382e-06, + "loss": 5.4519, + "step": 6875 + }, + { + "epoch": 0.13997395833333334, + "grad_norm": 16.351879119873047, + "learning_rate": 9.970656398185656e-06, + "loss": 5.7102, + "step": 6880 + }, + { + "epoch": 0.14007568359375, + "grad_norm": 15.128007888793945, + "learning_rate": 9.970613145039015e-06, + "loss": 5.7027, + "step": 6885 + }, + { + "epoch": 0.14017740885416666, + "grad_norm": 16.15266990661621, + "learning_rate": 9.970569860131732e-06, + "loss": 5.5961, + "step": 6890 + }, + { + "epoch": 0.14027913411458334, + "grad_norm": 13.300681114196777, + "learning_rate": 9.970526543464086e-06, + "loss": 5.4441, + "step": 6895 + }, + { + "epoch": 0.140380859375, + "grad_norm": 21.333019256591797, + "learning_rate": 9.970483195036354e-06, + "loss": 5.5997, + "step": 6900 + }, + { + "epoch": 0.14048258463541666, + "grad_norm": 15.690264701843262, + "learning_rate": 9.970439814848811e-06, + "loss": 5.5291, + "step": 6905 + }, + { + "epoch": 0.14058430989583334, + "grad_norm": 17.490753173828125, + "learning_rate": 9.970396402901736e-06, + "loss": 5.7692, + "step": 6910 + }, + { + "epoch": 0.14068603515625, + "grad_norm": 24.829875946044922, + "learning_rate": 9.970352959195406e-06, + "loss": 5.9015, + "step": 6915 + }, + { + "epoch": 0.14078776041666666, + "grad_norm": 18.077558517456055, + "learning_rate": 9.970309483730099e-06, + "loss": 5.3427, + "step": 6920 + }, + { + "epoch": 0.14088948567708334, + "grad_norm": 15.435985565185547, + "learning_rate": 9.97026597650609e-06, + "loss": 5.7527, + "step": 6925 + }, + { + "epoch": 0.1409912109375, + "grad_norm": 12.483321189880371, + "learning_rate": 9.970222437523663e-06, + "loss": 5.5798, + "step": 6930 + }, + { + "epoch": 0.14109293619791666, + "grad_norm": 13.781185150146484, + "learning_rate": 9.970178866783089e-06, + "loss": 5.3507, + "step": 6935 + }, + { + "epoch": 0.14119466145833334, + "grad_norm": 16.55399513244629, + "learning_rate": 9.970135264284651e-06, + "loss": 6.1171, + "step": 6940 + }, + { + "epoch": 0.14129638671875, + "grad_norm": 17.854705810546875, + "learning_rate": 9.970091630028627e-06, + "loss": 5.6844, + "step": 6945 + }, + { + "epoch": 0.14139811197916666, + "grad_norm": 13.398846626281738, + "learning_rate": 9.970047964015292e-06, + "loss": 5.3054, + "step": 6950 + }, + { + "epoch": 0.14149983723958334, + "grad_norm": 14.791109085083008, + "learning_rate": 9.970004266244933e-06, + "loss": 5.6707, + "step": 6955 + }, + { + "epoch": 0.1416015625, + "grad_norm": 15.542041778564453, + "learning_rate": 9.969960536717821e-06, + "loss": 5.4185, + "step": 6960 + }, + { + "epoch": 0.14170328776041666, + "grad_norm": 18.565505981445312, + "learning_rate": 9.969916775434241e-06, + "loss": 5.5213, + "step": 6965 + }, + { + "epoch": 0.14180501302083334, + "grad_norm": 15.466068267822266, + "learning_rate": 9.969872982394468e-06, + "loss": 5.4298, + "step": 6970 + }, + { + "epoch": 0.14190673828125, + "grad_norm": 15.682196617126465, + "learning_rate": 9.969829157598786e-06, + "loss": 5.2852, + "step": 6975 + }, + { + "epoch": 0.14200846354166666, + "grad_norm": 19.63357162475586, + "learning_rate": 9.969785301047473e-06, + "loss": 5.2459, + "step": 6980 + }, + { + "epoch": 0.14211018880208334, + "grad_norm": 14.813878059387207, + "learning_rate": 9.96974141274081e-06, + "loss": 5.3188, + "step": 6985 + }, + { + "epoch": 0.1422119140625, + "grad_norm": 12.547582626342773, + "learning_rate": 9.969697492679076e-06, + "loss": 5.6622, + "step": 6990 + }, + { + "epoch": 0.14231363932291666, + "grad_norm": 16.080371856689453, + "learning_rate": 9.969653540862553e-06, + "loss": 5.3904, + "step": 6995 + }, + { + "epoch": 0.14241536458333334, + "grad_norm": 14.601740837097168, + "learning_rate": 9.969609557291523e-06, + "loss": 5.4528, + "step": 7000 + }, + { + "epoch": 0.14251708984375, + "grad_norm": 17.71467399597168, + "learning_rate": 9.969565541966263e-06, + "loss": 5.5183, + "step": 7005 + }, + { + "epoch": 0.14261881510416666, + "grad_norm": 14.304770469665527, + "learning_rate": 9.969521494887057e-06, + "loss": 5.4596, + "step": 7010 + }, + { + "epoch": 0.14272054036458334, + "grad_norm": 18.513883590698242, + "learning_rate": 9.969477416054188e-06, + "loss": 5.5309, + "step": 7015 + }, + { + "epoch": 0.142822265625, + "grad_norm": 20.687406539916992, + "learning_rate": 9.969433305467935e-06, + "loss": 5.3676, + "step": 7020 + }, + { + "epoch": 0.14292399088541666, + "grad_norm": 20.320608139038086, + "learning_rate": 9.96938916312858e-06, + "loss": 5.3721, + "step": 7025 + }, + { + "epoch": 0.14302571614583334, + "grad_norm": 14.950809478759766, + "learning_rate": 9.969344989036406e-06, + "loss": 5.4009, + "step": 7030 + }, + { + "epoch": 0.14312744140625, + "grad_norm": 19.343198776245117, + "learning_rate": 9.969300783191696e-06, + "loss": 5.665, + "step": 7035 + }, + { + "epoch": 0.14322916666666666, + "grad_norm": 17.619956970214844, + "learning_rate": 9.969256545594732e-06, + "loss": 5.4673, + "step": 7040 + }, + { + "epoch": 0.14333089192708334, + "grad_norm": 14.888957977294922, + "learning_rate": 9.969212276245795e-06, + "loss": 5.4657, + "step": 7045 + }, + { + "epoch": 0.1434326171875, + "grad_norm": 19.219663619995117, + "learning_rate": 9.969167975145168e-06, + "loss": 5.4642, + "step": 7050 + }, + { + "epoch": 0.14353434244791666, + "grad_norm": 12.539443969726562, + "learning_rate": 9.969123642293136e-06, + "loss": 5.5677, + "step": 7055 + }, + { + "epoch": 0.14363606770833334, + "grad_norm": 17.16502571105957, + "learning_rate": 9.969079277689982e-06, + "loss": 5.5443, + "step": 7060 + }, + { + "epoch": 0.14373779296875, + "grad_norm": 30.987369537353516, + "learning_rate": 9.969034881335988e-06, + "loss": 5.7004, + "step": 7065 + }, + { + "epoch": 0.14383951822916666, + "grad_norm": 15.972176551818848, + "learning_rate": 9.968990453231438e-06, + "loss": 5.4169, + "step": 7070 + }, + { + "epoch": 0.14394124348958334, + "grad_norm": 16.935121536254883, + "learning_rate": 9.968945993376617e-06, + "loss": 5.5092, + "step": 7075 + }, + { + "epoch": 0.14404296875, + "grad_norm": 21.16288948059082, + "learning_rate": 9.968901501771808e-06, + "loss": 5.6146, + "step": 7080 + }, + { + "epoch": 0.14414469401041666, + "grad_norm": 17.30523109436035, + "learning_rate": 9.968856978417297e-06, + "loss": 5.3913, + "step": 7085 + }, + { + "epoch": 0.14424641927083334, + "grad_norm": 16.750335693359375, + "learning_rate": 9.968812423313367e-06, + "loss": 5.4267, + "step": 7090 + }, + { + "epoch": 0.14434814453125, + "grad_norm": 21.092885971069336, + "learning_rate": 9.968767836460302e-06, + "loss": 5.492, + "step": 7095 + }, + { + "epoch": 0.14444986979166666, + "grad_norm": 19.541011810302734, + "learning_rate": 9.968723217858386e-06, + "loss": 5.6074, + "step": 7100 + }, + { + "epoch": 0.14455159505208334, + "grad_norm": 18.85484504699707, + "learning_rate": 9.96867856750791e-06, + "loss": 5.4919, + "step": 7105 + }, + { + "epoch": 0.1446533203125, + "grad_norm": 15.366929054260254, + "learning_rate": 9.968633885409154e-06, + "loss": 5.3542, + "step": 7110 + }, + { + "epoch": 0.14475504557291666, + "grad_norm": 19.182964324951172, + "learning_rate": 9.968589171562403e-06, + "loss": 5.5218, + "step": 7115 + }, + { + "epoch": 0.14485677083333334, + "grad_norm": 22.148792266845703, + "learning_rate": 9.968544425967946e-06, + "loss": 5.4877, + "step": 7120 + }, + { + "epoch": 0.14495849609375, + "grad_norm": 16.036930084228516, + "learning_rate": 9.968499648626067e-06, + "loss": 5.2162, + "step": 7125 + }, + { + "epoch": 0.14506022135416666, + "grad_norm": 18.068288803100586, + "learning_rate": 9.968454839537052e-06, + "loss": 5.5567, + "step": 7130 + }, + { + "epoch": 0.14516194661458334, + "grad_norm": 19.564973831176758, + "learning_rate": 9.968409998701187e-06, + "loss": 5.2882, + "step": 7135 + }, + { + "epoch": 0.145263671875, + "grad_norm": 17.309951782226562, + "learning_rate": 9.968365126118763e-06, + "loss": 5.3692, + "step": 7140 + }, + { + "epoch": 0.14536539713541666, + "grad_norm": 13.8280029296875, + "learning_rate": 9.968320221790058e-06, + "loss": 5.3596, + "step": 7145 + }, + { + "epoch": 0.14546712239583334, + "grad_norm": 14.43309211730957, + "learning_rate": 9.968275285715368e-06, + "loss": 5.4431, + "step": 7150 + }, + { + "epoch": 0.14556884765625, + "grad_norm": 16.8054141998291, + "learning_rate": 9.968230317894973e-06, + "loss": 5.5842, + "step": 7155 + }, + { + "epoch": 0.14567057291666666, + "grad_norm": 15.789630889892578, + "learning_rate": 9.968185318329165e-06, + "loss": 5.5256, + "step": 7160 + }, + { + "epoch": 0.14577229817708334, + "grad_norm": 13.036558151245117, + "learning_rate": 9.96814028701823e-06, + "loss": 5.5058, + "step": 7165 + }, + { + "epoch": 0.1458740234375, + "grad_norm": 17.584959030151367, + "learning_rate": 9.968095223962455e-06, + "loss": 5.514, + "step": 7170 + }, + { + "epoch": 0.14597574869791666, + "grad_norm": 15.923765182495117, + "learning_rate": 9.96805012916213e-06, + "loss": 5.6161, + "step": 7175 + }, + { + "epoch": 0.14607747395833334, + "grad_norm": 18.165285110473633, + "learning_rate": 9.96800500261754e-06, + "loss": 5.2886, + "step": 7180 + }, + { + "epoch": 0.14617919921875, + "grad_norm": 20.06735610961914, + "learning_rate": 9.967959844328975e-06, + "loss": 5.1542, + "step": 7185 + }, + { + "epoch": 0.14628092447916666, + "grad_norm": 17.31222152709961, + "learning_rate": 9.967914654296726e-06, + "loss": 5.4917, + "step": 7190 + }, + { + "epoch": 0.14638264973958334, + "grad_norm": 16.43758773803711, + "learning_rate": 9.967869432521078e-06, + "loss": 5.6765, + "step": 7195 + }, + { + "epoch": 0.146484375, + "grad_norm": 13.40861701965332, + "learning_rate": 9.967824179002321e-06, + "loss": 5.5157, + "step": 7200 + }, + { + "epoch": 0.14658610026041666, + "grad_norm": 15.468470573425293, + "learning_rate": 9.967778893740746e-06, + "loss": 5.352, + "step": 7205 + }, + { + "epoch": 0.14668782552083334, + "grad_norm": 16.821542739868164, + "learning_rate": 9.967733576736638e-06, + "loss": 5.4058, + "step": 7210 + }, + { + "epoch": 0.14678955078125, + "grad_norm": 19.60890769958496, + "learning_rate": 9.967688227990292e-06, + "loss": 5.2189, + "step": 7215 + }, + { + "epoch": 0.14689127604166666, + "grad_norm": 17.491960525512695, + "learning_rate": 9.967642847501993e-06, + "loss": 5.2631, + "step": 7220 + }, + { + "epoch": 0.14699300130208334, + "grad_norm": 16.940570831298828, + "learning_rate": 9.967597435272033e-06, + "loss": 5.577, + "step": 7225 + }, + { + "epoch": 0.1470947265625, + "grad_norm": 15.196976661682129, + "learning_rate": 9.967551991300705e-06, + "loss": 5.3175, + "step": 7230 + }, + { + "epoch": 0.14719645182291666, + "grad_norm": 15.229844093322754, + "learning_rate": 9.967506515588295e-06, + "loss": 5.6601, + "step": 7235 + }, + { + "epoch": 0.14729817708333334, + "grad_norm": 14.673489570617676, + "learning_rate": 9.967461008135095e-06, + "loss": 5.434, + "step": 7240 + }, + { + "epoch": 0.14739990234375, + "grad_norm": 18.00442123413086, + "learning_rate": 9.967415468941397e-06, + "loss": 5.5633, + "step": 7245 + }, + { + "epoch": 0.14750162760416666, + "grad_norm": 15.018078804016113, + "learning_rate": 9.96736989800749e-06, + "loss": 5.5704, + "step": 7250 + }, + { + "epoch": 0.14760335286458334, + "grad_norm": 16.0889835357666, + "learning_rate": 9.967324295333667e-06, + "loss": 5.6839, + "step": 7255 + }, + { + "epoch": 0.147705078125, + "grad_norm": 15.386335372924805, + "learning_rate": 9.967278660920219e-06, + "loss": 5.2612, + "step": 7260 + }, + { + "epoch": 0.14780680338541666, + "grad_norm": 17.789508819580078, + "learning_rate": 9.967232994767437e-06, + "loss": 5.608, + "step": 7265 + }, + { + "epoch": 0.14790852864583334, + "grad_norm": 31.396814346313477, + "learning_rate": 9.967187296875612e-06, + "loss": 5.7012, + "step": 7270 + }, + { + "epoch": 0.14801025390625, + "grad_norm": 16.02705955505371, + "learning_rate": 9.967141567245038e-06, + "loss": 5.4038, + "step": 7275 + }, + { + "epoch": 0.14811197916666666, + "grad_norm": 23.235389709472656, + "learning_rate": 9.967095805876005e-06, + "loss": 5.5944, + "step": 7280 + }, + { + "epoch": 0.14821370442708334, + "grad_norm": 17.86994743347168, + "learning_rate": 9.967050012768808e-06, + "loss": 5.3187, + "step": 7285 + }, + { + "epoch": 0.1483154296875, + "grad_norm": 17.24744987487793, + "learning_rate": 9.967004187923737e-06, + "loss": 5.4823, + "step": 7290 + }, + { + "epoch": 0.14841715494791666, + "grad_norm": 12.23805046081543, + "learning_rate": 9.966958331341086e-06, + "loss": 5.4245, + "step": 7295 + }, + { + "epoch": 0.14851888020833334, + "grad_norm": 18.953510284423828, + "learning_rate": 9.966912443021148e-06, + "loss": 5.4535, + "step": 7300 + }, + { + "epoch": 0.14862060546875, + "grad_norm": 14.838933944702148, + "learning_rate": 9.966866522964218e-06, + "loss": 5.4372, + "step": 7305 + }, + { + "epoch": 0.14872233072916666, + "grad_norm": 14.615714073181152, + "learning_rate": 9.966820571170585e-06, + "loss": 5.322, + "step": 7310 + }, + { + "epoch": 0.14882405598958334, + "grad_norm": 19.49492073059082, + "learning_rate": 9.966774587640547e-06, + "loss": 5.5528, + "step": 7315 + }, + { + "epoch": 0.14892578125, + "grad_norm": 17.504180908203125, + "learning_rate": 9.966728572374395e-06, + "loss": 5.4236, + "step": 7320 + }, + { + "epoch": 0.14902750651041666, + "grad_norm": 20.420495986938477, + "learning_rate": 9.966682525372426e-06, + "loss": 5.5925, + "step": 7325 + }, + { + "epoch": 0.14912923177083334, + "grad_norm": 19.972951889038086, + "learning_rate": 9.966636446634929e-06, + "loss": 5.4488, + "step": 7330 + }, + { + "epoch": 0.14923095703125, + "grad_norm": 17.646751403808594, + "learning_rate": 9.966590336162204e-06, + "loss": 5.3987, + "step": 7335 + }, + { + "epoch": 0.14933268229166666, + "grad_norm": 17.6201171875, + "learning_rate": 9.966544193954543e-06, + "loss": 5.43, + "step": 7340 + }, + { + "epoch": 0.14943440755208334, + "grad_norm": 14.813765525817871, + "learning_rate": 9.966498020012242e-06, + "loss": 5.613, + "step": 7345 + }, + { + "epoch": 0.1495361328125, + "grad_norm": 14.14120101928711, + "learning_rate": 9.966451814335595e-06, + "loss": 5.5314, + "step": 7350 + }, + { + "epoch": 0.14963785807291666, + "grad_norm": 20.000608444213867, + "learning_rate": 9.966405576924896e-06, + "loss": 5.3608, + "step": 7355 + }, + { + "epoch": 0.14973958333333334, + "grad_norm": 22.091073989868164, + "learning_rate": 9.966359307780442e-06, + "loss": 5.3582, + "step": 7360 + }, + { + "epoch": 0.14984130859375, + "grad_norm": 20.208480834960938, + "learning_rate": 9.966313006902531e-06, + "loss": 5.5241, + "step": 7365 + }, + { + "epoch": 0.14994303385416666, + "grad_norm": 12.503329277038574, + "learning_rate": 9.966266674291454e-06, + "loss": 5.5926, + "step": 7370 + }, + { + "epoch": 0.15004475911458334, + "grad_norm": 15.673813819885254, + "learning_rate": 9.96622030994751e-06, + "loss": 5.3148, + "step": 7375 + }, + { + "epoch": 0.150146484375, + "grad_norm": 14.484097480773926, + "learning_rate": 9.966173913870996e-06, + "loss": 5.6331, + "step": 7380 + }, + { + "epoch": 0.15024820963541666, + "grad_norm": 16.820354461669922, + "learning_rate": 9.966127486062205e-06, + "loss": 5.6675, + "step": 7385 + }, + { + "epoch": 0.15034993489583334, + "grad_norm": 20.0877628326416, + "learning_rate": 9.966081026521437e-06, + "loss": 5.3753, + "step": 7390 + }, + { + "epoch": 0.15045166015625, + "grad_norm": 13.221187591552734, + "learning_rate": 9.966034535248987e-06, + "loss": 5.3034, + "step": 7395 + }, + { + "epoch": 0.15055338541666666, + "grad_norm": 15.021241188049316, + "learning_rate": 9.965988012245153e-06, + "loss": 5.4628, + "step": 7400 + }, + { + "epoch": 0.15065511067708334, + "grad_norm": 13.251863479614258, + "learning_rate": 9.965941457510233e-06, + "loss": 5.619, + "step": 7405 + }, + { + "epoch": 0.1507568359375, + "grad_norm": 15.944108009338379, + "learning_rate": 9.965894871044523e-06, + "loss": 5.6989, + "step": 7410 + }, + { + "epoch": 0.15085856119791666, + "grad_norm": 14.09328842163086, + "learning_rate": 9.965848252848322e-06, + "loss": 5.3511, + "step": 7415 + }, + { + "epoch": 0.15096028645833334, + "grad_norm": 19.230358123779297, + "learning_rate": 9.965801602921926e-06, + "loss": 5.3677, + "step": 7420 + }, + { + "epoch": 0.15106201171875, + "grad_norm": 14.273237228393555, + "learning_rate": 9.965754921265634e-06, + "loss": 5.3363, + "step": 7425 + }, + { + "epoch": 0.15116373697916666, + "grad_norm": 12.769891738891602, + "learning_rate": 9.965708207879744e-06, + "loss": 5.3531, + "step": 7430 + }, + { + "epoch": 0.15126546223958334, + "grad_norm": 13.19538402557373, + "learning_rate": 9.965661462764555e-06, + "loss": 5.3151, + "step": 7435 + }, + { + "epoch": 0.1513671875, + "grad_norm": 14.226571083068848, + "learning_rate": 9.965614685920365e-06, + "loss": 5.1385, + "step": 7440 + }, + { + "epoch": 0.15146891276041666, + "grad_norm": 17.845035552978516, + "learning_rate": 9.965567877347477e-06, + "loss": 5.2072, + "step": 7445 + }, + { + "epoch": 0.15157063802083334, + "grad_norm": 13.95977783203125, + "learning_rate": 9.965521037046182e-06, + "loss": 5.4476, + "step": 7450 + }, + { + "epoch": 0.15167236328125, + "grad_norm": 18.004384994506836, + "learning_rate": 9.965474165016786e-06, + "loss": 5.4469, + "step": 7455 + }, + { + "epoch": 0.15177408854166666, + "grad_norm": 19.328617095947266, + "learning_rate": 9.965427261259588e-06, + "loss": 5.2772, + "step": 7460 + }, + { + "epoch": 0.15187581380208334, + "grad_norm": 14.817307472229004, + "learning_rate": 9.965380325774882e-06, + "loss": 5.3688, + "step": 7465 + }, + { + "epoch": 0.1519775390625, + "grad_norm": 16.104671478271484, + "learning_rate": 9.965333358562975e-06, + "loss": 5.3896, + "step": 7470 + }, + { + "epoch": 0.15207926432291666, + "grad_norm": 21.359956741333008, + "learning_rate": 9.965286359624163e-06, + "loss": 5.3616, + "step": 7475 + }, + { + "epoch": 0.15218098958333334, + "grad_norm": 16.98548126220703, + "learning_rate": 9.965239328958748e-06, + "loss": 5.3411, + "step": 7480 + }, + { + "epoch": 0.15228271484375, + "grad_norm": 14.866292953491211, + "learning_rate": 9.96519226656703e-06, + "loss": 5.6011, + "step": 7485 + }, + { + "epoch": 0.15238444010416666, + "grad_norm": 14.449453353881836, + "learning_rate": 9.965145172449308e-06, + "loss": 5.4071, + "step": 7490 + }, + { + "epoch": 0.15248616536458334, + "grad_norm": 14.585965156555176, + "learning_rate": 9.965098046605886e-06, + "loss": 5.4145, + "step": 7495 + }, + { + "epoch": 0.152587890625, + "grad_norm": 18.32076644897461, + "learning_rate": 9.965050889037064e-06, + "loss": 5.4124, + "step": 7500 + }, + { + "epoch": 0.15268961588541666, + "grad_norm": 14.111846923828125, + "learning_rate": 9.96500369974314e-06, + "loss": 5.3219, + "step": 7505 + }, + { + "epoch": 0.15279134114583334, + "grad_norm": 14.138226509094238, + "learning_rate": 9.964956478724421e-06, + "loss": 5.6753, + "step": 7510 + }, + { + "epoch": 0.15289306640625, + "grad_norm": 16.99839973449707, + "learning_rate": 9.964909225981207e-06, + "loss": 5.2912, + "step": 7515 + }, + { + "epoch": 0.15299479166666666, + "grad_norm": 15.451153755187988, + "learning_rate": 9.964861941513797e-06, + "loss": 5.3801, + "step": 7520 + }, + { + "epoch": 0.15309651692708334, + "grad_norm": 13.800834655761719, + "learning_rate": 9.964814625322497e-06, + "loss": 5.7263, + "step": 7525 + }, + { + "epoch": 0.1531982421875, + "grad_norm": 15.150814056396484, + "learning_rate": 9.964767277407605e-06, + "loss": 5.4492, + "step": 7530 + }, + { + "epoch": 0.15329996744791666, + "grad_norm": 12.240714073181152, + "learning_rate": 9.964719897769428e-06, + "loss": 5.4645, + "step": 7535 + }, + { + "epoch": 0.15340169270833334, + "grad_norm": 17.42015266418457, + "learning_rate": 9.964672486408268e-06, + "loss": 5.6978, + "step": 7540 + }, + { + "epoch": 0.15350341796875, + "grad_norm": 13.480195045471191, + "learning_rate": 9.964625043324425e-06, + "loss": 5.2946, + "step": 7545 + }, + { + "epoch": 0.15360514322916666, + "grad_norm": 13.610269546508789, + "learning_rate": 9.964577568518204e-06, + "loss": 5.7954, + "step": 7550 + }, + { + "epoch": 0.15370686848958334, + "grad_norm": 16.62486457824707, + "learning_rate": 9.96453006198991e-06, + "loss": 6.009, + "step": 7555 + }, + { + "epoch": 0.15380859375, + "grad_norm": 14.744949340820312, + "learning_rate": 9.964482523739843e-06, + "loss": 5.483, + "step": 7560 + }, + { + "epoch": 0.15391031901041666, + "grad_norm": 14.184566497802734, + "learning_rate": 9.96443495376831e-06, + "loss": 5.4639, + "step": 7565 + }, + { + "epoch": 0.15401204427083334, + "grad_norm": 16.239887237548828, + "learning_rate": 9.964387352075613e-06, + "loss": 5.3273, + "step": 7570 + }, + { + "epoch": 0.15411376953125, + "grad_norm": 17.127864837646484, + "learning_rate": 9.964339718662055e-06, + "loss": 5.5321, + "step": 7575 + }, + { + "epoch": 0.15421549479166666, + "grad_norm": 17.66735076904297, + "learning_rate": 9.964292053527944e-06, + "loss": 5.6187, + "step": 7580 + }, + { + "epoch": 0.15431722005208334, + "grad_norm": 18.962635040283203, + "learning_rate": 9.964244356673582e-06, + "loss": 5.5934, + "step": 7585 + }, + { + "epoch": 0.1544189453125, + "grad_norm": 18.491323471069336, + "learning_rate": 9.964196628099273e-06, + "loss": 5.3098, + "step": 7590 + }, + { + "epoch": 0.15452067057291666, + "grad_norm": 18.759170532226562, + "learning_rate": 9.964148867805327e-06, + "loss": 5.3867, + "step": 7595 + }, + { + "epoch": 0.15462239583333334, + "grad_norm": 14.303768157958984, + "learning_rate": 9.964101075792043e-06, + "loss": 5.5821, + "step": 7600 + }, + { + "epoch": 0.15472412109375, + "grad_norm": 19.598833084106445, + "learning_rate": 9.96405325205973e-06, + "loss": 5.2279, + "step": 7605 + }, + { + "epoch": 0.15482584635416666, + "grad_norm": 23.001379013061523, + "learning_rate": 9.964005396608692e-06, + "loss": 5.3548, + "step": 7610 + }, + { + "epoch": 0.15492757161458334, + "grad_norm": 18.324913024902344, + "learning_rate": 9.963957509439235e-06, + "loss": 5.4646, + "step": 7615 + }, + { + "epoch": 0.155029296875, + "grad_norm": 24.995569229125977, + "learning_rate": 9.963909590551666e-06, + "loss": 5.114, + "step": 7620 + }, + { + "epoch": 0.15513102213541666, + "grad_norm": 13.065277099609375, + "learning_rate": 9.96386163994629e-06, + "loss": 5.3746, + "step": 7625 + }, + { + "epoch": 0.15523274739583334, + "grad_norm": 13.120121002197266, + "learning_rate": 9.963813657623414e-06, + "loss": 5.5626, + "step": 7630 + }, + { + "epoch": 0.15533447265625, + "grad_norm": 13.396736145019531, + "learning_rate": 9.963765643583345e-06, + "loss": 5.6168, + "step": 7635 + }, + { + "epoch": 0.15543619791666666, + "grad_norm": 20.836057662963867, + "learning_rate": 9.963717597826388e-06, + "loss": 5.3814, + "step": 7640 + }, + { + "epoch": 0.15553792317708334, + "grad_norm": 14.583788871765137, + "learning_rate": 9.963669520352852e-06, + "loss": 5.1818, + "step": 7645 + }, + { + "epoch": 0.1556396484375, + "grad_norm": 23.499282836914062, + "learning_rate": 9.963621411163045e-06, + "loss": 5.6251, + "step": 7650 + }, + { + "epoch": 0.15574137369791666, + "grad_norm": 12.874505996704102, + "learning_rate": 9.96357327025727e-06, + "loss": 5.2644, + "step": 7655 + }, + { + "epoch": 0.15584309895833334, + "grad_norm": 11.395499229431152, + "learning_rate": 9.963525097635838e-06, + "loss": 5.2446, + "step": 7660 + }, + { + "epoch": 0.15594482421875, + "grad_norm": 20.2305908203125, + "learning_rate": 9.963476893299057e-06, + "loss": 5.5975, + "step": 7665 + }, + { + "epoch": 0.15604654947916666, + "grad_norm": 13.89411735534668, + "learning_rate": 9.963428657247236e-06, + "loss": 5.4313, + "step": 7670 + }, + { + "epoch": 0.15614827473958334, + "grad_norm": 14.440144538879395, + "learning_rate": 9.96338038948068e-06, + "loss": 5.6838, + "step": 7675 + }, + { + "epoch": 0.15625, + "grad_norm": 17.812559127807617, + "learning_rate": 9.963332089999698e-06, + "loss": 5.5172, + "step": 7680 + }, + { + "epoch": 0.15635172526041666, + "grad_norm": 13.330060958862305, + "learning_rate": 9.9632837588046e-06, + "loss": 5.3878, + "step": 7685 + }, + { + "epoch": 0.15645345052083334, + "grad_norm": 18.896575927734375, + "learning_rate": 9.963235395895694e-06, + "loss": 5.5527, + "step": 7690 + }, + { + "epoch": 0.15655517578125, + "grad_norm": 22.92671775817871, + "learning_rate": 9.96318700127329e-06, + "loss": 5.5147, + "step": 7695 + }, + { + "epoch": 0.15665690104166666, + "grad_norm": 21.636869430541992, + "learning_rate": 9.963138574937696e-06, + "loss": 5.7949, + "step": 7700 + }, + { + "epoch": 0.15675862630208334, + "grad_norm": 15.990111351013184, + "learning_rate": 9.963090116889222e-06, + "loss": 5.4749, + "step": 7705 + }, + { + "epoch": 0.1568603515625, + "grad_norm": 13.518935203552246, + "learning_rate": 9.96304162712818e-06, + "loss": 5.3916, + "step": 7710 + }, + { + "epoch": 0.15696207682291666, + "grad_norm": 13.33862590789795, + "learning_rate": 9.962993105654875e-06, + "loss": 5.4765, + "step": 7715 + }, + { + "epoch": 0.15706380208333334, + "grad_norm": 13.473953247070312, + "learning_rate": 9.96294455246962e-06, + "loss": 5.529, + "step": 7720 + }, + { + "epoch": 0.15716552734375, + "grad_norm": 17.91921043395996, + "learning_rate": 9.962895967572726e-06, + "loss": 5.3228, + "step": 7725 + }, + { + "epoch": 0.15726725260416666, + "grad_norm": 16.5872745513916, + "learning_rate": 9.9628473509645e-06, + "loss": 5.2935, + "step": 7730 + }, + { + "epoch": 0.15736897786458334, + "grad_norm": 17.06328773498535, + "learning_rate": 9.962798702645255e-06, + "loss": 5.3788, + "step": 7735 + }, + { + "epoch": 0.157470703125, + "grad_norm": 12.566216468811035, + "learning_rate": 9.962750022615304e-06, + "loss": 5.5037, + "step": 7740 + }, + { + "epoch": 0.15757242838541666, + "grad_norm": 15.192073822021484, + "learning_rate": 9.962701310874954e-06, + "loss": 5.0896, + "step": 7745 + }, + { + "epoch": 0.15767415364583334, + "grad_norm": 17.962167739868164, + "learning_rate": 9.962652567424518e-06, + "loss": 5.5597, + "step": 7750 + }, + { + "epoch": 0.15777587890625, + "grad_norm": 13.610526084899902, + "learning_rate": 9.962603792264308e-06, + "loss": 5.5379, + "step": 7755 + }, + { + "epoch": 0.15787760416666666, + "grad_norm": 10.698001861572266, + "learning_rate": 9.962554985394635e-06, + "loss": 5.4257, + "step": 7760 + }, + { + "epoch": 0.15797932942708334, + "grad_norm": 15.023443222045898, + "learning_rate": 9.962506146815812e-06, + "loss": 5.2834, + "step": 7765 + }, + { + "epoch": 0.1580810546875, + "grad_norm": 20.01814842224121, + "learning_rate": 9.962457276528148e-06, + "loss": 5.5803, + "step": 7770 + }, + { + "epoch": 0.15818277994791666, + "grad_norm": 17.833189010620117, + "learning_rate": 9.962408374531958e-06, + "loss": 5.3543, + "step": 7775 + }, + { + "epoch": 0.15828450520833334, + "grad_norm": 15.730818748474121, + "learning_rate": 9.962359440827555e-06, + "loss": 5.4345, + "step": 7780 + }, + { + "epoch": 0.15838623046875, + "grad_norm": 17.75408935546875, + "learning_rate": 9.962310475415248e-06, + "loss": 5.4257, + "step": 7785 + }, + { + "epoch": 0.15848795572916666, + "grad_norm": 15.354164123535156, + "learning_rate": 9.962261478295354e-06, + "loss": 5.498, + "step": 7790 + }, + { + "epoch": 0.15858968098958334, + "grad_norm": 19.073076248168945, + "learning_rate": 9.962212449468185e-06, + "loss": 5.7157, + "step": 7795 + }, + { + "epoch": 0.15869140625, + "grad_norm": 16.185317993164062, + "learning_rate": 9.962163388934053e-06, + "loss": 5.5319, + "step": 7800 + }, + { + "epoch": 0.15879313151041666, + "grad_norm": 17.836156845092773, + "learning_rate": 9.96211429669327e-06, + "loss": 5.5306, + "step": 7805 + }, + { + "epoch": 0.15889485677083334, + "grad_norm": 15.592826843261719, + "learning_rate": 9.962065172746154e-06, + "loss": 5.3864, + "step": 7810 + }, + { + "epoch": 0.15899658203125, + "grad_norm": 22.363250732421875, + "learning_rate": 9.962016017093017e-06, + "loss": 5.4801, + "step": 7815 + }, + { + "epoch": 0.15909830729166666, + "grad_norm": 16.81391716003418, + "learning_rate": 9.961966829734174e-06, + "loss": 5.4448, + "step": 7820 + }, + { + "epoch": 0.15920003255208334, + "grad_norm": 28.233245849609375, + "learning_rate": 9.961917610669935e-06, + "loss": 5.3688, + "step": 7825 + }, + { + "epoch": 0.1593017578125, + "grad_norm": 19.332672119140625, + "learning_rate": 9.96186835990062e-06, + "loss": 5.3609, + "step": 7830 + }, + { + "epoch": 0.15940348307291666, + "grad_norm": 18.496122360229492, + "learning_rate": 9.961819077426538e-06, + "loss": 5.3385, + "step": 7835 + }, + { + "epoch": 0.15950520833333334, + "grad_norm": 19.318157196044922, + "learning_rate": 9.961769763248011e-06, + "loss": 5.3171, + "step": 7840 + }, + { + "epoch": 0.15960693359375, + "grad_norm": 22.900842666625977, + "learning_rate": 9.961720417365349e-06, + "loss": 5.4249, + "step": 7845 + }, + { + "epoch": 0.15970865885416666, + "grad_norm": 14.834782600402832, + "learning_rate": 9.961671039778867e-06, + "loss": 5.2471, + "step": 7850 + }, + { + "epoch": 0.15981038411458334, + "grad_norm": 18.033737182617188, + "learning_rate": 9.961621630488885e-06, + "loss": 5.4761, + "step": 7855 + }, + { + "epoch": 0.159912109375, + "grad_norm": 16.943105697631836, + "learning_rate": 9.961572189495714e-06, + "loss": 5.1229, + "step": 7860 + }, + { + "epoch": 0.16001383463541666, + "grad_norm": 17.431650161743164, + "learning_rate": 9.961522716799671e-06, + "loss": 5.2637, + "step": 7865 + }, + { + "epoch": 0.16011555989583334, + "grad_norm": 14.576851844787598, + "learning_rate": 9.961473212401074e-06, + "loss": 5.2454, + "step": 7870 + }, + { + "epoch": 0.16021728515625, + "grad_norm": 16.535715103149414, + "learning_rate": 9.961423676300238e-06, + "loss": 5.5684, + "step": 7875 + }, + { + "epoch": 0.16031901041666666, + "grad_norm": 12.88680362701416, + "learning_rate": 9.96137410849748e-06, + "loss": 5.4815, + "step": 7880 + }, + { + "epoch": 0.16042073567708334, + "grad_norm": 15.28410816192627, + "learning_rate": 9.961324508993116e-06, + "loss": 5.4188, + "step": 7885 + }, + { + "epoch": 0.1605224609375, + "grad_norm": 15.150443077087402, + "learning_rate": 9.961274877787463e-06, + "loss": 5.3567, + "step": 7890 + }, + { + "epoch": 0.16062418619791666, + "grad_norm": 19.053499221801758, + "learning_rate": 9.961225214880838e-06, + "loss": 5.5106, + "step": 7895 + }, + { + "epoch": 0.16072591145833334, + "grad_norm": 14.071982383728027, + "learning_rate": 9.961175520273561e-06, + "loss": 5.5705, + "step": 7900 + }, + { + "epoch": 0.16082763671875, + "grad_norm": 12.876595497131348, + "learning_rate": 9.961125793965945e-06, + "loss": 5.6554, + "step": 7905 + }, + { + "epoch": 0.16092936197916666, + "grad_norm": 15.619491577148438, + "learning_rate": 9.961076035958312e-06, + "loss": 5.3318, + "step": 7910 + }, + { + "epoch": 0.16103108723958334, + "grad_norm": 18.43037223815918, + "learning_rate": 9.961026246250976e-06, + "loss": 5.3952, + "step": 7915 + }, + { + "epoch": 0.1611328125, + "grad_norm": 13.561946868896484, + "learning_rate": 9.960976424844258e-06, + "loss": 5.3662, + "step": 7920 + }, + { + "epoch": 0.16123453776041666, + "grad_norm": 16.147296905517578, + "learning_rate": 9.960926571738477e-06, + "loss": 5.2762, + "step": 7925 + }, + { + "epoch": 0.16133626302083334, + "grad_norm": 15.561074256896973, + "learning_rate": 9.960876686933948e-06, + "loss": 5.2831, + "step": 7930 + }, + { + "epoch": 0.16143798828125, + "grad_norm": 17.4566593170166, + "learning_rate": 9.960826770430991e-06, + "loss": 5.2461, + "step": 7935 + }, + { + "epoch": 0.16153971354166666, + "grad_norm": 16.425676345825195, + "learning_rate": 9.96077682222993e-06, + "loss": 5.7153, + "step": 7940 + }, + { + "epoch": 0.16164143880208334, + "grad_norm": 22.95783042907715, + "learning_rate": 9.960726842331075e-06, + "loss": 5.5348, + "step": 7945 + }, + { + "epoch": 0.1617431640625, + "grad_norm": 14.267427444458008, + "learning_rate": 9.960676830734753e-06, + "loss": 5.6117, + "step": 7950 + }, + { + "epoch": 0.16184488932291666, + "grad_norm": 22.11170196533203, + "learning_rate": 9.96062678744128e-06, + "loss": 5.6627, + "step": 7955 + }, + { + "epoch": 0.16194661458333334, + "grad_norm": 15.9086275100708, + "learning_rate": 9.960576712450976e-06, + "loss": 5.5292, + "step": 7960 + }, + { + "epoch": 0.16204833984375, + "grad_norm": 17.472944259643555, + "learning_rate": 9.960526605764162e-06, + "loss": 5.393, + "step": 7965 + }, + { + "epoch": 0.16215006510416666, + "grad_norm": 17.657127380371094, + "learning_rate": 9.960476467381157e-06, + "loss": 5.2145, + "step": 7970 + }, + { + "epoch": 0.16225179036458334, + "grad_norm": 14.515355110168457, + "learning_rate": 9.960426297302284e-06, + "loss": 5.4796, + "step": 7975 + }, + { + "epoch": 0.162353515625, + "grad_norm": 10.162940979003906, + "learning_rate": 9.960376095527859e-06, + "loss": 5.6742, + "step": 7980 + }, + { + "epoch": 0.16245524088541666, + "grad_norm": 12.182851791381836, + "learning_rate": 9.960325862058207e-06, + "loss": 5.2555, + "step": 7985 + }, + { + "epoch": 0.16255696614583334, + "grad_norm": 13.090380668640137, + "learning_rate": 9.960275596893647e-06, + "loss": 5.2625, + "step": 7990 + }, + { + "epoch": 0.16265869140625, + "grad_norm": 21.13256072998047, + "learning_rate": 9.960225300034501e-06, + "loss": 5.5064, + "step": 7995 + }, + { + "epoch": 0.16276041666666666, + "grad_norm": 26.061588287353516, + "learning_rate": 9.960174971481088e-06, + "loss": 5.5555, + "step": 8000 + }, + { + "epoch": 0.16286214192708334, + "grad_norm": 27.924213409423828, + "learning_rate": 9.960124611233734e-06, + "loss": 5.5374, + "step": 8005 + }, + { + "epoch": 0.1629638671875, + "grad_norm": 23.20097541809082, + "learning_rate": 9.960074219292756e-06, + "loss": 5.6523, + "step": 8010 + }, + { + "epoch": 0.16306559244791666, + "grad_norm": 16.20667266845703, + "learning_rate": 9.96002379565848e-06, + "loss": 5.6982, + "step": 8015 + }, + { + "epoch": 0.16316731770833334, + "grad_norm": 13.150439262390137, + "learning_rate": 9.959973340331227e-06, + "loss": 5.5652, + "step": 8020 + }, + { + "epoch": 0.16326904296875, + "grad_norm": 18.28520393371582, + "learning_rate": 9.959922853311318e-06, + "loss": 5.6818, + "step": 8025 + }, + { + "epoch": 0.16337076822916666, + "grad_norm": 14.445357322692871, + "learning_rate": 9.959872334599076e-06, + "loss": 5.2062, + "step": 8030 + }, + { + "epoch": 0.16347249348958334, + "grad_norm": 17.10443115234375, + "learning_rate": 9.959821784194825e-06, + "loss": 5.3037, + "step": 8035 + }, + { + "epoch": 0.16357421875, + "grad_norm": 51.1016845703125, + "learning_rate": 9.959771202098886e-06, + "loss": 5.9963, + "step": 8040 + }, + { + "epoch": 0.16367594401041666, + "grad_norm": 15.426451683044434, + "learning_rate": 9.959720588311586e-06, + "loss": 5.5658, + "step": 8045 + }, + { + "epoch": 0.16377766927083334, + "grad_norm": 12.833091735839844, + "learning_rate": 9.959669942833243e-06, + "loss": 5.4248, + "step": 8050 + }, + { + "epoch": 0.16387939453125, + "grad_norm": 13.870063781738281, + "learning_rate": 9.959619265664185e-06, + "loss": 5.7107, + "step": 8055 + }, + { + "epoch": 0.16398111979166666, + "grad_norm": 16.905141830444336, + "learning_rate": 9.959568556804734e-06, + "loss": 5.3804, + "step": 8060 + }, + { + "epoch": 0.16408284505208334, + "grad_norm": 12.68128776550293, + "learning_rate": 9.959517816255214e-06, + "loss": 5.5863, + "step": 8065 + }, + { + "epoch": 0.1641845703125, + "grad_norm": 18.117958068847656, + "learning_rate": 9.95946704401595e-06, + "loss": 5.526, + "step": 8070 + }, + { + "epoch": 0.16428629557291666, + "grad_norm": 15.613215446472168, + "learning_rate": 9.959416240087266e-06, + "loss": 5.4173, + "step": 8075 + }, + { + "epoch": 0.16438802083333334, + "grad_norm": 13.818394660949707, + "learning_rate": 9.959365404469485e-06, + "loss": 5.3882, + "step": 8080 + }, + { + "epoch": 0.16448974609375, + "grad_norm": 17.056739807128906, + "learning_rate": 9.959314537162936e-06, + "loss": 5.2342, + "step": 8085 + }, + { + "epoch": 0.16459147135416666, + "grad_norm": 13.645252227783203, + "learning_rate": 9.95926363816794e-06, + "loss": 5.6395, + "step": 8090 + }, + { + "epoch": 0.16469319661458334, + "grad_norm": 16.581581115722656, + "learning_rate": 9.959212707484824e-06, + "loss": 5.2926, + "step": 8095 + }, + { + "epoch": 0.164794921875, + "grad_norm": 12.899117469787598, + "learning_rate": 9.959161745113912e-06, + "loss": 5.2468, + "step": 8100 + }, + { + "epoch": 0.16489664713541666, + "grad_norm": 13.944337844848633, + "learning_rate": 9.959110751055532e-06, + "loss": 5.4745, + "step": 8105 + }, + { + "epoch": 0.16499837239583334, + "grad_norm": 13.131104469299316, + "learning_rate": 9.959059725310007e-06, + "loss": 5.5969, + "step": 8110 + }, + { + "epoch": 0.16510009765625, + "grad_norm": 21.75611686706543, + "learning_rate": 9.959008667877665e-06, + "loss": 5.3178, + "step": 8115 + }, + { + "epoch": 0.16520182291666666, + "grad_norm": 16.92811393737793, + "learning_rate": 9.958957578758834e-06, + "loss": 5.3819, + "step": 8120 + }, + { + "epoch": 0.16530354817708334, + "grad_norm": 19.316097259521484, + "learning_rate": 9.958906457953837e-06, + "loss": 5.2368, + "step": 8125 + }, + { + "epoch": 0.1654052734375, + "grad_norm": 11.703134536743164, + "learning_rate": 9.958855305463e-06, + "loss": 5.2665, + "step": 8130 + }, + { + "epoch": 0.16550699869791666, + "grad_norm": 19.508386611938477, + "learning_rate": 9.958804121286654e-06, + "loss": 5.4284, + "step": 8135 + }, + { + "epoch": 0.16560872395833334, + "grad_norm": 19.106143951416016, + "learning_rate": 9.958752905425122e-06, + "loss": 5.3023, + "step": 8140 + }, + { + "epoch": 0.16571044921875, + "grad_norm": 23.050426483154297, + "learning_rate": 9.958701657878736e-06, + "loss": 5.7215, + "step": 8145 + }, + { + "epoch": 0.16581217447916666, + "grad_norm": 20.58616828918457, + "learning_rate": 9.958650378647819e-06, + "loss": 5.4115, + "step": 8150 + }, + { + "epoch": 0.16591389973958334, + "grad_norm": 17.18950653076172, + "learning_rate": 9.958599067732699e-06, + "loss": 5.3721, + "step": 8155 + }, + { + "epoch": 0.166015625, + "grad_norm": 14.587106704711914, + "learning_rate": 9.958547725133705e-06, + "loss": 5.1369, + "step": 8160 + }, + { + "epoch": 0.16611735026041666, + "grad_norm": 14.987319946289062, + "learning_rate": 9.958496350851167e-06, + "loss": 5.334, + "step": 8165 + }, + { + "epoch": 0.16621907552083334, + "grad_norm": 12.0374116897583, + "learning_rate": 9.95844494488541e-06, + "loss": 5.2364, + "step": 8170 + }, + { + "epoch": 0.16632080078125, + "grad_norm": 15.58442497253418, + "learning_rate": 9.958393507236763e-06, + "loss": 5.356, + "step": 8175 + }, + { + "epoch": 0.16642252604166666, + "grad_norm": 13.709733009338379, + "learning_rate": 9.958342037905557e-06, + "loss": 5.3994, + "step": 8180 + }, + { + "epoch": 0.16652425130208334, + "grad_norm": 16.199438095092773, + "learning_rate": 9.958290536892118e-06, + "loss": 5.4076, + "step": 8185 + }, + { + "epoch": 0.1666259765625, + "grad_norm": 17.31839370727539, + "learning_rate": 9.95823900419678e-06, + "loss": 5.4423, + "step": 8190 + }, + { + "epoch": 0.16672770182291666, + "grad_norm": 14.857915878295898, + "learning_rate": 9.958187439819866e-06, + "loss": 5.3668, + "step": 8195 + }, + { + "epoch": 0.16682942708333334, + "grad_norm": 19.54537582397461, + "learning_rate": 9.958135843761707e-06, + "loss": 5.5157, + "step": 8200 + }, + { + "epoch": 0.16693115234375, + "grad_norm": 14.58091926574707, + "learning_rate": 9.958084216022636e-06, + "loss": 5.5727, + "step": 8205 + }, + { + "epoch": 0.16703287760416666, + "grad_norm": 14.54755687713623, + "learning_rate": 9.95803255660298e-06, + "loss": 5.4015, + "step": 8210 + }, + { + "epoch": 0.16713460286458334, + "grad_norm": 17.593050003051758, + "learning_rate": 9.95798086550307e-06, + "loss": 5.4932, + "step": 8215 + }, + { + "epoch": 0.167236328125, + "grad_norm": 16.06844711303711, + "learning_rate": 9.957929142723235e-06, + "loss": 5.1757, + "step": 8220 + }, + { + "epoch": 0.16733805338541666, + "grad_norm": 18.102815628051758, + "learning_rate": 9.957877388263808e-06, + "loss": 5.5898, + "step": 8225 + }, + { + "epoch": 0.16743977864583334, + "grad_norm": 14.780433654785156, + "learning_rate": 9.957825602125117e-06, + "loss": 5.5489, + "step": 8230 + }, + { + "epoch": 0.16754150390625, + "grad_norm": 18.98553466796875, + "learning_rate": 9.957773784307495e-06, + "loss": 5.1887, + "step": 8235 + }, + { + "epoch": 0.16764322916666666, + "grad_norm": 14.679461479187012, + "learning_rate": 9.957721934811273e-06, + "loss": 5.5256, + "step": 8240 + }, + { + "epoch": 0.16774495442708334, + "grad_norm": 17.569597244262695, + "learning_rate": 9.957670053636782e-06, + "loss": 5.4591, + "step": 8245 + }, + { + "epoch": 0.1678466796875, + "grad_norm": 18.631494522094727, + "learning_rate": 9.957618140784352e-06, + "loss": 5.6789, + "step": 8250 + }, + { + "epoch": 0.16794840494791666, + "grad_norm": 16.574783325195312, + "learning_rate": 9.957566196254317e-06, + "loss": 5.3332, + "step": 8255 + }, + { + "epoch": 0.16805013020833334, + "grad_norm": 14.363646507263184, + "learning_rate": 9.957514220047007e-06, + "loss": 5.4732, + "step": 8260 + }, + { + "epoch": 0.16815185546875, + "grad_norm": 40.6783447265625, + "learning_rate": 9.957462212162755e-06, + "loss": 5.5801, + "step": 8265 + }, + { + "epoch": 0.16825358072916666, + "grad_norm": 14.171721458435059, + "learning_rate": 9.957410172601892e-06, + "loss": 5.4373, + "step": 8270 + }, + { + "epoch": 0.16835530598958334, + "grad_norm": 18.414405822753906, + "learning_rate": 9.957358101364752e-06, + "loss": 5.2619, + "step": 8275 + }, + { + "epoch": 0.16845703125, + "grad_norm": 15.946203231811523, + "learning_rate": 9.957305998451668e-06, + "loss": 5.652, + "step": 8280 + }, + { + "epoch": 0.16855875651041666, + "grad_norm": 15.31955337524414, + "learning_rate": 9.957253863862972e-06, + "loss": 5.4005, + "step": 8285 + }, + { + "epoch": 0.16866048177083334, + "grad_norm": 13.980257034301758, + "learning_rate": 9.957201697598998e-06, + "loss": 5.3762, + "step": 8290 + }, + { + "epoch": 0.16876220703125, + "grad_norm": 14.772516250610352, + "learning_rate": 9.95714949966008e-06, + "loss": 5.3593, + "step": 8295 + }, + { + "epoch": 0.16886393229166666, + "grad_norm": 16.23666763305664, + "learning_rate": 9.957097270046547e-06, + "loss": 5.248, + "step": 8300 + }, + { + "epoch": 0.16896565755208334, + "grad_norm": 14.796059608459473, + "learning_rate": 9.957045008758739e-06, + "loss": 5.4349, + "step": 8305 + }, + { + "epoch": 0.1690673828125, + "grad_norm": 13.858386039733887, + "learning_rate": 9.956992715796986e-06, + "loss": 5.5017, + "step": 8310 + }, + { + "epoch": 0.16916910807291666, + "grad_norm": 18.983840942382812, + "learning_rate": 9.956940391161623e-06, + "loss": 5.3587, + "step": 8315 + }, + { + "epoch": 0.16927083333333334, + "grad_norm": 14.075675010681152, + "learning_rate": 9.956888034852983e-06, + "loss": 5.4648, + "step": 8320 + }, + { + "epoch": 0.16937255859375, + "grad_norm": 17.552783966064453, + "learning_rate": 9.956835646871405e-06, + "loss": 5.3834, + "step": 8325 + }, + { + "epoch": 0.16947428385416666, + "grad_norm": 13.93071460723877, + "learning_rate": 9.95678322721722e-06, + "loss": 5.5004, + "step": 8330 + }, + { + "epoch": 0.16957600911458334, + "grad_norm": 14.731468200683594, + "learning_rate": 9.956730775890761e-06, + "loss": 5.3732, + "step": 8335 + }, + { + "epoch": 0.169677734375, + "grad_norm": 14.285768508911133, + "learning_rate": 9.956678292892367e-06, + "loss": 5.4442, + "step": 8340 + }, + { + "epoch": 0.16977945963541666, + "grad_norm": 18.91328239440918, + "learning_rate": 9.956625778222375e-06, + "loss": 5.5959, + "step": 8345 + }, + { + "epoch": 0.16988118489583334, + "grad_norm": 14.529850006103516, + "learning_rate": 9.956573231881116e-06, + "loss": 5.48, + "step": 8350 + }, + { + "epoch": 0.16998291015625, + "grad_norm": 14.315996170043945, + "learning_rate": 9.956520653868925e-06, + "loss": 5.4337, + "step": 8355 + }, + { + "epoch": 0.17008463541666666, + "grad_norm": 16.356773376464844, + "learning_rate": 9.956468044186142e-06, + "loss": 5.4327, + "step": 8360 + }, + { + "epoch": 0.17018636067708334, + "grad_norm": 12.063360214233398, + "learning_rate": 9.956415402833102e-06, + "loss": 5.1706, + "step": 8365 + }, + { + "epoch": 0.1702880859375, + "grad_norm": 16.22122573852539, + "learning_rate": 9.956362729810141e-06, + "loss": 5.5508, + "step": 8370 + }, + { + "epoch": 0.17038981119791666, + "grad_norm": 16.45595932006836, + "learning_rate": 9.956310025117597e-06, + "loss": 5.3652, + "step": 8375 + }, + { + "epoch": 0.17049153645833334, + "grad_norm": 19.070093154907227, + "learning_rate": 9.956257288755802e-06, + "loss": 5.328, + "step": 8380 + }, + { + "epoch": 0.17059326171875, + "grad_norm": 14.600157737731934, + "learning_rate": 9.956204520725099e-06, + "loss": 5.3801, + "step": 8385 + }, + { + "epoch": 0.17069498697916666, + "grad_norm": 16.93374252319336, + "learning_rate": 9.956151721025821e-06, + "loss": 5.5063, + "step": 8390 + }, + { + "epoch": 0.17079671223958334, + "grad_norm": 19.353239059448242, + "learning_rate": 9.956098889658307e-06, + "loss": 5.6082, + "step": 8395 + }, + { + "epoch": 0.1708984375, + "grad_norm": 15.046883583068848, + "learning_rate": 9.956046026622896e-06, + "loss": 5.3141, + "step": 8400 + }, + { + "epoch": 0.17100016276041666, + "grad_norm": 16.827709197998047, + "learning_rate": 9.955993131919922e-06, + "loss": 5.4586, + "step": 8405 + }, + { + "epoch": 0.17110188802083334, + "grad_norm": 24.00628662109375, + "learning_rate": 9.955940205549726e-06, + "loss": 5.6077, + "step": 8410 + }, + { + "epoch": 0.17120361328125, + "grad_norm": 14.724183082580566, + "learning_rate": 9.955887247512646e-06, + "loss": 5.5041, + "step": 8415 + }, + { + "epoch": 0.17130533854166666, + "grad_norm": 15.395419120788574, + "learning_rate": 9.955834257809018e-06, + "loss": 5.367, + "step": 8420 + }, + { + "epoch": 0.17140706380208334, + "grad_norm": 19.40331268310547, + "learning_rate": 9.955781236439183e-06, + "loss": 5.3833, + "step": 8425 + }, + { + "epoch": 0.1715087890625, + "grad_norm": 16.159231185913086, + "learning_rate": 9.955728183403479e-06, + "loss": 5.4286, + "step": 8430 + }, + { + "epoch": 0.17161051432291666, + "grad_norm": 15.030744552612305, + "learning_rate": 9.955675098702247e-06, + "loss": 5.3254, + "step": 8435 + }, + { + "epoch": 0.17171223958333334, + "grad_norm": 16.3015079498291, + "learning_rate": 9.955621982335821e-06, + "loss": 5.2941, + "step": 8440 + }, + { + "epoch": 0.17181396484375, + "grad_norm": 16.556209564208984, + "learning_rate": 9.955568834304545e-06, + "loss": 5.4701, + "step": 8445 + }, + { + "epoch": 0.17191569010416666, + "grad_norm": 15.8231782913208, + "learning_rate": 9.955515654608758e-06, + "loss": 5.2947, + "step": 8450 + }, + { + "epoch": 0.17201741536458334, + "grad_norm": 18.567855834960938, + "learning_rate": 9.955462443248798e-06, + "loss": 5.3919, + "step": 8455 + }, + { + "epoch": 0.172119140625, + "grad_norm": 16.441137313842773, + "learning_rate": 9.955409200225008e-06, + "loss": 5.1436, + "step": 8460 + }, + { + "epoch": 0.17222086588541666, + "grad_norm": 13.08769416809082, + "learning_rate": 9.955355925537724e-06, + "loss": 5.3758, + "step": 8465 + }, + { + "epoch": 0.17232259114583334, + "grad_norm": 21.02294921875, + "learning_rate": 9.95530261918729e-06, + "loss": 5.2792, + "step": 8470 + }, + { + "epoch": 0.17242431640625, + "grad_norm": 13.485523223876953, + "learning_rate": 9.955249281174045e-06, + "loss": 5.1047, + "step": 8475 + }, + { + "epoch": 0.17252604166666666, + "grad_norm": 14.849356651306152, + "learning_rate": 9.95519591149833e-06, + "loss": 5.658, + "step": 8480 + }, + { + "epoch": 0.17262776692708334, + "grad_norm": 15.555683135986328, + "learning_rate": 9.955142510160487e-06, + "loss": 5.5273, + "step": 8485 + }, + { + "epoch": 0.1727294921875, + "grad_norm": 15.620842933654785, + "learning_rate": 9.955089077160855e-06, + "loss": 5.4008, + "step": 8490 + }, + { + "epoch": 0.17283121744791666, + "grad_norm": 13.213794708251953, + "learning_rate": 9.955035612499778e-06, + "loss": 5.5104, + "step": 8495 + }, + { + "epoch": 0.17293294270833334, + "grad_norm": 15.904237747192383, + "learning_rate": 9.954982116177593e-06, + "loss": 5.4742, + "step": 8500 + }, + { + "epoch": 0.17303466796875, + "grad_norm": 14.531661033630371, + "learning_rate": 9.95492858819465e-06, + "loss": 5.5641, + "step": 8505 + }, + { + "epoch": 0.17313639322916666, + "grad_norm": 12.28598690032959, + "learning_rate": 9.954875028551284e-06, + "loss": 5.1765, + "step": 8510 + }, + { + "epoch": 0.17323811848958334, + "grad_norm": 15.1087064743042, + "learning_rate": 9.954821437247839e-06, + "loss": 5.7861, + "step": 8515 + }, + { + "epoch": 0.17333984375, + "grad_norm": 14.995052337646484, + "learning_rate": 9.954767814284658e-06, + "loss": 5.3294, + "step": 8520 + }, + { + "epoch": 0.17344156901041666, + "grad_norm": 15.97739028930664, + "learning_rate": 9.954714159662085e-06, + "loss": 5.271, + "step": 8525 + }, + { + "epoch": 0.17354329427083334, + "grad_norm": 18.555334091186523, + "learning_rate": 9.95466047338046e-06, + "loss": 5.4245, + "step": 8530 + }, + { + "epoch": 0.17364501953125, + "grad_norm": 15.530320167541504, + "learning_rate": 9.954606755440126e-06, + "loss": 5.2288, + "step": 8535 + }, + { + "epoch": 0.17374674479166666, + "grad_norm": 15.567760467529297, + "learning_rate": 9.95455300584143e-06, + "loss": 5.1533, + "step": 8540 + }, + { + "epoch": 0.17384847005208334, + "grad_norm": 17.194860458374023, + "learning_rate": 9.954499224584713e-06, + "loss": 5.265, + "step": 8545 + }, + { + "epoch": 0.1739501953125, + "grad_norm": 16.46260643005371, + "learning_rate": 9.954445411670317e-06, + "loss": 5.313, + "step": 8550 + }, + { + "epoch": 0.17405192057291666, + "grad_norm": 16.36751937866211, + "learning_rate": 9.95439156709859e-06, + "loss": 5.3483, + "step": 8555 + }, + { + "epoch": 0.17415364583333334, + "grad_norm": 16.211471557617188, + "learning_rate": 9.95433769086987e-06, + "loss": 5.2678, + "step": 8560 + }, + { + "epoch": 0.17425537109375, + "grad_norm": 18.474300384521484, + "learning_rate": 9.95428378298451e-06, + "loss": 5.3744, + "step": 8565 + }, + { + "epoch": 0.17435709635416666, + "grad_norm": 15.385424613952637, + "learning_rate": 9.954229843442844e-06, + "loss": 5.4728, + "step": 8570 + }, + { + "epoch": 0.17445882161458334, + "grad_norm": 14.549713134765625, + "learning_rate": 9.954175872245225e-06, + "loss": 5.3141, + "step": 8575 + }, + { + "epoch": 0.174560546875, + "grad_norm": 15.80581283569336, + "learning_rate": 9.954121869391996e-06, + "loss": 5.3517, + "step": 8580 + }, + { + "epoch": 0.17466227213541666, + "grad_norm": 15.878275871276855, + "learning_rate": 9.954067834883498e-06, + "loss": 5.4083, + "step": 8585 + }, + { + "epoch": 0.17476399739583334, + "grad_norm": 15.276546478271484, + "learning_rate": 9.95401376872008e-06, + "loss": 5.1828, + "step": 8590 + }, + { + "epoch": 0.17486572265625, + "grad_norm": 22.125411987304688, + "learning_rate": 9.953959670902086e-06, + "loss": 5.4696, + "step": 8595 + }, + { + "epoch": 0.17496744791666666, + "grad_norm": 18.702545166015625, + "learning_rate": 9.953905541429865e-06, + "loss": 5.329, + "step": 8600 + }, + { + "epoch": 0.17506917317708334, + "grad_norm": 14.835700035095215, + "learning_rate": 9.953851380303757e-06, + "loss": 5.522, + "step": 8605 + }, + { + "epoch": 0.1751708984375, + "grad_norm": 16.357532501220703, + "learning_rate": 9.953797187524114e-06, + "loss": 5.2869, + "step": 8610 + }, + { + "epoch": 0.17527262369791666, + "grad_norm": 17.679996490478516, + "learning_rate": 9.953742963091278e-06, + "loss": 5.5339, + "step": 8615 + }, + { + "epoch": 0.17537434895833334, + "grad_norm": 16.744508743286133, + "learning_rate": 9.953688707005597e-06, + "loss": 5.4981, + "step": 8620 + }, + { + "epoch": 0.17547607421875, + "grad_norm": 14.133514404296875, + "learning_rate": 9.953634419267418e-06, + "loss": 5.4609, + "step": 8625 + }, + { + "epoch": 0.17557779947916666, + "grad_norm": 20.516807556152344, + "learning_rate": 9.953580099877086e-06, + "loss": 5.2866, + "step": 8630 + }, + { + "epoch": 0.17567952473958334, + "grad_norm": 15.374407768249512, + "learning_rate": 9.953525748834951e-06, + "loss": 5.4612, + "step": 8635 + }, + { + "epoch": 0.17578125, + "grad_norm": 22.150360107421875, + "learning_rate": 9.953471366141362e-06, + "loss": 5.6591, + "step": 8640 + }, + { + "epoch": 0.17588297526041666, + "grad_norm": 13.954136848449707, + "learning_rate": 9.953416951796658e-06, + "loss": 5.6071, + "step": 8645 + }, + { + "epoch": 0.17598470052083334, + "grad_norm": 15.530481338500977, + "learning_rate": 9.953362505801196e-06, + "loss": 5.429, + "step": 8650 + }, + { + "epoch": 0.17608642578125, + "grad_norm": 18.69144630432129, + "learning_rate": 9.953308028155318e-06, + "loss": 5.4182, + "step": 8655 + }, + { + "epoch": 0.17618815104166666, + "grad_norm": 17.578035354614258, + "learning_rate": 9.953253518859374e-06, + "loss": 5.5357, + "step": 8660 + }, + { + "epoch": 0.17628987630208334, + "grad_norm": 13.799113273620605, + "learning_rate": 9.953198977913713e-06, + "loss": 5.1672, + "step": 8665 + }, + { + "epoch": 0.1763916015625, + "grad_norm": 14.622766494750977, + "learning_rate": 9.953144405318684e-06, + "loss": 5.5813, + "step": 8670 + }, + { + "epoch": 0.17649332682291666, + "grad_norm": 14.150805473327637, + "learning_rate": 9.953089801074633e-06, + "loss": 5.0491, + "step": 8675 + }, + { + "epoch": 0.17659505208333334, + "grad_norm": 14.98856258392334, + "learning_rate": 9.95303516518191e-06, + "loss": 5.3215, + "step": 8680 + }, + { + "epoch": 0.17669677734375, + "grad_norm": 21.016042709350586, + "learning_rate": 9.952980497640867e-06, + "loss": 5.6324, + "step": 8685 + }, + { + "epoch": 0.17679850260416666, + "grad_norm": 13.526346206665039, + "learning_rate": 9.95292579845185e-06, + "loss": 5.428, + "step": 8690 + }, + { + "epoch": 0.17690022786458334, + "grad_norm": 16.16767692565918, + "learning_rate": 9.952871067615208e-06, + "loss": 5.6225, + "step": 8695 + }, + { + "epoch": 0.177001953125, + "grad_norm": 13.20745849609375, + "learning_rate": 9.952816305131293e-06, + "loss": 5.5478, + "step": 8700 + }, + { + "epoch": 0.17710367838541666, + "grad_norm": 15.945897102355957, + "learning_rate": 9.952761511000453e-06, + "loss": 5.4449, + "step": 8705 + }, + { + "epoch": 0.17720540364583334, + "grad_norm": 16.72264862060547, + "learning_rate": 9.95270668522304e-06, + "loss": 5.3405, + "step": 8710 + }, + { + "epoch": 0.17730712890625, + "grad_norm": 19.20711898803711, + "learning_rate": 9.952651827799404e-06, + "loss": 5.491, + "step": 8715 + }, + { + "epoch": 0.17740885416666666, + "grad_norm": 21.282682418823242, + "learning_rate": 9.952596938729894e-06, + "loss": 5.6459, + "step": 8720 + }, + { + "epoch": 0.17751057942708334, + "grad_norm": 16.85504913330078, + "learning_rate": 9.952542018014864e-06, + "loss": 5.3849, + "step": 8725 + }, + { + "epoch": 0.1776123046875, + "grad_norm": 17.892139434814453, + "learning_rate": 9.952487065654662e-06, + "loss": 5.3712, + "step": 8730 + }, + { + "epoch": 0.17771402994791666, + "grad_norm": 13.817110061645508, + "learning_rate": 9.952432081649637e-06, + "loss": 5.5764, + "step": 8735 + }, + { + "epoch": 0.17781575520833334, + "grad_norm": 18.57796859741211, + "learning_rate": 9.952377066000145e-06, + "loss": 5.4026, + "step": 8740 + }, + { + "epoch": 0.17791748046875, + "grad_norm": 14.945940017700195, + "learning_rate": 9.952322018706536e-06, + "loss": 5.433, + "step": 8745 + }, + { + "epoch": 0.17801920572916666, + "grad_norm": 14.414974212646484, + "learning_rate": 9.952266939769162e-06, + "loss": 5.5614, + "step": 8750 + }, + { + "epoch": 0.17812093098958334, + "grad_norm": 13.245692253112793, + "learning_rate": 9.952211829188371e-06, + "loss": 5.1534, + "step": 8755 + }, + { + "epoch": 0.17822265625, + "grad_norm": 15.520194053649902, + "learning_rate": 9.952156686964522e-06, + "loss": 5.5664, + "step": 8760 + }, + { + "epoch": 0.17832438151041666, + "grad_norm": 17.57768440246582, + "learning_rate": 9.952101513097962e-06, + "loss": 5.4355, + "step": 8765 + }, + { + "epoch": 0.17842610677083334, + "grad_norm": 12.347573280334473, + "learning_rate": 9.952046307589047e-06, + "loss": 5.259, + "step": 8770 + }, + { + "epoch": 0.17852783203125, + "grad_norm": 16.368383407592773, + "learning_rate": 9.951991070438128e-06, + "loss": 5.37, + "step": 8775 + }, + { + "epoch": 0.17862955729166666, + "grad_norm": 17.44694709777832, + "learning_rate": 9.951935801645555e-06, + "loss": 5.3863, + "step": 8780 + }, + { + "epoch": 0.17873128255208334, + "grad_norm": 13.21349811553955, + "learning_rate": 9.951880501211688e-06, + "loss": 5.388, + "step": 8785 + }, + { + "epoch": 0.1788330078125, + "grad_norm": 14.685022354125977, + "learning_rate": 9.951825169136872e-06, + "loss": 5.5797, + "step": 8790 + }, + { + "epoch": 0.17893473307291666, + "grad_norm": 13.826556205749512, + "learning_rate": 9.95176980542147e-06, + "loss": 5.2076, + "step": 8795 + }, + { + "epoch": 0.17903645833333334, + "grad_norm": 21.548553466796875, + "learning_rate": 9.951714410065827e-06, + "loss": 5.3594, + "step": 8800 + }, + { + "epoch": 0.17913818359375, + "grad_norm": 16.221927642822266, + "learning_rate": 9.951658983070303e-06, + "loss": 5.357, + "step": 8805 + }, + { + "epoch": 0.17923990885416666, + "grad_norm": 16.363384246826172, + "learning_rate": 9.95160352443525e-06, + "loss": 5.5792, + "step": 8810 + }, + { + "epoch": 0.17934163411458334, + "grad_norm": 23.10530662536621, + "learning_rate": 9.951548034161021e-06, + "loss": 5.3494, + "step": 8815 + }, + { + "epoch": 0.179443359375, + "grad_norm": 19.19097137451172, + "learning_rate": 9.951492512247973e-06, + "loss": 5.5225, + "step": 8820 + }, + { + "epoch": 0.17954508463541666, + "grad_norm": 18.110355377197266, + "learning_rate": 9.95143695869646e-06, + "loss": 5.6129, + "step": 8825 + }, + { + "epoch": 0.17964680989583334, + "grad_norm": 13.900396347045898, + "learning_rate": 9.951381373506836e-06, + "loss": 5.4082, + "step": 8830 + }, + { + "epoch": 0.17974853515625, + "grad_norm": 11.688215255737305, + "learning_rate": 9.951325756679456e-06, + "loss": 5.3138, + "step": 8835 + }, + { + "epoch": 0.17985026041666666, + "grad_norm": 14.189027786254883, + "learning_rate": 9.951270108214677e-06, + "loss": 5.2913, + "step": 8840 + }, + { + "epoch": 0.17995198567708334, + "grad_norm": 15.215627670288086, + "learning_rate": 9.951214428112854e-06, + "loss": 5.333, + "step": 8845 + }, + { + "epoch": 0.1800537109375, + "grad_norm": 15.206294059753418, + "learning_rate": 9.951158716374343e-06, + "loss": 5.5849, + "step": 8850 + }, + { + "epoch": 0.18015543619791666, + "grad_norm": 19.93958282470703, + "learning_rate": 9.951102972999499e-06, + "loss": 5.3784, + "step": 8855 + }, + { + "epoch": 0.18025716145833334, + "grad_norm": 17.308563232421875, + "learning_rate": 9.951047197988676e-06, + "loss": 5.4703, + "step": 8860 + }, + { + "epoch": 0.18035888671875, + "grad_norm": 19.03771209716797, + "learning_rate": 9.950991391342238e-06, + "loss": 5.161, + "step": 8865 + }, + { + "epoch": 0.18046061197916666, + "grad_norm": 16.580636978149414, + "learning_rate": 9.950935553060533e-06, + "loss": 5.1527, + "step": 8870 + }, + { + "epoch": 0.18056233723958334, + "grad_norm": 16.991125106811523, + "learning_rate": 9.950879683143924e-06, + "loss": 5.3572, + "step": 8875 + }, + { + "epoch": 0.1806640625, + "grad_norm": 16.58769989013672, + "learning_rate": 9.950823781592763e-06, + "loss": 5.3505, + "step": 8880 + }, + { + "epoch": 0.18076578776041666, + "grad_norm": 19.51580047607422, + "learning_rate": 9.950767848407412e-06, + "loss": 5.5668, + "step": 8885 + }, + { + "epoch": 0.18086751302083334, + "grad_norm": 17.001262664794922, + "learning_rate": 9.950711883588224e-06, + "loss": 5.4144, + "step": 8890 + }, + { + "epoch": 0.18096923828125, + "grad_norm": 19.698774337768555, + "learning_rate": 9.95065588713556e-06, + "loss": 5.3962, + "step": 8895 + }, + { + "epoch": 0.18107096354166666, + "grad_norm": 14.345602035522461, + "learning_rate": 9.950599859049775e-06, + "loss": 5.4379, + "step": 8900 + }, + { + "epoch": 0.18117268880208334, + "grad_norm": 21.215538024902344, + "learning_rate": 9.95054379933123e-06, + "loss": 5.6026, + "step": 8905 + }, + { + "epoch": 0.1812744140625, + "grad_norm": 17.69208526611328, + "learning_rate": 9.95048770798028e-06, + "loss": 5.4632, + "step": 8910 + }, + { + "epoch": 0.18137613932291666, + "grad_norm": 15.134289741516113, + "learning_rate": 9.950431584997286e-06, + "loss": 5.2793, + "step": 8915 + }, + { + "epoch": 0.18147786458333334, + "grad_norm": 14.81847858428955, + "learning_rate": 9.950375430382606e-06, + "loss": 5.7186, + "step": 8920 + }, + { + "epoch": 0.18157958984375, + "grad_norm": 15.903595924377441, + "learning_rate": 9.950319244136597e-06, + "loss": 5.3161, + "step": 8925 + }, + { + "epoch": 0.18168131510416666, + "grad_norm": 15.681243896484375, + "learning_rate": 9.950263026259621e-06, + "loss": 5.5515, + "step": 8930 + }, + { + "epoch": 0.18178304036458334, + "grad_norm": 12.595696449279785, + "learning_rate": 9.950206776752035e-06, + "loss": 5.8083, + "step": 8935 + }, + { + "epoch": 0.181884765625, + "grad_norm": 15.336480140686035, + "learning_rate": 9.950150495614199e-06, + "loss": 5.5118, + "step": 8940 + }, + { + "epoch": 0.18198649088541666, + "grad_norm": 15.388830184936523, + "learning_rate": 9.950094182846473e-06, + "loss": 5.4497, + "step": 8945 + }, + { + "epoch": 0.18208821614583334, + "grad_norm": 16.5172061920166, + "learning_rate": 9.950037838449216e-06, + "loss": 5.2446, + "step": 8950 + }, + { + "epoch": 0.18218994140625, + "grad_norm": 17.045618057250977, + "learning_rate": 9.94998146242279e-06, + "loss": 5.5317, + "step": 8955 + }, + { + "epoch": 0.18229166666666666, + "grad_norm": 17.141271591186523, + "learning_rate": 9.949925054767552e-06, + "loss": 5.0933, + "step": 8960 + }, + { + "epoch": 0.18239339192708334, + "grad_norm": 15.787102699279785, + "learning_rate": 9.949868615483867e-06, + "loss": 5.5254, + "step": 8965 + }, + { + "epoch": 0.1824951171875, + "grad_norm": 17.947282791137695, + "learning_rate": 9.94981214457209e-06, + "loss": 5.4729, + "step": 8970 + }, + { + "epoch": 0.18259684244791666, + "grad_norm": 14.402547836303711, + "learning_rate": 9.949755642032586e-06, + "loss": 5.4433, + "step": 8975 + }, + { + "epoch": 0.18269856770833334, + "grad_norm": 13.718733787536621, + "learning_rate": 9.949699107865713e-06, + "loss": 5.3355, + "step": 8980 + }, + { + "epoch": 0.18280029296875, + "grad_norm": 21.95297622680664, + "learning_rate": 9.949642542071835e-06, + "loss": 5.5907, + "step": 8985 + }, + { + "epoch": 0.18290201822916666, + "grad_norm": 16.06968116760254, + "learning_rate": 9.949585944651314e-06, + "loss": 5.3228, + "step": 8990 + }, + { + "epoch": 0.18300374348958334, + "grad_norm": 21.489038467407227, + "learning_rate": 9.94952931560451e-06, + "loss": 5.3767, + "step": 8995 + }, + { + "epoch": 0.18310546875, + "grad_norm": 15.127798080444336, + "learning_rate": 9.949472654931782e-06, + "loss": 5.1372, + "step": 9000 + }, + { + "epoch": 0.18320719401041666, + "grad_norm": 17.8845272064209, + "learning_rate": 9.949415962633498e-06, + "loss": 5.322, + "step": 9005 + }, + { + "epoch": 0.18330891927083334, + "grad_norm": 14.194129943847656, + "learning_rate": 9.949359238710015e-06, + "loss": 5.3086, + "step": 9010 + }, + { + "epoch": 0.18341064453125, + "grad_norm": 17.947084426879883, + "learning_rate": 9.949302483161698e-06, + "loss": 5.2856, + "step": 9015 + }, + { + "epoch": 0.18351236979166666, + "grad_norm": 15.450242042541504, + "learning_rate": 9.94924569598891e-06, + "loss": 5.5282, + "step": 9020 + }, + { + "epoch": 0.18361409505208334, + "grad_norm": 13.240667343139648, + "learning_rate": 9.949188877192013e-06, + "loss": 5.142, + "step": 9025 + }, + { + "epoch": 0.1837158203125, + "grad_norm": 18.84765625, + "learning_rate": 9.94913202677137e-06, + "loss": 5.5154, + "step": 9030 + }, + { + "epoch": 0.18381754557291666, + "grad_norm": 15.601950645446777, + "learning_rate": 9.949075144727342e-06, + "loss": 5.4667, + "step": 9035 + }, + { + "epoch": 0.18391927083333334, + "grad_norm": 17.465578079223633, + "learning_rate": 9.949018231060298e-06, + "loss": 5.7696, + "step": 9040 + }, + { + "epoch": 0.18402099609375, + "grad_norm": 16.50350570678711, + "learning_rate": 9.948961285770598e-06, + "loss": 5.405, + "step": 9045 + }, + { + "epoch": 0.18412272135416666, + "grad_norm": 17.17493438720703, + "learning_rate": 9.948904308858606e-06, + "loss": 5.4418, + "step": 9050 + }, + { + "epoch": 0.18422444661458334, + "grad_norm": 14.300909042358398, + "learning_rate": 9.948847300324687e-06, + "loss": 5.5722, + "step": 9055 + }, + { + "epoch": 0.184326171875, + "grad_norm": 18.649797439575195, + "learning_rate": 9.948790260169203e-06, + "loss": 5.2443, + "step": 9060 + }, + { + "epoch": 0.18442789713541666, + "grad_norm": 13.884014129638672, + "learning_rate": 9.948733188392522e-06, + "loss": 5.3711, + "step": 9065 + }, + { + "epoch": 0.18452962239583334, + "grad_norm": 22.01243019104004, + "learning_rate": 9.948676084995006e-06, + "loss": 5.5277, + "step": 9070 + }, + { + "epoch": 0.18463134765625, + "grad_norm": 16.12378692626953, + "learning_rate": 9.948618949977021e-06, + "loss": 5.6353, + "step": 9075 + }, + { + "epoch": 0.18473307291666666, + "grad_norm": 17.768577575683594, + "learning_rate": 9.94856178333893e-06, + "loss": 5.3911, + "step": 9080 + }, + { + "epoch": 0.18483479817708334, + "grad_norm": 24.13721466064453, + "learning_rate": 9.948504585081104e-06, + "loss": 5.1771, + "step": 9085 + }, + { + "epoch": 0.1849365234375, + "grad_norm": 14.585609436035156, + "learning_rate": 9.948447355203901e-06, + "loss": 5.3615, + "step": 9090 + }, + { + "epoch": 0.18503824869791666, + "grad_norm": 18.624008178710938, + "learning_rate": 9.948390093707693e-06, + "loss": 5.3367, + "step": 9095 + }, + { + "epoch": 0.18513997395833334, + "grad_norm": 14.3472318649292, + "learning_rate": 9.948332800592841e-06, + "loss": 5.1362, + "step": 9100 + }, + { + "epoch": 0.18524169921875, + "grad_norm": 22.27127456665039, + "learning_rate": 9.948275475859714e-06, + "loss": 5.5889, + "step": 9105 + }, + { + "epoch": 0.18534342447916666, + "grad_norm": 12.39824390411377, + "learning_rate": 9.948218119508678e-06, + "loss": 5.1734, + "step": 9110 + }, + { + "epoch": 0.18544514973958334, + "grad_norm": 14.340011596679688, + "learning_rate": 9.948160731540098e-06, + "loss": 5.2488, + "step": 9115 + }, + { + "epoch": 0.185546875, + "grad_norm": 17.384000778198242, + "learning_rate": 9.948103311954342e-06, + "loss": 5.3214, + "step": 9120 + }, + { + "epoch": 0.18564860026041666, + "grad_norm": 16.710268020629883, + "learning_rate": 9.948045860751778e-06, + "loss": 5.3821, + "step": 9125 + }, + { + "epoch": 0.18575032552083334, + "grad_norm": 17.47419548034668, + "learning_rate": 9.94798837793277e-06, + "loss": 5.5921, + "step": 9130 + }, + { + "epoch": 0.18585205078125, + "grad_norm": 15.435600280761719, + "learning_rate": 9.947930863497688e-06, + "loss": 5.3115, + "step": 9135 + }, + { + "epoch": 0.18595377604166666, + "grad_norm": 16.551179885864258, + "learning_rate": 9.947873317446896e-06, + "loss": 5.5365, + "step": 9140 + }, + { + "epoch": 0.18605550130208334, + "grad_norm": 14.239866256713867, + "learning_rate": 9.947815739780767e-06, + "loss": 5.364, + "step": 9145 + }, + { + "epoch": 0.1861572265625, + "grad_norm": 15.614245414733887, + "learning_rate": 9.947758130499666e-06, + "loss": 5.6282, + "step": 9150 + }, + { + "epoch": 0.18625895182291666, + "grad_norm": 20.846033096313477, + "learning_rate": 9.947700489603959e-06, + "loss": 5.471, + "step": 9155 + }, + { + "epoch": 0.18636067708333334, + "grad_norm": 17.918336868286133, + "learning_rate": 9.947642817094018e-06, + "loss": 5.3353, + "step": 9160 + }, + { + "epoch": 0.18646240234375, + "grad_norm": 12.62096118927002, + "learning_rate": 9.94758511297021e-06, + "loss": 5.3654, + "step": 9165 + }, + { + "epoch": 0.18656412760416666, + "grad_norm": 15.53834342956543, + "learning_rate": 9.947527377232904e-06, + "loss": 5.3292, + "step": 9170 + }, + { + "epoch": 0.18666585286458334, + "grad_norm": 23.408185958862305, + "learning_rate": 9.947469609882468e-06, + "loss": 5.3393, + "step": 9175 + }, + { + "epoch": 0.186767578125, + "grad_norm": 13.367298126220703, + "learning_rate": 9.947411810919271e-06, + "loss": 5.2886, + "step": 9180 + }, + { + "epoch": 0.18686930338541666, + "grad_norm": 18.28557014465332, + "learning_rate": 9.947353980343685e-06, + "loss": 5.0962, + "step": 9185 + }, + { + "epoch": 0.18697102864583334, + "grad_norm": 14.227713584899902, + "learning_rate": 9.947296118156076e-06, + "loss": 5.5235, + "step": 9190 + }, + { + "epoch": 0.18707275390625, + "grad_norm": 19.801408767700195, + "learning_rate": 9.947238224356816e-06, + "loss": 5.3205, + "step": 9195 + }, + { + "epoch": 0.18717447916666666, + "grad_norm": 16.683053970336914, + "learning_rate": 9.947180298946273e-06, + "loss": 5.5367, + "step": 9200 + }, + { + "epoch": 0.18727620442708334, + "grad_norm": 15.36580753326416, + "learning_rate": 9.947122341924819e-06, + "loss": 5.2363, + "step": 9205 + }, + { + "epoch": 0.1873779296875, + "grad_norm": 16.16913604736328, + "learning_rate": 9.947064353292826e-06, + "loss": 5.4056, + "step": 9210 + }, + { + "epoch": 0.18747965494791666, + "grad_norm": 18.328369140625, + "learning_rate": 9.947006333050659e-06, + "loss": 5.5385, + "step": 9215 + }, + { + "epoch": 0.18758138020833334, + "grad_norm": 13.550973892211914, + "learning_rate": 9.946948281198693e-06, + "loss": 5.3527, + "step": 9220 + }, + { + "epoch": 0.18768310546875, + "grad_norm": 17.91370391845703, + "learning_rate": 9.946890197737298e-06, + "loss": 5.5459, + "step": 9225 + }, + { + "epoch": 0.18778483072916666, + "grad_norm": 19.33777618408203, + "learning_rate": 9.946832082666845e-06, + "loss": 5.4895, + "step": 9230 + }, + { + "epoch": 0.18788655598958334, + "grad_norm": 15.734110832214355, + "learning_rate": 9.946773935987706e-06, + "loss": 5.3663, + "step": 9235 + }, + { + "epoch": 0.18798828125, + "grad_norm": 18.668031692504883, + "learning_rate": 9.94671575770025e-06, + "loss": 5.373, + "step": 9240 + }, + { + "epoch": 0.18809000651041666, + "grad_norm": 15.54752254486084, + "learning_rate": 9.946657547804852e-06, + "loss": 5.3877, + "step": 9245 + }, + { + "epoch": 0.18819173177083334, + "grad_norm": 17.60775375366211, + "learning_rate": 9.946599306301882e-06, + "loss": 5.377, + "step": 9250 + }, + { + "epoch": 0.18829345703125, + "grad_norm": 18.32415771484375, + "learning_rate": 9.946541033191714e-06, + "loss": 5.4353, + "step": 9255 + }, + { + "epoch": 0.18839518229166666, + "grad_norm": 12.907644271850586, + "learning_rate": 9.946482728474717e-06, + "loss": 5.2891, + "step": 9260 + }, + { + "epoch": 0.18849690755208334, + "grad_norm": 20.314956665039062, + "learning_rate": 9.946424392151266e-06, + "loss": 5.4446, + "step": 9265 + }, + { + "epoch": 0.1885986328125, + "grad_norm": 15.321666717529297, + "learning_rate": 9.946366024221734e-06, + "loss": 5.4862, + "step": 9270 + }, + { + "epoch": 0.18870035807291666, + "grad_norm": 17.01535415649414, + "learning_rate": 9.946307624686493e-06, + "loss": 5.51, + "step": 9275 + }, + { + "epoch": 0.18880208333333334, + "grad_norm": 20.249860763549805, + "learning_rate": 9.946249193545918e-06, + "loss": 5.5428, + "step": 9280 + }, + { + "epoch": 0.18890380859375, + "grad_norm": 14.481475830078125, + "learning_rate": 9.94619073080038e-06, + "loss": 5.1812, + "step": 9285 + }, + { + "epoch": 0.18900553385416666, + "grad_norm": 16.871828079223633, + "learning_rate": 9.946132236450252e-06, + "loss": 5.4918, + "step": 9290 + }, + { + "epoch": 0.18910725911458334, + "grad_norm": 12.979053497314453, + "learning_rate": 9.946073710495909e-06, + "loss": 5.0523, + "step": 9295 + }, + { + "epoch": 0.189208984375, + "grad_norm": 12.035467147827148, + "learning_rate": 9.946015152937727e-06, + "loss": 5.3413, + "step": 9300 + }, + { + "epoch": 0.18931070963541666, + "grad_norm": 17.637691497802734, + "learning_rate": 9.945956563776077e-06, + "loss": 5.3833, + "step": 9305 + }, + { + "epoch": 0.18941243489583334, + "grad_norm": 14.500848770141602, + "learning_rate": 9.945897943011336e-06, + "loss": 5.3953, + "step": 9310 + }, + { + "epoch": 0.18951416015625, + "grad_norm": 12.7211332321167, + "learning_rate": 9.945839290643874e-06, + "loss": 5.2061, + "step": 9315 + }, + { + "epoch": 0.18961588541666666, + "grad_norm": 16.801780700683594, + "learning_rate": 9.945780606674071e-06, + "loss": 5.5107, + "step": 9320 + }, + { + "epoch": 0.18971761067708334, + "grad_norm": 17.539220809936523, + "learning_rate": 9.9457218911023e-06, + "loss": 5.1468, + "step": 9325 + }, + { + "epoch": 0.1898193359375, + "grad_norm": 15.995965957641602, + "learning_rate": 9.945663143928937e-06, + "loss": 5.4338, + "step": 9330 + }, + { + "epoch": 0.18992106119791666, + "grad_norm": 17.95293617248535, + "learning_rate": 9.945604365154356e-06, + "loss": 5.154, + "step": 9335 + }, + { + "epoch": 0.19002278645833334, + "grad_norm": 16.86269760131836, + "learning_rate": 9.945545554778933e-06, + "loss": 5.4976, + "step": 9340 + }, + { + "epoch": 0.19012451171875, + "grad_norm": 15.46239948272705, + "learning_rate": 9.945486712803043e-06, + "loss": 5.3378, + "step": 9345 + }, + { + "epoch": 0.19022623697916666, + "grad_norm": 16.117626190185547, + "learning_rate": 9.945427839227063e-06, + "loss": 5.3361, + "step": 9350 + }, + { + "epoch": 0.19032796223958334, + "grad_norm": 18.988618850708008, + "learning_rate": 9.94536893405137e-06, + "loss": 5.2289, + "step": 9355 + }, + { + "epoch": 0.1904296875, + "grad_norm": 14.455869674682617, + "learning_rate": 9.945309997276339e-06, + "loss": 5.4579, + "step": 9360 + }, + { + "epoch": 0.19053141276041666, + "grad_norm": 12.974495887756348, + "learning_rate": 9.945251028902345e-06, + "loss": 5.5712, + "step": 9365 + }, + { + "epoch": 0.19063313802083334, + "grad_norm": 14.224137306213379, + "learning_rate": 9.945192028929769e-06, + "loss": 5.5238, + "step": 9370 + }, + { + "epoch": 0.19073486328125, + "grad_norm": 19.456132888793945, + "learning_rate": 9.945132997358984e-06, + "loss": 5.3662, + "step": 9375 + }, + { + "epoch": 0.19083658854166666, + "grad_norm": 16.220867156982422, + "learning_rate": 9.94507393419037e-06, + "loss": 5.2946, + "step": 9380 + }, + { + "epoch": 0.19093831380208334, + "grad_norm": 14.916299819946289, + "learning_rate": 9.945014839424305e-06, + "loss": 5.4742, + "step": 9385 + }, + { + "epoch": 0.1910400390625, + "grad_norm": 15.170906066894531, + "learning_rate": 9.944955713061162e-06, + "loss": 5.3937, + "step": 9390 + }, + { + "epoch": 0.19114176432291666, + "grad_norm": 12.579385757446289, + "learning_rate": 9.944896555101324e-06, + "loss": 5.3661, + "step": 9395 + }, + { + "epoch": 0.19124348958333334, + "grad_norm": 15.32807445526123, + "learning_rate": 9.944837365545166e-06, + "loss": 5.3609, + "step": 9400 + }, + { + "epoch": 0.19134521484375, + "grad_norm": 19.116044998168945, + "learning_rate": 9.944778144393066e-06, + "loss": 5.2292, + "step": 9405 + }, + { + "epoch": 0.19144694010416666, + "grad_norm": 14.98764705657959, + "learning_rate": 9.944718891645404e-06, + "loss": 5.363, + "step": 9410 + }, + { + "epoch": 0.19154866536458334, + "grad_norm": 13.646204948425293, + "learning_rate": 9.944659607302558e-06, + "loss": 5.4821, + "step": 9415 + }, + { + "epoch": 0.191650390625, + "grad_norm": 16.83823585510254, + "learning_rate": 9.944600291364907e-06, + "loss": 5.4927, + "step": 9420 + }, + { + "epoch": 0.19175211588541666, + "grad_norm": 13.651832580566406, + "learning_rate": 9.944540943832828e-06, + "loss": 5.1892, + "step": 9425 + }, + { + "epoch": 0.19185384114583334, + "grad_norm": 15.754396438598633, + "learning_rate": 9.944481564706706e-06, + "loss": 5.2068, + "step": 9430 + }, + { + "epoch": 0.19195556640625, + "grad_norm": 20.73539161682129, + "learning_rate": 9.944422153986913e-06, + "loss": 5.5231, + "step": 9435 + }, + { + "epoch": 0.19205729166666666, + "grad_norm": 16.490995407104492, + "learning_rate": 9.944362711673832e-06, + "loss": 5.2488, + "step": 9440 + }, + { + "epoch": 0.19215901692708334, + "grad_norm": 12.93749713897705, + "learning_rate": 9.944303237767844e-06, + "loss": 5.3917, + "step": 9445 + }, + { + "epoch": 0.1922607421875, + "grad_norm": 14.537949562072754, + "learning_rate": 9.944243732269327e-06, + "loss": 5.3612, + "step": 9450 + }, + { + "epoch": 0.19236246744791666, + "grad_norm": 16.745330810546875, + "learning_rate": 9.944184195178663e-06, + "loss": 5.3099, + "step": 9455 + }, + { + "epoch": 0.19246419270833334, + "grad_norm": 15.794748306274414, + "learning_rate": 9.944124626496231e-06, + "loss": 5.4978, + "step": 9460 + }, + { + "epoch": 0.19256591796875, + "grad_norm": 16.20575523376465, + "learning_rate": 9.944065026222413e-06, + "loss": 5.5724, + "step": 9465 + }, + { + "epoch": 0.19266764322916666, + "grad_norm": 13.264728546142578, + "learning_rate": 9.944005394357588e-06, + "loss": 5.3701, + "step": 9470 + }, + { + "epoch": 0.19276936848958334, + "grad_norm": 16.883140563964844, + "learning_rate": 9.943945730902138e-06, + "loss": 5.2953, + "step": 9475 + }, + { + "epoch": 0.19287109375, + "grad_norm": 28.38263702392578, + "learning_rate": 9.943886035856444e-06, + "loss": 5.6136, + "step": 9480 + }, + { + "epoch": 0.19297281901041666, + "grad_norm": 13.486275672912598, + "learning_rate": 9.943826309220888e-06, + "loss": 5.5991, + "step": 9485 + }, + { + "epoch": 0.19307454427083334, + "grad_norm": 19.94158935546875, + "learning_rate": 9.943766550995852e-06, + "loss": 5.2898, + "step": 9490 + }, + { + "epoch": 0.19317626953125, + "grad_norm": 14.564233779907227, + "learning_rate": 9.943706761181715e-06, + "loss": 5.6217, + "step": 9495 + }, + { + "epoch": 0.19327799479166666, + "grad_norm": 13.785870552062988, + "learning_rate": 9.943646939778864e-06, + "loss": 5.3482, + "step": 9500 + }, + { + "epoch": 0.19337972005208334, + "grad_norm": 16.719419479370117, + "learning_rate": 9.943587086787675e-06, + "loss": 5.4614, + "step": 9505 + }, + { + "epoch": 0.1934814453125, + "grad_norm": 18.997745513916016, + "learning_rate": 9.943527202208536e-06, + "loss": 5.8462, + "step": 9510 + }, + { + "epoch": 0.19358317057291666, + "grad_norm": 15.456578254699707, + "learning_rate": 9.943467286041827e-06, + "loss": 5.2719, + "step": 9515 + }, + { + "epoch": 0.19368489583333334, + "grad_norm": 13.552586555480957, + "learning_rate": 9.943407338287932e-06, + "loss": 5.3139, + "step": 9520 + }, + { + "epoch": 0.19378662109375, + "grad_norm": 16.901756286621094, + "learning_rate": 9.943347358947232e-06, + "loss": 5.1163, + "step": 9525 + }, + { + "epoch": 0.19388834635416666, + "grad_norm": 28.875638961791992, + "learning_rate": 9.943287348020113e-06, + "loss": 5.4345, + "step": 9530 + }, + { + "epoch": 0.19399007161458334, + "grad_norm": 16.060937881469727, + "learning_rate": 9.943227305506958e-06, + "loss": 5.3471, + "step": 9535 + }, + { + "epoch": 0.194091796875, + "grad_norm": 18.191408157348633, + "learning_rate": 9.943167231408146e-06, + "loss": 5.3933, + "step": 9540 + }, + { + "epoch": 0.19419352213541666, + "grad_norm": 13.670891761779785, + "learning_rate": 9.943107125724068e-06, + "loss": 5.2991, + "step": 9545 + }, + { + "epoch": 0.19429524739583334, + "grad_norm": 13.703091621398926, + "learning_rate": 9.943046988455103e-06, + "loss": 5.7391, + "step": 9550 + }, + { + "epoch": 0.19439697265625, + "grad_norm": 18.82770347595215, + "learning_rate": 9.942986819601637e-06, + "loss": 5.6998, + "step": 9555 + }, + { + "epoch": 0.19449869791666666, + "grad_norm": 13.97412395477295, + "learning_rate": 9.942926619164054e-06, + "loss": 5.4593, + "step": 9560 + }, + { + "epoch": 0.19460042317708334, + "grad_norm": 11.383221626281738, + "learning_rate": 9.94286638714274e-06, + "loss": 5.3366, + "step": 9565 + }, + { + "epoch": 0.1947021484375, + "grad_norm": 12.931407928466797, + "learning_rate": 9.942806123538079e-06, + "loss": 5.5693, + "step": 9570 + }, + { + "epoch": 0.19480387369791666, + "grad_norm": 13.936859130859375, + "learning_rate": 9.942745828350455e-06, + "loss": 5.3106, + "step": 9575 + }, + { + "epoch": 0.19490559895833334, + "grad_norm": 16.133806228637695, + "learning_rate": 9.942685501580254e-06, + "loss": 5.296, + "step": 9580 + }, + { + "epoch": 0.19500732421875, + "grad_norm": 14.036813735961914, + "learning_rate": 9.942625143227863e-06, + "loss": 5.1694, + "step": 9585 + }, + { + "epoch": 0.19510904947916666, + "grad_norm": 15.90652847290039, + "learning_rate": 9.942564753293666e-06, + "loss": 5.1778, + "step": 9590 + }, + { + "epoch": 0.19521077473958334, + "grad_norm": 24.1423397064209, + "learning_rate": 9.94250433177805e-06, + "loss": 5.2041, + "step": 9595 + }, + { + "epoch": 0.1953125, + "grad_norm": 17.5612735748291, + "learning_rate": 9.942443878681397e-06, + "loss": 5.6638, + "step": 9600 + }, + { + "epoch": 0.19541422526041666, + "grad_norm": 15.26194953918457, + "learning_rate": 9.942383394004098e-06, + "loss": 5.313, + "step": 9605 + }, + { + "epoch": 0.19551595052083334, + "grad_norm": 20.848726272583008, + "learning_rate": 9.94232287774654e-06, + "loss": 5.4265, + "step": 9610 + }, + { + "epoch": 0.19561767578125, + "grad_norm": 15.602553367614746, + "learning_rate": 9.942262329909107e-06, + "loss": 5.45, + "step": 9615 + }, + { + "epoch": 0.19571940104166666, + "grad_norm": 28.235857009887695, + "learning_rate": 9.942201750492185e-06, + "loss": 5.4825, + "step": 9620 + }, + { + "epoch": 0.19582112630208334, + "grad_norm": 16.69292640686035, + "learning_rate": 9.942141139496163e-06, + "loss": 5.0343, + "step": 9625 + }, + { + "epoch": 0.1959228515625, + "grad_norm": 15.955881118774414, + "learning_rate": 9.94208049692143e-06, + "loss": 5.2741, + "step": 9630 + }, + { + "epoch": 0.19602457682291666, + "grad_norm": 12.364121437072754, + "learning_rate": 9.94201982276837e-06, + "loss": 5.3768, + "step": 9635 + }, + { + "epoch": 0.19612630208333334, + "grad_norm": 21.204404830932617, + "learning_rate": 9.941959117037372e-06, + "loss": 5.0975, + "step": 9640 + }, + { + "epoch": 0.19622802734375, + "grad_norm": 22.552515029907227, + "learning_rate": 9.941898379728823e-06, + "loss": 5.2576, + "step": 9645 + }, + { + "epoch": 0.19632975260416666, + "grad_norm": 22.22311019897461, + "learning_rate": 9.941837610843114e-06, + "loss": 5.4488, + "step": 9650 + }, + { + "epoch": 0.19643147786458334, + "grad_norm": 16.71539878845215, + "learning_rate": 9.94177681038063e-06, + "loss": 5.1909, + "step": 9655 + }, + { + "epoch": 0.196533203125, + "grad_norm": 28.85649299621582, + "learning_rate": 9.941715978341761e-06, + "loss": 5.0973, + "step": 9660 + }, + { + "epoch": 0.19663492838541666, + "grad_norm": 13.996614456176758, + "learning_rate": 9.941655114726898e-06, + "loss": 5.3766, + "step": 9665 + }, + { + "epoch": 0.19673665364583334, + "grad_norm": 15.255691528320312, + "learning_rate": 9.941594219536425e-06, + "loss": 5.2882, + "step": 9670 + }, + { + "epoch": 0.19683837890625, + "grad_norm": 18.444477081298828, + "learning_rate": 9.941533292770736e-06, + "loss": 5.356, + "step": 9675 + }, + { + "epoch": 0.19694010416666666, + "grad_norm": 14.963077545166016, + "learning_rate": 9.941472334430215e-06, + "loss": 5.24, + "step": 9680 + }, + { + "epoch": 0.19704182942708334, + "grad_norm": 17.18778419494629, + "learning_rate": 9.941411344515255e-06, + "loss": 5.3989, + "step": 9685 + }, + { + "epoch": 0.1971435546875, + "grad_norm": 14.115653991699219, + "learning_rate": 9.941350323026246e-06, + "loss": 5.4845, + "step": 9690 + }, + { + "epoch": 0.19724527994791666, + "grad_norm": 15.156307220458984, + "learning_rate": 9.941289269963577e-06, + "loss": 5.3346, + "step": 9695 + }, + { + "epoch": 0.19734700520833334, + "grad_norm": 15.938470840454102, + "learning_rate": 9.941228185327638e-06, + "loss": 5.4279, + "step": 9700 + }, + { + "epoch": 0.19744873046875, + "grad_norm": 14.90074634552002, + "learning_rate": 9.94116706911882e-06, + "loss": 5.3957, + "step": 9705 + }, + { + "epoch": 0.19755045572916666, + "grad_norm": 15.44297981262207, + "learning_rate": 9.941105921337512e-06, + "loss": 5.5211, + "step": 9710 + }, + { + "epoch": 0.19765218098958334, + "grad_norm": 21.48776626586914, + "learning_rate": 9.941044741984106e-06, + "loss": 5.4668, + "step": 9715 + }, + { + "epoch": 0.19775390625, + "grad_norm": 18.46709442138672, + "learning_rate": 9.940983531058992e-06, + "loss": 5.5453, + "step": 9720 + }, + { + "epoch": 0.19785563151041666, + "grad_norm": 15.427667617797852, + "learning_rate": 9.940922288562563e-06, + "loss": 5.3775, + "step": 9725 + }, + { + "epoch": 0.19795735677083334, + "grad_norm": 13.280352592468262, + "learning_rate": 9.940861014495209e-06, + "loss": 5.4645, + "step": 9730 + }, + { + "epoch": 0.19805908203125, + "grad_norm": 18.755170822143555, + "learning_rate": 9.94079970885732e-06, + "loss": 5.2965, + "step": 9735 + }, + { + "epoch": 0.19816080729166666, + "grad_norm": 18.213659286499023, + "learning_rate": 9.94073837164929e-06, + "loss": 5.3654, + "step": 9740 + }, + { + "epoch": 0.19826253255208334, + "grad_norm": 12.66787338256836, + "learning_rate": 9.940677002871511e-06, + "loss": 5.6661, + "step": 9745 + }, + { + "epoch": 0.1983642578125, + "grad_norm": 14.477725982666016, + "learning_rate": 9.940615602524375e-06, + "loss": 5.5126, + "step": 9750 + }, + { + "epoch": 0.19846598307291666, + "grad_norm": 15.735899925231934, + "learning_rate": 9.940554170608272e-06, + "loss": 5.6257, + "step": 9755 + }, + { + "epoch": 0.19856770833333334, + "grad_norm": 14.02489185333252, + "learning_rate": 9.940492707123596e-06, + "loss": 5.202, + "step": 9760 + }, + { + "epoch": 0.19866943359375, + "grad_norm": 14.717028617858887, + "learning_rate": 9.940431212070742e-06, + "loss": 5.5738, + "step": 9765 + }, + { + "epoch": 0.19877115885416666, + "grad_norm": 14.435527801513672, + "learning_rate": 9.940369685450099e-06, + "loss": 5.137, + "step": 9770 + }, + { + "epoch": 0.19887288411458334, + "grad_norm": 14.717361450195312, + "learning_rate": 9.940308127262061e-06, + "loss": 5.3532, + "step": 9775 + }, + { + "epoch": 0.198974609375, + "grad_norm": 13.417134284973145, + "learning_rate": 9.940246537507026e-06, + "loss": 5.2498, + "step": 9780 + }, + { + "epoch": 0.19907633463541666, + "grad_norm": 14.667508125305176, + "learning_rate": 9.94018491618538e-06, + "loss": 5.4003, + "step": 9785 + }, + { + "epoch": 0.19917805989583334, + "grad_norm": 14.835881233215332, + "learning_rate": 9.940123263297523e-06, + "loss": 5.4499, + "step": 9790 + }, + { + "epoch": 0.19927978515625, + "grad_norm": 16.537506103515625, + "learning_rate": 9.940061578843846e-06, + "loss": 5.402, + "step": 9795 + }, + { + "epoch": 0.19938151041666666, + "grad_norm": 16.187585830688477, + "learning_rate": 9.939999862824744e-06, + "loss": 5.2617, + "step": 9800 + }, + { + "epoch": 0.19948323567708334, + "grad_norm": 17.78742790222168, + "learning_rate": 9.93993811524061e-06, + "loss": 5.7102, + "step": 9805 + }, + { + "epoch": 0.1995849609375, + "grad_norm": 13.681543350219727, + "learning_rate": 9.93987633609184e-06, + "loss": 5.3339, + "step": 9810 + }, + { + "epoch": 0.19968668619791666, + "grad_norm": 13.407475471496582, + "learning_rate": 9.939814525378829e-06, + "loss": 5.395, + "step": 9815 + }, + { + "epoch": 0.19978841145833334, + "grad_norm": 13.873031616210938, + "learning_rate": 9.939752683101971e-06, + "loss": 5.4374, + "step": 9820 + }, + { + "epoch": 0.19989013671875, + "grad_norm": 16.18618392944336, + "learning_rate": 9.939690809261661e-06, + "loss": 5.5913, + "step": 9825 + }, + { + "epoch": 0.19999186197916666, + "grad_norm": 20.25551414489746, + "learning_rate": 9.939628903858294e-06, + "loss": 5.445, + "step": 9830 + }, + { + "epoch": 0.20009358723958334, + "grad_norm": 15.418718338012695, + "learning_rate": 9.939566966892267e-06, + "loss": 5.2063, + "step": 9835 + }, + { + "epoch": 0.2001953125, + "grad_norm": 16.754671096801758, + "learning_rate": 9.939504998363977e-06, + "loss": 5.492, + "step": 9840 + }, + { + "epoch": 0.20029703776041666, + "grad_norm": 16.118635177612305, + "learning_rate": 9.939442998273816e-06, + "loss": 5.1664, + "step": 9845 + }, + { + "epoch": 0.20039876302083334, + "grad_norm": 18.24745750427246, + "learning_rate": 9.939380966622184e-06, + "loss": 5.621, + "step": 9850 + }, + { + "epoch": 0.20050048828125, + "grad_norm": 20.25698471069336, + "learning_rate": 9.939318903409474e-06, + "loss": 5.435, + "step": 9855 + }, + { + "epoch": 0.20060221354166666, + "grad_norm": 15.820714950561523, + "learning_rate": 9.939256808636084e-06, + "loss": 5.3666, + "step": 9860 + }, + { + "epoch": 0.20070393880208334, + "grad_norm": 14.590901374816895, + "learning_rate": 9.939194682302412e-06, + "loss": 5.2799, + "step": 9865 + }, + { + "epoch": 0.2008056640625, + "grad_norm": 17.86176109313965, + "learning_rate": 9.939132524408852e-06, + "loss": 5.318, + "step": 9870 + }, + { + "epoch": 0.20090738932291666, + "grad_norm": 16.066181182861328, + "learning_rate": 9.939070334955805e-06, + "loss": 5.1653, + "step": 9875 + }, + { + "epoch": 0.20100911458333334, + "grad_norm": 20.04253387451172, + "learning_rate": 9.939008113943666e-06, + "loss": 5.3275, + "step": 9880 + }, + { + "epoch": 0.20111083984375, + "grad_norm": 16.70815658569336, + "learning_rate": 9.938945861372833e-06, + "loss": 5.3492, + "step": 9885 + }, + { + "epoch": 0.20121256510416666, + "grad_norm": 15.671804428100586, + "learning_rate": 9.938883577243703e-06, + "loss": 5.243, + "step": 9890 + }, + { + "epoch": 0.20131429036458334, + "grad_norm": 15.175533294677734, + "learning_rate": 9.938821261556676e-06, + "loss": 5.3397, + "step": 9895 + }, + { + "epoch": 0.201416015625, + "grad_norm": 21.790721893310547, + "learning_rate": 9.938758914312148e-06, + "loss": 5.4807, + "step": 9900 + }, + { + "epoch": 0.20151774088541666, + "grad_norm": 16.19756507873535, + "learning_rate": 9.938696535510519e-06, + "loss": 5.3499, + "step": 9905 + }, + { + "epoch": 0.20161946614583334, + "grad_norm": 23.333511352539062, + "learning_rate": 9.938634125152187e-06, + "loss": 5.5573, + "step": 9910 + }, + { + "epoch": 0.20172119140625, + "grad_norm": 13.298005104064941, + "learning_rate": 9.93857168323755e-06, + "loss": 5.4772, + "step": 9915 + }, + { + "epoch": 0.20182291666666666, + "grad_norm": 18.8491268157959, + "learning_rate": 9.938509209767007e-06, + "loss": 5.3509, + "step": 9920 + }, + { + "epoch": 0.20192464192708334, + "grad_norm": 20.469928741455078, + "learning_rate": 9.938446704740959e-06, + "loss": 5.6945, + "step": 9925 + }, + { + "epoch": 0.2020263671875, + "grad_norm": 17.826908111572266, + "learning_rate": 9.938384168159805e-06, + "loss": 5.3955, + "step": 9930 + }, + { + "epoch": 0.20212809244791666, + "grad_norm": 15.334890365600586, + "learning_rate": 9.938321600023942e-06, + "loss": 5.2502, + "step": 9935 + }, + { + "epoch": 0.20222981770833334, + "grad_norm": 16.44826316833496, + "learning_rate": 9.938259000333772e-06, + "loss": 5.1958, + "step": 9940 + }, + { + "epoch": 0.20233154296875, + "grad_norm": 13.152313232421875, + "learning_rate": 9.938196369089696e-06, + "loss": 5.6519, + "step": 9945 + }, + { + "epoch": 0.20243326822916666, + "grad_norm": 16.583518981933594, + "learning_rate": 9.938133706292112e-06, + "loss": 5.2855, + "step": 9950 + }, + { + "epoch": 0.20253499348958334, + "grad_norm": 13.69851303100586, + "learning_rate": 9.93807101194142e-06, + "loss": 5.5338, + "step": 9955 + }, + { + "epoch": 0.20263671875, + "grad_norm": 13.506708145141602, + "learning_rate": 9.938008286038023e-06, + "loss": 5.1632, + "step": 9960 + }, + { + "epoch": 0.20273844401041666, + "grad_norm": 15.210138320922852, + "learning_rate": 9.93794552858232e-06, + "loss": 5.4845, + "step": 9965 + }, + { + "epoch": 0.20284016927083334, + "grad_norm": 19.190879821777344, + "learning_rate": 9.937882739574713e-06, + "loss": 5.4008, + "step": 9970 + }, + { + "epoch": 0.20294189453125, + "grad_norm": 17.006288528442383, + "learning_rate": 9.937819919015603e-06, + "loss": 5.2523, + "step": 9975 + }, + { + "epoch": 0.20304361979166666, + "grad_norm": 20.578012466430664, + "learning_rate": 9.937757066905392e-06, + "loss": 5.3422, + "step": 9980 + }, + { + "epoch": 0.20314534505208334, + "grad_norm": 17.6387939453125, + "learning_rate": 9.937694183244479e-06, + "loss": 5.5043, + "step": 9985 + }, + { + "epoch": 0.2032470703125, + "grad_norm": 18.070207595825195, + "learning_rate": 9.937631268033267e-06, + "loss": 5.0796, + "step": 9990 + }, + { + "epoch": 0.20334879557291666, + "grad_norm": 13.18276309967041, + "learning_rate": 9.93756832127216e-06, + "loss": 5.5606, + "step": 9995 + }, + { + "epoch": 0.20345052083333334, + "grad_norm": 16.406293869018555, + "learning_rate": 9.93750534296156e-06, + "loss": 5.4358, + "step": 10000 + }, + { + "epoch": 0.20355224609375, + "grad_norm": 17.932540893554688, + "learning_rate": 9.937442333101865e-06, + "loss": 5.2622, + "step": 10005 + }, + { + "epoch": 0.20365397135416666, + "grad_norm": 17.66790199279785, + "learning_rate": 9.937379291693485e-06, + "loss": 5.4081, + "step": 10010 + }, + { + "epoch": 0.20375569661458334, + "grad_norm": 16.33098793029785, + "learning_rate": 9.937316218736815e-06, + "loss": 5.3399, + "step": 10015 + }, + { + "epoch": 0.203857421875, + "grad_norm": 17.29053497314453, + "learning_rate": 9.937253114232263e-06, + "loss": 5.554, + "step": 10020 + }, + { + "epoch": 0.20395914713541666, + "grad_norm": 17.42905616760254, + "learning_rate": 9.937189978180231e-06, + "loss": 5.3517, + "step": 10025 + }, + { + "epoch": 0.20406087239583334, + "grad_norm": 23.187456130981445, + "learning_rate": 9.937126810581123e-06, + "loss": 5.4842, + "step": 10030 + }, + { + "epoch": 0.20416259765625, + "grad_norm": 11.135866165161133, + "learning_rate": 9.937063611435341e-06, + "loss": 5.4458, + "step": 10035 + }, + { + "epoch": 0.20426432291666666, + "grad_norm": 20.449724197387695, + "learning_rate": 9.93700038074329e-06, + "loss": 5.2908, + "step": 10040 + }, + { + "epoch": 0.20436604817708334, + "grad_norm": 18.33529281616211, + "learning_rate": 9.936937118505374e-06, + "loss": 5.2328, + "step": 10045 + }, + { + "epoch": 0.2044677734375, + "grad_norm": 16.815813064575195, + "learning_rate": 9.936873824721997e-06, + "loss": 5.1048, + "step": 10050 + }, + { + "epoch": 0.20456949869791666, + "grad_norm": 16.003864288330078, + "learning_rate": 9.936810499393563e-06, + "loss": 5.2986, + "step": 10055 + }, + { + "epoch": 0.20467122395833334, + "grad_norm": 15.273567199707031, + "learning_rate": 9.936747142520477e-06, + "loss": 5.2065, + "step": 10060 + }, + { + "epoch": 0.20477294921875, + "grad_norm": 20.4141902923584, + "learning_rate": 9.936683754103144e-06, + "loss": 5.2315, + "step": 10065 + }, + { + "epoch": 0.20487467447916666, + "grad_norm": 16.31639289855957, + "learning_rate": 9.93662033414197e-06, + "loss": 5.7781, + "step": 10070 + }, + { + "epoch": 0.20497639973958334, + "grad_norm": 19.9144287109375, + "learning_rate": 9.936556882637358e-06, + "loss": 5.4019, + "step": 10075 + }, + { + "epoch": 0.205078125, + "grad_norm": 15.020304679870605, + "learning_rate": 9.936493399589715e-06, + "loss": 5.7591, + "step": 10080 + }, + { + "epoch": 0.20517985026041666, + "grad_norm": 16.618408203125, + "learning_rate": 9.936429884999445e-06, + "loss": 5.5979, + "step": 10085 + }, + { + "epoch": 0.20528157552083334, + "grad_norm": 16.061559677124023, + "learning_rate": 9.936366338866958e-06, + "loss": 5.4451, + "step": 10090 + }, + { + "epoch": 0.20538330078125, + "grad_norm": 16.367950439453125, + "learning_rate": 9.936302761192655e-06, + "loss": 5.4103, + "step": 10095 + }, + { + "epoch": 0.20548502604166666, + "grad_norm": 12.7061767578125, + "learning_rate": 9.936239151976945e-06, + "loss": 5.4392, + "step": 10100 + }, + { + "epoch": 0.20558675130208334, + "grad_norm": 14.29818058013916, + "learning_rate": 9.936175511220233e-06, + "loss": 5.6104, + "step": 10105 + }, + { + "epoch": 0.2056884765625, + "grad_norm": 21.31599235534668, + "learning_rate": 9.936111838922927e-06, + "loss": 5.4479, + "step": 10110 + }, + { + "epoch": 0.20579020182291666, + "grad_norm": 17.106220245361328, + "learning_rate": 9.936048135085432e-06, + "loss": 5.1858, + "step": 10115 + }, + { + "epoch": 0.20589192708333334, + "grad_norm": 16.776145935058594, + "learning_rate": 9.935984399708159e-06, + "loss": 5.4338, + "step": 10120 + }, + { + "epoch": 0.20599365234375, + "grad_norm": 12.663948059082031, + "learning_rate": 9.93592063279151e-06, + "loss": 5.3469, + "step": 10125 + }, + { + "epoch": 0.20609537760416666, + "grad_norm": 15.135226249694824, + "learning_rate": 9.935856834335897e-06, + "loss": 5.1973, + "step": 10130 + }, + { + "epoch": 0.20619710286458334, + "grad_norm": 13.908034324645996, + "learning_rate": 9.935793004341725e-06, + "loss": 5.4898, + "step": 10135 + }, + { + "epoch": 0.206298828125, + "grad_norm": 16.026519775390625, + "learning_rate": 9.935729142809402e-06, + "loss": 5.4179, + "step": 10140 + }, + { + "epoch": 0.20640055338541666, + "grad_norm": 15.182182312011719, + "learning_rate": 9.935665249739336e-06, + "loss": 5.3684, + "step": 10145 + }, + { + "epoch": 0.20650227864583334, + "grad_norm": 17.468502044677734, + "learning_rate": 9.935601325131938e-06, + "loss": 5.2648, + "step": 10150 + }, + { + "epoch": 0.20660400390625, + "grad_norm": 17.04147720336914, + "learning_rate": 9.935537368987612e-06, + "loss": 5.378, + "step": 10155 + }, + { + "epoch": 0.20670572916666666, + "grad_norm": 21.822996139526367, + "learning_rate": 9.93547338130677e-06, + "loss": 5.1758, + "step": 10160 + }, + { + "epoch": 0.20680745442708334, + "grad_norm": 19.349777221679688, + "learning_rate": 9.93540936208982e-06, + "loss": 5.3448, + "step": 10165 + }, + { + "epoch": 0.2069091796875, + "grad_norm": 14.136645317077637, + "learning_rate": 9.935345311337171e-06, + "loss": 5.29, + "step": 10170 + }, + { + "epoch": 0.20701090494791666, + "grad_norm": 17.30967903137207, + "learning_rate": 9.935281229049231e-06, + "loss": 5.2106, + "step": 10175 + }, + { + "epoch": 0.20711263020833334, + "grad_norm": 13.797098159790039, + "learning_rate": 9.935217115226412e-06, + "loss": 5.4736, + "step": 10180 + }, + { + "epoch": 0.20721435546875, + "grad_norm": 13.402127265930176, + "learning_rate": 9.935152969869121e-06, + "loss": 5.5143, + "step": 10185 + }, + { + "epoch": 0.20731608072916666, + "grad_norm": 14.48731803894043, + "learning_rate": 9.935088792977772e-06, + "loss": 5.5931, + "step": 10190 + }, + { + "epoch": 0.20741780598958334, + "grad_norm": 17.97011947631836, + "learning_rate": 9.93502458455277e-06, + "loss": 5.3917, + "step": 10195 + }, + { + "epoch": 0.20751953125, + "grad_norm": 17.93714714050293, + "learning_rate": 9.934960344594527e-06, + "loss": 5.372, + "step": 10200 + }, + { + "epoch": 0.20762125651041666, + "grad_norm": 12.730122566223145, + "learning_rate": 9.934896073103456e-06, + "loss": 5.4206, + "step": 10205 + }, + { + "epoch": 0.20772298177083334, + "grad_norm": 17.00704002380371, + "learning_rate": 9.934831770079962e-06, + "loss": 5.3647, + "step": 10210 + }, + { + "epoch": 0.20782470703125, + "grad_norm": 13.113630294799805, + "learning_rate": 9.934767435524461e-06, + "loss": 5.3728, + "step": 10215 + }, + { + "epoch": 0.20792643229166666, + "grad_norm": 16.591842651367188, + "learning_rate": 9.934703069437365e-06, + "loss": 5.1833, + "step": 10220 + }, + { + "epoch": 0.20802815755208334, + "grad_norm": 17.7924747467041, + "learning_rate": 9.934638671819082e-06, + "loss": 5.3218, + "step": 10225 + }, + { + "epoch": 0.2081298828125, + "grad_norm": 20.778301239013672, + "learning_rate": 9.934574242670023e-06, + "loss": 5.2702, + "step": 10230 + }, + { + "epoch": 0.20823160807291666, + "grad_norm": 16.941699981689453, + "learning_rate": 9.934509781990601e-06, + "loss": 5.5265, + "step": 10235 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 14.09347152709961, + "learning_rate": 9.934445289781229e-06, + "loss": 5.7437, + "step": 10240 + }, + { + "epoch": 0.20843505859375, + "grad_norm": 17.92060089111328, + "learning_rate": 9.934380766042318e-06, + "loss": 5.495, + "step": 10245 + }, + { + "epoch": 0.20853678385416666, + "grad_norm": 16.932254791259766, + "learning_rate": 9.93431621077428e-06, + "loss": 5.1542, + "step": 10250 + }, + { + "epoch": 0.20863850911458334, + "grad_norm": 18.468502044677734, + "learning_rate": 9.934251623977527e-06, + "loss": 5.652, + "step": 10255 + }, + { + "epoch": 0.208740234375, + "grad_norm": 23.60531234741211, + "learning_rate": 9.934187005652473e-06, + "loss": 5.5072, + "step": 10260 + }, + { + "epoch": 0.20884195963541666, + "grad_norm": 19.58544158935547, + "learning_rate": 9.934122355799529e-06, + "loss": 5.6841, + "step": 10265 + }, + { + "epoch": 0.20894368489583334, + "grad_norm": 12.709397315979004, + "learning_rate": 9.934057674419113e-06, + "loss": 5.16, + "step": 10270 + }, + { + "epoch": 0.20904541015625, + "grad_norm": 13.792555809020996, + "learning_rate": 9.933992961511632e-06, + "loss": 5.7082, + "step": 10275 + }, + { + "epoch": 0.20914713541666666, + "grad_norm": 16.528453826904297, + "learning_rate": 9.933928217077502e-06, + "loss": 5.2195, + "step": 10280 + }, + { + "epoch": 0.20924886067708334, + "grad_norm": 17.180723190307617, + "learning_rate": 9.933863441117138e-06, + "loss": 5.4198, + "step": 10285 + }, + { + "epoch": 0.2093505859375, + "grad_norm": 19.39708137512207, + "learning_rate": 9.933798633630954e-06, + "loss": 5.2794, + "step": 10290 + }, + { + "epoch": 0.20945231119791666, + "grad_norm": 13.520461082458496, + "learning_rate": 9.93373379461936e-06, + "loss": 5.5064, + "step": 10295 + }, + { + "epoch": 0.20955403645833334, + "grad_norm": 14.882644653320312, + "learning_rate": 9.933668924082776e-06, + "loss": 5.2872, + "step": 10300 + }, + { + "epoch": 0.20965576171875, + "grad_norm": 17.404020309448242, + "learning_rate": 9.933604022021612e-06, + "loss": 5.4274, + "step": 10305 + }, + { + "epoch": 0.20975748697916666, + "grad_norm": 21.104780197143555, + "learning_rate": 9.933539088436283e-06, + "loss": 5.2411, + "step": 10310 + }, + { + "epoch": 0.20985921223958334, + "grad_norm": 14.759839057922363, + "learning_rate": 9.933474123327209e-06, + "loss": 5.1878, + "step": 10315 + }, + { + "epoch": 0.2099609375, + "grad_norm": 13.218441009521484, + "learning_rate": 9.933409126694799e-06, + "loss": 5.5136, + "step": 10320 + }, + { + "epoch": 0.21006266276041666, + "grad_norm": 16.467809677124023, + "learning_rate": 9.933344098539472e-06, + "loss": 5.4415, + "step": 10325 + }, + { + "epoch": 0.21016438802083334, + "grad_norm": 17.79744529724121, + "learning_rate": 9.93327903886164e-06, + "loss": 5.2155, + "step": 10330 + }, + { + "epoch": 0.21026611328125, + "grad_norm": 15.171148300170898, + "learning_rate": 9.933213947661724e-06, + "loss": 5.4999, + "step": 10335 + }, + { + "epoch": 0.21036783854166666, + "grad_norm": 18.937437057495117, + "learning_rate": 9.933148824940135e-06, + "loss": 5.353, + "step": 10340 + }, + { + "epoch": 0.21046956380208334, + "grad_norm": 12.94460678100586, + "learning_rate": 9.933083670697293e-06, + "loss": 5.5422, + "step": 10345 + }, + { + "epoch": 0.2105712890625, + "grad_norm": 14.078347206115723, + "learning_rate": 9.93301848493361e-06, + "loss": 5.3787, + "step": 10350 + }, + { + "epoch": 0.21067301432291666, + "grad_norm": 16.057292938232422, + "learning_rate": 9.932953267649505e-06, + "loss": 5.4076, + "step": 10355 + }, + { + "epoch": 0.21077473958333334, + "grad_norm": 14.359938621520996, + "learning_rate": 9.932888018845397e-06, + "loss": 5.6706, + "step": 10360 + }, + { + "epoch": 0.21087646484375, + "grad_norm": 16.265941619873047, + "learning_rate": 9.932822738521699e-06, + "loss": 5.8967, + "step": 10365 + }, + { + "epoch": 0.21097819010416666, + "grad_norm": 12.344340324401855, + "learning_rate": 9.932757426678829e-06, + "loss": 5.5446, + "step": 10370 + }, + { + "epoch": 0.21107991536458334, + "grad_norm": 16.600357055664062, + "learning_rate": 9.932692083317205e-06, + "loss": 5.4185, + "step": 10375 + }, + { + "epoch": 0.211181640625, + "grad_norm": 23.16216278076172, + "learning_rate": 9.932626708437244e-06, + "loss": 5.4493, + "step": 10380 + }, + { + "epoch": 0.21128336588541666, + "grad_norm": 13.68563175201416, + "learning_rate": 9.932561302039366e-06, + "loss": 5.164, + "step": 10385 + }, + { + "epoch": 0.21138509114583334, + "grad_norm": 17.84819984436035, + "learning_rate": 9.932495864123985e-06, + "loss": 5.5143, + "step": 10390 + }, + { + "epoch": 0.21148681640625, + "grad_norm": 19.18485450744629, + "learning_rate": 9.932430394691523e-06, + "loss": 5.3679, + "step": 10395 + }, + { + "epoch": 0.21158854166666666, + "grad_norm": 26.1064453125, + "learning_rate": 9.932364893742394e-06, + "loss": 5.7849, + "step": 10400 + }, + { + "epoch": 0.21169026692708334, + "grad_norm": 16.483644485473633, + "learning_rate": 9.932299361277023e-06, + "loss": 5.4699, + "step": 10405 + }, + { + "epoch": 0.2117919921875, + "grad_norm": 15.69568920135498, + "learning_rate": 9.932233797295821e-06, + "loss": 5.301, + "step": 10410 + }, + { + "epoch": 0.21189371744791666, + "grad_norm": 16.02783966064453, + "learning_rate": 9.932168201799213e-06, + "loss": 5.1994, + "step": 10415 + }, + { + "epoch": 0.21199544270833334, + "grad_norm": 21.82683753967285, + "learning_rate": 9.932102574787615e-06, + "loss": 5.1429, + "step": 10420 + }, + { + "epoch": 0.21209716796875, + "grad_norm": 17.564970016479492, + "learning_rate": 9.932036916261447e-06, + "loss": 5.3056, + "step": 10425 + }, + { + "epoch": 0.21219889322916666, + "grad_norm": 20.675064086914062, + "learning_rate": 9.93197122622113e-06, + "loss": 5.3623, + "step": 10430 + }, + { + "epoch": 0.21230061848958334, + "grad_norm": 16.310874938964844, + "learning_rate": 9.931905504667081e-06, + "loss": 5.1902, + "step": 10435 + }, + { + "epoch": 0.21240234375, + "grad_norm": 18.47498893737793, + "learning_rate": 9.93183975159972e-06, + "loss": 5.1699, + "step": 10440 + }, + { + "epoch": 0.21250406901041666, + "grad_norm": 16.33698081970215, + "learning_rate": 9.931773967019472e-06, + "loss": 5.3975, + "step": 10445 + }, + { + "epoch": 0.21260579427083334, + "grad_norm": 17.443918228149414, + "learning_rate": 9.931708150926753e-06, + "loss": 5.6212, + "step": 10450 + }, + { + "epoch": 0.21270751953125, + "grad_norm": 12.953048706054688, + "learning_rate": 9.931642303321985e-06, + "loss": 5.3721, + "step": 10455 + }, + { + "epoch": 0.21280924479166666, + "grad_norm": 16.706464767456055, + "learning_rate": 9.931576424205587e-06, + "loss": 5.4136, + "step": 10460 + }, + { + "epoch": 0.21291097005208334, + "grad_norm": 17.33697509765625, + "learning_rate": 9.93151051357798e-06, + "loss": 5.3445, + "step": 10465 + }, + { + "epoch": 0.2130126953125, + "grad_norm": 18.700891494750977, + "learning_rate": 9.931444571439588e-06, + "loss": 5.1962, + "step": 10470 + }, + { + "epoch": 0.21311442057291666, + "grad_norm": 16.395137786865234, + "learning_rate": 9.93137859779083e-06, + "loss": 5.1019, + "step": 10475 + }, + { + "epoch": 0.21321614583333334, + "grad_norm": 15.247488021850586, + "learning_rate": 9.931312592632128e-06, + "loss": 5.2494, + "step": 10480 + }, + { + "epoch": 0.21331787109375, + "grad_norm": 21.83818244934082, + "learning_rate": 9.931246555963905e-06, + "loss": 5.382, + "step": 10485 + }, + { + "epoch": 0.21341959635416666, + "grad_norm": 15.827108383178711, + "learning_rate": 9.93118048778658e-06, + "loss": 5.6204, + "step": 10490 + }, + { + "epoch": 0.21352132161458334, + "grad_norm": 16.029836654663086, + "learning_rate": 9.93111438810058e-06, + "loss": 5.0275, + "step": 10495 + }, + { + "epoch": 0.213623046875, + "grad_norm": 19.268186569213867, + "learning_rate": 9.931048256906321e-06, + "loss": 5.5667, + "step": 10500 + }, + { + "epoch": 0.21372477213541666, + "grad_norm": 13.250783920288086, + "learning_rate": 9.93098209420423e-06, + "loss": 5.2449, + "step": 10505 + }, + { + "epoch": 0.21382649739583334, + "grad_norm": 13.921801567077637, + "learning_rate": 9.93091589999473e-06, + "loss": 5.4346, + "step": 10510 + }, + { + "epoch": 0.21392822265625, + "grad_norm": 15.491987228393555, + "learning_rate": 9.930849674278242e-06, + "loss": 5.279, + "step": 10515 + }, + { + "epoch": 0.21402994791666666, + "grad_norm": 14.781401634216309, + "learning_rate": 9.930783417055191e-06, + "loss": 5.2849, + "step": 10520 + }, + { + "epoch": 0.21413167317708334, + "grad_norm": 16.285051345825195, + "learning_rate": 9.930717128325998e-06, + "loss": 5.3548, + "step": 10525 + }, + { + "epoch": 0.2142333984375, + "grad_norm": 18.835426330566406, + "learning_rate": 9.930650808091088e-06, + "loss": 5.4845, + "step": 10530 + }, + { + "epoch": 0.21433512369791666, + "grad_norm": 17.03691864013672, + "learning_rate": 9.930584456350885e-06, + "loss": 5.7379, + "step": 10535 + }, + { + "epoch": 0.21443684895833334, + "grad_norm": 15.907588958740234, + "learning_rate": 9.930518073105811e-06, + "loss": 5.4436, + "step": 10540 + }, + { + "epoch": 0.21453857421875, + "grad_norm": 15.321430206298828, + "learning_rate": 9.930451658356292e-06, + "loss": 5.4689, + "step": 10545 + }, + { + "epoch": 0.21464029947916666, + "grad_norm": 11.76620101928711, + "learning_rate": 9.930385212102754e-06, + "loss": 5.7453, + "step": 10550 + }, + { + "epoch": 0.21474202473958334, + "grad_norm": 18.009016036987305, + "learning_rate": 9.930318734345618e-06, + "loss": 5.4514, + "step": 10555 + }, + { + "epoch": 0.21484375, + "grad_norm": 18.709327697753906, + "learning_rate": 9.93025222508531e-06, + "loss": 5.1941, + "step": 10560 + }, + { + "epoch": 0.21494547526041666, + "grad_norm": 16.293962478637695, + "learning_rate": 9.930185684322256e-06, + "loss": 5.3405, + "step": 10565 + }, + { + "epoch": 0.21504720052083334, + "grad_norm": 19.163415908813477, + "learning_rate": 9.93011911205688e-06, + "loss": 5.286, + "step": 10570 + }, + { + "epoch": 0.21514892578125, + "grad_norm": 17.571861267089844, + "learning_rate": 9.93005250828961e-06, + "loss": 5.5016, + "step": 10575 + }, + { + "epoch": 0.21525065104166666, + "grad_norm": 19.080951690673828, + "learning_rate": 9.929985873020867e-06, + "loss": 5.2875, + "step": 10580 + }, + { + "epoch": 0.21535237630208334, + "grad_norm": 21.216365814208984, + "learning_rate": 9.92991920625108e-06, + "loss": 5.2919, + "step": 10585 + }, + { + "epoch": 0.2154541015625, + "grad_norm": 21.4503116607666, + "learning_rate": 9.929852507980675e-06, + "loss": 5.5424, + "step": 10590 + }, + { + "epoch": 0.21555582682291666, + "grad_norm": 18.742353439331055, + "learning_rate": 9.929785778210077e-06, + "loss": 5.1542, + "step": 10595 + }, + { + "epoch": 0.21565755208333334, + "grad_norm": 17.76578712463379, + "learning_rate": 9.929719016939711e-06, + "loss": 5.3873, + "step": 10600 + }, + { + "epoch": 0.21575927734375, + "grad_norm": 17.117523193359375, + "learning_rate": 9.929652224170008e-06, + "loss": 5.0551, + "step": 10605 + }, + { + "epoch": 0.21586100260416666, + "grad_norm": 17.855484008789062, + "learning_rate": 9.929585399901391e-06, + "loss": 5.1953, + "step": 10610 + }, + { + "epoch": 0.21596272786458334, + "grad_norm": 16.8924560546875, + "learning_rate": 9.929518544134289e-06, + "loss": 5.3679, + "step": 10615 + }, + { + "epoch": 0.216064453125, + "grad_norm": 13.930648803710938, + "learning_rate": 9.929451656869127e-06, + "loss": 5.3221, + "step": 10620 + }, + { + "epoch": 0.21616617838541666, + "grad_norm": 17.864044189453125, + "learning_rate": 9.929384738106334e-06, + "loss": 5.2383, + "step": 10625 + }, + { + "epoch": 0.21626790364583334, + "grad_norm": 20.9058895111084, + "learning_rate": 9.929317787846337e-06, + "loss": 5.2364, + "step": 10630 + }, + { + "epoch": 0.21636962890625, + "grad_norm": 13.799013137817383, + "learning_rate": 9.929250806089565e-06, + "loss": 5.4427, + "step": 10635 + }, + { + "epoch": 0.21647135416666666, + "grad_norm": 17.454898834228516, + "learning_rate": 9.929183792836445e-06, + "loss": 5.3829, + "step": 10640 + }, + { + "epoch": 0.21657307942708334, + "grad_norm": 15.315484046936035, + "learning_rate": 9.929116748087407e-06, + "loss": 5.1901, + "step": 10645 + }, + { + "epoch": 0.2166748046875, + "grad_norm": 17.7401180267334, + "learning_rate": 9.929049671842874e-06, + "loss": 5.1382, + "step": 10650 + }, + { + "epoch": 0.21677652994791666, + "grad_norm": 17.795120239257812, + "learning_rate": 9.928982564103282e-06, + "loss": 5.6307, + "step": 10655 + }, + { + "epoch": 0.21687825520833334, + "grad_norm": 14.766138076782227, + "learning_rate": 9.928915424869055e-06, + "loss": 5.5185, + "step": 10660 + }, + { + "epoch": 0.21697998046875, + "grad_norm": 17.749408721923828, + "learning_rate": 9.928848254140621e-06, + "loss": 5.4608, + "step": 10665 + }, + { + "epoch": 0.21708170572916666, + "grad_norm": 15.329617500305176, + "learning_rate": 9.928781051918412e-06, + "loss": 5.2609, + "step": 10670 + }, + { + "epoch": 0.21718343098958334, + "grad_norm": 14.66836166381836, + "learning_rate": 9.928713818202857e-06, + "loss": 5.327, + "step": 10675 + }, + { + "epoch": 0.21728515625, + "grad_norm": 12.296348571777344, + "learning_rate": 9.928646552994387e-06, + "loss": 5.505, + "step": 10680 + }, + { + "epoch": 0.21738688151041666, + "grad_norm": 22.091922760009766, + "learning_rate": 9.92857925629343e-06, + "loss": 5.5789, + "step": 10685 + }, + { + "epoch": 0.21748860677083334, + "grad_norm": 12.750056266784668, + "learning_rate": 9.928511928100412e-06, + "loss": 5.3173, + "step": 10690 + }, + { + "epoch": 0.21759033203125, + "grad_norm": 15.359807968139648, + "learning_rate": 9.92844456841577e-06, + "loss": 5.2819, + "step": 10695 + }, + { + "epoch": 0.21769205729166666, + "grad_norm": 16.632658004760742, + "learning_rate": 9.928377177239933e-06, + "loss": 5.3891, + "step": 10700 + }, + { + "epoch": 0.21779378255208334, + "grad_norm": 19.364152908325195, + "learning_rate": 9.928309754573329e-06, + "loss": 5.2884, + "step": 10705 + }, + { + "epoch": 0.2178955078125, + "grad_norm": 12.631019592285156, + "learning_rate": 9.92824230041639e-06, + "loss": 5.1911, + "step": 10710 + }, + { + "epoch": 0.21799723307291666, + "grad_norm": 15.165093421936035, + "learning_rate": 9.928174814769548e-06, + "loss": 5.1389, + "step": 10715 + }, + { + "epoch": 0.21809895833333334, + "grad_norm": 18.150362014770508, + "learning_rate": 9.928107297633232e-06, + "loss": 5.6075, + "step": 10720 + }, + { + "epoch": 0.21820068359375, + "grad_norm": 15.735028266906738, + "learning_rate": 9.928039749007876e-06, + "loss": 5.2344, + "step": 10725 + }, + { + "epoch": 0.21830240885416666, + "grad_norm": 17.91574478149414, + "learning_rate": 9.92797216889391e-06, + "loss": 5.4105, + "step": 10730 + }, + { + "epoch": 0.21840413411458334, + "grad_norm": 18.445493698120117, + "learning_rate": 9.927904557291766e-06, + "loss": 5.3597, + "step": 10735 + }, + { + "epoch": 0.218505859375, + "grad_norm": 14.146286964416504, + "learning_rate": 9.927836914201876e-06, + "loss": 5.3031, + "step": 10740 + }, + { + "epoch": 0.21860758463541666, + "grad_norm": 13.551701545715332, + "learning_rate": 9.927769239624672e-06, + "loss": 5.3849, + "step": 10745 + }, + { + "epoch": 0.21870930989583334, + "grad_norm": 21.168371200561523, + "learning_rate": 9.927701533560588e-06, + "loss": 5.3106, + "step": 10750 + }, + { + "epoch": 0.21881103515625, + "grad_norm": 16.216150283813477, + "learning_rate": 9.927633796010056e-06, + "loss": 5.2116, + "step": 10755 + }, + { + "epoch": 0.21891276041666666, + "grad_norm": 17.621416091918945, + "learning_rate": 9.927566026973507e-06, + "loss": 5.6629, + "step": 10760 + }, + { + "epoch": 0.21901448567708334, + "grad_norm": 15.680888175964355, + "learning_rate": 9.927498226451376e-06, + "loss": 5.3261, + "step": 10765 + }, + { + "epoch": 0.2191162109375, + "grad_norm": 16.891603469848633, + "learning_rate": 9.927430394444093e-06, + "loss": 5.3431, + "step": 10770 + }, + { + "epoch": 0.21921793619791666, + "grad_norm": 20.22040557861328, + "learning_rate": 9.927362530952097e-06, + "loss": 5.1625, + "step": 10775 + }, + { + "epoch": 0.21931966145833334, + "grad_norm": 15.977385520935059, + "learning_rate": 9.927294635975817e-06, + "loss": 5.6603, + "step": 10780 + }, + { + "epoch": 0.21942138671875, + "grad_norm": 14.567987442016602, + "learning_rate": 9.92722670951569e-06, + "loss": 5.5649, + "step": 10785 + }, + { + "epoch": 0.21952311197916666, + "grad_norm": 15.366209983825684, + "learning_rate": 9.927158751572148e-06, + "loss": 5.2923, + "step": 10790 + }, + { + "epoch": 0.21962483723958334, + "grad_norm": 14.09630012512207, + "learning_rate": 9.927090762145624e-06, + "loss": 5.2504, + "step": 10795 + }, + { + "epoch": 0.2197265625, + "grad_norm": 15.382092475891113, + "learning_rate": 9.927022741236555e-06, + "loss": 5.2666, + "step": 10800 + }, + { + "epoch": 0.21982828776041666, + "grad_norm": 18.758573532104492, + "learning_rate": 9.926954688845376e-06, + "loss": 5.7603, + "step": 10805 + }, + { + "epoch": 0.21993001302083334, + "grad_norm": 14.367460250854492, + "learning_rate": 9.926886604972518e-06, + "loss": 5.3511, + "step": 10810 + }, + { + "epoch": 0.22003173828125, + "grad_norm": 16.524770736694336, + "learning_rate": 9.926818489618422e-06, + "loss": 5.483, + "step": 10815 + }, + { + "epoch": 0.22013346354166666, + "grad_norm": 16.649320602416992, + "learning_rate": 9.926750342783518e-06, + "loss": 5.439, + "step": 10820 + }, + { + "epoch": 0.22023518880208334, + "grad_norm": 14.244699478149414, + "learning_rate": 9.926682164468243e-06, + "loss": 5.2544, + "step": 10825 + }, + { + "epoch": 0.2203369140625, + "grad_norm": 14.596467018127441, + "learning_rate": 9.926613954673033e-06, + "loss": 5.6126, + "step": 10830 + }, + { + "epoch": 0.22043863932291666, + "grad_norm": 18.143362045288086, + "learning_rate": 9.926545713398323e-06, + "loss": 5.5758, + "step": 10835 + }, + { + "epoch": 0.22054036458333334, + "grad_norm": 10.800374031066895, + "learning_rate": 9.926477440644551e-06, + "loss": 5.2667, + "step": 10840 + }, + { + "epoch": 0.22064208984375, + "grad_norm": 18.664342880249023, + "learning_rate": 9.926409136412152e-06, + "loss": 5.4899, + "step": 10845 + }, + { + "epoch": 0.22074381510416666, + "grad_norm": 15.275148391723633, + "learning_rate": 9.926340800701563e-06, + "loss": 5.2889, + "step": 10850 + }, + { + "epoch": 0.22084554036458334, + "grad_norm": 13.240418434143066, + "learning_rate": 9.92627243351322e-06, + "loss": 5.4171, + "step": 10855 + }, + { + "epoch": 0.220947265625, + "grad_norm": 14.302739143371582, + "learning_rate": 9.926204034847558e-06, + "loss": 5.2721, + "step": 10860 + }, + { + "epoch": 0.22104899088541666, + "grad_norm": 14.059492111206055, + "learning_rate": 9.926135604705017e-06, + "loss": 5.2772, + "step": 10865 + }, + { + "epoch": 0.22115071614583334, + "grad_norm": 19.67574119567871, + "learning_rate": 9.926067143086033e-06, + "loss": 5.3148, + "step": 10870 + }, + { + "epoch": 0.22125244140625, + "grad_norm": 23.238733291625977, + "learning_rate": 9.925998649991045e-06, + "loss": 5.2015, + "step": 10875 + }, + { + "epoch": 0.22135416666666666, + "grad_norm": 17.144943237304688, + "learning_rate": 9.925930125420488e-06, + "loss": 5.1101, + "step": 10880 + }, + { + "epoch": 0.22145589192708334, + "grad_norm": 27.19576072692871, + "learning_rate": 9.925861569374803e-06, + "loss": 5.6229, + "step": 10885 + }, + { + "epoch": 0.2215576171875, + "grad_norm": 15.423973083496094, + "learning_rate": 9.925792981854425e-06, + "loss": 5.4257, + "step": 10890 + }, + { + "epoch": 0.22165934244791666, + "grad_norm": 14.780378341674805, + "learning_rate": 9.925724362859793e-06, + "loss": 5.2689, + "step": 10895 + }, + { + "epoch": 0.22176106770833334, + "grad_norm": 14.874410629272461, + "learning_rate": 9.925655712391347e-06, + "loss": 5.3388, + "step": 10900 + }, + { + "epoch": 0.22186279296875, + "grad_norm": 19.371028900146484, + "learning_rate": 9.925587030449525e-06, + "loss": 5.0197, + "step": 10905 + }, + { + "epoch": 0.22196451822916666, + "grad_norm": 16.316112518310547, + "learning_rate": 9.925518317034763e-06, + "loss": 5.4377, + "step": 10910 + }, + { + "epoch": 0.22206624348958334, + "grad_norm": 18.871553421020508, + "learning_rate": 9.925449572147504e-06, + "loss": 5.2467, + "step": 10915 + }, + { + "epoch": 0.22216796875, + "grad_norm": 17.60874366760254, + "learning_rate": 9.925380795788186e-06, + "loss": 5.4808, + "step": 10920 + }, + { + "epoch": 0.22226969401041666, + "grad_norm": 18.644102096557617, + "learning_rate": 9.925311987957249e-06, + "loss": 5.439, + "step": 10925 + }, + { + "epoch": 0.22237141927083334, + "grad_norm": 10.596040725708008, + "learning_rate": 9.92524314865513e-06, + "loss": 5.4704, + "step": 10930 + }, + { + "epoch": 0.22247314453125, + "grad_norm": 16.087642669677734, + "learning_rate": 9.925174277882272e-06, + "loss": 5.2982, + "step": 10935 + }, + { + "epoch": 0.22257486979166666, + "grad_norm": 17.60976219177246, + "learning_rate": 9.925105375639113e-06, + "loss": 5.3407, + "step": 10940 + }, + { + "epoch": 0.22267659505208334, + "grad_norm": 15.404062271118164, + "learning_rate": 9.925036441926095e-06, + "loss": 5.5071, + "step": 10945 + }, + { + "epoch": 0.2227783203125, + "grad_norm": 14.568496704101562, + "learning_rate": 9.924967476743659e-06, + "loss": 5.3571, + "step": 10950 + }, + { + "epoch": 0.22288004557291666, + "grad_norm": 16.332616806030273, + "learning_rate": 9.92489848009224e-06, + "loss": 5.5581, + "step": 10955 + }, + { + "epoch": 0.22298177083333334, + "grad_norm": 15.837993621826172, + "learning_rate": 9.924829451972286e-06, + "loss": 5.269, + "step": 10960 + }, + { + "epoch": 0.22308349609375, + "grad_norm": 11.626983642578125, + "learning_rate": 9.924760392384236e-06, + "loss": 5.3112, + "step": 10965 + }, + { + "epoch": 0.22318522135416666, + "grad_norm": 15.872383117675781, + "learning_rate": 9.924691301328529e-06, + "loss": 5.4534, + "step": 10970 + }, + { + "epoch": 0.22328694661458334, + "grad_norm": 20.933395385742188, + "learning_rate": 9.924622178805608e-06, + "loss": 5.4812, + "step": 10975 + }, + { + "epoch": 0.223388671875, + "grad_norm": 16.644472122192383, + "learning_rate": 9.924553024815913e-06, + "loss": 5.1479, + "step": 10980 + }, + { + "epoch": 0.22349039713541666, + "grad_norm": 13.605578422546387, + "learning_rate": 9.92448383935989e-06, + "loss": 5.4397, + "step": 10985 + }, + { + "epoch": 0.22359212239583334, + "grad_norm": 16.043922424316406, + "learning_rate": 9.924414622437976e-06, + "loss": 5.2592, + "step": 10990 + }, + { + "epoch": 0.22369384765625, + "grad_norm": 18.486297607421875, + "learning_rate": 9.92434537405062e-06, + "loss": 5.3101, + "step": 10995 + }, + { + "epoch": 0.22379557291666666, + "grad_norm": 15.251697540283203, + "learning_rate": 9.924276094198256e-06, + "loss": 5.3082, + "step": 11000 + }, + { + "epoch": 0.22389729817708334, + "grad_norm": 18.829050064086914, + "learning_rate": 9.924206782881332e-06, + "loss": 5.3558, + "step": 11005 + }, + { + "epoch": 0.2239990234375, + "grad_norm": 14.641939163208008, + "learning_rate": 9.924137440100291e-06, + "loss": 5.3185, + "step": 11010 + }, + { + "epoch": 0.22410074869791666, + "grad_norm": 16.661245346069336, + "learning_rate": 9.924068065855575e-06, + "loss": 5.0898, + "step": 11015 + }, + { + "epoch": 0.22420247395833334, + "grad_norm": 18.78082275390625, + "learning_rate": 9.923998660147626e-06, + "loss": 5.4065, + "step": 11020 + }, + { + "epoch": 0.22430419921875, + "grad_norm": 13.413220405578613, + "learning_rate": 9.92392922297689e-06, + "loss": 5.6314, + "step": 11025 + }, + { + "epoch": 0.22440592447916666, + "grad_norm": 9.985331535339355, + "learning_rate": 9.923859754343807e-06, + "loss": 5.1839, + "step": 11030 + }, + { + "epoch": 0.22450764973958334, + "grad_norm": 17.758201599121094, + "learning_rate": 9.923790254248824e-06, + "loss": 5.2142, + "step": 11035 + }, + { + "epoch": 0.224609375, + "grad_norm": 16.721878051757812, + "learning_rate": 9.923720722692384e-06, + "loss": 5.5267, + "step": 11040 + }, + { + "epoch": 0.22471110026041666, + "grad_norm": 14.779067039489746, + "learning_rate": 9.923651159674934e-06, + "loss": 5.5714, + "step": 11045 + }, + { + "epoch": 0.22481282552083334, + "grad_norm": 14.458312034606934, + "learning_rate": 9.923581565196914e-06, + "loss": 5.4943, + "step": 11050 + }, + { + "epoch": 0.22491455078125, + "grad_norm": 20.336164474487305, + "learning_rate": 9.923511939258771e-06, + "loss": 5.4291, + "step": 11055 + }, + { + "epoch": 0.22501627604166666, + "grad_norm": 17.566293716430664, + "learning_rate": 9.92344228186095e-06, + "loss": 5.5792, + "step": 11060 + }, + { + "epoch": 0.22511800130208334, + "grad_norm": 20.56073760986328, + "learning_rate": 9.923372593003896e-06, + "loss": 5.2013, + "step": 11065 + }, + { + "epoch": 0.2252197265625, + "grad_norm": 12.243074417114258, + "learning_rate": 9.923302872688055e-06, + "loss": 5.3995, + "step": 11070 + }, + { + "epoch": 0.22532145182291666, + "grad_norm": 19.311731338500977, + "learning_rate": 9.923233120913869e-06, + "loss": 5.353, + "step": 11075 + }, + { + "epoch": 0.22542317708333334, + "grad_norm": 15.699078559875488, + "learning_rate": 9.923163337681787e-06, + "loss": 5.3309, + "step": 11080 + }, + { + "epoch": 0.22552490234375, + "grad_norm": 17.712749481201172, + "learning_rate": 9.923093522992256e-06, + "loss": 5.2115, + "step": 11085 + }, + { + "epoch": 0.22562662760416666, + "grad_norm": 18.241840362548828, + "learning_rate": 9.923023676845718e-06, + "loss": 5.043, + "step": 11090 + }, + { + "epoch": 0.22572835286458334, + "grad_norm": 12.240376472473145, + "learning_rate": 9.922953799242621e-06, + "loss": 5.4805, + "step": 11095 + }, + { + "epoch": 0.225830078125, + "grad_norm": 14.644441604614258, + "learning_rate": 9.922883890183414e-06, + "loss": 4.9734, + "step": 11100 + }, + { + "epoch": 0.22593180338541666, + "grad_norm": 12.81989860534668, + "learning_rate": 9.92281394966854e-06, + "loss": 5.3942, + "step": 11105 + }, + { + "epoch": 0.22603352864583334, + "grad_norm": 16.785377502441406, + "learning_rate": 9.922743977698449e-06, + "loss": 5.2565, + "step": 11110 + }, + { + "epoch": 0.22613525390625, + "grad_norm": 25.074472427368164, + "learning_rate": 9.922673974273586e-06, + "loss": 5.5159, + "step": 11115 + }, + { + "epoch": 0.22623697916666666, + "grad_norm": 15.324980735778809, + "learning_rate": 9.922603939394398e-06, + "loss": 5.3429, + "step": 11120 + }, + { + "epoch": 0.22633870442708334, + "grad_norm": 20.589277267456055, + "learning_rate": 9.922533873061335e-06, + "loss": 5.668, + "step": 11125 + }, + { + "epoch": 0.2264404296875, + "grad_norm": 15.878705024719238, + "learning_rate": 9.922463775274843e-06, + "loss": 4.9403, + "step": 11130 + }, + { + "epoch": 0.22654215494791666, + "grad_norm": 16.942441940307617, + "learning_rate": 9.922393646035369e-06, + "loss": 5.2795, + "step": 11135 + }, + { + "epoch": 0.22664388020833334, + "grad_norm": 16.9084415435791, + "learning_rate": 9.922323485343363e-06, + "loss": 5.2458, + "step": 11140 + }, + { + "epoch": 0.22674560546875, + "grad_norm": 16.643247604370117, + "learning_rate": 9.922253293199274e-06, + "loss": 5.1475, + "step": 11145 + }, + { + "epoch": 0.22684733072916666, + "grad_norm": 17.542455673217773, + "learning_rate": 9.922183069603546e-06, + "loss": 5.4482, + "step": 11150 + }, + { + "epoch": 0.22694905598958334, + "grad_norm": 19.123626708984375, + "learning_rate": 9.922112814556632e-06, + "loss": 5.1746, + "step": 11155 + }, + { + "epoch": 0.22705078125, + "grad_norm": 10.867239952087402, + "learning_rate": 9.92204252805898e-06, + "loss": 5.4632, + "step": 11160 + }, + { + "epoch": 0.22715250651041666, + "grad_norm": 18.780235290527344, + "learning_rate": 9.921972210111037e-06, + "loss": 5.4827, + "step": 11165 + }, + { + "epoch": 0.22725423177083334, + "grad_norm": 19.23054313659668, + "learning_rate": 9.921901860713256e-06, + "loss": 5.2964, + "step": 11170 + }, + { + "epoch": 0.22735595703125, + "grad_norm": 16.690725326538086, + "learning_rate": 9.921831479866084e-06, + "loss": 5.325, + "step": 11175 + }, + { + "epoch": 0.22745768229166666, + "grad_norm": 18.0146484375, + "learning_rate": 9.921761067569972e-06, + "loss": 5.4599, + "step": 11180 + }, + { + "epoch": 0.22755940755208334, + "grad_norm": 18.687297821044922, + "learning_rate": 9.921690623825367e-06, + "loss": 5.2669, + "step": 11185 + }, + { + "epoch": 0.2276611328125, + "grad_norm": 18.05073356628418, + "learning_rate": 9.921620148632722e-06, + "loss": 5.5812, + "step": 11190 + }, + { + "epoch": 0.22776285807291666, + "grad_norm": 13.756197929382324, + "learning_rate": 9.921549641992489e-06, + "loss": 5.4462, + "step": 11195 + }, + { + "epoch": 0.22786458333333334, + "grad_norm": 28.080230712890625, + "learning_rate": 9.921479103905114e-06, + "loss": 5.3936, + "step": 11200 + }, + { + "epoch": 0.22796630859375, + "grad_norm": 16.958961486816406, + "learning_rate": 9.92140853437105e-06, + "loss": 5.3729, + "step": 11205 + }, + { + "epoch": 0.22806803385416666, + "grad_norm": 15.318947792053223, + "learning_rate": 9.921337933390747e-06, + "loss": 5.4142, + "step": 11210 + }, + { + "epoch": 0.22816975911458334, + "grad_norm": 17.901042938232422, + "learning_rate": 9.92126730096466e-06, + "loss": 5.4022, + "step": 11215 + }, + { + "epoch": 0.228271484375, + "grad_norm": 13.08847427368164, + "learning_rate": 9.921196637093234e-06, + "loss": 5.4289, + "step": 11220 + }, + { + "epoch": 0.22837320963541666, + "grad_norm": 14.198060035705566, + "learning_rate": 9.921125941776924e-06, + "loss": 5.1327, + "step": 11225 + }, + { + "epoch": 0.22847493489583334, + "grad_norm": 12.00886344909668, + "learning_rate": 9.921055215016182e-06, + "loss": 5.082, + "step": 11230 + }, + { + "epoch": 0.22857666015625, + "grad_norm": 11.952877044677734, + "learning_rate": 9.920984456811458e-06, + "loss": 5.1023, + "step": 11235 + }, + { + "epoch": 0.22867838541666666, + "grad_norm": 14.962508201599121, + "learning_rate": 9.920913667163208e-06, + "loss": 5.2174, + "step": 11240 + }, + { + "epoch": 0.22878011067708334, + "grad_norm": 14.28534984588623, + "learning_rate": 9.92084284607188e-06, + "loss": 5.2867, + "step": 11245 + }, + { + "epoch": 0.2288818359375, + "grad_norm": 19.806854248046875, + "learning_rate": 9.920771993537928e-06, + "loss": 5.2686, + "step": 11250 + }, + { + "epoch": 0.22898356119791666, + "grad_norm": 13.112110137939453, + "learning_rate": 9.920701109561805e-06, + "loss": 5.1551, + "step": 11255 + }, + { + "epoch": 0.22908528645833334, + "grad_norm": 23.2834415435791, + "learning_rate": 9.920630194143965e-06, + "loss": 5.3788, + "step": 11260 + }, + { + "epoch": 0.22918701171875, + "grad_norm": 21.870948791503906, + "learning_rate": 9.920559247284861e-06, + "loss": 5.6256, + "step": 11265 + }, + { + "epoch": 0.22928873697916666, + "grad_norm": 19.48235321044922, + "learning_rate": 9.920488268984945e-06, + "loss": 5.1173, + "step": 11270 + }, + { + "epoch": 0.22939046223958334, + "grad_norm": 13.4725923538208, + "learning_rate": 9.92041725924467e-06, + "loss": 5.6017, + "step": 11275 + }, + { + "epoch": 0.2294921875, + "grad_norm": 20.870407104492188, + "learning_rate": 9.92034621806449e-06, + "loss": 5.2355, + "step": 11280 + }, + { + "epoch": 0.22959391276041666, + "grad_norm": 17.87741470336914, + "learning_rate": 9.92027514544486e-06, + "loss": 5.3057, + "step": 11285 + }, + { + "epoch": 0.22969563802083334, + "grad_norm": 25.48149871826172, + "learning_rate": 9.920204041386234e-06, + "loss": 5.1794, + "step": 11290 + }, + { + "epoch": 0.22979736328125, + "grad_norm": 17.74811363220215, + "learning_rate": 9.920132905889066e-06, + "loss": 5.2934, + "step": 11295 + }, + { + "epoch": 0.22989908854166666, + "grad_norm": 16.657516479492188, + "learning_rate": 9.920061738953812e-06, + "loss": 5.6948, + "step": 11300 + }, + { + "epoch": 0.23000081380208334, + "grad_norm": 11.515690803527832, + "learning_rate": 9.919990540580924e-06, + "loss": 5.3865, + "step": 11305 + }, + { + "epoch": 0.2301025390625, + "grad_norm": 19.74277114868164, + "learning_rate": 9.919919310770857e-06, + "loss": 5.5539, + "step": 11310 + }, + { + "epoch": 0.23020426432291666, + "grad_norm": 16.45286750793457, + "learning_rate": 9.91984804952407e-06, + "loss": 5.4233, + "step": 11315 + }, + { + "epoch": 0.23030598958333334, + "grad_norm": 14.553608894348145, + "learning_rate": 9.919776756841015e-06, + "loss": 5.3084, + "step": 11320 + }, + { + "epoch": 0.23040771484375, + "grad_norm": 19.58328628540039, + "learning_rate": 9.919705432722146e-06, + "loss": 5.2314, + "step": 11325 + }, + { + "epoch": 0.23050944010416666, + "grad_norm": 15.909989356994629, + "learning_rate": 9.919634077167924e-06, + "loss": 5.302, + "step": 11330 + }, + { + "epoch": 0.23061116536458334, + "grad_norm": 15.903388023376465, + "learning_rate": 9.919562690178802e-06, + "loss": 5.344, + "step": 11335 + }, + { + "epoch": 0.230712890625, + "grad_norm": 15.604330062866211, + "learning_rate": 9.919491271755233e-06, + "loss": 5.1503, + "step": 11340 + }, + { + "epoch": 0.23081461588541666, + "grad_norm": 17.470552444458008, + "learning_rate": 9.919419821897679e-06, + "loss": 5.2573, + "step": 11345 + }, + { + "epoch": 0.23091634114583334, + "grad_norm": 20.808185577392578, + "learning_rate": 9.919348340606593e-06, + "loss": 5.3624, + "step": 11350 + }, + { + "epoch": 0.23101806640625, + "grad_norm": 26.364734649658203, + "learning_rate": 9.919276827882433e-06, + "loss": 5.6511, + "step": 11355 + }, + { + "epoch": 0.23111979166666666, + "grad_norm": 15.393523216247559, + "learning_rate": 9.919205283725656e-06, + "loss": 5.2404, + "step": 11360 + }, + { + "epoch": 0.23122151692708334, + "grad_norm": 13.97301197052002, + "learning_rate": 9.919133708136718e-06, + "loss": 5.29, + "step": 11365 + }, + { + "epoch": 0.2313232421875, + "grad_norm": 15.947701454162598, + "learning_rate": 9.919062101116077e-06, + "loss": 5.5595, + "step": 11370 + }, + { + "epoch": 0.23142496744791666, + "grad_norm": 12.809492111206055, + "learning_rate": 9.918990462664192e-06, + "loss": 5.2704, + "step": 11375 + }, + { + "epoch": 0.23152669270833334, + "grad_norm": 14.041559219360352, + "learning_rate": 9.918918792781517e-06, + "loss": 5.2971, + "step": 11380 + }, + { + "epoch": 0.23162841796875, + "grad_norm": 15.73489761352539, + "learning_rate": 9.918847091468516e-06, + "loss": 5.3105, + "step": 11385 + }, + { + "epoch": 0.23173014322916666, + "grad_norm": 14.323838233947754, + "learning_rate": 9.91877535872564e-06, + "loss": 5.4613, + "step": 11390 + }, + { + "epoch": 0.23183186848958334, + "grad_norm": 15.235762596130371, + "learning_rate": 9.918703594553354e-06, + "loss": 5.2745, + "step": 11395 + }, + { + "epoch": 0.23193359375, + "grad_norm": 16.120189666748047, + "learning_rate": 9.918631798952111e-06, + "loss": 5.739, + "step": 11400 + }, + { + "epoch": 0.23203531901041666, + "grad_norm": 15.954242706298828, + "learning_rate": 9.918559971922373e-06, + "loss": 5.3396, + "step": 11405 + }, + { + "epoch": 0.23213704427083334, + "grad_norm": 21.036216735839844, + "learning_rate": 9.918488113464599e-06, + "loss": 5.3148, + "step": 11410 + }, + { + "epoch": 0.23223876953125, + "grad_norm": 12.912982940673828, + "learning_rate": 9.918416223579246e-06, + "loss": 5.4238, + "step": 11415 + }, + { + "epoch": 0.23234049479166666, + "grad_norm": 18.47644805908203, + "learning_rate": 9.918344302266775e-06, + "loss": 5.4361, + "step": 11420 + }, + { + "epoch": 0.23244222005208334, + "grad_norm": 16.87586212158203, + "learning_rate": 9.918272349527647e-06, + "loss": 5.3158, + "step": 11425 + }, + { + "epoch": 0.2325439453125, + "grad_norm": 13.526440620422363, + "learning_rate": 9.918200365362318e-06, + "loss": 4.9923, + "step": 11430 + }, + { + "epoch": 0.23264567057291666, + "grad_norm": 18.302568435668945, + "learning_rate": 9.91812834977125e-06, + "loss": 5.2819, + "step": 11435 + }, + { + "epoch": 0.23274739583333334, + "grad_norm": 20.151439666748047, + "learning_rate": 9.918056302754904e-06, + "loss": 5.5669, + "step": 11440 + }, + { + "epoch": 0.23284912109375, + "grad_norm": 18.488046646118164, + "learning_rate": 9.917984224313739e-06, + "loss": 5.4017, + "step": 11445 + }, + { + "epoch": 0.23295084635416666, + "grad_norm": 14.662923812866211, + "learning_rate": 9.917912114448213e-06, + "loss": 5.4916, + "step": 11450 + }, + { + "epoch": 0.23305257161458334, + "grad_norm": 16.72043800354004, + "learning_rate": 9.917839973158794e-06, + "loss": 5.4239, + "step": 11455 + }, + { + "epoch": 0.233154296875, + "grad_norm": 11.697235107421875, + "learning_rate": 9.917767800445937e-06, + "loss": 5.4834, + "step": 11460 + }, + { + "epoch": 0.23325602213541666, + "grad_norm": 19.719505310058594, + "learning_rate": 9.917695596310105e-06, + "loss": 5.363, + "step": 11465 + }, + { + "epoch": 0.23335774739583334, + "grad_norm": 16.558338165283203, + "learning_rate": 9.91762336075176e-06, + "loss": 5.2564, + "step": 11470 + }, + { + "epoch": 0.23345947265625, + "grad_norm": 19.245071411132812, + "learning_rate": 9.91755109377136e-06, + "loss": 5.7089, + "step": 11475 + }, + { + "epoch": 0.23356119791666666, + "grad_norm": 13.90700626373291, + "learning_rate": 9.917478795369373e-06, + "loss": 5.4614, + "step": 11480 + }, + { + "epoch": 0.23366292317708334, + "grad_norm": 16.621875762939453, + "learning_rate": 9.917406465546256e-06, + "loss": 5.3657, + "step": 11485 + }, + { + "epoch": 0.2337646484375, + "grad_norm": 16.89861297607422, + "learning_rate": 9.917334104302474e-06, + "loss": 5.2258, + "step": 11490 + }, + { + "epoch": 0.23386637369791666, + "grad_norm": 17.43910789489746, + "learning_rate": 9.917261711638485e-06, + "loss": 5.3355, + "step": 11495 + }, + { + "epoch": 0.23396809895833334, + "grad_norm": 17.635866165161133, + "learning_rate": 9.917189287554759e-06, + "loss": 5.3748, + "step": 11500 + }, + { + "epoch": 0.23406982421875, + "grad_norm": 24.73565673828125, + "learning_rate": 9.917116832051749e-06, + "loss": 5.6308, + "step": 11505 + }, + { + "epoch": 0.23417154947916666, + "grad_norm": 18.128543853759766, + "learning_rate": 9.917044345129928e-06, + "loss": 5.528, + "step": 11510 + }, + { + "epoch": 0.23427327473958334, + "grad_norm": 14.548800468444824, + "learning_rate": 9.916971826789752e-06, + "loss": 5.3592, + "step": 11515 + }, + { + "epoch": 0.234375, + "grad_norm": 15.115767478942871, + "learning_rate": 9.916899277031689e-06, + "loss": 5.2778, + "step": 11520 + }, + { + "epoch": 0.23447672526041666, + "grad_norm": 14.538713455200195, + "learning_rate": 9.916826695856198e-06, + "loss": 5.3836, + "step": 11525 + }, + { + "epoch": 0.23457845052083334, + "grad_norm": 17.736703872680664, + "learning_rate": 9.916754083263746e-06, + "loss": 5.3108, + "step": 11530 + }, + { + "epoch": 0.23468017578125, + "grad_norm": 16.9951229095459, + "learning_rate": 9.916681439254796e-06, + "loss": 5.7118, + "step": 11535 + }, + { + "epoch": 0.23478190104166666, + "grad_norm": 16.300838470458984, + "learning_rate": 9.916608763829813e-06, + "loss": 5.1346, + "step": 11540 + }, + { + "epoch": 0.23488362630208334, + "grad_norm": 16.070301055908203, + "learning_rate": 9.916536056989262e-06, + "loss": 5.5255, + "step": 11545 + }, + { + "epoch": 0.2349853515625, + "grad_norm": 14.517631530761719, + "learning_rate": 9.916463318733605e-06, + "loss": 5.3374, + "step": 11550 + }, + { + "epoch": 0.23508707682291666, + "grad_norm": 12.939146041870117, + "learning_rate": 9.916390549063307e-06, + "loss": 5.2248, + "step": 11555 + }, + { + "epoch": 0.23518880208333334, + "grad_norm": 14.36535930633545, + "learning_rate": 9.916317747978836e-06, + "loss": 5.5883, + "step": 11560 + }, + { + "epoch": 0.23529052734375, + "grad_norm": 13.610946655273438, + "learning_rate": 9.916244915480654e-06, + "loss": 5.2965, + "step": 11565 + }, + { + "epoch": 0.23539225260416666, + "grad_norm": 12.347288131713867, + "learning_rate": 9.916172051569228e-06, + "loss": 5.1467, + "step": 11570 + }, + { + "epoch": 0.23549397786458334, + "grad_norm": 13.50830364227295, + "learning_rate": 9.916099156245023e-06, + "loss": 5.1518, + "step": 11575 + }, + { + "epoch": 0.235595703125, + "grad_norm": 15.931774139404297, + "learning_rate": 9.916026229508504e-06, + "loss": 5.4332, + "step": 11580 + }, + { + "epoch": 0.23569742838541666, + "grad_norm": 16.951866149902344, + "learning_rate": 9.91595327136014e-06, + "loss": 5.2541, + "step": 11585 + }, + { + "epoch": 0.23579915364583334, + "grad_norm": 14.520648956298828, + "learning_rate": 9.915880281800394e-06, + "loss": 5.5142, + "step": 11590 + }, + { + "epoch": 0.23590087890625, + "grad_norm": 15.701983451843262, + "learning_rate": 9.915807260829734e-06, + "loss": 5.4125, + "step": 11595 + }, + { + "epoch": 0.23600260416666666, + "grad_norm": 15.495074272155762, + "learning_rate": 9.915734208448625e-06, + "loss": 5.5551, + "step": 11600 + }, + { + "epoch": 0.23610432942708334, + "grad_norm": 13.541733741760254, + "learning_rate": 9.915661124657536e-06, + "loss": 5.4763, + "step": 11605 + }, + { + "epoch": 0.2362060546875, + "grad_norm": 14.557682037353516, + "learning_rate": 9.915588009456932e-06, + "loss": 5.132, + "step": 11610 + }, + { + "epoch": 0.23630777994791666, + "grad_norm": 16.362730026245117, + "learning_rate": 9.915514862847281e-06, + "loss": 5.4559, + "step": 11615 + }, + { + "epoch": 0.23640950520833334, + "grad_norm": 25.88382911682129, + "learning_rate": 9.915441684829052e-06, + "loss": 5.5915, + "step": 11620 + }, + { + "epoch": 0.23651123046875, + "grad_norm": 16.849708557128906, + "learning_rate": 9.91536847540271e-06, + "loss": 5.3905, + "step": 11625 + }, + { + "epoch": 0.23661295572916666, + "grad_norm": 12.6632080078125, + "learning_rate": 9.915295234568723e-06, + "loss": 5.3941, + "step": 11630 + }, + { + "epoch": 0.23671468098958334, + "grad_norm": 17.52554702758789, + "learning_rate": 9.91522196232756e-06, + "loss": 5.3846, + "step": 11635 + }, + { + "epoch": 0.23681640625, + "grad_norm": 24.28203582763672, + "learning_rate": 9.91514865867969e-06, + "loss": 5.4626, + "step": 11640 + }, + { + "epoch": 0.23691813151041666, + "grad_norm": 12.64306640625, + "learning_rate": 9.915075323625581e-06, + "loss": 5.4558, + "step": 11645 + }, + { + "epoch": 0.23701985677083334, + "grad_norm": 15.400808334350586, + "learning_rate": 9.9150019571657e-06, + "loss": 5.298, + "step": 11650 + }, + { + "epoch": 0.23712158203125, + "grad_norm": 21.285568237304688, + "learning_rate": 9.914928559300516e-06, + "loss": 5.4177, + "step": 11655 + }, + { + "epoch": 0.23722330729166666, + "grad_norm": 18.820646286010742, + "learning_rate": 9.914855130030498e-06, + "loss": 5.447, + "step": 11660 + }, + { + "epoch": 0.23732503255208334, + "grad_norm": 12.44219970703125, + "learning_rate": 9.914781669356118e-06, + "loss": 5.1628, + "step": 11665 + }, + { + "epoch": 0.2374267578125, + "grad_norm": 19.473995208740234, + "learning_rate": 9.914708177277842e-06, + "loss": 5.0805, + "step": 11670 + }, + { + "epoch": 0.23752848307291666, + "grad_norm": 16.17839813232422, + "learning_rate": 9.91463465379614e-06, + "loss": 5.1934, + "step": 11675 + }, + { + "epoch": 0.23763020833333334, + "grad_norm": 14.6658935546875, + "learning_rate": 9.914561098911485e-06, + "loss": 5.1579, + "step": 11680 + }, + { + "epoch": 0.23773193359375, + "grad_norm": 13.673337936401367, + "learning_rate": 9.914487512624344e-06, + "loss": 5.0133, + "step": 11685 + }, + { + "epoch": 0.23783365885416666, + "grad_norm": 14.668375968933105, + "learning_rate": 9.914413894935186e-06, + "loss": 5.4663, + "step": 11690 + }, + { + "epoch": 0.23793538411458334, + "grad_norm": 13.679025650024414, + "learning_rate": 9.914340245844486e-06, + "loss": 5.2984, + "step": 11695 + }, + { + "epoch": 0.238037109375, + "grad_norm": 15.149916648864746, + "learning_rate": 9.914266565352711e-06, + "loss": 5.5831, + "step": 11700 + }, + { + "epoch": 0.23813883463541666, + "grad_norm": 14.620588302612305, + "learning_rate": 9.914192853460332e-06, + "loss": 5.6003, + "step": 11705 + }, + { + "epoch": 0.23824055989583334, + "grad_norm": 16.815488815307617, + "learning_rate": 9.914119110167822e-06, + "loss": 5.2178, + "step": 11710 + }, + { + "epoch": 0.23834228515625, + "grad_norm": 16.254566192626953, + "learning_rate": 9.914045335475649e-06, + "loss": 5.2337, + "step": 11715 + }, + { + "epoch": 0.23844401041666666, + "grad_norm": 14.741978645324707, + "learning_rate": 9.913971529384287e-06, + "loss": 5.291, + "step": 11720 + }, + { + "epoch": 0.23854573567708334, + "grad_norm": 13.600303649902344, + "learning_rate": 9.913897691894206e-06, + "loss": 5.0856, + "step": 11725 + }, + { + "epoch": 0.2386474609375, + "grad_norm": 19.835275650024414, + "learning_rate": 9.91382382300588e-06, + "loss": 5.1442, + "step": 11730 + }, + { + "epoch": 0.23874918619791666, + "grad_norm": 15.405753135681152, + "learning_rate": 9.913749922719778e-06, + "loss": 5.2236, + "step": 11735 + }, + { + "epoch": 0.23885091145833334, + "grad_norm": 11.911864280700684, + "learning_rate": 9.913675991036376e-06, + "loss": 5.295, + "step": 11740 + }, + { + "epoch": 0.23895263671875, + "grad_norm": 17.82322883605957, + "learning_rate": 9.913602027956141e-06, + "loss": 5.4745, + "step": 11745 + }, + { + "epoch": 0.23905436197916666, + "grad_norm": 20.417970657348633, + "learning_rate": 9.913528033479552e-06, + "loss": 5.561, + "step": 11750 + }, + { + "epoch": 0.23915608723958334, + "grad_norm": 17.45111656188965, + "learning_rate": 9.913454007607078e-06, + "loss": 5.4471, + "step": 11755 + }, + { + "epoch": 0.2392578125, + "grad_norm": 14.76699447631836, + "learning_rate": 9.913379950339193e-06, + "loss": 5.1839, + "step": 11760 + }, + { + "epoch": 0.23935953776041666, + "grad_norm": 19.194272994995117, + "learning_rate": 9.913305861676367e-06, + "loss": 5.0897, + "step": 11765 + }, + { + "epoch": 0.23946126302083334, + "grad_norm": 15.189436912536621, + "learning_rate": 9.913231741619079e-06, + "loss": 5.3003, + "step": 11770 + }, + { + "epoch": 0.23956298828125, + "grad_norm": 13.320898056030273, + "learning_rate": 9.9131575901678e-06, + "loss": 4.9894, + "step": 11775 + }, + { + "epoch": 0.23966471354166666, + "grad_norm": 13.187898635864258, + "learning_rate": 9.913083407323001e-06, + "loss": 5.1369, + "step": 11780 + }, + { + "epoch": 0.23976643880208334, + "grad_norm": 12.713990211486816, + "learning_rate": 9.913009193085161e-06, + "loss": 5.2482, + "step": 11785 + }, + { + "epoch": 0.2398681640625, + "grad_norm": 15.120379447937012, + "learning_rate": 9.91293494745475e-06, + "loss": 5.1755, + "step": 11790 + }, + { + "epoch": 0.23996988932291666, + "grad_norm": 13.006828308105469, + "learning_rate": 9.912860670432248e-06, + "loss": 5.3543, + "step": 11795 + }, + { + "epoch": 0.24007161458333334, + "grad_norm": 14.492498397827148, + "learning_rate": 9.912786362018124e-06, + "loss": 5.3866, + "step": 11800 + }, + { + "epoch": 0.24017333984375, + "grad_norm": 16.359664916992188, + "learning_rate": 9.912712022212853e-06, + "loss": 5.2019, + "step": 11805 + }, + { + "epoch": 0.24027506510416666, + "grad_norm": 19.630830764770508, + "learning_rate": 9.912637651016913e-06, + "loss": 5.2812, + "step": 11810 + }, + { + "epoch": 0.24037679036458334, + "grad_norm": 15.795555114746094, + "learning_rate": 9.912563248430778e-06, + "loss": 5.0143, + "step": 11815 + }, + { + "epoch": 0.240478515625, + "grad_norm": 16.317773818969727, + "learning_rate": 9.912488814454921e-06, + "loss": 5.5265, + "step": 11820 + }, + { + "epoch": 0.24058024088541666, + "grad_norm": 14.824578285217285, + "learning_rate": 9.912414349089823e-06, + "loss": 5.1051, + "step": 11825 + }, + { + "epoch": 0.24068196614583334, + "grad_norm": 13.400445938110352, + "learning_rate": 9.912339852335955e-06, + "loss": 5.6189, + "step": 11830 + }, + { + "epoch": 0.24078369140625, + "grad_norm": 20.81941032409668, + "learning_rate": 9.912265324193794e-06, + "loss": 5.0873, + "step": 11835 + }, + { + "epoch": 0.24088541666666666, + "grad_norm": 13.071749687194824, + "learning_rate": 9.912190764663819e-06, + "loss": 5.4046, + "step": 11840 + }, + { + "epoch": 0.24098714192708334, + "grad_norm": 16.99751853942871, + "learning_rate": 9.912116173746505e-06, + "loss": 5.206, + "step": 11845 + }, + { + "epoch": 0.2410888671875, + "grad_norm": 14.911616325378418, + "learning_rate": 9.912041551442326e-06, + "loss": 5.5767, + "step": 11850 + }, + { + "epoch": 0.24119059244791666, + "grad_norm": 15.975245475769043, + "learning_rate": 9.911966897751762e-06, + "loss": 5.3492, + "step": 11855 + }, + { + "epoch": 0.24129231770833334, + "grad_norm": 14.79683780670166, + "learning_rate": 9.911892212675288e-06, + "loss": 5.4609, + "step": 11860 + }, + { + "epoch": 0.24139404296875, + "grad_norm": 21.260141372680664, + "learning_rate": 9.91181749621338e-06, + "loss": 5.4876, + "step": 11865 + }, + { + "epoch": 0.24149576822916666, + "grad_norm": 19.07159996032715, + "learning_rate": 9.911742748366521e-06, + "loss": 5.4154, + "step": 11870 + }, + { + "epoch": 0.24159749348958334, + "grad_norm": 16.12066650390625, + "learning_rate": 9.911667969135184e-06, + "loss": 5.2277, + "step": 11875 + }, + { + "epoch": 0.24169921875, + "grad_norm": 16.5841007232666, + "learning_rate": 9.911593158519847e-06, + "loss": 5.1751, + "step": 11880 + }, + { + "epoch": 0.24180094401041666, + "grad_norm": 14.331503868103027, + "learning_rate": 9.911518316520989e-06, + "loss": 5.3108, + "step": 11885 + }, + { + "epoch": 0.24190266927083334, + "grad_norm": 15.841553688049316, + "learning_rate": 9.911443443139089e-06, + "loss": 5.2304, + "step": 11890 + }, + { + "epoch": 0.24200439453125, + "grad_norm": 18.744050979614258, + "learning_rate": 9.911368538374625e-06, + "loss": 5.5257, + "step": 11895 + }, + { + "epoch": 0.24210611979166666, + "grad_norm": 15.738924026489258, + "learning_rate": 9.911293602228071e-06, + "loss": 5.6475, + "step": 11900 + }, + { + "epoch": 0.24220784505208334, + "grad_norm": 21.05257225036621, + "learning_rate": 9.911218634699914e-06, + "loss": 5.3174, + "step": 11905 + }, + { + "epoch": 0.2423095703125, + "grad_norm": 14.229331016540527, + "learning_rate": 9.911143635790628e-06, + "loss": 5.723, + "step": 11910 + }, + { + "epoch": 0.24241129557291666, + "grad_norm": 13.860556602478027, + "learning_rate": 9.911068605500694e-06, + "loss": 5.4433, + "step": 11915 + }, + { + "epoch": 0.24251302083333334, + "grad_norm": 18.50934410095215, + "learning_rate": 9.910993543830588e-06, + "loss": 5.2667, + "step": 11920 + }, + { + "epoch": 0.24261474609375, + "grad_norm": 25.712583541870117, + "learning_rate": 9.910918450780795e-06, + "loss": 5.2128, + "step": 11925 + }, + { + "epoch": 0.24271647135416666, + "grad_norm": 24.13140296936035, + "learning_rate": 9.91084332635179e-06, + "loss": 5.608, + "step": 11930 + }, + { + "epoch": 0.24281819661458334, + "grad_norm": 22.57966423034668, + "learning_rate": 9.910768170544056e-06, + "loss": 5.5108, + "step": 11935 + }, + { + "epoch": 0.242919921875, + "grad_norm": 18.896398544311523, + "learning_rate": 9.910692983358072e-06, + "loss": 5.2954, + "step": 11940 + }, + { + "epoch": 0.24302164713541666, + "grad_norm": 17.831254959106445, + "learning_rate": 9.91061776479432e-06, + "loss": 5.5584, + "step": 11945 + }, + { + "epoch": 0.24312337239583334, + "grad_norm": 11.765920639038086, + "learning_rate": 9.910542514853277e-06, + "loss": 5.305, + "step": 11950 + }, + { + "epoch": 0.24322509765625, + "grad_norm": 20.575027465820312, + "learning_rate": 9.910467233535428e-06, + "loss": 5.2475, + "step": 11955 + }, + { + "epoch": 0.24332682291666666, + "grad_norm": 17.46868896484375, + "learning_rate": 9.910391920841251e-06, + "loss": 5.2735, + "step": 11960 + }, + { + "epoch": 0.24342854817708334, + "grad_norm": 16.855674743652344, + "learning_rate": 9.91031657677123e-06, + "loss": 5.4883, + "step": 11965 + }, + { + "epoch": 0.2435302734375, + "grad_norm": 21.461950302124023, + "learning_rate": 9.910241201325843e-06, + "loss": 5.3245, + "step": 11970 + }, + { + "epoch": 0.24363199869791666, + "grad_norm": 13.74295425415039, + "learning_rate": 9.910165794505574e-06, + "loss": 5.1109, + "step": 11975 + }, + { + "epoch": 0.24373372395833334, + "grad_norm": 13.30771541595459, + "learning_rate": 9.910090356310904e-06, + "loss": 5.2821, + "step": 11980 + }, + { + "epoch": 0.24383544921875, + "grad_norm": 28.923322677612305, + "learning_rate": 9.910014886742316e-06, + "loss": 5.1698, + "step": 11985 + }, + { + "epoch": 0.24393717447916666, + "grad_norm": 16.626113891601562, + "learning_rate": 9.909939385800292e-06, + "loss": 5.3077, + "step": 11990 + }, + { + "epoch": 0.24403889973958334, + "grad_norm": 18.65044403076172, + "learning_rate": 9.909863853485312e-06, + "loss": 5.3342, + "step": 11995 + }, + { + "epoch": 0.244140625, + "grad_norm": 13.729620933532715, + "learning_rate": 9.909788289797864e-06, + "loss": 5.1689, + "step": 12000 + }, + { + "epoch": 0.24424235026041666, + "grad_norm": 25.04327392578125, + "learning_rate": 9.909712694738425e-06, + "loss": 5.3131, + "step": 12005 + }, + { + "epoch": 0.24434407552083334, + "grad_norm": 20.630958557128906, + "learning_rate": 9.90963706830748e-06, + "loss": 5.7972, + "step": 12010 + }, + { + "epoch": 0.24444580078125, + "grad_norm": 18.006759643554688, + "learning_rate": 9.909561410505513e-06, + "loss": 5.6181, + "step": 12015 + }, + { + "epoch": 0.24454752604166666, + "grad_norm": 18.18675422668457, + "learning_rate": 9.909485721333008e-06, + "loss": 5.707, + "step": 12020 + }, + { + "epoch": 0.24464925130208334, + "grad_norm": 22.861528396606445, + "learning_rate": 9.909410000790447e-06, + "loss": 5.2096, + "step": 12025 + }, + { + "epoch": 0.2447509765625, + "grad_norm": 13.102017402648926, + "learning_rate": 9.909334248878314e-06, + "loss": 5.5856, + "step": 12030 + }, + { + "epoch": 0.24485270182291666, + "grad_norm": 17.431800842285156, + "learning_rate": 9.909258465597094e-06, + "loss": 5.3405, + "step": 12035 + }, + { + "epoch": 0.24495442708333334, + "grad_norm": 18.977672576904297, + "learning_rate": 9.90918265094727e-06, + "loss": 5.2059, + "step": 12040 + }, + { + "epoch": 0.24505615234375, + "grad_norm": 14.518733978271484, + "learning_rate": 9.90910680492933e-06, + "loss": 5.3929, + "step": 12045 + }, + { + "epoch": 0.24515787760416666, + "grad_norm": 18.72972297668457, + "learning_rate": 9.909030927543754e-06, + "loss": 5.0823, + "step": 12050 + }, + { + "epoch": 0.24525960286458334, + "grad_norm": 20.673450469970703, + "learning_rate": 9.90895501879103e-06, + "loss": 5.6087, + "step": 12055 + }, + { + "epoch": 0.245361328125, + "grad_norm": 17.82303237915039, + "learning_rate": 9.90887907867164e-06, + "loss": 5.3415, + "step": 12060 + }, + { + "epoch": 0.24546305338541666, + "grad_norm": 15.566089630126953, + "learning_rate": 9.90880310718607e-06, + "loss": 5.3345, + "step": 12065 + }, + { + "epoch": 0.24556477864583334, + "grad_norm": 19.12847328186035, + "learning_rate": 9.90872710433481e-06, + "loss": 5.4202, + "step": 12070 + }, + { + "epoch": 0.24566650390625, + "grad_norm": 35.17559814453125, + "learning_rate": 9.90865107011834e-06, + "loss": 5.6327, + "step": 12075 + }, + { + "epoch": 0.24576822916666666, + "grad_norm": 15.143199920654297, + "learning_rate": 9.908575004537146e-06, + "loss": 5.3724, + "step": 12080 + }, + { + "epoch": 0.24586995442708334, + "grad_norm": 14.74338150024414, + "learning_rate": 9.90849890759172e-06, + "loss": 5.3627, + "step": 12085 + }, + { + "epoch": 0.2459716796875, + "grad_norm": 14.688422203063965, + "learning_rate": 9.908422779282541e-06, + "loss": 5.3643, + "step": 12090 + }, + { + "epoch": 0.24607340494791666, + "grad_norm": 14.185523986816406, + "learning_rate": 9.9083466196101e-06, + "loss": 5.2591, + "step": 12095 + }, + { + "epoch": 0.24617513020833334, + "grad_norm": 16.615814208984375, + "learning_rate": 9.908270428574882e-06, + "loss": 5.1403, + "step": 12100 + }, + { + "epoch": 0.24627685546875, + "grad_norm": 17.450428009033203, + "learning_rate": 9.908194206177374e-06, + "loss": 5.4285, + "step": 12105 + }, + { + "epoch": 0.24637858072916666, + "grad_norm": 14.826376914978027, + "learning_rate": 9.908117952418062e-06, + "loss": 5.1296, + "step": 12110 + }, + { + "epoch": 0.24648030598958334, + "grad_norm": 16.009098052978516, + "learning_rate": 9.908041667297436e-06, + "loss": 5.2924, + "step": 12115 + }, + { + "epoch": 0.24658203125, + "grad_norm": 17.446033477783203, + "learning_rate": 9.90796535081598e-06, + "loss": 5.4965, + "step": 12120 + }, + { + "epoch": 0.24668375651041666, + "grad_norm": 19.44838523864746, + "learning_rate": 9.907889002974185e-06, + "loss": 5.2154, + "step": 12125 + }, + { + "epoch": 0.24678548177083334, + "grad_norm": 17.020204544067383, + "learning_rate": 9.907812623772536e-06, + "loss": 5.2235, + "step": 12130 + }, + { + "epoch": 0.24688720703125, + "grad_norm": 19.69089126586914, + "learning_rate": 9.907736213211524e-06, + "loss": 5.2743, + "step": 12135 + }, + { + "epoch": 0.24698893229166666, + "grad_norm": 15.259157180786133, + "learning_rate": 9.907659771291635e-06, + "loss": 5.1872, + "step": 12140 + }, + { + "epoch": 0.24709065755208334, + "grad_norm": 14.564926147460938, + "learning_rate": 9.907583298013357e-06, + "loss": 5.2574, + "step": 12145 + }, + { + "epoch": 0.2471923828125, + "grad_norm": 17.525264739990234, + "learning_rate": 9.90750679337718e-06, + "loss": 5.2983, + "step": 12150 + }, + { + "epoch": 0.24729410807291666, + "grad_norm": 29.63145637512207, + "learning_rate": 9.907430257383592e-06, + "loss": 5.2139, + "step": 12155 + }, + { + "epoch": 0.24739583333333334, + "grad_norm": 18.198579788208008, + "learning_rate": 9.907353690033084e-06, + "loss": 5.4201, + "step": 12160 + }, + { + "epoch": 0.24749755859375, + "grad_norm": 11.888587951660156, + "learning_rate": 9.907277091326142e-06, + "loss": 5.4412, + "step": 12165 + }, + { + "epoch": 0.24759928385416666, + "grad_norm": 16.435617446899414, + "learning_rate": 9.907200461263257e-06, + "loss": 5.0673, + "step": 12170 + }, + { + "epoch": 0.24770100911458334, + "grad_norm": 17.59524917602539, + "learning_rate": 9.90712379984492e-06, + "loss": 5.4547, + "step": 12175 + }, + { + "epoch": 0.247802734375, + "grad_norm": 19.66819953918457, + "learning_rate": 9.907047107071621e-06, + "loss": 5.3594, + "step": 12180 + }, + { + "epoch": 0.24790445963541666, + "grad_norm": 15.544583320617676, + "learning_rate": 9.906970382943846e-06, + "loss": 5.2976, + "step": 12185 + }, + { + "epoch": 0.24800618489583334, + "grad_norm": 14.948651313781738, + "learning_rate": 9.90689362746209e-06, + "loss": 5.2058, + "step": 12190 + }, + { + "epoch": 0.24810791015625, + "grad_norm": 17.508289337158203, + "learning_rate": 9.906816840626839e-06, + "loss": 5.453, + "step": 12195 + }, + { + "epoch": 0.24820963541666666, + "grad_norm": 22.408462524414062, + "learning_rate": 9.906740022438588e-06, + "loss": 5.4958, + "step": 12200 + }, + { + "epoch": 0.24831136067708334, + "grad_norm": 17.311357498168945, + "learning_rate": 9.906663172897825e-06, + "loss": 5.0949, + "step": 12205 + }, + { + "epoch": 0.2484130859375, + "grad_norm": 15.71983814239502, + "learning_rate": 9.906586292005043e-06, + "loss": 5.2376, + "step": 12210 + }, + { + "epoch": 0.24851481119791666, + "grad_norm": 15.686180114746094, + "learning_rate": 9.90650937976073e-06, + "loss": 5.3771, + "step": 12215 + }, + { + "epoch": 0.24861653645833334, + "grad_norm": 15.45450496673584, + "learning_rate": 9.90643243616538e-06, + "loss": 5.2679, + "step": 12220 + }, + { + "epoch": 0.24871826171875, + "grad_norm": 15.95462703704834, + "learning_rate": 9.906355461219486e-06, + "loss": 5.1069, + "step": 12225 + }, + { + "epoch": 0.24881998697916666, + "grad_norm": 15.712340354919434, + "learning_rate": 9.906278454923537e-06, + "loss": 5.2226, + "step": 12230 + }, + { + "epoch": 0.24892171223958334, + "grad_norm": 17.765846252441406, + "learning_rate": 9.906201417278028e-06, + "loss": 5.4198, + "step": 12235 + }, + { + "epoch": 0.2490234375, + "grad_norm": 16.162593841552734, + "learning_rate": 9.906124348283447e-06, + "loss": 5.2016, + "step": 12240 + }, + { + "epoch": 0.24912516276041666, + "grad_norm": 16.292551040649414, + "learning_rate": 9.90604724794029e-06, + "loss": 5.4116, + "step": 12245 + }, + { + "epoch": 0.24922688802083334, + "grad_norm": 17.58541488647461, + "learning_rate": 9.905970116249047e-06, + "loss": 5.1496, + "step": 12250 + }, + { + "epoch": 0.24932861328125, + "grad_norm": 15.533937454223633, + "learning_rate": 9.905892953210214e-06, + "loss": 5.2441, + "step": 12255 + }, + { + "epoch": 0.24943033854166666, + "grad_norm": 17.84988021850586, + "learning_rate": 9.905815758824282e-06, + "loss": 5.3926, + "step": 12260 + }, + { + "epoch": 0.24953206380208334, + "grad_norm": 22.26787757873535, + "learning_rate": 9.905738533091744e-06, + "loss": 5.0508, + "step": 12265 + }, + { + "epoch": 0.2496337890625, + "grad_norm": 16.99559783935547, + "learning_rate": 9.905661276013095e-06, + "loss": 5.2195, + "step": 12270 + }, + { + "epoch": 0.24973551432291666, + "grad_norm": 15.936094284057617, + "learning_rate": 9.905583987588828e-06, + "loss": 5.3662, + "step": 12275 + }, + { + "epoch": 0.24983723958333334, + "grad_norm": 17.44186019897461, + "learning_rate": 9.905506667819434e-06, + "loss": 5.3713, + "step": 12280 + }, + { + "epoch": 0.24993896484375, + "grad_norm": 14.014657020568848, + "learning_rate": 9.905429316705412e-06, + "loss": 5.1697, + "step": 12285 + }, + { + "epoch": 0.25, + "eval_loss": 5.335362911224365, + "eval_runtime": 107.6305, + "eval_samples_per_second": 18.647, + "eval_steps_per_second": 9.328, + "step": 12288 + }, + { + "epoch": 0.2500406901041667, + "grad_norm": 23.868392944335938, + "learning_rate": 9.905351934247252e-06, + "loss": 5.506, + "step": 12290 + }, + { + "epoch": 0.2501424153645833, + "grad_norm": 18.98614501953125, + "learning_rate": 9.905274520445451e-06, + "loss": 5.1294, + "step": 12295 + }, + { + "epoch": 0.250244140625, + "grad_norm": 16.246068954467773, + "learning_rate": 9.905197075300504e-06, + "loss": 5.2791, + "step": 12300 + }, + { + "epoch": 0.2503458658854167, + "grad_norm": 14.97091007232666, + "learning_rate": 9.905119598812905e-06, + "loss": 5.5972, + "step": 12305 + }, + { + "epoch": 0.2504475911458333, + "grad_norm": 17.202316284179688, + "learning_rate": 9.905042090983149e-06, + "loss": 5.1088, + "step": 12310 + }, + { + "epoch": 0.25054931640625, + "grad_norm": 18.893491744995117, + "learning_rate": 9.904964551811728e-06, + "loss": 5.1544, + "step": 12315 + }, + { + "epoch": 0.2506510416666667, + "grad_norm": 14.125558853149414, + "learning_rate": 9.904886981299144e-06, + "loss": 5.3869, + "step": 12320 + }, + { + "epoch": 0.2507527669270833, + "grad_norm": 12.451529502868652, + "learning_rate": 9.904809379445888e-06, + "loss": 5.2819, + "step": 12325 + }, + { + "epoch": 0.2508544921875, + "grad_norm": 12.40572452545166, + "learning_rate": 9.904731746252456e-06, + "loss": 5.4184, + "step": 12330 + }, + { + "epoch": 0.2509562174479167, + "grad_norm": 16.336381912231445, + "learning_rate": 9.904654081719348e-06, + "loss": 5.3976, + "step": 12335 + }, + { + "epoch": 0.2510579427083333, + "grad_norm": 16.563480377197266, + "learning_rate": 9.904576385847054e-06, + "loss": 5.3937, + "step": 12340 + }, + { + "epoch": 0.25115966796875, + "grad_norm": 14.253082275390625, + "learning_rate": 9.904498658636077e-06, + "loss": 5.4202, + "step": 12345 + }, + { + "epoch": 0.2512613932291667, + "grad_norm": 16.611961364746094, + "learning_rate": 9.904420900086908e-06, + "loss": 5.4986, + "step": 12350 + }, + { + "epoch": 0.2513631184895833, + "grad_norm": 13.713336944580078, + "learning_rate": 9.904343110200047e-06, + "loss": 5.0012, + "step": 12355 + }, + { + "epoch": 0.25146484375, + "grad_norm": 15.022968292236328, + "learning_rate": 9.90426528897599e-06, + "loss": 5.2238, + "step": 12360 + }, + { + "epoch": 0.2515665690104167, + "grad_norm": 17.41073226928711, + "learning_rate": 9.904187436415236e-06, + "loss": 5.2851, + "step": 12365 + }, + { + "epoch": 0.2516682942708333, + "grad_norm": 24.26201820373535, + "learning_rate": 9.904109552518279e-06, + "loss": 5.312, + "step": 12370 + }, + { + "epoch": 0.25177001953125, + "grad_norm": 15.003277778625488, + "learning_rate": 9.904031637285621e-06, + "loss": 5.221, + "step": 12375 + }, + { + "epoch": 0.2518717447916667, + "grad_norm": 17.76188087463379, + "learning_rate": 9.903953690717755e-06, + "loss": 5.129, + "step": 12380 + }, + { + "epoch": 0.2519734700520833, + "grad_norm": 23.60122299194336, + "learning_rate": 9.903875712815185e-06, + "loss": 5.5472, + "step": 12385 + }, + { + "epoch": 0.2520751953125, + "grad_norm": 17.267602920532227, + "learning_rate": 9.903797703578402e-06, + "loss": 5.4453, + "step": 12390 + }, + { + "epoch": 0.2521769205729167, + "grad_norm": 16.09999656677246, + "learning_rate": 9.903719663007911e-06, + "loss": 5.2629, + "step": 12395 + }, + { + "epoch": 0.2522786458333333, + "grad_norm": 15.421293258666992, + "learning_rate": 9.903641591104207e-06, + "loss": 5.3539, + "step": 12400 + }, + { + "epoch": 0.25238037109375, + "grad_norm": 15.804912567138672, + "learning_rate": 9.903563487867791e-06, + "loss": 5.4326, + "step": 12405 + }, + { + "epoch": 0.2524820963541667, + "grad_norm": 18.796409606933594, + "learning_rate": 9.90348535329916e-06, + "loss": 5.0594, + "step": 12410 + }, + { + "epoch": 0.2525838216145833, + "grad_norm": 13.682985305786133, + "learning_rate": 9.903407187398814e-06, + "loss": 5.3326, + "step": 12415 + }, + { + "epoch": 0.252685546875, + "grad_norm": 22.05466079711914, + "learning_rate": 9.903328990167253e-06, + "loss": 5.5025, + "step": 12420 + }, + { + "epoch": 0.2527872721354167, + "grad_norm": 12.908963203430176, + "learning_rate": 9.903250761604977e-06, + "loss": 5.2429, + "step": 12425 + }, + { + "epoch": 0.2528889973958333, + "grad_norm": 18.40552520751953, + "learning_rate": 9.903172501712484e-06, + "loss": 5.4589, + "step": 12430 + }, + { + "epoch": 0.25299072265625, + "grad_norm": 18.387121200561523, + "learning_rate": 9.903094210490276e-06, + "loss": 5.5294, + "step": 12435 + }, + { + "epoch": 0.2530924479166667, + "grad_norm": 13.873187065124512, + "learning_rate": 9.903015887938851e-06, + "loss": 5.3399, + "step": 12440 + }, + { + "epoch": 0.2531941731770833, + "grad_norm": 16.825403213500977, + "learning_rate": 9.902937534058713e-06, + "loss": 5.0862, + "step": 12445 + }, + { + "epoch": 0.2532958984375, + "grad_norm": 13.939661979675293, + "learning_rate": 9.90285914885036e-06, + "loss": 5.5578, + "step": 12450 + }, + { + "epoch": 0.2533976236979167, + "grad_norm": 13.736137390136719, + "learning_rate": 9.902780732314293e-06, + "loss": 5.5447, + "step": 12455 + }, + { + "epoch": 0.2534993489583333, + "grad_norm": 15.833312034606934, + "learning_rate": 9.902702284451012e-06, + "loss": 5.5655, + "step": 12460 + }, + { + "epoch": 0.25360107421875, + "grad_norm": 16.06295394897461, + "learning_rate": 9.902623805261021e-06, + "loss": 5.5369, + "step": 12465 + }, + { + "epoch": 0.2537027994791667, + "grad_norm": 15.429590225219727, + "learning_rate": 9.902545294744821e-06, + "loss": 5.2259, + "step": 12470 + }, + { + "epoch": 0.2538045247395833, + "grad_norm": 16.500213623046875, + "learning_rate": 9.902466752902912e-06, + "loss": 5.6264, + "step": 12475 + }, + { + "epoch": 0.25390625, + "grad_norm": 21.034666061401367, + "learning_rate": 9.902388179735796e-06, + "loss": 5.3768, + "step": 12480 + }, + { + "epoch": 0.2540079752604167, + "grad_norm": 13.26884937286377, + "learning_rate": 9.902309575243977e-06, + "loss": 5.194, + "step": 12485 + }, + { + "epoch": 0.2541097005208333, + "grad_norm": 17.76666831970215, + "learning_rate": 9.902230939427955e-06, + "loss": 5.2308, + "step": 12490 + }, + { + "epoch": 0.25421142578125, + "grad_norm": 16.012182235717773, + "learning_rate": 9.902152272288235e-06, + "loss": 5.5118, + "step": 12495 + }, + { + "epoch": 0.2543131510416667, + "grad_norm": 15.60171890258789, + "learning_rate": 9.902073573825317e-06, + "loss": 5.6299, + "step": 12500 + }, + { + "epoch": 0.2544148763020833, + "grad_norm": 15.86223030090332, + "learning_rate": 9.901994844039705e-06, + "loss": 5.1458, + "step": 12505 + }, + { + "epoch": 0.2545166015625, + "grad_norm": 12.564409255981445, + "learning_rate": 9.901916082931901e-06, + "loss": 5.149, + "step": 12510 + }, + { + "epoch": 0.2546183268229167, + "grad_norm": 19.107627868652344, + "learning_rate": 9.901837290502411e-06, + "loss": 5.1931, + "step": 12515 + }, + { + "epoch": 0.2547200520833333, + "grad_norm": 14.79619312286377, + "learning_rate": 9.901758466751738e-06, + "loss": 5.1682, + "step": 12520 + }, + { + "epoch": 0.25482177734375, + "grad_norm": 14.18963623046875, + "learning_rate": 9.901679611680382e-06, + "loss": 5.3515, + "step": 12525 + }, + { + "epoch": 0.2549235026041667, + "grad_norm": 14.272517204284668, + "learning_rate": 9.90160072528885e-06, + "loss": 5.4899, + "step": 12530 + }, + { + "epoch": 0.2550252278645833, + "grad_norm": 17.60537338256836, + "learning_rate": 9.901521807577645e-06, + "loss": 5.1718, + "step": 12535 + }, + { + "epoch": 0.255126953125, + "grad_norm": 14.801812171936035, + "learning_rate": 9.901442858547272e-06, + "loss": 5.4163, + "step": 12540 + }, + { + "epoch": 0.2552286783854167, + "grad_norm": 14.205656051635742, + "learning_rate": 9.901363878198236e-06, + "loss": 5.3091, + "step": 12545 + }, + { + "epoch": 0.2553304036458333, + "grad_norm": 16.38910484313965, + "learning_rate": 9.901284866531038e-06, + "loss": 5.25, + "step": 12550 + }, + { + "epoch": 0.25543212890625, + "grad_norm": 20.608083724975586, + "learning_rate": 9.901205823546189e-06, + "loss": 5.4585, + "step": 12555 + }, + { + "epoch": 0.2555338541666667, + "grad_norm": 18.05166244506836, + "learning_rate": 9.901126749244188e-06, + "loss": 5.199, + "step": 12560 + }, + { + "epoch": 0.2556355794270833, + "grad_norm": 15.028164863586426, + "learning_rate": 9.901047643625545e-06, + "loss": 5.0104, + "step": 12565 + }, + { + "epoch": 0.2557373046875, + "grad_norm": 15.850996017456055, + "learning_rate": 9.900968506690762e-06, + "loss": 5.3041, + "step": 12570 + }, + { + "epoch": 0.2558390299479167, + "grad_norm": 15.6508150100708, + "learning_rate": 9.900889338440344e-06, + "loss": 5.3803, + "step": 12575 + }, + { + "epoch": 0.2559407552083333, + "grad_norm": 12.271531105041504, + "learning_rate": 9.900810138874803e-06, + "loss": 5.478, + "step": 12580 + }, + { + "epoch": 0.25604248046875, + "grad_norm": 15.826911926269531, + "learning_rate": 9.900730907994639e-06, + "loss": 5.1593, + "step": 12585 + }, + { + "epoch": 0.2561442057291667, + "grad_norm": 19.24066162109375, + "learning_rate": 9.90065164580036e-06, + "loss": 5.153, + "step": 12590 + }, + { + "epoch": 0.2562459309895833, + "grad_norm": 18.90581703186035, + "learning_rate": 9.900572352292473e-06, + "loss": 5.3733, + "step": 12595 + }, + { + "epoch": 0.25634765625, + "grad_norm": 15.075800895690918, + "learning_rate": 9.900493027471485e-06, + "loss": 5.3141, + "step": 12600 + }, + { + "epoch": 0.2564493815104167, + "grad_norm": 21.70273208618164, + "learning_rate": 9.9004136713379e-06, + "loss": 5.8566, + "step": 12605 + }, + { + "epoch": 0.2565511067708333, + "grad_norm": 15.075587272644043, + "learning_rate": 9.90033428389223e-06, + "loss": 5.211, + "step": 12610 + }, + { + "epoch": 0.25665283203125, + "grad_norm": 22.119739532470703, + "learning_rate": 9.900254865134978e-06, + "loss": 5.409, + "step": 12615 + }, + { + "epoch": 0.2567545572916667, + "grad_norm": 17.131195068359375, + "learning_rate": 9.900175415066653e-06, + "loss": 5.0689, + "step": 12620 + }, + { + "epoch": 0.2568562825520833, + "grad_norm": 12.950575828552246, + "learning_rate": 9.900095933687762e-06, + "loss": 5.3858, + "step": 12625 + }, + { + "epoch": 0.2569580078125, + "grad_norm": 18.745807647705078, + "learning_rate": 9.900016420998814e-06, + "loss": 5.408, + "step": 12630 + }, + { + "epoch": 0.2570597330729167, + "grad_norm": 13.830245971679688, + "learning_rate": 9.899936877000317e-06, + "loss": 5.4572, + "step": 12635 + }, + { + "epoch": 0.2571614583333333, + "grad_norm": 11.974953651428223, + "learning_rate": 9.899857301692779e-06, + "loss": 5.529, + "step": 12640 + }, + { + "epoch": 0.25726318359375, + "grad_norm": 12.89578628540039, + "learning_rate": 9.899777695076707e-06, + "loss": 5.3141, + "step": 12645 + }, + { + "epoch": 0.2573649088541667, + "grad_norm": 19.986263275146484, + "learning_rate": 9.899698057152612e-06, + "loss": 5.3572, + "step": 12650 + }, + { + "epoch": 0.2574666341145833, + "grad_norm": 40.02768325805664, + "learning_rate": 9.899618387921002e-06, + "loss": 5.5787, + "step": 12655 + }, + { + "epoch": 0.257568359375, + "grad_norm": 18.58473014831543, + "learning_rate": 9.899538687382386e-06, + "loss": 5.0533, + "step": 12660 + }, + { + "epoch": 0.2576700846354167, + "grad_norm": 17.113588333129883, + "learning_rate": 9.899458955537272e-06, + "loss": 5.2389, + "step": 12665 + }, + { + "epoch": 0.2577718098958333, + "grad_norm": 15.792595863342285, + "learning_rate": 9.899379192386171e-06, + "loss": 5.4968, + "step": 12670 + }, + { + "epoch": 0.25787353515625, + "grad_norm": 14.766813278198242, + "learning_rate": 9.899299397929593e-06, + "loss": 5.2031, + "step": 12675 + }, + { + "epoch": 0.2579752604166667, + "grad_norm": 17.16661262512207, + "learning_rate": 9.899219572168046e-06, + "loss": 5.2179, + "step": 12680 + }, + { + "epoch": 0.2580769856770833, + "grad_norm": 18.00537872314453, + "learning_rate": 9.899139715102041e-06, + "loss": 5.5445, + "step": 12685 + }, + { + "epoch": 0.2581787109375, + "grad_norm": 15.882901191711426, + "learning_rate": 9.89905982673209e-06, + "loss": 5.4537, + "step": 12690 + }, + { + "epoch": 0.2582804361979167, + "grad_norm": 18.56308937072754, + "learning_rate": 9.8989799070587e-06, + "loss": 5.3864, + "step": 12695 + }, + { + "epoch": 0.2583821614583333, + "grad_norm": 17.319515228271484, + "learning_rate": 9.898899956082385e-06, + "loss": 5.3884, + "step": 12700 + }, + { + "epoch": 0.25848388671875, + "grad_norm": 12.246554374694824, + "learning_rate": 9.898819973803653e-06, + "loss": 5.4777, + "step": 12705 + }, + { + "epoch": 0.2585856119791667, + "grad_norm": 16.11704444885254, + "learning_rate": 9.898739960223018e-06, + "loss": 5.5287, + "step": 12710 + }, + { + "epoch": 0.2586873372395833, + "grad_norm": 16.34166717529297, + "learning_rate": 9.89865991534099e-06, + "loss": 5.3863, + "step": 12715 + }, + { + "epoch": 0.2587890625, + "grad_norm": 16.003175735473633, + "learning_rate": 9.898579839158078e-06, + "loss": 5.458, + "step": 12720 + }, + { + "epoch": 0.2588907877604167, + "grad_norm": 14.435694694519043, + "learning_rate": 9.898499731674797e-06, + "loss": 5.2214, + "step": 12725 + }, + { + "epoch": 0.2589925130208333, + "grad_norm": 16.626981735229492, + "learning_rate": 9.898419592891657e-06, + "loss": 5.7364, + "step": 12730 + }, + { + "epoch": 0.25909423828125, + "grad_norm": 13.18168830871582, + "learning_rate": 9.89833942280917e-06, + "loss": 5.3102, + "step": 12735 + }, + { + "epoch": 0.2591959635416667, + "grad_norm": 14.930715560913086, + "learning_rate": 9.898259221427852e-06, + "loss": 5.3013, + "step": 12740 + }, + { + "epoch": 0.2592976888020833, + "grad_norm": 16.70423698425293, + "learning_rate": 9.898178988748211e-06, + "loss": 5.4559, + "step": 12745 + }, + { + "epoch": 0.2593994140625, + "grad_norm": 14.495390892028809, + "learning_rate": 9.898098724770762e-06, + "loss": 5.1695, + "step": 12750 + }, + { + "epoch": 0.2595011393229167, + "grad_norm": 17.246265411376953, + "learning_rate": 9.898018429496016e-06, + "loss": 5.4133, + "step": 12755 + }, + { + "epoch": 0.2596028645833333, + "grad_norm": 18.69270896911621, + "learning_rate": 9.897938102924487e-06, + "loss": 5.3659, + "step": 12760 + }, + { + "epoch": 0.25970458984375, + "grad_norm": 16.284116744995117, + "learning_rate": 9.897857745056689e-06, + "loss": 5.2421, + "step": 12765 + }, + { + "epoch": 0.2598063151041667, + "grad_norm": 18.229930877685547, + "learning_rate": 9.897777355893135e-06, + "loss": 5.5088, + "step": 12770 + }, + { + "epoch": 0.2599080403645833, + "grad_norm": 28.94542121887207, + "learning_rate": 9.897696935434338e-06, + "loss": 5.5116, + "step": 12775 + }, + { + "epoch": 0.260009765625, + "grad_norm": 9.973943710327148, + "learning_rate": 9.897616483680811e-06, + "loss": 5.4007, + "step": 12780 + }, + { + "epoch": 0.2601114908854167, + "grad_norm": 14.109615325927734, + "learning_rate": 9.89753600063307e-06, + "loss": 5.1782, + "step": 12785 + }, + { + "epoch": 0.2602132161458333, + "grad_norm": 14.287701606750488, + "learning_rate": 9.89745548629163e-06, + "loss": 5.3795, + "step": 12790 + }, + { + "epoch": 0.26031494140625, + "grad_norm": 25.009540557861328, + "learning_rate": 9.897374940657003e-06, + "loss": 5.068, + "step": 12795 + }, + { + "epoch": 0.2604166666666667, + "grad_norm": 30.78545379638672, + "learning_rate": 9.897294363729706e-06, + "loss": 5.189, + "step": 12800 + }, + { + "epoch": 0.2605183919270833, + "grad_norm": 17.873300552368164, + "learning_rate": 9.897213755510251e-06, + "loss": 5.1738, + "step": 12805 + }, + { + "epoch": 0.2606201171875, + "grad_norm": 16.881582260131836, + "learning_rate": 9.897133115999155e-06, + "loss": 5.6138, + "step": 12810 + }, + { + "epoch": 0.2607218424479167, + "grad_norm": 14.71168327331543, + "learning_rate": 9.897052445196933e-06, + "loss": 5.3615, + "step": 12815 + }, + { + "epoch": 0.2608235677083333, + "grad_norm": 19.072755813598633, + "learning_rate": 9.896971743104101e-06, + "loss": 5.2817, + "step": 12820 + }, + { + "epoch": 0.26092529296875, + "grad_norm": 19.020849227905273, + "learning_rate": 9.896891009721174e-06, + "loss": 5.2279, + "step": 12825 + }, + { + "epoch": 0.2610270182291667, + "grad_norm": 24.731613159179688, + "learning_rate": 9.896810245048668e-06, + "loss": 5.293, + "step": 12830 + }, + { + "epoch": 0.2611287434895833, + "grad_norm": 21.621049880981445, + "learning_rate": 9.8967294490871e-06, + "loss": 5.5261, + "step": 12835 + }, + { + "epoch": 0.26123046875, + "grad_norm": 13.062932968139648, + "learning_rate": 9.896648621836982e-06, + "loss": 5.1044, + "step": 12840 + }, + { + "epoch": 0.2613321940104167, + "grad_norm": 16.27477264404297, + "learning_rate": 9.896567763298837e-06, + "loss": 5.2438, + "step": 12845 + }, + { + "epoch": 0.2614339192708333, + "grad_norm": 14.681429862976074, + "learning_rate": 9.896486873473176e-06, + "loss": 5.5065, + "step": 12850 + }, + { + "epoch": 0.26153564453125, + "grad_norm": 19.923738479614258, + "learning_rate": 9.89640595236052e-06, + "loss": 5.6544, + "step": 12855 + }, + { + "epoch": 0.2616373697916667, + "grad_norm": 14.20258903503418, + "learning_rate": 9.896324999961383e-06, + "loss": 5.3213, + "step": 12860 + }, + { + "epoch": 0.2617390950520833, + "grad_norm": 16.628355026245117, + "learning_rate": 9.896244016276284e-06, + "loss": 5.2051, + "step": 12865 + }, + { + "epoch": 0.2618408203125, + "grad_norm": 16.116931915283203, + "learning_rate": 9.896163001305741e-06, + "loss": 5.1287, + "step": 12870 + }, + { + "epoch": 0.2619425455729167, + "grad_norm": 21.19381332397461, + "learning_rate": 9.89608195505027e-06, + "loss": 5.1819, + "step": 12875 + }, + { + "epoch": 0.2620442708333333, + "grad_norm": 18.653461456298828, + "learning_rate": 9.896000877510388e-06, + "loss": 5.2725, + "step": 12880 + }, + { + "epoch": 0.26214599609375, + "grad_norm": 18.277442932128906, + "learning_rate": 9.895919768686618e-06, + "loss": 5.2775, + "step": 12885 + }, + { + "epoch": 0.2622477213541667, + "grad_norm": 17.165813446044922, + "learning_rate": 9.89583862857947e-06, + "loss": 5.0805, + "step": 12890 + }, + { + "epoch": 0.2623494466145833, + "grad_norm": 13.337553977966309, + "learning_rate": 9.895757457189472e-06, + "loss": 5.4565, + "step": 12895 + }, + { + "epoch": 0.262451171875, + "grad_norm": 13.60356330871582, + "learning_rate": 9.895676254517134e-06, + "loss": 5.106, + "step": 12900 + }, + { + "epoch": 0.2625528971354167, + "grad_norm": 14.504141807556152, + "learning_rate": 9.895595020562982e-06, + "loss": 5.3984, + "step": 12905 + }, + { + "epoch": 0.2626546223958333, + "grad_norm": 11.485662460327148, + "learning_rate": 9.89551375532753e-06, + "loss": 5.2934, + "step": 12910 + }, + { + "epoch": 0.26275634765625, + "grad_norm": 14.702557563781738, + "learning_rate": 9.8954324588113e-06, + "loss": 5.3965, + "step": 12915 + }, + { + "epoch": 0.2628580729166667, + "grad_norm": 15.806118965148926, + "learning_rate": 9.895351131014812e-06, + "loss": 5.2167, + "step": 12920 + }, + { + "epoch": 0.2629597981770833, + "grad_norm": 21.109941482543945, + "learning_rate": 9.895269771938582e-06, + "loss": 5.2931, + "step": 12925 + }, + { + "epoch": 0.2630615234375, + "grad_norm": 15.978586196899414, + "learning_rate": 9.895188381583134e-06, + "loss": 5.3382, + "step": 12930 + }, + { + "epoch": 0.2631632486979167, + "grad_norm": 17.450517654418945, + "learning_rate": 9.895106959948984e-06, + "loss": 5.1926, + "step": 12935 + }, + { + "epoch": 0.2632649739583333, + "grad_norm": 17.335559844970703, + "learning_rate": 9.895025507036656e-06, + "loss": 5.2794, + "step": 12940 + }, + { + "epoch": 0.26336669921875, + "grad_norm": 14.121610641479492, + "learning_rate": 9.894944022846668e-06, + "loss": 5.2705, + "step": 12945 + }, + { + "epoch": 0.2634684244791667, + "grad_norm": 17.286348342895508, + "learning_rate": 9.894862507379542e-06, + "loss": 5.2535, + "step": 12950 + }, + { + "epoch": 0.2635701497395833, + "grad_norm": 12.603748321533203, + "learning_rate": 9.894780960635799e-06, + "loss": 5.2705, + "step": 12955 + }, + { + "epoch": 0.263671875, + "grad_norm": 19.38240623474121, + "learning_rate": 9.894699382615959e-06, + "loss": 5.2709, + "step": 12960 + }, + { + "epoch": 0.2637736002604167, + "grad_norm": 21.119569778442383, + "learning_rate": 9.894617773320544e-06, + "loss": 5.1768, + "step": 12965 + }, + { + "epoch": 0.2638753255208333, + "grad_norm": 18.527507781982422, + "learning_rate": 9.894536132750075e-06, + "loss": 5.5964, + "step": 12970 + }, + { + "epoch": 0.26397705078125, + "grad_norm": 20.04734230041504, + "learning_rate": 9.894454460905073e-06, + "loss": 5.2251, + "step": 12975 + }, + { + "epoch": 0.2640787760416667, + "grad_norm": 14.880629539489746, + "learning_rate": 9.89437275778606e-06, + "loss": 5.5481, + "step": 12980 + }, + { + "epoch": 0.2641805013020833, + "grad_norm": 15.694635391235352, + "learning_rate": 9.894291023393562e-06, + "loss": 5.2753, + "step": 12985 + }, + { + "epoch": 0.2642822265625, + "grad_norm": 17.423988342285156, + "learning_rate": 9.894209257728096e-06, + "loss": 5.2157, + "step": 12990 + }, + { + "epoch": 0.2643839518229167, + "grad_norm": 16.78944206237793, + "learning_rate": 9.894127460790187e-06, + "loss": 5.3015, + "step": 12995 + }, + { + "epoch": 0.2644856770833333, + "grad_norm": 14.426192283630371, + "learning_rate": 9.894045632580356e-06, + "loss": 5.5589, + "step": 13000 + }, + { + "epoch": 0.26458740234375, + "grad_norm": 20.570775985717773, + "learning_rate": 9.893963773099129e-06, + "loss": 5.415, + "step": 13005 + }, + { + "epoch": 0.2646891276041667, + "grad_norm": 22.817800521850586, + "learning_rate": 9.893881882347025e-06, + "loss": 5.4525, + "step": 13010 + }, + { + "epoch": 0.2647908528645833, + "grad_norm": 15.972760200500488, + "learning_rate": 9.89379996032457e-06, + "loss": 5.5086, + "step": 13015 + }, + { + "epoch": 0.264892578125, + "grad_norm": 14.752562522888184, + "learning_rate": 9.893718007032287e-06, + "loss": 5.3784, + "step": 13020 + }, + { + "epoch": 0.2649943033854167, + "grad_norm": 11.516414642333984, + "learning_rate": 9.893636022470698e-06, + "loss": 5.4529, + "step": 13025 + }, + { + "epoch": 0.2650960286458333, + "grad_norm": 16.06868553161621, + "learning_rate": 9.89355400664033e-06, + "loss": 5.6013, + "step": 13030 + }, + { + "epoch": 0.26519775390625, + "grad_norm": 22.95839500427246, + "learning_rate": 9.893471959541705e-06, + "loss": 5.2271, + "step": 13035 + }, + { + "epoch": 0.2652994791666667, + "grad_norm": 14.510856628417969, + "learning_rate": 9.893389881175349e-06, + "loss": 5.3854, + "step": 13040 + }, + { + "epoch": 0.2654012044270833, + "grad_norm": 17.071096420288086, + "learning_rate": 9.893307771541783e-06, + "loss": 5.5172, + "step": 13045 + }, + { + "epoch": 0.2655029296875, + "grad_norm": 15.698291778564453, + "learning_rate": 9.893225630641534e-06, + "loss": 5.0168, + "step": 13050 + }, + { + "epoch": 0.2656046549479167, + "grad_norm": 19.32567596435547, + "learning_rate": 9.893143458475125e-06, + "loss": 5.244, + "step": 13055 + }, + { + "epoch": 0.2657063802083333, + "grad_norm": 17.52934455871582, + "learning_rate": 9.893061255043086e-06, + "loss": 5.222, + "step": 13060 + }, + { + "epoch": 0.26580810546875, + "grad_norm": 17.92099380493164, + "learning_rate": 9.892979020345936e-06, + "loss": 5.3403, + "step": 13065 + }, + { + "epoch": 0.2659098307291667, + "grad_norm": 17.511131286621094, + "learning_rate": 9.892896754384204e-06, + "loss": 5.4474, + "step": 13070 + }, + { + "epoch": 0.2660115559895833, + "grad_norm": 15.792664527893066, + "learning_rate": 9.892814457158416e-06, + "loss": 5.2452, + "step": 13075 + }, + { + "epoch": 0.26611328125, + "grad_norm": 13.054996490478516, + "learning_rate": 9.892732128669096e-06, + "loss": 5.2886, + "step": 13080 + }, + { + "epoch": 0.2662150065104167, + "grad_norm": 23.357139587402344, + "learning_rate": 9.892649768916772e-06, + "loss": 5.2748, + "step": 13085 + }, + { + "epoch": 0.2663167317708333, + "grad_norm": 15.781134605407715, + "learning_rate": 9.892567377901968e-06, + "loss": 5.3174, + "step": 13090 + }, + { + "epoch": 0.26641845703125, + "grad_norm": 18.63416862487793, + "learning_rate": 9.89248495562521e-06, + "loss": 5.2606, + "step": 13095 + }, + { + "epoch": 0.2665201822916667, + "grad_norm": 17.093666076660156, + "learning_rate": 9.892402502087029e-06, + "loss": 5.1699, + "step": 13100 + }, + { + "epoch": 0.2666219075520833, + "grad_norm": 16.734100341796875, + "learning_rate": 9.892320017287949e-06, + "loss": 5.3906, + "step": 13105 + }, + { + "epoch": 0.2667236328125, + "grad_norm": 15.007877349853516, + "learning_rate": 9.892237501228495e-06, + "loss": 5.2727, + "step": 13110 + }, + { + "epoch": 0.2668253580729167, + "grad_norm": 16.64688491821289, + "learning_rate": 9.892154953909197e-06, + "loss": 5.4707, + "step": 13115 + }, + { + "epoch": 0.2669270833333333, + "grad_norm": 18.39405059814453, + "learning_rate": 9.892072375330583e-06, + "loss": 5.08, + "step": 13120 + }, + { + "epoch": 0.26702880859375, + "grad_norm": 13.381853103637695, + "learning_rate": 9.89198976549318e-06, + "loss": 5.4557, + "step": 13125 + }, + { + "epoch": 0.2671305338541667, + "grad_norm": 12.869882583618164, + "learning_rate": 9.891907124397514e-06, + "loss": 4.9221, + "step": 13130 + }, + { + "epoch": 0.2672322591145833, + "grad_norm": 18.86626434326172, + "learning_rate": 9.891824452044114e-06, + "loss": 5.4444, + "step": 13135 + }, + { + "epoch": 0.267333984375, + "grad_norm": 18.46175193786621, + "learning_rate": 9.891741748433509e-06, + "loss": 5.2452, + "step": 13140 + }, + { + "epoch": 0.2674357096354167, + "grad_norm": 13.837759017944336, + "learning_rate": 9.891659013566225e-06, + "loss": 5.3111, + "step": 13145 + }, + { + "epoch": 0.2675374348958333, + "grad_norm": 17.248138427734375, + "learning_rate": 9.891576247442796e-06, + "loss": 4.982, + "step": 13150 + }, + { + "epoch": 0.26763916015625, + "grad_norm": 19.272462844848633, + "learning_rate": 9.891493450063748e-06, + "loss": 5.553, + "step": 13155 + }, + { + "epoch": 0.2677408854166667, + "grad_norm": 13.356307029724121, + "learning_rate": 9.891410621429607e-06, + "loss": 5.1556, + "step": 13160 + }, + { + "epoch": 0.2678426106770833, + "grad_norm": 13.737645149230957, + "learning_rate": 9.891327761540906e-06, + "loss": 5.1857, + "step": 13165 + }, + { + "epoch": 0.2679443359375, + "grad_norm": 14.877026557922363, + "learning_rate": 9.891244870398173e-06, + "loss": 5.2695, + "step": 13170 + }, + { + "epoch": 0.2680460611979167, + "grad_norm": 19.157344818115234, + "learning_rate": 9.891161948001939e-06, + "loss": 5.344, + "step": 13175 + }, + { + "epoch": 0.2681477864583333, + "grad_norm": 16.576690673828125, + "learning_rate": 9.891078994352731e-06, + "loss": 5.2428, + "step": 13180 + }, + { + "epoch": 0.26824951171875, + "grad_norm": 17.89756965637207, + "learning_rate": 9.890996009451083e-06, + "loss": 5.3594, + "step": 13185 + }, + { + "epoch": 0.2683512369791667, + "grad_norm": 22.028057098388672, + "learning_rate": 9.89091299329752e-06, + "loss": 5.1963, + "step": 13190 + }, + { + "epoch": 0.2684529622395833, + "grad_norm": 17.381206512451172, + "learning_rate": 9.89082994589258e-06, + "loss": 5.7112, + "step": 13195 + }, + { + "epoch": 0.2685546875, + "grad_norm": 24.412023544311523, + "learning_rate": 9.890746867236786e-06, + "loss": 5.1155, + "step": 13200 + }, + { + "epoch": 0.2686564127604167, + "grad_norm": 24.314268112182617, + "learning_rate": 9.890663757330673e-06, + "loss": 5.3872, + "step": 13205 + }, + { + "epoch": 0.2687581380208333, + "grad_norm": 14.634308815002441, + "learning_rate": 9.890580616174771e-06, + "loss": 5.2224, + "step": 13210 + }, + { + "epoch": 0.26885986328125, + "grad_norm": 12.198347091674805, + "learning_rate": 9.89049744376961e-06, + "loss": 5.1265, + "step": 13215 + }, + { + "epoch": 0.2689615885416667, + "grad_norm": 22.448808670043945, + "learning_rate": 9.890414240115725e-06, + "loss": 5.4577, + "step": 13220 + }, + { + "epoch": 0.2690633138020833, + "grad_norm": 16.315378189086914, + "learning_rate": 9.890331005213644e-06, + "loss": 5.5776, + "step": 13225 + }, + { + "epoch": 0.2691650390625, + "grad_norm": 12.556106567382812, + "learning_rate": 9.890247739063901e-06, + "loss": 5.2949, + "step": 13230 + }, + { + "epoch": 0.2692667643229167, + "grad_norm": 16.46505355834961, + "learning_rate": 9.890164441667026e-06, + "loss": 5.8791, + "step": 13235 + }, + { + "epoch": 0.2693684895833333, + "grad_norm": 17.946434020996094, + "learning_rate": 9.890081113023555e-06, + "loss": 5.3932, + "step": 13240 + }, + { + "epoch": 0.26947021484375, + "grad_norm": 21.515262603759766, + "learning_rate": 9.889997753134018e-06, + "loss": 5.2751, + "step": 13245 + }, + { + "epoch": 0.2695719401041667, + "grad_norm": 16.44999885559082, + "learning_rate": 9.889914361998946e-06, + "loss": 5.3253, + "step": 13250 + }, + { + "epoch": 0.2696736653645833, + "grad_norm": 12.388406753540039, + "learning_rate": 9.889830939618875e-06, + "loss": 5.1558, + "step": 13255 + }, + { + "epoch": 0.269775390625, + "grad_norm": 12.827423095703125, + "learning_rate": 9.889747485994335e-06, + "loss": 5.2164, + "step": 13260 + }, + { + "epoch": 0.2698771158854167, + "grad_norm": 16.154146194458008, + "learning_rate": 9.88966400112586e-06, + "loss": 5.1766, + "step": 13265 + }, + { + "epoch": 0.2699788411458333, + "grad_norm": 14.657944679260254, + "learning_rate": 9.889580485013986e-06, + "loss": 5.4069, + "step": 13270 + }, + { + "epoch": 0.27008056640625, + "grad_norm": 20.17526626586914, + "learning_rate": 9.889496937659245e-06, + "loss": 5.6123, + "step": 13275 + }, + { + "epoch": 0.2701822916666667, + "grad_norm": 13.561796188354492, + "learning_rate": 9.889413359062168e-06, + "loss": 5.3652, + "step": 13280 + }, + { + "epoch": 0.2702840169270833, + "grad_norm": 19.37751579284668, + "learning_rate": 9.889329749223295e-06, + "loss": 5.3605, + "step": 13285 + }, + { + "epoch": 0.2703857421875, + "grad_norm": 13.570669174194336, + "learning_rate": 9.889246108143155e-06, + "loss": 5.2437, + "step": 13290 + }, + { + "epoch": 0.2704874674479167, + "grad_norm": 21.280488967895508, + "learning_rate": 9.889162435822285e-06, + "loss": 5.3052, + "step": 13295 + }, + { + "epoch": 0.2705891927083333, + "grad_norm": 16.652658462524414, + "learning_rate": 9.88907873226122e-06, + "loss": 5.2042, + "step": 13300 + }, + { + "epoch": 0.27069091796875, + "grad_norm": 15.945419311523438, + "learning_rate": 9.888994997460493e-06, + "loss": 5.1162, + "step": 13305 + }, + { + "epoch": 0.2707926432291667, + "grad_norm": 21.68709945678711, + "learning_rate": 9.88891123142064e-06, + "loss": 5.794, + "step": 13310 + }, + { + "epoch": 0.2708943684895833, + "grad_norm": 16.722280502319336, + "learning_rate": 9.888827434142195e-06, + "loss": 5.2876, + "step": 13315 + }, + { + "epoch": 0.27099609375, + "grad_norm": 15.32243824005127, + "learning_rate": 9.888743605625698e-06, + "loss": 5.5824, + "step": 13320 + }, + { + "epoch": 0.2710978190104167, + "grad_norm": 13.20791244506836, + "learning_rate": 9.888659745871677e-06, + "loss": 5.2162, + "step": 13325 + }, + { + "epoch": 0.2711995442708333, + "grad_norm": 25.98303985595703, + "learning_rate": 9.888575854880674e-06, + "loss": 5.4782, + "step": 13330 + }, + { + "epoch": 0.27130126953125, + "grad_norm": 12.819252014160156, + "learning_rate": 9.888491932653224e-06, + "loss": 5.191, + "step": 13335 + }, + { + "epoch": 0.2714029947916667, + "grad_norm": 20.537189483642578, + "learning_rate": 9.888407979189861e-06, + "loss": 5.4951, + "step": 13340 + }, + { + "epoch": 0.2715047200520833, + "grad_norm": 18.54091453552246, + "learning_rate": 9.888323994491124e-06, + "loss": 5.1576, + "step": 13345 + }, + { + "epoch": 0.2716064453125, + "grad_norm": 15.95925235748291, + "learning_rate": 9.888239978557548e-06, + "loss": 5.1562, + "step": 13350 + }, + { + "epoch": 0.2717081705729167, + "grad_norm": 15.733892440795898, + "learning_rate": 9.88815593138967e-06, + "loss": 5.3049, + "step": 13355 + }, + { + "epoch": 0.2718098958333333, + "grad_norm": 14.696085929870605, + "learning_rate": 9.888071852988028e-06, + "loss": 5.2868, + "step": 13360 + }, + { + "epoch": 0.27191162109375, + "grad_norm": 18.76316261291504, + "learning_rate": 9.887987743353159e-06, + "loss": 5.3956, + "step": 13365 + }, + { + "epoch": 0.2720133463541667, + "grad_norm": 15.16319751739502, + "learning_rate": 9.887903602485598e-06, + "loss": 5.3629, + "step": 13370 + }, + { + "epoch": 0.2721150716145833, + "grad_norm": 17.44271469116211, + "learning_rate": 9.887819430385886e-06, + "loss": 5.3829, + "step": 13375 + }, + { + "epoch": 0.272216796875, + "grad_norm": 17.10733985900879, + "learning_rate": 9.887735227054558e-06, + "loss": 5.3702, + "step": 13380 + }, + { + "epoch": 0.2723185221354167, + "grad_norm": 19.1688232421875, + "learning_rate": 9.887650992492155e-06, + "loss": 5.4468, + "step": 13385 + }, + { + "epoch": 0.2724202473958333, + "grad_norm": 19.040115356445312, + "learning_rate": 9.887566726699212e-06, + "loss": 5.4956, + "step": 13390 + }, + { + "epoch": 0.27252197265625, + "grad_norm": 13.673918724060059, + "learning_rate": 9.887482429676272e-06, + "loss": 5.3488, + "step": 13395 + }, + { + "epoch": 0.2726236979166667, + "grad_norm": 14.647294998168945, + "learning_rate": 9.887398101423868e-06, + "loss": 5.5209, + "step": 13400 + }, + { + "epoch": 0.2727254231770833, + "grad_norm": 16.676822662353516, + "learning_rate": 9.887313741942543e-06, + "loss": 5.1745, + "step": 13405 + }, + { + "epoch": 0.2728271484375, + "grad_norm": 20.33732032775879, + "learning_rate": 9.887229351232834e-06, + "loss": 5.1716, + "step": 13410 + }, + { + "epoch": 0.2729288736979167, + "grad_norm": 21.311750411987305, + "learning_rate": 9.887144929295282e-06, + "loss": 5.154, + "step": 13415 + }, + { + "epoch": 0.2730305989583333, + "grad_norm": 16.088863372802734, + "learning_rate": 9.887060476130423e-06, + "loss": 5.3406, + "step": 13420 + }, + { + "epoch": 0.27313232421875, + "grad_norm": 15.982013702392578, + "learning_rate": 9.886975991738801e-06, + "loss": 5.3675, + "step": 13425 + }, + { + "epoch": 0.2732340494791667, + "grad_norm": 23.52423095703125, + "learning_rate": 9.886891476120954e-06, + "loss": 5.3428, + "step": 13430 + }, + { + "epoch": 0.2733357747395833, + "grad_norm": 20.4605770111084, + "learning_rate": 9.88680692927742e-06, + "loss": 5.4594, + "step": 13435 + }, + { + "epoch": 0.2734375, + "grad_norm": 13.906359672546387, + "learning_rate": 9.886722351208742e-06, + "loss": 5.5805, + "step": 13440 + }, + { + "epoch": 0.2735392252604167, + "grad_norm": 14.36928653717041, + "learning_rate": 9.88663774191546e-06, + "loss": 5.2073, + "step": 13445 + }, + { + "epoch": 0.2736409505208333, + "grad_norm": 16.715377807617188, + "learning_rate": 9.886553101398113e-06, + "loss": 5.3021, + "step": 13450 + }, + { + "epoch": 0.27374267578125, + "grad_norm": 13.705324172973633, + "learning_rate": 9.886468429657244e-06, + "loss": 5.2369, + "step": 13455 + }, + { + "epoch": 0.2738444010416667, + "grad_norm": 12.896946907043457, + "learning_rate": 9.886383726693392e-06, + "loss": 5.2445, + "step": 13460 + }, + { + "epoch": 0.2739461263020833, + "grad_norm": 13.513357162475586, + "learning_rate": 9.8862989925071e-06, + "loss": 5.6916, + "step": 13465 + }, + { + "epoch": 0.2740478515625, + "grad_norm": 16.578876495361328, + "learning_rate": 9.886214227098907e-06, + "loss": 5.595, + "step": 13470 + }, + { + "epoch": 0.2741495768229167, + "grad_norm": 13.075139045715332, + "learning_rate": 9.886129430469358e-06, + "loss": 5.4595, + "step": 13475 + }, + { + "epoch": 0.2742513020833333, + "grad_norm": 16.313627243041992, + "learning_rate": 9.886044602618992e-06, + "loss": 5.6515, + "step": 13480 + }, + { + "epoch": 0.27435302734375, + "grad_norm": 17.889192581176758, + "learning_rate": 9.885959743548351e-06, + "loss": 5.6363, + "step": 13485 + }, + { + "epoch": 0.2744547526041667, + "grad_norm": 15.316781997680664, + "learning_rate": 9.885874853257979e-06, + "loss": 5.5195, + "step": 13490 + }, + { + "epoch": 0.2745564778645833, + "grad_norm": 16.716110229492188, + "learning_rate": 9.885789931748418e-06, + "loss": 5.382, + "step": 13495 + }, + { + "epoch": 0.274658203125, + "grad_norm": 16.226335525512695, + "learning_rate": 9.885704979020211e-06, + "loss": 5.2632, + "step": 13500 + }, + { + "epoch": 0.2747599283854167, + "grad_norm": 17.901004791259766, + "learning_rate": 9.885619995073899e-06, + "loss": 5.096, + "step": 13505 + }, + { + "epoch": 0.2748616536458333, + "grad_norm": 27.77037811279297, + "learning_rate": 9.885534979910028e-06, + "loss": 5.5472, + "step": 13510 + }, + { + "epoch": 0.27496337890625, + "grad_norm": 14.879955291748047, + "learning_rate": 9.885449933529137e-06, + "loss": 5.4005, + "step": 13515 + }, + { + "epoch": 0.2750651041666667, + "grad_norm": 15.7170991897583, + "learning_rate": 9.885364855931772e-06, + "loss": 5.337, + "step": 13520 + }, + { + "epoch": 0.2751668294270833, + "grad_norm": 19.575199127197266, + "learning_rate": 9.885279747118478e-06, + "loss": 5.3216, + "step": 13525 + }, + { + "epoch": 0.2752685546875, + "grad_norm": 19.362911224365234, + "learning_rate": 9.885194607089796e-06, + "loss": 5.2814, + "step": 13530 + }, + { + "epoch": 0.2753702799479167, + "grad_norm": 18.403207778930664, + "learning_rate": 9.885109435846272e-06, + "loss": 5.3112, + "step": 13535 + }, + { + "epoch": 0.2754720052083333, + "grad_norm": 14.181669235229492, + "learning_rate": 9.88502423338845e-06, + "loss": 5.3228, + "step": 13540 + }, + { + "epoch": 0.27557373046875, + "grad_norm": 16.085926055908203, + "learning_rate": 9.884938999716872e-06, + "loss": 5.5486, + "step": 13545 + }, + { + "epoch": 0.2756754557291667, + "grad_norm": 15.793941497802734, + "learning_rate": 9.884853734832085e-06, + "loss": 5.1047, + "step": 13550 + }, + { + "epoch": 0.2757771809895833, + "grad_norm": 19.47258758544922, + "learning_rate": 9.884768438734633e-06, + "loss": 5.3506, + "step": 13555 + }, + { + "epoch": 0.27587890625, + "grad_norm": 14.105216979980469, + "learning_rate": 9.884683111425062e-06, + "loss": 5.1852, + "step": 13560 + }, + { + "epoch": 0.2759806315104167, + "grad_norm": 13.647377967834473, + "learning_rate": 9.884597752903918e-06, + "loss": 5.1666, + "step": 13565 + }, + { + "epoch": 0.2760823567708333, + "grad_norm": 15.49664306640625, + "learning_rate": 9.884512363171744e-06, + "loss": 5.0928, + "step": 13570 + }, + { + "epoch": 0.27618408203125, + "grad_norm": 35.25752639770508, + "learning_rate": 9.884426942229084e-06, + "loss": 5.1396, + "step": 13575 + }, + { + "epoch": 0.2762858072916667, + "grad_norm": 15.64350414276123, + "learning_rate": 9.884341490076489e-06, + "loss": 5.1818, + "step": 13580 + }, + { + "epoch": 0.2763875325520833, + "grad_norm": 15.06644344329834, + "learning_rate": 9.8842560067145e-06, + "loss": 5.3236, + "step": 13585 + }, + { + "epoch": 0.2764892578125, + "grad_norm": 16.682498931884766, + "learning_rate": 9.884170492143668e-06, + "loss": 5.4, + "step": 13590 + }, + { + "epoch": 0.2765909830729167, + "grad_norm": 13.46158218383789, + "learning_rate": 9.884084946364537e-06, + "loss": 5.1653, + "step": 13595 + }, + { + "epoch": 0.2766927083333333, + "grad_norm": 14.88409423828125, + "learning_rate": 9.883999369377652e-06, + "loss": 5.44, + "step": 13600 + }, + { + "epoch": 0.27679443359375, + "grad_norm": 19.814537048339844, + "learning_rate": 9.883913761183564e-06, + "loss": 5.2079, + "step": 13605 + }, + { + "epoch": 0.2768961588541667, + "grad_norm": 14.06745719909668, + "learning_rate": 9.883828121782814e-06, + "loss": 5.4003, + "step": 13610 + }, + { + "epoch": 0.2769978841145833, + "grad_norm": 17.528093338012695, + "learning_rate": 9.883742451175955e-06, + "loss": 5.3598, + "step": 13615 + }, + { + "epoch": 0.277099609375, + "grad_norm": 16.526782989501953, + "learning_rate": 9.883656749363533e-06, + "loss": 5.4146, + "step": 13620 + }, + { + "epoch": 0.2772013346354167, + "grad_norm": 13.307659149169922, + "learning_rate": 9.883571016346091e-06, + "loss": 5.3769, + "step": 13625 + }, + { + "epoch": 0.2773030598958333, + "grad_norm": 17.48053550720215, + "learning_rate": 9.883485252124183e-06, + "loss": 5.3302, + "step": 13630 + }, + { + "epoch": 0.27740478515625, + "grad_norm": 21.283966064453125, + "learning_rate": 9.883399456698356e-06, + "loss": 5.4107, + "step": 13635 + }, + { + "epoch": 0.2775065104166667, + "grad_norm": 17.523147583007812, + "learning_rate": 9.883313630069154e-06, + "loss": 5.3033, + "step": 13640 + }, + { + "epoch": 0.2776082356770833, + "grad_norm": 18.098953247070312, + "learning_rate": 9.883227772237129e-06, + "loss": 5.2907, + "step": 13645 + }, + { + "epoch": 0.2777099609375, + "grad_norm": 13.3773832321167, + "learning_rate": 9.883141883202827e-06, + "loss": 5.0716, + "step": 13650 + }, + { + "epoch": 0.2778116861979167, + "grad_norm": 22.04088020324707, + "learning_rate": 9.883055962966801e-06, + "loss": 5.2921, + "step": 13655 + }, + { + "epoch": 0.2779134114583333, + "grad_norm": 14.220072746276855, + "learning_rate": 9.882970011529595e-06, + "loss": 5.2029, + "step": 13660 + }, + { + "epoch": 0.27801513671875, + "grad_norm": 12.688633918762207, + "learning_rate": 9.882884028891763e-06, + "loss": 5.4445, + "step": 13665 + }, + { + "epoch": 0.2781168619791667, + "grad_norm": 13.431916236877441, + "learning_rate": 9.882798015053852e-06, + "loss": 5.373, + "step": 13670 + }, + { + "epoch": 0.2782185872395833, + "grad_norm": 17.785913467407227, + "learning_rate": 9.88271197001641e-06, + "loss": 5.3396, + "step": 13675 + }, + { + "epoch": 0.2783203125, + "grad_norm": 16.09080696105957, + "learning_rate": 9.882625893779988e-06, + "loss": 5.2459, + "step": 13680 + }, + { + "epoch": 0.2784220377604167, + "grad_norm": 16.201974868774414, + "learning_rate": 9.88253978634514e-06, + "loss": 5.3094, + "step": 13685 + }, + { + "epoch": 0.2785237630208333, + "grad_norm": 24.32994842529297, + "learning_rate": 9.88245364771241e-06, + "loss": 5.3529, + "step": 13690 + }, + { + "epoch": 0.27862548828125, + "grad_norm": 14.071203231811523, + "learning_rate": 9.882367477882352e-06, + "loss": 5.1088, + "step": 13695 + }, + { + "epoch": 0.2787272135416667, + "grad_norm": 13.819483757019043, + "learning_rate": 9.882281276855514e-06, + "loss": 5.4494, + "step": 13700 + }, + { + "epoch": 0.2788289388020833, + "grad_norm": 16.591476440429688, + "learning_rate": 9.882195044632451e-06, + "loss": 5.3635, + "step": 13705 + }, + { + "epoch": 0.2789306640625, + "grad_norm": 22.214162826538086, + "learning_rate": 9.88210878121371e-06, + "loss": 5.2187, + "step": 13710 + }, + { + "epoch": 0.2790323893229167, + "grad_norm": 18.626415252685547, + "learning_rate": 9.882022486599845e-06, + "loss": 5.2084, + "step": 13715 + }, + { + "epoch": 0.2791341145833333, + "grad_norm": 16.55177879333496, + "learning_rate": 9.881936160791405e-06, + "loss": 5.33, + "step": 13720 + }, + { + "epoch": 0.27923583984375, + "grad_norm": 20.229753494262695, + "learning_rate": 9.881849803788942e-06, + "loss": 5.5447, + "step": 13725 + }, + { + "epoch": 0.2793375651041667, + "grad_norm": 18.02922821044922, + "learning_rate": 9.881763415593009e-06, + "loss": 5.1444, + "step": 13730 + }, + { + "epoch": 0.2794392903645833, + "grad_norm": 14.503365516662598, + "learning_rate": 9.881676996204158e-06, + "loss": 5.6193, + "step": 13735 + }, + { + "epoch": 0.279541015625, + "grad_norm": 14.343502044677734, + "learning_rate": 9.88159054562294e-06, + "loss": 5.2699, + "step": 13740 + }, + { + "epoch": 0.2796427408854167, + "grad_norm": 17.4074649810791, + "learning_rate": 9.881504063849909e-06, + "loss": 5.2629, + "step": 13745 + }, + { + "epoch": 0.2797444661458333, + "grad_norm": 13.60301399230957, + "learning_rate": 9.881417550885614e-06, + "loss": 5.4414, + "step": 13750 + }, + { + "epoch": 0.27984619140625, + "grad_norm": 16.502521514892578, + "learning_rate": 9.881331006730615e-06, + "loss": 5.2779, + "step": 13755 + }, + { + "epoch": 0.2799479166666667, + "grad_norm": 16.752933502197266, + "learning_rate": 9.881244431385457e-06, + "loss": 5.1977, + "step": 13760 + }, + { + "epoch": 0.2800496419270833, + "grad_norm": 13.659860610961914, + "learning_rate": 9.881157824850696e-06, + "loss": 5.3021, + "step": 13765 + }, + { + "epoch": 0.2801513671875, + "grad_norm": 16.25604248046875, + "learning_rate": 9.881071187126888e-06, + "loss": 5.3426, + "step": 13770 + }, + { + "epoch": 0.2802530924479167, + "grad_norm": 18.96290397644043, + "learning_rate": 9.880984518214585e-06, + "loss": 5.1953, + "step": 13775 + }, + { + "epoch": 0.2803548177083333, + "grad_norm": 14.046361923217773, + "learning_rate": 9.880897818114338e-06, + "loss": 5.3573, + "step": 13780 + }, + { + "epoch": 0.28045654296875, + "grad_norm": 13.858022689819336, + "learning_rate": 9.880811086826705e-06, + "loss": 5.1675, + "step": 13785 + }, + { + "epoch": 0.2805582682291667, + "grad_norm": 14.52027702331543, + "learning_rate": 9.880724324352238e-06, + "loss": 5.2585, + "step": 13790 + }, + { + "epoch": 0.2806599934895833, + "grad_norm": 12.068622589111328, + "learning_rate": 9.880637530691493e-06, + "loss": 5.3394, + "step": 13795 + }, + { + "epoch": 0.28076171875, + "grad_norm": 15.006784439086914, + "learning_rate": 9.880550705845023e-06, + "loss": 5.4389, + "step": 13800 + }, + { + "epoch": 0.2808634440104167, + "grad_norm": 16.78105354309082, + "learning_rate": 9.880463849813382e-06, + "loss": 5.1348, + "step": 13805 + }, + { + "epoch": 0.2809651692708333, + "grad_norm": 19.371440887451172, + "learning_rate": 9.880376962597127e-06, + "loss": 5.4165, + "step": 13810 + }, + { + "epoch": 0.28106689453125, + "grad_norm": 18.72345542907715, + "learning_rate": 9.880290044196812e-06, + "loss": 5.0407, + "step": 13815 + }, + { + "epoch": 0.2811686197916667, + "grad_norm": 17.14739227294922, + "learning_rate": 9.880203094612995e-06, + "loss": 5.4727, + "step": 13820 + }, + { + "epoch": 0.2812703450520833, + "grad_norm": 17.643789291381836, + "learning_rate": 9.880116113846228e-06, + "loss": 5.4685, + "step": 13825 + }, + { + "epoch": 0.2813720703125, + "grad_norm": 20.421009063720703, + "learning_rate": 9.880029101897067e-06, + "loss": 5.378, + "step": 13830 + }, + { + "epoch": 0.2814737955729167, + "grad_norm": 20.076128005981445, + "learning_rate": 9.87994205876607e-06, + "loss": 5.2872, + "step": 13835 + }, + { + "epoch": 0.2815755208333333, + "grad_norm": 18.399642944335938, + "learning_rate": 9.879854984453793e-06, + "loss": 5.3529, + "step": 13840 + }, + { + "epoch": 0.28167724609375, + "grad_norm": 11.925419807434082, + "learning_rate": 9.879767878960792e-06, + "loss": 5.3977, + "step": 13845 + }, + { + "epoch": 0.2817789713541667, + "grad_norm": 18.883981704711914, + "learning_rate": 9.879680742287621e-06, + "loss": 5.1811, + "step": 13850 + }, + { + "epoch": 0.2818806966145833, + "grad_norm": 16.482158660888672, + "learning_rate": 9.879593574434841e-06, + "loss": 5.1772, + "step": 13855 + }, + { + "epoch": 0.281982421875, + "grad_norm": 18.888351440429688, + "learning_rate": 9.879506375403007e-06, + "loss": 5.4671, + "step": 13860 + }, + { + "epoch": 0.2820841471354167, + "grad_norm": 20.71738624572754, + "learning_rate": 9.879419145192676e-06, + "loss": 5.4266, + "step": 13865 + }, + { + "epoch": 0.2821858723958333, + "grad_norm": 18.43577766418457, + "learning_rate": 9.879331883804406e-06, + "loss": 5.1973, + "step": 13870 + }, + { + "epoch": 0.28228759765625, + "grad_norm": 15.284587860107422, + "learning_rate": 9.879244591238754e-06, + "loss": 5.5072, + "step": 13875 + }, + { + "epoch": 0.2823893229166667, + "grad_norm": 14.086450576782227, + "learning_rate": 9.879157267496277e-06, + "loss": 5.3169, + "step": 13880 + }, + { + "epoch": 0.2824910481770833, + "grad_norm": 14.644593238830566, + "learning_rate": 9.879069912577534e-06, + "loss": 5.4165, + "step": 13885 + }, + { + "epoch": 0.2825927734375, + "grad_norm": 16.20244026184082, + "learning_rate": 9.878982526483083e-06, + "loss": 5.3287, + "step": 13890 + }, + { + "epoch": 0.2826944986979167, + "grad_norm": 17.89614486694336, + "learning_rate": 9.878895109213485e-06, + "loss": 5.4596, + "step": 13895 + }, + { + "epoch": 0.2827962239583333, + "grad_norm": 16.860013961791992, + "learning_rate": 9.878807660769294e-06, + "loss": 5.521, + "step": 13900 + }, + { + "epoch": 0.28289794921875, + "grad_norm": 16.16902732849121, + "learning_rate": 9.87872018115107e-06, + "loss": 5.2936, + "step": 13905 + }, + { + "epoch": 0.2829996744791667, + "grad_norm": 15.003143310546875, + "learning_rate": 9.878632670359374e-06, + "loss": 5.4673, + "step": 13910 + }, + { + "epoch": 0.2831013997395833, + "grad_norm": 15.957413673400879, + "learning_rate": 9.878545128394764e-06, + "loss": 5.4187, + "step": 13915 + }, + { + "epoch": 0.283203125, + "grad_norm": 14.566047668457031, + "learning_rate": 9.8784575552578e-06, + "loss": 5.3048, + "step": 13920 + }, + { + "epoch": 0.2833048502604167, + "grad_norm": 15.130486488342285, + "learning_rate": 9.878369950949038e-06, + "loss": 5.5084, + "step": 13925 + }, + { + "epoch": 0.2834065755208333, + "grad_norm": 19.73043441772461, + "learning_rate": 9.878282315469043e-06, + "loss": 5.3095, + "step": 13930 + }, + { + "epoch": 0.28350830078125, + "grad_norm": 16.33690643310547, + "learning_rate": 9.87819464881837e-06, + "loss": 5.2024, + "step": 13935 + }, + { + "epoch": 0.2836100260416667, + "grad_norm": 15.2044038772583, + "learning_rate": 9.878106950997586e-06, + "loss": 5.1166, + "step": 13940 + }, + { + "epoch": 0.2837117513020833, + "grad_norm": 18.936033248901367, + "learning_rate": 9.878019222007245e-06, + "loss": 5.1919, + "step": 13945 + }, + { + "epoch": 0.2838134765625, + "grad_norm": 14.943696975708008, + "learning_rate": 9.877931461847908e-06, + "loss": 5.2798, + "step": 13950 + }, + { + "epoch": 0.2839152018229167, + "grad_norm": 16.632604598999023, + "learning_rate": 9.877843670520139e-06, + "loss": 5.3737, + "step": 13955 + }, + { + "epoch": 0.2840169270833333, + "grad_norm": 13.943733215332031, + "learning_rate": 9.877755848024497e-06, + "loss": 5.4678, + "step": 13960 + }, + { + "epoch": 0.28411865234375, + "grad_norm": 14.652146339416504, + "learning_rate": 9.877667994361543e-06, + "loss": 5.2514, + "step": 13965 + }, + { + "epoch": 0.2842203776041667, + "grad_norm": 20.126379013061523, + "learning_rate": 9.877580109531842e-06, + "loss": 5.4156, + "step": 13970 + }, + { + "epoch": 0.2843221028645833, + "grad_norm": 15.742185592651367, + "learning_rate": 9.877492193535949e-06, + "loss": 5.2612, + "step": 13975 + }, + { + "epoch": 0.284423828125, + "grad_norm": 18.837491989135742, + "learning_rate": 9.877404246374431e-06, + "loss": 5.5444, + "step": 13980 + }, + { + "epoch": 0.2845255533854167, + "grad_norm": 14.136551856994629, + "learning_rate": 9.877316268047847e-06, + "loss": 5.3016, + "step": 13985 + }, + { + "epoch": 0.2846272786458333, + "grad_norm": 14.896967887878418, + "learning_rate": 9.87722825855676e-06, + "loss": 5.4666, + "step": 13990 + }, + { + "epoch": 0.28472900390625, + "grad_norm": 18.682682037353516, + "learning_rate": 9.877140217901735e-06, + "loss": 5.1736, + "step": 13995 + }, + { + "epoch": 0.2848307291666667, + "grad_norm": 15.299565315246582, + "learning_rate": 9.87705214608333e-06, + "loss": 5.1064, + "step": 14000 + }, + { + "epoch": 0.2849324544270833, + "grad_norm": 18.72525978088379, + "learning_rate": 9.87696404310211e-06, + "loss": 5.2512, + "step": 14005 + }, + { + "epoch": 0.2850341796875, + "grad_norm": 14.35976791381836, + "learning_rate": 9.876875908958639e-06, + "loss": 5.2163, + "step": 14010 + }, + { + "epoch": 0.2851359049479167, + "grad_norm": 18.51493263244629, + "learning_rate": 9.876787743653478e-06, + "loss": 5.3594, + "step": 14015 + }, + { + "epoch": 0.2852376302083333, + "grad_norm": 16.843549728393555, + "learning_rate": 9.876699547187193e-06, + "loss": 5.431, + "step": 14020 + }, + { + "epoch": 0.28533935546875, + "grad_norm": 16.067481994628906, + "learning_rate": 9.876611319560345e-06, + "loss": 5.1922, + "step": 14025 + }, + { + "epoch": 0.2854410807291667, + "grad_norm": 16.86836051940918, + "learning_rate": 9.876523060773498e-06, + "loss": 5.5159, + "step": 14030 + }, + { + "epoch": 0.2855428059895833, + "grad_norm": 18.324691772460938, + "learning_rate": 9.876434770827218e-06, + "loss": 5.2923, + "step": 14035 + }, + { + "epoch": 0.28564453125, + "grad_norm": 17.186494827270508, + "learning_rate": 9.876346449722067e-06, + "loss": 5.4703, + "step": 14040 + }, + { + "epoch": 0.2857462565104167, + "grad_norm": 19.112499237060547, + "learning_rate": 9.87625809745861e-06, + "loss": 5.4247, + "step": 14045 + }, + { + "epoch": 0.2858479817708333, + "grad_norm": 14.15392780303955, + "learning_rate": 9.876169714037413e-06, + "loss": 5.2793, + "step": 14050 + }, + { + "epoch": 0.28594970703125, + "grad_norm": 17.966764450073242, + "learning_rate": 9.876081299459037e-06, + "loss": 5.3126, + "step": 14055 + }, + { + "epoch": 0.2860514322916667, + "grad_norm": 12.048892974853516, + "learning_rate": 9.875992853724052e-06, + "loss": 5.59, + "step": 14060 + }, + { + "epoch": 0.2861531575520833, + "grad_norm": 12.306517601013184, + "learning_rate": 9.87590437683302e-06, + "loss": 5.1627, + "step": 14065 + }, + { + "epoch": 0.2862548828125, + "grad_norm": 12.380208969116211, + "learning_rate": 9.875815868786504e-06, + "loss": 5.3024, + "step": 14070 + }, + { + "epoch": 0.2863566080729167, + "grad_norm": 16.76248550415039, + "learning_rate": 9.875727329585077e-06, + "loss": 5.3266, + "step": 14075 + }, + { + "epoch": 0.2864583333333333, + "grad_norm": 15.482563018798828, + "learning_rate": 9.875638759229297e-06, + "loss": 5.1619, + "step": 14080 + }, + { + "epoch": 0.28656005859375, + "grad_norm": 13.556976318359375, + "learning_rate": 9.875550157719733e-06, + "loss": 5.1294, + "step": 14085 + }, + { + "epoch": 0.2866617838541667, + "grad_norm": 20.264265060424805, + "learning_rate": 9.875461525056953e-06, + "loss": 5.1303, + "step": 14090 + }, + { + "epoch": 0.2867635091145833, + "grad_norm": 15.667293548583984, + "learning_rate": 9.87537286124152e-06, + "loss": 5.2419, + "step": 14095 + }, + { + "epoch": 0.286865234375, + "grad_norm": 18.681194305419922, + "learning_rate": 9.875284166274002e-06, + "loss": 5.2476, + "step": 14100 + }, + { + "epoch": 0.2869669596354167, + "grad_norm": 20.106372833251953, + "learning_rate": 9.875195440154968e-06, + "loss": 4.98, + "step": 14105 + }, + { + "epoch": 0.2870686848958333, + "grad_norm": 15.26014232635498, + "learning_rate": 9.87510668288498e-06, + "loss": 5.131, + "step": 14110 + }, + { + "epoch": 0.28717041015625, + "grad_norm": 16.23550796508789, + "learning_rate": 9.87501789446461e-06, + "loss": 5.2521, + "step": 14115 + }, + { + "epoch": 0.2872721354166667, + "grad_norm": 14.14841365814209, + "learning_rate": 9.874929074894421e-06, + "loss": 5.188, + "step": 14120 + }, + { + "epoch": 0.2873738606770833, + "grad_norm": 20.430490493774414, + "learning_rate": 9.874840224174984e-06, + "loss": 5.3988, + "step": 14125 + }, + { + "epoch": 0.2874755859375, + "grad_norm": 19.166337966918945, + "learning_rate": 9.874751342306865e-06, + "loss": 5.3664, + "step": 14130 + }, + { + "epoch": 0.2875773111979167, + "grad_norm": 21.18666648864746, + "learning_rate": 9.874662429290633e-06, + "loss": 5.2817, + "step": 14135 + }, + { + "epoch": 0.2876790364583333, + "grad_norm": 16.09397315979004, + "learning_rate": 9.874573485126855e-06, + "loss": 5.3001, + "step": 14140 + }, + { + "epoch": 0.28778076171875, + "grad_norm": 21.79697608947754, + "learning_rate": 9.8744845098161e-06, + "loss": 5.313, + "step": 14145 + }, + { + "epoch": 0.2878824869791667, + "grad_norm": 15.901567459106445, + "learning_rate": 9.874395503358937e-06, + "loss": 5.5071, + "step": 14150 + }, + { + "epoch": 0.2879842122395833, + "grad_norm": 16.83692169189453, + "learning_rate": 9.874306465755934e-06, + "loss": 5.1369, + "step": 14155 + }, + { + "epoch": 0.2880859375, + "grad_norm": 20.25044059753418, + "learning_rate": 9.874217397007659e-06, + "loss": 5.6143, + "step": 14160 + }, + { + "epoch": 0.2881876627604167, + "grad_norm": 14.286147117614746, + "learning_rate": 9.874128297114683e-06, + "loss": 5.3208, + "step": 14165 + }, + { + "epoch": 0.2882893880208333, + "grad_norm": 15.878064155578613, + "learning_rate": 9.874039166077573e-06, + "loss": 5.3041, + "step": 14170 + }, + { + "epoch": 0.28839111328125, + "grad_norm": 21.170509338378906, + "learning_rate": 9.873950003896901e-06, + "loss": 5.2466, + "step": 14175 + }, + { + "epoch": 0.2884928385416667, + "grad_norm": 16.606998443603516, + "learning_rate": 9.873860810573237e-06, + "loss": 5.2378, + "step": 14180 + }, + { + "epoch": 0.2885945638020833, + "grad_norm": 14.700422286987305, + "learning_rate": 9.873771586107147e-06, + "loss": 5.287, + "step": 14185 + }, + { + "epoch": 0.2886962890625, + "grad_norm": 21.14668083190918, + "learning_rate": 9.873682330499206e-06, + "loss": 5.1287, + "step": 14190 + }, + { + "epoch": 0.2887980143229167, + "grad_norm": 16.80910873413086, + "learning_rate": 9.87359304374998e-06, + "loss": 5.4088, + "step": 14195 + }, + { + "epoch": 0.2888997395833333, + "grad_norm": 20.918317794799805, + "learning_rate": 9.873503725860044e-06, + "loss": 5.3188, + "step": 14200 + }, + { + "epoch": 0.28900146484375, + "grad_norm": 17.56306266784668, + "learning_rate": 9.873414376829963e-06, + "loss": 5.4499, + "step": 14205 + }, + { + "epoch": 0.2891031901041667, + "grad_norm": 16.503746032714844, + "learning_rate": 9.873324996660314e-06, + "loss": 5.2278, + "step": 14210 + }, + { + "epoch": 0.2892049153645833, + "grad_norm": 15.792562484741211, + "learning_rate": 9.873235585351664e-06, + "loss": 5.2855, + "step": 14215 + }, + { + "epoch": 0.289306640625, + "grad_norm": 16.65228843688965, + "learning_rate": 9.873146142904587e-06, + "loss": 5.6001, + "step": 14220 + }, + { + "epoch": 0.2894083658854167, + "grad_norm": 18.289289474487305, + "learning_rate": 9.873056669319651e-06, + "loss": 5.1268, + "step": 14225 + }, + { + "epoch": 0.2895100911458333, + "grad_norm": 23.111661911010742, + "learning_rate": 9.87296716459743e-06, + "loss": 5.2413, + "step": 14230 + }, + { + "epoch": 0.28961181640625, + "grad_norm": 20.399364471435547, + "learning_rate": 9.872877628738496e-06, + "loss": 4.9997, + "step": 14235 + }, + { + "epoch": 0.2897135416666667, + "grad_norm": 16.71027946472168, + "learning_rate": 9.872788061743422e-06, + "loss": 5.3921, + "step": 14240 + }, + { + "epoch": 0.2898152669270833, + "grad_norm": 22.07367515563965, + "learning_rate": 9.872698463612778e-06, + "loss": 5.4624, + "step": 14245 + }, + { + "epoch": 0.2899169921875, + "grad_norm": 13.200479507446289, + "learning_rate": 9.872608834347138e-06, + "loss": 5.3962, + "step": 14250 + }, + { + "epoch": 0.2900187174479167, + "grad_norm": 14.624479293823242, + "learning_rate": 9.872519173947075e-06, + "loss": 5.3019, + "step": 14255 + }, + { + "epoch": 0.2901204427083333, + "grad_norm": 15.867794036865234, + "learning_rate": 9.87242948241316e-06, + "loss": 5.2153, + "step": 14260 + }, + { + "epoch": 0.29022216796875, + "grad_norm": 20.5881290435791, + "learning_rate": 9.872339759745969e-06, + "loss": 5.2286, + "step": 14265 + }, + { + "epoch": 0.2903238932291667, + "grad_norm": 22.79790496826172, + "learning_rate": 9.872250005946072e-06, + "loss": 5.4336, + "step": 14270 + }, + { + "epoch": 0.2904256184895833, + "grad_norm": 11.54698371887207, + "learning_rate": 9.872160221014044e-06, + "loss": 5.2588, + "step": 14275 + }, + { + "epoch": 0.29052734375, + "grad_norm": 14.721654891967773, + "learning_rate": 9.87207040495046e-06, + "loss": 5.2325, + "step": 14280 + }, + { + "epoch": 0.2906290690104167, + "grad_norm": 15.575557708740234, + "learning_rate": 9.871980557755892e-06, + "loss": 5.3117, + "step": 14285 + }, + { + "epoch": 0.2907307942708333, + "grad_norm": 19.544597625732422, + "learning_rate": 9.871890679430913e-06, + "loss": 5.5131, + "step": 14290 + }, + { + "epoch": 0.29083251953125, + "grad_norm": 15.575849533081055, + "learning_rate": 9.871800769976103e-06, + "loss": 5.3516, + "step": 14295 + }, + { + "epoch": 0.2909342447916667, + "grad_norm": 13.183148384094238, + "learning_rate": 9.871710829392029e-06, + "loss": 5.2609, + "step": 14300 + }, + { + "epoch": 0.2910359700520833, + "grad_norm": 17.62382698059082, + "learning_rate": 9.871620857679272e-06, + "loss": 5.2639, + "step": 14305 + }, + { + "epoch": 0.2911376953125, + "grad_norm": 14.941354751586914, + "learning_rate": 9.871530854838402e-06, + "loss": 5.0777, + "step": 14310 + }, + { + "epoch": 0.2912394205729167, + "grad_norm": 31.816072463989258, + "learning_rate": 9.871440820869998e-06, + "loss": 5.3353, + "step": 14315 + }, + { + "epoch": 0.2913411458333333, + "grad_norm": 15.068482398986816, + "learning_rate": 9.871350755774634e-06, + "loss": 5.3385, + "step": 14320 + }, + { + "epoch": 0.29144287109375, + "grad_norm": 17.761411666870117, + "learning_rate": 9.871260659552882e-06, + "loss": 5.5214, + "step": 14325 + }, + { + "epoch": 0.2915445963541667, + "grad_norm": 16.819616317749023, + "learning_rate": 9.871170532205323e-06, + "loss": 5.3434, + "step": 14330 + }, + { + "epoch": 0.2916463216145833, + "grad_norm": 18.116117477416992, + "learning_rate": 9.871080373732532e-06, + "loss": 5.3042, + "step": 14335 + }, + { + "epoch": 0.291748046875, + "grad_norm": 17.219913482666016, + "learning_rate": 9.870990184135081e-06, + "loss": 5.0299, + "step": 14340 + }, + { + "epoch": 0.2918497721354167, + "grad_norm": 15.502840995788574, + "learning_rate": 9.87089996341355e-06, + "loss": 5.1278, + "step": 14345 + }, + { + "epoch": 0.2919514973958333, + "grad_norm": 16.56673240661621, + "learning_rate": 9.870809711568515e-06, + "loss": 5.0727, + "step": 14350 + }, + { + "epoch": 0.29205322265625, + "grad_norm": 17.672719955444336, + "learning_rate": 9.87071942860055e-06, + "loss": 5.4635, + "step": 14355 + }, + { + "epoch": 0.2921549479166667, + "grad_norm": 13.247444152832031, + "learning_rate": 9.870629114510236e-06, + "loss": 5.3098, + "step": 14360 + }, + { + "epoch": 0.2922566731770833, + "grad_norm": 13.507805824279785, + "learning_rate": 9.870538769298147e-06, + "loss": 5.0379, + "step": 14365 + }, + { + "epoch": 0.2923583984375, + "grad_norm": 16.15684700012207, + "learning_rate": 9.870448392964863e-06, + "loss": 5.0987, + "step": 14370 + }, + { + "epoch": 0.2924601236979167, + "grad_norm": 15.40950870513916, + "learning_rate": 9.870357985510959e-06, + "loss": 5.1239, + "step": 14375 + }, + { + "epoch": 0.2925618489583333, + "grad_norm": 13.134892463684082, + "learning_rate": 9.870267546937013e-06, + "loss": 5.3752, + "step": 14380 + }, + { + "epoch": 0.29266357421875, + "grad_norm": 16.523605346679688, + "learning_rate": 9.870177077243604e-06, + "loss": 5.2707, + "step": 14385 + }, + { + "epoch": 0.2927652994791667, + "grad_norm": 15.795208930969238, + "learning_rate": 9.87008657643131e-06, + "loss": 5.3173, + "step": 14390 + }, + { + "epoch": 0.2928670247395833, + "grad_norm": 14.398679733276367, + "learning_rate": 9.869996044500708e-06, + "loss": 5.4643, + "step": 14395 + }, + { + "epoch": 0.29296875, + "grad_norm": 19.520700454711914, + "learning_rate": 9.869905481452377e-06, + "loss": 5.6186, + "step": 14400 + }, + { + "epoch": 0.2930704752604167, + "grad_norm": 18.973352432250977, + "learning_rate": 9.869814887286897e-06, + "loss": 5.4267, + "step": 14405 + }, + { + "epoch": 0.2931722005208333, + "grad_norm": 13.545119285583496, + "learning_rate": 9.869724262004845e-06, + "loss": 5.456, + "step": 14410 + }, + { + "epoch": 0.29327392578125, + "grad_norm": 16.087966918945312, + "learning_rate": 9.8696336056068e-06, + "loss": 5.3043, + "step": 14415 + }, + { + "epoch": 0.2933756510416667, + "grad_norm": 14.869192123413086, + "learning_rate": 9.869542918093344e-06, + "loss": 5.3743, + "step": 14420 + }, + { + "epoch": 0.2934773763020833, + "grad_norm": 18.900367736816406, + "learning_rate": 9.869452199465052e-06, + "loss": 5.3971, + "step": 14425 + }, + { + "epoch": 0.2935791015625, + "grad_norm": 20.462688446044922, + "learning_rate": 9.869361449722508e-06, + "loss": 5.1597, + "step": 14430 + }, + { + "epoch": 0.2936808268229167, + "grad_norm": 17.170303344726562, + "learning_rate": 9.86927066886629e-06, + "loss": 5.1925, + "step": 14435 + }, + { + "epoch": 0.2937825520833333, + "grad_norm": 15.824849128723145, + "learning_rate": 9.869179856896977e-06, + "loss": 5.1856, + "step": 14440 + }, + { + "epoch": 0.29388427734375, + "grad_norm": 22.06117057800293, + "learning_rate": 9.869089013815151e-06, + "loss": 5.3188, + "step": 14445 + }, + { + "epoch": 0.2939860026041667, + "grad_norm": 18.277433395385742, + "learning_rate": 9.868998139621394e-06, + "loss": 5.2619, + "step": 14450 + }, + { + "epoch": 0.2940877278645833, + "grad_norm": 14.251946449279785, + "learning_rate": 9.868907234316283e-06, + "loss": 5.0257, + "step": 14455 + }, + { + "epoch": 0.294189453125, + "grad_norm": 20.966333389282227, + "learning_rate": 9.868816297900398e-06, + "loss": 5.6754, + "step": 14460 + }, + { + "epoch": 0.2942911783854167, + "grad_norm": 35.59882354736328, + "learning_rate": 9.868725330374326e-06, + "loss": 5.1232, + "step": 14465 + }, + { + "epoch": 0.2943929036458333, + "grad_norm": 12.61946964263916, + "learning_rate": 9.868634331738643e-06, + "loss": 5.2461, + "step": 14470 + }, + { + "epoch": 0.29449462890625, + "grad_norm": 16.771770477294922, + "learning_rate": 9.868543301993932e-06, + "loss": 5.5227, + "step": 14475 + }, + { + "epoch": 0.2945963541666667, + "grad_norm": 20.012136459350586, + "learning_rate": 9.868452241140776e-06, + "loss": 5.3532, + "step": 14480 + }, + { + "epoch": 0.2946980794270833, + "grad_norm": 17.626543045043945, + "learning_rate": 9.868361149179753e-06, + "loss": 5.5607, + "step": 14485 + }, + { + "epoch": 0.2947998046875, + "grad_norm": 18.613773345947266, + "learning_rate": 9.86827002611145e-06, + "loss": 5.3513, + "step": 14490 + }, + { + "epoch": 0.2949015299479167, + "grad_norm": 22.039142608642578, + "learning_rate": 9.868178871936447e-06, + "loss": 5.2494, + "step": 14495 + }, + { + "epoch": 0.2950032552083333, + "grad_norm": 17.815311431884766, + "learning_rate": 9.868087686655325e-06, + "loss": 4.9754, + "step": 14500 + }, + { + "epoch": 0.29510498046875, + "grad_norm": 17.915496826171875, + "learning_rate": 9.867996470268667e-06, + "loss": 5.4055, + "step": 14505 + }, + { + "epoch": 0.2952067057291667, + "grad_norm": 16.734848022460938, + "learning_rate": 9.867905222777061e-06, + "loss": 5.4908, + "step": 14510 + }, + { + "epoch": 0.2953084309895833, + "grad_norm": 23.068857192993164, + "learning_rate": 9.867813944181083e-06, + "loss": 5.3704, + "step": 14515 + }, + { + "epoch": 0.29541015625, + "grad_norm": 13.313309669494629, + "learning_rate": 9.867722634481319e-06, + "loss": 5.1531, + "step": 14520 + }, + { + "epoch": 0.2955118815104167, + "grad_norm": 13.423197746276855, + "learning_rate": 9.867631293678354e-06, + "loss": 5.6891, + "step": 14525 + }, + { + "epoch": 0.2956136067708333, + "grad_norm": 21.49677085876465, + "learning_rate": 9.867539921772768e-06, + "loss": 5.3486, + "step": 14530 + }, + { + "epoch": 0.29571533203125, + "grad_norm": 23.650840759277344, + "learning_rate": 9.86744851876515e-06, + "loss": 5.5453, + "step": 14535 + }, + { + "epoch": 0.2958170572916667, + "grad_norm": 17.951969146728516, + "learning_rate": 9.86735708465608e-06, + "loss": 5.1283, + "step": 14540 + }, + { + "epoch": 0.2959187825520833, + "grad_norm": 15.802026748657227, + "learning_rate": 9.867265619446142e-06, + "loss": 5.0531, + "step": 14545 + }, + { + "epoch": 0.2960205078125, + "grad_norm": 16.32301902770996, + "learning_rate": 9.867174123135924e-06, + "loss": 5.1175, + "step": 14550 + }, + { + "epoch": 0.2961222330729167, + "grad_norm": 15.423812866210938, + "learning_rate": 9.867082595726005e-06, + "loss": 5.2924, + "step": 14555 + }, + { + "epoch": 0.2962239583333333, + "grad_norm": 14.752782821655273, + "learning_rate": 9.866991037216977e-06, + "loss": 4.8946, + "step": 14560 + }, + { + "epoch": 0.29632568359375, + "grad_norm": 17.77853012084961, + "learning_rate": 9.866899447609419e-06, + "loss": 5.3326, + "step": 14565 + }, + { + "epoch": 0.2964274088541667, + "grad_norm": 18.644216537475586, + "learning_rate": 9.866807826903918e-06, + "loss": 5.1986, + "step": 14570 + }, + { + "epoch": 0.2965291341145833, + "grad_norm": 13.400472640991211, + "learning_rate": 9.866716175101062e-06, + "loss": 5.1685, + "step": 14575 + }, + { + "epoch": 0.296630859375, + "grad_norm": 16.80168342590332, + "learning_rate": 9.866624492201434e-06, + "loss": 5.2225, + "step": 14580 + }, + { + "epoch": 0.2967325846354167, + "grad_norm": 16.168453216552734, + "learning_rate": 9.86653277820562e-06, + "loss": 5.0694, + "step": 14585 + }, + { + "epoch": 0.2968343098958333, + "grad_norm": 13.47756576538086, + "learning_rate": 9.866441033114206e-06, + "loss": 5.3814, + "step": 14590 + }, + { + "epoch": 0.29693603515625, + "grad_norm": 12.173975944519043, + "learning_rate": 9.866349256927778e-06, + "loss": 5.1063, + "step": 14595 + }, + { + "epoch": 0.2970377604166667, + "grad_norm": 15.742836952209473, + "learning_rate": 9.866257449646925e-06, + "loss": 5.3058, + "step": 14600 + }, + { + "epoch": 0.2971394856770833, + "grad_norm": 17.555078506469727, + "learning_rate": 9.86616561127223e-06, + "loss": 5.0221, + "step": 14605 + }, + { + "epoch": 0.2972412109375, + "grad_norm": 19.241151809692383, + "learning_rate": 9.866073741804282e-06, + "loss": 5.4284, + "step": 14610 + }, + { + "epoch": 0.2973429361979167, + "grad_norm": 29.21381950378418, + "learning_rate": 9.865981841243668e-06, + "loss": 5.4943, + "step": 14615 + }, + { + "epoch": 0.2974446614583333, + "grad_norm": 15.403910636901855, + "learning_rate": 9.865889909590974e-06, + "loss": 5.3515, + "step": 14620 + }, + { + "epoch": 0.29754638671875, + "grad_norm": 17.03386688232422, + "learning_rate": 9.86579794684679e-06, + "loss": 5.3898, + "step": 14625 + }, + { + "epoch": 0.2976481119791667, + "grad_norm": 15.446100234985352, + "learning_rate": 9.8657059530117e-06, + "loss": 5.0758, + "step": 14630 + }, + { + "epoch": 0.2977498372395833, + "grad_norm": 15.195470809936523, + "learning_rate": 9.865613928086292e-06, + "loss": 5.1457, + "step": 14635 + }, + { + "epoch": 0.2978515625, + "grad_norm": 16.664064407348633, + "learning_rate": 9.865521872071158e-06, + "loss": 5.358, + "step": 14640 + }, + { + "epoch": 0.2979532877604167, + "grad_norm": 19.309791564941406, + "learning_rate": 9.865429784966882e-06, + "loss": 5.4198, + "step": 14645 + }, + { + "epoch": 0.2980550130208333, + "grad_norm": 16.41381072998047, + "learning_rate": 9.865337666774055e-06, + "loss": 5.1846, + "step": 14650 + }, + { + "epoch": 0.29815673828125, + "grad_norm": 14.125533103942871, + "learning_rate": 9.865245517493266e-06, + "loss": 4.9772, + "step": 14655 + }, + { + "epoch": 0.2982584635416667, + "grad_norm": 15.152359008789062, + "learning_rate": 9.865153337125101e-06, + "loss": 5.2993, + "step": 14660 + }, + { + "epoch": 0.2983601888020833, + "grad_norm": 13.020310401916504, + "learning_rate": 9.865061125670153e-06, + "loss": 5.4168, + "step": 14665 + }, + { + "epoch": 0.2984619140625, + "grad_norm": 15.035855293273926, + "learning_rate": 9.864968883129004e-06, + "loss": 5.2614, + "step": 14670 + }, + { + "epoch": 0.2985636393229167, + "grad_norm": 18.081335067749023, + "learning_rate": 9.864876609502251e-06, + "loss": 5.1624, + "step": 14675 + }, + { + "epoch": 0.2986653645833333, + "grad_norm": 15.166487693786621, + "learning_rate": 9.864784304790483e-06, + "loss": 5.413, + "step": 14680 + }, + { + "epoch": 0.29876708984375, + "grad_norm": 14.87021541595459, + "learning_rate": 9.864691968994285e-06, + "loss": 5.1425, + "step": 14685 + }, + { + "epoch": 0.2988688151041667, + "grad_norm": 17.340595245361328, + "learning_rate": 9.864599602114249e-06, + "loss": 5.2716, + "step": 14690 + }, + { + "epoch": 0.2989705403645833, + "grad_norm": 18.59521484375, + "learning_rate": 9.864507204150969e-06, + "loss": 5.3848, + "step": 14695 + }, + { + "epoch": 0.299072265625, + "grad_norm": 29.67017364501953, + "learning_rate": 9.864414775105029e-06, + "loss": 5.2748, + "step": 14700 + }, + { + "epoch": 0.2991739908854167, + "grad_norm": 16.693635940551758, + "learning_rate": 9.864322314977025e-06, + "loss": 5.549, + "step": 14705 + }, + { + "epoch": 0.2992757161458333, + "grad_norm": 17.39828872680664, + "learning_rate": 9.864229823767543e-06, + "loss": 5.1966, + "step": 14710 + }, + { + "epoch": 0.29937744140625, + "grad_norm": 14.555005073547363, + "learning_rate": 9.864137301477177e-06, + "loss": 5.4035, + "step": 14715 + }, + { + "epoch": 0.2994791666666667, + "grad_norm": 13.546113967895508, + "learning_rate": 9.864044748106519e-06, + "loss": 5.5723, + "step": 14720 + }, + { + "epoch": 0.2995808919270833, + "grad_norm": 17.400943756103516, + "learning_rate": 9.863952163656158e-06, + "loss": 5.6095, + "step": 14725 + }, + { + "epoch": 0.2996826171875, + "grad_norm": 12.90721607208252, + "learning_rate": 9.863859548126689e-06, + "loss": 5.2548, + "step": 14730 + }, + { + "epoch": 0.2997843424479167, + "grad_norm": 14.430899620056152, + "learning_rate": 9.863766901518698e-06, + "loss": 5.4661, + "step": 14735 + }, + { + "epoch": 0.2998860677083333, + "grad_norm": 18.20379066467285, + "learning_rate": 9.863674223832784e-06, + "loss": 5.0006, + "step": 14740 + }, + { + "epoch": 0.29998779296875, + "grad_norm": 17.473365783691406, + "learning_rate": 9.863581515069534e-06, + "loss": 5.2037, + "step": 14745 + }, + { + "epoch": 0.3000895182291667, + "grad_norm": 19.688444137573242, + "learning_rate": 9.863488775229541e-06, + "loss": 5.1139, + "step": 14750 + }, + { + "epoch": 0.3001912434895833, + "grad_norm": 18.689111709594727, + "learning_rate": 9.863396004313399e-06, + "loss": 5.3028, + "step": 14755 + }, + { + "epoch": 0.30029296875, + "grad_norm": 18.48975372314453, + "learning_rate": 9.863303202321701e-06, + "loss": 5.2423, + "step": 14760 + }, + { + "epoch": 0.3003946940104167, + "grad_norm": 14.622264862060547, + "learning_rate": 9.863210369255041e-06, + "loss": 5.3009, + "step": 14765 + }, + { + "epoch": 0.3004964192708333, + "grad_norm": 15.566975593566895, + "learning_rate": 9.863117505114009e-06, + "loss": 5.1417, + "step": 14770 + }, + { + "epoch": 0.30059814453125, + "grad_norm": 16.1068172454834, + "learning_rate": 9.863024609899198e-06, + "loss": 5.0898, + "step": 14775 + }, + { + "epoch": 0.3006998697916667, + "grad_norm": 13.124166488647461, + "learning_rate": 9.862931683611206e-06, + "loss": 5.2406, + "step": 14780 + }, + { + "epoch": 0.3008015950520833, + "grad_norm": 14.229702949523926, + "learning_rate": 9.862838726250623e-06, + "loss": 5.1883, + "step": 14785 + }, + { + "epoch": 0.3009033203125, + "grad_norm": 26.793153762817383, + "learning_rate": 9.862745737818045e-06, + "loss": 5.243, + "step": 14790 + }, + { + "epoch": 0.3010050455729167, + "grad_norm": 16.869787216186523, + "learning_rate": 9.862652718314065e-06, + "loss": 5.3548, + "step": 14795 + }, + { + "epoch": 0.3011067708333333, + "grad_norm": 19.15056037902832, + "learning_rate": 9.862559667739278e-06, + "loss": 5.187, + "step": 14800 + }, + { + "epoch": 0.30120849609375, + "grad_norm": 19.05392837524414, + "learning_rate": 9.862466586094278e-06, + "loss": 5.2683, + "step": 14805 + }, + { + "epoch": 0.3013102213541667, + "grad_norm": 15.688380241394043, + "learning_rate": 9.86237347337966e-06, + "loss": 5.1949, + "step": 14810 + }, + { + "epoch": 0.3014119466145833, + "grad_norm": 17.994050979614258, + "learning_rate": 9.86228032959602e-06, + "loss": 5.3195, + "step": 14815 + }, + { + "epoch": 0.301513671875, + "grad_norm": 15.440589904785156, + "learning_rate": 9.86218715474395e-06, + "loss": 5.2563, + "step": 14820 + }, + { + "epoch": 0.3016153971354167, + "grad_norm": 15.075746536254883, + "learning_rate": 9.86209394882405e-06, + "loss": 5.1836, + "step": 14825 + }, + { + "epoch": 0.3017171223958333, + "grad_norm": 22.59146499633789, + "learning_rate": 9.862000711836912e-06, + "loss": 5.0741, + "step": 14830 + }, + { + "epoch": 0.30181884765625, + "grad_norm": 13.374832153320312, + "learning_rate": 9.861907443783131e-06, + "loss": 5.2646, + "step": 14835 + }, + { + "epoch": 0.3019205729166667, + "grad_norm": 15.566507339477539, + "learning_rate": 9.861814144663308e-06, + "loss": 5.5104, + "step": 14840 + }, + { + "epoch": 0.3020222981770833, + "grad_norm": 16.200382232666016, + "learning_rate": 9.861720814478036e-06, + "loss": 5.1651, + "step": 14845 + }, + { + "epoch": 0.3021240234375, + "grad_norm": 13.924320220947266, + "learning_rate": 9.861627453227909e-06, + "loss": 5.3485, + "step": 14850 + }, + { + "epoch": 0.3022257486979167, + "grad_norm": 15.791015625, + "learning_rate": 9.861534060913527e-06, + "loss": 5.2329, + "step": 14855 + }, + { + "epoch": 0.3023274739583333, + "grad_norm": 14.590411186218262, + "learning_rate": 9.861440637535485e-06, + "loss": 5.3952, + "step": 14860 + }, + { + "epoch": 0.30242919921875, + "grad_norm": 22.993600845336914, + "learning_rate": 9.861347183094381e-06, + "loss": 5.1772, + "step": 14865 + }, + { + "epoch": 0.3025309244791667, + "grad_norm": 15.755889892578125, + "learning_rate": 9.86125369759081e-06, + "loss": 5.2704, + "step": 14870 + }, + { + "epoch": 0.3026326497395833, + "grad_norm": 14.630293846130371, + "learning_rate": 9.861160181025376e-06, + "loss": 5.317, + "step": 14875 + }, + { + "epoch": 0.302734375, + "grad_norm": 12.753288269042969, + "learning_rate": 9.861066633398668e-06, + "loss": 5.2997, + "step": 14880 + }, + { + "epoch": 0.3028361002604167, + "grad_norm": 13.707475662231445, + "learning_rate": 9.860973054711287e-06, + "loss": 5.254, + "step": 14885 + }, + { + "epoch": 0.3029378255208333, + "grad_norm": 15.99266529083252, + "learning_rate": 9.86087944496383e-06, + "loss": 5.5196, + "step": 14890 + }, + { + "epoch": 0.30303955078125, + "grad_norm": 15.952250480651855, + "learning_rate": 9.860785804156901e-06, + "loss": 5.4954, + "step": 14895 + }, + { + "epoch": 0.3031412760416667, + "grad_norm": 19.05000877380371, + "learning_rate": 9.86069213229109e-06, + "loss": 5.2652, + "step": 14900 + }, + { + "epoch": 0.3032430013020833, + "grad_norm": 18.21732521057129, + "learning_rate": 9.860598429367001e-06, + "loss": 5.2998, + "step": 14905 + }, + { + "epoch": 0.3033447265625, + "grad_norm": 22.94626808166504, + "learning_rate": 9.86050469538523e-06, + "loss": 5.0521, + "step": 14910 + }, + { + "epoch": 0.3034464518229167, + "grad_norm": 13.962320327758789, + "learning_rate": 9.860410930346378e-06, + "loss": 5.0749, + "step": 14915 + }, + { + "epoch": 0.3035481770833333, + "grad_norm": 13.566903114318848, + "learning_rate": 9.86031713425104e-06, + "loss": 5.4046, + "step": 14920 + }, + { + "epoch": 0.30364990234375, + "grad_norm": 19.38048553466797, + "learning_rate": 9.860223307099822e-06, + "loss": 5.3345, + "step": 14925 + }, + { + "epoch": 0.3037516276041667, + "grad_norm": 13.825709342956543, + "learning_rate": 9.860129448893318e-06, + "loss": 5.3682, + "step": 14930 + }, + { + "epoch": 0.3038533528645833, + "grad_norm": 18.784786224365234, + "learning_rate": 9.86003555963213e-06, + "loss": 5.203, + "step": 14935 + }, + { + "epoch": 0.303955078125, + "grad_norm": 19.723005294799805, + "learning_rate": 9.859941639316859e-06, + "loss": 5.4165, + "step": 14940 + }, + { + "epoch": 0.3040568033854167, + "grad_norm": 15.6521577835083, + "learning_rate": 9.859847687948102e-06, + "loss": 5.0823, + "step": 14945 + }, + { + "epoch": 0.3041585286458333, + "grad_norm": 18.49175262451172, + "learning_rate": 9.859753705526462e-06, + "loss": 5.06, + "step": 14950 + }, + { + "epoch": 0.30426025390625, + "grad_norm": 22.732614517211914, + "learning_rate": 9.859659692052538e-06, + "loss": 5.6277, + "step": 14955 + }, + { + "epoch": 0.3043619791666667, + "grad_norm": 20.92228889465332, + "learning_rate": 9.85956564752693e-06, + "loss": 5.2746, + "step": 14960 + }, + { + "epoch": 0.3044637044270833, + "grad_norm": 20.765623092651367, + "learning_rate": 9.859471571950243e-06, + "loss": 5.2335, + "step": 14965 + }, + { + "epoch": 0.3045654296875, + "grad_norm": 16.53041648864746, + "learning_rate": 9.859377465323072e-06, + "loss": 5.2556, + "step": 14970 + }, + { + "epoch": 0.3046671549479167, + "grad_norm": 15.460558891296387, + "learning_rate": 9.859283327646024e-06, + "loss": 5.3636, + "step": 14975 + }, + { + "epoch": 0.3047688802083333, + "grad_norm": 20.615324020385742, + "learning_rate": 9.859189158919697e-06, + "loss": 5.0738, + "step": 14980 + }, + { + "epoch": 0.30487060546875, + "grad_norm": 16.432785034179688, + "learning_rate": 9.859094959144695e-06, + "loss": 5.0944, + "step": 14985 + }, + { + "epoch": 0.3049723307291667, + "grad_norm": 13.74108600616455, + "learning_rate": 9.859000728321617e-06, + "loss": 5.1756, + "step": 14990 + }, + { + "epoch": 0.3050740559895833, + "grad_norm": 14.229251861572266, + "learning_rate": 9.858906466451069e-06, + "loss": 5.3068, + "step": 14995 + }, + { + "epoch": 0.30517578125, + "grad_norm": 15.246853828430176, + "learning_rate": 9.858812173533648e-06, + "loss": 5.3491, + "step": 15000 + }, + { + "epoch": 0.3052775065104167, + "grad_norm": 16.21933937072754, + "learning_rate": 9.85871784956996e-06, + "loss": 5.4227, + "step": 15005 + }, + { + "epoch": 0.3053792317708333, + "grad_norm": 23.48605728149414, + "learning_rate": 9.85862349456061e-06, + "loss": 5.4231, + "step": 15010 + }, + { + "epoch": 0.30548095703125, + "grad_norm": 15.061319351196289, + "learning_rate": 9.858529108506196e-06, + "loss": 5.6873, + "step": 15015 + }, + { + "epoch": 0.3055826822916667, + "grad_norm": 16.723947525024414, + "learning_rate": 9.858434691407325e-06, + "loss": 5.2065, + "step": 15020 + }, + { + "epoch": 0.3056844075520833, + "grad_norm": 17.30099868774414, + "learning_rate": 9.858340243264598e-06, + "loss": 5.5672, + "step": 15025 + }, + { + "epoch": 0.3057861328125, + "grad_norm": 17.183013916015625, + "learning_rate": 9.858245764078618e-06, + "loss": 4.9866, + "step": 15030 + }, + { + "epoch": 0.3058878580729167, + "grad_norm": 16.28622817993164, + "learning_rate": 9.858151253849992e-06, + "loss": 5.4798, + "step": 15035 + }, + { + "epoch": 0.3059895833333333, + "grad_norm": 13.607527732849121, + "learning_rate": 9.858056712579319e-06, + "loss": 5.4517, + "step": 15040 + }, + { + "epoch": 0.30609130859375, + "grad_norm": 19.68058967590332, + "learning_rate": 9.857962140267208e-06, + "loss": 5.1869, + "step": 15045 + }, + { + "epoch": 0.3061930338541667, + "grad_norm": 21.730424880981445, + "learning_rate": 9.857867536914259e-06, + "loss": 5.3438, + "step": 15050 + }, + { + "epoch": 0.3062947591145833, + "grad_norm": 16.844497680664062, + "learning_rate": 9.85777290252108e-06, + "loss": 5.1218, + "step": 15055 + }, + { + "epoch": 0.306396484375, + "grad_norm": 13.255350112915039, + "learning_rate": 9.857678237088273e-06, + "loss": 5.5402, + "step": 15060 + }, + { + "epoch": 0.3064982096354167, + "grad_norm": 19.66354751586914, + "learning_rate": 9.857583540616446e-06, + "loss": 5.2686, + "step": 15065 + }, + { + "epoch": 0.3065999348958333, + "grad_norm": 17.987926483154297, + "learning_rate": 9.8574888131062e-06, + "loss": 5.3696, + "step": 15070 + }, + { + "epoch": 0.30670166015625, + "grad_norm": 23.045793533325195, + "learning_rate": 9.857394054558144e-06, + "loss": 5.0028, + "step": 15075 + }, + { + "epoch": 0.3068033854166667, + "grad_norm": 16.892942428588867, + "learning_rate": 9.857299264972882e-06, + "loss": 5.0722, + "step": 15080 + }, + { + "epoch": 0.3069051106770833, + "grad_norm": 20.857437133789062, + "learning_rate": 9.857204444351018e-06, + "loss": 5.6734, + "step": 15085 + }, + { + "epoch": 0.3070068359375, + "grad_norm": 17.466014862060547, + "learning_rate": 9.857109592693162e-06, + "loss": 5.2902, + "step": 15090 + }, + { + "epoch": 0.3071085611979167, + "grad_norm": 15.772829055786133, + "learning_rate": 9.857014709999915e-06, + "loss": 5.3693, + "step": 15095 + }, + { + "epoch": 0.3072102864583333, + "grad_norm": 13.188721656799316, + "learning_rate": 9.856919796271886e-06, + "loss": 5.5321, + "step": 15100 + }, + { + "epoch": 0.30731201171875, + "grad_norm": 21.326444625854492, + "learning_rate": 9.856824851509683e-06, + "loss": 5.2488, + "step": 15105 + }, + { + "epoch": 0.3074137369791667, + "grad_norm": 14.98425006866455, + "learning_rate": 9.85672987571391e-06, + "loss": 5.1836, + "step": 15110 + }, + { + "epoch": 0.3075154622395833, + "grad_norm": 17.64933204650879, + "learning_rate": 9.856634868885175e-06, + "loss": 5.3847, + "step": 15115 + }, + { + "epoch": 0.3076171875, + "grad_norm": 21.12501335144043, + "learning_rate": 9.856539831024085e-06, + "loss": 5.7121, + "step": 15120 + }, + { + "epoch": 0.3077189127604167, + "grad_norm": 13.620758056640625, + "learning_rate": 9.856444762131248e-06, + "loss": 5.4443, + "step": 15125 + }, + { + "epoch": 0.3078206380208333, + "grad_norm": 18.890254974365234, + "learning_rate": 9.85634966220727e-06, + "loss": 5.4979, + "step": 15130 + }, + { + "epoch": 0.30792236328125, + "grad_norm": 22.475561141967773, + "learning_rate": 9.856254531252757e-06, + "loss": 5.3369, + "step": 15135 + }, + { + "epoch": 0.3080240885416667, + "grad_norm": 23.55547523498535, + "learning_rate": 9.856159369268321e-06, + "loss": 5.2194, + "step": 15140 + }, + { + "epoch": 0.3081258138020833, + "grad_norm": 18.91642189025879, + "learning_rate": 9.85606417625457e-06, + "loss": 5.364, + "step": 15145 + }, + { + "epoch": 0.3082275390625, + "grad_norm": 15.055624961853027, + "learning_rate": 9.855968952212107e-06, + "loss": 5.1709, + "step": 15150 + }, + { + "epoch": 0.3083292643229167, + "grad_norm": 16.70476722717285, + "learning_rate": 9.855873697141547e-06, + "loss": 5.3324, + "step": 15155 + }, + { + "epoch": 0.3084309895833333, + "grad_norm": 23.773244857788086, + "learning_rate": 9.855778411043493e-06, + "loss": 5.4538, + "step": 15160 + }, + { + "epoch": 0.30853271484375, + "grad_norm": 14.4259614944458, + "learning_rate": 9.855683093918557e-06, + "loss": 5.091, + "step": 15165 + }, + { + "epoch": 0.3086344401041667, + "grad_norm": 19.715322494506836, + "learning_rate": 9.855587745767348e-06, + "loss": 5.1612, + "step": 15170 + }, + { + "epoch": 0.3087361653645833, + "grad_norm": 20.445505142211914, + "learning_rate": 9.855492366590476e-06, + "loss": 4.9714, + "step": 15175 + }, + { + "epoch": 0.308837890625, + "grad_norm": 16.10354995727539, + "learning_rate": 9.855396956388548e-06, + "loss": 5.2994, + "step": 15180 + }, + { + "epoch": 0.3089396158854167, + "grad_norm": 12.28260612487793, + "learning_rate": 9.855301515162174e-06, + "loss": 5.2283, + "step": 15185 + }, + { + "epoch": 0.3090413411458333, + "grad_norm": 10.98578929901123, + "learning_rate": 9.855206042911965e-06, + "loss": 5.222, + "step": 15190 + }, + { + "epoch": 0.30914306640625, + "grad_norm": 17.463979721069336, + "learning_rate": 9.85511053963853e-06, + "loss": 5.2439, + "step": 15195 + }, + { + "epoch": 0.3092447916666667, + "grad_norm": 13.245140075683594, + "learning_rate": 9.855015005342481e-06, + "loss": 5.1606, + "step": 15200 + }, + { + "epoch": 0.3093465169270833, + "grad_norm": 14.206634521484375, + "learning_rate": 9.854919440024426e-06, + "loss": 5.2447, + "step": 15205 + }, + { + "epoch": 0.3094482421875, + "grad_norm": 19.64805793762207, + "learning_rate": 9.854823843684979e-06, + "loss": 5.1659, + "step": 15210 + }, + { + "epoch": 0.3095499674479167, + "grad_norm": 13.97397232055664, + "learning_rate": 9.854728216324748e-06, + "loss": 5.2084, + "step": 15215 + }, + { + "epoch": 0.3096516927083333, + "grad_norm": 14.359854698181152, + "learning_rate": 9.854632557944344e-06, + "loss": 5.0006, + "step": 15220 + }, + { + "epoch": 0.30975341796875, + "grad_norm": 14.365852355957031, + "learning_rate": 9.85453686854438e-06, + "loss": 5.2846, + "step": 15225 + }, + { + "epoch": 0.3098551432291667, + "grad_norm": 11.41340446472168, + "learning_rate": 9.854441148125465e-06, + "loss": 5.0832, + "step": 15230 + }, + { + "epoch": 0.3099568684895833, + "grad_norm": 18.5701961517334, + "learning_rate": 9.854345396688212e-06, + "loss": 5.6349, + "step": 15235 + }, + { + "epoch": 0.31005859375, + "grad_norm": 21.260589599609375, + "learning_rate": 9.854249614233235e-06, + "loss": 5.1085, + "step": 15240 + }, + { + "epoch": 0.3101603190104167, + "grad_norm": 29.11577796936035, + "learning_rate": 9.854153800761143e-06, + "loss": 5.2833, + "step": 15245 + }, + { + "epoch": 0.3102620442708333, + "grad_norm": 13.79372501373291, + "learning_rate": 9.854057956272549e-06, + "loss": 5.2601, + "step": 15250 + }, + { + "epoch": 0.31036376953125, + "grad_norm": 16.616436004638672, + "learning_rate": 9.853962080768065e-06, + "loss": 5.2698, + "step": 15255 + }, + { + "epoch": 0.3104654947916667, + "grad_norm": 19.973861694335938, + "learning_rate": 9.853866174248304e-06, + "loss": 5.2614, + "step": 15260 + }, + { + "epoch": 0.3105672200520833, + "grad_norm": 21.560068130493164, + "learning_rate": 9.85377023671388e-06, + "loss": 5.1, + "step": 15265 + }, + { + "epoch": 0.3106689453125, + "grad_norm": 14.271023750305176, + "learning_rate": 9.853674268165406e-06, + "loss": 5.4478, + "step": 15270 + }, + { + "epoch": 0.3107706705729167, + "grad_norm": 17.985090255737305, + "learning_rate": 9.85357826860349e-06, + "loss": 5.353, + "step": 15275 + }, + { + "epoch": 0.3108723958333333, + "grad_norm": 21.30684471130371, + "learning_rate": 9.853482238028752e-06, + "loss": 5.5669, + "step": 15280 + }, + { + "epoch": 0.31097412109375, + "grad_norm": 17.286500930786133, + "learning_rate": 9.853386176441804e-06, + "loss": 5.2321, + "step": 15285 + }, + { + "epoch": 0.3110758463541667, + "grad_norm": 22.098764419555664, + "learning_rate": 9.853290083843258e-06, + "loss": 5.2306, + "step": 15290 + }, + { + "epoch": 0.3111775716145833, + "grad_norm": 14.456120491027832, + "learning_rate": 9.853193960233732e-06, + "loss": 5.3127, + "step": 15295 + }, + { + "epoch": 0.311279296875, + "grad_norm": 16.356178283691406, + "learning_rate": 9.853097805613833e-06, + "loss": 5.195, + "step": 15300 + }, + { + "epoch": 0.3113810221354167, + "grad_norm": 16.671934127807617, + "learning_rate": 9.853001619984182e-06, + "loss": 5.1489, + "step": 15305 + }, + { + "epoch": 0.3114827473958333, + "grad_norm": 13.874044418334961, + "learning_rate": 9.852905403345389e-06, + "loss": 5.0078, + "step": 15310 + }, + { + "epoch": 0.31158447265625, + "grad_norm": 23.091020584106445, + "learning_rate": 9.852809155698074e-06, + "loss": 5.3167, + "step": 15315 + }, + { + "epoch": 0.3116861979166667, + "grad_norm": 18.33286476135254, + "learning_rate": 9.852712877042848e-06, + "loss": 5.3089, + "step": 15320 + }, + { + "epoch": 0.3117879231770833, + "grad_norm": 18.070959091186523, + "learning_rate": 9.852616567380327e-06, + "loss": 5.1177, + "step": 15325 + }, + { + "epoch": 0.3118896484375, + "grad_norm": 18.177337646484375, + "learning_rate": 9.852520226711125e-06, + "loss": 5.2086, + "step": 15330 + }, + { + "epoch": 0.3119913736979167, + "grad_norm": 17.748291015625, + "learning_rate": 9.85242385503586e-06, + "loss": 4.9976, + "step": 15335 + }, + { + "epoch": 0.3120930989583333, + "grad_norm": 19.539894104003906, + "learning_rate": 9.852327452355148e-06, + "loss": 5.4995, + "step": 15340 + }, + { + "epoch": 0.31219482421875, + "grad_norm": 13.938675880432129, + "learning_rate": 9.852231018669602e-06, + "loss": 5.3079, + "step": 15345 + }, + { + "epoch": 0.3122965494791667, + "grad_norm": 16.28421974182129, + "learning_rate": 9.852134553979842e-06, + "loss": 5.7132, + "step": 15350 + }, + { + "epoch": 0.3123982747395833, + "grad_norm": 18.423599243164062, + "learning_rate": 9.852038058286482e-06, + "loss": 5.2325, + "step": 15355 + }, + { + "epoch": 0.3125, + "grad_norm": 19.38621711730957, + "learning_rate": 9.85194153159014e-06, + "loss": 5.1449, + "step": 15360 + }, + { + "epoch": 0.3126017252604167, + "grad_norm": 17.582229614257812, + "learning_rate": 9.851844973891428e-06, + "loss": 5.1755, + "step": 15365 + }, + { + "epoch": 0.3127034505208333, + "grad_norm": 20.77060317993164, + "learning_rate": 9.851748385190971e-06, + "loss": 5.0942, + "step": 15370 + }, + { + "epoch": 0.31280517578125, + "grad_norm": 19.456985473632812, + "learning_rate": 9.85165176548938e-06, + "loss": 5.2077, + "step": 15375 + }, + { + "epoch": 0.3129069010416667, + "grad_norm": 19.01134490966797, + "learning_rate": 9.851555114787275e-06, + "loss": 5.6164, + "step": 15380 + }, + { + "epoch": 0.3130086263020833, + "grad_norm": 17.09613800048828, + "learning_rate": 9.851458433085273e-06, + "loss": 5.1372, + "step": 15385 + }, + { + "epoch": 0.3131103515625, + "grad_norm": 14.572750091552734, + "learning_rate": 9.851361720383992e-06, + "loss": 5.3523, + "step": 15390 + }, + { + "epoch": 0.3132120768229167, + "grad_norm": 18.767507553100586, + "learning_rate": 9.851264976684049e-06, + "loss": 5.3634, + "step": 15395 + }, + { + "epoch": 0.3133138020833333, + "grad_norm": 18.13228416442871, + "learning_rate": 9.851168201986062e-06, + "loss": 5.3391, + "step": 15400 + }, + { + "epoch": 0.31341552734375, + "grad_norm": 16.122243881225586, + "learning_rate": 9.851071396290651e-06, + "loss": 5.0872, + "step": 15405 + }, + { + "epoch": 0.3135172526041667, + "grad_norm": 15.334650039672852, + "learning_rate": 9.850974559598434e-06, + "loss": 5.2275, + "step": 15410 + }, + { + "epoch": 0.3136189778645833, + "grad_norm": 14.5213041305542, + "learning_rate": 9.85087769191003e-06, + "loss": 5.3616, + "step": 15415 + }, + { + "epoch": 0.313720703125, + "grad_norm": 12.091636657714844, + "learning_rate": 9.850780793226056e-06, + "loss": 5.2855, + "step": 15420 + }, + { + "epoch": 0.3138224283854167, + "grad_norm": 23.312572479248047, + "learning_rate": 9.850683863547132e-06, + "loss": 5.2915, + "step": 15425 + }, + { + "epoch": 0.3139241536458333, + "grad_norm": 15.276582717895508, + "learning_rate": 9.850586902873879e-06, + "loss": 5.1968, + "step": 15430 + }, + { + "epoch": 0.31402587890625, + "grad_norm": 17.85337257385254, + "learning_rate": 9.850489911206916e-06, + "loss": 5.2637, + "step": 15435 + }, + { + "epoch": 0.3141276041666667, + "grad_norm": 21.065284729003906, + "learning_rate": 9.850392888546863e-06, + "loss": 5.1795, + "step": 15440 + }, + { + "epoch": 0.3142293294270833, + "grad_norm": 20.996700286865234, + "learning_rate": 9.850295834894337e-06, + "loss": 5.1762, + "step": 15445 + }, + { + "epoch": 0.3143310546875, + "grad_norm": 14.346749305725098, + "learning_rate": 9.85019875024996e-06, + "loss": 5.3604, + "step": 15450 + }, + { + "epoch": 0.3144327799479167, + "grad_norm": 17.204212188720703, + "learning_rate": 9.850101634614355e-06, + "loss": 5.2382, + "step": 15455 + }, + { + "epoch": 0.3145345052083333, + "grad_norm": 20.13806915283203, + "learning_rate": 9.850004487988139e-06, + "loss": 5.3354, + "step": 15460 + }, + { + "epoch": 0.31463623046875, + "grad_norm": 15.483142852783203, + "learning_rate": 9.849907310371933e-06, + "loss": 5.4623, + "step": 15465 + }, + { + "epoch": 0.3147379557291667, + "grad_norm": 13.133553504943848, + "learning_rate": 9.849810101766361e-06, + "loss": 5.2182, + "step": 15470 + }, + { + "epoch": 0.3148396809895833, + "grad_norm": 13.339311599731445, + "learning_rate": 9.849712862172041e-06, + "loss": 5.3393, + "step": 15475 + }, + { + "epoch": 0.31494140625, + "grad_norm": 12.960018157958984, + "learning_rate": 9.849615591589593e-06, + "loss": 5.7194, + "step": 15480 + }, + { + "epoch": 0.3150431315104167, + "grad_norm": 15.641376495361328, + "learning_rate": 9.849518290019645e-06, + "loss": 5.0854, + "step": 15485 + }, + { + "epoch": 0.3151448567708333, + "grad_norm": 12.422840118408203, + "learning_rate": 9.849420957462812e-06, + "loss": 5.6222, + "step": 15490 + }, + { + "epoch": 0.31524658203125, + "grad_norm": 15.307605743408203, + "learning_rate": 9.84932359391972e-06, + "loss": 5.1781, + "step": 15495 + }, + { + "epoch": 0.3153483072916667, + "grad_norm": 19.01751136779785, + "learning_rate": 9.849226199390988e-06, + "loss": 5.4313, + "step": 15500 + }, + { + "epoch": 0.3154500325520833, + "grad_norm": 18.133665084838867, + "learning_rate": 9.849128773877241e-06, + "loss": 5.251, + "step": 15505 + }, + { + "epoch": 0.3155517578125, + "grad_norm": 18.930377960205078, + "learning_rate": 9.8490313173791e-06, + "loss": 5.7321, + "step": 15510 + }, + { + "epoch": 0.3156534830729167, + "grad_norm": 18.792633056640625, + "learning_rate": 9.848933829897186e-06, + "loss": 5.5337, + "step": 15515 + }, + { + "epoch": 0.3157552083333333, + "grad_norm": 18.122194290161133, + "learning_rate": 9.848836311432129e-06, + "loss": 5.2173, + "step": 15520 + }, + { + "epoch": 0.31585693359375, + "grad_norm": 15.953411102294922, + "learning_rate": 9.848738761984544e-06, + "loss": 5.2229, + "step": 15525 + }, + { + "epoch": 0.3159586588541667, + "grad_norm": 14.15269947052002, + "learning_rate": 9.848641181555058e-06, + "loss": 5.0004, + "step": 15530 + }, + { + "epoch": 0.3160603841145833, + "grad_norm": 14.979864120483398, + "learning_rate": 9.848543570144294e-06, + "loss": 5.3748, + "step": 15535 + }, + { + "epoch": 0.316162109375, + "grad_norm": 14.492388725280762, + "learning_rate": 9.848445927752875e-06, + "loss": 5.4299, + "step": 15540 + }, + { + "epoch": 0.3162638346354167, + "grad_norm": 16.728242874145508, + "learning_rate": 9.848348254381428e-06, + "loss": 5.2417, + "step": 15545 + }, + { + "epoch": 0.3163655598958333, + "grad_norm": 17.730457305908203, + "learning_rate": 9.848250550030572e-06, + "loss": 5.2856, + "step": 15550 + }, + { + "epoch": 0.31646728515625, + "grad_norm": 15.9320707321167, + "learning_rate": 9.848152814700936e-06, + "loss": 5.3018, + "step": 15555 + }, + { + "epoch": 0.3165690104166667, + "grad_norm": 11.872086524963379, + "learning_rate": 9.84805504839314e-06, + "loss": 5.5246, + "step": 15560 + }, + { + "epoch": 0.3166707356770833, + "grad_norm": 15.636974334716797, + "learning_rate": 9.847957251107813e-06, + "loss": 5.3514, + "step": 15565 + }, + { + "epoch": 0.3167724609375, + "grad_norm": 13.028634071350098, + "learning_rate": 9.847859422845578e-06, + "loss": 5.243, + "step": 15570 + }, + { + "epoch": 0.3168741861979167, + "grad_norm": 19.736736297607422, + "learning_rate": 9.84776156360706e-06, + "loss": 5.2845, + "step": 15575 + }, + { + "epoch": 0.3169759114583333, + "grad_norm": 15.94448184967041, + "learning_rate": 9.847663673392885e-06, + "loss": 5.3224, + "step": 15580 + }, + { + "epoch": 0.31707763671875, + "grad_norm": 16.257110595703125, + "learning_rate": 9.847565752203678e-06, + "loss": 5.4679, + "step": 15585 + }, + { + "epoch": 0.3171793619791667, + "grad_norm": 14.339095115661621, + "learning_rate": 9.847467800040064e-06, + "loss": 5.2783, + "step": 15590 + }, + { + "epoch": 0.3172810872395833, + "grad_norm": 24.5327205657959, + "learning_rate": 9.84736981690267e-06, + "loss": 5.166, + "step": 15595 + }, + { + "epoch": 0.3173828125, + "grad_norm": 16.620256423950195, + "learning_rate": 9.847271802792122e-06, + "loss": 5.0577, + "step": 15600 + }, + { + "epoch": 0.3174845377604167, + "grad_norm": 17.731151580810547, + "learning_rate": 9.847173757709046e-06, + "loss": 5.2618, + "step": 15605 + }, + { + "epoch": 0.3175862630208333, + "grad_norm": 14.209230422973633, + "learning_rate": 9.847075681654066e-06, + "loss": 5.0385, + "step": 15610 + }, + { + "epoch": 0.31768798828125, + "grad_norm": 16.758827209472656, + "learning_rate": 9.846977574627813e-06, + "loss": 5.6223, + "step": 15615 + }, + { + "epoch": 0.3177897135416667, + "grad_norm": 19.425209045410156, + "learning_rate": 9.846879436630912e-06, + "loss": 5.071, + "step": 15620 + }, + { + "epoch": 0.3178914388020833, + "grad_norm": 14.086197853088379, + "learning_rate": 9.84678126766399e-06, + "loss": 5.2445, + "step": 15625 + }, + { + "epoch": 0.3179931640625, + "grad_norm": 17.249713897705078, + "learning_rate": 9.846683067727674e-06, + "loss": 4.9712, + "step": 15630 + }, + { + "epoch": 0.3180948893229167, + "grad_norm": 17.287090301513672, + "learning_rate": 9.846584836822593e-06, + "loss": 5.188, + "step": 15635 + }, + { + "epoch": 0.3181966145833333, + "grad_norm": 18.376922607421875, + "learning_rate": 9.846486574949373e-06, + "loss": 5.2102, + "step": 15640 + }, + { + "epoch": 0.31829833984375, + "grad_norm": 15.196942329406738, + "learning_rate": 9.84638828210864e-06, + "loss": 5.0317, + "step": 15645 + }, + { + "epoch": 0.3184000651041667, + "grad_norm": 15.77061939239502, + "learning_rate": 9.846289958301026e-06, + "loss": 5.2966, + "step": 15650 + }, + { + "epoch": 0.3185017903645833, + "grad_norm": 19.440481185913086, + "learning_rate": 9.846191603527158e-06, + "loss": 5.2578, + "step": 15655 + }, + { + "epoch": 0.318603515625, + "grad_norm": 17.030771255493164, + "learning_rate": 9.846093217787663e-06, + "loss": 5.1628, + "step": 15660 + }, + { + "epoch": 0.3187052408854167, + "grad_norm": 17.22333335876465, + "learning_rate": 9.84599480108317e-06, + "loss": 5.273, + "step": 15665 + }, + { + "epoch": 0.3188069661458333, + "grad_norm": 24.584627151489258, + "learning_rate": 9.84589635341431e-06, + "loss": 5.4811, + "step": 15670 + }, + { + "epoch": 0.31890869140625, + "grad_norm": 14.241735458374023, + "learning_rate": 9.845797874781711e-06, + "loss": 5.2611, + "step": 15675 + }, + { + "epoch": 0.3190104166666667, + "grad_norm": 19.89160919189453, + "learning_rate": 9.845699365186e-06, + "loss": 5.302, + "step": 15680 + }, + { + "epoch": 0.3191121419270833, + "grad_norm": 17.007394790649414, + "learning_rate": 9.84560082462781e-06, + "loss": 5.2917, + "step": 15685 + }, + { + "epoch": 0.3192138671875, + "grad_norm": 10.863831520080566, + "learning_rate": 9.845502253107768e-06, + "loss": 5.0833, + "step": 15690 + }, + { + "epoch": 0.3193155924479167, + "grad_norm": 20.250822067260742, + "learning_rate": 9.845403650626505e-06, + "loss": 5.3536, + "step": 15695 + }, + { + "epoch": 0.3194173177083333, + "grad_norm": 15.159502983093262, + "learning_rate": 9.845305017184653e-06, + "loss": 5.4336, + "step": 15700 + }, + { + "epoch": 0.31951904296875, + "grad_norm": 19.951183319091797, + "learning_rate": 9.845206352782838e-06, + "loss": 5.3709, + "step": 15705 + }, + { + "epoch": 0.3196207682291667, + "grad_norm": 18.599807739257812, + "learning_rate": 9.84510765742169e-06, + "loss": 5.131, + "step": 15710 + }, + { + "epoch": 0.3197224934895833, + "grad_norm": 11.526122093200684, + "learning_rate": 9.845008931101846e-06, + "loss": 5.3015, + "step": 15715 + }, + { + "epoch": 0.31982421875, + "grad_norm": 21.978322982788086, + "learning_rate": 9.844910173823931e-06, + "loss": 5.0883, + "step": 15720 + }, + { + "epoch": 0.3199259440104167, + "grad_norm": 13.71734619140625, + "learning_rate": 9.844811385588579e-06, + "loss": 5.4883, + "step": 15725 + }, + { + "epoch": 0.3200276692708333, + "grad_norm": 21.531919479370117, + "learning_rate": 9.844712566396419e-06, + "loss": 5.2432, + "step": 15730 + }, + { + "epoch": 0.32012939453125, + "grad_norm": 16.73660659790039, + "learning_rate": 9.844613716248085e-06, + "loss": 5.4796, + "step": 15735 + }, + { + "epoch": 0.3202311197916667, + "grad_norm": 18.05613899230957, + "learning_rate": 9.844514835144207e-06, + "loss": 5.6286, + "step": 15740 + }, + { + "epoch": 0.3203328450520833, + "grad_norm": 13.839470863342285, + "learning_rate": 9.844415923085416e-06, + "loss": 5.5799, + "step": 15745 + }, + { + "epoch": 0.3204345703125, + "grad_norm": 20.764781951904297, + "learning_rate": 9.844316980072345e-06, + "loss": 5.2695, + "step": 15750 + }, + { + "epoch": 0.3205362955729167, + "grad_norm": 19.460630416870117, + "learning_rate": 9.844218006105628e-06, + "loss": 5.369, + "step": 15755 + }, + { + "epoch": 0.3206380208333333, + "grad_norm": 15.628459930419922, + "learning_rate": 9.844119001185895e-06, + "loss": 5.5141, + "step": 15760 + }, + { + "epoch": 0.32073974609375, + "grad_norm": 14.388643264770508, + "learning_rate": 9.844019965313778e-06, + "loss": 5.3209, + "step": 15765 + }, + { + "epoch": 0.3208414713541667, + "grad_norm": 14.763102531433105, + "learning_rate": 9.843920898489911e-06, + "loss": 5.3183, + "step": 15770 + }, + { + "epoch": 0.3209431966145833, + "grad_norm": 19.93488883972168, + "learning_rate": 9.84382180071493e-06, + "loss": 5.4945, + "step": 15775 + }, + { + "epoch": 0.321044921875, + "grad_norm": 17.380834579467773, + "learning_rate": 9.843722671989462e-06, + "loss": 5.2745, + "step": 15780 + }, + { + "epoch": 0.3211466471354167, + "grad_norm": 12.465240478515625, + "learning_rate": 9.843623512314147e-06, + "loss": 5.211, + "step": 15785 + }, + { + "epoch": 0.3212483723958333, + "grad_norm": 13.063862800598145, + "learning_rate": 9.843524321689612e-06, + "loss": 5.1211, + "step": 15790 + }, + { + "epoch": 0.32135009765625, + "grad_norm": 18.48358726501465, + "learning_rate": 9.843425100116497e-06, + "loss": 5.1839, + "step": 15795 + }, + { + "epoch": 0.3214518229166667, + "grad_norm": 13.43949031829834, + "learning_rate": 9.843325847595433e-06, + "loss": 5.2109, + "step": 15800 + }, + { + "epoch": 0.3215535481770833, + "grad_norm": 15.87829875946045, + "learning_rate": 9.843226564127052e-06, + "loss": 5.3521, + "step": 15805 + }, + { + "epoch": 0.3216552734375, + "grad_norm": 18.145612716674805, + "learning_rate": 9.843127249711992e-06, + "loss": 5.2576, + "step": 15810 + }, + { + "epoch": 0.3217569986979167, + "grad_norm": 14.219240188598633, + "learning_rate": 9.843027904350885e-06, + "loss": 5.4418, + "step": 15815 + }, + { + "epoch": 0.3218587239583333, + "grad_norm": 19.810062408447266, + "learning_rate": 9.842928528044368e-06, + "loss": 5.0902, + "step": 15820 + }, + { + "epoch": 0.32196044921875, + "grad_norm": 14.486769676208496, + "learning_rate": 9.842829120793076e-06, + "loss": 5.1249, + "step": 15825 + }, + { + "epoch": 0.3220621744791667, + "grad_norm": 16.043384552001953, + "learning_rate": 9.842729682597642e-06, + "loss": 5.201, + "step": 15830 + }, + { + "epoch": 0.3221638997395833, + "grad_norm": 16.04343605041504, + "learning_rate": 9.842630213458705e-06, + "loss": 5.5518, + "step": 15835 + }, + { + "epoch": 0.322265625, + "grad_norm": 22.09510040283203, + "learning_rate": 9.842530713376896e-06, + "loss": 5.4679, + "step": 15840 + }, + { + "epoch": 0.3223673502604167, + "grad_norm": 21.745838165283203, + "learning_rate": 9.842431182352853e-06, + "loss": 5.1826, + "step": 15845 + }, + { + "epoch": 0.3224690755208333, + "grad_norm": 17.333345413208008, + "learning_rate": 9.842331620387212e-06, + "loss": 5.4874, + "step": 15850 + }, + { + "epoch": 0.32257080078125, + "grad_norm": 22.518373489379883, + "learning_rate": 9.84223202748061e-06, + "loss": 5.348, + "step": 15855 + }, + { + "epoch": 0.3226725260416667, + "grad_norm": 18.1658992767334, + "learning_rate": 9.842132403633682e-06, + "loss": 5.2685, + "step": 15860 + }, + { + "epoch": 0.3227742513020833, + "grad_norm": 17.215030670166016, + "learning_rate": 9.842032748847064e-06, + "loss": 5.2515, + "step": 15865 + }, + { + "epoch": 0.3228759765625, + "grad_norm": 15.029926300048828, + "learning_rate": 9.841933063121395e-06, + "loss": 5.4725, + "step": 15870 + }, + { + "epoch": 0.3229777018229167, + "grad_norm": 16.197275161743164, + "learning_rate": 9.84183334645731e-06, + "loss": 5.2499, + "step": 15875 + }, + { + "epoch": 0.3230794270833333, + "grad_norm": 15.736812591552734, + "learning_rate": 9.841733598855448e-06, + "loss": 5.3884, + "step": 15880 + }, + { + "epoch": 0.32318115234375, + "grad_norm": 22.05318832397461, + "learning_rate": 9.841633820316443e-06, + "loss": 5.421, + "step": 15885 + }, + { + "epoch": 0.3232828776041667, + "grad_norm": 13.801077842712402, + "learning_rate": 9.841534010840938e-06, + "loss": 5.1322, + "step": 15890 + }, + { + "epoch": 0.3233846028645833, + "grad_norm": 15.162664413452148, + "learning_rate": 9.841434170429566e-06, + "loss": 5.0706, + "step": 15895 + }, + { + "epoch": 0.323486328125, + "grad_norm": 19.792156219482422, + "learning_rate": 9.841334299082966e-06, + "loss": 5.2828, + "step": 15900 + }, + { + "epoch": 0.3235880533854167, + "grad_norm": 15.234286308288574, + "learning_rate": 9.841234396801777e-06, + "loss": 5.398, + "step": 15905 + }, + { + "epoch": 0.3236897786458333, + "grad_norm": 16.25388526916504, + "learning_rate": 9.841134463586637e-06, + "loss": 5.1969, + "step": 15910 + }, + { + "epoch": 0.32379150390625, + "grad_norm": 17.75886344909668, + "learning_rate": 9.841034499438186e-06, + "loss": 5.4579, + "step": 15915 + }, + { + "epoch": 0.3238932291666667, + "grad_norm": 21.733030319213867, + "learning_rate": 9.84093450435706e-06, + "loss": 5.084, + "step": 15920 + }, + { + "epoch": 0.3239949544270833, + "grad_norm": 15.321318626403809, + "learning_rate": 9.840834478343899e-06, + "loss": 5.3468, + "step": 15925 + }, + { + "epoch": 0.3240966796875, + "grad_norm": 20.147680282592773, + "learning_rate": 9.840734421399342e-06, + "loss": 5.3708, + "step": 15930 + }, + { + "epoch": 0.3241984049479167, + "grad_norm": 19.73408317565918, + "learning_rate": 9.84063433352403e-06, + "loss": 5.5004, + "step": 15935 + }, + { + "epoch": 0.3243001302083333, + "grad_norm": 17.03325080871582, + "learning_rate": 9.840534214718601e-06, + "loss": 5.3649, + "step": 15940 + }, + { + "epoch": 0.32440185546875, + "grad_norm": 15.700655937194824, + "learning_rate": 9.840434064983695e-06, + "loss": 5.1436, + "step": 15945 + }, + { + "epoch": 0.3245035807291667, + "grad_norm": 15.255401611328125, + "learning_rate": 9.84033388431995e-06, + "loss": 5.397, + "step": 15950 + }, + { + "epoch": 0.3246053059895833, + "grad_norm": 21.623292922973633, + "learning_rate": 9.84023367272801e-06, + "loss": 5.2584, + "step": 15955 + }, + { + "epoch": 0.32470703125, + "grad_norm": 17.877378463745117, + "learning_rate": 9.840133430208514e-06, + "loss": 5.2089, + "step": 15960 + }, + { + "epoch": 0.3248087565104167, + "grad_norm": 14.101017951965332, + "learning_rate": 9.840033156762101e-06, + "loss": 5.0889, + "step": 15965 + }, + { + "epoch": 0.3249104817708333, + "grad_norm": 13.586960792541504, + "learning_rate": 9.83993285238941e-06, + "loss": 5.173, + "step": 15970 + }, + { + "epoch": 0.32501220703125, + "grad_norm": 13.3849458694458, + "learning_rate": 9.839832517091088e-06, + "loss": 5.2181, + "step": 15975 + }, + { + "epoch": 0.3251139322916667, + "grad_norm": 24.129383087158203, + "learning_rate": 9.839732150867772e-06, + "loss": 5.4265, + "step": 15980 + }, + { + "epoch": 0.3252156575520833, + "grad_norm": 13.989419937133789, + "learning_rate": 9.839631753720103e-06, + "loss": 5.3746, + "step": 15985 + }, + { + "epoch": 0.3253173828125, + "grad_norm": 14.476128578186035, + "learning_rate": 9.839531325648724e-06, + "loss": 5.0901, + "step": 15990 + }, + { + "epoch": 0.3254191080729167, + "grad_norm": 15.122712135314941, + "learning_rate": 9.839430866654275e-06, + "loss": 5.1557, + "step": 15995 + }, + { + "epoch": 0.3255208333333333, + "grad_norm": 15.08204174041748, + "learning_rate": 9.8393303767374e-06, + "loss": 5.4781, + "step": 16000 + }, + { + "epoch": 0.32562255859375, + "grad_norm": 15.780187606811523, + "learning_rate": 9.83922985589874e-06, + "loss": 5.2928, + "step": 16005 + }, + { + "epoch": 0.3257242838541667, + "grad_norm": 16.716217041015625, + "learning_rate": 9.839129304138936e-06, + "loss": 5.3061, + "step": 16010 + }, + { + "epoch": 0.3258260091145833, + "grad_norm": 16.31974220275879, + "learning_rate": 9.839028721458634e-06, + "loss": 5.1243, + "step": 16015 + }, + { + "epoch": 0.325927734375, + "grad_norm": 17.855297088623047, + "learning_rate": 9.838928107858475e-06, + "loss": 5.2309, + "step": 16020 + }, + { + "epoch": 0.3260294596354167, + "grad_norm": 17.676563262939453, + "learning_rate": 9.8388274633391e-06, + "loss": 5.2608, + "step": 16025 + }, + { + "epoch": 0.3261311848958333, + "grad_norm": 15.336312294006348, + "learning_rate": 9.838726787901153e-06, + "loss": 5.1872, + "step": 16030 + }, + { + "epoch": 0.32623291015625, + "grad_norm": 17.932292938232422, + "learning_rate": 9.838626081545278e-06, + "loss": 5.3401, + "step": 16035 + }, + { + "epoch": 0.3263346354166667, + "grad_norm": 17.403383255004883, + "learning_rate": 9.838525344272119e-06, + "loss": 5.4753, + "step": 16040 + }, + { + "epoch": 0.3264363606770833, + "grad_norm": 16.9847469329834, + "learning_rate": 9.83842457608232e-06, + "loss": 5.3796, + "step": 16045 + }, + { + "epoch": 0.3265380859375, + "grad_norm": 15.372896194458008, + "learning_rate": 9.838323776976522e-06, + "loss": 5.6292, + "step": 16050 + }, + { + "epoch": 0.3266398111979167, + "grad_norm": 13.782724380493164, + "learning_rate": 9.838222946955371e-06, + "loss": 5.3248, + "step": 16055 + }, + { + "epoch": 0.3267415364583333, + "grad_norm": 21.121484756469727, + "learning_rate": 9.838122086019513e-06, + "loss": 5.6077, + "step": 16060 + }, + { + "epoch": 0.32684326171875, + "grad_norm": 26.734724044799805, + "learning_rate": 9.83802119416959e-06, + "loss": 5.2465, + "step": 16065 + }, + { + "epoch": 0.3269449869791667, + "grad_norm": 14.547486305236816, + "learning_rate": 9.837920271406247e-06, + "loss": 5.0805, + "step": 16070 + }, + { + "epoch": 0.3270467122395833, + "grad_norm": 22.25446319580078, + "learning_rate": 9.837819317730129e-06, + "loss": 5.2296, + "step": 16075 + }, + { + "epoch": 0.3271484375, + "grad_norm": 21.70213508605957, + "learning_rate": 9.837718333141882e-06, + "loss": 5.2877, + "step": 16080 + }, + { + "epoch": 0.3272501627604167, + "grad_norm": 13.753722190856934, + "learning_rate": 9.83761731764215e-06, + "loss": 5.1915, + "step": 16085 + }, + { + "epoch": 0.3273518880208333, + "grad_norm": 16.20344352722168, + "learning_rate": 9.837516271231578e-06, + "loss": 5.2156, + "step": 16090 + }, + { + "epoch": 0.32745361328125, + "grad_norm": 16.67436981201172, + "learning_rate": 9.837415193910815e-06, + "loss": 5.4647, + "step": 16095 + }, + { + "epoch": 0.3275553385416667, + "grad_norm": 16.633541107177734, + "learning_rate": 9.837314085680503e-06, + "loss": 5.3488, + "step": 16100 + }, + { + "epoch": 0.3276570638020833, + "grad_norm": 16.57516098022461, + "learning_rate": 9.83721294654129e-06, + "loss": 5.0709, + "step": 16105 + }, + { + "epoch": 0.3277587890625, + "grad_norm": 15.485136032104492, + "learning_rate": 9.837111776493822e-06, + "loss": 5.1631, + "step": 16110 + }, + { + "epoch": 0.3278605143229167, + "grad_norm": 16.027999877929688, + "learning_rate": 9.837010575538743e-06, + "loss": 5.461, + "step": 16115 + }, + { + "epoch": 0.3279622395833333, + "grad_norm": 17.428754806518555, + "learning_rate": 9.836909343676705e-06, + "loss": 5.548, + "step": 16120 + }, + { + "epoch": 0.32806396484375, + "grad_norm": 17.506799697875977, + "learning_rate": 9.83680808090835e-06, + "loss": 5.179, + "step": 16125 + }, + { + "epoch": 0.3281656901041667, + "grad_norm": 22.95178985595703, + "learning_rate": 9.836706787234327e-06, + "loss": 4.9995, + "step": 16130 + }, + { + "epoch": 0.3282674153645833, + "grad_norm": 16.975730895996094, + "learning_rate": 9.836605462655285e-06, + "loss": 5.1771, + "step": 16135 + }, + { + "epoch": 0.328369140625, + "grad_norm": 17.799896240234375, + "learning_rate": 9.836504107171868e-06, + "loss": 5.2905, + "step": 16140 + }, + { + "epoch": 0.3284708658854167, + "grad_norm": 20.49864959716797, + "learning_rate": 9.836402720784723e-06, + "loss": 5.0183, + "step": 16145 + }, + { + "epoch": 0.3285725911458333, + "grad_norm": 17.613447189331055, + "learning_rate": 9.836301303494502e-06, + "loss": 5.2951, + "step": 16150 + }, + { + "epoch": 0.32867431640625, + "grad_norm": 21.50612449645996, + "learning_rate": 9.836199855301852e-06, + "loss": 5.3276, + "step": 16155 + }, + { + "epoch": 0.3287760416666667, + "grad_norm": 15.892477989196777, + "learning_rate": 9.836098376207417e-06, + "loss": 5.3658, + "step": 16160 + }, + { + "epoch": 0.3288777669270833, + "grad_norm": 17.519254684448242, + "learning_rate": 9.835996866211851e-06, + "loss": 4.9747, + "step": 16165 + }, + { + "epoch": 0.3289794921875, + "grad_norm": 15.167780876159668, + "learning_rate": 9.8358953253158e-06, + "loss": 5.2358, + "step": 16170 + }, + { + "epoch": 0.3290812174479167, + "grad_norm": 16.294109344482422, + "learning_rate": 9.835793753519911e-06, + "loss": 5.0759, + "step": 16175 + }, + { + "epoch": 0.3291829427083333, + "grad_norm": 26.600502014160156, + "learning_rate": 9.835692150824838e-06, + "loss": 5.3139, + "step": 16180 + }, + { + "epoch": 0.32928466796875, + "grad_norm": 20.803499221801758, + "learning_rate": 9.835590517231225e-06, + "loss": 5.1773, + "step": 16185 + }, + { + "epoch": 0.3293863932291667, + "grad_norm": 18.689552307128906, + "learning_rate": 9.835488852739725e-06, + "loss": 5.3637, + "step": 16190 + }, + { + "epoch": 0.3294881184895833, + "grad_norm": 17.80657386779785, + "learning_rate": 9.835387157350985e-06, + "loss": 5.6772, + "step": 16195 + }, + { + "epoch": 0.32958984375, + "grad_norm": 16.85621452331543, + "learning_rate": 9.835285431065656e-06, + "loss": 5.2413, + "step": 16200 + }, + { + "epoch": 0.3296915690104167, + "grad_norm": 15.170636177062988, + "learning_rate": 9.83518367388439e-06, + "loss": 5.4113, + "step": 16205 + }, + { + "epoch": 0.3297932942708333, + "grad_norm": 13.861109733581543, + "learning_rate": 9.835081885807833e-06, + "loss": 5.4668, + "step": 16210 + }, + { + "epoch": 0.32989501953125, + "grad_norm": 18.297426223754883, + "learning_rate": 9.83498006683664e-06, + "loss": 5.2306, + "step": 16215 + }, + { + "epoch": 0.3299967447916667, + "grad_norm": 17.92118263244629, + "learning_rate": 9.834878216971456e-06, + "loss": 5.013, + "step": 16220 + }, + { + "epoch": 0.3300984700520833, + "grad_norm": 18.017297744750977, + "learning_rate": 9.834776336212937e-06, + "loss": 5.1391, + "step": 16225 + }, + { + "epoch": 0.3302001953125, + "grad_norm": 16.669879913330078, + "learning_rate": 9.834674424561732e-06, + "loss": 5.2501, + "step": 16230 + }, + { + "epoch": 0.3303019205729167, + "grad_norm": 15.708576202392578, + "learning_rate": 9.834572482018492e-06, + "loss": 5.1069, + "step": 16235 + }, + { + "epoch": 0.3304036458333333, + "grad_norm": 16.72035789489746, + "learning_rate": 9.834470508583867e-06, + "loss": 5.2164, + "step": 16240 + }, + { + "epoch": 0.33050537109375, + "grad_norm": 19.0867919921875, + "learning_rate": 9.834368504258511e-06, + "loss": 5.239, + "step": 16245 + }, + { + "epoch": 0.3306070963541667, + "grad_norm": 21.910552978515625, + "learning_rate": 9.834266469043077e-06, + "loss": 5.6541, + "step": 16250 + }, + { + "epoch": 0.3307088216145833, + "grad_norm": 20.525835037231445, + "learning_rate": 9.834164402938213e-06, + "loss": 5.1335, + "step": 16255 + }, + { + "epoch": 0.330810546875, + "grad_norm": 14.074350357055664, + "learning_rate": 9.834062305944571e-06, + "loss": 5.4635, + "step": 16260 + }, + { + "epoch": 0.3309122721354167, + "grad_norm": 14.683272361755371, + "learning_rate": 9.833960178062809e-06, + "loss": 5.5626, + "step": 16265 + }, + { + "epoch": 0.3310139973958333, + "grad_norm": 15.453561782836914, + "learning_rate": 9.833858019293573e-06, + "loss": 5.3632, + "step": 16270 + }, + { + "epoch": 0.33111572265625, + "grad_norm": 17.11794662475586, + "learning_rate": 9.83375582963752e-06, + "loss": 5.232, + "step": 16275 + }, + { + "epoch": 0.3312174479166667, + "grad_norm": 15.207950592041016, + "learning_rate": 9.833653609095304e-06, + "loss": 5.1496, + "step": 16280 + }, + { + "epoch": 0.3313191731770833, + "grad_norm": 13.997062683105469, + "learning_rate": 9.833551357667574e-06, + "loss": 5.2121, + "step": 16285 + }, + { + "epoch": 0.3314208984375, + "grad_norm": 17.319541931152344, + "learning_rate": 9.833449075354986e-06, + "loss": 5.4247, + "step": 16290 + }, + { + "epoch": 0.3315226236979167, + "grad_norm": 16.819961547851562, + "learning_rate": 9.833346762158191e-06, + "loss": 5.5166, + "step": 16295 + }, + { + "epoch": 0.3316243489583333, + "grad_norm": 16.13787841796875, + "learning_rate": 9.833244418077846e-06, + "loss": 4.9247, + "step": 16300 + }, + { + "epoch": 0.33172607421875, + "grad_norm": 19.282262802124023, + "learning_rate": 9.833142043114601e-06, + "loss": 5.2714, + "step": 16305 + }, + { + "epoch": 0.3318277994791667, + "grad_norm": 15.443867683410645, + "learning_rate": 9.833039637269114e-06, + "loss": 5.1672, + "step": 16310 + }, + { + "epoch": 0.3319295247395833, + "grad_norm": 12.31566047668457, + "learning_rate": 9.832937200542038e-06, + "loss": 5.1745, + "step": 16315 + }, + { + "epoch": 0.33203125, + "grad_norm": 13.998064994812012, + "learning_rate": 9.83283473293403e-06, + "loss": 5.5291, + "step": 16320 + }, + { + "epoch": 0.3321329752604167, + "grad_norm": 18.06601905822754, + "learning_rate": 9.832732234445739e-06, + "loss": 5.3774, + "step": 16325 + }, + { + "epoch": 0.3322347005208333, + "grad_norm": 19.758167266845703, + "learning_rate": 9.832629705077824e-06, + "loss": 5.3426, + "step": 16330 + }, + { + "epoch": 0.33233642578125, + "grad_norm": 18.477201461791992, + "learning_rate": 9.832527144830938e-06, + "loss": 5.3738, + "step": 16335 + }, + { + "epoch": 0.3324381510416667, + "grad_norm": 14.801587104797363, + "learning_rate": 9.832424553705739e-06, + "loss": 5.1856, + "step": 16340 + }, + { + "epoch": 0.3325398763020833, + "grad_norm": 15.26753044128418, + "learning_rate": 9.832321931702882e-06, + "loss": 5.2132, + "step": 16345 + }, + { + "epoch": 0.3326416015625, + "grad_norm": 25.262344360351562, + "learning_rate": 9.832219278823022e-06, + "loss": 5.2782, + "step": 16350 + }, + { + "epoch": 0.3327433268229167, + "grad_norm": 24.50555419921875, + "learning_rate": 9.832116595066813e-06, + "loss": 4.9814, + "step": 16355 + }, + { + "epoch": 0.3328450520833333, + "grad_norm": 20.115705490112305, + "learning_rate": 9.832013880434913e-06, + "loss": 5.4382, + "step": 16360 + }, + { + "epoch": 0.33294677734375, + "grad_norm": 16.96609878540039, + "learning_rate": 9.831911134927978e-06, + "loss": 5.1947, + "step": 16365 + }, + { + "epoch": 0.3330485026041667, + "grad_norm": 16.732683181762695, + "learning_rate": 9.831808358546666e-06, + "loss": 5.1474, + "step": 16370 + }, + { + "epoch": 0.3331502278645833, + "grad_norm": 14.545509338378906, + "learning_rate": 9.831705551291632e-06, + "loss": 5.2101, + "step": 16375 + }, + { + "epoch": 0.333251953125, + "grad_norm": 14.65388011932373, + "learning_rate": 9.831602713163533e-06, + "loss": 4.7243, + "step": 16380 + }, + { + "epoch": 0.3333536783854167, + "grad_norm": 13.219878196716309, + "learning_rate": 9.831499844163026e-06, + "loss": 5.4184, + "step": 16385 + }, + { + "epoch": 0.3334554036458333, + "grad_norm": 15.418916702270508, + "learning_rate": 9.831396944290769e-06, + "loss": 5.4155, + "step": 16390 + }, + { + "epoch": 0.33355712890625, + "grad_norm": 13.778020858764648, + "learning_rate": 9.831294013547422e-06, + "loss": 5.2076, + "step": 16395 + }, + { + "epoch": 0.3336588541666667, + "grad_norm": 18.67862319946289, + "learning_rate": 9.831191051933637e-06, + "loss": 5.1427, + "step": 16400 + }, + { + "epoch": 0.3337605794270833, + "grad_norm": 12.652849197387695, + "learning_rate": 9.831088059450075e-06, + "loss": 5.606, + "step": 16405 + }, + { + "epoch": 0.3338623046875, + "grad_norm": 16.722017288208008, + "learning_rate": 9.830985036097393e-06, + "loss": 5.2133, + "step": 16410 + }, + { + "epoch": 0.3339640299479167, + "grad_norm": 16.1689453125, + "learning_rate": 9.830881981876253e-06, + "loss": 5.2158, + "step": 16415 + }, + { + "epoch": 0.3340657552083333, + "grad_norm": 19.454757690429688, + "learning_rate": 9.830778896787307e-06, + "loss": 5.2221, + "step": 16420 + }, + { + "epoch": 0.33416748046875, + "grad_norm": 17.111122131347656, + "learning_rate": 9.830675780831221e-06, + "loss": 5.4745, + "step": 16425 + }, + { + "epoch": 0.3342692057291667, + "grad_norm": 16.00069236755371, + "learning_rate": 9.830572634008649e-06, + "loss": 5.4522, + "step": 16430 + }, + { + "epoch": 0.3343709309895833, + "grad_norm": 16.661144256591797, + "learning_rate": 9.830469456320249e-06, + "loss": 5.3412, + "step": 16435 + }, + { + "epoch": 0.33447265625, + "grad_norm": 17.0942325592041, + "learning_rate": 9.830366247766685e-06, + "loss": 5.3267, + "step": 16440 + }, + { + "epoch": 0.3345743815104167, + "grad_norm": 14.915984153747559, + "learning_rate": 9.830263008348613e-06, + "loss": 5.2976, + "step": 16445 + }, + { + "epoch": 0.3346761067708333, + "grad_norm": 17.3852481842041, + "learning_rate": 9.830159738066694e-06, + "loss": 5.276, + "step": 16450 + }, + { + "epoch": 0.33477783203125, + "grad_norm": 17.80579948425293, + "learning_rate": 9.830056436921589e-06, + "loss": 5.276, + "step": 16455 + }, + { + "epoch": 0.3348795572916667, + "grad_norm": 21.75140953063965, + "learning_rate": 9.829953104913954e-06, + "loss": 5.6956, + "step": 16460 + }, + { + "epoch": 0.3349812825520833, + "grad_norm": 12.784041404724121, + "learning_rate": 9.829849742044455e-06, + "loss": 5.2076, + "step": 16465 + }, + { + "epoch": 0.3350830078125, + "grad_norm": 23.14268684387207, + "learning_rate": 9.829746348313747e-06, + "loss": 5.1576, + "step": 16470 + }, + { + "epoch": 0.3351847330729167, + "grad_norm": 15.601995468139648, + "learning_rate": 9.829642923722494e-06, + "loss": 5.1361, + "step": 16475 + }, + { + "epoch": 0.3352864583333333, + "grad_norm": 22.527070999145508, + "learning_rate": 9.829539468271355e-06, + "loss": 5.0607, + "step": 16480 + }, + { + "epoch": 0.33538818359375, + "grad_norm": 17.154741287231445, + "learning_rate": 9.829435981960991e-06, + "loss": 4.889, + "step": 16485 + }, + { + "epoch": 0.3354899088541667, + "grad_norm": 17.395294189453125, + "learning_rate": 9.829332464792066e-06, + "loss": 5.5666, + "step": 16490 + }, + { + "epoch": 0.3355916341145833, + "grad_norm": 21.253732681274414, + "learning_rate": 9.829228916765238e-06, + "loss": 5.0504, + "step": 16495 + }, + { + "epoch": 0.335693359375, + "grad_norm": 17.335479736328125, + "learning_rate": 9.829125337881172e-06, + "loss": 5.3576, + "step": 16500 + }, + { + "epoch": 0.3357950846354167, + "grad_norm": 18.47808074951172, + "learning_rate": 9.829021728140529e-06, + "loss": 5.322, + "step": 16505 + }, + { + "epoch": 0.3358968098958333, + "grad_norm": 17.831315994262695, + "learning_rate": 9.828918087543966e-06, + "loss": 5.1951, + "step": 16510 + }, + { + "epoch": 0.33599853515625, + "grad_norm": 16.386072158813477, + "learning_rate": 9.828814416092153e-06, + "loss": 5.3885, + "step": 16515 + }, + { + "epoch": 0.3361002604166667, + "grad_norm": 16.926864624023438, + "learning_rate": 9.828710713785747e-06, + "loss": 5.3392, + "step": 16520 + }, + { + "epoch": 0.3362019856770833, + "grad_norm": 13.274506568908691, + "learning_rate": 9.828606980625414e-06, + "loss": 5.1877, + "step": 16525 + }, + { + "epoch": 0.3363037109375, + "grad_norm": 17.150936126708984, + "learning_rate": 9.828503216611813e-06, + "loss": 5.6168, + "step": 16530 + }, + { + "epoch": 0.3364054361979167, + "grad_norm": 19.540985107421875, + "learning_rate": 9.828399421745612e-06, + "loss": 5.2065, + "step": 16535 + }, + { + "epoch": 0.3365071614583333, + "grad_norm": 22.179405212402344, + "learning_rate": 9.828295596027468e-06, + "loss": 5.0449, + "step": 16540 + }, + { + "epoch": 0.33660888671875, + "grad_norm": 18.90184211730957, + "learning_rate": 9.82819173945805e-06, + "loss": 5.1258, + "step": 16545 + }, + { + "epoch": 0.3367106119791667, + "grad_norm": 15.375399589538574, + "learning_rate": 9.82808785203802e-06, + "loss": 5.3815, + "step": 16550 + }, + { + "epoch": 0.3368123372395833, + "grad_norm": 14.661465644836426, + "learning_rate": 9.82798393376804e-06, + "loss": 5.077, + "step": 16555 + }, + { + "epoch": 0.3369140625, + "grad_norm": 14.57669734954834, + "learning_rate": 9.827879984648776e-06, + "loss": 5.3503, + "step": 16560 + }, + { + "epoch": 0.3370157877604167, + "grad_norm": 21.415790557861328, + "learning_rate": 9.82777600468089e-06, + "loss": 5.1767, + "step": 16565 + }, + { + "epoch": 0.3371175130208333, + "grad_norm": 16.040849685668945, + "learning_rate": 9.82767199386505e-06, + "loss": 5.4681, + "step": 16570 + }, + { + "epoch": 0.33721923828125, + "grad_norm": 14.055045127868652, + "learning_rate": 9.827567952201917e-06, + "loss": 5.374, + "step": 16575 + }, + { + "epoch": 0.3373209635416667, + "grad_norm": 15.305682182312012, + "learning_rate": 9.827463879692158e-06, + "loss": 5.5353, + "step": 16580 + }, + { + "epoch": 0.3374226888020833, + "grad_norm": 20.056934356689453, + "learning_rate": 9.827359776336436e-06, + "loss": 5.4329, + "step": 16585 + }, + { + "epoch": 0.3375244140625, + "grad_norm": 17.213150024414062, + "learning_rate": 9.82725564213542e-06, + "loss": 5.1444, + "step": 16590 + }, + { + "epoch": 0.3376261393229167, + "grad_norm": 17.787824630737305, + "learning_rate": 9.82715147708977e-06, + "loss": 5.2693, + "step": 16595 + }, + { + "epoch": 0.3377278645833333, + "grad_norm": 14.085766792297363, + "learning_rate": 9.827047281200156e-06, + "loss": 5.3573, + "step": 16600 + }, + { + "epoch": 0.33782958984375, + "grad_norm": 14.792264938354492, + "learning_rate": 9.826943054467242e-06, + "loss": 5.1017, + "step": 16605 + }, + { + "epoch": 0.3379313151041667, + "grad_norm": 16.88961410522461, + "learning_rate": 9.826838796891693e-06, + "loss": 5.4184, + "step": 16610 + }, + { + "epoch": 0.3380330403645833, + "grad_norm": 13.406508445739746, + "learning_rate": 9.826734508474178e-06, + "loss": 5.1351, + "step": 16615 + }, + { + "epoch": 0.338134765625, + "grad_norm": 20.111583709716797, + "learning_rate": 9.82663018921536e-06, + "loss": 5.3534, + "step": 16620 + }, + { + "epoch": 0.3382364908854167, + "grad_norm": 15.691067695617676, + "learning_rate": 9.82652583911591e-06, + "loss": 5.2378, + "step": 16625 + }, + { + "epoch": 0.3383382161458333, + "grad_norm": 17.009740829467773, + "learning_rate": 9.82642145817649e-06, + "loss": 5.2904, + "step": 16630 + }, + { + "epoch": 0.33843994140625, + "grad_norm": 14.013676643371582, + "learning_rate": 9.826317046397769e-06, + "loss": 5.2586, + "step": 16635 + }, + { + "epoch": 0.3385416666666667, + "grad_norm": 17.01918601989746, + "learning_rate": 9.826212603780415e-06, + "loss": 5.0773, + "step": 16640 + }, + { + "epoch": 0.3386433919270833, + "grad_norm": 15.278485298156738, + "learning_rate": 9.826108130325093e-06, + "loss": 5.1504, + "step": 16645 + }, + { + "epoch": 0.3387451171875, + "grad_norm": 14.16551399230957, + "learning_rate": 9.826003626032473e-06, + "loss": 4.9157, + "step": 16650 + }, + { + "epoch": 0.3388468424479167, + "grad_norm": 14.244200706481934, + "learning_rate": 9.825899090903221e-06, + "loss": 5.1221, + "step": 16655 + }, + { + "epoch": 0.3389485677083333, + "grad_norm": 14.103958129882812, + "learning_rate": 9.825794524938007e-06, + "loss": 5.2839, + "step": 16660 + }, + { + "epoch": 0.33905029296875, + "grad_norm": 19.7537841796875, + "learning_rate": 9.825689928137499e-06, + "loss": 5.1674, + "step": 16665 + }, + { + "epoch": 0.3391520182291667, + "grad_norm": 19.157556533813477, + "learning_rate": 9.825585300502362e-06, + "loss": 5.0804, + "step": 16670 + }, + { + "epoch": 0.3392537434895833, + "grad_norm": 17.179624557495117, + "learning_rate": 9.825480642033267e-06, + "loss": 5.3016, + "step": 16675 + }, + { + "epoch": 0.33935546875, + "grad_norm": 15.815378189086914, + "learning_rate": 9.825375952730883e-06, + "loss": 5.312, + "step": 16680 + }, + { + "epoch": 0.3394571940104167, + "grad_norm": 12.402484893798828, + "learning_rate": 9.825271232595879e-06, + "loss": 5.4802, + "step": 16685 + }, + { + "epoch": 0.3395589192708333, + "grad_norm": 16.746810913085938, + "learning_rate": 9.825166481628923e-06, + "loss": 5.2314, + "step": 16690 + }, + { + "epoch": 0.33966064453125, + "grad_norm": 24.392669677734375, + "learning_rate": 9.825061699830685e-06, + "loss": 5.4275, + "step": 16695 + }, + { + "epoch": 0.3397623697916667, + "grad_norm": 14.158646583557129, + "learning_rate": 9.824956887201833e-06, + "loss": 5.2674, + "step": 16700 + }, + { + "epoch": 0.3398640950520833, + "grad_norm": 19.4167537689209, + "learning_rate": 9.824852043743041e-06, + "loss": 5.2511, + "step": 16705 + }, + { + "epoch": 0.3399658203125, + "grad_norm": 12.242448806762695, + "learning_rate": 9.824747169454973e-06, + "loss": 5.1374, + "step": 16710 + }, + { + "epoch": 0.3400675455729167, + "grad_norm": 23.29029655456543, + "learning_rate": 9.824642264338304e-06, + "loss": 5.4453, + "step": 16715 + }, + { + "epoch": 0.3401692708333333, + "grad_norm": 16.73177719116211, + "learning_rate": 9.824537328393701e-06, + "loss": 5.4738, + "step": 16720 + }, + { + "epoch": 0.34027099609375, + "grad_norm": 13.941364288330078, + "learning_rate": 9.824432361621838e-06, + "loss": 5.2564, + "step": 16725 + }, + { + "epoch": 0.3403727213541667, + "grad_norm": 18.128429412841797, + "learning_rate": 9.824327364023382e-06, + "loss": 5.1326, + "step": 16730 + }, + { + "epoch": 0.3404744466145833, + "grad_norm": 18.058685302734375, + "learning_rate": 9.824222335599007e-06, + "loss": 5.3856, + "step": 16735 + }, + { + "epoch": 0.340576171875, + "grad_norm": 21.59507179260254, + "learning_rate": 9.824117276349383e-06, + "loss": 5.1371, + "step": 16740 + }, + { + "epoch": 0.3406778971354167, + "grad_norm": 24.717905044555664, + "learning_rate": 9.82401218627518e-06, + "loss": 5.4476, + "step": 16745 + }, + { + "epoch": 0.3407796223958333, + "grad_norm": 17.555301666259766, + "learning_rate": 9.823907065377071e-06, + "loss": 5.2562, + "step": 16750 + }, + { + "epoch": 0.34088134765625, + "grad_norm": 28.74183464050293, + "learning_rate": 9.823801913655726e-06, + "loss": 5.3317, + "step": 16755 + }, + { + "epoch": 0.3409830729166667, + "grad_norm": 15.93504524230957, + "learning_rate": 9.823696731111819e-06, + "loss": 5.3108, + "step": 16760 + }, + { + "epoch": 0.3410847981770833, + "grad_norm": 14.878809928894043, + "learning_rate": 9.82359151774602e-06, + "loss": 5.334, + "step": 16765 + }, + { + "epoch": 0.3411865234375, + "grad_norm": 22.16539764404297, + "learning_rate": 9.823486273559004e-06, + "loss": 5.0583, + "step": 16770 + }, + { + "epoch": 0.3412882486979167, + "grad_norm": 15.263195037841797, + "learning_rate": 9.823380998551441e-06, + "loss": 5.1171, + "step": 16775 + }, + { + "epoch": 0.3413899739583333, + "grad_norm": 13.97872257232666, + "learning_rate": 9.823275692724006e-06, + "loss": 5.3641, + "step": 16780 + }, + { + "epoch": 0.34149169921875, + "grad_norm": 16.584976196289062, + "learning_rate": 9.82317035607737e-06, + "loss": 5.2056, + "step": 16785 + }, + { + "epoch": 0.3415934244791667, + "grad_norm": 15.55351448059082, + "learning_rate": 9.823064988612205e-06, + "loss": 4.9838, + "step": 16790 + }, + { + "epoch": 0.3416951497395833, + "grad_norm": 16.0356502532959, + "learning_rate": 9.822959590329186e-06, + "loss": 5.361, + "step": 16795 + }, + { + "epoch": 0.341796875, + "grad_norm": 14.441080093383789, + "learning_rate": 9.822854161228987e-06, + "loss": 5.5492, + "step": 16800 + }, + { + "epoch": 0.3418986002604167, + "grad_norm": 15.206454277038574, + "learning_rate": 9.82274870131228e-06, + "loss": 5.3224, + "step": 16805 + }, + { + "epoch": 0.3420003255208333, + "grad_norm": 16.26996612548828, + "learning_rate": 9.82264321057974e-06, + "loss": 5.2066, + "step": 16810 + }, + { + "epoch": 0.34210205078125, + "grad_norm": 20.00419044494629, + "learning_rate": 9.822537689032042e-06, + "loss": 5.2857, + "step": 16815 + }, + { + "epoch": 0.3422037760416667, + "grad_norm": 15.503232955932617, + "learning_rate": 9.822432136669859e-06, + "loss": 5.4447, + "step": 16820 + }, + { + "epoch": 0.3423055013020833, + "grad_norm": 21.52096939086914, + "learning_rate": 9.822326553493864e-06, + "loss": 5.1785, + "step": 16825 + }, + { + "epoch": 0.3424072265625, + "grad_norm": 18.623964309692383, + "learning_rate": 9.822220939504734e-06, + "loss": 5.111, + "step": 16830 + }, + { + "epoch": 0.3425089518229167, + "grad_norm": 17.037141799926758, + "learning_rate": 9.822115294703143e-06, + "loss": 5.2739, + "step": 16835 + }, + { + "epoch": 0.3426106770833333, + "grad_norm": 18.36560821533203, + "learning_rate": 9.822009619089766e-06, + "loss": 5.3047, + "step": 16840 + }, + { + "epoch": 0.34271240234375, + "grad_norm": 15.463924407958984, + "learning_rate": 9.821903912665278e-06, + "loss": 5.1168, + "step": 16845 + }, + { + "epoch": 0.3428141276041667, + "grad_norm": 16.680654525756836, + "learning_rate": 9.821798175430355e-06, + "loss": 5.2087, + "step": 16850 + }, + { + "epoch": 0.3429158528645833, + "grad_norm": 16.553937911987305, + "learning_rate": 9.82169240738567e-06, + "loss": 5.1917, + "step": 16855 + }, + { + "epoch": 0.343017578125, + "grad_norm": 16.826351165771484, + "learning_rate": 9.821586608531904e-06, + "loss": 5.2743, + "step": 16860 + }, + { + "epoch": 0.3431193033854167, + "grad_norm": 17.4448184967041, + "learning_rate": 9.82148077886973e-06, + "loss": 5.2244, + "step": 16865 + }, + { + "epoch": 0.3432210286458333, + "grad_norm": 15.116878509521484, + "learning_rate": 9.821374918399825e-06, + "loss": 5.1935, + "step": 16870 + }, + { + "epoch": 0.34332275390625, + "grad_norm": 19.22504234313965, + "learning_rate": 9.821269027122861e-06, + "loss": 5.1702, + "step": 16875 + }, + { + "epoch": 0.3434244791666667, + "grad_norm": 15.175506591796875, + "learning_rate": 9.821163105039521e-06, + "loss": 5.1809, + "step": 16880 + }, + { + "epoch": 0.3435262044270833, + "grad_norm": 15.887053489685059, + "learning_rate": 9.821057152150479e-06, + "loss": 5.2145, + "step": 16885 + }, + { + "epoch": 0.3436279296875, + "grad_norm": 16.342432022094727, + "learning_rate": 9.820951168456414e-06, + "loss": 5.2776, + "step": 16890 + }, + { + "epoch": 0.3437296549479167, + "grad_norm": 14.183149337768555, + "learning_rate": 9.820845153957998e-06, + "loss": 5.0784, + "step": 16895 + }, + { + "epoch": 0.3438313802083333, + "grad_norm": 17.274354934692383, + "learning_rate": 9.820739108655915e-06, + "loss": 5.3578, + "step": 16900 + }, + { + "epoch": 0.34393310546875, + "grad_norm": 15.425395965576172, + "learning_rate": 9.820633032550837e-06, + "loss": 5.5196, + "step": 16905 + }, + { + "epoch": 0.3440348307291667, + "grad_norm": 18.802488327026367, + "learning_rate": 9.820526925643446e-06, + "loss": 5.1489, + "step": 16910 + }, + { + "epoch": 0.3441365559895833, + "grad_norm": 22.824663162231445, + "learning_rate": 9.820420787934416e-06, + "loss": 4.9445, + "step": 16915 + }, + { + "epoch": 0.34423828125, + "grad_norm": 14.912516593933105, + "learning_rate": 9.82031461942443e-06, + "loss": 5.2102, + "step": 16920 + }, + { + "epoch": 0.3443400065104167, + "grad_norm": 16.88731575012207, + "learning_rate": 9.82020842011416e-06, + "loss": 5.4041, + "step": 16925 + }, + { + "epoch": 0.3444417317708333, + "grad_norm": 14.876289367675781, + "learning_rate": 9.820102190004293e-06, + "loss": 5.456, + "step": 16930 + }, + { + "epoch": 0.34454345703125, + "grad_norm": 20.09380340576172, + "learning_rate": 9.8199959290955e-06, + "loss": 5.4668, + "step": 16935 + }, + { + "epoch": 0.3446451822916667, + "grad_norm": 12.783712387084961, + "learning_rate": 9.819889637388465e-06, + "loss": 5.5431, + "step": 16940 + }, + { + "epoch": 0.3447469075520833, + "grad_norm": 19.589893341064453, + "learning_rate": 9.819783314883863e-06, + "loss": 5.5314, + "step": 16945 + }, + { + "epoch": 0.3448486328125, + "grad_norm": 14.410888671875, + "learning_rate": 9.819676961582377e-06, + "loss": 5.2513, + "step": 16950 + }, + { + "epoch": 0.3449503580729167, + "grad_norm": 17.818302154541016, + "learning_rate": 9.819570577484684e-06, + "loss": 5.0748, + "step": 16955 + }, + { + "epoch": 0.3450520833333333, + "grad_norm": 20.255395889282227, + "learning_rate": 9.819464162591467e-06, + "loss": 5.2073, + "step": 16960 + }, + { + "epoch": 0.34515380859375, + "grad_norm": 15.766462326049805, + "learning_rate": 9.819357716903405e-06, + "loss": 5.0763, + "step": 16965 + }, + { + "epoch": 0.3452555338541667, + "grad_norm": 18.82774543762207, + "learning_rate": 9.819251240421173e-06, + "loss": 5.2739, + "step": 16970 + }, + { + "epoch": 0.3453572591145833, + "grad_norm": 13.56013298034668, + "learning_rate": 9.819144733145459e-06, + "loss": 5.3268, + "step": 16975 + }, + { + "epoch": 0.345458984375, + "grad_norm": 17.874414443969727, + "learning_rate": 9.819038195076937e-06, + "loss": 5.2265, + "step": 16980 + }, + { + "epoch": 0.3455607096354167, + "grad_norm": 16.70048713684082, + "learning_rate": 9.818931626216293e-06, + "loss": 5.1685, + "step": 16985 + }, + { + "epoch": 0.3456624348958333, + "grad_norm": 16.166629791259766, + "learning_rate": 9.818825026564203e-06, + "loss": 4.9758, + "step": 16990 + }, + { + "epoch": 0.34576416015625, + "grad_norm": 12.847216606140137, + "learning_rate": 9.818718396121353e-06, + "loss": 4.9068, + "step": 16995 + }, + { + "epoch": 0.3458658854166667, + "grad_norm": 12.7055082321167, + "learning_rate": 9.81861173488842e-06, + "loss": 5.1592, + "step": 17000 + }, + { + "epoch": 0.3459676106770833, + "grad_norm": 21.08046531677246, + "learning_rate": 9.81850504286609e-06, + "loss": 5.2804, + "step": 17005 + }, + { + "epoch": 0.3460693359375, + "grad_norm": 13.11296272277832, + "learning_rate": 9.818398320055043e-06, + "loss": 5.0964, + "step": 17010 + }, + { + "epoch": 0.3461710611979167, + "grad_norm": 17.091571807861328, + "learning_rate": 9.818291566455958e-06, + "loss": 5.1544, + "step": 17015 + }, + { + "epoch": 0.3462727864583333, + "grad_norm": 32.71065902709961, + "learning_rate": 9.81818478206952e-06, + "loss": 5.6389, + "step": 17020 + }, + { + "epoch": 0.34637451171875, + "grad_norm": 17.299774169921875, + "learning_rate": 9.81807796689641e-06, + "loss": 5.1384, + "step": 17025 + }, + { + "epoch": 0.3464762369791667, + "grad_norm": 20.815074920654297, + "learning_rate": 9.817971120937313e-06, + "loss": 5.3114, + "step": 17030 + }, + { + "epoch": 0.3465779622395833, + "grad_norm": 17.838388442993164, + "learning_rate": 9.81786424419291e-06, + "loss": 5.5456, + "step": 17035 + }, + { + "epoch": 0.3466796875, + "grad_norm": 19.12969398498535, + "learning_rate": 9.817757336663882e-06, + "loss": 5.0567, + "step": 17040 + }, + { + "epoch": 0.3467814127604167, + "grad_norm": 19.140201568603516, + "learning_rate": 9.817650398350915e-06, + "loss": 5.1146, + "step": 17045 + }, + { + "epoch": 0.3468831380208333, + "grad_norm": 15.593518257141113, + "learning_rate": 9.817543429254693e-06, + "loss": 5.2675, + "step": 17050 + }, + { + "epoch": 0.34698486328125, + "grad_norm": 13.43029499053955, + "learning_rate": 9.817436429375896e-06, + "loss": 5.4325, + "step": 17055 + }, + { + "epoch": 0.3470865885416667, + "grad_norm": 17.58652114868164, + "learning_rate": 9.81732939871521e-06, + "loss": 5.6611, + "step": 17060 + }, + { + "epoch": 0.3471883138020833, + "grad_norm": 17.471946716308594, + "learning_rate": 9.817222337273317e-06, + "loss": 5.3954, + "step": 17065 + }, + { + "epoch": 0.3472900390625, + "grad_norm": 17.262592315673828, + "learning_rate": 9.817115245050903e-06, + "loss": 5.3633, + "step": 17070 + }, + { + "epoch": 0.3473917643229167, + "grad_norm": 26.49144172668457, + "learning_rate": 9.817008122048652e-06, + "loss": 5.0085, + "step": 17075 + }, + { + "epoch": 0.3474934895833333, + "grad_norm": 21.08639907836914, + "learning_rate": 9.81690096826725e-06, + "loss": 5.299, + "step": 17080 + }, + { + "epoch": 0.34759521484375, + "grad_norm": 19.665996551513672, + "learning_rate": 9.816793783707376e-06, + "loss": 5.4543, + "step": 17085 + }, + { + "epoch": 0.3476969401041667, + "grad_norm": 12.407235145568848, + "learning_rate": 9.816686568369721e-06, + "loss": 5.2103, + "step": 17090 + }, + { + "epoch": 0.3477986653645833, + "grad_norm": 15.58510971069336, + "learning_rate": 9.81657932225497e-06, + "loss": 5.4516, + "step": 17095 + }, + { + "epoch": 0.347900390625, + "grad_norm": 21.232818603515625, + "learning_rate": 9.816472045363803e-06, + "loss": 5.4973, + "step": 17100 + }, + { + "epoch": 0.3480021158854167, + "grad_norm": 24.878721237182617, + "learning_rate": 9.81636473769691e-06, + "loss": 5.3833, + "step": 17105 + }, + { + "epoch": 0.3481038411458333, + "grad_norm": 12.469351768493652, + "learning_rate": 9.816257399254975e-06, + "loss": 5.2807, + "step": 17110 + }, + { + "epoch": 0.34820556640625, + "grad_norm": 18.591156005859375, + "learning_rate": 9.816150030038684e-06, + "loss": 5.319, + "step": 17115 + }, + { + "epoch": 0.3483072916666667, + "grad_norm": 17.294422149658203, + "learning_rate": 9.816042630048723e-06, + "loss": 5.1545, + "step": 17120 + }, + { + "epoch": 0.3484090169270833, + "grad_norm": 13.303838729858398, + "learning_rate": 9.81593519928578e-06, + "loss": 5.0285, + "step": 17125 + }, + { + "epoch": 0.3485107421875, + "grad_norm": 18.02662467956543, + "learning_rate": 9.815827737750538e-06, + "loss": 5.2856, + "step": 17130 + }, + { + "epoch": 0.3486124674479167, + "grad_norm": 16.18470573425293, + "learning_rate": 9.815720245443685e-06, + "loss": 5.2464, + "step": 17135 + }, + { + "epoch": 0.3487141927083333, + "grad_norm": 18.756925582885742, + "learning_rate": 9.81561272236591e-06, + "loss": 4.9391, + "step": 17140 + }, + { + "epoch": 0.34881591796875, + "grad_norm": 14.984801292419434, + "learning_rate": 9.815505168517898e-06, + "loss": 5.0816, + "step": 17145 + }, + { + "epoch": 0.3489176432291667, + "grad_norm": 12.84400749206543, + "learning_rate": 9.815397583900335e-06, + "loss": 5.3552, + "step": 17150 + }, + { + "epoch": 0.3490193684895833, + "grad_norm": 19.020544052124023, + "learning_rate": 9.815289968513912e-06, + "loss": 5.1038, + "step": 17155 + }, + { + "epoch": 0.34912109375, + "grad_norm": 15.651814460754395, + "learning_rate": 9.815182322359313e-06, + "loss": 5.2241, + "step": 17160 + }, + { + "epoch": 0.3492228190104167, + "grad_norm": 17.264501571655273, + "learning_rate": 9.81507464543723e-06, + "loss": 5.3012, + "step": 17165 + }, + { + "epoch": 0.3493245442708333, + "grad_norm": 32.07028579711914, + "learning_rate": 9.814966937748347e-06, + "loss": 5.4121, + "step": 17170 + }, + { + "epoch": 0.34942626953125, + "grad_norm": 14.129508018493652, + "learning_rate": 9.814859199293354e-06, + "loss": 5.145, + "step": 17175 + }, + { + "epoch": 0.3495279947916667, + "grad_norm": 19.38291358947754, + "learning_rate": 9.814751430072938e-06, + "loss": 5.4843, + "step": 17180 + }, + { + "epoch": 0.3496297200520833, + "grad_norm": 17.30689811706543, + "learning_rate": 9.81464363008779e-06, + "loss": 5.3968, + "step": 17185 + }, + { + "epoch": 0.3497314453125, + "grad_norm": 12.762657165527344, + "learning_rate": 9.814535799338599e-06, + "loss": 5.0565, + "step": 17190 + }, + { + "epoch": 0.3498331705729167, + "grad_norm": 19.774076461791992, + "learning_rate": 9.81442793782605e-06, + "loss": 5.0404, + "step": 17195 + }, + { + "epoch": 0.3499348958333333, + "grad_norm": 15.181994438171387, + "learning_rate": 9.814320045550835e-06, + "loss": 5.269, + "step": 17200 + }, + { + "epoch": 0.35003662109375, + "grad_norm": 25.556119918823242, + "learning_rate": 9.814212122513644e-06, + "loss": 5.3342, + "step": 17205 + }, + { + "epoch": 0.3501383463541667, + "grad_norm": 12.365560531616211, + "learning_rate": 9.814104168715166e-06, + "loss": 5.2139, + "step": 17210 + }, + { + "epoch": 0.3502400716145833, + "grad_norm": 20.582868576049805, + "learning_rate": 9.81399618415609e-06, + "loss": 5.2562, + "step": 17215 + }, + { + "epoch": 0.350341796875, + "grad_norm": 13.605620384216309, + "learning_rate": 9.813888168837107e-06, + "loss": 5.6602, + "step": 17220 + }, + { + "epoch": 0.3504435221354167, + "grad_norm": 16.62467384338379, + "learning_rate": 9.813780122758906e-06, + "loss": 5.2154, + "step": 17225 + }, + { + "epoch": 0.3505452473958333, + "grad_norm": 12.935856819152832, + "learning_rate": 9.813672045922178e-06, + "loss": 5.3247, + "step": 17230 + }, + { + "epoch": 0.35064697265625, + "grad_norm": 14.276646614074707, + "learning_rate": 9.813563938327614e-06, + "loss": 5.3514, + "step": 17235 + }, + { + "epoch": 0.3507486979166667, + "grad_norm": 16.200063705444336, + "learning_rate": 9.813455799975907e-06, + "loss": 5.3576, + "step": 17240 + }, + { + "epoch": 0.3508504231770833, + "grad_norm": 16.51966094970703, + "learning_rate": 9.813347630867742e-06, + "loss": 5.1095, + "step": 17245 + }, + { + "epoch": 0.3509521484375, + "grad_norm": 23.302661895751953, + "learning_rate": 9.813239431003814e-06, + "loss": 5.1153, + "step": 17250 + }, + { + "epoch": 0.3510538736979167, + "grad_norm": 17.277721405029297, + "learning_rate": 9.813131200384816e-06, + "loss": 5.425, + "step": 17255 + }, + { + "epoch": 0.3511555989583333, + "grad_norm": 14.62983226776123, + "learning_rate": 9.813022939011434e-06, + "loss": 5.2333, + "step": 17260 + }, + { + "epoch": 0.35125732421875, + "grad_norm": 19.51612091064453, + "learning_rate": 9.812914646884366e-06, + "loss": 4.9735, + "step": 17265 + }, + { + "epoch": 0.3513590494791667, + "grad_norm": 12.948827743530273, + "learning_rate": 9.812806324004302e-06, + "loss": 5.2121, + "step": 17270 + }, + { + "epoch": 0.3514607747395833, + "grad_norm": 17.703073501586914, + "learning_rate": 9.812697970371932e-06, + "loss": 5.2634, + "step": 17275 + }, + { + "epoch": 0.3515625, + "grad_norm": 13.887580871582031, + "learning_rate": 9.81258958598795e-06, + "loss": 5.4997, + "step": 17280 + }, + { + "epoch": 0.3516642252604167, + "grad_norm": 16.070199966430664, + "learning_rate": 9.812481170853048e-06, + "loss": 5.2009, + "step": 17285 + }, + { + "epoch": 0.3517659505208333, + "grad_norm": 19.38536262512207, + "learning_rate": 9.81237272496792e-06, + "loss": 5.333, + "step": 17290 + }, + { + "epoch": 0.35186767578125, + "grad_norm": 13.750870704650879, + "learning_rate": 9.812264248333256e-06, + "loss": 5.2309, + "step": 17295 + }, + { + "epoch": 0.3519694010416667, + "grad_norm": 17.062255859375, + "learning_rate": 9.81215574094975e-06, + "loss": 5.0688, + "step": 17300 + }, + { + "epoch": 0.3520711263020833, + "grad_norm": 20.324447631835938, + "learning_rate": 9.812047202818101e-06, + "loss": 5.4803, + "step": 17305 + }, + { + "epoch": 0.3521728515625, + "grad_norm": 18.701086044311523, + "learning_rate": 9.811938633938996e-06, + "loss": 5.158, + "step": 17310 + }, + { + "epoch": 0.3522745768229167, + "grad_norm": 15.314988136291504, + "learning_rate": 9.81183003431313e-06, + "loss": 5.2563, + "step": 17315 + }, + { + "epoch": 0.3523763020833333, + "grad_norm": 16.51127052307129, + "learning_rate": 9.811721403941197e-06, + "loss": 5.275, + "step": 17320 + }, + { + "epoch": 0.35247802734375, + "grad_norm": 15.470259666442871, + "learning_rate": 9.811612742823892e-06, + "loss": 5.1321, + "step": 17325 + }, + { + "epoch": 0.3525797526041667, + "grad_norm": 17.751220703125, + "learning_rate": 9.81150405096191e-06, + "loss": 5.0451, + "step": 17330 + }, + { + "epoch": 0.3526814778645833, + "grad_norm": 15.167455673217773, + "learning_rate": 9.811395328355944e-06, + "loss": 5.3989, + "step": 17335 + }, + { + "epoch": 0.352783203125, + "grad_norm": 16.294313430786133, + "learning_rate": 9.811286575006689e-06, + "loss": 5.1129, + "step": 17340 + }, + { + "epoch": 0.3528849283854167, + "grad_norm": 20.912992477416992, + "learning_rate": 9.81117779091484e-06, + "loss": 5.255, + "step": 17345 + }, + { + "epoch": 0.3529866536458333, + "grad_norm": 22.214611053466797, + "learning_rate": 9.811068976081092e-06, + "loss": 5.2575, + "step": 17350 + }, + { + "epoch": 0.35308837890625, + "grad_norm": 14.647848129272461, + "learning_rate": 9.810960130506141e-06, + "loss": 5.3674, + "step": 17355 + }, + { + "epoch": 0.3531901041666667, + "grad_norm": 12.446414947509766, + "learning_rate": 9.810851254190681e-06, + "loss": 5.1753, + "step": 17360 + }, + { + "epoch": 0.3532918294270833, + "grad_norm": 18.20749282836914, + "learning_rate": 9.81074234713541e-06, + "loss": 5.1848, + "step": 17365 + }, + { + "epoch": 0.3533935546875, + "grad_norm": 17.99945831298828, + "learning_rate": 9.810633409341022e-06, + "loss": 5.2805, + "step": 17370 + }, + { + "epoch": 0.3534952799479167, + "grad_norm": 18.934860229492188, + "learning_rate": 9.810524440808213e-06, + "loss": 5.2641, + "step": 17375 + }, + { + "epoch": 0.3535970052083333, + "grad_norm": 17.365676879882812, + "learning_rate": 9.810415441537682e-06, + "loss": 5.2925, + "step": 17380 + }, + { + "epoch": 0.35369873046875, + "grad_norm": 14.581875801086426, + "learning_rate": 9.81030641153012e-06, + "loss": 5.4182, + "step": 17385 + }, + { + "epoch": 0.3538004557291667, + "grad_norm": 15.575628280639648, + "learning_rate": 9.81019735078623e-06, + "loss": 5.1266, + "step": 17390 + }, + { + "epoch": 0.3539021809895833, + "grad_norm": 15.626903533935547, + "learning_rate": 9.810088259306704e-06, + "loss": 5.3219, + "step": 17395 + }, + { + "epoch": 0.35400390625, + "grad_norm": 16.13385581970215, + "learning_rate": 9.809979137092242e-06, + "loss": 5.3756, + "step": 17400 + }, + { + "epoch": 0.3541056315104167, + "grad_norm": 16.793210983276367, + "learning_rate": 9.80986998414354e-06, + "loss": 5.288, + "step": 17405 + }, + { + "epoch": 0.3542073567708333, + "grad_norm": 20.270360946655273, + "learning_rate": 9.809760800461295e-06, + "loss": 5.2768, + "step": 17410 + }, + { + "epoch": 0.35430908203125, + "grad_norm": 20.970027923583984, + "learning_rate": 9.809651586046205e-06, + "loss": 5.0189, + "step": 17415 + }, + { + "epoch": 0.3544108072916667, + "grad_norm": 15.550622940063477, + "learning_rate": 9.80954234089897e-06, + "loss": 5.3397, + "step": 17420 + }, + { + "epoch": 0.3545125325520833, + "grad_norm": 16.720684051513672, + "learning_rate": 9.809433065020287e-06, + "loss": 5.4225, + "step": 17425 + }, + { + "epoch": 0.3546142578125, + "grad_norm": 13.941922187805176, + "learning_rate": 9.80932375841085e-06, + "loss": 5.3402, + "step": 17430 + }, + { + "epoch": 0.3547159830729167, + "grad_norm": 16.251407623291016, + "learning_rate": 9.809214421071363e-06, + "loss": 5.315, + "step": 17435 + }, + { + "epoch": 0.3548177083333333, + "grad_norm": 19.374116897583008, + "learning_rate": 9.809105053002522e-06, + "loss": 5.4665, + "step": 17440 + }, + { + "epoch": 0.35491943359375, + "grad_norm": 14.719901084899902, + "learning_rate": 9.808995654205026e-06, + "loss": 5.2319, + "step": 17445 + }, + { + "epoch": 0.3550211588541667, + "grad_norm": 20.50086212158203, + "learning_rate": 9.808886224679574e-06, + "loss": 5.4934, + "step": 17450 + }, + { + "epoch": 0.3551228841145833, + "grad_norm": 17.85249137878418, + "learning_rate": 9.808776764426866e-06, + "loss": 5.3039, + "step": 17455 + }, + { + "epoch": 0.355224609375, + "grad_norm": 16.518810272216797, + "learning_rate": 9.808667273447604e-06, + "loss": 5.3065, + "step": 17460 + }, + { + "epoch": 0.3553263346354167, + "grad_norm": 13.040223121643066, + "learning_rate": 9.80855775174248e-06, + "loss": 5.2203, + "step": 17465 + }, + { + "epoch": 0.3554280598958333, + "grad_norm": 16.185041427612305, + "learning_rate": 9.808448199312201e-06, + "loss": 5.3303, + "step": 17470 + }, + { + "epoch": 0.35552978515625, + "grad_norm": 16.262590408325195, + "learning_rate": 9.808338616157463e-06, + "loss": 5.2023, + "step": 17475 + }, + { + "epoch": 0.3556315104166667, + "grad_norm": 20.708158493041992, + "learning_rate": 9.808229002278969e-06, + "loss": 5.3882, + "step": 17480 + }, + { + "epoch": 0.3557332356770833, + "grad_norm": 16.058713912963867, + "learning_rate": 9.808119357677416e-06, + "loss": 5.3399, + "step": 17485 + }, + { + "epoch": 0.3558349609375, + "grad_norm": 13.604840278625488, + "learning_rate": 9.808009682353508e-06, + "loss": 5.4929, + "step": 17490 + }, + { + "epoch": 0.3559366861979167, + "grad_norm": 16.929574966430664, + "learning_rate": 9.807899976307944e-06, + "loss": 5.0075, + "step": 17495 + }, + { + "epoch": 0.3560384114583333, + "grad_norm": 14.353821754455566, + "learning_rate": 9.807790239541426e-06, + "loss": 5.4048, + "step": 17500 + }, + { + "epoch": 0.35614013671875, + "grad_norm": 24.150514602661133, + "learning_rate": 9.807680472054654e-06, + "loss": 5.3562, + "step": 17505 + }, + { + "epoch": 0.3562418619791667, + "grad_norm": 14.036866188049316, + "learning_rate": 9.80757067384833e-06, + "loss": 5.1905, + "step": 17510 + }, + { + "epoch": 0.3563435872395833, + "grad_norm": 17.77507781982422, + "learning_rate": 9.807460844923155e-06, + "loss": 5.3087, + "step": 17515 + }, + { + "epoch": 0.3564453125, + "grad_norm": 11.980073928833008, + "learning_rate": 9.807350985279832e-06, + "loss": 4.9478, + "step": 17520 + }, + { + "epoch": 0.3565470377604167, + "grad_norm": 16.074411392211914, + "learning_rate": 9.807241094919065e-06, + "loss": 5.3936, + "step": 17525 + }, + { + "epoch": 0.3566487630208333, + "grad_norm": 22.72775650024414, + "learning_rate": 9.807131173841548e-06, + "loss": 5.2037, + "step": 17530 + }, + { + "epoch": 0.35675048828125, + "grad_norm": 14.326070785522461, + "learning_rate": 9.807021222047993e-06, + "loss": 5.3576, + "step": 17535 + }, + { + "epoch": 0.3568522135416667, + "grad_norm": 20.1030216217041, + "learning_rate": 9.806911239539096e-06, + "loss": 4.9388, + "step": 17540 + }, + { + "epoch": 0.3569539388020833, + "grad_norm": 15.963438034057617, + "learning_rate": 9.806801226315564e-06, + "loss": 5.2157, + "step": 17545 + }, + { + "epoch": 0.3570556640625, + "grad_norm": 16.071720123291016, + "learning_rate": 9.806691182378097e-06, + "loss": 4.9707, + "step": 17550 + }, + { + "epoch": 0.3571573893229167, + "grad_norm": 17.166868209838867, + "learning_rate": 9.806581107727398e-06, + "loss": 5.4428, + "step": 17555 + }, + { + "epoch": 0.3572591145833333, + "grad_norm": 21.54922866821289, + "learning_rate": 9.806471002364173e-06, + "loss": 5.3634, + "step": 17560 + }, + { + "epoch": 0.35736083984375, + "grad_norm": 14.604931831359863, + "learning_rate": 9.806360866289124e-06, + "loss": 5.3319, + "step": 17565 + }, + { + "epoch": 0.3574625651041667, + "grad_norm": 15.206236839294434, + "learning_rate": 9.806250699502955e-06, + "loss": 5.3833, + "step": 17570 + }, + { + "epoch": 0.3575642903645833, + "grad_norm": 17.02522850036621, + "learning_rate": 9.806140502006369e-06, + "loss": 5.3588, + "step": 17575 + }, + { + "epoch": 0.357666015625, + "grad_norm": 20.98918914794922, + "learning_rate": 9.80603027380007e-06, + "loss": 5.2275, + "step": 17580 + }, + { + "epoch": 0.3577677408854167, + "grad_norm": 17.798498153686523, + "learning_rate": 9.805920014884766e-06, + "loss": 5.2345, + "step": 17585 + }, + { + "epoch": 0.3578694661458333, + "grad_norm": 15.309915542602539, + "learning_rate": 9.805809725261158e-06, + "loss": 5.1527, + "step": 17590 + }, + { + "epoch": 0.35797119140625, + "grad_norm": 18.0219669342041, + "learning_rate": 9.805699404929949e-06, + "loss": 4.7482, + "step": 17595 + }, + { + "epoch": 0.3580729166666667, + "grad_norm": 19.3292293548584, + "learning_rate": 9.80558905389185e-06, + "loss": 5.0987, + "step": 17600 + }, + { + "epoch": 0.3581746419270833, + "grad_norm": 17.326555252075195, + "learning_rate": 9.80547867214756e-06, + "loss": 5.3472, + "step": 17605 + }, + { + "epoch": 0.3582763671875, + "grad_norm": 21.538192749023438, + "learning_rate": 9.805368259697784e-06, + "loss": 5.6855, + "step": 17610 + }, + { + "epoch": 0.3583780924479167, + "grad_norm": 14.888368606567383, + "learning_rate": 9.805257816543234e-06, + "loss": 4.9586, + "step": 17615 + }, + { + "epoch": 0.3584798177083333, + "grad_norm": 28.188066482543945, + "learning_rate": 9.805147342684613e-06, + "loss": 5.215, + "step": 17620 + }, + { + "epoch": 0.35858154296875, + "grad_norm": 22.816761016845703, + "learning_rate": 9.805036838122623e-06, + "loss": 4.9738, + "step": 17625 + }, + { + "epoch": 0.3586832682291667, + "grad_norm": 19.977018356323242, + "learning_rate": 9.804926302857973e-06, + "loss": 5.2156, + "step": 17630 + }, + { + "epoch": 0.3587849934895833, + "grad_norm": 19.4199161529541, + "learning_rate": 9.804815736891372e-06, + "loss": 5.1285, + "step": 17635 + }, + { + "epoch": 0.35888671875, + "grad_norm": 15.814250946044922, + "learning_rate": 9.804705140223521e-06, + "loss": 5.3805, + "step": 17640 + }, + { + "epoch": 0.3589884440104167, + "grad_norm": 14.436250686645508, + "learning_rate": 9.80459451285513e-06, + "loss": 4.9965, + "step": 17645 + }, + { + "epoch": 0.3590901692708333, + "grad_norm": 15.53551197052002, + "learning_rate": 9.804483854786906e-06, + "loss": 5.3365, + "step": 17650 + }, + { + "epoch": 0.35919189453125, + "grad_norm": 17.77556610107422, + "learning_rate": 9.804373166019555e-06, + "loss": 5.0861, + "step": 17655 + }, + { + "epoch": 0.3592936197916667, + "grad_norm": 14.013080596923828, + "learning_rate": 9.804262446553786e-06, + "loss": 5.1938, + "step": 17660 + }, + { + "epoch": 0.3593953450520833, + "grad_norm": 14.827096939086914, + "learning_rate": 9.804151696390302e-06, + "loss": 5.3566, + "step": 17665 + }, + { + "epoch": 0.3594970703125, + "grad_norm": 19.924015045166016, + "learning_rate": 9.804040915529816e-06, + "loss": 5.2026, + "step": 17670 + }, + { + "epoch": 0.3595987955729167, + "grad_norm": 15.445606231689453, + "learning_rate": 9.803930103973032e-06, + "loss": 5.1214, + "step": 17675 + }, + { + "epoch": 0.3597005208333333, + "grad_norm": 13.16565990447998, + "learning_rate": 9.80381926172066e-06, + "loss": 4.9319, + "step": 17680 + }, + { + "epoch": 0.35980224609375, + "grad_norm": 14.800577163696289, + "learning_rate": 9.803708388773407e-06, + "loss": 5.0761, + "step": 17685 + }, + { + "epoch": 0.3599039713541667, + "grad_norm": 13.961432456970215, + "learning_rate": 9.803597485131985e-06, + "loss": 5.2383, + "step": 17690 + }, + { + "epoch": 0.3600056966145833, + "grad_norm": 14.738567352294922, + "learning_rate": 9.8034865507971e-06, + "loss": 4.9805, + "step": 17695 + }, + { + "epoch": 0.360107421875, + "grad_norm": 14.571747779846191, + "learning_rate": 9.803375585769456e-06, + "loss": 5.2388, + "step": 17700 + }, + { + "epoch": 0.3602091471354167, + "grad_norm": 19.61717414855957, + "learning_rate": 9.803264590049771e-06, + "loss": 5.1587, + "step": 17705 + }, + { + "epoch": 0.3603108723958333, + "grad_norm": 17.369901657104492, + "learning_rate": 9.80315356363875e-06, + "loss": 5.3061, + "step": 17710 + }, + { + "epoch": 0.36041259765625, + "grad_norm": 13.072566032409668, + "learning_rate": 9.803042506537101e-06, + "loss": 5.3753, + "step": 17715 + }, + { + "epoch": 0.3605143229166667, + "grad_norm": 14.44155502319336, + "learning_rate": 9.802931418745536e-06, + "loss": 5.1442, + "step": 17720 + }, + { + "epoch": 0.3606160481770833, + "grad_norm": 19.344860076904297, + "learning_rate": 9.802820300264762e-06, + "loss": 5.4972, + "step": 17725 + }, + { + "epoch": 0.3607177734375, + "grad_norm": 21.631690979003906, + "learning_rate": 9.802709151095494e-06, + "loss": 5.2749, + "step": 17730 + }, + { + "epoch": 0.3608194986979167, + "grad_norm": 20.604116439819336, + "learning_rate": 9.802597971238437e-06, + "loss": 5.2987, + "step": 17735 + }, + { + "epoch": 0.3609212239583333, + "grad_norm": 13.759369850158691, + "learning_rate": 9.802486760694303e-06, + "loss": 5.2302, + "step": 17740 + }, + { + "epoch": 0.36102294921875, + "grad_norm": 14.951830863952637, + "learning_rate": 9.802375519463805e-06, + "loss": 5.2471, + "step": 17745 + }, + { + "epoch": 0.3611246744791667, + "grad_norm": 21.092493057250977, + "learning_rate": 9.80226424754765e-06, + "loss": 5.1359, + "step": 17750 + }, + { + "epoch": 0.3612263997395833, + "grad_norm": 25.31049346923828, + "learning_rate": 9.802152944946552e-06, + "loss": 5.196, + "step": 17755 + }, + { + "epoch": 0.361328125, + "grad_norm": 14.630163192749023, + "learning_rate": 9.802041611661222e-06, + "loss": 5.0423, + "step": 17760 + }, + { + "epoch": 0.3614298502604167, + "grad_norm": 15.327558517456055, + "learning_rate": 9.801930247692368e-06, + "loss": 5.043, + "step": 17765 + }, + { + "epoch": 0.3615315755208333, + "grad_norm": 17.820491790771484, + "learning_rate": 9.801818853040707e-06, + "loss": 5.3522, + "step": 17770 + }, + { + "epoch": 0.36163330078125, + "grad_norm": 17.23073387145996, + "learning_rate": 9.801707427706946e-06, + "loss": 5.2911, + "step": 17775 + }, + { + "epoch": 0.3617350260416667, + "grad_norm": 15.083492279052734, + "learning_rate": 9.8015959716918e-06, + "loss": 5.1527, + "step": 17780 + }, + { + "epoch": 0.3618367513020833, + "grad_norm": 18.757814407348633, + "learning_rate": 9.801484484995977e-06, + "loss": 5.155, + "step": 17785 + }, + { + "epoch": 0.3619384765625, + "grad_norm": 17.338960647583008, + "learning_rate": 9.801372967620196e-06, + "loss": 5.0433, + "step": 17790 + }, + { + "epoch": 0.3620402018229167, + "grad_norm": 13.995816230773926, + "learning_rate": 9.801261419565165e-06, + "loss": 5.1182, + "step": 17795 + }, + { + "epoch": 0.3621419270833333, + "grad_norm": 15.473891258239746, + "learning_rate": 9.801149840831596e-06, + "loss": 4.9814, + "step": 17800 + }, + { + "epoch": 0.36224365234375, + "grad_norm": 17.31134033203125, + "learning_rate": 9.801038231420205e-06, + "loss": 5.2497, + "step": 17805 + }, + { + "epoch": 0.3623453776041667, + "grad_norm": 18.363737106323242, + "learning_rate": 9.800926591331702e-06, + "loss": 5.2093, + "step": 17810 + }, + { + "epoch": 0.3624471028645833, + "grad_norm": 61.71294021606445, + "learning_rate": 9.800814920566804e-06, + "loss": 5.4361, + "step": 17815 + }, + { + "epoch": 0.362548828125, + "grad_norm": 17.10593032836914, + "learning_rate": 9.800703219126221e-06, + "loss": 5.0356, + "step": 17820 + }, + { + "epoch": 0.3626505533854167, + "grad_norm": 12.045926094055176, + "learning_rate": 9.800591487010671e-06, + "loss": 5.3545, + "step": 17825 + }, + { + "epoch": 0.3627522786458333, + "grad_norm": 16.53950309753418, + "learning_rate": 9.800479724220864e-06, + "loss": 5.4303, + "step": 17830 + }, + { + "epoch": 0.36285400390625, + "grad_norm": 13.51955509185791, + "learning_rate": 9.800367930757516e-06, + "loss": 5.2829, + "step": 17835 + }, + { + "epoch": 0.3629557291666667, + "grad_norm": 15.020669937133789, + "learning_rate": 9.800256106621339e-06, + "loss": 4.9818, + "step": 17840 + }, + { + "epoch": 0.3630574544270833, + "grad_norm": 17.85348129272461, + "learning_rate": 9.800144251813051e-06, + "loss": 5.1935, + "step": 17845 + }, + { + "epoch": 0.3631591796875, + "grad_norm": 16.290922164916992, + "learning_rate": 9.800032366333363e-06, + "loss": 5.3089, + "step": 17850 + }, + { + "epoch": 0.3632609049479167, + "grad_norm": 16.5367431640625, + "learning_rate": 9.799920450182994e-06, + "loss": 5.1752, + "step": 17855 + }, + { + "epoch": 0.3633626302083333, + "grad_norm": 18.196712493896484, + "learning_rate": 9.799808503362655e-06, + "loss": 5.2236, + "step": 17860 + }, + { + "epoch": 0.36346435546875, + "grad_norm": 13.762907981872559, + "learning_rate": 9.799696525873066e-06, + "loss": 5.1729, + "step": 17865 + }, + { + "epoch": 0.3635660807291667, + "grad_norm": 15.546979904174805, + "learning_rate": 9.799584517714938e-06, + "loss": 5.4035, + "step": 17870 + }, + { + "epoch": 0.3636678059895833, + "grad_norm": 21.429384231567383, + "learning_rate": 9.79947247888899e-06, + "loss": 5.2403, + "step": 17875 + }, + { + "epoch": 0.36376953125, + "grad_norm": 14.46178913116455, + "learning_rate": 9.799360409395935e-06, + "loss": 5.3143, + "step": 17880 + }, + { + "epoch": 0.3638712565104167, + "grad_norm": 15.376455307006836, + "learning_rate": 9.799248309236492e-06, + "loss": 5.2689, + "step": 17885 + }, + { + "epoch": 0.3639729817708333, + "grad_norm": 20.086681365966797, + "learning_rate": 9.799136178411372e-06, + "loss": 5.1172, + "step": 17890 + }, + { + "epoch": 0.36407470703125, + "grad_norm": 14.262514114379883, + "learning_rate": 9.799024016921299e-06, + "loss": 5.382, + "step": 17895 + }, + { + "epoch": 0.3641764322916667, + "grad_norm": 16.381328582763672, + "learning_rate": 9.798911824766984e-06, + "loss": 5.3128, + "step": 17900 + }, + { + "epoch": 0.3642781575520833, + "grad_norm": 13.394072532653809, + "learning_rate": 9.798799601949147e-06, + "loss": 5.1664, + "step": 17905 + }, + { + "epoch": 0.3643798828125, + "grad_norm": 17.690134048461914, + "learning_rate": 9.798687348468503e-06, + "loss": 5.1797, + "step": 17910 + }, + { + "epoch": 0.3644816080729167, + "grad_norm": 18.20242691040039, + "learning_rate": 9.798575064325772e-06, + "loss": 5.0714, + "step": 17915 + }, + { + "epoch": 0.3645833333333333, + "grad_norm": 14.195669174194336, + "learning_rate": 9.798462749521668e-06, + "loss": 4.9675, + "step": 17920 + }, + { + "epoch": 0.36468505859375, + "grad_norm": 16.788766860961914, + "learning_rate": 9.79835040405691e-06, + "loss": 5.3611, + "step": 17925 + }, + { + "epoch": 0.3647867838541667, + "grad_norm": 23.421588897705078, + "learning_rate": 9.798238027932217e-06, + "loss": 5.1414, + "step": 17930 + }, + { + "epoch": 0.3648885091145833, + "grad_norm": 15.580326080322266, + "learning_rate": 9.798125621148307e-06, + "loss": 5.3744, + "step": 17935 + }, + { + "epoch": 0.364990234375, + "grad_norm": 11.069721221923828, + "learning_rate": 9.798013183705894e-06, + "loss": 5.2329, + "step": 17940 + }, + { + "epoch": 0.3650919596354167, + "grad_norm": 18.15121841430664, + "learning_rate": 9.797900715605704e-06, + "loss": 5.2813, + "step": 17945 + }, + { + "epoch": 0.3651936848958333, + "grad_norm": 17.769371032714844, + "learning_rate": 9.79778821684845e-06, + "loss": 5.2551, + "step": 17950 + }, + { + "epoch": 0.36529541015625, + "grad_norm": 15.849822998046875, + "learning_rate": 9.797675687434851e-06, + "loss": 5.6123, + "step": 17955 + }, + { + "epoch": 0.3653971354166667, + "grad_norm": 14.70334243774414, + "learning_rate": 9.797563127365628e-06, + "loss": 5.2959, + "step": 17960 + }, + { + "epoch": 0.3654988606770833, + "grad_norm": 10.978031158447266, + "learning_rate": 9.7974505366415e-06, + "loss": 5.2532, + "step": 17965 + }, + { + "epoch": 0.3656005859375, + "grad_norm": 24.08580780029297, + "learning_rate": 9.797337915263186e-06, + "loss": 5.0255, + "step": 17970 + }, + { + "epoch": 0.3657023111979167, + "grad_norm": 18.446001052856445, + "learning_rate": 9.797225263231404e-06, + "loss": 5.5332, + "step": 17975 + }, + { + "epoch": 0.3658040364583333, + "grad_norm": 12.72257137298584, + "learning_rate": 9.797112580546878e-06, + "loss": 5.5253, + "step": 17980 + }, + { + "epoch": 0.36590576171875, + "grad_norm": 18.627193450927734, + "learning_rate": 9.796999867210324e-06, + "loss": 5.2433, + "step": 17985 + }, + { + "epoch": 0.3660074869791667, + "grad_norm": 17.66160774230957, + "learning_rate": 9.796887123222466e-06, + "loss": 5.3774, + "step": 17990 + }, + { + "epoch": 0.3661092122395833, + "grad_norm": 14.718180656433105, + "learning_rate": 9.796774348584019e-06, + "loss": 4.9482, + "step": 17995 + }, + { + "epoch": 0.3662109375, + "grad_norm": 13.800759315490723, + "learning_rate": 9.796661543295706e-06, + "loss": 5.3181, + "step": 18000 + }, + { + "epoch": 0.3663126627604167, + "grad_norm": 17.902074813842773, + "learning_rate": 9.79654870735825e-06, + "loss": 5.2228, + "step": 18005 + }, + { + "epoch": 0.3664143880208333, + "grad_norm": 27.025409698486328, + "learning_rate": 9.796435840772372e-06, + "loss": 5.4569, + "step": 18010 + }, + { + "epoch": 0.36651611328125, + "grad_norm": 14.75337028503418, + "learning_rate": 9.796322943538789e-06, + "loss": 5.2975, + "step": 18015 + }, + { + "epoch": 0.3666178385416667, + "grad_norm": 20.255151748657227, + "learning_rate": 9.796210015658226e-06, + "loss": 5.3653, + "step": 18020 + }, + { + "epoch": 0.3667195638020833, + "grad_norm": 15.736201286315918, + "learning_rate": 9.796097057131403e-06, + "loss": 5.0836, + "step": 18025 + }, + { + "epoch": 0.3668212890625, + "grad_norm": 19.672466278076172, + "learning_rate": 9.795984067959043e-06, + "loss": 5.1399, + "step": 18030 + }, + { + "epoch": 0.3669230143229167, + "grad_norm": 19.776927947998047, + "learning_rate": 9.795871048141866e-06, + "loss": 5.3549, + "step": 18035 + }, + { + "epoch": 0.3670247395833333, + "grad_norm": 15.665398597717285, + "learning_rate": 9.795757997680597e-06, + "loss": 5.4335, + "step": 18040 + }, + { + "epoch": 0.36712646484375, + "grad_norm": 13.921073913574219, + "learning_rate": 9.795644916575957e-06, + "loss": 5.2714, + "step": 18045 + }, + { + "epoch": 0.3672281901041667, + "grad_norm": 16.92786979675293, + "learning_rate": 9.795531804828666e-06, + "loss": 5.1231, + "step": 18050 + }, + { + "epoch": 0.3673299153645833, + "grad_norm": 14.93761920928955, + "learning_rate": 9.79541866243945e-06, + "loss": 5.1387, + "step": 18055 + }, + { + "epoch": 0.367431640625, + "grad_norm": 13.323935508728027, + "learning_rate": 9.795305489409033e-06, + "loss": 5.099, + "step": 18060 + }, + { + "epoch": 0.3675333658854167, + "grad_norm": 18.673742294311523, + "learning_rate": 9.795192285738134e-06, + "loss": 5.4419, + "step": 18065 + }, + { + "epoch": 0.3676350911458333, + "grad_norm": 17.703475952148438, + "learning_rate": 9.795079051427478e-06, + "loss": 5.3469, + "step": 18070 + }, + { + "epoch": 0.36773681640625, + "grad_norm": 15.27579116821289, + "learning_rate": 9.79496578647779e-06, + "loss": 5.1139, + "step": 18075 + }, + { + "epoch": 0.3678385416666667, + "grad_norm": 12.653162956237793, + "learning_rate": 9.794852490889795e-06, + "loss": 5.1728, + "step": 18080 + }, + { + "epoch": 0.3679402669270833, + "grad_norm": 15.610568046569824, + "learning_rate": 9.79473916466421e-06, + "loss": 4.9788, + "step": 18085 + }, + { + "epoch": 0.3680419921875, + "grad_norm": 12.70355224609375, + "learning_rate": 9.794625807801767e-06, + "loss": 5.0092, + "step": 18090 + }, + { + "epoch": 0.3681437174479167, + "grad_norm": 17.44635772705078, + "learning_rate": 9.794512420303188e-06, + "loss": 5.3773, + "step": 18095 + }, + { + "epoch": 0.3682454427083333, + "grad_norm": 15.288827896118164, + "learning_rate": 9.794399002169195e-06, + "loss": 5.0197, + "step": 18100 + }, + { + "epoch": 0.36834716796875, + "grad_norm": 14.999924659729004, + "learning_rate": 9.794285553400514e-06, + "loss": 5.248, + "step": 18105 + }, + { + "epoch": 0.3684488932291667, + "grad_norm": 17.24403190612793, + "learning_rate": 9.794172073997871e-06, + "loss": 5.2711, + "step": 18110 + }, + { + "epoch": 0.3685506184895833, + "grad_norm": 17.632471084594727, + "learning_rate": 9.79405856396199e-06, + "loss": 5.1484, + "step": 18115 + }, + { + "epoch": 0.36865234375, + "grad_norm": 14.45099925994873, + "learning_rate": 9.793945023293597e-06, + "loss": 5.1176, + "step": 18120 + }, + { + "epoch": 0.3687540690104167, + "grad_norm": 13.35098648071289, + "learning_rate": 9.793831451993417e-06, + "loss": 5.1518, + "step": 18125 + }, + { + "epoch": 0.3688557942708333, + "grad_norm": 19.709348678588867, + "learning_rate": 9.793717850062175e-06, + "loss": 5.2015, + "step": 18130 + }, + { + "epoch": 0.36895751953125, + "grad_norm": 16.456710815429688, + "learning_rate": 9.793604217500598e-06, + "loss": 5.2787, + "step": 18135 + }, + { + "epoch": 0.3690592447916667, + "grad_norm": 16.055419921875, + "learning_rate": 9.793490554309412e-06, + "loss": 4.9218, + "step": 18140 + }, + { + "epoch": 0.3691609700520833, + "grad_norm": 16.968618392944336, + "learning_rate": 9.793376860489342e-06, + "loss": 5.1085, + "step": 18145 + }, + { + "epoch": 0.3692626953125, + "grad_norm": 11.58379077911377, + "learning_rate": 9.793263136041117e-06, + "loss": 4.9377, + "step": 18150 + }, + { + "epoch": 0.3693644205729167, + "grad_norm": 19.121543884277344, + "learning_rate": 9.79314938096546e-06, + "loss": 5.129, + "step": 18155 + }, + { + "epoch": 0.3694661458333333, + "grad_norm": 28.22822380065918, + "learning_rate": 9.793035595263103e-06, + "loss": 5.5623, + "step": 18160 + }, + { + "epoch": 0.36956787109375, + "grad_norm": 16.25670623779297, + "learning_rate": 9.79292177893477e-06, + "loss": 5.3006, + "step": 18165 + }, + { + "epoch": 0.3696695963541667, + "grad_norm": 19.161067962646484, + "learning_rate": 9.792807931981185e-06, + "loss": 5.2285, + "step": 18170 + }, + { + "epoch": 0.3697713216145833, + "grad_norm": 12.707950592041016, + "learning_rate": 9.792694054403082e-06, + "loss": 5.1066, + "step": 18175 + }, + { + "epoch": 0.369873046875, + "grad_norm": 16.830841064453125, + "learning_rate": 9.792580146201182e-06, + "loss": 5.4173, + "step": 18180 + }, + { + "epoch": 0.3699747721354167, + "grad_norm": 15.863368034362793, + "learning_rate": 9.792466207376219e-06, + "loss": 5.2056, + "step": 18185 + }, + { + "epoch": 0.3700764973958333, + "grad_norm": 19.919382095336914, + "learning_rate": 9.792352237928919e-06, + "loss": 5.2284, + "step": 18190 + }, + { + "epoch": 0.37017822265625, + "grad_norm": 16.145925521850586, + "learning_rate": 9.792238237860008e-06, + "loss": 5.3329, + "step": 18195 + }, + { + "epoch": 0.3702799479166667, + "grad_norm": 21.433937072753906, + "learning_rate": 9.792124207170215e-06, + "loss": 5.2346, + "step": 18200 + }, + { + "epoch": 0.3703816731770833, + "grad_norm": 16.262067794799805, + "learning_rate": 9.79201014586027e-06, + "loss": 5.1805, + "step": 18205 + }, + { + "epoch": 0.3704833984375, + "grad_norm": 14.028776168823242, + "learning_rate": 9.791896053930902e-06, + "loss": 5.1831, + "step": 18210 + }, + { + "epoch": 0.3705851236979167, + "grad_norm": 14.675068855285645, + "learning_rate": 9.79178193138284e-06, + "loss": 5.1947, + "step": 18215 + }, + { + "epoch": 0.3706868489583333, + "grad_norm": 16.474422454833984, + "learning_rate": 9.791667778216812e-06, + "loss": 5.1353, + "step": 18220 + }, + { + "epoch": 0.37078857421875, + "grad_norm": 19.63677978515625, + "learning_rate": 9.791553594433547e-06, + "loss": 5.4274, + "step": 18225 + }, + { + "epoch": 0.3708902994791667, + "grad_norm": 14.773954391479492, + "learning_rate": 9.791439380033777e-06, + "loss": 5.1977, + "step": 18230 + }, + { + "epoch": 0.3709920247395833, + "grad_norm": 20.961809158325195, + "learning_rate": 9.79132513501823e-06, + "loss": 5.1894, + "step": 18235 + }, + { + "epoch": 0.37109375, + "grad_norm": 15.388531684875488, + "learning_rate": 9.791210859387637e-06, + "loss": 5.3304, + "step": 18240 + }, + { + "epoch": 0.3711954752604167, + "grad_norm": 14.56883716583252, + "learning_rate": 9.791096553142726e-06, + "loss": 5.2066, + "step": 18245 + }, + { + "epoch": 0.3712972005208333, + "grad_norm": 19.97942352294922, + "learning_rate": 9.790982216284228e-06, + "loss": 5.3217, + "step": 18250 + }, + { + "epoch": 0.37139892578125, + "grad_norm": 19.194915771484375, + "learning_rate": 9.790867848812878e-06, + "loss": 5.3789, + "step": 18255 + }, + { + "epoch": 0.3715006510416667, + "grad_norm": 15.406316757202148, + "learning_rate": 9.7907534507294e-06, + "loss": 5.2724, + "step": 18260 + }, + { + "epoch": 0.3716023763020833, + "grad_norm": 19.671110153198242, + "learning_rate": 9.79063902203453e-06, + "loss": 5.5328, + "step": 18265 + }, + { + "epoch": 0.3717041015625, + "grad_norm": 11.69530963897705, + "learning_rate": 9.790524562728998e-06, + "loss": 5.0103, + "step": 18270 + }, + { + "epoch": 0.3718058268229167, + "grad_norm": 15.270082473754883, + "learning_rate": 9.790410072813532e-06, + "loss": 5.4038, + "step": 18275 + }, + { + "epoch": 0.3719075520833333, + "grad_norm": 17.937036514282227, + "learning_rate": 9.790295552288868e-06, + "loss": 5.5821, + "step": 18280 + }, + { + "epoch": 0.37200927734375, + "grad_norm": 13.446045875549316, + "learning_rate": 9.790181001155737e-06, + "loss": 5.2692, + "step": 18285 + }, + { + "epoch": 0.3721110026041667, + "grad_norm": 16.03243064880371, + "learning_rate": 9.790066419414869e-06, + "loss": 5.2481, + "step": 18290 + }, + { + "epoch": 0.3722127278645833, + "grad_norm": 16.385129928588867, + "learning_rate": 9.789951807066997e-06, + "loss": 5.2183, + "step": 18295 + }, + { + "epoch": 0.372314453125, + "grad_norm": 13.110020637512207, + "learning_rate": 9.789837164112854e-06, + "loss": 5.282, + "step": 18300 + }, + { + "epoch": 0.3724161783854167, + "grad_norm": 15.036179542541504, + "learning_rate": 9.789722490553172e-06, + "loss": 5.1983, + "step": 18305 + }, + { + "epoch": 0.3725179036458333, + "grad_norm": 22.384553909301758, + "learning_rate": 9.789607786388684e-06, + "loss": 5.0582, + "step": 18310 + }, + { + "epoch": 0.37261962890625, + "grad_norm": 15.852128028869629, + "learning_rate": 9.789493051620123e-06, + "loss": 5.2161, + "step": 18315 + }, + { + "epoch": 0.3727213541666667, + "grad_norm": 12.597589492797852, + "learning_rate": 9.78937828624822e-06, + "loss": 5.1501, + "step": 18320 + }, + { + "epoch": 0.3728230794270833, + "grad_norm": 19.273706436157227, + "learning_rate": 9.78926349027371e-06, + "loss": 5.4152, + "step": 18325 + }, + { + "epoch": 0.3729248046875, + "grad_norm": 30.64134407043457, + "learning_rate": 9.789148663697328e-06, + "loss": 5.5043, + "step": 18330 + }, + { + "epoch": 0.3730265299479167, + "grad_norm": 15.959654808044434, + "learning_rate": 9.789033806519806e-06, + "loss": 5.1888, + "step": 18335 + }, + { + "epoch": 0.3731282552083333, + "grad_norm": 16.57468032836914, + "learning_rate": 9.788918918741879e-06, + "loss": 5.2495, + "step": 18340 + }, + { + "epoch": 0.37322998046875, + "grad_norm": 20.637365341186523, + "learning_rate": 9.78880400036428e-06, + "loss": 5.227, + "step": 18345 + }, + { + "epoch": 0.3733317057291667, + "grad_norm": 14.188008308410645, + "learning_rate": 9.788689051387742e-06, + "loss": 5.1705, + "step": 18350 + }, + { + "epoch": 0.3734334309895833, + "grad_norm": 18.92221450805664, + "learning_rate": 9.788574071813004e-06, + "loss": 5.0678, + "step": 18355 + }, + { + "epoch": 0.37353515625, + "grad_norm": 18.979516983032227, + "learning_rate": 9.788459061640796e-06, + "loss": 5.1314, + "step": 18360 + }, + { + "epoch": 0.3736368815104167, + "grad_norm": 18.681867599487305, + "learning_rate": 9.788344020871854e-06, + "loss": 5.1145, + "step": 18365 + }, + { + "epoch": 0.3737386067708333, + "grad_norm": 17.840620040893555, + "learning_rate": 9.788228949506914e-06, + "loss": 5.3782, + "step": 18370 + }, + { + "epoch": 0.37384033203125, + "grad_norm": 15.00316047668457, + "learning_rate": 9.788113847546713e-06, + "loss": 5.1752, + "step": 18375 + }, + { + "epoch": 0.3739420572916667, + "grad_norm": 17.22214698791504, + "learning_rate": 9.787998714991983e-06, + "loss": 5.1745, + "step": 18380 + }, + { + "epoch": 0.3740437825520833, + "grad_norm": 19.073339462280273, + "learning_rate": 9.78788355184346e-06, + "loss": 5.381, + "step": 18385 + }, + { + "epoch": 0.3741455078125, + "grad_norm": 11.519755363464355, + "learning_rate": 9.787768358101881e-06, + "loss": 5.0459, + "step": 18390 + }, + { + "epoch": 0.3742472330729167, + "grad_norm": 16.465131759643555, + "learning_rate": 9.787653133767984e-06, + "loss": 5.2683, + "step": 18395 + }, + { + "epoch": 0.3743489583333333, + "grad_norm": 20.701053619384766, + "learning_rate": 9.787537878842502e-06, + "loss": 4.9478, + "step": 18400 + }, + { + "epoch": 0.37445068359375, + "grad_norm": 15.90327262878418, + "learning_rate": 9.787422593326174e-06, + "loss": 5.2029, + "step": 18405 + }, + { + "epoch": 0.3745524088541667, + "grad_norm": 15.256084442138672, + "learning_rate": 9.787307277219733e-06, + "loss": 5.1897, + "step": 18410 + }, + { + "epoch": 0.3746541341145833, + "grad_norm": 17.011526107788086, + "learning_rate": 9.78719193052392e-06, + "loss": 5.0879, + "step": 18415 + }, + { + "epoch": 0.374755859375, + "grad_norm": 14.179672241210938, + "learning_rate": 9.787076553239469e-06, + "loss": 4.9688, + "step": 18420 + }, + { + "epoch": 0.3748575846354167, + "grad_norm": 16.83329963684082, + "learning_rate": 9.78696114536712e-06, + "loss": 5.1641, + "step": 18425 + }, + { + "epoch": 0.3749593098958333, + "grad_norm": 20.896392822265625, + "learning_rate": 9.786845706907608e-06, + "loss": 5.3671, + "step": 18430 + }, + { + "epoch": 0.37506103515625, + "grad_norm": 15.691426277160645, + "learning_rate": 9.786730237861672e-06, + "loss": 5.256, + "step": 18435 + }, + { + "epoch": 0.3751627604166667, + "grad_norm": 15.65243911743164, + "learning_rate": 9.78661473823005e-06, + "loss": 5.1557, + "step": 18440 + }, + { + "epoch": 0.3752644856770833, + "grad_norm": 18.42641830444336, + "learning_rate": 9.786499208013478e-06, + "loss": 5.2292, + "step": 18445 + }, + { + "epoch": 0.3753662109375, + "grad_norm": 16.5950984954834, + "learning_rate": 9.786383647212696e-06, + "loss": 5.4468, + "step": 18450 + }, + { + "epoch": 0.3754679361979167, + "grad_norm": 16.62608528137207, + "learning_rate": 9.78626805582844e-06, + "loss": 5.4965, + "step": 18455 + }, + { + "epoch": 0.3755696614583333, + "grad_norm": 16.09642219543457, + "learning_rate": 9.786152433861453e-06, + "loss": 5.0334, + "step": 18460 + }, + { + "epoch": 0.37567138671875, + "grad_norm": 22.86848258972168, + "learning_rate": 9.78603678131247e-06, + "loss": 5.4125, + "step": 18465 + }, + { + "epoch": 0.3757731119791667, + "grad_norm": 16.54465103149414, + "learning_rate": 9.785921098182231e-06, + "loss": 4.928, + "step": 18470 + }, + { + "epoch": 0.3758748372395833, + "grad_norm": 17.82538414001465, + "learning_rate": 9.785805384471478e-06, + "loss": 5.1815, + "step": 18475 + }, + { + "epoch": 0.3759765625, + "grad_norm": 18.659395217895508, + "learning_rate": 9.785689640180944e-06, + "loss": 5.1444, + "step": 18480 + }, + { + "epoch": 0.3760782877604167, + "grad_norm": 23.449604034423828, + "learning_rate": 9.785573865311376e-06, + "loss": 5.2291, + "step": 18485 + }, + { + "epoch": 0.3761800130208333, + "grad_norm": 13.416553497314453, + "learning_rate": 9.785458059863508e-06, + "loss": 5.1312, + "step": 18490 + }, + { + "epoch": 0.37628173828125, + "grad_norm": 20.54108238220215, + "learning_rate": 9.785342223838083e-06, + "loss": 5.5406, + "step": 18495 + }, + { + "epoch": 0.3763834635416667, + "grad_norm": 13.670320510864258, + "learning_rate": 9.785226357235839e-06, + "loss": 5.2421, + "step": 18500 + }, + { + "epoch": 0.3764851888020833, + "grad_norm": 18.38820457458496, + "learning_rate": 9.785110460057519e-06, + "loss": 5.6288, + "step": 18505 + }, + { + "epoch": 0.3765869140625, + "grad_norm": 17.83949089050293, + "learning_rate": 9.784994532303862e-06, + "loss": 5.101, + "step": 18510 + }, + { + "epoch": 0.3766886393229167, + "grad_norm": 16.111614227294922, + "learning_rate": 9.784878573975607e-06, + "loss": 5.0826, + "step": 18515 + }, + { + "epoch": 0.3767903645833333, + "grad_norm": 17.7734432220459, + "learning_rate": 9.7847625850735e-06, + "loss": 5.0834, + "step": 18520 + }, + { + "epoch": 0.37689208984375, + "grad_norm": 15.257444381713867, + "learning_rate": 9.784646565598277e-06, + "loss": 5.3027, + "step": 18525 + }, + { + "epoch": 0.3769938151041667, + "grad_norm": 17.654115676879883, + "learning_rate": 9.78453051555068e-06, + "loss": 5.2645, + "step": 18530 + }, + { + "epoch": 0.3770955403645833, + "grad_norm": 14.292893409729004, + "learning_rate": 9.784414434931455e-06, + "loss": 5.3626, + "step": 18535 + }, + { + "epoch": 0.377197265625, + "grad_norm": 12.667604446411133, + "learning_rate": 9.784298323741337e-06, + "loss": 5.2669, + "step": 18540 + }, + { + "epoch": 0.3772989908854167, + "grad_norm": 18.61713409423828, + "learning_rate": 9.784182181981074e-06, + "loss": 5.427, + "step": 18545 + }, + { + "epoch": 0.3774007161458333, + "grad_norm": 15.400463104248047, + "learning_rate": 9.784066009651405e-06, + "loss": 5.0553, + "step": 18550 + }, + { + "epoch": 0.37750244140625, + "grad_norm": 19.474973678588867, + "learning_rate": 9.783949806753072e-06, + "loss": 4.9484, + "step": 18555 + }, + { + "epoch": 0.3776041666666667, + "grad_norm": 17.18440818786621, + "learning_rate": 9.783833573286821e-06, + "loss": 5.1644, + "step": 18560 + }, + { + "epoch": 0.3777058919270833, + "grad_norm": 21.212474822998047, + "learning_rate": 9.783717309253388e-06, + "loss": 5.2703, + "step": 18565 + }, + { + "epoch": 0.3778076171875, + "grad_norm": 20.884096145629883, + "learning_rate": 9.783601014653523e-06, + "loss": 5.3178, + "step": 18570 + }, + { + "epoch": 0.3779093424479167, + "grad_norm": 16.2794246673584, + "learning_rate": 9.783484689487965e-06, + "loss": 5.2138, + "step": 18575 + }, + { + "epoch": 0.3780110677083333, + "grad_norm": 24.92637825012207, + "learning_rate": 9.783368333757459e-06, + "loss": 5.1816, + "step": 18580 + }, + { + "epoch": 0.37811279296875, + "grad_norm": 18.692358016967773, + "learning_rate": 9.783251947462748e-06, + "loss": 5.3302, + "step": 18585 + }, + { + "epoch": 0.3782145182291667, + "grad_norm": 23.751707077026367, + "learning_rate": 9.783135530604574e-06, + "loss": 5.4603, + "step": 18590 + }, + { + "epoch": 0.3783162434895833, + "grad_norm": 17.386503219604492, + "learning_rate": 9.783019083183683e-06, + "loss": 5.3178, + "step": 18595 + }, + { + "epoch": 0.37841796875, + "grad_norm": 15.828874588012695, + "learning_rate": 9.782902605200819e-06, + "loss": 5.2678, + "step": 18600 + }, + { + "epoch": 0.3785196940104167, + "grad_norm": 12.214749336242676, + "learning_rate": 9.782786096656726e-06, + "loss": 5.2439, + "step": 18605 + }, + { + "epoch": 0.3786214192708333, + "grad_norm": 18.88268280029297, + "learning_rate": 9.782669557552147e-06, + "loss": 5.0844, + "step": 18610 + }, + { + "epoch": 0.37872314453125, + "grad_norm": 15.326437950134277, + "learning_rate": 9.782552987887829e-06, + "loss": 5.3274, + "step": 18615 + }, + { + "epoch": 0.3788248697916667, + "grad_norm": 16.981260299682617, + "learning_rate": 9.782436387664513e-06, + "loss": 5.2131, + "step": 18620 + }, + { + "epoch": 0.3789265950520833, + "grad_norm": 16.143848419189453, + "learning_rate": 9.78231975688295e-06, + "loss": 5.0874, + "step": 18625 + }, + { + "epoch": 0.3790283203125, + "grad_norm": 23.404891967773438, + "learning_rate": 9.782203095543879e-06, + "loss": 5.2342, + "step": 18630 + }, + { + "epoch": 0.3791300455729167, + "grad_norm": 18.38137435913086, + "learning_rate": 9.78208640364805e-06, + "loss": 5.2089, + "step": 18635 + }, + { + "epoch": 0.3792317708333333, + "grad_norm": 14.703808784484863, + "learning_rate": 9.781969681196206e-06, + "loss": 5.2248, + "step": 18640 + }, + { + "epoch": 0.37933349609375, + "grad_norm": 18.699783325195312, + "learning_rate": 9.781852928189092e-06, + "loss": 5.1252, + "step": 18645 + }, + { + "epoch": 0.3794352213541667, + "grad_norm": 20.295644760131836, + "learning_rate": 9.781736144627459e-06, + "loss": 5.2543, + "step": 18650 + }, + { + "epoch": 0.3795369466145833, + "grad_norm": 17.302600860595703, + "learning_rate": 9.78161933051205e-06, + "loss": 5.3598, + "step": 18655 + }, + { + "epoch": 0.379638671875, + "grad_norm": 13.56551456451416, + "learning_rate": 9.781502485843609e-06, + "loss": 4.8905, + "step": 18660 + }, + { + "epoch": 0.3797403971354167, + "grad_norm": 17.83414077758789, + "learning_rate": 9.781385610622887e-06, + "loss": 4.9909, + "step": 18665 + }, + { + "epoch": 0.3798421223958333, + "grad_norm": 21.135229110717773, + "learning_rate": 9.781268704850627e-06, + "loss": 5.0767, + "step": 18670 + }, + { + "epoch": 0.37994384765625, + "grad_norm": 14.718531608581543, + "learning_rate": 9.781151768527579e-06, + "loss": 5.1206, + "step": 18675 + }, + { + "epoch": 0.3800455729166667, + "grad_norm": 15.499728202819824, + "learning_rate": 9.781034801654488e-06, + "loss": 5.1017, + "step": 18680 + }, + { + "epoch": 0.3801472981770833, + "grad_norm": 15.525670051574707, + "learning_rate": 9.780917804232104e-06, + "loss": 5.4062, + "step": 18685 + }, + { + "epoch": 0.3802490234375, + "grad_norm": 14.848773956298828, + "learning_rate": 9.78080077626117e-06, + "loss": 5.0819, + "step": 18690 + }, + { + "epoch": 0.3803507486979167, + "grad_norm": 14.903079986572266, + "learning_rate": 9.780683717742439e-06, + "loss": 5.1535, + "step": 18695 + }, + { + "epoch": 0.3804524739583333, + "grad_norm": 17.775043487548828, + "learning_rate": 9.780566628676657e-06, + "loss": 4.9537, + "step": 18700 + }, + { + "epoch": 0.38055419921875, + "grad_norm": 15.850993156433105, + "learning_rate": 9.780449509064571e-06, + "loss": 5.1913, + "step": 18705 + }, + { + "epoch": 0.3806559244791667, + "grad_norm": 16.639890670776367, + "learning_rate": 9.78033235890693e-06, + "loss": 5.0689, + "step": 18710 + }, + { + "epoch": 0.3807576497395833, + "grad_norm": 16.194517135620117, + "learning_rate": 9.780215178204481e-06, + "loss": 5.2323, + "step": 18715 + }, + { + "epoch": 0.380859375, + "grad_norm": 23.680307388305664, + "learning_rate": 9.780097966957978e-06, + "loss": 5.2555, + "step": 18720 + }, + { + "epoch": 0.3809611002604167, + "grad_norm": 18.230072021484375, + "learning_rate": 9.779980725168164e-06, + "loss": 5.1235, + "step": 18725 + }, + { + "epoch": 0.3810628255208333, + "grad_norm": 23.414905548095703, + "learning_rate": 9.779863452835792e-06, + "loss": 5.5147, + "step": 18730 + }, + { + "epoch": 0.38116455078125, + "grad_norm": 15.261309623718262, + "learning_rate": 9.779746149961608e-06, + "loss": 5.3633, + "step": 18735 + }, + { + "epoch": 0.3812662760416667, + "grad_norm": 20.500789642333984, + "learning_rate": 9.779628816546365e-06, + "loss": 5.1316, + "step": 18740 + }, + { + "epoch": 0.3813680013020833, + "grad_norm": 15.886900901794434, + "learning_rate": 9.77951145259081e-06, + "loss": 5.5409, + "step": 18745 + }, + { + "epoch": 0.3814697265625, + "grad_norm": 16.150388717651367, + "learning_rate": 9.779394058095695e-06, + "loss": 5.217, + "step": 18750 + }, + { + "epoch": 0.3815714518229167, + "grad_norm": 15.216958999633789, + "learning_rate": 9.77927663306177e-06, + "loss": 5.3048, + "step": 18755 + }, + { + "epoch": 0.3816731770833333, + "grad_norm": 16.104265213012695, + "learning_rate": 9.779159177489781e-06, + "loss": 5.0887, + "step": 18760 + }, + { + "epoch": 0.38177490234375, + "grad_norm": 16.682254791259766, + "learning_rate": 9.779041691380483e-06, + "loss": 4.85, + "step": 18765 + }, + { + "epoch": 0.3818766276041667, + "grad_norm": 25.98978042602539, + "learning_rate": 9.77892417473463e-06, + "loss": 5.3445, + "step": 18770 + }, + { + "epoch": 0.3819783528645833, + "grad_norm": 15.299857139587402, + "learning_rate": 9.778806627552963e-06, + "loss": 5.1802, + "step": 18775 + }, + { + "epoch": 0.382080078125, + "grad_norm": 18.242080688476562, + "learning_rate": 9.778689049836242e-06, + "loss": 5.1041, + "step": 18780 + }, + { + "epoch": 0.3821818033854167, + "grad_norm": 20.10264015197754, + "learning_rate": 9.778571441585213e-06, + "loss": 5.3996, + "step": 18785 + }, + { + "epoch": 0.3822835286458333, + "grad_norm": 19.318470001220703, + "learning_rate": 9.778453802800632e-06, + "loss": 5.4625, + "step": 18790 + }, + { + "epoch": 0.38238525390625, + "grad_norm": 16.275657653808594, + "learning_rate": 9.778336133483246e-06, + "loss": 5.3353, + "step": 18795 + }, + { + "epoch": 0.3824869791666667, + "grad_norm": 19.292451858520508, + "learning_rate": 9.77821843363381e-06, + "loss": 5.2017, + "step": 18800 + }, + { + "epoch": 0.3825887044270833, + "grad_norm": 16.553997039794922, + "learning_rate": 9.778100703253074e-06, + "loss": 5.2116, + "step": 18805 + }, + { + "epoch": 0.3826904296875, + "grad_norm": 14.6026029586792, + "learning_rate": 9.777982942341792e-06, + "loss": 5.3329, + "step": 18810 + }, + { + "epoch": 0.3827921549479167, + "grad_norm": 26.692506790161133, + "learning_rate": 9.777865150900715e-06, + "loss": 5.3522, + "step": 18815 + }, + { + "epoch": 0.3828938802083333, + "grad_norm": 21.88552474975586, + "learning_rate": 9.777747328930596e-06, + "loss": 5.5171, + "step": 18820 + }, + { + "epoch": 0.38299560546875, + "grad_norm": 16.624109268188477, + "learning_rate": 9.77762947643219e-06, + "loss": 5.0774, + "step": 18825 + }, + { + "epoch": 0.3830973307291667, + "grad_norm": 23.87824058532715, + "learning_rate": 9.777511593406247e-06, + "loss": 5.295, + "step": 18830 + }, + { + "epoch": 0.3831990559895833, + "grad_norm": 19.49684715270996, + "learning_rate": 9.777393679853522e-06, + "loss": 5.205, + "step": 18835 + }, + { + "epoch": 0.38330078125, + "grad_norm": 22.896677017211914, + "learning_rate": 9.777275735774767e-06, + "loss": 5.1033, + "step": 18840 + }, + { + "epoch": 0.3834025065104167, + "grad_norm": 25.088565826416016, + "learning_rate": 9.777157761170738e-06, + "loss": 5.2574, + "step": 18845 + }, + { + "epoch": 0.3835042317708333, + "grad_norm": 12.943825721740723, + "learning_rate": 9.777039756042186e-06, + "loss": 5.2324, + "step": 18850 + }, + { + "epoch": 0.38360595703125, + "grad_norm": 17.440031051635742, + "learning_rate": 9.776921720389868e-06, + "loss": 5.4205, + "step": 18855 + }, + { + "epoch": 0.3837076822916667, + "grad_norm": 14.313323020935059, + "learning_rate": 9.776803654214536e-06, + "loss": 5.3447, + "step": 18860 + }, + { + "epoch": 0.3838094075520833, + "grad_norm": 13.537360191345215, + "learning_rate": 9.776685557516945e-06, + "loss": 5.0436, + "step": 18865 + }, + { + "epoch": 0.3839111328125, + "grad_norm": 16.681447982788086, + "learning_rate": 9.776567430297848e-06, + "loss": 5.3664, + "step": 18870 + }, + { + "epoch": 0.3840128580729167, + "grad_norm": 17.085430145263672, + "learning_rate": 9.776449272558003e-06, + "loss": 5.3974, + "step": 18875 + }, + { + "epoch": 0.3841145833333333, + "grad_norm": 18.989978790283203, + "learning_rate": 9.776331084298164e-06, + "loss": 4.9891, + "step": 18880 + }, + { + "epoch": 0.38421630859375, + "grad_norm": 14.71794319152832, + "learning_rate": 9.776212865519083e-06, + "loss": 5.1302, + "step": 18885 + }, + { + "epoch": 0.3843180338541667, + "grad_norm": 19.002771377563477, + "learning_rate": 9.77609461622152e-06, + "loss": 5.3634, + "step": 18890 + }, + { + "epoch": 0.3844197591145833, + "grad_norm": 14.583451271057129, + "learning_rate": 9.775976336406228e-06, + "loss": 5.5949, + "step": 18895 + }, + { + "epoch": 0.384521484375, + "grad_norm": 15.028233528137207, + "learning_rate": 9.775858026073964e-06, + "loss": 5.354, + "step": 18900 + }, + { + "epoch": 0.3846232096354167, + "grad_norm": 15.309066772460938, + "learning_rate": 9.775739685225482e-06, + "loss": 5.5533, + "step": 18905 + }, + { + "epoch": 0.3847249348958333, + "grad_norm": 17.071090698242188, + "learning_rate": 9.77562131386154e-06, + "loss": 5.0419, + "step": 18910 + }, + { + "epoch": 0.38482666015625, + "grad_norm": 18.90648078918457, + "learning_rate": 9.775502911982895e-06, + "loss": 5.509, + "step": 18915 + }, + { + "epoch": 0.3849283854166667, + "grad_norm": 17.369308471679688, + "learning_rate": 9.775384479590301e-06, + "loss": 5.5872, + "step": 18920 + }, + { + "epoch": 0.3850301106770833, + "grad_norm": 16.645523071289062, + "learning_rate": 9.775266016684518e-06, + "loss": 5.5324, + "step": 18925 + }, + { + "epoch": 0.3851318359375, + "grad_norm": 15.484994888305664, + "learning_rate": 9.7751475232663e-06, + "loss": 5.292, + "step": 18930 + }, + { + "epoch": 0.3852335611979167, + "grad_norm": 18.131061553955078, + "learning_rate": 9.775028999336404e-06, + "loss": 5.4078, + "step": 18935 + }, + { + "epoch": 0.3853352864583333, + "grad_norm": 18.696290969848633, + "learning_rate": 9.77491044489559e-06, + "loss": 5.1577, + "step": 18940 + }, + { + "epoch": 0.38543701171875, + "grad_norm": 13.213574409484863, + "learning_rate": 9.774791859944613e-06, + "loss": 5.1823, + "step": 18945 + }, + { + "epoch": 0.3855387369791667, + "grad_norm": 23.262048721313477, + "learning_rate": 9.774673244484231e-06, + "loss": 5.4268, + "step": 18950 + }, + { + "epoch": 0.3856404622395833, + "grad_norm": 17.461532592773438, + "learning_rate": 9.774554598515202e-06, + "loss": 5.2454, + "step": 18955 + }, + { + "epoch": 0.3857421875, + "grad_norm": 15.392217636108398, + "learning_rate": 9.774435922038287e-06, + "loss": 5.2642, + "step": 18960 + }, + { + "epoch": 0.3858439127604167, + "grad_norm": 15.941062927246094, + "learning_rate": 9.774317215054242e-06, + "loss": 5.3342, + "step": 18965 + }, + { + "epoch": 0.3859456380208333, + "grad_norm": 17.5860595703125, + "learning_rate": 9.774198477563825e-06, + "loss": 5.2301, + "step": 18970 + }, + { + "epoch": 0.38604736328125, + "grad_norm": 16.313058853149414, + "learning_rate": 9.774079709567793e-06, + "loss": 5.0446, + "step": 18975 + }, + { + "epoch": 0.3861490885416667, + "grad_norm": 18.29085350036621, + "learning_rate": 9.77396091106691e-06, + "loss": 5.3148, + "step": 18980 + }, + { + "epoch": 0.3862508138020833, + "grad_norm": 20.928504943847656, + "learning_rate": 9.773842082061934e-06, + "loss": 5.3299, + "step": 18985 + }, + { + "epoch": 0.3863525390625, + "grad_norm": 15.716146469116211, + "learning_rate": 9.77372322255362e-06, + "loss": 5.3963, + "step": 18990 + }, + { + "epoch": 0.3864542643229167, + "grad_norm": 15.235368728637695, + "learning_rate": 9.77360433254273e-06, + "loss": 5.1474, + "step": 18995 + }, + { + "epoch": 0.3865559895833333, + "grad_norm": 18.634254455566406, + "learning_rate": 9.773485412030022e-06, + "loss": 5.1368, + "step": 19000 + }, + { + "epoch": 0.38665771484375, + "grad_norm": 24.561479568481445, + "learning_rate": 9.773366461016261e-06, + "loss": 5.24, + "step": 19005 + }, + { + "epoch": 0.3867594401041667, + "grad_norm": 19.2109432220459, + "learning_rate": 9.7732474795022e-06, + "loss": 5.3756, + "step": 19010 + }, + { + "epoch": 0.3868611653645833, + "grad_norm": 24.049455642700195, + "learning_rate": 9.773128467488606e-06, + "loss": 4.9743, + "step": 19015 + }, + { + "epoch": 0.386962890625, + "grad_norm": 16.790233612060547, + "learning_rate": 9.773009424976234e-06, + "loss": 5.563, + "step": 19020 + }, + { + "epoch": 0.3870646158854167, + "grad_norm": 17.652650833129883, + "learning_rate": 9.772890351965849e-06, + "loss": 5.2967, + "step": 19025 + }, + { + "epoch": 0.3871663411458333, + "grad_norm": 17.408039093017578, + "learning_rate": 9.772771248458209e-06, + "loss": 5.2298, + "step": 19030 + }, + { + "epoch": 0.38726806640625, + "grad_norm": 16.355714797973633, + "learning_rate": 9.772652114454076e-06, + "loss": 5.3622, + "step": 19035 + }, + { + "epoch": 0.3873697916666667, + "grad_norm": 17.616283416748047, + "learning_rate": 9.77253294995421e-06, + "loss": 5.1256, + "step": 19040 + }, + { + "epoch": 0.3874715169270833, + "grad_norm": 20.308095932006836, + "learning_rate": 9.772413754959374e-06, + "loss": 5.1924, + "step": 19045 + }, + { + "epoch": 0.3875732421875, + "grad_norm": 16.356138229370117, + "learning_rate": 9.772294529470327e-06, + "loss": 5.2578, + "step": 19050 + }, + { + "epoch": 0.3876749674479167, + "grad_norm": 19.877561569213867, + "learning_rate": 9.772175273487836e-06, + "loss": 4.979, + "step": 19055 + }, + { + "epoch": 0.3877766927083333, + "grad_norm": 21.084232330322266, + "learning_rate": 9.772055987012658e-06, + "loss": 5.1102, + "step": 19060 + }, + { + "epoch": 0.38787841796875, + "grad_norm": 16.907468795776367, + "learning_rate": 9.77193667004556e-06, + "loss": 5.2077, + "step": 19065 + }, + { + "epoch": 0.3879801432291667, + "grad_norm": 16.971298217773438, + "learning_rate": 9.771817322587299e-06, + "loss": 5.1644, + "step": 19070 + }, + { + "epoch": 0.3880818684895833, + "grad_norm": 12.456055641174316, + "learning_rate": 9.771697944638641e-06, + "loss": 5.1033, + "step": 19075 + }, + { + "epoch": 0.38818359375, + "grad_norm": 18.70301628112793, + "learning_rate": 9.771578536200346e-06, + "loss": 4.9454, + "step": 19080 + }, + { + "epoch": 0.3882853190104167, + "grad_norm": 22.494457244873047, + "learning_rate": 9.77145909727318e-06, + "loss": 5.2393, + "step": 19085 + }, + { + "epoch": 0.3883870442708333, + "grad_norm": 16.693126678466797, + "learning_rate": 9.771339627857906e-06, + "loss": 5.2967, + "step": 19090 + }, + { + "epoch": 0.38848876953125, + "grad_norm": 18.881572723388672, + "learning_rate": 9.771220127955286e-06, + "loss": 4.826, + "step": 19095 + }, + { + "epoch": 0.3885904947916667, + "grad_norm": 17.399015426635742, + "learning_rate": 9.771100597566083e-06, + "loss": 5.2564, + "step": 19100 + }, + { + "epoch": 0.3886922200520833, + "grad_norm": 18.547700881958008, + "learning_rate": 9.770981036691063e-06, + "loss": 5.4452, + "step": 19105 + }, + { + "epoch": 0.3887939453125, + "grad_norm": 14.000970840454102, + "learning_rate": 9.770861445330989e-06, + "loss": 5.4305, + "step": 19110 + }, + { + "epoch": 0.3888956705729167, + "grad_norm": 17.378116607666016, + "learning_rate": 9.770741823486622e-06, + "loss": 5.2851, + "step": 19115 + }, + { + "epoch": 0.3889973958333333, + "grad_norm": 14.484907150268555, + "learning_rate": 9.77062217115873e-06, + "loss": 4.9243, + "step": 19120 + }, + { + "epoch": 0.38909912109375, + "grad_norm": 15.792591094970703, + "learning_rate": 9.770502488348078e-06, + "loss": 5.0405, + "step": 19125 + }, + { + "epoch": 0.3892008463541667, + "grad_norm": 12.510726928710938, + "learning_rate": 9.770382775055431e-06, + "loss": 5.1396, + "step": 19130 + }, + { + "epoch": 0.3893025716145833, + "grad_norm": 16.738182067871094, + "learning_rate": 9.770263031281551e-06, + "loss": 5.3215, + "step": 19135 + }, + { + "epoch": 0.389404296875, + "grad_norm": 20.22895050048828, + "learning_rate": 9.770143257027204e-06, + "loss": 5.2873, + "step": 19140 + }, + { + "epoch": 0.3895060221354167, + "grad_norm": 20.559749603271484, + "learning_rate": 9.770023452293155e-06, + "loss": 5.0835, + "step": 19145 + }, + { + "epoch": 0.3896077473958333, + "grad_norm": 18.49563980102539, + "learning_rate": 9.769903617080173e-06, + "loss": 5.1244, + "step": 19150 + }, + { + "epoch": 0.38970947265625, + "grad_norm": 16.607271194458008, + "learning_rate": 9.769783751389019e-06, + "loss": 5.3838, + "step": 19155 + }, + { + "epoch": 0.3898111979166667, + "grad_norm": 16.67616081237793, + "learning_rate": 9.76966385522046e-06, + "loss": 5.4169, + "step": 19160 + }, + { + "epoch": 0.3899129231770833, + "grad_norm": 17.35544204711914, + "learning_rate": 9.769543928575265e-06, + "loss": 5.3052, + "step": 19165 + }, + { + "epoch": 0.3900146484375, + "grad_norm": 15.811360359191895, + "learning_rate": 9.769423971454198e-06, + "loss": 4.9429, + "step": 19170 + }, + { + "epoch": 0.3901163736979167, + "grad_norm": 21.72793197631836, + "learning_rate": 9.769303983858024e-06, + "loss": 5.0818, + "step": 19175 + }, + { + "epoch": 0.3902180989583333, + "grad_norm": 19.9454402923584, + "learning_rate": 9.769183965787515e-06, + "loss": 5.3859, + "step": 19180 + }, + { + "epoch": 0.39031982421875, + "grad_norm": 16.48603057861328, + "learning_rate": 9.769063917243432e-06, + "loss": 5.5064, + "step": 19185 + }, + { + "epoch": 0.3904215494791667, + "grad_norm": 13.523905754089355, + "learning_rate": 9.768943838226545e-06, + "loss": 5.2176, + "step": 19190 + }, + { + "epoch": 0.3905232747395833, + "grad_norm": 13.95773696899414, + "learning_rate": 9.76882372873762e-06, + "loss": 5.2538, + "step": 19195 + }, + { + "epoch": 0.390625, + "grad_norm": 18.7205753326416, + "learning_rate": 9.768703588777426e-06, + "loss": 5.2696, + "step": 19200 + }, + { + "epoch": 0.3907267252604167, + "grad_norm": 19.818635940551758, + "learning_rate": 9.76858341834673e-06, + "loss": 5.3902, + "step": 19205 + }, + { + "epoch": 0.3908284505208333, + "grad_norm": 17.321386337280273, + "learning_rate": 9.7684632174463e-06, + "loss": 5.2207, + "step": 19210 + }, + { + "epoch": 0.39093017578125, + "grad_norm": 14.12020492553711, + "learning_rate": 9.768342986076905e-06, + "loss": 5.2787, + "step": 19215 + }, + { + "epoch": 0.3910319010416667, + "grad_norm": 13.725682258605957, + "learning_rate": 9.768222724239309e-06, + "loss": 4.8948, + "step": 19220 + }, + { + "epoch": 0.3911336263020833, + "grad_norm": 18.50550651550293, + "learning_rate": 9.768102431934285e-06, + "loss": 5.3526, + "step": 19225 + }, + { + "epoch": 0.3912353515625, + "grad_norm": 19.342391967773438, + "learning_rate": 9.767982109162599e-06, + "loss": 5.2738, + "step": 19230 + }, + { + "epoch": 0.3913370768229167, + "grad_norm": 14.730395317077637, + "learning_rate": 9.767861755925021e-06, + "loss": 5.3136, + "step": 19235 + }, + { + "epoch": 0.3914388020833333, + "grad_norm": 14.3853178024292, + "learning_rate": 9.767741372222322e-06, + "loss": 5.2426, + "step": 19240 + }, + { + "epoch": 0.39154052734375, + "grad_norm": 15.95604419708252, + "learning_rate": 9.767620958055268e-06, + "loss": 5.0942, + "step": 19245 + }, + { + "epoch": 0.3916422526041667, + "grad_norm": 14.256505966186523, + "learning_rate": 9.767500513424627e-06, + "loss": 5.007, + "step": 19250 + }, + { + "epoch": 0.3917439778645833, + "grad_norm": 22.070680618286133, + "learning_rate": 9.767380038331176e-06, + "loss": 5.2004, + "step": 19255 + }, + { + "epoch": 0.391845703125, + "grad_norm": 16.163551330566406, + "learning_rate": 9.767259532775677e-06, + "loss": 5.2362, + "step": 19260 + }, + { + "epoch": 0.3919474283854167, + "grad_norm": 13.345327377319336, + "learning_rate": 9.767138996758903e-06, + "loss": 5.221, + "step": 19265 + }, + { + "epoch": 0.3920491536458333, + "grad_norm": 17.39878273010254, + "learning_rate": 9.767018430281623e-06, + "loss": 5.1802, + "step": 19270 + }, + { + "epoch": 0.39215087890625, + "grad_norm": 18.545475006103516, + "learning_rate": 9.76689783334461e-06, + "loss": 5.1078, + "step": 19275 + }, + { + "epoch": 0.3922526041666667, + "grad_norm": 21.44965171813965, + "learning_rate": 9.766777205948632e-06, + "loss": 5.0807, + "step": 19280 + }, + { + "epoch": 0.3923543294270833, + "grad_norm": 17.046171188354492, + "learning_rate": 9.766656548094463e-06, + "loss": 5.0367, + "step": 19285 + }, + { + "epoch": 0.3924560546875, + "grad_norm": 18.276012420654297, + "learning_rate": 9.766535859782871e-06, + "loss": 5.1873, + "step": 19290 + }, + { + "epoch": 0.3925577799479167, + "grad_norm": 16.565088272094727, + "learning_rate": 9.766415141014627e-06, + "loss": 5.25, + "step": 19295 + }, + { + "epoch": 0.3926595052083333, + "grad_norm": 22.175140380859375, + "learning_rate": 9.766294391790502e-06, + "loss": 5.0689, + "step": 19300 + }, + { + "epoch": 0.39276123046875, + "grad_norm": 15.509163856506348, + "learning_rate": 9.766173612111272e-06, + "loss": 5.1319, + "step": 19305 + }, + { + "epoch": 0.3928629557291667, + "grad_norm": 15.915925979614258, + "learning_rate": 9.766052801977704e-06, + "loss": 5.3874, + "step": 19310 + }, + { + "epoch": 0.3929646809895833, + "grad_norm": 19.131532669067383, + "learning_rate": 9.765931961390571e-06, + "loss": 5.2757, + "step": 19315 + }, + { + "epoch": 0.39306640625, + "grad_norm": 15.122759819030762, + "learning_rate": 9.765811090350647e-06, + "loss": 5.072, + "step": 19320 + }, + { + "epoch": 0.3931681315104167, + "grad_norm": 16.179506301879883, + "learning_rate": 9.765690188858702e-06, + "loss": 5.0275, + "step": 19325 + }, + { + "epoch": 0.3932698567708333, + "grad_norm": 22.398191452026367, + "learning_rate": 9.76556925691551e-06, + "loss": 5.6523, + "step": 19330 + }, + { + "epoch": 0.39337158203125, + "grad_norm": 18.96259307861328, + "learning_rate": 9.765448294521844e-06, + "loss": 5.207, + "step": 19335 + }, + { + "epoch": 0.3934733072916667, + "grad_norm": 17.372295379638672, + "learning_rate": 9.765327301678475e-06, + "loss": 5.1657, + "step": 19340 + }, + { + "epoch": 0.3935750325520833, + "grad_norm": 14.971473693847656, + "learning_rate": 9.765206278386178e-06, + "loss": 5.3484, + "step": 19345 + }, + { + "epoch": 0.3936767578125, + "grad_norm": 18.892810821533203, + "learning_rate": 9.765085224645724e-06, + "loss": 5.0485, + "step": 19350 + }, + { + "epoch": 0.3937784830729167, + "grad_norm": 16.903520584106445, + "learning_rate": 9.76496414045789e-06, + "loss": 5.0837, + "step": 19355 + }, + { + "epoch": 0.3938802083333333, + "grad_norm": 15.140774726867676, + "learning_rate": 9.764843025823447e-06, + "loss": 5.1531, + "step": 19360 + }, + { + "epoch": 0.39398193359375, + "grad_norm": 18.444782257080078, + "learning_rate": 9.76472188074317e-06, + "loss": 4.9914, + "step": 19365 + }, + { + "epoch": 0.3940836588541667, + "grad_norm": 16.962614059448242, + "learning_rate": 9.764600705217832e-06, + "loss": 5.1027, + "step": 19370 + }, + { + "epoch": 0.3941853841145833, + "grad_norm": 18.33901596069336, + "learning_rate": 9.76447949924821e-06, + "loss": 5.1202, + "step": 19375 + }, + { + "epoch": 0.394287109375, + "grad_norm": 13.535527229309082, + "learning_rate": 9.764358262835073e-06, + "loss": 5.1535, + "step": 19380 + }, + { + "epoch": 0.3943888346354167, + "grad_norm": 18.38083267211914, + "learning_rate": 9.764236995979202e-06, + "loss": 5.5026, + "step": 19385 + }, + { + "epoch": 0.3944905598958333, + "grad_norm": 17.951618194580078, + "learning_rate": 9.764115698681367e-06, + "loss": 5.3521, + "step": 19390 + }, + { + "epoch": 0.39459228515625, + "grad_norm": 16.85857391357422, + "learning_rate": 9.763994370942345e-06, + "loss": 5.244, + "step": 19395 + }, + { + "epoch": 0.3946940104166667, + "grad_norm": 14.505989074707031, + "learning_rate": 9.763873012762912e-06, + "loss": 5.1137, + "step": 19400 + }, + { + "epoch": 0.3947957356770833, + "grad_norm": 13.80378246307373, + "learning_rate": 9.763751624143844e-06, + "loss": 5.18, + "step": 19405 + }, + { + "epoch": 0.3948974609375, + "grad_norm": 13.850140571594238, + "learning_rate": 9.763630205085914e-06, + "loss": 5.2164, + "step": 19410 + }, + { + "epoch": 0.3949991861979167, + "grad_norm": 15.19741153717041, + "learning_rate": 9.763508755589899e-06, + "loss": 5.1469, + "step": 19415 + }, + { + "epoch": 0.3951009114583333, + "grad_norm": 21.269704818725586, + "learning_rate": 9.763387275656574e-06, + "loss": 5.1486, + "step": 19420 + }, + { + "epoch": 0.39520263671875, + "grad_norm": 17.41317367553711, + "learning_rate": 9.76326576528672e-06, + "loss": 5.2744, + "step": 19425 + }, + { + "epoch": 0.3953043619791667, + "grad_norm": 20.2772159576416, + "learning_rate": 9.763144224481107e-06, + "loss": 5.3541, + "step": 19430 + }, + { + "epoch": 0.3954060872395833, + "grad_norm": 13.712509155273438, + "learning_rate": 9.763022653240514e-06, + "loss": 5.1846, + "step": 19435 + }, + { + "epoch": 0.3955078125, + "grad_norm": 17.132980346679688, + "learning_rate": 9.762901051565721e-06, + "loss": 5.3075, + "step": 19440 + }, + { + "epoch": 0.3956095377604167, + "grad_norm": 18.80116844177246, + "learning_rate": 9.7627794194575e-06, + "loss": 5.1238, + "step": 19445 + }, + { + "epoch": 0.3957112630208333, + "grad_norm": 14.895691871643066, + "learning_rate": 9.762657756916632e-06, + "loss": 5.2259, + "step": 19450 + }, + { + "epoch": 0.39581298828125, + "grad_norm": 16.227333068847656, + "learning_rate": 9.762536063943891e-06, + "loss": 5.2173, + "step": 19455 + }, + { + "epoch": 0.3959147135416667, + "grad_norm": 16.109474182128906, + "learning_rate": 9.762414340540057e-06, + "loss": 5.0876, + "step": 19460 + }, + { + "epoch": 0.3960164388020833, + "grad_norm": 12.6137056350708, + "learning_rate": 9.762292586705908e-06, + "loss": 5.3116, + "step": 19465 + }, + { + "epoch": 0.3961181640625, + "grad_norm": 16.498485565185547, + "learning_rate": 9.762170802442222e-06, + "loss": 5.3447, + "step": 19470 + }, + { + "epoch": 0.3962198893229167, + "grad_norm": 16.217235565185547, + "learning_rate": 9.762048987749772e-06, + "loss": 5.1001, + "step": 19475 + }, + { + "epoch": 0.3963216145833333, + "grad_norm": 20.02046775817871, + "learning_rate": 9.761927142629345e-06, + "loss": 5.1282, + "step": 19480 + }, + { + "epoch": 0.39642333984375, + "grad_norm": 14.843721389770508, + "learning_rate": 9.761805267081713e-06, + "loss": 5.4043, + "step": 19485 + }, + { + "epoch": 0.3965250651041667, + "grad_norm": 13.809285163879395, + "learning_rate": 9.761683361107657e-06, + "loss": 5.1321, + "step": 19490 + }, + { + "epoch": 0.3966267903645833, + "grad_norm": 16.115219116210938, + "learning_rate": 9.761561424707957e-06, + "loss": 5.2177, + "step": 19495 + }, + { + "epoch": 0.396728515625, + "grad_norm": 19.086660385131836, + "learning_rate": 9.76143945788339e-06, + "loss": 5.2841, + "step": 19500 + }, + { + "epoch": 0.3968302408854167, + "grad_norm": 20.741928100585938, + "learning_rate": 9.761317460634739e-06, + "loss": 5.2475, + "step": 19505 + }, + { + "epoch": 0.3969319661458333, + "grad_norm": 16.614953994750977, + "learning_rate": 9.761195432962778e-06, + "loss": 5.2555, + "step": 19510 + }, + { + "epoch": 0.39703369140625, + "grad_norm": 17.50452423095703, + "learning_rate": 9.76107337486829e-06, + "loss": 5.2418, + "step": 19515 + }, + { + "epoch": 0.3971354166666667, + "grad_norm": 24.905982971191406, + "learning_rate": 9.760951286352054e-06, + "loss": 5.1143, + "step": 19520 + }, + { + "epoch": 0.3972371419270833, + "grad_norm": 14.542245864868164, + "learning_rate": 9.760829167414852e-06, + "loss": 5.1707, + "step": 19525 + }, + { + "epoch": 0.3973388671875, + "grad_norm": 16.110279083251953, + "learning_rate": 9.760707018057462e-06, + "loss": 5.1815, + "step": 19530 + }, + { + "epoch": 0.3974405924479167, + "grad_norm": 13.509376525878906, + "learning_rate": 9.760584838280667e-06, + "loss": 5.1771, + "step": 19535 + }, + { + "epoch": 0.3975423177083333, + "grad_norm": 21.301362991333008, + "learning_rate": 9.760462628085244e-06, + "loss": 5.244, + "step": 19540 + }, + { + "epoch": 0.39764404296875, + "grad_norm": 13.786454200744629, + "learning_rate": 9.760340387471976e-06, + "loss": 5.2674, + "step": 19545 + }, + { + "epoch": 0.3977457682291667, + "grad_norm": 18.070066452026367, + "learning_rate": 9.760218116441646e-06, + "loss": 5.0543, + "step": 19550 + }, + { + "epoch": 0.3978474934895833, + "grad_norm": 19.545713424682617, + "learning_rate": 9.760095814995032e-06, + "loss": 5.2002, + "step": 19555 + }, + { + "epoch": 0.39794921875, + "grad_norm": 12.43732738494873, + "learning_rate": 9.759973483132917e-06, + "loss": 5.4288, + "step": 19560 + }, + { + "epoch": 0.3980509440104167, + "grad_norm": 18.710678100585938, + "learning_rate": 9.759851120856083e-06, + "loss": 5.2089, + "step": 19565 + }, + { + "epoch": 0.3981526692708333, + "grad_norm": 16.523887634277344, + "learning_rate": 9.75972872816531e-06, + "loss": 4.9561, + "step": 19570 + }, + { + "epoch": 0.39825439453125, + "grad_norm": 16.836376190185547, + "learning_rate": 9.759606305061383e-06, + "loss": 5.1198, + "step": 19575 + }, + { + "epoch": 0.3983561197916667, + "grad_norm": 16.69854736328125, + "learning_rate": 9.75948385154508e-06, + "loss": 5.1996, + "step": 19580 + }, + { + "epoch": 0.3984578450520833, + "grad_norm": 12.800176620483398, + "learning_rate": 9.759361367617188e-06, + "loss": 5.062, + "step": 19585 + }, + { + "epoch": 0.3985595703125, + "grad_norm": 16.89525604248047, + "learning_rate": 9.759238853278487e-06, + "loss": 5.2576, + "step": 19590 + }, + { + "epoch": 0.3986612955729167, + "grad_norm": 17.86984634399414, + "learning_rate": 9.759116308529762e-06, + "loss": 5.1388, + "step": 19595 + }, + { + "epoch": 0.3987630208333333, + "grad_norm": 16.85563850402832, + "learning_rate": 9.758993733371793e-06, + "loss": 5.1531, + "step": 19600 + }, + { + "epoch": 0.39886474609375, + "grad_norm": 15.59533405303955, + "learning_rate": 9.758871127805366e-06, + "loss": 5.2513, + "step": 19605 + }, + { + "epoch": 0.3989664713541667, + "grad_norm": 30.983821868896484, + "learning_rate": 9.758748491831263e-06, + "loss": 5.2897, + "step": 19610 + }, + { + "epoch": 0.3990681966145833, + "grad_norm": 14.647451400756836, + "learning_rate": 9.758625825450267e-06, + "loss": 5.2577, + "step": 19615 + }, + { + "epoch": 0.399169921875, + "grad_norm": 14.76584529876709, + "learning_rate": 9.758503128663163e-06, + "loss": 5.1771, + "step": 19620 + }, + { + "epoch": 0.3992716471354167, + "grad_norm": 14.672021865844727, + "learning_rate": 9.758380401470733e-06, + "loss": 5.2294, + "step": 19625 + }, + { + "epoch": 0.3993733723958333, + "grad_norm": 17.380388259887695, + "learning_rate": 9.758257643873765e-06, + "loss": 5.2317, + "step": 19630 + }, + { + "epoch": 0.39947509765625, + "grad_norm": 19.364948272705078, + "learning_rate": 9.75813485587304e-06, + "loss": 5.5068, + "step": 19635 + }, + { + "epoch": 0.3995768229166667, + "grad_norm": 21.491762161254883, + "learning_rate": 9.758012037469344e-06, + "loss": 5.1505, + "step": 19640 + }, + { + "epoch": 0.3996785481770833, + "grad_norm": 17.948997497558594, + "learning_rate": 9.757889188663462e-06, + "loss": 5.1572, + "step": 19645 + }, + { + "epoch": 0.3997802734375, + "grad_norm": 14.808860778808594, + "learning_rate": 9.757766309456178e-06, + "loss": 5.5545, + "step": 19650 + }, + { + "epoch": 0.3998819986979167, + "grad_norm": 17.93759536743164, + "learning_rate": 9.757643399848276e-06, + "loss": 4.9839, + "step": 19655 + }, + { + "epoch": 0.3999837239583333, + "grad_norm": 21.683725357055664, + "learning_rate": 9.757520459840545e-06, + "loss": 5.389, + "step": 19660 + }, + { + "epoch": 0.40008544921875, + "grad_norm": 12.201227188110352, + "learning_rate": 9.75739748943377e-06, + "loss": 5.3966, + "step": 19665 + }, + { + "epoch": 0.4001871744791667, + "grad_norm": 19.918453216552734, + "learning_rate": 9.757274488628733e-06, + "loss": 5.2913, + "step": 19670 + }, + { + "epoch": 0.4002888997395833, + "grad_norm": 15.104535102844238, + "learning_rate": 9.757151457426221e-06, + "loss": 5.3965, + "step": 19675 + }, + { + "epoch": 0.400390625, + "grad_norm": 15.91722583770752, + "learning_rate": 9.757028395827022e-06, + "loss": 5.1871, + "step": 19680 + }, + { + "epoch": 0.4004923502604167, + "grad_norm": 15.254850387573242, + "learning_rate": 9.756905303831923e-06, + "loss": 4.8936, + "step": 19685 + }, + { + "epoch": 0.4005940755208333, + "grad_norm": 12.59363079071045, + "learning_rate": 9.756782181441708e-06, + "loss": 5.4499, + "step": 19690 + }, + { + "epoch": 0.40069580078125, + "grad_norm": 18.530183792114258, + "learning_rate": 9.756659028657165e-06, + "loss": 5.0455, + "step": 19695 + }, + { + "epoch": 0.4007975260416667, + "grad_norm": 12.941289901733398, + "learning_rate": 9.756535845479082e-06, + "loss": 5.358, + "step": 19700 + }, + { + "epoch": 0.4008992513020833, + "grad_norm": 27.072477340698242, + "learning_rate": 9.756412631908244e-06, + "loss": 5.4587, + "step": 19705 + }, + { + "epoch": 0.4010009765625, + "grad_norm": 15.236505508422852, + "learning_rate": 9.756289387945439e-06, + "loss": 5.3337, + "step": 19710 + }, + { + "epoch": 0.4011027018229167, + "grad_norm": 22.88157081604004, + "learning_rate": 9.756166113591454e-06, + "loss": 5.1503, + "step": 19715 + }, + { + "epoch": 0.4012044270833333, + "grad_norm": 19.529666900634766, + "learning_rate": 9.756042808847078e-06, + "loss": 5.3352, + "step": 19720 + }, + { + "epoch": 0.40130615234375, + "grad_norm": 12.971786499023438, + "learning_rate": 9.755919473713097e-06, + "loss": 5.0982, + "step": 19725 + }, + { + "epoch": 0.4014078776041667, + "grad_norm": 21.219552993774414, + "learning_rate": 9.755796108190303e-06, + "loss": 5.0467, + "step": 19730 + }, + { + "epoch": 0.4015096028645833, + "grad_norm": 15.7188720703125, + "learning_rate": 9.75567271227948e-06, + "loss": 5.1892, + "step": 19735 + }, + { + "epoch": 0.401611328125, + "grad_norm": 12.63295841217041, + "learning_rate": 9.755549285981419e-06, + "loss": 5.1938, + "step": 19740 + }, + { + "epoch": 0.4017130533854167, + "grad_norm": 15.596260070800781, + "learning_rate": 9.755425829296907e-06, + "loss": 4.9442, + "step": 19745 + }, + { + "epoch": 0.4018147786458333, + "grad_norm": 16.012863159179688, + "learning_rate": 9.755302342226734e-06, + "loss": 5.29, + "step": 19750 + }, + { + "epoch": 0.40191650390625, + "grad_norm": 19.78497314453125, + "learning_rate": 9.755178824771687e-06, + "loss": 5.0993, + "step": 19755 + }, + { + "epoch": 0.4020182291666667, + "grad_norm": 19.94322395324707, + "learning_rate": 9.755055276932558e-06, + "loss": 5.4699, + "step": 19760 + }, + { + "epoch": 0.4021199544270833, + "grad_norm": 16.886268615722656, + "learning_rate": 9.754931698710135e-06, + "loss": 5.1295, + "step": 19765 + }, + { + "epoch": 0.4022216796875, + "grad_norm": 16.975868225097656, + "learning_rate": 9.754808090105207e-06, + "loss": 4.9606, + "step": 19770 + }, + { + "epoch": 0.4023234049479167, + "grad_norm": 18.769920349121094, + "learning_rate": 9.754684451118566e-06, + "loss": 4.9798, + "step": 19775 + }, + { + "epoch": 0.4024251302083333, + "grad_norm": 16.18288230895996, + "learning_rate": 9.754560781751002e-06, + "loss": 5.3059, + "step": 19780 + }, + { + "epoch": 0.40252685546875, + "grad_norm": 23.604206085205078, + "learning_rate": 9.754437082003302e-06, + "loss": 5.0143, + "step": 19785 + }, + { + "epoch": 0.4026285807291667, + "grad_norm": 19.203039169311523, + "learning_rate": 9.754313351876256e-06, + "loss": 5.2111, + "step": 19790 + }, + { + "epoch": 0.4027303059895833, + "grad_norm": 23.57630729675293, + "learning_rate": 9.75418959137066e-06, + "loss": 5.1388, + "step": 19795 + }, + { + "epoch": 0.40283203125, + "grad_norm": 15.857295036315918, + "learning_rate": 9.754065800487302e-06, + "loss": 5.2722, + "step": 19800 + }, + { + "epoch": 0.4029337565104167, + "grad_norm": 14.074243545532227, + "learning_rate": 9.753941979226972e-06, + "loss": 5.2456, + "step": 19805 + }, + { + "epoch": 0.4030354817708333, + "grad_norm": 13.835692405700684, + "learning_rate": 9.753818127590462e-06, + "loss": 4.9602, + "step": 19810 + }, + { + "epoch": 0.40313720703125, + "grad_norm": 16.43366813659668, + "learning_rate": 9.753694245578563e-06, + "loss": 5.5114, + "step": 19815 + }, + { + "epoch": 0.4032389322916667, + "grad_norm": 15.394122123718262, + "learning_rate": 9.753570333192066e-06, + "loss": 5.0759, + "step": 19820 + }, + { + "epoch": 0.4033406575520833, + "grad_norm": 12.562480926513672, + "learning_rate": 9.753446390431764e-06, + "loss": 5.3137, + "step": 19825 + }, + { + "epoch": 0.4034423828125, + "grad_norm": 17.566347122192383, + "learning_rate": 9.75332241729845e-06, + "loss": 5.1948, + "step": 19830 + }, + { + "epoch": 0.4035441080729167, + "grad_norm": 19.48394012451172, + "learning_rate": 9.753198413792913e-06, + "loss": 5.2243, + "step": 19835 + }, + { + "epoch": 0.4036458333333333, + "grad_norm": 16.848188400268555, + "learning_rate": 9.753074379915946e-06, + "loss": 5.266, + "step": 19840 + }, + { + "epoch": 0.40374755859375, + "grad_norm": 18.813425064086914, + "learning_rate": 9.752950315668344e-06, + "loss": 5.1268, + "step": 19845 + }, + { + "epoch": 0.4038492838541667, + "grad_norm": 13.130245208740234, + "learning_rate": 9.752826221050898e-06, + "loss": 5.1506, + "step": 19850 + }, + { + "epoch": 0.4039510091145833, + "grad_norm": 16.50583839416504, + "learning_rate": 9.752702096064401e-06, + "loss": 5.2259, + "step": 19855 + }, + { + "epoch": 0.404052734375, + "grad_norm": 18.303138732910156, + "learning_rate": 9.752577940709647e-06, + "loss": 5.2, + "step": 19860 + }, + { + "epoch": 0.4041544596354167, + "grad_norm": 17.30682373046875, + "learning_rate": 9.752453754987429e-06, + "loss": 5.2751, + "step": 19865 + }, + { + "epoch": 0.4042561848958333, + "grad_norm": 13.806625366210938, + "learning_rate": 9.75232953889854e-06, + "loss": 4.9871, + "step": 19870 + }, + { + "epoch": 0.40435791015625, + "grad_norm": 18.78516960144043, + "learning_rate": 9.752205292443773e-06, + "loss": 5.1721, + "step": 19875 + }, + { + "epoch": 0.4044596354166667, + "grad_norm": 20.724348068237305, + "learning_rate": 9.752081015623922e-06, + "loss": 5.234, + "step": 19880 + }, + { + "epoch": 0.4045613606770833, + "grad_norm": 17.569429397583008, + "learning_rate": 9.751956708439784e-06, + "loss": 5.1681, + "step": 19885 + }, + { + "epoch": 0.4046630859375, + "grad_norm": 16.572284698486328, + "learning_rate": 9.75183237089215e-06, + "loss": 5.1915, + "step": 19890 + }, + { + "epoch": 0.4047648111979167, + "grad_norm": 18.441434860229492, + "learning_rate": 9.751708002981814e-06, + "loss": 5.4017, + "step": 19895 + }, + { + "epoch": 0.4048665364583333, + "grad_norm": 14.802095413208008, + "learning_rate": 9.751583604709576e-06, + "loss": 5.5806, + "step": 19900 + }, + { + "epoch": 0.40496826171875, + "grad_norm": 17.97659683227539, + "learning_rate": 9.751459176076223e-06, + "loss": 5.2845, + "step": 19905 + }, + { + "epoch": 0.4050699869791667, + "grad_norm": 12.37513542175293, + "learning_rate": 9.751334717082558e-06, + "loss": 5.2207, + "step": 19910 + }, + { + "epoch": 0.4051717122395833, + "grad_norm": 16.045333862304688, + "learning_rate": 9.75121022772937e-06, + "loss": 5.4747, + "step": 19915 + }, + { + "epoch": 0.4052734375, + "grad_norm": 20.734590530395508, + "learning_rate": 9.751085708017457e-06, + "loss": 4.9689, + "step": 19920 + }, + { + "epoch": 0.4053751627604167, + "grad_norm": 16.365097045898438, + "learning_rate": 9.750961157947615e-06, + "loss": 5.0956, + "step": 19925 + }, + { + "epoch": 0.4054768880208333, + "grad_norm": 16.031787872314453, + "learning_rate": 9.75083657752064e-06, + "loss": 5.0422, + "step": 19930 + }, + { + "epoch": 0.40557861328125, + "grad_norm": 15.738871574401855, + "learning_rate": 9.750711966737327e-06, + "loss": 5.3372, + "step": 19935 + }, + { + "epoch": 0.4056803385416667, + "grad_norm": 18.93865203857422, + "learning_rate": 9.750587325598471e-06, + "loss": 5.1423, + "step": 19940 + }, + { + "epoch": 0.4057820638020833, + "grad_norm": 16.992204666137695, + "learning_rate": 9.750462654104871e-06, + "loss": 5.1646, + "step": 19945 + }, + { + "epoch": 0.4058837890625, + "grad_norm": 17.960283279418945, + "learning_rate": 9.750337952257324e-06, + "loss": 5.0538, + "step": 19950 + }, + { + "epoch": 0.4059855143229167, + "grad_norm": 16.053958892822266, + "learning_rate": 9.750213220056623e-06, + "loss": 5.326, + "step": 19955 + }, + { + "epoch": 0.4060872395833333, + "grad_norm": 15.860724449157715, + "learning_rate": 9.75008845750357e-06, + "loss": 5.1953, + "step": 19960 + }, + { + "epoch": 0.40618896484375, + "grad_norm": 17.995628356933594, + "learning_rate": 9.749963664598957e-06, + "loss": 5.1241, + "step": 19965 + }, + { + "epoch": 0.4062906901041667, + "grad_norm": 19.77199363708496, + "learning_rate": 9.749838841343586e-06, + "loss": 5.3609, + "step": 19970 + }, + { + "epoch": 0.4063924153645833, + "grad_norm": 16.418521881103516, + "learning_rate": 9.749713987738254e-06, + "loss": 5.1499, + "step": 19975 + }, + { + "epoch": 0.406494140625, + "grad_norm": 23.861064910888672, + "learning_rate": 9.749589103783754e-06, + "loss": 5.4114, + "step": 19980 + }, + { + "epoch": 0.4065958658854167, + "grad_norm": 19.602033615112305, + "learning_rate": 9.74946418948089e-06, + "loss": 5.1931, + "step": 19985 + }, + { + "epoch": 0.4066975911458333, + "grad_norm": 16.82630729675293, + "learning_rate": 9.749339244830455e-06, + "loss": 5.2008, + "step": 19990 + }, + { + "epoch": 0.40679931640625, + "grad_norm": 19.203128814697266, + "learning_rate": 9.749214269833251e-06, + "loss": 5.258, + "step": 19995 + }, + { + "epoch": 0.4069010416666667, + "grad_norm": 17.635251998901367, + "learning_rate": 9.749089264490078e-06, + "loss": 5.0937, + "step": 20000 + }, + { + "epoch": 0.4070027669270833, + "grad_norm": 15.830440521240234, + "learning_rate": 9.74896422880173e-06, + "loss": 5.327, + "step": 20005 + }, + { + "epoch": 0.4071044921875, + "grad_norm": 16.2730712890625, + "learning_rate": 9.748839162769006e-06, + "loss": 5.0139, + "step": 20010 + }, + { + "epoch": 0.4072062174479167, + "grad_norm": 17.039867401123047, + "learning_rate": 9.748714066392711e-06, + "loss": 5.2459, + "step": 20015 + }, + { + "epoch": 0.4073079427083333, + "grad_norm": 14.765944480895996, + "learning_rate": 9.748588939673638e-06, + "loss": 5.0629, + "step": 20020 + }, + { + "epoch": 0.40740966796875, + "grad_norm": 16.796077728271484, + "learning_rate": 9.748463782612589e-06, + "loss": 5.5151, + "step": 20025 + }, + { + "epoch": 0.4075113932291667, + "grad_norm": 16.595447540283203, + "learning_rate": 9.748338595210366e-06, + "loss": 5.1076, + "step": 20030 + }, + { + "epoch": 0.4076131184895833, + "grad_norm": 23.094926834106445, + "learning_rate": 9.748213377467763e-06, + "loss": 5.2863, + "step": 20035 + }, + { + "epoch": 0.40771484375, + "grad_norm": 12.254791259765625, + "learning_rate": 9.748088129385587e-06, + "loss": 5.0802, + "step": 20040 + }, + { + "epoch": 0.4078165690104167, + "grad_norm": 17.944971084594727, + "learning_rate": 9.747962850964634e-06, + "loss": 5.2529, + "step": 20045 + }, + { + "epoch": 0.4079182942708333, + "grad_norm": 15.61330795288086, + "learning_rate": 9.747837542205706e-06, + "loss": 5.2498, + "step": 20050 + }, + { + "epoch": 0.40802001953125, + "grad_norm": 23.119834899902344, + "learning_rate": 9.747712203109603e-06, + "loss": 5.0824, + "step": 20055 + }, + { + "epoch": 0.4081217447916667, + "grad_norm": 18.210750579833984, + "learning_rate": 9.747586833677128e-06, + "loss": 5.0524, + "step": 20060 + }, + { + "epoch": 0.4082234700520833, + "grad_norm": 14.852325439453125, + "learning_rate": 9.747461433909078e-06, + "loss": 5.3088, + "step": 20065 + }, + { + "epoch": 0.4083251953125, + "grad_norm": 13.420110702514648, + "learning_rate": 9.747336003806255e-06, + "loss": 5.3344, + "step": 20070 + }, + { + "epoch": 0.4084269205729167, + "grad_norm": 18.186460494995117, + "learning_rate": 9.747210543369465e-06, + "loss": 5.2217, + "step": 20075 + }, + { + "epoch": 0.4085286458333333, + "grad_norm": 15.12550163269043, + "learning_rate": 9.747085052599505e-06, + "loss": 5.2483, + "step": 20080 + }, + { + "epoch": 0.40863037109375, + "grad_norm": 22.373289108276367, + "learning_rate": 9.74695953149718e-06, + "loss": 5.2209, + "step": 20085 + }, + { + "epoch": 0.4087320963541667, + "grad_norm": 17.347457885742188, + "learning_rate": 9.746833980063288e-06, + "loss": 5.2198, + "step": 20090 + }, + { + "epoch": 0.4088338216145833, + "grad_norm": 24.005233764648438, + "learning_rate": 9.746708398298635e-06, + "loss": 5.5447, + "step": 20095 + }, + { + "epoch": 0.408935546875, + "grad_norm": 17.66124725341797, + "learning_rate": 9.746582786204023e-06, + "loss": 5.0453, + "step": 20100 + }, + { + "epoch": 0.4090372721354167, + "grad_norm": 17.737985610961914, + "learning_rate": 9.746457143780253e-06, + "loss": 5.6049, + "step": 20105 + }, + { + "epoch": 0.4091389973958333, + "grad_norm": 18.88758087158203, + "learning_rate": 9.746331471028127e-06, + "loss": 5.3221, + "step": 20110 + }, + { + "epoch": 0.40924072265625, + "grad_norm": 17.27764892578125, + "learning_rate": 9.746205767948453e-06, + "loss": 5.3243, + "step": 20115 + }, + { + "epoch": 0.4093424479166667, + "grad_norm": 17.10250473022461, + "learning_rate": 9.746080034542029e-06, + "loss": 5.242, + "step": 20120 + }, + { + "epoch": 0.4094441731770833, + "grad_norm": 13.971616744995117, + "learning_rate": 9.74595427080966e-06, + "loss": 5.4719, + "step": 20125 + }, + { + "epoch": 0.4095458984375, + "grad_norm": 15.817903518676758, + "learning_rate": 9.74582847675215e-06, + "loss": 5.195, + "step": 20130 + }, + { + "epoch": 0.4096476236979167, + "grad_norm": 14.626581192016602, + "learning_rate": 9.745702652370301e-06, + "loss": 5.5553, + "step": 20135 + }, + { + "epoch": 0.4097493489583333, + "grad_norm": 12.559467315673828, + "learning_rate": 9.745576797664922e-06, + "loss": 5.2296, + "step": 20140 + }, + { + "epoch": 0.40985107421875, + "grad_norm": 14.679424285888672, + "learning_rate": 9.745450912636811e-06, + "loss": 4.8622, + "step": 20145 + }, + { + "epoch": 0.4099527994791667, + "grad_norm": 16.35430335998535, + "learning_rate": 9.745324997286775e-06, + "loss": 5.0927, + "step": 20150 + }, + { + "epoch": 0.4100545247395833, + "grad_norm": 14.083658218383789, + "learning_rate": 9.74519905161562e-06, + "loss": 5.2446, + "step": 20155 + }, + { + "epoch": 0.41015625, + "grad_norm": 18.6956844329834, + "learning_rate": 9.745073075624149e-06, + "loss": 4.9348, + "step": 20160 + }, + { + "epoch": 0.4102579752604167, + "grad_norm": 16.013521194458008, + "learning_rate": 9.744947069313166e-06, + "loss": 5.0474, + "step": 20165 + }, + { + "epoch": 0.4103597005208333, + "grad_norm": 15.14260196685791, + "learning_rate": 9.74482103268348e-06, + "loss": 5.1957, + "step": 20170 + }, + { + "epoch": 0.41046142578125, + "grad_norm": 15.835021018981934, + "learning_rate": 9.744694965735892e-06, + "loss": 5.5, + "step": 20175 + }, + { + "epoch": 0.4105631510416667, + "grad_norm": 21.788713455200195, + "learning_rate": 9.74456886847121e-06, + "loss": 5.136, + "step": 20180 + }, + { + "epoch": 0.4106648763020833, + "grad_norm": 27.841468811035156, + "learning_rate": 9.744442740890238e-06, + "loss": 5.5828, + "step": 20185 + }, + { + "epoch": 0.4107666015625, + "grad_norm": 20.00762939453125, + "learning_rate": 9.744316582993785e-06, + "loss": 5.2977, + "step": 20190 + }, + { + "epoch": 0.4108683268229167, + "grad_norm": 41.57071304321289, + "learning_rate": 9.744190394782653e-06, + "loss": 5.6272, + "step": 20195 + }, + { + "epoch": 0.4109700520833333, + "grad_norm": 14.198173522949219, + "learning_rate": 9.744064176257653e-06, + "loss": 4.8526, + "step": 20200 + }, + { + "epoch": 0.41107177734375, + "grad_norm": 14.548542022705078, + "learning_rate": 9.743937927419587e-06, + "loss": 5.3051, + "step": 20205 + }, + { + "epoch": 0.4111735026041667, + "grad_norm": 19.49906349182129, + "learning_rate": 9.743811648269263e-06, + "loss": 5.5964, + "step": 20210 + }, + { + "epoch": 0.4112752278645833, + "grad_norm": 16.606454849243164, + "learning_rate": 9.743685338807489e-06, + "loss": 5.1848, + "step": 20215 + }, + { + "epoch": 0.411376953125, + "grad_norm": 15.892847061157227, + "learning_rate": 9.743558999035072e-06, + "loss": 5.1399, + "step": 20220 + }, + { + "epoch": 0.4114786783854167, + "grad_norm": 17.36992073059082, + "learning_rate": 9.743432628952819e-06, + "loss": 5.2424, + "step": 20225 + }, + { + "epoch": 0.4115804036458333, + "grad_norm": 21.331748962402344, + "learning_rate": 9.743306228561536e-06, + "loss": 5.279, + "step": 20230 + }, + { + "epoch": 0.41168212890625, + "grad_norm": 16.956098556518555, + "learning_rate": 9.743179797862032e-06, + "loss": 5.3429, + "step": 20235 + }, + { + "epoch": 0.4117838541666667, + "grad_norm": 18.893394470214844, + "learning_rate": 9.743053336855116e-06, + "loss": 5.2002, + "step": 20240 + }, + { + "epoch": 0.4118855794270833, + "grad_norm": 17.604732513427734, + "learning_rate": 9.742926845541596e-06, + "loss": 5.3935, + "step": 20245 + }, + { + "epoch": 0.4119873046875, + "grad_norm": 13.16313362121582, + "learning_rate": 9.742800323922276e-06, + "loss": 5.1193, + "step": 20250 + }, + { + "epoch": 0.4120890299479167, + "grad_norm": 18.811397552490234, + "learning_rate": 9.74267377199797e-06, + "loss": 5.3028, + "step": 20255 + }, + { + "epoch": 0.4121907552083333, + "grad_norm": 13.02652645111084, + "learning_rate": 9.742547189769482e-06, + "loss": 5.2962, + "step": 20260 + }, + { + "epoch": 0.41229248046875, + "grad_norm": 19.58499526977539, + "learning_rate": 9.742420577237624e-06, + "loss": 5.3222, + "step": 20265 + }, + { + "epoch": 0.4123942057291667, + "grad_norm": 17.219568252563477, + "learning_rate": 9.742293934403204e-06, + "loss": 5.1883, + "step": 20270 + }, + { + "epoch": 0.4124959309895833, + "grad_norm": 16.276371002197266, + "learning_rate": 9.74216726126703e-06, + "loss": 5.3986, + "step": 20275 + }, + { + "epoch": 0.41259765625, + "grad_norm": 14.150047302246094, + "learning_rate": 9.742040557829914e-06, + "loss": 5.585, + "step": 20280 + }, + { + "epoch": 0.4126993815104167, + "grad_norm": 15.574024200439453, + "learning_rate": 9.741913824092665e-06, + "loss": 5.2022, + "step": 20285 + }, + { + "epoch": 0.4128011067708333, + "grad_norm": 20.19426727294922, + "learning_rate": 9.741787060056091e-06, + "loss": 5.1324, + "step": 20290 + }, + { + "epoch": 0.41290283203125, + "grad_norm": 15.099447250366211, + "learning_rate": 9.741660265721003e-06, + "loss": 5.1724, + "step": 20295 + }, + { + "epoch": 0.4130045572916667, + "grad_norm": 15.530495643615723, + "learning_rate": 9.74153344108821e-06, + "loss": 5.1808, + "step": 20300 + }, + { + "epoch": 0.4131062825520833, + "grad_norm": 24.367313385009766, + "learning_rate": 9.741406586158523e-06, + "loss": 5.3927, + "step": 20305 + }, + { + "epoch": 0.4132080078125, + "grad_norm": 14.889876365661621, + "learning_rate": 9.741279700932756e-06, + "loss": 5.3213, + "step": 20310 + }, + { + "epoch": 0.4133097330729167, + "grad_norm": 20.594318389892578, + "learning_rate": 9.741152785411713e-06, + "loss": 5.2887, + "step": 20315 + }, + { + "epoch": 0.4134114583333333, + "grad_norm": 16.16902732849121, + "learning_rate": 9.741025839596212e-06, + "loss": 5.2395, + "step": 20320 + }, + { + "epoch": 0.41351318359375, + "grad_norm": 30.97599983215332, + "learning_rate": 9.74089886348706e-06, + "loss": 5.2349, + "step": 20325 + }, + { + "epoch": 0.4136149088541667, + "grad_norm": 17.906274795532227, + "learning_rate": 9.740771857085069e-06, + "loss": 5.2773, + "step": 20330 + }, + { + "epoch": 0.4137166341145833, + "grad_norm": 16.97410774230957, + "learning_rate": 9.74064482039105e-06, + "loss": 5.1514, + "step": 20335 + }, + { + "epoch": 0.413818359375, + "grad_norm": 14.009833335876465, + "learning_rate": 9.740517753405817e-06, + "loss": 5.2828, + "step": 20340 + }, + { + "epoch": 0.4139200846354167, + "grad_norm": 15.930315971374512, + "learning_rate": 9.74039065613018e-06, + "loss": 5.3145, + "step": 20345 + }, + { + "epoch": 0.4140218098958333, + "grad_norm": 19.118427276611328, + "learning_rate": 9.74026352856495e-06, + "loss": 5.2616, + "step": 20350 + }, + { + "epoch": 0.41412353515625, + "grad_norm": 17.186365127563477, + "learning_rate": 9.740136370710942e-06, + "loss": 4.9907, + "step": 20355 + }, + { + "epoch": 0.4142252604166667, + "grad_norm": 20.037328720092773, + "learning_rate": 9.740009182568968e-06, + "loss": 5.549, + "step": 20360 + }, + { + "epoch": 0.4143269856770833, + "grad_norm": 14.663934707641602, + "learning_rate": 9.739881964139839e-06, + "loss": 5.2964, + "step": 20365 + }, + { + "epoch": 0.4144287109375, + "grad_norm": 20.463796615600586, + "learning_rate": 9.73975471542437e-06, + "loss": 5.2871, + "step": 20370 + }, + { + "epoch": 0.4145304361979167, + "grad_norm": 16.551639556884766, + "learning_rate": 9.739627436423372e-06, + "loss": 5.1544, + "step": 20375 + }, + { + "epoch": 0.4146321614583333, + "grad_norm": 12.398900032043457, + "learning_rate": 9.739500127137659e-06, + "loss": 5.0996, + "step": 20380 + }, + { + "epoch": 0.41473388671875, + "grad_norm": 14.07906436920166, + "learning_rate": 9.739372787568045e-06, + "loss": 5.0948, + "step": 20385 + }, + { + "epoch": 0.4148356119791667, + "grad_norm": 17.17926025390625, + "learning_rate": 9.739245417715344e-06, + "loss": 5.5721, + "step": 20390 + }, + { + "epoch": 0.4149373372395833, + "grad_norm": 16.315265655517578, + "learning_rate": 9.739118017580368e-06, + "loss": 5.6247, + "step": 20395 + }, + { + "epoch": 0.4150390625, + "grad_norm": 14.97169303894043, + "learning_rate": 9.738990587163932e-06, + "loss": 5.2492, + "step": 20400 + }, + { + "epoch": 0.4151407877604167, + "grad_norm": 16.23391342163086, + "learning_rate": 9.738863126466854e-06, + "loss": 5.3786, + "step": 20405 + }, + { + "epoch": 0.4152425130208333, + "grad_norm": 25.900596618652344, + "learning_rate": 9.738735635489941e-06, + "loss": 5.2932, + "step": 20410 + }, + { + "epoch": 0.41534423828125, + "grad_norm": 15.460684776306152, + "learning_rate": 9.738608114234013e-06, + "loss": 5.3028, + "step": 20415 + }, + { + "epoch": 0.4154459635416667, + "grad_norm": 23.960453033447266, + "learning_rate": 9.738480562699884e-06, + "loss": 5.0525, + "step": 20420 + }, + { + "epoch": 0.4155476888020833, + "grad_norm": 17.063323974609375, + "learning_rate": 9.738352980888366e-06, + "loss": 5.0163, + "step": 20425 + }, + { + "epoch": 0.4156494140625, + "grad_norm": 14.199673652648926, + "learning_rate": 9.73822536880028e-06, + "loss": 5.2647, + "step": 20430 + }, + { + "epoch": 0.4157511393229167, + "grad_norm": 15.335702896118164, + "learning_rate": 9.738097726436436e-06, + "loss": 5.2517, + "step": 20435 + }, + { + "epoch": 0.4158528645833333, + "grad_norm": 18.789194107055664, + "learning_rate": 9.737970053797651e-06, + "loss": 5.3576, + "step": 20440 + }, + { + "epoch": 0.41595458984375, + "grad_norm": 14.675522804260254, + "learning_rate": 9.737842350884744e-06, + "loss": 5.2686, + "step": 20445 + }, + { + "epoch": 0.4160563151041667, + "grad_norm": 14.782692909240723, + "learning_rate": 9.737714617698526e-06, + "loss": 5.363, + "step": 20450 + }, + { + "epoch": 0.4161580403645833, + "grad_norm": 13.852202415466309, + "learning_rate": 9.737586854239815e-06, + "loss": 5.2474, + "step": 20455 + }, + { + "epoch": 0.416259765625, + "grad_norm": 16.046812057495117, + "learning_rate": 9.73745906050943e-06, + "loss": 5.1296, + "step": 20460 + }, + { + "epoch": 0.4163614908854167, + "grad_norm": 15.107261657714844, + "learning_rate": 9.737331236508184e-06, + "loss": 5.0692, + "step": 20465 + }, + { + "epoch": 0.4164632161458333, + "grad_norm": 22.421663284301758, + "learning_rate": 9.737203382236896e-06, + "loss": 5.4748, + "step": 20470 + }, + { + "epoch": 0.41656494140625, + "grad_norm": 13.051400184631348, + "learning_rate": 9.737075497696382e-06, + "loss": 5.0525, + "step": 20475 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 15.502554893493652, + "learning_rate": 9.73694758288746e-06, + "loss": 5.0144, + "step": 20480 + }, + { + "epoch": 0.4167683919270833, + "grad_norm": 17.619815826416016, + "learning_rate": 9.736819637810946e-06, + "loss": 5.2677, + "step": 20485 + }, + { + "epoch": 0.4168701171875, + "grad_norm": 14.595215797424316, + "learning_rate": 9.736691662467658e-06, + "loss": 5.2098, + "step": 20490 + }, + { + "epoch": 0.4169718424479167, + "grad_norm": 13.5521821975708, + "learning_rate": 9.736563656858412e-06, + "loss": 5.0769, + "step": 20495 + }, + { + "epoch": 0.4170735677083333, + "grad_norm": 18.00469970703125, + "learning_rate": 9.736435620984031e-06, + "loss": 5.2341, + "step": 20500 + }, + { + "epoch": 0.41717529296875, + "grad_norm": 18.43522834777832, + "learning_rate": 9.73630755484533e-06, + "loss": 5.3107, + "step": 20505 + }, + { + "epoch": 0.4172770182291667, + "grad_norm": 20.4969539642334, + "learning_rate": 9.736179458443126e-06, + "loss": 5.2593, + "step": 20510 + }, + { + "epoch": 0.4173787434895833, + "grad_norm": 21.88129425048828, + "learning_rate": 9.736051331778237e-06, + "loss": 5.0156, + "step": 20515 + }, + { + "epoch": 0.41748046875, + "grad_norm": 13.106583595275879, + "learning_rate": 9.735923174851487e-06, + "loss": 5.1159, + "step": 20520 + }, + { + "epoch": 0.4175821940104167, + "grad_norm": 18.5609188079834, + "learning_rate": 9.735794987663689e-06, + "loss": 5.1924, + "step": 20525 + }, + { + "epoch": 0.4176839192708333, + "grad_norm": 18.706911087036133, + "learning_rate": 9.735666770215666e-06, + "loss": 5.5502, + "step": 20530 + }, + { + "epoch": 0.41778564453125, + "grad_norm": 16.95153045654297, + "learning_rate": 9.735538522508234e-06, + "loss": 5.2721, + "step": 20535 + }, + { + "epoch": 0.4178873697916667, + "grad_norm": 19.12413215637207, + "learning_rate": 9.735410244542214e-06, + "loss": 5.1785, + "step": 20540 + }, + { + "epoch": 0.4179890950520833, + "grad_norm": 19.654619216918945, + "learning_rate": 9.735281936318427e-06, + "loss": 5.0887, + "step": 20545 + }, + { + "epoch": 0.4180908203125, + "grad_norm": 19.460861206054688, + "learning_rate": 9.735153597837692e-06, + "loss": 5.3303, + "step": 20550 + }, + { + "epoch": 0.4181925455729167, + "grad_norm": 16.378536224365234, + "learning_rate": 9.735025229100826e-06, + "loss": 5.4085, + "step": 20555 + }, + { + "epoch": 0.4182942708333333, + "grad_norm": 16.346254348754883, + "learning_rate": 9.734896830108653e-06, + "loss": 5.1344, + "step": 20560 + }, + { + "epoch": 0.41839599609375, + "grad_norm": 14.507396697998047, + "learning_rate": 9.734768400861994e-06, + "loss": 5.0988, + "step": 20565 + }, + { + "epoch": 0.4184977213541667, + "grad_norm": 15.51321792602539, + "learning_rate": 9.734639941361667e-06, + "loss": 5.6634, + "step": 20570 + }, + { + "epoch": 0.4185994466145833, + "grad_norm": 13.395142555236816, + "learning_rate": 9.734511451608492e-06, + "loss": 4.9303, + "step": 20575 + }, + { + "epoch": 0.418701171875, + "grad_norm": 20.741260528564453, + "learning_rate": 9.734382931603295e-06, + "loss": 5.3795, + "step": 20580 + }, + { + "epoch": 0.4188028971354167, + "grad_norm": 19.295618057250977, + "learning_rate": 9.734254381346891e-06, + "loss": 5.4165, + "step": 20585 + }, + { + "epoch": 0.4189046223958333, + "grad_norm": 17.096397399902344, + "learning_rate": 9.734125800840106e-06, + "loss": 5.1026, + "step": 20590 + }, + { + "epoch": 0.41900634765625, + "grad_norm": 17.948007583618164, + "learning_rate": 9.733997190083759e-06, + "loss": 5.0534, + "step": 20595 + }, + { + "epoch": 0.4191080729166667, + "grad_norm": 18.46503257751465, + "learning_rate": 9.733868549078674e-06, + "loss": 5.2144, + "step": 20600 + }, + { + "epoch": 0.4192097981770833, + "grad_norm": 15.067648887634277, + "learning_rate": 9.733739877825671e-06, + "loss": 5.4837, + "step": 20605 + }, + { + "epoch": 0.4193115234375, + "grad_norm": 14.520651817321777, + "learning_rate": 9.733611176325573e-06, + "loss": 5.3005, + "step": 20610 + }, + { + "epoch": 0.4194132486979167, + "grad_norm": 13.83544635772705, + "learning_rate": 9.733482444579201e-06, + "loss": 5.1095, + "step": 20615 + }, + { + "epoch": 0.4195149739583333, + "grad_norm": 20.992340087890625, + "learning_rate": 9.733353682587381e-06, + "loss": 5.43, + "step": 20620 + }, + { + "epoch": 0.41961669921875, + "grad_norm": 13.480748176574707, + "learning_rate": 9.733224890350931e-06, + "loss": 5.1959, + "step": 20625 + }, + { + "epoch": 0.4197184244791667, + "grad_norm": 14.21049976348877, + "learning_rate": 9.73309606787068e-06, + "loss": 5.1642, + "step": 20630 + }, + { + "epoch": 0.4198201497395833, + "grad_norm": 18.008394241333008, + "learning_rate": 9.732967215147444e-06, + "loss": 5.2728, + "step": 20635 + }, + { + "epoch": 0.419921875, + "grad_norm": 10.536187171936035, + "learning_rate": 9.732838332182053e-06, + "loss": 5.2548, + "step": 20640 + }, + { + "epoch": 0.4200236002604167, + "grad_norm": 18.865488052368164, + "learning_rate": 9.732709418975325e-06, + "loss": 5.0196, + "step": 20645 + }, + { + "epoch": 0.4201253255208333, + "grad_norm": 24.797937393188477, + "learning_rate": 9.732580475528087e-06, + "loss": 5.4226, + "step": 20650 + }, + { + "epoch": 0.42022705078125, + "grad_norm": 17.414432525634766, + "learning_rate": 9.732451501841163e-06, + "loss": 5.291, + "step": 20655 + }, + { + "epoch": 0.4203287760416667, + "grad_norm": 16.614917755126953, + "learning_rate": 9.732322497915374e-06, + "loss": 5.2404, + "step": 20660 + }, + { + "epoch": 0.4204305013020833, + "grad_norm": 17.130220413208008, + "learning_rate": 9.732193463751549e-06, + "loss": 5.1697, + "step": 20665 + }, + { + "epoch": 0.4205322265625, + "grad_norm": 21.043212890625, + "learning_rate": 9.732064399350508e-06, + "loss": 5.2159, + "step": 20670 + }, + { + "epoch": 0.4206339518229167, + "grad_norm": 13.634456634521484, + "learning_rate": 9.731935304713079e-06, + "loss": 4.8669, + "step": 20675 + }, + { + "epoch": 0.4207356770833333, + "grad_norm": 16.004215240478516, + "learning_rate": 9.731806179840084e-06, + "loss": 5.3431, + "step": 20680 + }, + { + "epoch": 0.42083740234375, + "grad_norm": 12.877626419067383, + "learning_rate": 9.731677024732351e-06, + "loss": 5.0716, + "step": 20685 + }, + { + "epoch": 0.4209391276041667, + "grad_norm": 15.660167694091797, + "learning_rate": 9.731547839390704e-06, + "loss": 5.1596, + "step": 20690 + }, + { + "epoch": 0.4210408528645833, + "grad_norm": 19.04845428466797, + "learning_rate": 9.731418623815966e-06, + "loss": 5.0753, + "step": 20695 + }, + { + "epoch": 0.421142578125, + "grad_norm": 15.801918983459473, + "learning_rate": 9.731289378008967e-06, + "loss": 5.0931, + "step": 20700 + }, + { + "epoch": 0.4212443033854167, + "grad_norm": 21.673887252807617, + "learning_rate": 9.731160101970532e-06, + "loss": 5.2166, + "step": 20705 + }, + { + "epoch": 0.4213460286458333, + "grad_norm": 17.778034210205078, + "learning_rate": 9.731030795701484e-06, + "loss": 5.206, + "step": 20710 + }, + { + "epoch": 0.42144775390625, + "grad_norm": 15.151814460754395, + "learning_rate": 9.73090145920265e-06, + "loss": 5.2488, + "step": 20715 + }, + { + "epoch": 0.4215494791666667, + "grad_norm": 29.93948745727539, + "learning_rate": 9.730772092474861e-06, + "loss": 5.3637, + "step": 20720 + }, + { + "epoch": 0.4216512044270833, + "grad_norm": 17.19672393798828, + "learning_rate": 9.730642695518937e-06, + "loss": 5.2764, + "step": 20725 + }, + { + "epoch": 0.4217529296875, + "grad_norm": 12.530747413635254, + "learning_rate": 9.730513268335708e-06, + "loss": 4.974, + "step": 20730 + }, + { + "epoch": 0.4218546549479167, + "grad_norm": 23.868322372436523, + "learning_rate": 9.730383810926002e-06, + "loss": 5.6394, + "step": 20735 + }, + { + "epoch": 0.4219563802083333, + "grad_norm": 19.161375045776367, + "learning_rate": 9.730254323290644e-06, + "loss": 5.2496, + "step": 20740 + }, + { + "epoch": 0.42205810546875, + "grad_norm": 28.628820419311523, + "learning_rate": 9.730124805430464e-06, + "loss": 5.3308, + "step": 20745 + }, + { + "epoch": 0.4221598307291667, + "grad_norm": 21.368688583374023, + "learning_rate": 9.729995257346288e-06, + "loss": 5.307, + "step": 20750 + }, + { + "epoch": 0.4222615559895833, + "grad_norm": 17.08028221130371, + "learning_rate": 9.729865679038942e-06, + "loss": 5.1595, + "step": 20755 + }, + { + "epoch": 0.42236328125, + "grad_norm": 16.051700592041016, + "learning_rate": 9.729736070509257e-06, + "loss": 5.1392, + "step": 20760 + }, + { + "epoch": 0.4224650065104167, + "grad_norm": 21.4493465423584, + "learning_rate": 9.729606431758059e-06, + "loss": 5.2189, + "step": 20765 + }, + { + "epoch": 0.4225667317708333, + "grad_norm": 21.350753784179688, + "learning_rate": 9.729476762786178e-06, + "loss": 5.2967, + "step": 20770 + }, + { + "epoch": 0.42266845703125, + "grad_norm": 26.299114227294922, + "learning_rate": 9.729347063594443e-06, + "loss": 5.498, + "step": 20775 + }, + { + "epoch": 0.4227701822916667, + "grad_norm": 15.703675270080566, + "learning_rate": 9.729217334183681e-06, + "loss": 5.1888, + "step": 20780 + }, + { + "epoch": 0.4228719075520833, + "grad_norm": 12.964010238647461, + "learning_rate": 9.729087574554719e-06, + "loss": 5.0412, + "step": 20785 + }, + { + "epoch": 0.4229736328125, + "grad_norm": 16.597745895385742, + "learning_rate": 9.72895778470839e-06, + "loss": 5.306, + "step": 20790 + }, + { + "epoch": 0.4230753580729167, + "grad_norm": 18.08775520324707, + "learning_rate": 9.728827964645525e-06, + "loss": 5.328, + "step": 20795 + }, + { + "epoch": 0.4231770833333333, + "grad_norm": 17.668109893798828, + "learning_rate": 9.728698114366946e-06, + "loss": 5.0495, + "step": 20800 + }, + { + "epoch": 0.42327880859375, + "grad_norm": 11.887195587158203, + "learning_rate": 9.72856823387349e-06, + "loss": 5.029, + "step": 20805 + }, + { + "epoch": 0.4233805338541667, + "grad_norm": 15.31873607635498, + "learning_rate": 9.728438323165982e-06, + "loss": 5.3085, + "step": 20810 + }, + { + "epoch": 0.4234822591145833, + "grad_norm": 14.421680450439453, + "learning_rate": 9.728308382245256e-06, + "loss": 5.1813, + "step": 20815 + }, + { + "epoch": 0.423583984375, + "grad_norm": 18.759122848510742, + "learning_rate": 9.72817841111214e-06, + "loss": 5.1119, + "step": 20820 + }, + { + "epoch": 0.4236857096354167, + "grad_norm": 18.71889877319336, + "learning_rate": 9.728048409767464e-06, + "loss": 5.114, + "step": 20825 + }, + { + "epoch": 0.4237874348958333, + "grad_norm": 16.034832000732422, + "learning_rate": 9.72791837821206e-06, + "loss": 5.1962, + "step": 20830 + }, + { + "epoch": 0.42388916015625, + "grad_norm": 17.998247146606445, + "learning_rate": 9.727788316446758e-06, + "loss": 5.2577, + "step": 20835 + }, + { + "epoch": 0.4239908854166667, + "grad_norm": 21.77909278869629, + "learning_rate": 9.72765822447239e-06, + "loss": 5.3072, + "step": 20840 + }, + { + "epoch": 0.4240926106770833, + "grad_norm": 14.526596069335938, + "learning_rate": 9.727528102289785e-06, + "loss": 5.0176, + "step": 20845 + }, + { + "epoch": 0.4241943359375, + "grad_norm": 24.95800018310547, + "learning_rate": 9.727397949899779e-06, + "loss": 5.0541, + "step": 20850 + }, + { + "epoch": 0.4242960611979167, + "grad_norm": 12.900300979614258, + "learning_rate": 9.727267767303199e-06, + "loss": 5.1734, + "step": 20855 + }, + { + "epoch": 0.4243977864583333, + "grad_norm": 19.299055099487305, + "learning_rate": 9.72713755450088e-06, + "loss": 5.5411, + "step": 20860 + }, + { + "epoch": 0.42449951171875, + "grad_norm": 16.88306999206543, + "learning_rate": 9.72700731149365e-06, + "loss": 5.2383, + "step": 20865 + }, + { + "epoch": 0.4246012369791667, + "grad_norm": 18.95932960510254, + "learning_rate": 9.726877038282346e-06, + "loss": 5.3333, + "step": 20870 + }, + { + "epoch": 0.4247029622395833, + "grad_norm": 14.650044441223145, + "learning_rate": 9.726746734867798e-06, + "loss": 5.5484, + "step": 20875 + }, + { + "epoch": 0.4248046875, + "grad_norm": 19.08786964416504, + "learning_rate": 9.726616401250838e-06, + "loss": 5.2389, + "step": 20880 + }, + { + "epoch": 0.4249064127604167, + "grad_norm": 17.30793571472168, + "learning_rate": 9.726486037432301e-06, + "loss": 5.2423, + "step": 20885 + }, + { + "epoch": 0.4250081380208333, + "grad_norm": 19.66515350341797, + "learning_rate": 9.726355643413017e-06, + "loss": 5.123, + "step": 20890 + }, + { + "epoch": 0.42510986328125, + "grad_norm": 15.742254257202148, + "learning_rate": 9.726225219193823e-06, + "loss": 5.2196, + "step": 20895 + }, + { + "epoch": 0.4252115885416667, + "grad_norm": 21.055850982666016, + "learning_rate": 9.726094764775548e-06, + "loss": 5.2306, + "step": 20900 + }, + { + "epoch": 0.4253133138020833, + "grad_norm": 14.923839569091797, + "learning_rate": 9.725964280159028e-06, + "loss": 5.2133, + "step": 20905 + }, + { + "epoch": 0.4254150390625, + "grad_norm": 18.962800979614258, + "learning_rate": 9.725833765345098e-06, + "loss": 5.1285, + "step": 20910 + }, + { + "epoch": 0.4255167643229167, + "grad_norm": 17.767255783081055, + "learning_rate": 9.725703220334588e-06, + "loss": 4.9319, + "step": 20915 + }, + { + "epoch": 0.4256184895833333, + "grad_norm": 13.263202667236328, + "learning_rate": 9.725572645128337e-06, + "loss": 5.2118, + "step": 20920 + }, + { + "epoch": 0.42572021484375, + "grad_norm": 17.227685928344727, + "learning_rate": 9.725442039727175e-06, + "loss": 5.1367, + "step": 20925 + }, + { + "epoch": 0.4258219401041667, + "grad_norm": 16.48752784729004, + "learning_rate": 9.72531140413194e-06, + "loss": 5.1041, + "step": 20930 + }, + { + "epoch": 0.4259236653645833, + "grad_norm": 16.940555572509766, + "learning_rate": 9.725180738343465e-06, + "loss": 5.1424, + "step": 20935 + }, + { + "epoch": 0.426025390625, + "grad_norm": 17.695594787597656, + "learning_rate": 9.725050042362584e-06, + "loss": 5.0632, + "step": 20940 + }, + { + "epoch": 0.4261271158854167, + "grad_norm": 16.433406829833984, + "learning_rate": 9.724919316190133e-06, + "loss": 5.3312, + "step": 20945 + }, + { + "epoch": 0.4262288411458333, + "grad_norm": 22.37578582763672, + "learning_rate": 9.724788559826949e-06, + "loss": 5.2548, + "step": 20950 + }, + { + "epoch": 0.42633056640625, + "grad_norm": 19.92133331298828, + "learning_rate": 9.724657773273865e-06, + "loss": 5.3089, + "step": 20955 + }, + { + "epoch": 0.4264322916666667, + "grad_norm": 21.25330352783203, + "learning_rate": 9.724526956531716e-06, + "loss": 5.2671, + "step": 20960 + }, + { + "epoch": 0.4265340169270833, + "grad_norm": 18.273784637451172, + "learning_rate": 9.724396109601342e-06, + "loss": 5.2922, + "step": 20965 + }, + { + "epoch": 0.4266357421875, + "grad_norm": 15.525444030761719, + "learning_rate": 9.724265232483577e-06, + "loss": 5.2353, + "step": 20970 + }, + { + "epoch": 0.4267374674479167, + "grad_norm": 17.283784866333008, + "learning_rate": 9.724134325179256e-06, + "loss": 4.9176, + "step": 20975 + }, + { + "epoch": 0.4268391927083333, + "grad_norm": 14.859170913696289, + "learning_rate": 9.724003387689215e-06, + "loss": 5.1427, + "step": 20980 + }, + { + "epoch": 0.42694091796875, + "grad_norm": 17.711088180541992, + "learning_rate": 9.723872420014293e-06, + "loss": 5.0387, + "step": 20985 + }, + { + "epoch": 0.4270426432291667, + "grad_norm": 16.896915435791016, + "learning_rate": 9.723741422155326e-06, + "loss": 5.1726, + "step": 20990 + }, + { + "epoch": 0.4271443684895833, + "grad_norm": 20.003520965576172, + "learning_rate": 9.72361039411315e-06, + "loss": 4.8936, + "step": 20995 + }, + { + "epoch": 0.42724609375, + "grad_norm": 15.240890502929688, + "learning_rate": 9.723479335888603e-06, + "loss": 5.2007, + "step": 21000 + }, + { + "epoch": 0.4273478190104167, + "grad_norm": 12.080591201782227, + "learning_rate": 9.723348247482522e-06, + "loss": 5.305, + "step": 21005 + }, + { + "epoch": 0.4274495442708333, + "grad_norm": 13.05339241027832, + "learning_rate": 9.723217128895748e-06, + "loss": 5.0738, + "step": 21010 + }, + { + "epoch": 0.42755126953125, + "grad_norm": 16.943086624145508, + "learning_rate": 9.723085980129114e-06, + "loss": 5.1947, + "step": 21015 + }, + { + "epoch": 0.4276529947916667, + "grad_norm": 20.057662963867188, + "learning_rate": 9.72295480118346e-06, + "loss": 5.08, + "step": 21020 + }, + { + "epoch": 0.4277547200520833, + "grad_norm": 14.407784461975098, + "learning_rate": 9.722823592059622e-06, + "loss": 5.0544, + "step": 21025 + }, + { + "epoch": 0.4278564453125, + "grad_norm": 22.90355682373047, + "learning_rate": 9.722692352758444e-06, + "loss": 5.5286, + "step": 21030 + }, + { + "epoch": 0.4279581705729167, + "grad_norm": 21.122039794921875, + "learning_rate": 9.72256108328076e-06, + "loss": 5.3824, + "step": 21035 + }, + { + "epoch": 0.4280598958333333, + "grad_norm": 13.437788009643555, + "learning_rate": 9.722429783627408e-06, + "loss": 4.9099, + "step": 21040 + }, + { + "epoch": 0.42816162109375, + "grad_norm": 19.95538330078125, + "learning_rate": 9.722298453799231e-06, + "loss": 5.0213, + "step": 21045 + }, + { + "epoch": 0.4282633463541667, + "grad_norm": 12.688067436218262, + "learning_rate": 9.722167093797065e-06, + "loss": 5.1897, + "step": 21050 + }, + { + "epoch": 0.4283650716145833, + "grad_norm": 16.105243682861328, + "learning_rate": 9.72203570362175e-06, + "loss": 4.9333, + "step": 21055 + }, + { + "epoch": 0.428466796875, + "grad_norm": 13.622504234313965, + "learning_rate": 9.721904283274127e-06, + "loss": 5.1168, + "step": 21060 + }, + { + "epoch": 0.4285685221354167, + "grad_norm": 15.252872467041016, + "learning_rate": 9.721772832755034e-06, + "loss": 5.2048, + "step": 21065 + }, + { + "epoch": 0.4286702473958333, + "grad_norm": 19.303224563598633, + "learning_rate": 9.72164135206531e-06, + "loss": 5.2725, + "step": 21070 + }, + { + "epoch": 0.42877197265625, + "grad_norm": 19.36894989013672, + "learning_rate": 9.721509841205799e-06, + "loss": 4.9611, + "step": 21075 + }, + { + "epoch": 0.4288736979166667, + "grad_norm": 18.904664993286133, + "learning_rate": 9.721378300177338e-06, + "loss": 5.2135, + "step": 21080 + }, + { + "epoch": 0.4289754231770833, + "grad_norm": 16.51578140258789, + "learning_rate": 9.721246728980766e-06, + "loss": 5.187, + "step": 21085 + }, + { + "epoch": 0.4290771484375, + "grad_norm": 16.758365631103516, + "learning_rate": 9.721115127616928e-06, + "loss": 5.0987, + "step": 21090 + }, + { + "epoch": 0.4291788736979167, + "grad_norm": 24.40549659729004, + "learning_rate": 9.720983496086664e-06, + "loss": 5.3925, + "step": 21095 + }, + { + "epoch": 0.4292805989583333, + "grad_norm": 14.150823593139648, + "learning_rate": 9.720851834390814e-06, + "loss": 5.1493, + "step": 21100 + }, + { + "epoch": 0.42938232421875, + "grad_norm": 16.663118362426758, + "learning_rate": 9.720720142530218e-06, + "loss": 4.8681, + "step": 21105 + }, + { + "epoch": 0.4294840494791667, + "grad_norm": 21.11780548095703, + "learning_rate": 9.72058842050572e-06, + "loss": 5.2517, + "step": 21110 + }, + { + "epoch": 0.4295857747395833, + "grad_norm": 17.740909576416016, + "learning_rate": 9.720456668318161e-06, + "loss": 5.2093, + "step": 21115 + }, + { + "epoch": 0.4296875, + "grad_norm": 11.743327140808105, + "learning_rate": 9.720324885968382e-06, + "loss": 5.4379, + "step": 21120 + }, + { + "epoch": 0.4297892252604167, + "grad_norm": 19.069231033325195, + "learning_rate": 9.720193073457224e-06, + "loss": 5.1934, + "step": 21125 + }, + { + "epoch": 0.4298909505208333, + "grad_norm": 17.855119705200195, + "learning_rate": 9.720061230785531e-06, + "loss": 5.412, + "step": 21130 + }, + { + "epoch": 0.42999267578125, + "grad_norm": 14.78591537475586, + "learning_rate": 9.719929357954146e-06, + "loss": 5.3426, + "step": 21135 + }, + { + "epoch": 0.4300944010416667, + "grad_norm": 13.417505264282227, + "learning_rate": 9.71979745496391e-06, + "loss": 5.1156, + "step": 21140 + }, + { + "epoch": 0.4301961263020833, + "grad_norm": 17.724626541137695, + "learning_rate": 9.719665521815669e-06, + "loss": 5.486, + "step": 21145 + }, + { + "epoch": 0.4302978515625, + "grad_norm": 21.149120330810547, + "learning_rate": 9.719533558510262e-06, + "loss": 5.5152, + "step": 21150 + }, + { + "epoch": 0.4303995768229167, + "grad_norm": 17.71177864074707, + "learning_rate": 9.719401565048532e-06, + "loss": 5.1714, + "step": 21155 + }, + { + "epoch": 0.4305013020833333, + "grad_norm": 13.420735359191895, + "learning_rate": 9.719269541431324e-06, + "loss": 5.1483, + "step": 21160 + }, + { + "epoch": 0.43060302734375, + "grad_norm": 18.9901180267334, + "learning_rate": 9.719137487659485e-06, + "loss": 5.025, + "step": 21165 + }, + { + "epoch": 0.4307047526041667, + "grad_norm": 18.58553123474121, + "learning_rate": 9.719005403733851e-06, + "loss": 5.3604, + "step": 21170 + }, + { + "epoch": 0.4308064778645833, + "grad_norm": 22.24885368347168, + "learning_rate": 9.718873289655275e-06, + "loss": 5.0213, + "step": 21175 + }, + { + "epoch": 0.430908203125, + "grad_norm": 16.71074104309082, + "learning_rate": 9.718741145424594e-06, + "loss": 5.1446, + "step": 21180 + }, + { + "epoch": 0.4310099283854167, + "grad_norm": 18.344762802124023, + "learning_rate": 9.718608971042656e-06, + "loss": 5.134, + "step": 21185 + }, + { + "epoch": 0.4311116536458333, + "grad_norm": 16.376880645751953, + "learning_rate": 9.718476766510303e-06, + "loss": 5.1726, + "step": 21190 + }, + { + "epoch": 0.43121337890625, + "grad_norm": 14.659795761108398, + "learning_rate": 9.718344531828383e-06, + "loss": 5.1289, + "step": 21195 + }, + { + "epoch": 0.4313151041666667, + "grad_norm": 17.831541061401367, + "learning_rate": 9.71821226699774e-06, + "loss": 5.0488, + "step": 21200 + }, + { + "epoch": 0.4314168294270833, + "grad_norm": 23.781005859375, + "learning_rate": 9.718079972019217e-06, + "loss": 5.169, + "step": 21205 + }, + { + "epoch": 0.4315185546875, + "grad_norm": 15.134095191955566, + "learning_rate": 9.717947646893662e-06, + "loss": 5.6469, + "step": 21210 + }, + { + "epoch": 0.4316202799479167, + "grad_norm": 14.915077209472656, + "learning_rate": 9.717815291621917e-06, + "loss": 5.1068, + "step": 21215 + }, + { + "epoch": 0.4317220052083333, + "grad_norm": 18.038463592529297, + "learning_rate": 9.71768290620483e-06, + "loss": 5.1257, + "step": 21220 + }, + { + "epoch": 0.43182373046875, + "grad_norm": 14.845311164855957, + "learning_rate": 9.717550490643248e-06, + "loss": 5.0422, + "step": 21225 + }, + { + "epoch": 0.4319254557291667, + "grad_norm": 15.164895057678223, + "learning_rate": 9.717418044938015e-06, + "loss": 5.7191, + "step": 21230 + }, + { + "epoch": 0.4320271809895833, + "grad_norm": 18.607467651367188, + "learning_rate": 9.71728556908998e-06, + "loss": 5.158, + "step": 21235 + }, + { + "epoch": 0.43212890625, + "grad_norm": 18.06108283996582, + "learning_rate": 9.717153063099986e-06, + "loss": 5.1509, + "step": 21240 + }, + { + "epoch": 0.4322306315104167, + "grad_norm": 16.299962997436523, + "learning_rate": 9.717020526968883e-06, + "loss": 5.1769, + "step": 21245 + }, + { + "epoch": 0.4323323567708333, + "grad_norm": 15.146202087402344, + "learning_rate": 9.716887960697516e-06, + "loss": 5.1802, + "step": 21250 + }, + { + "epoch": 0.43243408203125, + "grad_norm": 15.17374324798584, + "learning_rate": 9.716755364286733e-06, + "loss": 5.0805, + "step": 21255 + }, + { + "epoch": 0.4325358072916667, + "grad_norm": 18.19688606262207, + "learning_rate": 9.716622737737379e-06, + "loss": 4.985, + "step": 21260 + }, + { + "epoch": 0.4326375325520833, + "grad_norm": 20.909883499145508, + "learning_rate": 9.716490081050301e-06, + "loss": 5.098, + "step": 21265 + }, + { + "epoch": 0.4327392578125, + "grad_norm": 15.021774291992188, + "learning_rate": 9.71635739422635e-06, + "loss": 5.256, + "step": 21270 + }, + { + "epoch": 0.4328409830729167, + "grad_norm": 15.208291053771973, + "learning_rate": 9.716224677266373e-06, + "loss": 4.8997, + "step": 21275 + }, + { + "epoch": 0.4329427083333333, + "grad_norm": 13.125770568847656, + "learning_rate": 9.716091930171219e-06, + "loss": 5.3936, + "step": 21280 + }, + { + "epoch": 0.43304443359375, + "grad_norm": 15.042616844177246, + "learning_rate": 9.715959152941732e-06, + "loss": 4.8606, + "step": 21285 + }, + { + "epoch": 0.4331461588541667, + "grad_norm": 20.34540367126465, + "learning_rate": 9.715826345578765e-06, + "loss": 5.06, + "step": 21290 + }, + { + "epoch": 0.4332478841145833, + "grad_norm": 17.769346237182617, + "learning_rate": 9.715693508083164e-06, + "loss": 5.233, + "step": 21295 + }, + { + "epoch": 0.433349609375, + "grad_norm": 17.762426376342773, + "learning_rate": 9.715560640455778e-06, + "loss": 5.1151, + "step": 21300 + }, + { + "epoch": 0.4334513346354167, + "grad_norm": 17.994461059570312, + "learning_rate": 9.715427742697456e-06, + "loss": 5.3856, + "step": 21305 + }, + { + "epoch": 0.4335530598958333, + "grad_norm": 14.416504859924316, + "learning_rate": 9.71529481480905e-06, + "loss": 5.0131, + "step": 21310 + }, + { + "epoch": 0.43365478515625, + "grad_norm": 14.17530345916748, + "learning_rate": 9.715161856791404e-06, + "loss": 5.1546, + "step": 21315 + }, + { + "epoch": 0.4337565104166667, + "grad_norm": 15.290117263793945, + "learning_rate": 9.715028868645372e-06, + "loss": 5.1992, + "step": 21320 + }, + { + "epoch": 0.4338582356770833, + "grad_norm": 12.757314682006836, + "learning_rate": 9.7148958503718e-06, + "loss": 5.1749, + "step": 21325 + }, + { + "epoch": 0.4339599609375, + "grad_norm": 15.54615592956543, + "learning_rate": 9.714762801971545e-06, + "loss": 5.6603, + "step": 21330 + }, + { + "epoch": 0.4340616861979167, + "grad_norm": 17.621206283569336, + "learning_rate": 9.71462972344545e-06, + "loss": 5.3697, + "step": 21335 + }, + { + "epoch": 0.4341634114583333, + "grad_norm": 16.495718002319336, + "learning_rate": 9.714496614794366e-06, + "loss": 5.2247, + "step": 21340 + }, + { + "epoch": 0.43426513671875, + "grad_norm": 18.711111068725586, + "learning_rate": 9.714363476019147e-06, + "loss": 5.4273, + "step": 21345 + }, + { + "epoch": 0.4343668619791667, + "grad_norm": 21.618839263916016, + "learning_rate": 9.714230307120641e-06, + "loss": 5.556, + "step": 21350 + }, + { + "epoch": 0.4344685872395833, + "grad_norm": 18.95964813232422, + "learning_rate": 9.7140971080997e-06, + "loss": 5.2878, + "step": 21355 + }, + { + "epoch": 0.4345703125, + "grad_norm": 17.440906524658203, + "learning_rate": 9.713963878957176e-06, + "loss": 5.0799, + "step": 21360 + }, + { + "epoch": 0.4346720377604167, + "grad_norm": 17.312557220458984, + "learning_rate": 9.713830619693918e-06, + "loss": 5.1895, + "step": 21365 + }, + { + "epoch": 0.4347737630208333, + "grad_norm": 13.904565811157227, + "learning_rate": 9.713697330310779e-06, + "loss": 5.2262, + "step": 21370 + }, + { + "epoch": 0.43487548828125, + "grad_norm": 19.238752365112305, + "learning_rate": 9.71356401080861e-06, + "loss": 5.3828, + "step": 21375 + }, + { + "epoch": 0.4349772135416667, + "grad_norm": 14.108538627624512, + "learning_rate": 9.713430661188265e-06, + "loss": 5.1406, + "step": 21380 + }, + { + "epoch": 0.4350789388020833, + "grad_norm": 15.705227851867676, + "learning_rate": 9.713297281450594e-06, + "loss": 5.1342, + "step": 21385 + }, + { + "epoch": 0.4351806640625, + "grad_norm": 15.613298416137695, + "learning_rate": 9.713163871596448e-06, + "loss": 5.3866, + "step": 21390 + }, + { + "epoch": 0.4352823893229167, + "grad_norm": 19.80035972595215, + "learning_rate": 9.713030431626682e-06, + "loss": 5.514, + "step": 21395 + }, + { + "epoch": 0.4353841145833333, + "grad_norm": 12.844326972961426, + "learning_rate": 9.712896961542148e-06, + "loss": 5.248, + "step": 21400 + }, + { + "epoch": 0.43548583984375, + "grad_norm": 19.649389266967773, + "learning_rate": 9.712763461343698e-06, + "loss": 5.2866, + "step": 21405 + }, + { + "epoch": 0.4355875651041667, + "grad_norm": 17.802162170410156, + "learning_rate": 9.712629931032185e-06, + "loss": 5.3932, + "step": 21410 + }, + { + "epoch": 0.4356892903645833, + "grad_norm": 14.586886405944824, + "learning_rate": 9.712496370608463e-06, + "loss": 5.2772, + "step": 21415 + }, + { + "epoch": 0.435791015625, + "grad_norm": 16.44700050354004, + "learning_rate": 9.712362780073385e-06, + "loss": 5.1295, + "step": 21420 + }, + { + "epoch": 0.4358927408854167, + "grad_norm": 16.14716911315918, + "learning_rate": 9.712229159427805e-06, + "loss": 5.1095, + "step": 21425 + }, + { + "epoch": 0.4359944661458333, + "grad_norm": 13.806693077087402, + "learning_rate": 9.712095508672576e-06, + "loss": 5.1251, + "step": 21430 + }, + { + "epoch": 0.43609619140625, + "grad_norm": 17.29931640625, + "learning_rate": 9.711961827808553e-06, + "loss": 5.0564, + "step": 21435 + }, + { + "epoch": 0.4361979166666667, + "grad_norm": 20.059459686279297, + "learning_rate": 9.71182811683659e-06, + "loss": 4.9652, + "step": 21440 + }, + { + "epoch": 0.4362996419270833, + "grad_norm": 17.951719284057617, + "learning_rate": 9.711694375757542e-06, + "loss": 5.1495, + "step": 21445 + }, + { + "epoch": 0.4364013671875, + "grad_norm": 20.760787963867188, + "learning_rate": 9.71156060457226e-06, + "loss": 5.4294, + "step": 21450 + }, + { + "epoch": 0.4365030924479167, + "grad_norm": 13.08244800567627, + "learning_rate": 9.711426803281604e-06, + "loss": 5.1909, + "step": 21455 + }, + { + "epoch": 0.4366048177083333, + "grad_norm": 13.145360946655273, + "learning_rate": 9.711292971886426e-06, + "loss": 5.2753, + "step": 21460 + }, + { + "epoch": 0.43670654296875, + "grad_norm": 16.290790557861328, + "learning_rate": 9.71115911038758e-06, + "loss": 5.3231, + "step": 21465 + }, + { + "epoch": 0.4368082682291667, + "grad_norm": 16.59366798400879, + "learning_rate": 9.711025218785923e-06, + "loss": 5.3525, + "step": 21470 + }, + { + "epoch": 0.4369099934895833, + "grad_norm": 18.600797653198242, + "learning_rate": 9.710891297082311e-06, + "loss": 5.1081, + "step": 21475 + }, + { + "epoch": 0.43701171875, + "grad_norm": 14.090620994567871, + "learning_rate": 9.7107573452776e-06, + "loss": 5.0251, + "step": 21480 + }, + { + "epoch": 0.4371134440104167, + "grad_norm": 12.41346263885498, + "learning_rate": 9.710623363372643e-06, + "loss": 5.182, + "step": 21485 + }, + { + "epoch": 0.4372151692708333, + "grad_norm": 24.482290267944336, + "learning_rate": 9.710489351368299e-06, + "loss": 5.331, + "step": 21490 + }, + { + "epoch": 0.43731689453125, + "grad_norm": 23.999053955078125, + "learning_rate": 9.710355309265422e-06, + "loss": 5.5086, + "step": 21495 + }, + { + "epoch": 0.4374186197916667, + "grad_norm": 18.53911018371582, + "learning_rate": 9.710221237064872e-06, + "loss": 5.0605, + "step": 21500 + }, + { + "epoch": 0.4375203450520833, + "grad_norm": 16.165130615234375, + "learning_rate": 9.710087134767502e-06, + "loss": 5.1953, + "step": 21505 + }, + { + "epoch": 0.4376220703125, + "grad_norm": 17.753311157226562, + "learning_rate": 9.709953002374172e-06, + "loss": 5.1308, + "step": 21510 + }, + { + "epoch": 0.4377237955729167, + "grad_norm": 16.430946350097656, + "learning_rate": 9.709818839885736e-06, + "loss": 5.1667, + "step": 21515 + }, + { + "epoch": 0.4378255208333333, + "grad_norm": 20.051271438598633, + "learning_rate": 9.709684647303053e-06, + "loss": 4.8794, + "step": 21520 + }, + { + "epoch": 0.43792724609375, + "grad_norm": 20.275453567504883, + "learning_rate": 9.70955042462698e-06, + "loss": 5.3951, + "step": 21525 + }, + { + "epoch": 0.4380289713541667, + "grad_norm": 20.068944931030273, + "learning_rate": 9.709416171858376e-06, + "loss": 5.1762, + "step": 21530 + }, + { + "epoch": 0.4381306966145833, + "grad_norm": 14.68043327331543, + "learning_rate": 9.709281888998097e-06, + "loss": 5.2221, + "step": 21535 + }, + { + "epoch": 0.438232421875, + "grad_norm": 23.611825942993164, + "learning_rate": 9.709147576047e-06, + "loss": 5.0656, + "step": 21540 + }, + { + "epoch": 0.4383341471354167, + "grad_norm": 14.6028413772583, + "learning_rate": 9.709013233005947e-06, + "loss": 5.4097, + "step": 21545 + }, + { + "epoch": 0.4384358723958333, + "grad_norm": 16.365285873413086, + "learning_rate": 9.708878859875794e-06, + "loss": 4.9269, + "step": 21550 + }, + { + "epoch": 0.43853759765625, + "grad_norm": 16.853200912475586, + "learning_rate": 9.7087444566574e-06, + "loss": 5.0796, + "step": 21555 + }, + { + "epoch": 0.4386393229166667, + "grad_norm": 14.492009162902832, + "learning_rate": 9.708610023351624e-06, + "loss": 5.2461, + "step": 21560 + }, + { + "epoch": 0.4387410481770833, + "grad_norm": 11.564762115478516, + "learning_rate": 9.708475559959323e-06, + "loss": 5.3705, + "step": 21565 + }, + { + "epoch": 0.4388427734375, + "grad_norm": 22.00505828857422, + "learning_rate": 9.708341066481358e-06, + "loss": 5.0668, + "step": 21570 + }, + { + "epoch": 0.4389444986979167, + "grad_norm": 19.074132919311523, + "learning_rate": 9.70820654291859e-06, + "loss": 5.1092, + "step": 21575 + }, + { + "epoch": 0.4390462239583333, + "grad_norm": 12.967419624328613, + "learning_rate": 9.708071989271875e-06, + "loss": 5.2363, + "step": 21580 + }, + { + "epoch": 0.43914794921875, + "grad_norm": 19.097148895263672, + "learning_rate": 9.707937405542076e-06, + "loss": 5.2759, + "step": 21585 + }, + { + "epoch": 0.4392496744791667, + "grad_norm": 15.819908142089844, + "learning_rate": 9.70780279173005e-06, + "loss": 5.3728, + "step": 21590 + }, + { + "epoch": 0.4393513997395833, + "grad_norm": 12.644155502319336, + "learning_rate": 9.70766814783666e-06, + "loss": 5.4564, + "step": 21595 + }, + { + "epoch": 0.439453125, + "grad_norm": 15.102373123168945, + "learning_rate": 9.707533473862762e-06, + "loss": 5.4924, + "step": 21600 + }, + { + "epoch": 0.4395548502604167, + "grad_norm": 15.698161125183105, + "learning_rate": 9.707398769809222e-06, + "loss": 5.3467, + "step": 21605 + }, + { + "epoch": 0.4396565755208333, + "grad_norm": 14.274174690246582, + "learning_rate": 9.707264035676898e-06, + "loss": 5.3686, + "step": 21610 + }, + { + "epoch": 0.43975830078125, + "grad_norm": 15.132723808288574, + "learning_rate": 9.707129271466651e-06, + "loss": 5.4203, + "step": 21615 + }, + { + "epoch": 0.4398600260416667, + "grad_norm": 23.96868133544922, + "learning_rate": 9.706994477179344e-06, + "loss": 5.2147, + "step": 21620 + }, + { + "epoch": 0.4399617513020833, + "grad_norm": 14.46100902557373, + "learning_rate": 9.706859652815832e-06, + "loss": 5.3461, + "step": 21625 + }, + { + "epoch": 0.4400634765625, + "grad_norm": 20.342208862304688, + "learning_rate": 9.706724798376985e-06, + "loss": 5.3672, + "step": 21630 + }, + { + "epoch": 0.4401652018229167, + "grad_norm": 17.893033981323242, + "learning_rate": 9.70658991386366e-06, + "loss": 5.208, + "step": 21635 + }, + { + "epoch": 0.4402669270833333, + "grad_norm": 19.650798797607422, + "learning_rate": 9.706454999276718e-06, + "loss": 5.1267, + "step": 21640 + }, + { + "epoch": 0.44036865234375, + "grad_norm": 17.061359405517578, + "learning_rate": 9.706320054617024e-06, + "loss": 5.3146, + "step": 21645 + }, + { + "epoch": 0.4404703776041667, + "grad_norm": 15.506134033203125, + "learning_rate": 9.706185079885437e-06, + "loss": 5.079, + "step": 21650 + }, + { + "epoch": 0.4405721028645833, + "grad_norm": 15.976726531982422, + "learning_rate": 9.706050075082823e-06, + "loss": 5.08, + "step": 21655 + }, + { + "epoch": 0.440673828125, + "grad_norm": 16.64453125, + "learning_rate": 9.705915040210044e-06, + "loss": 5.3325, + "step": 21660 + }, + { + "epoch": 0.4407755533854167, + "grad_norm": 20.05837059020996, + "learning_rate": 9.705779975267959e-06, + "loss": 5.2884, + "step": 21665 + }, + { + "epoch": 0.4408772786458333, + "grad_norm": 14.511201858520508, + "learning_rate": 9.705644880257435e-06, + "loss": 5.1359, + "step": 21670 + }, + { + "epoch": 0.44097900390625, + "grad_norm": 22.948516845703125, + "learning_rate": 9.705509755179334e-06, + "loss": 5.2998, + "step": 21675 + }, + { + "epoch": 0.4410807291666667, + "grad_norm": 12.86394214630127, + "learning_rate": 9.705374600034519e-06, + "loss": 5.2279, + "step": 21680 + }, + { + "epoch": 0.4411824544270833, + "grad_norm": 14.912232398986816, + "learning_rate": 9.705239414823855e-06, + "loss": 5.1845, + "step": 21685 + }, + { + "epoch": 0.4412841796875, + "grad_norm": 17.96878433227539, + "learning_rate": 9.705104199548203e-06, + "loss": 5.1629, + "step": 21690 + }, + { + "epoch": 0.4413859049479167, + "grad_norm": 17.658615112304688, + "learning_rate": 9.70496895420843e-06, + "loss": 5.1302, + "step": 21695 + }, + { + "epoch": 0.4414876302083333, + "grad_norm": 15.555442810058594, + "learning_rate": 9.704833678805398e-06, + "loss": 4.9692, + "step": 21700 + }, + { + "epoch": 0.44158935546875, + "grad_norm": 13.997861862182617, + "learning_rate": 9.704698373339975e-06, + "loss": 4.9616, + "step": 21705 + }, + { + "epoch": 0.4416910807291667, + "grad_norm": 22.507043838500977, + "learning_rate": 9.70456303781302e-06, + "loss": 5.3963, + "step": 21710 + }, + { + "epoch": 0.4417928059895833, + "grad_norm": 16.513057708740234, + "learning_rate": 9.704427672225403e-06, + "loss": 5.2794, + "step": 21715 + }, + { + "epoch": 0.44189453125, + "grad_norm": 17.10872459411621, + "learning_rate": 9.704292276577985e-06, + "loss": 5.0069, + "step": 21720 + }, + { + "epoch": 0.4419962565104167, + "grad_norm": 14.934173583984375, + "learning_rate": 9.704156850871632e-06, + "loss": 4.9965, + "step": 21725 + }, + { + "epoch": 0.4420979817708333, + "grad_norm": 15.669989585876465, + "learning_rate": 9.70402139510721e-06, + "loss": 5.4085, + "step": 21730 + }, + { + "epoch": 0.44219970703125, + "grad_norm": 15.608147621154785, + "learning_rate": 9.703885909285586e-06, + "loss": 5.2792, + "step": 21735 + }, + { + "epoch": 0.4423014322916667, + "grad_norm": 18.731767654418945, + "learning_rate": 9.703750393407625e-06, + "loss": 5.194, + "step": 21740 + }, + { + "epoch": 0.4424031575520833, + "grad_norm": 20.09510612487793, + "learning_rate": 9.70361484747419e-06, + "loss": 4.9353, + "step": 21745 + }, + { + "epoch": 0.4425048828125, + "grad_norm": 20.499996185302734, + "learning_rate": 9.70347927148615e-06, + "loss": 5.3184, + "step": 21750 + }, + { + "epoch": 0.4426066080729167, + "grad_norm": 30.7756404876709, + "learning_rate": 9.70334366544437e-06, + "loss": 5.4583, + "step": 21755 + }, + { + "epoch": 0.4427083333333333, + "grad_norm": 16.532270431518555, + "learning_rate": 9.703208029349718e-06, + "loss": 5.2184, + "step": 21760 + }, + { + "epoch": 0.44281005859375, + "grad_norm": 17.31528091430664, + "learning_rate": 9.70307236320306e-06, + "loss": 5.247, + "step": 21765 + }, + { + "epoch": 0.4429117838541667, + "grad_norm": 18.895078659057617, + "learning_rate": 9.702936667005261e-06, + "loss": 5.3835, + "step": 21770 + }, + { + "epoch": 0.4430135091145833, + "grad_norm": 15.520242691040039, + "learning_rate": 9.702800940757192e-06, + "loss": 5.1269, + "step": 21775 + }, + { + "epoch": 0.443115234375, + "grad_norm": 17.764245986938477, + "learning_rate": 9.702665184459717e-06, + "loss": 5.1995, + "step": 21780 + }, + { + "epoch": 0.4432169596354167, + "grad_norm": 15.769357681274414, + "learning_rate": 9.702529398113703e-06, + "loss": 4.9668, + "step": 21785 + }, + { + "epoch": 0.4433186848958333, + "grad_norm": 19.411163330078125, + "learning_rate": 9.702393581720019e-06, + "loss": 5.2219, + "step": 21790 + }, + { + "epoch": 0.44342041015625, + "grad_norm": 23.257614135742188, + "learning_rate": 9.702257735279533e-06, + "loss": 5.433, + "step": 21795 + }, + { + "epoch": 0.4435221354166667, + "grad_norm": 16.775203704833984, + "learning_rate": 9.702121858793114e-06, + "loss": 5.2293, + "step": 21800 + }, + { + "epoch": 0.4436238606770833, + "grad_norm": 14.234431266784668, + "learning_rate": 9.701985952261628e-06, + "loss": 5.0926, + "step": 21805 + }, + { + "epoch": 0.4437255859375, + "grad_norm": 20.366884231567383, + "learning_rate": 9.701850015685945e-06, + "loss": 5.1886, + "step": 21810 + }, + { + "epoch": 0.4438273111979167, + "grad_norm": 17.81372833251953, + "learning_rate": 9.701714049066932e-06, + "loss": 4.9517, + "step": 21815 + }, + { + "epoch": 0.4439290364583333, + "grad_norm": 16.63152503967285, + "learning_rate": 9.701578052405458e-06, + "loss": 5.1154, + "step": 21820 + }, + { + "epoch": 0.44403076171875, + "grad_norm": 17.81133270263672, + "learning_rate": 9.701442025702395e-06, + "loss": 5.099, + "step": 21825 + }, + { + "epoch": 0.4441324869791667, + "grad_norm": 21.41663932800293, + "learning_rate": 9.701305968958609e-06, + "loss": 5.3529, + "step": 21830 + }, + { + "epoch": 0.4442342122395833, + "grad_norm": 16.623443603515625, + "learning_rate": 9.70116988217497e-06, + "loss": 5.1186, + "step": 21835 + }, + { + "epoch": 0.4443359375, + "grad_norm": 18.1769962310791, + "learning_rate": 9.701033765352348e-06, + "loss": 5.3302, + "step": 21840 + }, + { + "epoch": 0.4444376627604167, + "grad_norm": 15.030570030212402, + "learning_rate": 9.700897618491612e-06, + "loss": 5.3638, + "step": 21845 + }, + { + "epoch": 0.4445393880208333, + "grad_norm": 19.760446548461914, + "learning_rate": 9.700761441593634e-06, + "loss": 5.4858, + "step": 21850 + }, + { + "epoch": 0.44464111328125, + "grad_norm": 16.487932205200195, + "learning_rate": 9.70062523465928e-06, + "loss": 5.4534, + "step": 21855 + }, + { + "epoch": 0.4447428385416667, + "grad_norm": 19.57889747619629, + "learning_rate": 9.700488997689426e-06, + "loss": 5.1569, + "step": 21860 + }, + { + "epoch": 0.4448445638020833, + "grad_norm": 12.158406257629395, + "learning_rate": 9.700352730684937e-06, + "loss": 5.2279, + "step": 21865 + }, + { + "epoch": 0.4449462890625, + "grad_norm": 22.68716049194336, + "learning_rate": 9.700216433646687e-06, + "loss": 5.0996, + "step": 21870 + }, + { + "epoch": 0.4450480143229167, + "grad_norm": 18.38849639892578, + "learning_rate": 9.700080106575546e-06, + "loss": 5.0848, + "step": 21875 + }, + { + "epoch": 0.4451497395833333, + "grad_norm": 14.419346809387207, + "learning_rate": 9.699943749472385e-06, + "loss": 5.1254, + "step": 21880 + }, + { + "epoch": 0.44525146484375, + "grad_norm": 19.44683074951172, + "learning_rate": 9.699807362338076e-06, + "loss": 5.4917, + "step": 21885 + }, + { + "epoch": 0.4453531901041667, + "grad_norm": 15.546610832214355, + "learning_rate": 9.699670945173489e-06, + "loss": 5.3759, + "step": 21890 + }, + { + "epoch": 0.4454549153645833, + "grad_norm": 15.009594917297363, + "learning_rate": 9.699534497979496e-06, + "loss": 5.0155, + "step": 21895 + }, + { + "epoch": 0.445556640625, + "grad_norm": 20.302749633789062, + "learning_rate": 9.699398020756971e-06, + "loss": 5.1586, + "step": 21900 + }, + { + "epoch": 0.4456583658854167, + "grad_norm": 21.612401962280273, + "learning_rate": 9.699261513506784e-06, + "loss": 5.2667, + "step": 21905 + }, + { + "epoch": 0.4457600911458333, + "grad_norm": 15.02286434173584, + "learning_rate": 9.699124976229808e-06, + "loss": 5.2684, + "step": 21910 + }, + { + "epoch": 0.44586181640625, + "grad_norm": 15.729004859924316, + "learning_rate": 9.698988408926914e-06, + "loss": 5.3832, + "step": 21915 + }, + { + "epoch": 0.4459635416666667, + "grad_norm": 16.62214469909668, + "learning_rate": 9.698851811598978e-06, + "loss": 5.5019, + "step": 21920 + }, + { + "epoch": 0.4460652669270833, + "grad_norm": 21.27591323852539, + "learning_rate": 9.698715184246868e-06, + "loss": 5.3191, + "step": 21925 + }, + { + "epoch": 0.4461669921875, + "grad_norm": 14.757766723632812, + "learning_rate": 9.69857852687146e-06, + "loss": 5.2584, + "step": 21930 + }, + { + "epoch": 0.4462687174479167, + "grad_norm": 18.84818458557129, + "learning_rate": 9.698441839473626e-06, + "loss": 5.6012, + "step": 21935 + }, + { + "epoch": 0.4463704427083333, + "grad_norm": 20.626171112060547, + "learning_rate": 9.698305122054243e-06, + "loss": 5.1346, + "step": 21940 + }, + { + "epoch": 0.44647216796875, + "grad_norm": 15.50588607788086, + "learning_rate": 9.698168374614179e-06, + "loss": 5.0224, + "step": 21945 + }, + { + "epoch": 0.4465738932291667, + "grad_norm": 12.772571563720703, + "learning_rate": 9.698031597154312e-06, + "loss": 5.2326, + "step": 21950 + }, + { + "epoch": 0.4466756184895833, + "grad_norm": 16.210378646850586, + "learning_rate": 9.697894789675514e-06, + "loss": 5.2489, + "step": 21955 + }, + { + "epoch": 0.44677734375, + "grad_norm": 14.922961235046387, + "learning_rate": 9.697757952178658e-06, + "loss": 5.1394, + "step": 21960 + }, + { + "epoch": 0.4468790690104167, + "grad_norm": 17.559284210205078, + "learning_rate": 9.697621084664624e-06, + "loss": 5.1292, + "step": 21965 + }, + { + "epoch": 0.4469807942708333, + "grad_norm": 13.684420585632324, + "learning_rate": 9.697484187134279e-06, + "loss": 5.0288, + "step": 21970 + }, + { + "epoch": 0.44708251953125, + "grad_norm": 19.02187728881836, + "learning_rate": 9.697347259588503e-06, + "loss": 5.0211, + "step": 21975 + }, + { + "epoch": 0.4471842447916667, + "grad_norm": 15.402444839477539, + "learning_rate": 9.697210302028168e-06, + "loss": 5.2375, + "step": 21980 + }, + { + "epoch": 0.4472859700520833, + "grad_norm": 20.486501693725586, + "learning_rate": 9.697073314454151e-06, + "loss": 5.2233, + "step": 21985 + }, + { + "epoch": 0.4473876953125, + "grad_norm": 18.39158821105957, + "learning_rate": 9.696936296867325e-06, + "loss": 5.1572, + "step": 21990 + }, + { + "epoch": 0.4474894205729167, + "grad_norm": 14.635111808776855, + "learning_rate": 9.69679924926857e-06, + "loss": 4.98, + "step": 21995 + }, + { + "epoch": 0.4475911458333333, + "grad_norm": 20.201114654541016, + "learning_rate": 9.696662171658756e-06, + "loss": 5.4095, + "step": 22000 + }, + { + "epoch": 0.44769287109375, + "grad_norm": 14.272466659545898, + "learning_rate": 9.696525064038762e-06, + "loss": 5.2137, + "step": 22005 + }, + { + "epoch": 0.4477945963541667, + "grad_norm": 18.543682098388672, + "learning_rate": 9.696387926409466e-06, + "loss": 5.1585, + "step": 22010 + }, + { + "epoch": 0.4478963216145833, + "grad_norm": 16.9285831451416, + "learning_rate": 9.696250758771739e-06, + "loss": 5.1756, + "step": 22015 + }, + { + "epoch": 0.447998046875, + "grad_norm": 16.7357234954834, + "learning_rate": 9.696113561126462e-06, + "loss": 5.1213, + "step": 22020 + }, + { + "epoch": 0.4480997721354167, + "grad_norm": 19.368898391723633, + "learning_rate": 9.69597633347451e-06, + "loss": 5.3133, + "step": 22025 + }, + { + "epoch": 0.4482014973958333, + "grad_norm": 21.943607330322266, + "learning_rate": 9.69583907581676e-06, + "loss": 5.1478, + "step": 22030 + }, + { + "epoch": 0.44830322265625, + "grad_norm": 19.858612060546875, + "learning_rate": 9.69570178815409e-06, + "loss": 5.1581, + "step": 22035 + }, + { + "epoch": 0.4484049479166667, + "grad_norm": 15.866512298583984, + "learning_rate": 9.695564470487373e-06, + "loss": 5.2434, + "step": 22040 + }, + { + "epoch": 0.4485066731770833, + "grad_norm": 19.89897346496582, + "learning_rate": 9.695427122817493e-06, + "loss": 5.0532, + "step": 22045 + }, + { + "epoch": 0.4486083984375, + "grad_norm": 19.119972229003906, + "learning_rate": 9.695289745145323e-06, + "loss": 5.1811, + "step": 22050 + }, + { + "epoch": 0.4487101236979167, + "grad_norm": 19.621984481811523, + "learning_rate": 9.695152337471742e-06, + "loss": 5.349, + "step": 22055 + }, + { + "epoch": 0.4488118489583333, + "grad_norm": 17.827228546142578, + "learning_rate": 9.695014899797628e-06, + "loss": 5.0489, + "step": 22060 + }, + { + "epoch": 0.44891357421875, + "grad_norm": 16.409706115722656, + "learning_rate": 9.694877432123858e-06, + "loss": 5.1926, + "step": 22065 + }, + { + "epoch": 0.4490152994791667, + "grad_norm": 16.915796279907227, + "learning_rate": 9.694739934451313e-06, + "loss": 5.2082, + "step": 22070 + }, + { + "epoch": 0.4491170247395833, + "grad_norm": 16.381729125976562, + "learning_rate": 9.69460240678087e-06, + "loss": 5.3521, + "step": 22075 + }, + { + "epoch": 0.44921875, + "grad_norm": 16.785762786865234, + "learning_rate": 9.694464849113408e-06, + "loss": 5.286, + "step": 22080 + }, + { + "epoch": 0.4493204752604167, + "grad_norm": 15.522215843200684, + "learning_rate": 9.694327261449806e-06, + "loss": 5.4431, + "step": 22085 + }, + { + "epoch": 0.4494222005208333, + "grad_norm": 16.970951080322266, + "learning_rate": 9.694189643790942e-06, + "loss": 5.1947, + "step": 22090 + }, + { + "epoch": 0.44952392578125, + "grad_norm": 15.675413131713867, + "learning_rate": 9.694051996137697e-06, + "loss": 4.9375, + "step": 22095 + }, + { + "epoch": 0.4496256510416667, + "grad_norm": 14.257793426513672, + "learning_rate": 9.69391431849095e-06, + "loss": 5.341, + "step": 22100 + }, + { + "epoch": 0.4497273763020833, + "grad_norm": 17.654993057250977, + "learning_rate": 9.693776610851579e-06, + "loss": 5.2691, + "step": 22105 + }, + { + "epoch": 0.4498291015625, + "grad_norm": 17.92700958251953, + "learning_rate": 9.693638873220467e-06, + "loss": 5.1945, + "step": 22110 + }, + { + "epoch": 0.4499308268229167, + "grad_norm": 12.505136489868164, + "learning_rate": 9.693501105598492e-06, + "loss": 5.0899, + "step": 22115 + }, + { + "epoch": 0.4500325520833333, + "grad_norm": 12.634857177734375, + "learning_rate": 9.693363307986534e-06, + "loss": 5.2292, + "step": 22120 + }, + { + "epoch": 0.45013427734375, + "grad_norm": 13.27489948272705, + "learning_rate": 9.693225480385475e-06, + "loss": 5.1983, + "step": 22125 + }, + { + "epoch": 0.4502360026041667, + "grad_norm": 18.526065826416016, + "learning_rate": 9.693087622796195e-06, + "loss": 5.1905, + "step": 22130 + }, + { + "epoch": 0.4503377278645833, + "grad_norm": 16.90886878967285, + "learning_rate": 9.692949735219576e-06, + "loss": 5.0396, + "step": 22135 + }, + { + "epoch": 0.450439453125, + "grad_norm": 15.750554084777832, + "learning_rate": 9.692811817656498e-06, + "loss": 5.2053, + "step": 22140 + }, + { + "epoch": 0.4505411783854167, + "grad_norm": 18.582012176513672, + "learning_rate": 9.69267387010784e-06, + "loss": 4.9621, + "step": 22145 + }, + { + "epoch": 0.4506429036458333, + "grad_norm": 13.202836036682129, + "learning_rate": 9.692535892574488e-06, + "loss": 5.0403, + "step": 22150 + }, + { + "epoch": 0.45074462890625, + "grad_norm": 16.073528289794922, + "learning_rate": 9.69239788505732e-06, + "loss": 5.2232, + "step": 22155 + }, + { + "epoch": 0.4508463541666667, + "grad_norm": 15.73413372039795, + "learning_rate": 9.69225984755722e-06, + "loss": 5.132, + "step": 22160 + }, + { + "epoch": 0.4509480794270833, + "grad_norm": 16.235654830932617, + "learning_rate": 9.692121780075068e-06, + "loss": 5.2261, + "step": 22165 + }, + { + "epoch": 0.4510498046875, + "grad_norm": 16.461692810058594, + "learning_rate": 9.691983682611748e-06, + "loss": 5.5694, + "step": 22170 + }, + { + "epoch": 0.4511515299479167, + "grad_norm": 12.04104232788086, + "learning_rate": 9.69184555516814e-06, + "loss": 5.3888, + "step": 22175 + }, + { + "epoch": 0.4512532552083333, + "grad_norm": 12.855375289916992, + "learning_rate": 9.691707397745132e-06, + "loss": 5.1341, + "step": 22180 + }, + { + "epoch": 0.45135498046875, + "grad_norm": 15.617329597473145, + "learning_rate": 9.6915692103436e-06, + "loss": 5.0531, + "step": 22185 + }, + { + "epoch": 0.4514567057291667, + "grad_norm": 24.349966049194336, + "learning_rate": 9.69143099296443e-06, + "loss": 5.3362, + "step": 22190 + }, + { + "epoch": 0.4515584309895833, + "grad_norm": 21.000316619873047, + "learning_rate": 9.691292745608507e-06, + "loss": 5.1077, + "step": 22195 + }, + { + "epoch": 0.45166015625, + "grad_norm": 18.49724006652832, + "learning_rate": 9.69115446827671e-06, + "loss": 5.152, + "step": 22200 + }, + { + "epoch": 0.4517618815104167, + "grad_norm": 12.37110710144043, + "learning_rate": 9.691016160969927e-06, + "loss": 4.9007, + "step": 22205 + }, + { + "epoch": 0.4518636067708333, + "grad_norm": 19.381797790527344, + "learning_rate": 9.69087782368904e-06, + "loss": 5.2072, + "step": 22210 + }, + { + "epoch": 0.45196533203125, + "grad_norm": 13.631892204284668, + "learning_rate": 9.690739456434934e-06, + "loss": 5.2807, + "step": 22215 + }, + { + "epoch": 0.4520670572916667, + "grad_norm": 21.123653411865234, + "learning_rate": 9.69060105920849e-06, + "loss": 4.9849, + "step": 22220 + }, + { + "epoch": 0.4521687825520833, + "grad_norm": 14.873435974121094, + "learning_rate": 9.690462632010594e-06, + "loss": 5.3631, + "step": 22225 + }, + { + "epoch": 0.4522705078125, + "grad_norm": 21.237892150878906, + "learning_rate": 9.690324174842132e-06, + "loss": 5.197, + "step": 22230 + }, + { + "epoch": 0.4523722330729167, + "grad_norm": 14.072562217712402, + "learning_rate": 9.690185687703986e-06, + "loss": 4.8488, + "step": 22235 + }, + { + "epoch": 0.4524739583333333, + "grad_norm": 15.66750717163086, + "learning_rate": 9.690047170597044e-06, + "loss": 5.0945, + "step": 22240 + }, + { + "epoch": 0.45257568359375, + "grad_norm": 15.161727905273438, + "learning_rate": 9.689908623522188e-06, + "loss": 5.2778, + "step": 22245 + }, + { + "epoch": 0.4526774088541667, + "grad_norm": 16.701488494873047, + "learning_rate": 9.689770046480304e-06, + "loss": 5.1416, + "step": 22250 + }, + { + "epoch": 0.4527791341145833, + "grad_norm": 14.05594539642334, + "learning_rate": 9.68963143947228e-06, + "loss": 4.9182, + "step": 22255 + }, + { + "epoch": 0.452880859375, + "grad_norm": 14.349746704101562, + "learning_rate": 9.689492802498998e-06, + "loss": 4.8869, + "step": 22260 + }, + { + "epoch": 0.4529825846354167, + "grad_norm": 20.752531051635742, + "learning_rate": 9.689354135561348e-06, + "loss": 5.454, + "step": 22265 + }, + { + "epoch": 0.4530843098958333, + "grad_norm": 25.12639045715332, + "learning_rate": 9.689215438660213e-06, + "loss": 5.0427, + "step": 22270 + }, + { + "epoch": 0.45318603515625, + "grad_norm": 16.655641555786133, + "learning_rate": 9.68907671179648e-06, + "loss": 5.2185, + "step": 22275 + }, + { + "epoch": 0.4532877604166667, + "grad_norm": 21.70589828491211, + "learning_rate": 9.688937954971033e-06, + "loss": 5.2852, + "step": 22280 + }, + { + "epoch": 0.4533894856770833, + "grad_norm": 22.350601196289062, + "learning_rate": 9.688799168184763e-06, + "loss": 4.9826, + "step": 22285 + }, + { + "epoch": 0.4534912109375, + "grad_norm": 16.752735137939453, + "learning_rate": 9.688660351438554e-06, + "loss": 5.1817, + "step": 22290 + }, + { + "epoch": 0.4535929361979167, + "grad_norm": 18.065567016601562, + "learning_rate": 9.688521504733294e-06, + "loss": 4.8848, + "step": 22295 + }, + { + "epoch": 0.4536946614583333, + "grad_norm": 17.198902130126953, + "learning_rate": 9.68838262806987e-06, + "loss": 5.0351, + "step": 22300 + }, + { + "epoch": 0.45379638671875, + "grad_norm": 15.038206100463867, + "learning_rate": 9.688243721449168e-06, + "loss": 5.3469, + "step": 22305 + }, + { + "epoch": 0.4538981119791667, + "grad_norm": 17.115455627441406, + "learning_rate": 9.688104784872078e-06, + "loss": 5.2433, + "step": 22310 + }, + { + "epoch": 0.4539998372395833, + "grad_norm": 14.576728820800781, + "learning_rate": 9.687965818339488e-06, + "loss": 4.9554, + "step": 22315 + }, + { + "epoch": 0.4541015625, + "grad_norm": 20.6390323638916, + "learning_rate": 9.687826821852282e-06, + "loss": 5.2123, + "step": 22320 + }, + { + "epoch": 0.4542032877604167, + "grad_norm": 17.501676559448242, + "learning_rate": 9.687687795411354e-06, + "loss": 5.4758, + "step": 22325 + }, + { + "epoch": 0.4543050130208333, + "grad_norm": 13.238372802734375, + "learning_rate": 9.687548739017586e-06, + "loss": 5.3371, + "step": 22330 + }, + { + "epoch": 0.45440673828125, + "grad_norm": 17.61545181274414, + "learning_rate": 9.68740965267187e-06, + "loss": 5.08, + "step": 22335 + }, + { + "epoch": 0.4545084635416667, + "grad_norm": 17.40553855895996, + "learning_rate": 9.687270536375095e-06, + "loss": 5.0459, + "step": 22340 + }, + { + "epoch": 0.4546101888020833, + "grad_norm": 15.965609550476074, + "learning_rate": 9.68713139012815e-06, + "loss": 5.0587, + "step": 22345 + }, + { + "epoch": 0.4547119140625, + "grad_norm": 13.94389533996582, + "learning_rate": 9.686992213931924e-06, + "loss": 5.2906, + "step": 22350 + }, + { + "epoch": 0.4548136393229167, + "grad_norm": 23.580322265625, + "learning_rate": 9.686853007787303e-06, + "loss": 5.1383, + "step": 22355 + }, + { + "epoch": 0.4549153645833333, + "grad_norm": 17.932435989379883, + "learning_rate": 9.68671377169518e-06, + "loss": 5.1471, + "step": 22360 + }, + { + "epoch": 0.45501708984375, + "grad_norm": 14.922982215881348, + "learning_rate": 9.686574505656444e-06, + "loss": 5.1897, + "step": 22365 + }, + { + "epoch": 0.4551188151041667, + "grad_norm": 13.65367603302002, + "learning_rate": 9.686435209671985e-06, + "loss": 5.1069, + "step": 22370 + }, + { + "epoch": 0.4552205403645833, + "grad_norm": 18.52198028564453, + "learning_rate": 9.686295883742692e-06, + "loss": 5.4866, + "step": 22375 + }, + { + "epoch": 0.455322265625, + "grad_norm": 13.302515983581543, + "learning_rate": 9.686156527869456e-06, + "loss": 5.1807, + "step": 22380 + }, + { + "epoch": 0.4554239908854167, + "grad_norm": 17.135265350341797, + "learning_rate": 9.686017142053168e-06, + "loss": 5.1873, + "step": 22385 + }, + { + "epoch": 0.4555257161458333, + "grad_norm": 19.535709381103516, + "learning_rate": 9.685877726294716e-06, + "loss": 5.261, + "step": 22390 + }, + { + "epoch": 0.45562744140625, + "grad_norm": 12.179122924804688, + "learning_rate": 9.685738280594995e-06, + "loss": 5.2066, + "step": 22395 + }, + { + "epoch": 0.4557291666666667, + "grad_norm": 17.896984100341797, + "learning_rate": 9.685598804954894e-06, + "loss": 5.3294, + "step": 22400 + }, + { + "epoch": 0.4558308919270833, + "grad_norm": 13.538403511047363, + "learning_rate": 9.685459299375303e-06, + "loss": 5.314, + "step": 22405 + }, + { + "epoch": 0.4559326171875, + "grad_norm": 14.1248197555542, + "learning_rate": 9.685319763857114e-06, + "loss": 5.3619, + "step": 22410 + }, + { + "epoch": 0.4560343424479167, + "grad_norm": 17.354541778564453, + "learning_rate": 9.68518019840122e-06, + "loss": 5.1995, + "step": 22415 + }, + { + "epoch": 0.4561360677083333, + "grad_norm": 19.362285614013672, + "learning_rate": 9.685040603008512e-06, + "loss": 5.2945, + "step": 22420 + }, + { + "epoch": 0.45623779296875, + "grad_norm": 20.799400329589844, + "learning_rate": 9.684900977679881e-06, + "loss": 5.1444, + "step": 22425 + }, + { + "epoch": 0.4563395182291667, + "grad_norm": 16.982410430908203, + "learning_rate": 9.684761322416221e-06, + "loss": 5.1265, + "step": 22430 + }, + { + "epoch": 0.4564412434895833, + "grad_norm": 18.47684669494629, + "learning_rate": 9.684621637218422e-06, + "loss": 5.0599, + "step": 22435 + }, + { + "epoch": 0.45654296875, + "grad_norm": 15.075881004333496, + "learning_rate": 9.684481922087378e-06, + "loss": 5.3551, + "step": 22440 + }, + { + "epoch": 0.4566446940104167, + "grad_norm": 18.10749626159668, + "learning_rate": 9.684342177023982e-06, + "loss": 5.0993, + "step": 22445 + }, + { + "epoch": 0.4567464192708333, + "grad_norm": 16.49908447265625, + "learning_rate": 9.684202402029126e-06, + "loss": 5.2753, + "step": 22450 + }, + { + "epoch": 0.45684814453125, + "grad_norm": 17.78779411315918, + "learning_rate": 9.684062597103705e-06, + "loss": 5.1524, + "step": 22455 + }, + { + "epoch": 0.4569498697916667, + "grad_norm": 17.781465530395508, + "learning_rate": 9.68392276224861e-06, + "loss": 5.0832, + "step": 22460 + }, + { + "epoch": 0.4570515950520833, + "grad_norm": 15.333683013916016, + "learning_rate": 9.683782897464734e-06, + "loss": 4.946, + "step": 22465 + }, + { + "epoch": 0.4571533203125, + "grad_norm": 21.82579231262207, + "learning_rate": 9.683643002752975e-06, + "loss": 5.2814, + "step": 22470 + }, + { + "epoch": 0.4572550455729167, + "grad_norm": 16.78913688659668, + "learning_rate": 9.683503078114223e-06, + "loss": 5.102, + "step": 22475 + }, + { + "epoch": 0.4573567708333333, + "grad_norm": 22.43134307861328, + "learning_rate": 9.683363123549373e-06, + "loss": 5.2678, + "step": 22480 + }, + { + "epoch": 0.45745849609375, + "grad_norm": 16.174692153930664, + "learning_rate": 9.683223139059319e-06, + "loss": 4.9428, + "step": 22485 + }, + { + "epoch": 0.4575602213541667, + "grad_norm": 18.598329544067383, + "learning_rate": 9.683083124644954e-06, + "loss": 5.1553, + "step": 22490 + }, + { + "epoch": 0.4576619466145833, + "grad_norm": 14.26465892791748, + "learning_rate": 9.682943080307176e-06, + "loss": 5.2434, + "step": 22495 + }, + { + "epoch": 0.457763671875, + "grad_norm": 16.18085479736328, + "learning_rate": 9.68280300604688e-06, + "loss": 5.1873, + "step": 22500 + }, + { + "epoch": 0.4578653971354167, + "grad_norm": 16.4917049407959, + "learning_rate": 9.682662901864958e-06, + "loss": 5.2771, + "step": 22505 + }, + { + "epoch": 0.4579671223958333, + "grad_norm": 16.033641815185547, + "learning_rate": 9.682522767762306e-06, + "loss": 5.4066, + "step": 22510 + }, + { + "epoch": 0.45806884765625, + "grad_norm": 22.35399627685547, + "learning_rate": 9.68238260373982e-06, + "loss": 5.5617, + "step": 22515 + }, + { + "epoch": 0.4581705729166667, + "grad_norm": 14.714438438415527, + "learning_rate": 9.682242409798396e-06, + "loss": 5.255, + "step": 22520 + }, + { + "epoch": 0.4582722981770833, + "grad_norm": 14.283019065856934, + "learning_rate": 9.682102185938928e-06, + "loss": 5.45, + "step": 22525 + }, + { + "epoch": 0.4583740234375, + "grad_norm": 16.152647018432617, + "learning_rate": 9.681961932162316e-06, + "loss": 5.1134, + "step": 22530 + }, + { + "epoch": 0.4584757486979167, + "grad_norm": 18.638870239257812, + "learning_rate": 9.681821648469451e-06, + "loss": 5.1289, + "step": 22535 + }, + { + "epoch": 0.4585774739583333, + "grad_norm": 21.394733428955078, + "learning_rate": 9.681681334861234e-06, + "loss": 5.1227, + "step": 22540 + }, + { + "epoch": 0.45867919921875, + "grad_norm": 15.240309715270996, + "learning_rate": 9.681540991338559e-06, + "loss": 5.2041, + "step": 22545 + }, + { + "epoch": 0.4587809244791667, + "grad_norm": 14.075970649719238, + "learning_rate": 9.68140061790232e-06, + "loss": 5.1625, + "step": 22550 + }, + { + "epoch": 0.4588826497395833, + "grad_norm": 18.596776962280273, + "learning_rate": 9.68126021455342e-06, + "loss": 5.2969, + "step": 22555 + }, + { + "epoch": 0.458984375, + "grad_norm": 16.298120498657227, + "learning_rate": 9.681119781292753e-06, + "loss": 5.3669, + "step": 22560 + }, + { + "epoch": 0.4590861002604167, + "grad_norm": 17.422435760498047, + "learning_rate": 9.680979318121218e-06, + "loss": 5.0841, + "step": 22565 + }, + { + "epoch": 0.4591878255208333, + "grad_norm": 19.814329147338867, + "learning_rate": 9.68083882503971e-06, + "loss": 5.4698, + "step": 22570 + }, + { + "epoch": 0.45928955078125, + "grad_norm": 15.796643257141113, + "learning_rate": 9.680698302049127e-06, + "loss": 5.1827, + "step": 22575 + }, + { + "epoch": 0.4593912760416667, + "grad_norm": 15.457585334777832, + "learning_rate": 9.680557749150368e-06, + "loss": 5.3812, + "step": 22580 + }, + { + "epoch": 0.4594930013020833, + "grad_norm": 16.4129695892334, + "learning_rate": 9.680417166344331e-06, + "loss": 5.3012, + "step": 22585 + }, + { + "epoch": 0.4595947265625, + "grad_norm": 23.55593490600586, + "learning_rate": 9.680276553631914e-06, + "loss": 5.2897, + "step": 22590 + }, + { + "epoch": 0.4596964518229167, + "grad_norm": 19.20635223388672, + "learning_rate": 9.680135911014014e-06, + "loss": 5.2225, + "step": 22595 + }, + { + "epoch": 0.4597981770833333, + "grad_norm": 30.911865234375, + "learning_rate": 9.679995238491534e-06, + "loss": 5.1943, + "step": 22600 + }, + { + "epoch": 0.45989990234375, + "grad_norm": 15.828886985778809, + "learning_rate": 9.679854536065368e-06, + "loss": 5.3487, + "step": 22605 + }, + { + "epoch": 0.4600016276041667, + "grad_norm": 22.397512435913086, + "learning_rate": 9.67971380373642e-06, + "loss": 4.9737, + "step": 22610 + }, + { + "epoch": 0.4601033528645833, + "grad_norm": 17.98531150817871, + "learning_rate": 9.679573041505583e-06, + "loss": 5.1526, + "step": 22615 + }, + { + "epoch": 0.460205078125, + "grad_norm": 14.322988510131836, + "learning_rate": 9.679432249373762e-06, + "loss": 5.623, + "step": 22620 + }, + { + "epoch": 0.4603068033854167, + "grad_norm": 12.988487243652344, + "learning_rate": 9.679291427341853e-06, + "loss": 5.2611, + "step": 22625 + }, + { + "epoch": 0.4604085286458333, + "grad_norm": 25.565296173095703, + "learning_rate": 9.679150575410755e-06, + "loss": 5.3118, + "step": 22630 + }, + { + "epoch": 0.46051025390625, + "grad_norm": 17.231714248657227, + "learning_rate": 9.679009693581373e-06, + "loss": 5.2867, + "step": 22635 + }, + { + "epoch": 0.4606119791666667, + "grad_norm": 14.774677276611328, + "learning_rate": 9.678868781854605e-06, + "loss": 5.2553, + "step": 22640 + }, + { + "epoch": 0.4607137044270833, + "grad_norm": 14.39670467376709, + "learning_rate": 9.67872784023135e-06, + "loss": 5.1355, + "step": 22645 + }, + { + "epoch": 0.4608154296875, + "grad_norm": 18.011241912841797, + "learning_rate": 9.678586868712509e-06, + "loss": 5.4693, + "step": 22650 + }, + { + "epoch": 0.4609171549479167, + "grad_norm": 15.473970413208008, + "learning_rate": 9.678445867298983e-06, + "loss": 5.1629, + "step": 22655 + }, + { + "epoch": 0.4610188802083333, + "grad_norm": 18.947349548339844, + "learning_rate": 9.678304835991674e-06, + "loss": 4.9715, + "step": 22660 + }, + { + "epoch": 0.46112060546875, + "grad_norm": 17.600122451782227, + "learning_rate": 9.678163774791481e-06, + "loss": 5.1512, + "step": 22665 + }, + { + "epoch": 0.4612223307291667, + "grad_norm": 14.72983455657959, + "learning_rate": 9.678022683699308e-06, + "loss": 5.2814, + "step": 22670 + }, + { + "epoch": 0.4613240559895833, + "grad_norm": 16.047550201416016, + "learning_rate": 9.677881562716054e-06, + "loss": 5.2128, + "step": 22675 + }, + { + "epoch": 0.46142578125, + "grad_norm": 15.155278205871582, + "learning_rate": 9.677740411842621e-06, + "loss": 5.2015, + "step": 22680 + }, + { + "epoch": 0.4615275065104167, + "grad_norm": 18.591949462890625, + "learning_rate": 9.677599231079913e-06, + "loss": 5.2301, + "step": 22685 + }, + { + "epoch": 0.4616292317708333, + "grad_norm": 17.199787139892578, + "learning_rate": 9.67745802042883e-06, + "loss": 5.2166, + "step": 22690 + }, + { + "epoch": 0.46173095703125, + "grad_norm": 13.313461303710938, + "learning_rate": 9.677316779890275e-06, + "loss": 5.144, + "step": 22695 + }, + { + "epoch": 0.4618326822916667, + "grad_norm": 18.018104553222656, + "learning_rate": 9.67717550946515e-06, + "loss": 4.9317, + "step": 22700 + }, + { + "epoch": 0.4619344075520833, + "grad_norm": 15.242374420166016, + "learning_rate": 9.677034209154361e-06, + "loss": 5.301, + "step": 22705 + }, + { + "epoch": 0.4620361328125, + "grad_norm": 23.101085662841797, + "learning_rate": 9.676892878958805e-06, + "loss": 5.2429, + "step": 22710 + }, + { + "epoch": 0.4621378580729167, + "grad_norm": 12.484789848327637, + "learning_rate": 9.67675151887939e-06, + "loss": 5.0295, + "step": 22715 + }, + { + "epoch": 0.4622395833333333, + "grad_norm": 16.426054000854492, + "learning_rate": 9.676610128917017e-06, + "loss": 5.3969, + "step": 22720 + }, + { + "epoch": 0.46234130859375, + "grad_norm": 15.32658863067627, + "learning_rate": 9.67646870907259e-06, + "loss": 5.2663, + "step": 22725 + }, + { + "epoch": 0.4624430338541667, + "grad_norm": 24.96002960205078, + "learning_rate": 9.67632725934701e-06, + "loss": 5.1304, + "step": 22730 + }, + { + "epoch": 0.4625447591145833, + "grad_norm": 12.9132080078125, + "learning_rate": 9.676185779741187e-06, + "loss": 5.1665, + "step": 22735 + }, + { + "epoch": 0.462646484375, + "grad_norm": 19.289196014404297, + "learning_rate": 9.67604427025602e-06, + "loss": 4.9449, + "step": 22740 + }, + { + "epoch": 0.4627482096354167, + "grad_norm": 17.553539276123047, + "learning_rate": 9.675902730892415e-06, + "loss": 5.2992, + "step": 22745 + }, + { + "epoch": 0.4628499348958333, + "grad_norm": 17.708263397216797, + "learning_rate": 9.675761161651274e-06, + "loss": 5.269, + "step": 22750 + }, + { + "epoch": 0.46295166015625, + "grad_norm": 20.43770980834961, + "learning_rate": 9.675619562533504e-06, + "loss": 4.9924, + "step": 22755 + }, + { + "epoch": 0.4630533854166667, + "grad_norm": 13.816055297851562, + "learning_rate": 9.67547793354001e-06, + "loss": 5.1946, + "step": 22760 + }, + { + "epoch": 0.4631551106770833, + "grad_norm": 19.85179901123047, + "learning_rate": 9.675336274671696e-06, + "loss": 4.8906, + "step": 22765 + }, + { + "epoch": 0.4632568359375, + "grad_norm": 13.08003044128418, + "learning_rate": 9.675194585929468e-06, + "loss": 5.4495, + "step": 22770 + }, + { + "epoch": 0.4633585611979167, + "grad_norm": 20.939605712890625, + "learning_rate": 9.67505286731423e-06, + "loss": 4.9624, + "step": 22775 + }, + { + "epoch": 0.4634602864583333, + "grad_norm": 17.8450870513916, + "learning_rate": 9.674911118826889e-06, + "loss": 5.0974, + "step": 22780 + }, + { + "epoch": 0.46356201171875, + "grad_norm": 12.322661399841309, + "learning_rate": 9.674769340468349e-06, + "loss": 5.0347, + "step": 22785 + }, + { + "epoch": 0.4636637369791667, + "grad_norm": 19.0739688873291, + "learning_rate": 9.674627532239516e-06, + "loss": 5.1967, + "step": 22790 + }, + { + "epoch": 0.4637654622395833, + "grad_norm": 15.21291446685791, + "learning_rate": 9.674485694141299e-06, + "loss": 5.2951, + "step": 22795 + }, + { + "epoch": 0.4638671875, + "grad_norm": 17.884258270263672, + "learning_rate": 9.6743438261746e-06, + "loss": 5.119, + "step": 22800 + }, + { + "epoch": 0.4639689127604167, + "grad_norm": 15.612198829650879, + "learning_rate": 9.67420192834033e-06, + "loss": 5.0312, + "step": 22805 + }, + { + "epoch": 0.4640706380208333, + "grad_norm": 12.971384048461914, + "learning_rate": 9.674060000639393e-06, + "loss": 5.0661, + "step": 22810 + }, + { + "epoch": 0.46417236328125, + "grad_norm": 18.636869430541992, + "learning_rate": 9.673918043072696e-06, + "loss": 5.0744, + "step": 22815 + }, + { + "epoch": 0.4642740885416667, + "grad_norm": 17.498672485351562, + "learning_rate": 9.673776055641147e-06, + "loss": 5.1557, + "step": 22820 + }, + { + "epoch": 0.4643758138020833, + "grad_norm": 21.132993698120117, + "learning_rate": 9.673634038345652e-06, + "loss": 5.1898, + "step": 22825 + }, + { + "epoch": 0.4644775390625, + "grad_norm": 17.147056579589844, + "learning_rate": 9.673491991187118e-06, + "loss": 5.1635, + "step": 22830 + }, + { + "epoch": 0.4645792643229167, + "grad_norm": 15.530454635620117, + "learning_rate": 9.673349914166456e-06, + "loss": 5.0586, + "step": 22835 + }, + { + "epoch": 0.4646809895833333, + "grad_norm": 18.82200813293457, + "learning_rate": 9.673207807284571e-06, + "loss": 5.3635, + "step": 22840 + }, + { + "epoch": 0.46478271484375, + "grad_norm": 14.111454963684082, + "learning_rate": 9.67306567054237e-06, + "loss": 5.2006, + "step": 22845 + }, + { + "epoch": 0.4648844401041667, + "grad_norm": 15.013161659240723, + "learning_rate": 9.672923503940763e-06, + "loss": 5.0981, + "step": 22850 + }, + { + "epoch": 0.4649861653645833, + "grad_norm": 13.563011169433594, + "learning_rate": 9.672781307480659e-06, + "loss": 4.9925, + "step": 22855 + }, + { + "epoch": 0.465087890625, + "grad_norm": 18.37213897705078, + "learning_rate": 9.672639081162966e-06, + "loss": 5.4626, + "step": 22860 + }, + { + "epoch": 0.4651896158854167, + "grad_norm": 19.009939193725586, + "learning_rate": 9.672496824988593e-06, + "loss": 5.0471, + "step": 22865 + }, + { + "epoch": 0.4652913411458333, + "grad_norm": 14.965631484985352, + "learning_rate": 9.672354538958448e-06, + "loss": 5.121, + "step": 22870 + }, + { + "epoch": 0.46539306640625, + "grad_norm": 22.924060821533203, + "learning_rate": 9.67221222307344e-06, + "loss": 5.1271, + "step": 22875 + }, + { + "epoch": 0.4654947916666667, + "grad_norm": 17.57509422302246, + "learning_rate": 9.67206987733448e-06, + "loss": 5.4658, + "step": 22880 + }, + { + "epoch": 0.4655965169270833, + "grad_norm": 16.949338912963867, + "learning_rate": 9.671927501742477e-06, + "loss": 5.5797, + "step": 22885 + }, + { + "epoch": 0.4656982421875, + "grad_norm": 13.577919006347656, + "learning_rate": 9.671785096298339e-06, + "loss": 4.954, + "step": 22890 + }, + { + "epoch": 0.4657999674479167, + "grad_norm": 26.336809158325195, + "learning_rate": 9.671642661002978e-06, + "loss": 5.233, + "step": 22895 + }, + { + "epoch": 0.4659016927083333, + "grad_norm": 20.75061798095703, + "learning_rate": 9.671500195857305e-06, + "loss": 5.2541, + "step": 22900 + }, + { + "epoch": 0.46600341796875, + "grad_norm": 15.505037307739258, + "learning_rate": 9.671357700862224e-06, + "loss": 5.5945, + "step": 22905 + }, + { + "epoch": 0.4661051432291667, + "grad_norm": 19.489534378051758, + "learning_rate": 9.671215176018655e-06, + "loss": 5.2558, + "step": 22910 + }, + { + "epoch": 0.4662068684895833, + "grad_norm": 13.85179615020752, + "learning_rate": 9.671072621327501e-06, + "loss": 5.1263, + "step": 22915 + }, + { + "epoch": 0.46630859375, + "grad_norm": 17.5207576751709, + "learning_rate": 9.670930036789678e-06, + "loss": 5.3666, + "step": 22920 + }, + { + "epoch": 0.4664103190104167, + "grad_norm": 20.504728317260742, + "learning_rate": 9.670787422406093e-06, + "loss": 5.2719, + "step": 22925 + }, + { + "epoch": 0.4665120442708333, + "grad_norm": 17.40818214416504, + "learning_rate": 9.67064477817766e-06, + "loss": 5.209, + "step": 22930 + }, + { + "epoch": 0.46661376953125, + "grad_norm": 18.694564819335938, + "learning_rate": 9.670502104105288e-06, + "loss": 5.4129, + "step": 22935 + }, + { + "epoch": 0.4667154947916667, + "grad_norm": 13.52584457397461, + "learning_rate": 9.670359400189892e-06, + "loss": 5.181, + "step": 22940 + }, + { + "epoch": 0.4668172200520833, + "grad_norm": 15.888542175292969, + "learning_rate": 9.670216666432382e-06, + "loss": 5.1254, + "step": 22945 + }, + { + "epoch": 0.4669189453125, + "grad_norm": 17.04798698425293, + "learning_rate": 9.670073902833669e-06, + "loss": 5.2113, + "step": 22950 + }, + { + "epoch": 0.4670206705729167, + "grad_norm": 16.82530403137207, + "learning_rate": 9.669931109394666e-06, + "loss": 5.3727, + "step": 22955 + }, + { + "epoch": 0.4671223958333333, + "grad_norm": 13.399603843688965, + "learning_rate": 9.669788286116287e-06, + "loss": 5.0812, + "step": 22960 + }, + { + "epoch": 0.46722412109375, + "grad_norm": 16.566192626953125, + "learning_rate": 9.669645432999442e-06, + "loss": 5.0508, + "step": 22965 + }, + { + "epoch": 0.4673258463541667, + "grad_norm": 17.06537628173828, + "learning_rate": 9.669502550045046e-06, + "loss": 5.3384, + "step": 22970 + }, + { + "epoch": 0.4674275716145833, + "grad_norm": 19.798152923583984, + "learning_rate": 9.66935963725401e-06, + "loss": 5.1047, + "step": 22975 + }, + { + "epoch": 0.467529296875, + "grad_norm": 17.93370246887207, + "learning_rate": 9.669216694627249e-06, + "loss": 5.1403, + "step": 22980 + }, + { + "epoch": 0.4676310221354167, + "grad_norm": 17.5959529876709, + "learning_rate": 9.669073722165674e-06, + "loss": 4.9645, + "step": 22985 + }, + { + "epoch": 0.4677327473958333, + "grad_norm": 19.365306854248047, + "learning_rate": 9.668930719870202e-06, + "loss": 4.9788, + "step": 22990 + }, + { + "epoch": 0.46783447265625, + "grad_norm": 15.578295707702637, + "learning_rate": 9.668787687741745e-06, + "loss": 5.0806, + "step": 22995 + }, + { + "epoch": 0.4679361979166667, + "grad_norm": 26.573223114013672, + "learning_rate": 9.668644625781216e-06, + "loss": 5.2522, + "step": 23000 + }, + { + "epoch": 0.4680379231770833, + "grad_norm": 16.325223922729492, + "learning_rate": 9.66850153398953e-06, + "loss": 5.2399, + "step": 23005 + }, + { + "epoch": 0.4681396484375, + "grad_norm": 17.082714080810547, + "learning_rate": 9.6683584123676e-06, + "loss": 5.1072, + "step": 23010 + }, + { + "epoch": 0.4682413736979167, + "grad_norm": 22.338279724121094, + "learning_rate": 9.668215260916342e-06, + "loss": 5.4078, + "step": 23015 + }, + { + "epoch": 0.4683430989583333, + "grad_norm": 20.07613182067871, + "learning_rate": 9.668072079636672e-06, + "loss": 5.4491, + "step": 23020 + }, + { + "epoch": 0.46844482421875, + "grad_norm": 17.664724349975586, + "learning_rate": 9.667928868529501e-06, + "loss": 5.0401, + "step": 23025 + }, + { + "epoch": 0.4685465494791667, + "grad_norm": 18.09357261657715, + "learning_rate": 9.667785627595748e-06, + "loss": 5.0668, + "step": 23030 + }, + { + "epoch": 0.4686482747395833, + "grad_norm": 20.197065353393555, + "learning_rate": 9.667642356836326e-06, + "loss": 5.3892, + "step": 23035 + }, + { + "epoch": 0.46875, + "grad_norm": 15.378982543945312, + "learning_rate": 9.667499056252152e-06, + "loss": 5.5253, + "step": 23040 + }, + { + "epoch": 0.4688517252604167, + "grad_norm": 12.990601539611816, + "learning_rate": 9.66735572584414e-06, + "loss": 5.1004, + "step": 23045 + }, + { + "epoch": 0.4689534505208333, + "grad_norm": 18.83979606628418, + "learning_rate": 9.667212365613206e-06, + "loss": 4.9862, + "step": 23050 + }, + { + "epoch": 0.46905517578125, + "grad_norm": 25.094636917114258, + "learning_rate": 9.667068975560268e-06, + "loss": 5.5162, + "step": 23055 + }, + { + "epoch": 0.4691569010416667, + "grad_norm": 13.647418022155762, + "learning_rate": 9.66692555568624e-06, + "loss": 5.4568, + "step": 23060 + }, + { + "epoch": 0.4692586263020833, + "grad_norm": 16.514301300048828, + "learning_rate": 9.66678210599204e-06, + "loss": 5.1855, + "step": 23065 + }, + { + "epoch": 0.4693603515625, + "grad_norm": 20.042844772338867, + "learning_rate": 9.66663862647858e-06, + "loss": 5.142, + "step": 23070 + }, + { + "epoch": 0.4694620768229167, + "grad_norm": 18.120365142822266, + "learning_rate": 9.666495117146784e-06, + "loss": 5.2877, + "step": 23075 + }, + { + "epoch": 0.4695638020833333, + "grad_norm": 16.43113136291504, + "learning_rate": 9.666351577997565e-06, + "loss": 5.314, + "step": 23080 + }, + { + "epoch": 0.46966552734375, + "grad_norm": 20.54209327697754, + "learning_rate": 9.66620800903184e-06, + "loss": 5.2342, + "step": 23085 + }, + { + "epoch": 0.4697672526041667, + "grad_norm": 18.934005737304688, + "learning_rate": 9.666064410250529e-06, + "loss": 5.1766, + "step": 23090 + }, + { + "epoch": 0.4698689778645833, + "grad_norm": 20.999082565307617, + "learning_rate": 9.665920781654545e-06, + "loss": 5.2208, + "step": 23095 + }, + { + "epoch": 0.469970703125, + "grad_norm": 16.246397018432617, + "learning_rate": 9.665777123244809e-06, + "loss": 5.174, + "step": 23100 + }, + { + "epoch": 0.4700724283854167, + "grad_norm": 14.450432777404785, + "learning_rate": 9.665633435022239e-06, + "loss": 5.2951, + "step": 23105 + }, + { + "epoch": 0.4701741536458333, + "grad_norm": 16.000282287597656, + "learning_rate": 9.665489716987753e-06, + "loss": 5.1482, + "step": 23110 + }, + { + "epoch": 0.47027587890625, + "grad_norm": 18.957231521606445, + "learning_rate": 9.665345969142267e-06, + "loss": 5.1768, + "step": 23115 + }, + { + "epoch": 0.4703776041666667, + "grad_norm": 16.938247680664062, + "learning_rate": 9.665202191486703e-06, + "loss": 4.9409, + "step": 23120 + }, + { + "epoch": 0.4704793294270833, + "grad_norm": 17.736976623535156, + "learning_rate": 9.665058384021977e-06, + "loss": 4.9755, + "step": 23125 + }, + { + "epoch": 0.4705810546875, + "grad_norm": 16.51349449157715, + "learning_rate": 9.66491454674901e-06, + "loss": 5.0977, + "step": 23130 + }, + { + "epoch": 0.4706827799479167, + "grad_norm": 14.123788833618164, + "learning_rate": 9.664770679668718e-06, + "loss": 5.1264, + "step": 23135 + }, + { + "epoch": 0.4707845052083333, + "grad_norm": 14.982301712036133, + "learning_rate": 9.664626782782022e-06, + "loss": 4.9734, + "step": 23140 + }, + { + "epoch": 0.47088623046875, + "grad_norm": 12.5905179977417, + "learning_rate": 9.664482856089845e-06, + "loss": 5.063, + "step": 23145 + }, + { + "epoch": 0.4709879557291667, + "grad_norm": 14.24673843383789, + "learning_rate": 9.664338899593099e-06, + "loss": 4.7899, + "step": 23150 + }, + { + "epoch": 0.4710896809895833, + "grad_norm": 16.04170799255371, + "learning_rate": 9.664194913292711e-06, + "loss": 5.1841, + "step": 23155 + }, + { + "epoch": 0.47119140625, + "grad_norm": 17.36473274230957, + "learning_rate": 9.664050897189598e-06, + "loss": 5.3348, + "step": 23160 + }, + { + "epoch": 0.4712931315104167, + "grad_norm": 23.262954711914062, + "learning_rate": 9.663906851284679e-06, + "loss": 5.273, + "step": 23165 + }, + { + "epoch": 0.4713948567708333, + "grad_norm": 16.45766830444336, + "learning_rate": 9.663762775578876e-06, + "loss": 5.1345, + "step": 23170 + }, + { + "epoch": 0.47149658203125, + "grad_norm": 13.720438003540039, + "learning_rate": 9.663618670073109e-06, + "loss": 5.0227, + "step": 23175 + }, + { + "epoch": 0.4715983072916667, + "grad_norm": 17.253971099853516, + "learning_rate": 9.6634745347683e-06, + "loss": 5.1762, + "step": 23180 + }, + { + "epoch": 0.4717000325520833, + "grad_norm": 19.28756332397461, + "learning_rate": 9.66333036966537e-06, + "loss": 5.245, + "step": 23185 + }, + { + "epoch": 0.4718017578125, + "grad_norm": 18.129491806030273, + "learning_rate": 9.663186174765238e-06, + "loss": 5.2158, + "step": 23190 + }, + { + "epoch": 0.4719034830729167, + "grad_norm": 25.84429359436035, + "learning_rate": 9.663041950068826e-06, + "loss": 5.1666, + "step": 23195 + }, + { + "epoch": 0.4720052083333333, + "grad_norm": 18.984344482421875, + "learning_rate": 9.662897695577057e-06, + "loss": 5.4526, + "step": 23200 + }, + { + "epoch": 0.47210693359375, + "grad_norm": 20.004179000854492, + "learning_rate": 9.662753411290852e-06, + "loss": 5.3676, + "step": 23205 + }, + { + "epoch": 0.4722086588541667, + "grad_norm": 13.710082054138184, + "learning_rate": 9.662609097211133e-06, + "loss": 5.1532, + "step": 23210 + }, + { + "epoch": 0.4723103841145833, + "grad_norm": 17.889930725097656, + "learning_rate": 9.662464753338822e-06, + "loss": 5.1292, + "step": 23215 + }, + { + "epoch": 0.472412109375, + "grad_norm": 17.999649047851562, + "learning_rate": 9.662320379674841e-06, + "loss": 5.2387, + "step": 23220 + }, + { + "epoch": 0.4725138346354167, + "grad_norm": 14.673113822937012, + "learning_rate": 9.662175976220112e-06, + "loss": 5.4829, + "step": 23225 + }, + { + "epoch": 0.4726155598958333, + "grad_norm": 14.738616943359375, + "learning_rate": 9.66203154297556e-06, + "loss": 5.0699, + "step": 23230 + }, + { + "epoch": 0.47271728515625, + "grad_norm": 17.339599609375, + "learning_rate": 9.661887079942105e-06, + "loss": 5.3161, + "step": 23235 + }, + { + "epoch": 0.4728190104166667, + "grad_norm": 19.5250186920166, + "learning_rate": 9.661742587120673e-06, + "loss": 5.0183, + "step": 23240 + }, + { + "epoch": 0.4729207356770833, + "grad_norm": 19.442575454711914, + "learning_rate": 9.661598064512184e-06, + "loss": 5.2425, + "step": 23245 + }, + { + "epoch": 0.4730224609375, + "grad_norm": 16.095935821533203, + "learning_rate": 9.661453512117565e-06, + "loss": 5.123, + "step": 23250 + }, + { + "epoch": 0.4731241861979167, + "grad_norm": 16.49070167541504, + "learning_rate": 9.661308929937736e-06, + "loss": 5.1204, + "step": 23255 + }, + { + "epoch": 0.4732259114583333, + "grad_norm": 19.312511444091797, + "learning_rate": 9.661164317973624e-06, + "loss": 5.2033, + "step": 23260 + }, + { + "epoch": 0.47332763671875, + "grad_norm": 14.848114013671875, + "learning_rate": 9.66101967622615e-06, + "loss": 5.1615, + "step": 23265 + }, + { + "epoch": 0.4734293619791667, + "grad_norm": 16.19026756286621, + "learning_rate": 9.660875004696241e-06, + "loss": 5.4883, + "step": 23270 + }, + { + "epoch": 0.4735310872395833, + "grad_norm": 21.06908416748047, + "learning_rate": 9.66073030338482e-06, + "loss": 5.2103, + "step": 23275 + }, + { + "epoch": 0.4736328125, + "grad_norm": 22.28191566467285, + "learning_rate": 9.660585572292813e-06, + "loss": 4.9787, + "step": 23280 + }, + { + "epoch": 0.4737345377604167, + "grad_norm": 21.264921188354492, + "learning_rate": 9.660440811421142e-06, + "loss": 5.1044, + "step": 23285 + }, + { + "epoch": 0.4738362630208333, + "grad_norm": 19.215425491333008, + "learning_rate": 9.660296020770734e-06, + "loss": 5.166, + "step": 23290 + }, + { + "epoch": 0.47393798828125, + "grad_norm": 17.233705520629883, + "learning_rate": 9.660151200342515e-06, + "loss": 5.0681, + "step": 23295 + }, + { + "epoch": 0.4740397135416667, + "grad_norm": 18.6588134765625, + "learning_rate": 9.660006350137407e-06, + "loss": 5.1703, + "step": 23300 + }, + { + "epoch": 0.4741414388020833, + "grad_norm": 14.878541946411133, + "learning_rate": 9.65986147015634e-06, + "loss": 5.1112, + "step": 23305 + }, + { + "epoch": 0.4742431640625, + "grad_norm": 13.729074478149414, + "learning_rate": 9.659716560400236e-06, + "loss": 5.0848, + "step": 23310 + }, + { + "epoch": 0.4743448893229167, + "grad_norm": 17.41480827331543, + "learning_rate": 9.659571620870023e-06, + "loss": 5.4868, + "step": 23315 + }, + { + "epoch": 0.4744466145833333, + "grad_norm": 27.078109741210938, + "learning_rate": 9.659426651566627e-06, + "loss": 5.1263, + "step": 23320 + }, + { + "epoch": 0.47454833984375, + "grad_norm": 19.056581497192383, + "learning_rate": 9.659281652490972e-06, + "loss": 5.4849, + "step": 23325 + }, + { + "epoch": 0.4746500651041667, + "grad_norm": 15.191753387451172, + "learning_rate": 9.659136623643988e-06, + "loss": 5.1984, + "step": 23330 + }, + { + "epoch": 0.4747517903645833, + "grad_norm": 18.774980545043945, + "learning_rate": 9.6589915650266e-06, + "loss": 4.8876, + "step": 23335 + }, + { + "epoch": 0.474853515625, + "grad_norm": 12.04016399383545, + "learning_rate": 9.658846476639735e-06, + "loss": 5.1182, + "step": 23340 + }, + { + "epoch": 0.4749552408854167, + "grad_norm": 19.709062576293945, + "learning_rate": 9.658701358484319e-06, + "loss": 5.1697, + "step": 23345 + }, + { + "epoch": 0.4750569661458333, + "grad_norm": 20.122098922729492, + "learning_rate": 9.65855621056128e-06, + "loss": 5.2441, + "step": 23350 + }, + { + "epoch": 0.47515869140625, + "grad_norm": 17.648202896118164, + "learning_rate": 9.658411032871547e-06, + "loss": 5.1449, + "step": 23355 + }, + { + "epoch": 0.4752604166666667, + "grad_norm": 13.2044038772583, + "learning_rate": 9.658265825416044e-06, + "loss": 5.1488, + "step": 23360 + }, + { + "epoch": 0.4753621419270833, + "grad_norm": 12.896883010864258, + "learning_rate": 9.658120588195702e-06, + "loss": 5.0533, + "step": 23365 + }, + { + "epoch": 0.4754638671875, + "grad_norm": 17.932159423828125, + "learning_rate": 9.65797532121145e-06, + "loss": 5.0198, + "step": 23370 + }, + { + "epoch": 0.4755655924479167, + "grad_norm": 18.843496322631836, + "learning_rate": 9.657830024464212e-06, + "loss": 5.2271, + "step": 23375 + }, + { + "epoch": 0.4756673177083333, + "grad_norm": 18.608781814575195, + "learning_rate": 9.657684697954918e-06, + "loss": 5.1094, + "step": 23380 + }, + { + "epoch": 0.47576904296875, + "grad_norm": 16.181903839111328, + "learning_rate": 9.6575393416845e-06, + "loss": 5.2761, + "step": 23385 + }, + { + "epoch": 0.4758707682291667, + "grad_norm": 20.110198974609375, + "learning_rate": 9.657393955653883e-06, + "loss": 4.9957, + "step": 23390 + }, + { + "epoch": 0.4759724934895833, + "grad_norm": 16.19821548461914, + "learning_rate": 9.657248539863997e-06, + "loss": 5.2054, + "step": 23395 + }, + { + "epoch": 0.47607421875, + "grad_norm": 19.540874481201172, + "learning_rate": 9.657103094315771e-06, + "loss": 5.2047, + "step": 23400 + }, + { + "epoch": 0.4761759440104167, + "grad_norm": 19.32128143310547, + "learning_rate": 9.656957619010136e-06, + "loss": 5.3011, + "step": 23405 + }, + { + "epoch": 0.4762776692708333, + "grad_norm": 22.983699798583984, + "learning_rate": 9.65681211394802e-06, + "loss": 5.1428, + "step": 23410 + }, + { + "epoch": 0.47637939453125, + "grad_norm": 16.99256706237793, + "learning_rate": 9.656666579130352e-06, + "loss": 5.1497, + "step": 23415 + }, + { + "epoch": 0.4764811197916667, + "grad_norm": 15.941673278808594, + "learning_rate": 9.656521014558063e-06, + "loss": 5.2215, + "step": 23420 + }, + { + "epoch": 0.4765828450520833, + "grad_norm": 17.67299461364746, + "learning_rate": 9.656375420232081e-06, + "loss": 5.0076, + "step": 23425 + }, + { + "epoch": 0.4766845703125, + "grad_norm": 19.454198837280273, + "learning_rate": 9.65622979615334e-06, + "loss": 5.0222, + "step": 23430 + }, + { + "epoch": 0.4767862955729167, + "grad_norm": 17.405147552490234, + "learning_rate": 9.656084142322769e-06, + "loss": 5.114, + "step": 23435 + }, + { + "epoch": 0.4768880208333333, + "grad_norm": 12.603543281555176, + "learning_rate": 9.655938458741297e-06, + "loss": 5.1004, + "step": 23440 + }, + { + "epoch": 0.47698974609375, + "grad_norm": 20.05758285522461, + "learning_rate": 9.655792745409857e-06, + "loss": 5.0355, + "step": 23445 + }, + { + "epoch": 0.4770914713541667, + "grad_norm": 15.148639678955078, + "learning_rate": 9.655647002329381e-06, + "loss": 5.0766, + "step": 23450 + }, + { + "epoch": 0.4771931966145833, + "grad_norm": 16.926725387573242, + "learning_rate": 9.655501229500796e-06, + "loss": 5.5893, + "step": 23455 + }, + { + "epoch": 0.477294921875, + "grad_norm": 16.38241195678711, + "learning_rate": 9.655355426925037e-06, + "loss": 5.376, + "step": 23460 + }, + { + "epoch": 0.4773966471354167, + "grad_norm": 17.593721389770508, + "learning_rate": 9.655209594603034e-06, + "loss": 5.2689, + "step": 23465 + }, + { + "epoch": 0.4774983723958333, + "grad_norm": 15.440144538879395, + "learning_rate": 9.655063732535721e-06, + "loss": 5.11, + "step": 23470 + }, + { + "epoch": 0.47760009765625, + "grad_norm": 18.26227569580078, + "learning_rate": 9.654917840724027e-06, + "loss": 5.2906, + "step": 23475 + }, + { + "epoch": 0.4777018229166667, + "grad_norm": 13.798332214355469, + "learning_rate": 9.654771919168885e-06, + "loss": 5.0865, + "step": 23480 + }, + { + "epoch": 0.4778035481770833, + "grad_norm": 20.201810836791992, + "learning_rate": 9.654625967871232e-06, + "loss": 4.9707, + "step": 23485 + }, + { + "epoch": 0.4779052734375, + "grad_norm": 16.710491180419922, + "learning_rate": 9.654479986831993e-06, + "loss": 5.2546, + "step": 23490 + }, + { + "epoch": 0.4780069986979167, + "grad_norm": 15.249740600585938, + "learning_rate": 9.654333976052107e-06, + "loss": 5.3666, + "step": 23495 + }, + { + "epoch": 0.4781087239583333, + "grad_norm": 14.789714813232422, + "learning_rate": 9.654187935532502e-06, + "loss": 4.8863, + "step": 23500 + }, + { + "epoch": 0.47821044921875, + "grad_norm": 18.94029426574707, + "learning_rate": 9.654041865274113e-06, + "loss": 5.2437, + "step": 23505 + }, + { + "epoch": 0.4783121744791667, + "grad_norm": 12.55636215209961, + "learning_rate": 9.653895765277875e-06, + "loss": 5.3551, + "step": 23510 + }, + { + "epoch": 0.4784138997395833, + "grad_norm": 17.133071899414062, + "learning_rate": 9.65374963554472e-06, + "loss": 5.1635, + "step": 23515 + }, + { + "epoch": 0.478515625, + "grad_norm": 22.134794235229492, + "learning_rate": 9.653603476075583e-06, + "loss": 5.0411, + "step": 23520 + }, + { + "epoch": 0.4786173502604167, + "grad_norm": 13.719337463378906, + "learning_rate": 9.653457286871396e-06, + "loss": 5.2198, + "step": 23525 + }, + { + "epoch": 0.4787190755208333, + "grad_norm": 21.5228271484375, + "learning_rate": 9.653311067933096e-06, + "loss": 4.983, + "step": 23530 + }, + { + "epoch": 0.47882080078125, + "grad_norm": 19.22565269470215, + "learning_rate": 9.653164819261614e-06, + "loss": 5.2518, + "step": 23535 + }, + { + "epoch": 0.4789225260416667, + "grad_norm": 17.226545333862305, + "learning_rate": 9.653018540857886e-06, + "loss": 5.1703, + "step": 23540 + }, + { + "epoch": 0.4790242513020833, + "grad_norm": 14.646623611450195, + "learning_rate": 9.652872232722848e-06, + "loss": 5.2347, + "step": 23545 + }, + { + "epoch": 0.4791259765625, + "grad_norm": 19.382099151611328, + "learning_rate": 9.65272589485743e-06, + "loss": 5.1611, + "step": 23550 + }, + { + "epoch": 0.4792277018229167, + "grad_norm": 16.84292221069336, + "learning_rate": 9.652579527262574e-06, + "loss": 4.9642, + "step": 23555 + }, + { + "epoch": 0.4793294270833333, + "grad_norm": 25.26438331604004, + "learning_rate": 9.65243312993921e-06, + "loss": 5.1822, + "step": 23560 + }, + { + "epoch": 0.47943115234375, + "grad_norm": 22.344514846801758, + "learning_rate": 9.652286702888276e-06, + "loss": 5.2236, + "step": 23565 + }, + { + "epoch": 0.4795328776041667, + "grad_norm": 14.92993450164795, + "learning_rate": 9.652140246110706e-06, + "loss": 5.3217, + "step": 23570 + }, + { + "epoch": 0.4796346028645833, + "grad_norm": 28.075387954711914, + "learning_rate": 9.651993759607439e-06, + "loss": 5.3637, + "step": 23575 + }, + { + "epoch": 0.479736328125, + "grad_norm": 20.116361618041992, + "learning_rate": 9.651847243379406e-06, + "loss": 4.98, + "step": 23580 + }, + { + "epoch": 0.4798380533854167, + "grad_norm": 16.90420150756836, + "learning_rate": 9.651700697427546e-06, + "loss": 5.1594, + "step": 23585 + }, + { + "epoch": 0.4799397786458333, + "grad_norm": 14.213973045349121, + "learning_rate": 9.651554121752795e-06, + "loss": 5.0793, + "step": 23590 + }, + { + "epoch": 0.48004150390625, + "grad_norm": 15.17658805847168, + "learning_rate": 9.651407516356092e-06, + "loss": 5.0225, + "step": 23595 + }, + { + "epoch": 0.4801432291666667, + "grad_norm": 17.514732360839844, + "learning_rate": 9.65126088123837e-06, + "loss": 5.0692, + "step": 23600 + }, + { + "epoch": 0.4802449544270833, + "grad_norm": 13.295560836791992, + "learning_rate": 9.651114216400567e-06, + "loss": 5.3858, + "step": 23605 + }, + { + "epoch": 0.4803466796875, + "grad_norm": 12.043135643005371, + "learning_rate": 9.650967521843622e-06, + "loss": 5.0799, + "step": 23610 + }, + { + "epoch": 0.4804484049479167, + "grad_norm": 20.532268524169922, + "learning_rate": 9.650820797568468e-06, + "loss": 5.4951, + "step": 23615 + }, + { + "epoch": 0.4805501302083333, + "grad_norm": 18.19882583618164, + "learning_rate": 9.65067404357605e-06, + "loss": 5.1851, + "step": 23620 + }, + { + "epoch": 0.48065185546875, + "grad_norm": 14.85983943939209, + "learning_rate": 9.650527259867297e-06, + "loss": 5.3215, + "step": 23625 + }, + { + "epoch": 0.4807535807291667, + "grad_norm": 17.778034210205078, + "learning_rate": 9.650380446443152e-06, + "loss": 5.0818, + "step": 23630 + }, + { + "epoch": 0.4808553059895833, + "grad_norm": 18.663219451904297, + "learning_rate": 9.650233603304554e-06, + "loss": 5.1621, + "step": 23635 + }, + { + "epoch": 0.48095703125, + "grad_norm": 19.70875358581543, + "learning_rate": 9.650086730452438e-06, + "loss": 4.8685, + "step": 23640 + }, + { + "epoch": 0.4810587565104167, + "grad_norm": 15.825725555419922, + "learning_rate": 9.649939827887745e-06, + "loss": 5.3107, + "step": 23645 + }, + { + "epoch": 0.4811604817708333, + "grad_norm": 17.895055770874023, + "learning_rate": 9.649792895611412e-06, + "loss": 5.4253, + "step": 23650 + }, + { + "epoch": 0.48126220703125, + "grad_norm": 21.928550720214844, + "learning_rate": 9.649645933624378e-06, + "loss": 5.1916, + "step": 23655 + }, + { + "epoch": 0.4813639322916667, + "grad_norm": 17.009666442871094, + "learning_rate": 9.649498941927582e-06, + "loss": 5.1924, + "step": 23660 + }, + { + "epoch": 0.4814656575520833, + "grad_norm": 15.705376625061035, + "learning_rate": 9.649351920521965e-06, + "loss": 4.8925, + "step": 23665 + }, + { + "epoch": 0.4815673828125, + "grad_norm": 17.457406997680664, + "learning_rate": 9.649204869408464e-06, + "loss": 5.3971, + "step": 23670 + }, + { + "epoch": 0.4816691080729167, + "grad_norm": 20.118921279907227, + "learning_rate": 9.64905778858802e-06, + "loss": 5.2245, + "step": 23675 + }, + { + "epoch": 0.4817708333333333, + "grad_norm": 15.77507209777832, + "learning_rate": 9.648910678061574e-06, + "loss": 5.1113, + "step": 23680 + }, + { + "epoch": 0.48187255859375, + "grad_norm": 16.210458755493164, + "learning_rate": 9.648763537830064e-06, + "loss": 5.0312, + "step": 23685 + }, + { + "epoch": 0.4819742838541667, + "grad_norm": 19.455272674560547, + "learning_rate": 9.64861636789443e-06, + "loss": 5.2504, + "step": 23690 + }, + { + "epoch": 0.4820760091145833, + "grad_norm": 15.971542358398438, + "learning_rate": 9.648469168255613e-06, + "loss": 5.3035, + "step": 23695 + }, + { + "epoch": 0.482177734375, + "grad_norm": 21.653141021728516, + "learning_rate": 9.648321938914554e-06, + "loss": 5.1827, + "step": 23700 + }, + { + "epoch": 0.4822794596354167, + "grad_norm": 16.574792861938477, + "learning_rate": 9.648174679872194e-06, + "loss": 5.1724, + "step": 23705 + }, + { + "epoch": 0.4823811848958333, + "grad_norm": 15.197013854980469, + "learning_rate": 9.648027391129473e-06, + "loss": 5.3531, + "step": 23710 + }, + { + "epoch": 0.48248291015625, + "grad_norm": 17.272747039794922, + "learning_rate": 9.647880072687331e-06, + "loss": 5.2049, + "step": 23715 + }, + { + "epoch": 0.4825846354166667, + "grad_norm": 16.187938690185547, + "learning_rate": 9.647732724546713e-06, + "loss": 5.1423, + "step": 23720 + }, + { + "epoch": 0.4826863606770833, + "grad_norm": 17.55901527404785, + "learning_rate": 9.647585346708557e-06, + "loss": 5.3885, + "step": 23725 + }, + { + "epoch": 0.4827880859375, + "grad_norm": 13.502035140991211, + "learning_rate": 9.647437939173804e-06, + "loss": 5.0893, + "step": 23730 + }, + { + "epoch": 0.4828898111979167, + "grad_norm": 11.158696174621582, + "learning_rate": 9.6472905019434e-06, + "loss": 5.193, + "step": 23735 + }, + { + "epoch": 0.4829915364583333, + "grad_norm": 20.613996505737305, + "learning_rate": 9.647143035018287e-06, + "loss": 5.0432, + "step": 23740 + }, + { + "epoch": 0.48309326171875, + "grad_norm": 18.298240661621094, + "learning_rate": 9.646995538399402e-06, + "loss": 5.0484, + "step": 23745 + }, + { + "epoch": 0.4831949869791667, + "grad_norm": 17.86116600036621, + "learning_rate": 9.646848012087692e-06, + "loss": 5.3302, + "step": 23750 + }, + { + "epoch": 0.4832967122395833, + "grad_norm": 12.802651405334473, + "learning_rate": 9.646700456084098e-06, + "loss": 5.1864, + "step": 23755 + }, + { + "epoch": 0.4833984375, + "grad_norm": 19.252113342285156, + "learning_rate": 9.646552870389564e-06, + "loss": 5.3062, + "step": 23760 + }, + { + "epoch": 0.4835001627604167, + "grad_norm": 18.454811096191406, + "learning_rate": 9.646405255005032e-06, + "loss": 5.4198, + "step": 23765 + }, + { + "epoch": 0.4836018880208333, + "grad_norm": 23.353757858276367, + "learning_rate": 9.646257609931444e-06, + "loss": 5.2412, + "step": 23770 + }, + { + "epoch": 0.48370361328125, + "grad_norm": 15.863349914550781, + "learning_rate": 9.646109935169745e-06, + "loss": 4.9762, + "step": 23775 + }, + { + "epoch": 0.4838053385416667, + "grad_norm": 21.805742263793945, + "learning_rate": 9.64596223072088e-06, + "loss": 4.8898, + "step": 23780 + }, + { + "epoch": 0.4839070638020833, + "grad_norm": 15.78990364074707, + "learning_rate": 9.64581449658579e-06, + "loss": 5.3185, + "step": 23785 + }, + { + "epoch": 0.4840087890625, + "grad_norm": 15.555179595947266, + "learning_rate": 9.64566673276542e-06, + "loss": 5.1627, + "step": 23790 + }, + { + "epoch": 0.4841105143229167, + "grad_norm": 18.27060317993164, + "learning_rate": 9.645518939260714e-06, + "loss": 5.1515, + "step": 23795 + }, + { + "epoch": 0.4842122395833333, + "grad_norm": 24.444332122802734, + "learning_rate": 9.645371116072618e-06, + "loss": 5.4564, + "step": 23800 + }, + { + "epoch": 0.48431396484375, + "grad_norm": 16.543405532836914, + "learning_rate": 9.645223263202075e-06, + "loss": 5.1884, + "step": 23805 + }, + { + "epoch": 0.4844156901041667, + "grad_norm": 19.73287010192871, + "learning_rate": 9.645075380650029e-06, + "loss": 5.4975, + "step": 23810 + }, + { + "epoch": 0.4845174153645833, + "grad_norm": 16.562170028686523, + "learning_rate": 9.644927468417426e-06, + "loss": 5.0471, + "step": 23815 + }, + { + "epoch": 0.484619140625, + "grad_norm": 13.534943580627441, + "learning_rate": 9.644779526505211e-06, + "loss": 5.2389, + "step": 23820 + }, + { + "epoch": 0.4847208658854167, + "grad_norm": 17.072397232055664, + "learning_rate": 9.64463155491433e-06, + "loss": 5.1493, + "step": 23825 + }, + { + "epoch": 0.4848225911458333, + "grad_norm": 16.668630599975586, + "learning_rate": 9.644483553645728e-06, + "loss": 5.644, + "step": 23830 + }, + { + "epoch": 0.48492431640625, + "grad_norm": 17.138137817382812, + "learning_rate": 9.644335522700348e-06, + "loss": 4.9338, + "step": 23835 + }, + { + "epoch": 0.4850260416666667, + "grad_norm": 16.199466705322266, + "learning_rate": 9.64418746207914e-06, + "loss": 5.1568, + "step": 23840 + }, + { + "epoch": 0.4851277669270833, + "grad_norm": 15.081305503845215, + "learning_rate": 9.64403937178305e-06, + "loss": 5.2335, + "step": 23845 + }, + { + "epoch": 0.4852294921875, + "grad_norm": 18.911279678344727, + "learning_rate": 9.64389125181302e-06, + "loss": 5.0934, + "step": 23850 + }, + { + "epoch": 0.4853312174479167, + "grad_norm": 15.615998268127441, + "learning_rate": 9.64374310217e-06, + "loss": 5.0029, + "step": 23855 + }, + { + "epoch": 0.4854329427083333, + "grad_norm": 17.046295166015625, + "learning_rate": 9.643594922854938e-06, + "loss": 4.9384, + "step": 23860 + }, + { + "epoch": 0.48553466796875, + "grad_norm": 14.756237030029297, + "learning_rate": 9.643446713868776e-06, + "loss": 4.9075, + "step": 23865 + }, + { + "epoch": 0.4856363932291667, + "grad_norm": 14.443024635314941, + "learning_rate": 9.643298475212464e-06, + "loss": 5.1574, + "step": 23870 + }, + { + "epoch": 0.4857381184895833, + "grad_norm": 13.626707077026367, + "learning_rate": 9.643150206886948e-06, + "loss": 5.496, + "step": 23875 + }, + { + "epoch": 0.48583984375, + "grad_norm": 11.852567672729492, + "learning_rate": 9.643001908893176e-06, + "loss": 5.1579, + "step": 23880 + }, + { + "epoch": 0.4859415690104167, + "grad_norm": 17.601634979248047, + "learning_rate": 9.642853581232096e-06, + "loss": 5.2533, + "step": 23885 + }, + { + "epoch": 0.4860432942708333, + "grad_norm": 18.880699157714844, + "learning_rate": 9.642705223904656e-06, + "loss": 4.9888, + "step": 23890 + }, + { + "epoch": 0.48614501953125, + "grad_norm": 20.198986053466797, + "learning_rate": 9.642556836911804e-06, + "loss": 4.9232, + "step": 23895 + }, + { + "epoch": 0.4862467447916667, + "grad_norm": 16.482162475585938, + "learning_rate": 9.642408420254487e-06, + "loss": 5.1679, + "step": 23900 + }, + { + "epoch": 0.4863484700520833, + "grad_norm": 19.52825164794922, + "learning_rate": 9.642259973933653e-06, + "loss": 5.2176, + "step": 23905 + }, + { + "epoch": 0.4864501953125, + "grad_norm": 20.2548828125, + "learning_rate": 9.642111497950253e-06, + "loss": 5.3085, + "step": 23910 + }, + { + "epoch": 0.4865519205729167, + "grad_norm": 16.254642486572266, + "learning_rate": 9.641962992305233e-06, + "loss": 5.3426, + "step": 23915 + }, + { + "epoch": 0.4866536458333333, + "grad_norm": 23.375709533691406, + "learning_rate": 9.641814456999543e-06, + "loss": 5.0658, + "step": 23920 + }, + { + "epoch": 0.48675537109375, + "grad_norm": 15.266608238220215, + "learning_rate": 9.641665892034133e-06, + "loss": 5.0581, + "step": 23925 + }, + { + "epoch": 0.4868570963541667, + "grad_norm": 13.983994483947754, + "learning_rate": 9.64151729740995e-06, + "loss": 5.251, + "step": 23930 + }, + { + "epoch": 0.4869588216145833, + "grad_norm": 15.307515144348145, + "learning_rate": 9.641368673127947e-06, + "loss": 5.1445, + "step": 23935 + }, + { + "epoch": 0.487060546875, + "grad_norm": 13.92756462097168, + "learning_rate": 9.64122001918907e-06, + "loss": 5.0867, + "step": 23940 + }, + { + "epoch": 0.4871622721354167, + "grad_norm": 19.348312377929688, + "learning_rate": 9.64107133559427e-06, + "loss": 5.2363, + "step": 23945 + }, + { + "epoch": 0.4872639973958333, + "grad_norm": 18.93596649169922, + "learning_rate": 9.640922622344498e-06, + "loss": 5.5047, + "step": 23950 + }, + { + "epoch": 0.48736572265625, + "grad_norm": 15.493742942810059, + "learning_rate": 9.640773879440704e-06, + "loss": 5.0083, + "step": 23955 + }, + { + "epoch": 0.4874674479166667, + "grad_norm": 17.972023010253906, + "learning_rate": 9.640625106883839e-06, + "loss": 5.3533, + "step": 23960 + }, + { + "epoch": 0.4875691731770833, + "grad_norm": 15.005330085754395, + "learning_rate": 9.64047630467485e-06, + "loss": 5.1188, + "step": 23965 + }, + { + "epoch": 0.4876708984375, + "grad_norm": 17.046342849731445, + "learning_rate": 9.640327472814693e-06, + "loss": 5.1461, + "step": 23970 + }, + { + "epoch": 0.4877726236979167, + "grad_norm": 17.14545440673828, + "learning_rate": 9.640178611304316e-06, + "loss": 5.4553, + "step": 23975 + }, + { + "epoch": 0.4878743489583333, + "grad_norm": 13.65581226348877, + "learning_rate": 9.640029720144671e-06, + "loss": 5.0599, + "step": 23980 + }, + { + "epoch": 0.48797607421875, + "grad_norm": 14.258811950683594, + "learning_rate": 9.639880799336708e-06, + "loss": 5.2911, + "step": 23985 + }, + { + "epoch": 0.4880777994791667, + "grad_norm": 16.085060119628906, + "learning_rate": 9.63973184888138e-06, + "loss": 4.9788, + "step": 23990 + }, + { + "epoch": 0.4881795247395833, + "grad_norm": 17.189130783081055, + "learning_rate": 9.639582868779638e-06, + "loss": 5.0221, + "step": 23995 + }, + { + "epoch": 0.48828125, + "grad_norm": 23.545970916748047, + "learning_rate": 9.639433859032434e-06, + "loss": 5.1447, + "step": 24000 + }, + { + "epoch": 0.4883829752604167, + "grad_norm": 15.632966995239258, + "learning_rate": 9.63928481964072e-06, + "loss": 5.1361, + "step": 24005 + }, + { + "epoch": 0.4884847005208333, + "grad_norm": 13.186652183532715, + "learning_rate": 9.63913575060545e-06, + "loss": 5.1802, + "step": 24010 + }, + { + "epoch": 0.48858642578125, + "grad_norm": 21.547887802124023, + "learning_rate": 9.638986651927575e-06, + "loss": 5.1263, + "step": 24015 + }, + { + "epoch": 0.4886881510416667, + "grad_norm": 11.981167793273926, + "learning_rate": 9.638837523608047e-06, + "loss": 5.0657, + "step": 24020 + }, + { + "epoch": 0.4887898763020833, + "grad_norm": 17.473125457763672, + "learning_rate": 9.638688365647822e-06, + "loss": 5.2083, + "step": 24025 + }, + { + "epoch": 0.4888916015625, + "grad_norm": 19.43924903869629, + "learning_rate": 9.63853917804785e-06, + "loss": 5.235, + "step": 24030 + }, + { + "epoch": 0.4889933268229167, + "grad_norm": 14.6753511428833, + "learning_rate": 9.638389960809083e-06, + "loss": 5.3798, + "step": 24035 + }, + { + "epoch": 0.4890950520833333, + "grad_norm": 22.272296905517578, + "learning_rate": 9.638240713932478e-06, + "loss": 5.1337, + "step": 24040 + }, + { + "epoch": 0.48919677734375, + "grad_norm": 14.87501049041748, + "learning_rate": 9.638091437418987e-06, + "loss": 5.282, + "step": 24045 + }, + { + "epoch": 0.4892985026041667, + "grad_norm": 19.33081817626953, + "learning_rate": 9.637942131269564e-06, + "loss": 5.1059, + "step": 24050 + }, + { + "epoch": 0.4894002278645833, + "grad_norm": 13.738369941711426, + "learning_rate": 9.637792795485164e-06, + "loss": 5.3313, + "step": 24055 + }, + { + "epoch": 0.489501953125, + "grad_norm": 16.994407653808594, + "learning_rate": 9.63764343006674e-06, + "loss": 5.2171, + "step": 24060 + }, + { + "epoch": 0.4896036783854167, + "grad_norm": 16.833778381347656, + "learning_rate": 9.637494035015245e-06, + "loss": 5.6488, + "step": 24065 + }, + { + "epoch": 0.4897054036458333, + "grad_norm": 27.969743728637695, + "learning_rate": 9.637344610331639e-06, + "loss": 5.0703, + "step": 24070 + }, + { + "epoch": 0.48980712890625, + "grad_norm": 16.213090896606445, + "learning_rate": 9.63719515601687e-06, + "loss": 4.9457, + "step": 24075 + }, + { + "epoch": 0.4899088541666667, + "grad_norm": 16.448204040527344, + "learning_rate": 9.637045672071896e-06, + "loss": 5.1951, + "step": 24080 + }, + { + "epoch": 0.4900105794270833, + "grad_norm": 19.992591857910156, + "learning_rate": 9.636896158497672e-06, + "loss": 5.3428, + "step": 24085 + }, + { + "epoch": 0.4901123046875, + "grad_norm": 19.38615608215332, + "learning_rate": 9.636746615295154e-06, + "loss": 5.1719, + "step": 24090 + }, + { + "epoch": 0.4902140299479167, + "grad_norm": 16.212024688720703, + "learning_rate": 9.636597042465297e-06, + "loss": 4.9946, + "step": 24095 + }, + { + "epoch": 0.4903157552083333, + "grad_norm": 17.47454071044922, + "learning_rate": 9.636447440009057e-06, + "loss": 5.0927, + "step": 24100 + }, + { + "epoch": 0.49041748046875, + "grad_norm": 15.5836763381958, + "learning_rate": 9.63629780792739e-06, + "loss": 5.2035, + "step": 24105 + }, + { + "epoch": 0.4905192057291667, + "grad_norm": 16.621994018554688, + "learning_rate": 9.636148146221252e-06, + "loss": 5.0987, + "step": 24110 + }, + { + "epoch": 0.4906209309895833, + "grad_norm": 15.610703468322754, + "learning_rate": 9.635998454891599e-06, + "loss": 4.9978, + "step": 24115 + }, + { + "epoch": 0.49072265625, + "grad_norm": 15.873514175415039, + "learning_rate": 9.635848733939386e-06, + "loss": 5.2382, + "step": 24120 + }, + { + "epoch": 0.4908243815104167, + "grad_norm": 18.827877044677734, + "learning_rate": 9.635698983365573e-06, + "loss": 5.3, + "step": 24125 + }, + { + "epoch": 0.4909261067708333, + "grad_norm": 15.036874771118164, + "learning_rate": 9.635549203171114e-06, + "loss": 5.1701, + "step": 24130 + }, + { + "epoch": 0.49102783203125, + "grad_norm": 18.621566772460938, + "learning_rate": 9.635399393356968e-06, + "loss": 5.2206, + "step": 24135 + }, + { + "epoch": 0.4911295572916667, + "grad_norm": 17.97666358947754, + "learning_rate": 9.63524955392409e-06, + "loss": 5.1464, + "step": 24140 + }, + { + "epoch": 0.4912312825520833, + "grad_norm": 16.436443328857422, + "learning_rate": 9.635099684873439e-06, + "loss": 5.2178, + "step": 24145 + }, + { + "epoch": 0.4913330078125, + "grad_norm": 17.121164321899414, + "learning_rate": 9.634949786205974e-06, + "loss": 5.1896, + "step": 24150 + }, + { + "epoch": 0.4914347330729167, + "grad_norm": 15.60840892791748, + "learning_rate": 9.63479985792265e-06, + "loss": 5.2697, + "step": 24155 + }, + { + "epoch": 0.4915364583333333, + "grad_norm": 18.96401023864746, + "learning_rate": 9.634649900024426e-06, + "loss": 5.1846, + "step": 24160 + }, + { + "epoch": 0.49163818359375, + "grad_norm": 22.04267692565918, + "learning_rate": 9.634499912512261e-06, + "loss": 5.256, + "step": 24165 + }, + { + "epoch": 0.4917399088541667, + "grad_norm": 14.836254119873047, + "learning_rate": 9.634349895387111e-06, + "loss": 4.9381, + "step": 24170 + }, + { + "epoch": 0.4918416341145833, + "grad_norm": 14.32054615020752, + "learning_rate": 9.634199848649939e-06, + "loss": 5.7077, + "step": 24175 + }, + { + "epoch": 0.491943359375, + "grad_norm": 13.121365547180176, + "learning_rate": 9.6340497723017e-06, + "loss": 5.1382, + "step": 24180 + }, + { + "epoch": 0.4920450846354167, + "grad_norm": 17.655550003051758, + "learning_rate": 9.633899666343354e-06, + "loss": 5.0706, + "step": 24185 + }, + { + "epoch": 0.4921468098958333, + "grad_norm": 28.54133415222168, + "learning_rate": 9.63374953077586e-06, + "loss": 5.2908, + "step": 24190 + }, + { + "epoch": 0.49224853515625, + "grad_norm": 20.41023826599121, + "learning_rate": 9.633599365600177e-06, + "loss": 5.1969, + "step": 24195 + }, + { + "epoch": 0.4923502604166667, + "grad_norm": 20.810400009155273, + "learning_rate": 9.633449170817266e-06, + "loss": 4.9926, + "step": 24200 + }, + { + "epoch": 0.4924519856770833, + "grad_norm": 19.407121658325195, + "learning_rate": 9.633298946428085e-06, + "loss": 5.238, + "step": 24205 + }, + { + "epoch": 0.4925537109375, + "grad_norm": 15.88644027709961, + "learning_rate": 9.633148692433594e-06, + "loss": 5.2306, + "step": 24210 + }, + { + "epoch": 0.4926554361979167, + "grad_norm": 17.967697143554688, + "learning_rate": 9.632998408834755e-06, + "loss": 5.224, + "step": 24215 + }, + { + "epoch": 0.4927571614583333, + "grad_norm": 16.876277923583984, + "learning_rate": 9.632848095632525e-06, + "loss": 5.2867, + "step": 24220 + }, + { + "epoch": 0.49285888671875, + "grad_norm": 17.89946174621582, + "learning_rate": 9.632697752827868e-06, + "loss": 5.2796, + "step": 24225 + }, + { + "epoch": 0.4929606119791667, + "grad_norm": 18.763851165771484, + "learning_rate": 9.632547380421741e-06, + "loss": 4.9552, + "step": 24230 + }, + { + "epoch": 0.4930623372395833, + "grad_norm": 18.469547271728516, + "learning_rate": 9.63239697841511e-06, + "loss": 5.2801, + "step": 24235 + }, + { + "epoch": 0.4931640625, + "grad_norm": 12.236815452575684, + "learning_rate": 9.632246546808929e-06, + "loss": 5.033, + "step": 24240 + }, + { + "epoch": 0.4932657877604167, + "grad_norm": 16.8654842376709, + "learning_rate": 9.632096085604166e-06, + "loss": 5.0735, + "step": 24245 + }, + { + "epoch": 0.4933675130208333, + "grad_norm": 20.334697723388672, + "learning_rate": 9.631945594801779e-06, + "loss": 4.9409, + "step": 24250 + }, + { + "epoch": 0.49346923828125, + "grad_norm": 19.7803897857666, + "learning_rate": 9.631795074402728e-06, + "loss": 5.1114, + "step": 24255 + }, + { + "epoch": 0.4935709635416667, + "grad_norm": 15.333322525024414, + "learning_rate": 9.631644524407979e-06, + "loss": 5.1078, + "step": 24260 + }, + { + "epoch": 0.4936726888020833, + "grad_norm": 19.265310287475586, + "learning_rate": 9.631493944818491e-06, + "loss": 5.096, + "step": 24265 + }, + { + "epoch": 0.4937744140625, + "grad_norm": 18.76092529296875, + "learning_rate": 9.631343335635228e-06, + "loss": 5.2719, + "step": 24270 + }, + { + "epoch": 0.4938761393229167, + "grad_norm": 14.679433822631836, + "learning_rate": 9.631192696859149e-06, + "loss": 5.0486, + "step": 24275 + }, + { + "epoch": 0.4939778645833333, + "grad_norm": 14.317996978759766, + "learning_rate": 9.631042028491222e-06, + "loss": 4.9806, + "step": 24280 + }, + { + "epoch": 0.49407958984375, + "grad_norm": 18.405336380004883, + "learning_rate": 9.630891330532403e-06, + "loss": 4.9586, + "step": 24285 + }, + { + "epoch": 0.4941813151041667, + "grad_norm": 12.74551010131836, + "learning_rate": 9.63074060298366e-06, + "loss": 5.0706, + "step": 24290 + }, + { + "epoch": 0.4942830403645833, + "grad_norm": 19.423873901367188, + "learning_rate": 9.630589845845955e-06, + "loss": 5.1107, + "step": 24295 + }, + { + "epoch": 0.494384765625, + "grad_norm": 20.08292579650879, + "learning_rate": 9.630439059120252e-06, + "loss": 5.3006, + "step": 24300 + }, + { + "epoch": 0.4944864908854167, + "grad_norm": 19.569313049316406, + "learning_rate": 9.63028824280751e-06, + "loss": 5.1267, + "step": 24305 + }, + { + "epoch": 0.4945882161458333, + "grad_norm": 17.47850799560547, + "learning_rate": 9.630137396908698e-06, + "loss": 5.1635, + "step": 24310 + }, + { + "epoch": 0.49468994140625, + "grad_norm": 14.03426742553711, + "learning_rate": 9.629986521424778e-06, + "loss": 5.6598, + "step": 24315 + }, + { + "epoch": 0.4947916666666667, + "grad_norm": 14.589723587036133, + "learning_rate": 9.629835616356714e-06, + "loss": 4.9717, + "step": 24320 + }, + { + "epoch": 0.4948933919270833, + "grad_norm": 15.87348747253418, + "learning_rate": 9.629684681705468e-06, + "loss": 5.2426, + "step": 24325 + }, + { + "epoch": 0.4949951171875, + "grad_norm": 16.793079376220703, + "learning_rate": 9.629533717472008e-06, + "loss": 5.238, + "step": 24330 + }, + { + "epoch": 0.4950968424479167, + "grad_norm": 15.218042373657227, + "learning_rate": 9.629382723657298e-06, + "loss": 5.092, + "step": 24335 + }, + { + "epoch": 0.4951985677083333, + "grad_norm": 16.41922378540039, + "learning_rate": 9.629231700262301e-06, + "loss": 5.1417, + "step": 24340 + }, + { + "epoch": 0.49530029296875, + "grad_norm": 23.879446029663086, + "learning_rate": 9.629080647287983e-06, + "loss": 5.1333, + "step": 24345 + }, + { + "epoch": 0.4954020182291667, + "grad_norm": 13.859989166259766, + "learning_rate": 9.62892956473531e-06, + "loss": 5.2315, + "step": 24350 + }, + { + "epoch": 0.4955037434895833, + "grad_norm": 15.515597343444824, + "learning_rate": 9.628778452605245e-06, + "loss": 5.1818, + "step": 24355 + }, + { + "epoch": 0.49560546875, + "grad_norm": 14.027379989624023, + "learning_rate": 9.628627310898756e-06, + "loss": 5.0856, + "step": 24360 + }, + { + "epoch": 0.4957071940104167, + "grad_norm": 17.24405288696289, + "learning_rate": 9.628476139616807e-06, + "loss": 5.0866, + "step": 24365 + }, + { + "epoch": 0.4958089192708333, + "grad_norm": 15.8631010055542, + "learning_rate": 9.628324938760366e-06, + "loss": 5.2323, + "step": 24370 + }, + { + "epoch": 0.49591064453125, + "grad_norm": 20.392433166503906, + "learning_rate": 9.628173708330397e-06, + "loss": 5.0849, + "step": 24375 + }, + { + "epoch": 0.4960123697916667, + "grad_norm": 13.791893005371094, + "learning_rate": 9.628022448327868e-06, + "loss": 5.1236, + "step": 24380 + }, + { + "epoch": 0.4961140950520833, + "grad_norm": 17.54197120666504, + "learning_rate": 9.627871158753743e-06, + "loss": 5.1355, + "step": 24385 + }, + { + "epoch": 0.4962158203125, + "grad_norm": 17.955411911010742, + "learning_rate": 9.627719839608993e-06, + "loss": 5.139, + "step": 24390 + }, + { + "epoch": 0.4963175455729167, + "grad_norm": 16.08799171447754, + "learning_rate": 9.62756849089458e-06, + "loss": 4.9067, + "step": 24395 + }, + { + "epoch": 0.4964192708333333, + "grad_norm": 16.09320068359375, + "learning_rate": 9.627417112611475e-06, + "loss": 5.3403, + "step": 24400 + }, + { + "epoch": 0.49652099609375, + "grad_norm": 14.146628379821777, + "learning_rate": 9.627265704760643e-06, + "loss": 5.268, + "step": 24405 + }, + { + "epoch": 0.4966227213541667, + "grad_norm": 16.573917388916016, + "learning_rate": 9.62711426734305e-06, + "loss": 5.2142, + "step": 24410 + }, + { + "epoch": 0.4967244466145833, + "grad_norm": 15.234368324279785, + "learning_rate": 9.626962800359667e-06, + "loss": 5.1126, + "step": 24415 + }, + { + "epoch": 0.496826171875, + "grad_norm": 12.344883918762207, + "learning_rate": 9.626811303811463e-06, + "loss": 5.2003, + "step": 24420 + }, + { + "epoch": 0.4969278971354167, + "grad_norm": 14.017895698547363, + "learning_rate": 9.6266597776994e-06, + "loss": 5.1642, + "step": 24425 + }, + { + "epoch": 0.4970296223958333, + "grad_norm": 13.152073860168457, + "learning_rate": 9.626508222024451e-06, + "loss": 5.1246, + "step": 24430 + }, + { + "epoch": 0.49713134765625, + "grad_norm": 15.538213729858398, + "learning_rate": 9.626356636787584e-06, + "loss": 5.134, + "step": 24435 + }, + { + "epoch": 0.4972330729166667, + "grad_norm": 16.23841094970703, + "learning_rate": 9.626205021989765e-06, + "loss": 4.8969, + "step": 24440 + }, + { + "epoch": 0.4973347981770833, + "grad_norm": 16.671741485595703, + "learning_rate": 9.626053377631966e-06, + "loss": 5.0619, + "step": 24445 + }, + { + "epoch": 0.4974365234375, + "grad_norm": 14.930729866027832, + "learning_rate": 9.625901703715154e-06, + "loss": 5.1082, + "step": 24450 + }, + { + "epoch": 0.4975382486979167, + "grad_norm": 15.038495063781738, + "learning_rate": 9.625750000240298e-06, + "loss": 4.8869, + "step": 24455 + }, + { + "epoch": 0.4976399739583333, + "grad_norm": 15.736631393432617, + "learning_rate": 9.625598267208369e-06, + "loss": 5.0908, + "step": 24460 + }, + { + "epoch": 0.49774169921875, + "grad_norm": 22.35830307006836, + "learning_rate": 9.625446504620334e-06, + "loss": 5.3108, + "step": 24465 + }, + { + "epoch": 0.4978434244791667, + "grad_norm": 15.791208267211914, + "learning_rate": 9.625294712477165e-06, + "loss": 5.0134, + "step": 24470 + }, + { + "epoch": 0.4979451497395833, + "grad_norm": 20.775955200195312, + "learning_rate": 9.625142890779831e-06, + "loss": 5.0275, + "step": 24475 + }, + { + "epoch": 0.498046875, + "grad_norm": 20.89739990234375, + "learning_rate": 9.624991039529302e-06, + "loss": 5.3451, + "step": 24480 + }, + { + "epoch": 0.4981486002604167, + "grad_norm": 16.821086883544922, + "learning_rate": 9.624839158726549e-06, + "loss": 5.2954, + "step": 24485 + }, + { + "epoch": 0.4982503255208333, + "grad_norm": 14.416380882263184, + "learning_rate": 9.624687248372541e-06, + "loss": 5.1176, + "step": 24490 + }, + { + "epoch": 0.49835205078125, + "grad_norm": 14.991535186767578, + "learning_rate": 9.62453530846825e-06, + "loss": 5.0508, + "step": 24495 + }, + { + "epoch": 0.4984537760416667, + "grad_norm": 18.516660690307617, + "learning_rate": 9.624383339014646e-06, + "loss": 5.1894, + "step": 24500 + }, + { + "epoch": 0.4985555013020833, + "grad_norm": 14.522954940795898, + "learning_rate": 9.624231340012701e-06, + "loss": 5.2133, + "step": 24505 + }, + { + "epoch": 0.4986572265625, + "grad_norm": 20.958982467651367, + "learning_rate": 9.624079311463385e-06, + "loss": 5.3799, + "step": 24510 + }, + { + "epoch": 0.4987589518229167, + "grad_norm": 14.450620651245117, + "learning_rate": 9.623927253367672e-06, + "loss": 5.1995, + "step": 24515 + }, + { + "epoch": 0.4988606770833333, + "grad_norm": 22.408843994140625, + "learning_rate": 9.62377516572653e-06, + "loss": 5.3222, + "step": 24520 + }, + { + "epoch": 0.49896240234375, + "grad_norm": 14.640572547912598, + "learning_rate": 9.623623048540932e-06, + "loss": 5.402, + "step": 24525 + }, + { + "epoch": 0.4990641276041667, + "grad_norm": 19.816986083984375, + "learning_rate": 9.62347090181185e-06, + "loss": 4.8426, + "step": 24530 + }, + { + "epoch": 0.4991658528645833, + "grad_norm": 14.436939239501953, + "learning_rate": 9.62331872554026e-06, + "loss": 4.9453, + "step": 24535 + }, + { + "epoch": 0.499267578125, + "grad_norm": 13.93411922454834, + "learning_rate": 9.623166519727128e-06, + "loss": 5.2519, + "step": 24540 + }, + { + "epoch": 0.4993693033854167, + "grad_norm": 14.640132904052734, + "learning_rate": 9.62301428437343e-06, + "loss": 5.4454, + "step": 24545 + }, + { + "epoch": 0.4994710286458333, + "grad_norm": 16.632204055786133, + "learning_rate": 9.62286201948014e-06, + "loss": 4.8759, + "step": 24550 + }, + { + "epoch": 0.49957275390625, + "grad_norm": 15.601277351379395, + "learning_rate": 9.622709725048227e-06, + "loss": 5.1601, + "step": 24555 + }, + { + "epoch": 0.4996744791666667, + "grad_norm": 17.62322425842285, + "learning_rate": 9.622557401078666e-06, + "loss": 5.0481, + "step": 24560 + }, + { + "epoch": 0.4997762044270833, + "grad_norm": 19.424291610717773, + "learning_rate": 9.622405047572431e-06, + "loss": 5.2238, + "step": 24565 + }, + { + "epoch": 0.4998779296875, + "grad_norm": 17.120296478271484, + "learning_rate": 9.622252664530498e-06, + "loss": 5.1955, + "step": 24570 + }, + { + "epoch": 0.4999796549479167, + "grad_norm": 21.13141632080078, + "learning_rate": 9.622100251953834e-06, + "loss": 5.0898, + "step": 24575 + }, + { + "epoch": 0.5, + "eval_loss": 5.169230937957764, + "eval_runtime": 107.6642, + "eval_samples_per_second": 18.641, + "eval_steps_per_second": 9.325, + "step": 24576 + }, + { + "epoch": 0.5000813802083334, + "grad_norm": 15.565807342529297, + "learning_rate": 9.621947809843419e-06, + "loss": 5.1883, + "step": 24580 + }, + { + "epoch": 0.50018310546875, + "grad_norm": 22.05304718017578, + "learning_rate": 9.621795338200223e-06, + "loss": 5.12, + "step": 24585 + }, + { + "epoch": 0.5002848307291666, + "grad_norm": 20.503005981445312, + "learning_rate": 9.621642837025224e-06, + "loss": 5.3462, + "step": 24590 + }, + { + "epoch": 0.5003865559895834, + "grad_norm": 16.996875762939453, + "learning_rate": 9.621490306319393e-06, + "loss": 5.1114, + "step": 24595 + }, + { + "epoch": 0.50048828125, + "grad_norm": 12.754294395446777, + "learning_rate": 9.621337746083705e-06, + "loss": 4.9207, + "step": 24600 + }, + { + "epoch": 0.5005900065104166, + "grad_norm": 13.871254920959473, + "learning_rate": 9.621185156319139e-06, + "loss": 5.3964, + "step": 24605 + }, + { + "epoch": 0.5006917317708334, + "grad_norm": 15.607601165771484, + "learning_rate": 9.621032537026665e-06, + "loss": 5.0693, + "step": 24610 + }, + { + "epoch": 0.50079345703125, + "grad_norm": 14.929997444152832, + "learning_rate": 9.620879888207258e-06, + "loss": 5.2523, + "step": 24615 + }, + { + "epoch": 0.5008951822916666, + "grad_norm": 33.463829040527344, + "learning_rate": 9.620727209861898e-06, + "loss": 5.274, + "step": 24620 + }, + { + "epoch": 0.5009969075520834, + "grad_norm": 18.089160919189453, + "learning_rate": 9.620574501991558e-06, + "loss": 5.679, + "step": 24625 + }, + { + "epoch": 0.5010986328125, + "grad_norm": 22.820209503173828, + "learning_rate": 9.620421764597212e-06, + "loss": 5.0575, + "step": 24630 + }, + { + "epoch": 0.5012003580729166, + "grad_norm": 20.336164474487305, + "learning_rate": 9.620268997679838e-06, + "loss": 5.1575, + "step": 24635 + }, + { + "epoch": 0.5013020833333334, + "grad_norm": 16.293333053588867, + "learning_rate": 9.620116201240412e-06, + "loss": 5.0711, + "step": 24640 + }, + { + "epoch": 0.50140380859375, + "grad_norm": 14.019972801208496, + "learning_rate": 9.61996337527991e-06, + "loss": 5.3288, + "step": 24645 + }, + { + "epoch": 0.5015055338541666, + "grad_norm": 15.952667236328125, + "learning_rate": 9.619810519799311e-06, + "loss": 5.1705, + "step": 24650 + }, + { + "epoch": 0.5016072591145834, + "grad_norm": 16.516366958618164, + "learning_rate": 9.619657634799587e-06, + "loss": 5.0558, + "step": 24655 + }, + { + "epoch": 0.501708984375, + "grad_norm": 15.622396469116211, + "learning_rate": 9.619504720281717e-06, + "loss": 5.2301, + "step": 24660 + }, + { + "epoch": 0.5018107096354166, + "grad_norm": 12.059234619140625, + "learning_rate": 9.619351776246679e-06, + "loss": 5.0779, + "step": 24665 + }, + { + "epoch": 0.5019124348958334, + "grad_norm": 19.943382263183594, + "learning_rate": 9.619198802695449e-06, + "loss": 5.2055, + "step": 24670 + }, + { + "epoch": 0.50201416015625, + "grad_norm": 22.879209518432617, + "learning_rate": 9.619045799629006e-06, + "loss": 5.1952, + "step": 24675 + }, + { + "epoch": 0.5021158854166666, + "grad_norm": 18.397968292236328, + "learning_rate": 9.618892767048324e-06, + "loss": 5.2451, + "step": 24680 + }, + { + "epoch": 0.5022176106770834, + "grad_norm": 10.750333786010742, + "learning_rate": 9.618739704954387e-06, + "loss": 5.2478, + "step": 24685 + }, + { + "epoch": 0.5023193359375, + "grad_norm": 15.636775970458984, + "learning_rate": 9.618586613348166e-06, + "loss": 5.1204, + "step": 24690 + }, + { + "epoch": 0.5024210611979166, + "grad_norm": 15.21705436706543, + "learning_rate": 9.618433492230645e-06, + "loss": 5.1368, + "step": 24695 + }, + { + "epoch": 0.5025227864583334, + "grad_norm": 14.102604866027832, + "learning_rate": 9.6182803416028e-06, + "loss": 5.6169, + "step": 24700 + }, + { + "epoch": 0.50262451171875, + "grad_norm": 17.215024948120117, + "learning_rate": 9.618127161465609e-06, + "loss": 5.3646, + "step": 24705 + }, + { + "epoch": 0.5027262369791666, + "grad_norm": 14.006219863891602, + "learning_rate": 9.617973951820052e-06, + "loss": 5.0262, + "step": 24710 + }, + { + "epoch": 0.5028279622395834, + "grad_norm": 31.53338623046875, + "learning_rate": 9.617820712667107e-06, + "loss": 5.236, + "step": 24715 + }, + { + "epoch": 0.5029296875, + "grad_norm": 18.120731353759766, + "learning_rate": 9.617667444007753e-06, + "loss": 5.1502, + "step": 24720 + }, + { + "epoch": 0.5030314127604166, + "grad_norm": 26.570270538330078, + "learning_rate": 9.61751414584297e-06, + "loss": 5.3439, + "step": 24725 + }, + { + "epoch": 0.5031331380208334, + "grad_norm": 18.772151947021484, + "learning_rate": 9.61736081817374e-06, + "loss": 5.3011, + "step": 24730 + }, + { + "epoch": 0.50323486328125, + "grad_norm": 18.382396697998047, + "learning_rate": 9.617207461001037e-06, + "loss": 5.0868, + "step": 24735 + }, + { + "epoch": 0.5033365885416666, + "grad_norm": 19.443872451782227, + "learning_rate": 9.617054074325844e-06, + "loss": 5.0933, + "step": 24740 + }, + { + "epoch": 0.5034383138020834, + "grad_norm": 12.329854011535645, + "learning_rate": 9.616900658149141e-06, + "loss": 5.2217, + "step": 24745 + }, + { + "epoch": 0.5035400390625, + "grad_norm": 15.956316947937012, + "learning_rate": 9.61674721247191e-06, + "loss": 5.119, + "step": 24750 + }, + { + "epoch": 0.5036417643229166, + "grad_norm": 17.650177001953125, + "learning_rate": 9.616593737295131e-06, + "loss": 5.2168, + "step": 24755 + }, + { + "epoch": 0.5037434895833334, + "grad_norm": 14.337836265563965, + "learning_rate": 9.616440232619782e-06, + "loss": 5.2911, + "step": 24760 + }, + { + "epoch": 0.50384521484375, + "grad_norm": 18.10527992248535, + "learning_rate": 9.616286698446844e-06, + "loss": 5.0113, + "step": 24765 + }, + { + "epoch": 0.5039469401041666, + "grad_norm": 26.019411087036133, + "learning_rate": 9.616133134777302e-06, + "loss": 5.1332, + "step": 24770 + }, + { + "epoch": 0.5040486653645834, + "grad_norm": 15.482717514038086, + "learning_rate": 9.615979541612133e-06, + "loss": 5.0092, + "step": 24775 + }, + { + "epoch": 0.504150390625, + "grad_norm": 16.099828720092773, + "learning_rate": 9.61582591895232e-06, + "loss": 5.0907, + "step": 24780 + }, + { + "epoch": 0.5042521158854166, + "grad_norm": 19.048498153686523, + "learning_rate": 9.615672266798846e-06, + "loss": 5.0934, + "step": 24785 + }, + { + "epoch": 0.5043538411458334, + "grad_norm": 17.69253921508789, + "learning_rate": 9.615518585152688e-06, + "loss": 5.1336, + "step": 24790 + }, + { + "epoch": 0.50445556640625, + "grad_norm": 16.256929397583008, + "learning_rate": 9.615364874014834e-06, + "loss": 5.0284, + "step": 24795 + }, + { + "epoch": 0.5045572916666666, + "grad_norm": 17.80290412902832, + "learning_rate": 9.615211133386263e-06, + "loss": 5.244, + "step": 24800 + }, + { + "epoch": 0.5046590169270834, + "grad_norm": 14.746432304382324, + "learning_rate": 9.615057363267958e-06, + "loss": 5.3179, + "step": 24805 + }, + { + "epoch": 0.5047607421875, + "grad_norm": 14.15833854675293, + "learning_rate": 9.614903563660904e-06, + "loss": 5.0639, + "step": 24810 + }, + { + "epoch": 0.5048624674479166, + "grad_norm": 14.141382217407227, + "learning_rate": 9.614749734566078e-06, + "loss": 5.1639, + "step": 24815 + }, + { + "epoch": 0.5049641927083334, + "grad_norm": 15.97409439086914, + "learning_rate": 9.614595875984466e-06, + "loss": 5.1343, + "step": 24820 + }, + { + "epoch": 0.50506591796875, + "grad_norm": 20.240711212158203, + "learning_rate": 9.614441987917054e-06, + "loss": 5.0762, + "step": 24825 + }, + { + "epoch": 0.5051676432291666, + "grad_norm": 18.096607208251953, + "learning_rate": 9.61428807036482e-06, + "loss": 5.1189, + "step": 24830 + }, + { + "epoch": 0.5052693684895834, + "grad_norm": 15.379273414611816, + "learning_rate": 9.61413412332875e-06, + "loss": 4.9422, + "step": 24835 + }, + { + "epoch": 0.50537109375, + "grad_norm": 17.336618423461914, + "learning_rate": 9.613980146809829e-06, + "loss": 5.1536, + "step": 24840 + }, + { + "epoch": 0.5054728190104166, + "grad_norm": 19.79825782775879, + "learning_rate": 9.61382614080904e-06, + "loss": 5.3471, + "step": 24845 + }, + { + "epoch": 0.5055745442708334, + "grad_norm": 15.718648910522461, + "learning_rate": 9.613672105327365e-06, + "loss": 5.3701, + "step": 24850 + }, + { + "epoch": 0.50567626953125, + "grad_norm": 12.698308944702148, + "learning_rate": 9.61351804036579e-06, + "loss": 5.2392, + "step": 24855 + }, + { + "epoch": 0.5057779947916666, + "grad_norm": 14.545037269592285, + "learning_rate": 9.6133639459253e-06, + "loss": 5.3322, + "step": 24860 + }, + { + "epoch": 0.5058797200520834, + "grad_norm": 15.023244857788086, + "learning_rate": 9.613209822006879e-06, + "loss": 5.0943, + "step": 24865 + }, + { + "epoch": 0.5059814453125, + "grad_norm": 13.595902442932129, + "learning_rate": 9.613055668611511e-06, + "loss": 5.1218, + "step": 24870 + }, + { + "epoch": 0.5060831705729166, + "grad_norm": 13.475979804992676, + "learning_rate": 9.612901485740182e-06, + "loss": 5.1054, + "step": 24875 + }, + { + "epoch": 0.5061848958333334, + "grad_norm": 13.410444259643555, + "learning_rate": 9.612747273393877e-06, + "loss": 5.224, + "step": 24880 + }, + { + "epoch": 0.50628662109375, + "grad_norm": 14.737875938415527, + "learning_rate": 9.612593031573581e-06, + "loss": 5.1456, + "step": 24885 + }, + { + "epoch": 0.5063883463541666, + "grad_norm": 19.952150344848633, + "learning_rate": 9.612438760280279e-06, + "loss": 5.1197, + "step": 24890 + }, + { + "epoch": 0.5064900716145834, + "grad_norm": 17.284090042114258, + "learning_rate": 9.612284459514958e-06, + "loss": 5.0749, + "step": 24895 + }, + { + "epoch": 0.506591796875, + "grad_norm": 22.388032913208008, + "learning_rate": 9.612130129278603e-06, + "loss": 5.1064, + "step": 24900 + }, + { + "epoch": 0.5066935221354166, + "grad_norm": 14.371082305908203, + "learning_rate": 9.611975769572201e-06, + "loss": 5.1487, + "step": 24905 + }, + { + "epoch": 0.5067952473958334, + "grad_norm": 12.759660720825195, + "learning_rate": 9.61182138039674e-06, + "loss": 5.0867, + "step": 24910 + }, + { + "epoch": 0.50689697265625, + "grad_norm": 18.537982940673828, + "learning_rate": 9.611666961753201e-06, + "loss": 5.1208, + "step": 24915 + }, + { + "epoch": 0.5069986979166666, + "grad_norm": 17.2394962310791, + "learning_rate": 9.611512513642575e-06, + "loss": 5.2612, + "step": 24920 + }, + { + "epoch": 0.5071004231770834, + "grad_norm": 23.699708938598633, + "learning_rate": 9.611358036065849e-06, + "loss": 4.9691, + "step": 24925 + }, + { + "epoch": 0.5072021484375, + "grad_norm": 16.369556427001953, + "learning_rate": 9.611203529024009e-06, + "loss": 4.9798, + "step": 24930 + }, + { + "epoch": 0.5073038736979166, + "grad_norm": 16.657852172851562, + "learning_rate": 9.611048992518042e-06, + "loss": 4.891, + "step": 24935 + }, + { + "epoch": 0.5074055989583334, + "grad_norm": 15.883289337158203, + "learning_rate": 9.610894426548933e-06, + "loss": 5.0005, + "step": 24940 + }, + { + "epoch": 0.50750732421875, + "grad_norm": 16.901941299438477, + "learning_rate": 9.610739831117675e-06, + "loss": 5.2346, + "step": 24945 + }, + { + "epoch": 0.5076090494791666, + "grad_norm": 21.330915451049805, + "learning_rate": 9.610585206225253e-06, + "loss": 5.2233, + "step": 24950 + }, + { + "epoch": 0.5077107747395834, + "grad_norm": 19.429672241210938, + "learning_rate": 9.610430551872656e-06, + "loss": 5.4895, + "step": 24955 + }, + { + "epoch": 0.5078125, + "grad_norm": 14.392637252807617, + "learning_rate": 9.61027586806087e-06, + "loss": 5.403, + "step": 24960 + }, + { + "epoch": 0.5079142252604166, + "grad_norm": 13.519229888916016, + "learning_rate": 9.610121154790884e-06, + "loss": 5.1046, + "step": 24965 + }, + { + "epoch": 0.5080159505208334, + "grad_norm": 19.730772018432617, + "learning_rate": 9.60996641206369e-06, + "loss": 5.2666, + "step": 24970 + }, + { + "epoch": 0.50811767578125, + "grad_norm": 18.701496124267578, + "learning_rate": 9.609811639880272e-06, + "loss": 4.7703, + "step": 24975 + }, + { + "epoch": 0.5082194010416666, + "grad_norm": 19.81943702697754, + "learning_rate": 9.60965683824162e-06, + "loss": 5.0527, + "step": 24980 + }, + { + "epoch": 0.5083211263020834, + "grad_norm": 13.670988082885742, + "learning_rate": 9.609502007148725e-06, + "loss": 5.2786, + "step": 24985 + }, + { + "epoch": 0.5084228515625, + "grad_norm": 12.853215217590332, + "learning_rate": 9.609347146602575e-06, + "loss": 5.258, + "step": 24990 + }, + { + "epoch": 0.5085245768229166, + "grad_norm": 19.998065948486328, + "learning_rate": 9.60919225660416e-06, + "loss": 5.2677, + "step": 24995 + }, + { + "epoch": 0.5086263020833334, + "grad_norm": 21.127614974975586, + "learning_rate": 9.609037337154472e-06, + "loss": 5.2338, + "step": 25000 + }, + { + "epoch": 0.50872802734375, + "grad_norm": 19.936689376831055, + "learning_rate": 9.608882388254497e-06, + "loss": 5.2606, + "step": 25005 + }, + { + "epoch": 0.5088297526041666, + "grad_norm": 18.94000244140625, + "learning_rate": 9.608727409905226e-06, + "loss": 5.2063, + "step": 25010 + }, + { + "epoch": 0.5089314778645834, + "grad_norm": 14.288313865661621, + "learning_rate": 9.60857240210765e-06, + "loss": 4.989, + "step": 25015 + }, + { + "epoch": 0.509033203125, + "grad_norm": 16.916362762451172, + "learning_rate": 9.60841736486276e-06, + "loss": 5.3102, + "step": 25020 + }, + { + "epoch": 0.5091349283854166, + "grad_norm": 17.706815719604492, + "learning_rate": 9.608262298171544e-06, + "loss": 5.265, + "step": 25025 + }, + { + "epoch": 0.5092366536458334, + "grad_norm": 16.95524024963379, + "learning_rate": 9.608107202034998e-06, + "loss": 4.9433, + "step": 25030 + }, + { + "epoch": 0.50933837890625, + "grad_norm": 13.687314987182617, + "learning_rate": 9.607952076454107e-06, + "loss": 5.0698, + "step": 25035 + }, + { + "epoch": 0.5094401041666666, + "grad_norm": 17.147342681884766, + "learning_rate": 9.607796921429866e-06, + "loss": 5.0222, + "step": 25040 + }, + { + "epoch": 0.5095418294270834, + "grad_norm": 19.56587028503418, + "learning_rate": 9.607641736963265e-06, + "loss": 5.2874, + "step": 25045 + }, + { + "epoch": 0.5096435546875, + "grad_norm": 17.406482696533203, + "learning_rate": 9.607486523055297e-06, + "loss": 5.1118, + "step": 25050 + }, + { + "epoch": 0.5097452799479166, + "grad_norm": 14.856159210205078, + "learning_rate": 9.607331279706952e-06, + "loss": 4.949, + "step": 25055 + }, + { + "epoch": 0.5098470052083334, + "grad_norm": 18.881099700927734, + "learning_rate": 9.607176006919224e-06, + "loss": 5.0227, + "step": 25060 + }, + { + "epoch": 0.50994873046875, + "grad_norm": 22.763212203979492, + "learning_rate": 9.607020704693102e-06, + "loss": 5.6356, + "step": 25065 + }, + { + "epoch": 0.5100504557291666, + "grad_norm": 18.00067138671875, + "learning_rate": 9.606865373029582e-06, + "loss": 5.3782, + "step": 25070 + }, + { + "epoch": 0.5101521809895834, + "grad_norm": 22.1876277923584, + "learning_rate": 9.606710011929652e-06, + "loss": 5.0656, + "step": 25075 + }, + { + "epoch": 0.51025390625, + "grad_norm": 17.513038635253906, + "learning_rate": 9.606554621394308e-06, + "loss": 5.2565, + "step": 25080 + }, + { + "epoch": 0.5103556315104166, + "grad_norm": 15.920730590820312, + "learning_rate": 9.606399201424544e-06, + "loss": 5.1969, + "step": 25085 + }, + { + "epoch": 0.5104573567708334, + "grad_norm": 17.335500717163086, + "learning_rate": 9.60624375202135e-06, + "loss": 5.2712, + "step": 25090 + }, + { + "epoch": 0.51055908203125, + "grad_norm": 19.12936782836914, + "learning_rate": 9.60608827318572e-06, + "loss": 5.2777, + "step": 25095 + }, + { + "epoch": 0.5106608072916666, + "grad_norm": 16.094209671020508, + "learning_rate": 9.605932764918651e-06, + "loss": 5.3327, + "step": 25100 + }, + { + "epoch": 0.5107625325520834, + "grad_norm": 19.216283798217773, + "learning_rate": 9.60577722722113e-06, + "loss": 5.2391, + "step": 25105 + }, + { + "epoch": 0.5108642578125, + "grad_norm": 13.12026596069336, + "learning_rate": 9.605621660094159e-06, + "loss": 5.0695, + "step": 25110 + }, + { + "epoch": 0.5109659830729166, + "grad_norm": 15.809826850891113, + "learning_rate": 9.605466063538725e-06, + "loss": 4.9236, + "step": 25115 + }, + { + "epoch": 0.5110677083333334, + "grad_norm": 17.480958938598633, + "learning_rate": 9.605310437555825e-06, + "loss": 5.2646, + "step": 25120 + }, + { + "epoch": 0.51116943359375, + "grad_norm": 19.93852996826172, + "learning_rate": 9.605154782146452e-06, + "loss": 5.0148, + "step": 25125 + }, + { + "epoch": 0.5112711588541666, + "grad_norm": 13.407774925231934, + "learning_rate": 9.604999097311605e-06, + "loss": 5.1658, + "step": 25130 + }, + { + "epoch": 0.5113728841145834, + "grad_norm": 20.05322265625, + "learning_rate": 9.604843383052274e-06, + "loss": 4.9524, + "step": 25135 + }, + { + "epoch": 0.511474609375, + "grad_norm": 19.790067672729492, + "learning_rate": 9.604687639369455e-06, + "loss": 4.8282, + "step": 25140 + }, + { + "epoch": 0.5115763346354166, + "grad_norm": 18.6205997467041, + "learning_rate": 9.604531866264143e-06, + "loss": 5.0542, + "step": 25145 + }, + { + "epoch": 0.5116780598958334, + "grad_norm": 17.29355812072754, + "learning_rate": 9.604376063737335e-06, + "loss": 5.2496, + "step": 25150 + }, + { + "epoch": 0.51177978515625, + "grad_norm": 14.523309707641602, + "learning_rate": 9.604220231790026e-06, + "loss": 5.1297, + "step": 25155 + }, + { + "epoch": 0.5118815104166666, + "grad_norm": 15.44129753112793, + "learning_rate": 9.60406437042321e-06, + "loss": 5.3952, + "step": 25160 + }, + { + "epoch": 0.5119832356770834, + "grad_norm": 12.988557815551758, + "learning_rate": 9.603908479637885e-06, + "loss": 4.8093, + "step": 25165 + }, + { + "epoch": 0.5120849609375, + "grad_norm": 14.446481704711914, + "learning_rate": 9.603752559435046e-06, + "loss": 5.142, + "step": 25170 + }, + { + "epoch": 0.5121866861979166, + "grad_norm": 18.073894500732422, + "learning_rate": 9.603596609815692e-06, + "loss": 5.1548, + "step": 25175 + }, + { + "epoch": 0.5122884114583334, + "grad_norm": 18.545621871948242, + "learning_rate": 9.603440630780814e-06, + "loss": 5.1288, + "step": 25180 + }, + { + "epoch": 0.51239013671875, + "grad_norm": 19.257713317871094, + "learning_rate": 9.603284622331413e-06, + "loss": 5.3228, + "step": 25185 + }, + { + "epoch": 0.5124918619791666, + "grad_norm": 20.170513153076172, + "learning_rate": 9.603128584468485e-06, + "loss": 5.2073, + "step": 25190 + }, + { + "epoch": 0.5125935872395834, + "grad_norm": 15.119446754455566, + "learning_rate": 9.602972517193023e-06, + "loss": 4.9005, + "step": 25195 + }, + { + "epoch": 0.5126953125, + "grad_norm": 13.598370552062988, + "learning_rate": 9.602816420506032e-06, + "loss": 5.2041, + "step": 25200 + }, + { + "epoch": 0.5127970377604166, + "grad_norm": 13.402128219604492, + "learning_rate": 9.602660294408504e-06, + "loss": 5.2442, + "step": 25205 + }, + { + "epoch": 0.5128987630208334, + "grad_norm": 15.810794830322266, + "learning_rate": 9.602504138901437e-06, + "loss": 4.8801, + "step": 25210 + }, + { + "epoch": 0.51300048828125, + "grad_norm": 12.709358215332031, + "learning_rate": 9.60234795398583e-06, + "loss": 5.059, + "step": 25215 + }, + { + "epoch": 0.5131022135416666, + "grad_norm": 20.837440490722656, + "learning_rate": 9.602191739662681e-06, + "loss": 5.3154, + "step": 25220 + }, + { + "epoch": 0.5132039388020834, + "grad_norm": 18.852033615112305, + "learning_rate": 9.602035495932988e-06, + "loss": 5.023, + "step": 25225 + }, + { + "epoch": 0.5133056640625, + "grad_norm": 24.805158615112305, + "learning_rate": 9.601879222797747e-06, + "loss": 5.397, + "step": 25230 + }, + { + "epoch": 0.5134073893229166, + "grad_norm": 30.68853187561035, + "learning_rate": 9.601722920257961e-06, + "loss": 5.5178, + "step": 25235 + }, + { + "epoch": 0.5135091145833334, + "grad_norm": 13.74907398223877, + "learning_rate": 9.601566588314626e-06, + "loss": 5.0965, + "step": 25240 + }, + { + "epoch": 0.51361083984375, + "grad_norm": 17.661081314086914, + "learning_rate": 9.60141022696874e-06, + "loss": 5.0345, + "step": 25245 + }, + { + "epoch": 0.5137125651041666, + "grad_norm": 18.406940460205078, + "learning_rate": 9.601253836221304e-06, + "loss": 4.9744, + "step": 25250 + }, + { + "epoch": 0.5138142903645834, + "grad_norm": 14.666051864624023, + "learning_rate": 9.601097416073318e-06, + "loss": 5.1805, + "step": 25255 + }, + { + "epoch": 0.513916015625, + "grad_norm": 16.522323608398438, + "learning_rate": 9.600940966525778e-06, + "loss": 5.045, + "step": 25260 + }, + { + "epoch": 0.5140177408854166, + "grad_norm": 16.052589416503906, + "learning_rate": 9.600784487579687e-06, + "loss": 5.351, + "step": 25265 + }, + { + "epoch": 0.5141194661458334, + "grad_norm": 15.452046394348145, + "learning_rate": 9.600627979236042e-06, + "loss": 5.2352, + "step": 25270 + }, + { + "epoch": 0.51422119140625, + "grad_norm": 20.541290283203125, + "learning_rate": 9.600471441495845e-06, + "loss": 5.1934, + "step": 25275 + }, + { + "epoch": 0.5143229166666666, + "grad_norm": 33.96351623535156, + "learning_rate": 9.600314874360097e-06, + "loss": 5.1757, + "step": 25280 + }, + { + "epoch": 0.5144246419270834, + "grad_norm": 15.471282958984375, + "learning_rate": 9.600158277829797e-06, + "loss": 5.212, + "step": 25285 + }, + { + "epoch": 0.5145263671875, + "grad_norm": 17.619226455688477, + "learning_rate": 9.600001651905946e-06, + "loss": 5.08, + "step": 25290 + }, + { + "epoch": 0.5146280924479166, + "grad_norm": 17.96546173095703, + "learning_rate": 9.599844996589544e-06, + "loss": 5.2611, + "step": 25295 + }, + { + "epoch": 0.5147298177083334, + "grad_norm": 19.199125289916992, + "learning_rate": 9.599688311881593e-06, + "loss": 5.1419, + "step": 25300 + }, + { + "epoch": 0.51483154296875, + "grad_norm": 15.284379005432129, + "learning_rate": 9.599531597783094e-06, + "loss": 5.1558, + "step": 25305 + }, + { + "epoch": 0.5149332682291666, + "grad_norm": 12.683929443359375, + "learning_rate": 9.599374854295047e-06, + "loss": 5.1577, + "step": 25310 + }, + { + "epoch": 0.5150349934895834, + "grad_norm": 22.1774959564209, + "learning_rate": 9.599218081418457e-06, + "loss": 5.0313, + "step": 25315 + }, + { + "epoch": 0.51513671875, + "grad_norm": 17.503372192382812, + "learning_rate": 9.599061279154322e-06, + "loss": 5.0333, + "step": 25320 + }, + { + "epoch": 0.5152384440104166, + "grad_norm": 20.418479919433594, + "learning_rate": 9.598904447503645e-06, + "loss": 5.1096, + "step": 25325 + }, + { + "epoch": 0.5153401692708334, + "grad_norm": 18.416805267333984, + "learning_rate": 9.598747586467429e-06, + "loss": 5.0861, + "step": 25330 + }, + { + "epoch": 0.51544189453125, + "grad_norm": 18.924062728881836, + "learning_rate": 9.598590696046677e-06, + "loss": 5.23, + "step": 25335 + }, + { + "epoch": 0.5155436197916666, + "grad_norm": 18.464784622192383, + "learning_rate": 9.598433776242389e-06, + "loss": 5.2217, + "step": 25340 + }, + { + "epoch": 0.5156453450520834, + "grad_norm": 16.901208877563477, + "learning_rate": 9.598276827055569e-06, + "loss": 5.1016, + "step": 25345 + }, + { + "epoch": 0.5157470703125, + "grad_norm": 21.96406364440918, + "learning_rate": 9.598119848487219e-06, + "loss": 5.4093, + "step": 25350 + }, + { + "epoch": 0.5158487955729166, + "grad_norm": 15.992688179016113, + "learning_rate": 9.597962840538344e-06, + "loss": 5.1589, + "step": 25355 + }, + { + "epoch": 0.5159505208333334, + "grad_norm": 15.016253471374512, + "learning_rate": 9.597805803209946e-06, + "loss": 5.314, + "step": 25360 + }, + { + "epoch": 0.51605224609375, + "grad_norm": 17.18465805053711, + "learning_rate": 9.597648736503027e-06, + "loss": 5.2392, + "step": 25365 + }, + { + "epoch": 0.5161539713541666, + "grad_norm": 14.759913444519043, + "learning_rate": 9.597491640418595e-06, + "loss": 5.1861, + "step": 25370 + }, + { + "epoch": 0.5162556966145834, + "grad_norm": 15.181002616882324, + "learning_rate": 9.597334514957648e-06, + "loss": 4.974, + "step": 25375 + }, + { + "epoch": 0.516357421875, + "grad_norm": 15.812889099121094, + "learning_rate": 9.597177360121195e-06, + "loss": 5.1788, + "step": 25380 + }, + { + "epoch": 0.5164591471354166, + "grad_norm": 14.866677284240723, + "learning_rate": 9.597020175910237e-06, + "loss": 5.321, + "step": 25385 + }, + { + "epoch": 0.5165608723958334, + "grad_norm": 23.757963180541992, + "learning_rate": 9.596862962325779e-06, + "loss": 5.0394, + "step": 25390 + }, + { + "epoch": 0.51666259765625, + "grad_norm": 16.200603485107422, + "learning_rate": 9.596705719368827e-06, + "loss": 5.3331, + "step": 25395 + }, + { + "epoch": 0.5167643229166666, + "grad_norm": 17.461084365844727, + "learning_rate": 9.596548447040383e-06, + "loss": 5.4245, + "step": 25400 + }, + { + "epoch": 0.5168660481770834, + "grad_norm": 17.716787338256836, + "learning_rate": 9.596391145341454e-06, + "loss": 5.4181, + "step": 25405 + }, + { + "epoch": 0.5169677734375, + "grad_norm": 17.20650863647461, + "learning_rate": 9.596233814273043e-06, + "loss": 5.1659, + "step": 25410 + }, + { + "epoch": 0.5170694986979166, + "grad_norm": 18.717050552368164, + "learning_rate": 9.59607645383616e-06, + "loss": 5.0307, + "step": 25415 + }, + { + "epoch": 0.5171712239583334, + "grad_norm": 24.474897384643555, + "learning_rate": 9.595919064031807e-06, + "loss": 5.101, + "step": 25420 + }, + { + "epoch": 0.51727294921875, + "grad_norm": 22.149354934692383, + "learning_rate": 9.595761644860991e-06, + "loss": 5.326, + "step": 25425 + }, + { + "epoch": 0.5173746744791666, + "grad_norm": 21.64432144165039, + "learning_rate": 9.595604196324715e-06, + "loss": 4.9799, + "step": 25430 + }, + { + "epoch": 0.5174763997395834, + "grad_norm": 15.079298973083496, + "learning_rate": 9.595446718423986e-06, + "loss": 5.1849, + "step": 25435 + }, + { + "epoch": 0.517578125, + "grad_norm": 18.750499725341797, + "learning_rate": 9.595289211159812e-06, + "loss": 5.2517, + "step": 25440 + }, + { + "epoch": 0.5176798502604166, + "grad_norm": 19.96014976501465, + "learning_rate": 9.5951316745332e-06, + "loss": 5.3971, + "step": 25445 + }, + { + "epoch": 0.5177815755208334, + "grad_norm": 13.558910369873047, + "learning_rate": 9.594974108545156e-06, + "loss": 4.9604, + "step": 25450 + }, + { + "epoch": 0.51788330078125, + "grad_norm": 18.54937744140625, + "learning_rate": 9.594816513196685e-06, + "loss": 5.0651, + "step": 25455 + }, + { + "epoch": 0.5179850260416666, + "grad_norm": 13.53268814086914, + "learning_rate": 9.594658888488795e-06, + "loss": 5.3406, + "step": 25460 + }, + { + "epoch": 0.5180867513020834, + "grad_norm": 25.520158767700195, + "learning_rate": 9.594501234422493e-06, + "loss": 5.4664, + "step": 25465 + }, + { + "epoch": 0.5181884765625, + "grad_norm": 15.58751392364502, + "learning_rate": 9.594343550998786e-06, + "loss": 4.969, + "step": 25470 + }, + { + "epoch": 0.5182902018229166, + "grad_norm": 18.80500030517578, + "learning_rate": 9.594185838218684e-06, + "loss": 5.0321, + "step": 25475 + }, + { + "epoch": 0.5183919270833334, + "grad_norm": 19.73370933532715, + "learning_rate": 9.59402809608319e-06, + "loss": 5.4573, + "step": 25480 + }, + { + "epoch": 0.51849365234375, + "grad_norm": 19.165462493896484, + "learning_rate": 9.593870324593318e-06, + "loss": 4.9473, + "step": 25485 + }, + { + "epoch": 0.5185953776041666, + "grad_norm": 18.5328369140625, + "learning_rate": 9.593712523750072e-06, + "loss": 5.2412, + "step": 25490 + }, + { + "epoch": 0.5186971028645834, + "grad_norm": 17.776836395263672, + "learning_rate": 9.593554693554462e-06, + "loss": 5.0063, + "step": 25495 + }, + { + "epoch": 0.518798828125, + "grad_norm": 13.682665824890137, + "learning_rate": 9.593396834007494e-06, + "loss": 5.3532, + "step": 25500 + }, + { + "epoch": 0.5189005533854166, + "grad_norm": 17.767425537109375, + "learning_rate": 9.593238945110178e-06, + "loss": 5.4037, + "step": 25505 + }, + { + "epoch": 0.5190022786458334, + "grad_norm": 17.282642364501953, + "learning_rate": 9.593081026863524e-06, + "loss": 5.3121, + "step": 25510 + }, + { + "epoch": 0.51910400390625, + "grad_norm": 25.26002311706543, + "learning_rate": 9.59292307926854e-06, + "loss": 5.0675, + "step": 25515 + }, + { + "epoch": 0.5192057291666666, + "grad_norm": 15.178634643554688, + "learning_rate": 9.592765102326235e-06, + "loss": 5.2034, + "step": 25520 + }, + { + "epoch": 0.5193074544270834, + "grad_norm": 19.003271102905273, + "learning_rate": 9.59260709603762e-06, + "loss": 5.4775, + "step": 25525 + }, + { + "epoch": 0.5194091796875, + "grad_norm": 15.301800727844238, + "learning_rate": 9.592449060403704e-06, + "loss": 5.6719, + "step": 25530 + }, + { + "epoch": 0.5195109049479166, + "grad_norm": 20.8083553314209, + "learning_rate": 9.592290995425495e-06, + "loss": 5.4021, + "step": 25535 + }, + { + "epoch": 0.5196126302083334, + "grad_norm": 20.797883987426758, + "learning_rate": 9.592132901104005e-06, + "loss": 5.0271, + "step": 25540 + }, + { + "epoch": 0.51971435546875, + "grad_norm": 18.741914749145508, + "learning_rate": 9.591974777440245e-06, + "loss": 5.1238, + "step": 25545 + }, + { + "epoch": 0.5198160807291666, + "grad_norm": 18.81517791748047, + "learning_rate": 9.591816624435222e-06, + "loss": 5.4604, + "step": 25550 + }, + { + "epoch": 0.5199178059895834, + "grad_norm": 15.459240913391113, + "learning_rate": 9.591658442089948e-06, + "loss": 5.2593, + "step": 25555 + }, + { + "epoch": 0.52001953125, + "grad_norm": 17.6088924407959, + "learning_rate": 9.591500230405435e-06, + "loss": 5.4096, + "step": 25560 + }, + { + "epoch": 0.5201212565104166, + "grad_norm": 17.771121978759766, + "learning_rate": 9.591341989382693e-06, + "loss": 5.6715, + "step": 25565 + }, + { + "epoch": 0.5202229817708334, + "grad_norm": 13.979878425598145, + "learning_rate": 9.591183719022734e-06, + "loss": 5.1379, + "step": 25570 + }, + { + "epoch": 0.52032470703125, + "grad_norm": 19.603363037109375, + "learning_rate": 9.591025419326568e-06, + "loss": 5.1873, + "step": 25575 + }, + { + "epoch": 0.5204264322916666, + "grad_norm": 19.952861785888672, + "learning_rate": 9.590867090295206e-06, + "loss": 5.3402, + "step": 25580 + }, + { + "epoch": 0.5205281575520834, + "grad_norm": 22.071849822998047, + "learning_rate": 9.590708731929662e-06, + "loss": 5.1173, + "step": 25585 + }, + { + "epoch": 0.5206298828125, + "grad_norm": 15.608780860900879, + "learning_rate": 9.590550344230945e-06, + "loss": 5.0479, + "step": 25590 + }, + { + "epoch": 0.5207316080729166, + "grad_norm": 21.206274032592773, + "learning_rate": 9.59039192720007e-06, + "loss": 5.0363, + "step": 25595 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 18.34174156188965, + "learning_rate": 9.590233480838046e-06, + "loss": 5.1955, + "step": 25600 + }, + { + "epoch": 0.52093505859375, + "grad_norm": 14.724275588989258, + "learning_rate": 9.590075005145888e-06, + "loss": 5.1212, + "step": 25605 + }, + { + "epoch": 0.5210367838541666, + "grad_norm": 18.836917877197266, + "learning_rate": 9.589916500124608e-06, + "loss": 5.3363, + "step": 25610 + }, + { + "epoch": 0.5211385091145834, + "grad_norm": 17.248666763305664, + "learning_rate": 9.58975796577522e-06, + "loss": 5.0937, + "step": 25615 + }, + { + "epoch": 0.521240234375, + "grad_norm": 17.67180633544922, + "learning_rate": 9.589599402098733e-06, + "loss": 5.157, + "step": 25620 + }, + { + "epoch": 0.5213419596354166, + "grad_norm": 12.41011905670166, + "learning_rate": 9.589440809096164e-06, + "loss": 5.0646, + "step": 25625 + }, + { + "epoch": 0.5214436848958334, + "grad_norm": 22.57916831970215, + "learning_rate": 9.589282186768524e-06, + "loss": 5.1233, + "step": 25630 + }, + { + "epoch": 0.52154541015625, + "grad_norm": 19.985260009765625, + "learning_rate": 9.58912353511683e-06, + "loss": 5.045, + "step": 25635 + }, + { + "epoch": 0.5216471354166666, + "grad_norm": 16.43364906311035, + "learning_rate": 9.588964854142092e-06, + "loss": 5.1252, + "step": 25640 + }, + { + "epoch": 0.5217488606770834, + "grad_norm": 16.483198165893555, + "learning_rate": 9.588806143845326e-06, + "loss": 5.0763, + "step": 25645 + }, + { + "epoch": 0.5218505859375, + "grad_norm": 21.247323989868164, + "learning_rate": 9.588647404227545e-06, + "loss": 5.087, + "step": 25650 + }, + { + "epoch": 0.5219523111979166, + "grad_norm": 21.52535629272461, + "learning_rate": 9.588488635289762e-06, + "loss": 5.1019, + "step": 25655 + }, + { + "epoch": 0.5220540364583334, + "grad_norm": 15.503020286560059, + "learning_rate": 9.588329837032995e-06, + "loss": 5.233, + "step": 25660 + }, + { + "epoch": 0.52215576171875, + "grad_norm": 18.324886322021484, + "learning_rate": 9.588171009458258e-06, + "loss": 5.2817, + "step": 25665 + }, + { + "epoch": 0.5222574869791666, + "grad_norm": 20.553342819213867, + "learning_rate": 9.588012152566563e-06, + "loss": 4.8294, + "step": 25670 + }, + { + "epoch": 0.5223592122395834, + "grad_norm": 12.389060974121094, + "learning_rate": 9.587853266358926e-06, + "loss": 5.2854, + "step": 25675 + }, + { + "epoch": 0.5224609375, + "grad_norm": 18.202550888061523, + "learning_rate": 9.587694350836364e-06, + "loss": 5.1861, + "step": 25680 + }, + { + "epoch": 0.5225626627604166, + "grad_norm": 16.551822662353516, + "learning_rate": 9.587535405999893e-06, + "loss": 4.9654, + "step": 25685 + }, + { + "epoch": 0.5226643880208334, + "grad_norm": 19.902070999145508, + "learning_rate": 9.587376431850526e-06, + "loss": 5.4328, + "step": 25690 + }, + { + "epoch": 0.52276611328125, + "grad_norm": 15.209125518798828, + "learning_rate": 9.58721742838928e-06, + "loss": 5.1383, + "step": 25695 + }, + { + "epoch": 0.5228678385416666, + "grad_norm": 17.25515365600586, + "learning_rate": 9.587058395617169e-06, + "loss": 5.2659, + "step": 25700 + }, + { + "epoch": 0.5229695638020834, + "grad_norm": 14.685185432434082, + "learning_rate": 9.586899333535212e-06, + "loss": 5.2034, + "step": 25705 + }, + { + "epoch": 0.5230712890625, + "grad_norm": 19.9639949798584, + "learning_rate": 9.586740242144426e-06, + "loss": 5.0081, + "step": 25710 + }, + { + "epoch": 0.5231730143229166, + "grad_norm": 23.412384033203125, + "learning_rate": 9.586581121445824e-06, + "loss": 5.2747, + "step": 25715 + }, + { + "epoch": 0.5232747395833334, + "grad_norm": 18.19957160949707, + "learning_rate": 9.586421971440425e-06, + "loss": 5.1624, + "step": 25720 + }, + { + "epoch": 0.52337646484375, + "grad_norm": 17.307336807250977, + "learning_rate": 9.586262792129245e-06, + "loss": 5.0662, + "step": 25725 + }, + { + "epoch": 0.5234781901041666, + "grad_norm": 18.630970001220703, + "learning_rate": 9.586103583513302e-06, + "loss": 5.3631, + "step": 25730 + }, + { + "epoch": 0.5235799153645834, + "grad_norm": 16.466510772705078, + "learning_rate": 9.585944345593614e-06, + "loss": 5.0691, + "step": 25735 + }, + { + "epoch": 0.523681640625, + "grad_norm": 13.309252738952637, + "learning_rate": 9.585785078371196e-06, + "loss": 5.0959, + "step": 25740 + }, + { + "epoch": 0.5237833658854166, + "grad_norm": 17.091432571411133, + "learning_rate": 9.585625781847067e-06, + "loss": 5.2016, + "step": 25745 + }, + { + "epoch": 0.5238850911458334, + "grad_norm": 14.403236389160156, + "learning_rate": 9.585466456022246e-06, + "loss": 5.1549, + "step": 25750 + }, + { + "epoch": 0.52398681640625, + "grad_norm": 15.078558921813965, + "learning_rate": 9.58530710089775e-06, + "loss": 5.0844, + "step": 25755 + }, + { + "epoch": 0.5240885416666666, + "grad_norm": 20.19329261779785, + "learning_rate": 9.585147716474597e-06, + "loss": 5.2198, + "step": 25760 + }, + { + "epoch": 0.5241902669270834, + "grad_norm": 17.044740676879883, + "learning_rate": 9.584988302753805e-06, + "loss": 4.9534, + "step": 25765 + }, + { + "epoch": 0.5242919921875, + "grad_norm": 18.075172424316406, + "learning_rate": 9.584828859736393e-06, + "loss": 5.1122, + "step": 25770 + }, + { + "epoch": 0.5243937174479166, + "grad_norm": 25.228424072265625, + "learning_rate": 9.584669387423382e-06, + "loss": 4.9998, + "step": 25775 + }, + { + "epoch": 0.5244954427083334, + "grad_norm": 15.373437881469727, + "learning_rate": 9.584509885815786e-06, + "loss": 5.0727, + "step": 25780 + }, + { + "epoch": 0.52459716796875, + "grad_norm": 17.512897491455078, + "learning_rate": 9.584350354914629e-06, + "loss": 4.8927, + "step": 25785 + }, + { + "epoch": 0.5246988932291666, + "grad_norm": 25.06592559814453, + "learning_rate": 9.584190794720929e-06, + "loss": 5.119, + "step": 25790 + }, + { + "epoch": 0.5248006184895834, + "grad_norm": 18.4107608795166, + "learning_rate": 9.584031205235704e-06, + "loss": 5.0038, + "step": 25795 + }, + { + "epoch": 0.52490234375, + "grad_norm": 19.589895248413086, + "learning_rate": 9.583871586459974e-06, + "loss": 5.1193, + "step": 25800 + }, + { + "epoch": 0.5250040690104166, + "grad_norm": 16.778766632080078, + "learning_rate": 9.583711938394762e-06, + "loss": 5.1605, + "step": 25805 + }, + { + "epoch": 0.5251057942708334, + "grad_norm": 22.080493927001953, + "learning_rate": 9.583552261041085e-06, + "loss": 5.2204, + "step": 25810 + }, + { + "epoch": 0.52520751953125, + "grad_norm": 21.711040496826172, + "learning_rate": 9.583392554399963e-06, + "loss": 5.005, + "step": 25815 + }, + { + "epoch": 0.5253092447916666, + "grad_norm": 12.497228622436523, + "learning_rate": 9.583232818472418e-06, + "loss": 5.1045, + "step": 25820 + }, + { + "epoch": 0.5254109700520834, + "grad_norm": 17.397048950195312, + "learning_rate": 9.58307305325947e-06, + "loss": 4.8699, + "step": 25825 + }, + { + "epoch": 0.5255126953125, + "grad_norm": 15.105122566223145, + "learning_rate": 9.582913258762141e-06, + "loss": 5.049, + "step": 25830 + }, + { + "epoch": 0.5256144205729166, + "grad_norm": 17.791744232177734, + "learning_rate": 9.58275343498145e-06, + "loss": 5.3115, + "step": 25835 + }, + { + "epoch": 0.5257161458333334, + "grad_norm": 20.916797637939453, + "learning_rate": 9.58259358191842e-06, + "loss": 4.9484, + "step": 25840 + }, + { + "epoch": 0.52581787109375, + "grad_norm": 19.46289825439453, + "learning_rate": 9.582433699574071e-06, + "loss": 5.0663, + "step": 25845 + }, + { + "epoch": 0.5259195963541666, + "grad_norm": 12.95954704284668, + "learning_rate": 9.582273787949425e-06, + "loss": 5.1901, + "step": 25850 + }, + { + "epoch": 0.5260213216145834, + "grad_norm": 16.779016494750977, + "learning_rate": 9.582113847045505e-06, + "loss": 5.1575, + "step": 25855 + }, + { + "epoch": 0.526123046875, + "grad_norm": 20.870508193969727, + "learning_rate": 9.581953876863332e-06, + "loss": 4.9941, + "step": 25860 + }, + { + "epoch": 0.5262247721354166, + "grad_norm": 23.2066593170166, + "learning_rate": 9.581793877403928e-06, + "loss": 5.0415, + "step": 25865 + }, + { + "epoch": 0.5263264973958334, + "grad_norm": 13.509159088134766, + "learning_rate": 9.581633848668317e-06, + "loss": 5.0925, + "step": 25870 + }, + { + "epoch": 0.52642822265625, + "grad_norm": 22.297571182250977, + "learning_rate": 9.581473790657518e-06, + "loss": 5.2387, + "step": 25875 + }, + { + "epoch": 0.5265299479166666, + "grad_norm": 14.030600547790527, + "learning_rate": 9.581313703372556e-06, + "loss": 5.2365, + "step": 25880 + }, + { + "epoch": 0.5266316731770834, + "grad_norm": 16.586912155151367, + "learning_rate": 9.581153586814455e-06, + "loss": 5.2833, + "step": 25885 + }, + { + "epoch": 0.5267333984375, + "grad_norm": 20.02341079711914, + "learning_rate": 9.580993440984237e-06, + "loss": 4.9085, + "step": 25890 + }, + { + "epoch": 0.5268351236979166, + "grad_norm": 28.040760040283203, + "learning_rate": 9.580833265882924e-06, + "loss": 5.4395, + "step": 25895 + }, + { + "epoch": 0.5269368489583334, + "grad_norm": 15.438876152038574, + "learning_rate": 9.58067306151154e-06, + "loss": 4.9881, + "step": 25900 + }, + { + "epoch": 0.52703857421875, + "grad_norm": 16.166257858276367, + "learning_rate": 9.58051282787111e-06, + "loss": 5.2938, + "step": 25905 + }, + { + "epoch": 0.5271402994791666, + "grad_norm": 19.375633239746094, + "learning_rate": 9.580352564962659e-06, + "loss": 5.1816, + "step": 25910 + }, + { + "epoch": 0.5272420247395834, + "grad_norm": 14.9541597366333, + "learning_rate": 9.580192272787208e-06, + "loss": 5.365, + "step": 25915 + }, + { + "epoch": 0.52734375, + "grad_norm": 12.091390609741211, + "learning_rate": 9.580031951345781e-06, + "loss": 5.0859, + "step": 25920 + }, + { + "epoch": 0.5274454752604166, + "grad_norm": 18.87215232849121, + "learning_rate": 9.579871600639405e-06, + "loss": 4.9567, + "step": 25925 + }, + { + "epoch": 0.5275472005208334, + "grad_norm": 14.166312217712402, + "learning_rate": 9.579711220669102e-06, + "loss": 5.4308, + "step": 25930 + }, + { + "epoch": 0.52764892578125, + "grad_norm": 15.707047462463379, + "learning_rate": 9.579550811435899e-06, + "loss": 5.3023, + "step": 25935 + }, + { + "epoch": 0.5277506510416666, + "grad_norm": 23.870166778564453, + "learning_rate": 9.57939037294082e-06, + "loss": 5.2673, + "step": 25940 + }, + { + "epoch": 0.5278523763020834, + "grad_norm": 15.580667495727539, + "learning_rate": 9.57922990518489e-06, + "loss": 5.0054, + "step": 25945 + }, + { + "epoch": 0.5279541015625, + "grad_norm": 23.945091247558594, + "learning_rate": 9.579069408169134e-06, + "loss": 5.108, + "step": 25950 + }, + { + "epoch": 0.5280558268229166, + "grad_norm": 20.329792022705078, + "learning_rate": 9.57890888189458e-06, + "loss": 5.2164, + "step": 25955 + }, + { + "epoch": 0.5281575520833334, + "grad_norm": 18.324554443359375, + "learning_rate": 9.57874832636225e-06, + "loss": 5.267, + "step": 25960 + }, + { + "epoch": 0.52825927734375, + "grad_norm": 18.68773078918457, + "learning_rate": 9.578587741573172e-06, + "loss": 5.0614, + "step": 25965 + }, + { + "epoch": 0.5283610026041666, + "grad_norm": 16.369304656982422, + "learning_rate": 9.578427127528373e-06, + "loss": 5.1223, + "step": 25970 + }, + { + "epoch": 0.5284627278645834, + "grad_norm": 19.134916305541992, + "learning_rate": 9.578266484228878e-06, + "loss": 5.0924, + "step": 25975 + }, + { + "epoch": 0.528564453125, + "grad_norm": 14.640033721923828, + "learning_rate": 9.578105811675711e-06, + "loss": 5.1619, + "step": 25980 + }, + { + "epoch": 0.5286661783854166, + "grad_norm": 15.465594291687012, + "learning_rate": 9.577945109869902e-06, + "loss": 5.0426, + "step": 25985 + }, + { + "epoch": 0.5287679036458334, + "grad_norm": 15.934280395507812, + "learning_rate": 9.577784378812478e-06, + "loss": 5.3853, + "step": 25990 + }, + { + "epoch": 0.52886962890625, + "grad_norm": 16.530492782592773, + "learning_rate": 9.577623618504465e-06, + "loss": 5.2133, + "step": 25995 + }, + { + "epoch": 0.5289713541666666, + "grad_norm": 15.594820976257324, + "learning_rate": 9.57746282894689e-06, + "loss": 5.0078, + "step": 26000 + }, + { + "epoch": 0.5290730794270834, + "grad_norm": 20.433345794677734, + "learning_rate": 9.577302010140781e-06, + "loss": 4.9671, + "step": 26005 + }, + { + "epoch": 0.5291748046875, + "grad_norm": 14.949853897094727, + "learning_rate": 9.577141162087165e-06, + "loss": 5.0434, + "step": 26010 + }, + { + "epoch": 0.5292765299479166, + "grad_norm": 17.06753158569336, + "learning_rate": 9.57698028478707e-06, + "loss": 5.069, + "step": 26015 + }, + { + "epoch": 0.5293782552083334, + "grad_norm": 16.263519287109375, + "learning_rate": 9.576819378241522e-06, + "loss": 5.1508, + "step": 26020 + }, + { + "epoch": 0.52947998046875, + "grad_norm": 26.065113067626953, + "learning_rate": 9.576658442451554e-06, + "loss": 5.2322, + "step": 26025 + }, + { + "epoch": 0.5295817057291666, + "grad_norm": 19.201757431030273, + "learning_rate": 9.57649747741819e-06, + "loss": 5.3786, + "step": 26030 + }, + { + "epoch": 0.5296834309895834, + "grad_norm": 19.6964054107666, + "learning_rate": 9.576336483142459e-06, + "loss": 5.0717, + "step": 26035 + }, + { + "epoch": 0.52978515625, + "grad_norm": 15.398101806640625, + "learning_rate": 9.576175459625391e-06, + "loss": 5.0419, + "step": 26040 + }, + { + "epoch": 0.5298868815104166, + "grad_norm": 12.652341842651367, + "learning_rate": 9.576014406868015e-06, + "loss": 4.9918, + "step": 26045 + }, + { + "epoch": 0.5299886067708334, + "grad_norm": 14.521097183227539, + "learning_rate": 9.57585332487136e-06, + "loss": 4.8986, + "step": 26050 + }, + { + "epoch": 0.53009033203125, + "grad_norm": 17.352895736694336, + "learning_rate": 9.575692213636455e-06, + "loss": 5.323, + "step": 26055 + }, + { + "epoch": 0.5301920572916666, + "grad_norm": 13.9312162399292, + "learning_rate": 9.575531073164327e-06, + "loss": 5.051, + "step": 26060 + }, + { + "epoch": 0.5302937825520834, + "grad_norm": 22.847930908203125, + "learning_rate": 9.57536990345601e-06, + "loss": 5.6886, + "step": 26065 + }, + { + "epoch": 0.5303955078125, + "grad_norm": 20.899572372436523, + "learning_rate": 9.575208704512532e-06, + "loss": 4.9105, + "step": 26070 + }, + { + "epoch": 0.5304972330729166, + "grad_norm": 18.433183670043945, + "learning_rate": 9.575047476334922e-06, + "loss": 5.0311, + "step": 26075 + }, + { + "epoch": 0.5305989583333334, + "grad_norm": 16.312488555908203, + "learning_rate": 9.57488621892421e-06, + "loss": 5.0075, + "step": 26080 + }, + { + "epoch": 0.53070068359375, + "grad_norm": 15.366402626037598, + "learning_rate": 9.57472493228143e-06, + "loss": 4.9219, + "step": 26085 + }, + { + "epoch": 0.5308024088541666, + "grad_norm": 17.991249084472656, + "learning_rate": 9.574563616407609e-06, + "loss": 5.1273, + "step": 26090 + }, + { + "epoch": 0.5309041341145834, + "grad_norm": 12.395438194274902, + "learning_rate": 9.574402271303777e-06, + "loss": 5.1802, + "step": 26095 + }, + { + "epoch": 0.531005859375, + "grad_norm": 19.975055694580078, + "learning_rate": 9.574240896970967e-06, + "loss": 5.1044, + "step": 26100 + }, + { + "epoch": 0.5311075846354166, + "grad_norm": 12.273757934570312, + "learning_rate": 9.57407949341021e-06, + "loss": 5.0191, + "step": 26105 + }, + { + "epoch": 0.5312093098958334, + "grad_norm": 18.660419464111328, + "learning_rate": 9.573918060622539e-06, + "loss": 5.2656, + "step": 26110 + }, + { + "epoch": 0.53131103515625, + "grad_norm": 18.285396575927734, + "learning_rate": 9.573756598608981e-06, + "loss": 4.9055, + "step": 26115 + }, + { + "epoch": 0.5314127604166666, + "grad_norm": 15.103423118591309, + "learning_rate": 9.57359510737057e-06, + "loss": 5.1873, + "step": 26120 + }, + { + "epoch": 0.5315144856770834, + "grad_norm": 25.393888473510742, + "learning_rate": 9.573433586908339e-06, + "loss": 5.2964, + "step": 26125 + }, + { + "epoch": 0.5316162109375, + "grad_norm": 14.60049057006836, + "learning_rate": 9.57327203722332e-06, + "loss": 5.0413, + "step": 26130 + }, + { + "epoch": 0.5317179361979166, + "grad_norm": 14.525989532470703, + "learning_rate": 9.573110458316543e-06, + "loss": 4.9576, + "step": 26135 + }, + { + "epoch": 0.5318196614583334, + "grad_norm": 16.00360870361328, + "learning_rate": 9.572948850189044e-06, + "loss": 5.1676, + "step": 26140 + }, + { + "epoch": 0.53192138671875, + "grad_norm": 17.99925994873047, + "learning_rate": 9.572787212841851e-06, + "loss": 5.3882, + "step": 26145 + }, + { + "epoch": 0.5320231119791666, + "grad_norm": 14.96079158782959, + "learning_rate": 9.572625546276e-06, + "loss": 5.2827, + "step": 26150 + }, + { + "epoch": 0.5321248372395834, + "grad_norm": 22.942594528198242, + "learning_rate": 9.572463850492524e-06, + "loss": 4.9989, + "step": 26155 + }, + { + "epoch": 0.5322265625, + "grad_norm": 17.7741641998291, + "learning_rate": 9.572302125492455e-06, + "loss": 5.3592, + "step": 26160 + }, + { + "epoch": 0.5323282877604166, + "grad_norm": 17.7944278717041, + "learning_rate": 9.572140371276828e-06, + "loss": 5.2363, + "step": 26165 + }, + { + "epoch": 0.5324300130208334, + "grad_norm": 13.811031341552734, + "learning_rate": 9.571978587846675e-06, + "loss": 4.9519, + "step": 26170 + }, + { + "epoch": 0.53253173828125, + "grad_norm": 20.24031639099121, + "learning_rate": 9.57181677520303e-06, + "loss": 4.9921, + "step": 26175 + }, + { + "epoch": 0.5326334635416666, + "grad_norm": 17.632951736450195, + "learning_rate": 9.571654933346925e-06, + "loss": 5.3533, + "step": 26180 + }, + { + "epoch": 0.5327351888020834, + "grad_norm": 17.693456649780273, + "learning_rate": 9.571493062279397e-06, + "loss": 5.3576, + "step": 26185 + }, + { + "epoch": 0.5328369140625, + "grad_norm": 16.525333404541016, + "learning_rate": 9.571331162001483e-06, + "loss": 5.0411, + "step": 26190 + }, + { + "epoch": 0.5329386393229166, + "grad_norm": 31.692808151245117, + "learning_rate": 9.571169232514211e-06, + "loss": 5.5346, + "step": 26195 + }, + { + "epoch": 0.5330403645833334, + "grad_norm": 17.558467864990234, + "learning_rate": 9.571007273818617e-06, + "loss": 5.1127, + "step": 26200 + }, + { + "epoch": 0.53314208984375, + "grad_norm": 20.83608627319336, + "learning_rate": 9.570845285915742e-06, + "loss": 5.0666, + "step": 26205 + }, + { + "epoch": 0.5332438151041666, + "grad_norm": 16.289470672607422, + "learning_rate": 9.570683268806614e-06, + "loss": 4.8611, + "step": 26210 + }, + { + "epoch": 0.5333455403645834, + "grad_norm": 16.149288177490234, + "learning_rate": 9.57052122249227e-06, + "loss": 4.9435, + "step": 26215 + }, + { + "epoch": 0.533447265625, + "grad_norm": 18.325990676879883, + "learning_rate": 9.570359146973749e-06, + "loss": 5.1284, + "step": 26220 + }, + { + "epoch": 0.5335489908854166, + "grad_norm": 14.35696029663086, + "learning_rate": 9.57019704225208e-06, + "loss": 5.0072, + "step": 26225 + }, + { + "epoch": 0.5336507161458334, + "grad_norm": 17.046144485473633, + "learning_rate": 9.570034908328306e-06, + "loss": 5.0384, + "step": 26230 + }, + { + "epoch": 0.53375244140625, + "grad_norm": 17.760944366455078, + "learning_rate": 9.56987274520346e-06, + "loss": 5.0111, + "step": 26235 + }, + { + "epoch": 0.5338541666666666, + "grad_norm": 17.82707977294922, + "learning_rate": 9.569710552878576e-06, + "loss": 5.2185, + "step": 26240 + }, + { + "epoch": 0.5339558919270834, + "grad_norm": 15.753804206848145, + "learning_rate": 9.569548331354691e-06, + "loss": 5.1625, + "step": 26245 + }, + { + "epoch": 0.5340576171875, + "grad_norm": 15.527420997619629, + "learning_rate": 9.569386080632846e-06, + "loss": 5.1572, + "step": 26250 + }, + { + "epoch": 0.5341593424479166, + "grad_norm": 16.5806941986084, + "learning_rate": 9.569223800714073e-06, + "loss": 5.0533, + "step": 26255 + }, + { + "epoch": 0.5342610677083334, + "grad_norm": 18.034395217895508, + "learning_rate": 9.56906149159941e-06, + "loss": 5.4023, + "step": 26260 + }, + { + "epoch": 0.53436279296875, + "grad_norm": 17.166370391845703, + "learning_rate": 9.568899153289895e-06, + "loss": 5.2422, + "step": 26265 + }, + { + "epoch": 0.5344645182291666, + "grad_norm": 16.898319244384766, + "learning_rate": 9.568736785786564e-06, + "loss": 5.0555, + "step": 26270 + }, + { + "epoch": 0.5345662434895834, + "grad_norm": 13.270625114440918, + "learning_rate": 9.568574389090457e-06, + "loss": 4.926, + "step": 26275 + }, + { + "epoch": 0.53466796875, + "grad_norm": 16.50812339782715, + "learning_rate": 9.568411963202608e-06, + "loss": 5.1437, + "step": 26280 + }, + { + "epoch": 0.5347696940104166, + "grad_norm": 15.227187156677246, + "learning_rate": 9.568249508124057e-06, + "loss": 5.0354, + "step": 26285 + }, + { + "epoch": 0.5348714192708334, + "grad_norm": 21.58112335205078, + "learning_rate": 9.568087023855843e-06, + "loss": 5.1085, + "step": 26290 + }, + { + "epoch": 0.53497314453125, + "grad_norm": 17.95667266845703, + "learning_rate": 9.567924510399002e-06, + "loss": 5.3734, + "step": 26295 + }, + { + "epoch": 0.5350748697916666, + "grad_norm": 19.76057243347168, + "learning_rate": 9.567761967754573e-06, + "loss": 5.0401, + "step": 26300 + }, + { + "epoch": 0.5351765950520834, + "grad_norm": 16.461681365966797, + "learning_rate": 9.567599395923596e-06, + "loss": 5.2652, + "step": 26305 + }, + { + "epoch": 0.5352783203125, + "grad_norm": 23.124813079833984, + "learning_rate": 9.567436794907108e-06, + "loss": 5.2774, + "step": 26310 + }, + { + "epoch": 0.5353800455729166, + "grad_norm": 22.426950454711914, + "learning_rate": 9.56727416470615e-06, + "loss": 5.2062, + "step": 26315 + }, + { + "epoch": 0.5354817708333334, + "grad_norm": 15.770591735839844, + "learning_rate": 9.567111505321759e-06, + "loss": 5.0571, + "step": 26320 + }, + { + "epoch": 0.53558349609375, + "grad_norm": 14.743668556213379, + "learning_rate": 9.566948816754975e-06, + "loss": 4.9877, + "step": 26325 + }, + { + "epoch": 0.5356852213541666, + "grad_norm": 20.316299438476562, + "learning_rate": 9.56678609900684e-06, + "loss": 5.3509, + "step": 26330 + }, + { + "epoch": 0.5357869466145834, + "grad_norm": 27.948652267456055, + "learning_rate": 9.566623352078388e-06, + "loss": 5.0039, + "step": 26335 + }, + { + "epoch": 0.535888671875, + "grad_norm": 19.843069076538086, + "learning_rate": 9.566460575970665e-06, + "loss": 5.001, + "step": 26340 + }, + { + "epoch": 0.5359903971354166, + "grad_norm": 17.9671573638916, + "learning_rate": 9.566297770684708e-06, + "loss": 5.1423, + "step": 26345 + }, + { + "epoch": 0.5360921223958334, + "grad_norm": 14.940194129943848, + "learning_rate": 9.566134936221557e-06, + "loss": 5.1965, + "step": 26350 + }, + { + "epoch": 0.53619384765625, + "grad_norm": 24.02495002746582, + "learning_rate": 9.565972072582255e-06, + "loss": 5.3647, + "step": 26355 + }, + { + "epoch": 0.5362955729166666, + "grad_norm": 16.587703704833984, + "learning_rate": 9.565809179767839e-06, + "loss": 5.167, + "step": 26360 + }, + { + "epoch": 0.5363972981770834, + "grad_norm": 16.586505889892578, + "learning_rate": 9.565646257779352e-06, + "loss": 4.8599, + "step": 26365 + }, + { + "epoch": 0.5364990234375, + "grad_norm": 15.804230690002441, + "learning_rate": 9.565483306617835e-06, + "loss": 5.1003, + "step": 26370 + }, + { + "epoch": 0.5366007486979166, + "grad_norm": 18.653663635253906, + "learning_rate": 9.56532032628433e-06, + "loss": 5.2981, + "step": 26375 + }, + { + "epoch": 0.5367024739583334, + "grad_norm": 17.28420066833496, + "learning_rate": 9.565157316779877e-06, + "loss": 5.0175, + "step": 26380 + }, + { + "epoch": 0.53680419921875, + "grad_norm": 18.863245010375977, + "learning_rate": 9.564994278105517e-06, + "loss": 5.0288, + "step": 26385 + }, + { + "epoch": 0.5369059244791666, + "grad_norm": 16.577945709228516, + "learning_rate": 9.564831210262294e-06, + "loss": 5.0752, + "step": 26390 + }, + { + "epoch": 0.5370076497395834, + "grad_norm": 16.95825958251953, + "learning_rate": 9.564668113251247e-06, + "loss": 5.0477, + "step": 26395 + }, + { + "epoch": 0.537109375, + "grad_norm": 16.53137969970703, + "learning_rate": 9.56450498707342e-06, + "loss": 5.3632, + "step": 26400 + }, + { + "epoch": 0.5372111002604166, + "grad_norm": 14.336532592773438, + "learning_rate": 9.564341831729855e-06, + "loss": 5.3611, + "step": 26405 + }, + { + "epoch": 0.5373128255208334, + "grad_norm": 19.74730110168457, + "learning_rate": 9.564178647221596e-06, + "loss": 5.0832, + "step": 26410 + }, + { + "epoch": 0.53741455078125, + "grad_norm": 19.82278060913086, + "learning_rate": 9.564015433549682e-06, + "loss": 5.336, + "step": 26415 + }, + { + "epoch": 0.5375162760416666, + "grad_norm": 13.788887023925781, + "learning_rate": 9.563852190715162e-06, + "loss": 5.0113, + "step": 26420 + }, + { + "epoch": 0.5376180013020834, + "grad_norm": 21.60955810546875, + "learning_rate": 9.563688918719072e-06, + "loss": 5.1094, + "step": 26425 + }, + { + "epoch": 0.5377197265625, + "grad_norm": 14.092100143432617, + "learning_rate": 9.56352561756246e-06, + "loss": 5.0612, + "step": 26430 + }, + { + "epoch": 0.5378214518229166, + "grad_norm": 15.514357566833496, + "learning_rate": 9.563362287246367e-06, + "loss": 5.0567, + "step": 26435 + }, + { + "epoch": 0.5379231770833334, + "grad_norm": 14.380485534667969, + "learning_rate": 9.563198927771838e-06, + "loss": 4.9394, + "step": 26440 + }, + { + "epoch": 0.53802490234375, + "grad_norm": 16.273603439331055, + "learning_rate": 9.563035539139915e-06, + "loss": 5.1142, + "step": 26445 + }, + { + "epoch": 0.5381266276041666, + "grad_norm": 17.18828010559082, + "learning_rate": 9.562872121351647e-06, + "loss": 5.0001, + "step": 26450 + }, + { + "epoch": 0.5382283528645834, + "grad_norm": 15.98421859741211, + "learning_rate": 9.562708674408072e-06, + "loss": 5.3105, + "step": 26455 + }, + { + "epoch": 0.538330078125, + "grad_norm": 18.10580825805664, + "learning_rate": 9.562545198310237e-06, + "loss": 5.1864, + "step": 26460 + }, + { + "epoch": 0.5384318033854166, + "grad_norm": 18.25111198425293, + "learning_rate": 9.562381693059186e-06, + "loss": 5.0158, + "step": 26465 + }, + { + "epoch": 0.5385335286458334, + "grad_norm": 26.4348201751709, + "learning_rate": 9.562218158655967e-06, + "loss": 5.2214, + "step": 26470 + }, + { + "epoch": 0.53863525390625, + "grad_norm": 14.20276927947998, + "learning_rate": 9.56205459510162e-06, + "loss": 5.0797, + "step": 26475 + }, + { + "epoch": 0.5387369791666666, + "grad_norm": 15.162766456604004, + "learning_rate": 9.561891002397192e-06, + "loss": 5.4143, + "step": 26480 + }, + { + "epoch": 0.5388387044270834, + "grad_norm": 16.98404312133789, + "learning_rate": 9.56172738054373e-06, + "loss": 5.1451, + "step": 26485 + }, + { + "epoch": 0.5389404296875, + "grad_norm": 13.030374526977539, + "learning_rate": 9.561563729542277e-06, + "loss": 5.1236, + "step": 26490 + }, + { + "epoch": 0.5390421549479166, + "grad_norm": 17.636512756347656, + "learning_rate": 9.56140004939388e-06, + "loss": 5.3094, + "step": 26495 + }, + { + "epoch": 0.5391438802083334, + "grad_norm": 14.616223335266113, + "learning_rate": 9.561236340099586e-06, + "loss": 5.3216, + "step": 26500 + }, + { + "epoch": 0.53924560546875, + "grad_norm": 13.428364753723145, + "learning_rate": 9.561072601660439e-06, + "loss": 5.1717, + "step": 26505 + }, + { + "epoch": 0.5393473307291666, + "grad_norm": 15.786961555480957, + "learning_rate": 9.560908834077484e-06, + "loss": 5.0634, + "step": 26510 + }, + { + "epoch": 0.5394490559895834, + "grad_norm": 14.973657608032227, + "learning_rate": 9.560745037351772e-06, + "loss": 5.5255, + "step": 26515 + }, + { + "epoch": 0.53955078125, + "grad_norm": 16.808565139770508, + "learning_rate": 9.560581211484346e-06, + "loss": 4.9806, + "step": 26520 + }, + { + "epoch": 0.5396525065104166, + "grad_norm": 13.260862350463867, + "learning_rate": 9.560417356476254e-06, + "loss": 5.2235, + "step": 26525 + }, + { + "epoch": 0.5397542317708334, + "grad_norm": 13.713059425354004, + "learning_rate": 9.560253472328542e-06, + "loss": 4.936, + "step": 26530 + }, + { + "epoch": 0.53985595703125, + "grad_norm": 17.95842170715332, + "learning_rate": 9.560089559042259e-06, + "loss": 5.1598, + "step": 26535 + }, + { + "epoch": 0.5399576822916666, + "grad_norm": 17.60909080505371, + "learning_rate": 9.55992561661845e-06, + "loss": 5.0987, + "step": 26540 + }, + { + "epoch": 0.5400594075520834, + "grad_norm": 17.780290603637695, + "learning_rate": 9.559761645058166e-06, + "loss": 4.9483, + "step": 26545 + }, + { + "epoch": 0.5401611328125, + "grad_norm": 18.6614933013916, + "learning_rate": 9.55959764436245e-06, + "loss": 5.1872, + "step": 26550 + }, + { + "epoch": 0.5402628580729166, + "grad_norm": 18.375354766845703, + "learning_rate": 9.559433614532355e-06, + "loss": 4.9829, + "step": 26555 + }, + { + "epoch": 0.5403645833333334, + "grad_norm": 19.013460159301758, + "learning_rate": 9.559269555568924e-06, + "loss": 5.1527, + "step": 26560 + }, + { + "epoch": 0.54046630859375, + "grad_norm": 16.471134185791016, + "learning_rate": 9.55910546747321e-06, + "loss": 5.2763, + "step": 26565 + }, + { + "epoch": 0.5405680338541666, + "grad_norm": 12.8566255569458, + "learning_rate": 9.558941350246258e-06, + "loss": 5.2986, + "step": 26570 + }, + { + "epoch": 0.5406697591145834, + "grad_norm": 14.897216796875, + "learning_rate": 9.558777203889117e-06, + "loss": 4.9546, + "step": 26575 + }, + { + "epoch": 0.540771484375, + "grad_norm": 12.941911697387695, + "learning_rate": 9.558613028402838e-06, + "loss": 5.199, + "step": 26580 + }, + { + "epoch": 0.5408732096354166, + "grad_norm": 12.399489402770996, + "learning_rate": 9.55844882378847e-06, + "loss": 4.9753, + "step": 26585 + }, + { + "epoch": 0.5409749348958334, + "grad_norm": 18.804523468017578, + "learning_rate": 9.55828459004706e-06, + "loss": 5.3007, + "step": 26590 + }, + { + "epoch": 0.54107666015625, + "grad_norm": 15.61961555480957, + "learning_rate": 9.558120327179658e-06, + "loss": 5.1078, + "step": 26595 + }, + { + "epoch": 0.5411783854166666, + "grad_norm": 20.111339569091797, + "learning_rate": 9.557956035187314e-06, + "loss": 5.2396, + "step": 26600 + }, + { + "epoch": 0.5412801106770834, + "grad_norm": 18.956966400146484, + "learning_rate": 9.55779171407108e-06, + "loss": 4.9354, + "step": 26605 + }, + { + "epoch": 0.5413818359375, + "grad_norm": 17.915239334106445, + "learning_rate": 9.557627363832002e-06, + "loss": 5.3949, + "step": 26610 + }, + { + "epoch": 0.5414835611979166, + "grad_norm": 17.38917350769043, + "learning_rate": 9.557462984471132e-06, + "loss": 5.1034, + "step": 26615 + }, + { + "epoch": 0.5415852864583334, + "grad_norm": 18.833330154418945, + "learning_rate": 9.55729857598952e-06, + "loss": 5.0857, + "step": 26620 + }, + { + "epoch": 0.54168701171875, + "grad_norm": 13.087908744812012, + "learning_rate": 9.557134138388218e-06, + "loss": 5.0489, + "step": 26625 + }, + { + "epoch": 0.5417887369791666, + "grad_norm": 23.965404510498047, + "learning_rate": 9.556969671668274e-06, + "loss": 5.3237, + "step": 26630 + }, + { + "epoch": 0.5418904622395834, + "grad_norm": 17.45770835876465, + "learning_rate": 9.556805175830741e-06, + "loss": 4.9842, + "step": 26635 + }, + { + "epoch": 0.5419921875, + "grad_norm": 13.3431396484375, + "learning_rate": 9.55664065087667e-06, + "loss": 5.3284, + "step": 26640 + }, + { + "epoch": 0.5420939127604166, + "grad_norm": 14.176102638244629, + "learning_rate": 9.556476096807111e-06, + "loss": 5.5907, + "step": 26645 + }, + { + "epoch": 0.5421956380208334, + "grad_norm": 20.157339096069336, + "learning_rate": 9.556311513623116e-06, + "loss": 5.1463, + "step": 26650 + }, + { + "epoch": 0.54229736328125, + "grad_norm": 17.228076934814453, + "learning_rate": 9.556146901325737e-06, + "loss": 4.9435, + "step": 26655 + }, + { + "epoch": 0.5423990885416666, + "grad_norm": 20.60755729675293, + "learning_rate": 9.555982259916026e-06, + "loss": 5.2588, + "step": 26660 + }, + { + "epoch": 0.5425008138020834, + "grad_norm": 12.03007984161377, + "learning_rate": 9.555817589395033e-06, + "loss": 5.2317, + "step": 26665 + }, + { + "epoch": 0.5426025390625, + "grad_norm": 16.21941566467285, + "learning_rate": 9.555652889763814e-06, + "loss": 5.1766, + "step": 26670 + }, + { + "epoch": 0.5427042643229166, + "grad_norm": 16.906177520751953, + "learning_rate": 9.555488161023419e-06, + "loss": 4.8977, + "step": 26675 + }, + { + "epoch": 0.5428059895833334, + "grad_norm": 13.605812072753906, + "learning_rate": 9.5553234031749e-06, + "loss": 5.243, + "step": 26680 + }, + { + "epoch": 0.54290771484375, + "grad_norm": 15.825494766235352, + "learning_rate": 9.555158616219311e-06, + "loss": 5.1459, + "step": 26685 + }, + { + "epoch": 0.5430094401041666, + "grad_norm": 22.884233474731445, + "learning_rate": 9.554993800157704e-06, + "loss": 5.0898, + "step": 26690 + }, + { + "epoch": 0.5431111653645834, + "grad_norm": 12.657238960266113, + "learning_rate": 9.554828954991132e-06, + "loss": 4.9276, + "step": 26695 + }, + { + "epoch": 0.543212890625, + "grad_norm": 12.692980766296387, + "learning_rate": 9.554664080720651e-06, + "loss": 5.0593, + "step": 26700 + }, + { + "epoch": 0.5433146158854166, + "grad_norm": 17.28712272644043, + "learning_rate": 9.55449917734731e-06, + "loss": 5.3762, + "step": 26705 + }, + { + "epoch": 0.5434163411458334, + "grad_norm": 20.582414627075195, + "learning_rate": 9.554334244872166e-06, + "loss": 5.3892, + "step": 26710 + }, + { + "epoch": 0.54351806640625, + "grad_norm": 22.4606990814209, + "learning_rate": 9.55416928329627e-06, + "loss": 4.9458, + "step": 26715 + }, + { + "epoch": 0.5436197916666666, + "grad_norm": 13.74673080444336, + "learning_rate": 9.55400429262068e-06, + "loss": 5.2392, + "step": 26720 + }, + { + "epoch": 0.5437215169270834, + "grad_norm": 14.792511940002441, + "learning_rate": 9.55383927284645e-06, + "loss": 5.2777, + "step": 26725 + }, + { + "epoch": 0.5438232421875, + "grad_norm": 15.490012168884277, + "learning_rate": 9.553674223974629e-06, + "loss": 5.1029, + "step": 26730 + }, + { + "epoch": 0.5439249674479166, + "grad_norm": 12.77464771270752, + "learning_rate": 9.553509146006277e-06, + "loss": 5.226, + "step": 26735 + }, + { + "epoch": 0.5440266927083334, + "grad_norm": 13.453810691833496, + "learning_rate": 9.553344038942447e-06, + "loss": 5.2066, + "step": 26740 + }, + { + "epoch": 0.54412841796875, + "grad_norm": 17.285869598388672, + "learning_rate": 9.553178902784195e-06, + "loss": 5.1114, + "step": 26745 + }, + { + "epoch": 0.5442301432291666, + "grad_norm": 14.472302436828613, + "learning_rate": 9.553013737532574e-06, + "loss": 5.2299, + "step": 26750 + }, + { + "epoch": 0.5443318684895834, + "grad_norm": 15.55294418334961, + "learning_rate": 9.55284854318864e-06, + "loss": 5.1523, + "step": 26755 + }, + { + "epoch": 0.54443359375, + "grad_norm": 20.630992889404297, + "learning_rate": 9.552683319753451e-06, + "loss": 5.2034, + "step": 26760 + }, + { + "epoch": 0.5445353190104166, + "grad_norm": 13.028382301330566, + "learning_rate": 9.552518067228058e-06, + "loss": 5.0814, + "step": 26765 + }, + { + "epoch": 0.5446370442708334, + "grad_norm": 17.248258590698242, + "learning_rate": 9.552352785613522e-06, + "loss": 5.2215, + "step": 26770 + }, + { + "epoch": 0.54473876953125, + "grad_norm": 14.795450210571289, + "learning_rate": 9.552187474910895e-06, + "loss": 5.1418, + "step": 26775 + }, + { + "epoch": 0.5448404947916666, + "grad_norm": 20.863554000854492, + "learning_rate": 9.552022135121236e-06, + "loss": 5.2181, + "step": 26780 + }, + { + "epoch": 0.5449422200520834, + "grad_norm": 13.637272834777832, + "learning_rate": 9.551856766245601e-06, + "loss": 5.2608, + "step": 26785 + }, + { + "epoch": 0.5450439453125, + "grad_norm": 19.20461654663086, + "learning_rate": 9.551691368285047e-06, + "loss": 5.259, + "step": 26790 + }, + { + "epoch": 0.5451456705729166, + "grad_norm": 19.24713706970215, + "learning_rate": 9.55152594124063e-06, + "loss": 4.9975, + "step": 26795 + }, + { + "epoch": 0.5452473958333334, + "grad_norm": 16.941513061523438, + "learning_rate": 9.551360485113404e-06, + "loss": 5.566, + "step": 26800 + }, + { + "epoch": 0.54534912109375, + "grad_norm": 16.415119171142578, + "learning_rate": 9.551194999904432e-06, + "loss": 5.1518, + "step": 26805 + }, + { + "epoch": 0.5454508463541666, + "grad_norm": 15.19627571105957, + "learning_rate": 9.55102948561477e-06, + "loss": 4.7296, + "step": 26810 + }, + { + "epoch": 0.5455525716145834, + "grad_norm": 17.172508239746094, + "learning_rate": 9.55086394224547e-06, + "loss": 5.1165, + "step": 26815 + }, + { + "epoch": 0.545654296875, + "grad_norm": 18.03996467590332, + "learning_rate": 9.550698369797598e-06, + "loss": 5.0591, + "step": 26820 + }, + { + "epoch": 0.5457560221354166, + "grad_norm": 18.231170654296875, + "learning_rate": 9.550532768272206e-06, + "loss": 5.1572, + "step": 26825 + }, + { + "epoch": 0.5458577473958334, + "grad_norm": 15.769381523132324, + "learning_rate": 9.550367137670353e-06, + "loss": 4.9658, + "step": 26830 + }, + { + "epoch": 0.54595947265625, + "grad_norm": 17.027015686035156, + "learning_rate": 9.550201477993101e-06, + "loss": 5.3435, + "step": 26835 + }, + { + "epoch": 0.5460611979166666, + "grad_norm": 26.541133880615234, + "learning_rate": 9.550035789241505e-06, + "loss": 5.2127, + "step": 26840 + }, + { + "epoch": 0.5461629231770834, + "grad_norm": 15.776599884033203, + "learning_rate": 9.549870071416625e-06, + "loss": 5.1809, + "step": 26845 + }, + { + "epoch": 0.5462646484375, + "grad_norm": 18.045167922973633, + "learning_rate": 9.549704324519519e-06, + "loss": 5.1464, + "step": 26850 + }, + { + "epoch": 0.5463663736979166, + "grad_norm": 14.803946495056152, + "learning_rate": 9.549538548551247e-06, + "loss": 5.0443, + "step": 26855 + }, + { + "epoch": 0.5464680989583334, + "grad_norm": 15.582239151000977, + "learning_rate": 9.549372743512869e-06, + "loss": 5.0198, + "step": 26860 + }, + { + "epoch": 0.54656982421875, + "grad_norm": 16.01943016052246, + "learning_rate": 9.549206909405441e-06, + "loss": 4.9749, + "step": 26865 + }, + { + "epoch": 0.5466715494791666, + "grad_norm": 13.10030746459961, + "learning_rate": 9.549041046230025e-06, + "loss": 4.9815, + "step": 26870 + }, + { + "epoch": 0.5467732747395834, + "grad_norm": 14.711336135864258, + "learning_rate": 9.548875153987682e-06, + "loss": 4.8902, + "step": 26875 + }, + { + "epoch": 0.546875, + "grad_norm": 26.964513778686523, + "learning_rate": 9.548709232679471e-06, + "loss": 5.1722, + "step": 26880 + }, + { + "epoch": 0.5469767252604166, + "grad_norm": 15.422977447509766, + "learning_rate": 9.548543282306452e-06, + "loss": 5.0948, + "step": 26885 + }, + { + "epoch": 0.5470784505208334, + "grad_norm": 15.75989818572998, + "learning_rate": 9.548377302869685e-06, + "loss": 5.0411, + "step": 26890 + }, + { + "epoch": 0.54718017578125, + "grad_norm": 15.610610961914062, + "learning_rate": 9.54821129437023e-06, + "loss": 5.1135, + "step": 26895 + }, + { + "epoch": 0.5472819010416666, + "grad_norm": 25.222230911254883, + "learning_rate": 9.54804525680915e-06, + "loss": 5.5444, + "step": 26900 + }, + { + "epoch": 0.5473836263020834, + "grad_norm": 14.268513679504395, + "learning_rate": 9.547879190187504e-06, + "loss": 5.0709, + "step": 26905 + }, + { + "epoch": 0.5474853515625, + "grad_norm": 13.517364501953125, + "learning_rate": 9.547713094506353e-06, + "loss": 5.2869, + "step": 26910 + }, + { + "epoch": 0.5475870768229166, + "grad_norm": 16.23525047302246, + "learning_rate": 9.54754696976676e-06, + "loss": 5.2106, + "step": 26915 + }, + { + "epoch": 0.5476888020833334, + "grad_norm": 17.82880401611328, + "learning_rate": 9.547380815969784e-06, + "loss": 4.879, + "step": 26920 + }, + { + "epoch": 0.54779052734375, + "grad_norm": 13.899118423461914, + "learning_rate": 9.54721463311649e-06, + "loss": 5.3631, + "step": 26925 + }, + { + "epoch": 0.5478922526041666, + "grad_norm": 16.054370880126953, + "learning_rate": 9.547048421207938e-06, + "loss": 5.1594, + "step": 26930 + }, + { + "epoch": 0.5479939778645834, + "grad_norm": 19.103317260742188, + "learning_rate": 9.54688218024519e-06, + "loss": 5.3971, + "step": 26935 + }, + { + "epoch": 0.548095703125, + "grad_norm": 24.29625129699707, + "learning_rate": 9.546715910229305e-06, + "loss": 5.4447, + "step": 26940 + }, + { + "epoch": 0.5481974283854166, + "grad_norm": 15.388321876525879, + "learning_rate": 9.546549611161353e-06, + "loss": 5.0739, + "step": 26945 + }, + { + "epoch": 0.5482991536458334, + "grad_norm": 15.324467658996582, + "learning_rate": 9.546383283042389e-06, + "loss": 5.1785, + "step": 26950 + }, + { + "epoch": 0.54840087890625, + "grad_norm": 16.68899154663086, + "learning_rate": 9.54621692587348e-06, + "loss": 5.1936, + "step": 26955 + }, + { + "epoch": 0.5485026041666666, + "grad_norm": 16.052703857421875, + "learning_rate": 9.546050539655688e-06, + "loss": 5.5129, + "step": 26960 + }, + { + "epoch": 0.5486043294270834, + "grad_norm": 17.224966049194336, + "learning_rate": 9.545884124390077e-06, + "loss": 5.0747, + "step": 26965 + }, + { + "epoch": 0.5487060546875, + "grad_norm": 14.837730407714844, + "learning_rate": 9.545717680077709e-06, + "loss": 5.0291, + "step": 26970 + }, + { + "epoch": 0.5488077799479166, + "grad_norm": 12.372455596923828, + "learning_rate": 9.54555120671965e-06, + "loss": 5.2533, + "step": 26975 + }, + { + "epoch": 0.5489095052083334, + "grad_norm": 19.82147979736328, + "learning_rate": 9.545384704316959e-06, + "loss": 5.5318, + "step": 26980 + }, + { + "epoch": 0.54901123046875, + "grad_norm": 17.838687896728516, + "learning_rate": 9.545218172870702e-06, + "loss": 5.3298, + "step": 26985 + }, + { + "epoch": 0.5491129557291666, + "grad_norm": 15.424943923950195, + "learning_rate": 9.545051612381946e-06, + "loss": 5.1491, + "step": 26990 + }, + { + "epoch": 0.5492146809895834, + "grad_norm": 13.704370498657227, + "learning_rate": 9.544885022851753e-06, + "loss": 5.2202, + "step": 26995 + }, + { + "epoch": 0.54931640625, + "grad_norm": 22.191884994506836, + "learning_rate": 9.544718404281185e-06, + "loss": 5.4791, + "step": 27000 + }, + { + "epoch": 0.5494181315104166, + "grad_norm": 14.598846435546875, + "learning_rate": 9.544551756671313e-06, + "loss": 5.0622, + "step": 27005 + }, + { + "epoch": 0.5495198567708334, + "grad_norm": 18.061275482177734, + "learning_rate": 9.544385080023194e-06, + "loss": 5.1127, + "step": 27010 + }, + { + "epoch": 0.54962158203125, + "grad_norm": 18.828842163085938, + "learning_rate": 9.544218374337898e-06, + "loss": 5.0377, + "step": 27015 + }, + { + "epoch": 0.5497233072916666, + "grad_norm": 24.529342651367188, + "learning_rate": 9.54405163961649e-06, + "loss": 5.183, + "step": 27020 + }, + { + "epoch": 0.5498250325520834, + "grad_norm": 14.576648712158203, + "learning_rate": 9.543884875860035e-06, + "loss": 5.1233, + "step": 27025 + }, + { + "epoch": 0.5499267578125, + "grad_norm": 18.487398147583008, + "learning_rate": 9.543718083069596e-06, + "loss": 5.0812, + "step": 27030 + }, + { + "epoch": 0.5500284830729166, + "grad_norm": 17.41339874267578, + "learning_rate": 9.543551261246242e-06, + "loss": 5.1243, + "step": 27035 + }, + { + "epoch": 0.5501302083333334, + "grad_norm": 17.07828140258789, + "learning_rate": 9.54338441039104e-06, + "loss": 5.1731, + "step": 27040 + }, + { + "epoch": 0.55023193359375, + "grad_norm": 14.513949394226074, + "learning_rate": 9.54321753050505e-06, + "loss": 4.9508, + "step": 27045 + }, + { + "epoch": 0.5503336588541666, + "grad_norm": 18.91312026977539, + "learning_rate": 9.543050621589343e-06, + "loss": 5.1769, + "step": 27050 + }, + { + "epoch": 0.5504353841145834, + "grad_norm": 14.691702842712402, + "learning_rate": 9.542883683644986e-06, + "loss": 5.0944, + "step": 27055 + }, + { + "epoch": 0.550537109375, + "grad_norm": 17.98108673095703, + "learning_rate": 9.542716716673043e-06, + "loss": 5.2633, + "step": 27060 + }, + { + "epoch": 0.5506388346354166, + "grad_norm": 22.763261795043945, + "learning_rate": 9.542549720674583e-06, + "loss": 4.8149, + "step": 27065 + }, + { + "epoch": 0.5507405598958334, + "grad_norm": 17.114463806152344, + "learning_rate": 9.542382695650675e-06, + "loss": 4.8991, + "step": 27070 + }, + { + "epoch": 0.55084228515625, + "grad_norm": 14.816930770874023, + "learning_rate": 9.54221564160238e-06, + "loss": 5.2297, + "step": 27075 + }, + { + "epoch": 0.5509440104166666, + "grad_norm": 15.567892074584961, + "learning_rate": 9.54204855853077e-06, + "loss": 5.1698, + "step": 27080 + }, + { + "epoch": 0.5510457356770834, + "grad_norm": 20.616226196289062, + "learning_rate": 9.541881446436911e-06, + "loss": 4.9895, + "step": 27085 + }, + { + "epoch": 0.5511474609375, + "grad_norm": 13.073633193969727, + "learning_rate": 9.541714305321873e-06, + "loss": 5.1296, + "step": 27090 + }, + { + "epoch": 0.5512491861979166, + "grad_norm": 23.874176025390625, + "learning_rate": 9.54154713518672e-06, + "loss": 5.1044, + "step": 27095 + }, + { + "epoch": 0.5513509114583334, + "grad_norm": 20.261089324951172, + "learning_rate": 9.541379936032523e-06, + "loss": 5.1946, + "step": 27100 + }, + { + "epoch": 0.55145263671875, + "grad_norm": 19.60373878479004, + "learning_rate": 9.54121270786035e-06, + "loss": 5.0628, + "step": 27105 + }, + { + "epoch": 0.5515543619791666, + "grad_norm": 16.31322479248047, + "learning_rate": 9.54104545067127e-06, + "loss": 5.1376, + "step": 27110 + }, + { + "epoch": 0.5516560872395834, + "grad_norm": 14.691543579101562, + "learning_rate": 9.540878164466351e-06, + "loss": 5.155, + "step": 27115 + }, + { + "epoch": 0.5517578125, + "grad_norm": 20.900062561035156, + "learning_rate": 9.540710849246662e-06, + "loss": 5.2235, + "step": 27120 + }, + { + "epoch": 0.5518595377604166, + "grad_norm": 18.63294792175293, + "learning_rate": 9.540543505013271e-06, + "loss": 5.6444, + "step": 27125 + }, + { + "epoch": 0.5519612630208334, + "grad_norm": 17.384218215942383, + "learning_rate": 9.540376131767251e-06, + "loss": 5.307, + "step": 27130 + }, + { + "epoch": 0.55206298828125, + "grad_norm": 16.457252502441406, + "learning_rate": 9.540208729509666e-06, + "loss": 5.1421, + "step": 27135 + }, + { + "epoch": 0.5521647135416666, + "grad_norm": 16.20568084716797, + "learning_rate": 9.54004129824159e-06, + "loss": 5.1818, + "step": 27140 + }, + { + "epoch": 0.5522664388020834, + "grad_norm": 23.607568740844727, + "learning_rate": 9.53987383796409e-06, + "loss": 5.0449, + "step": 27145 + }, + { + "epoch": 0.5523681640625, + "grad_norm": 16.48710060119629, + "learning_rate": 9.539706348678239e-06, + "loss": 5.2573, + "step": 27150 + }, + { + "epoch": 0.5524698893229166, + "grad_norm": 20.045686721801758, + "learning_rate": 9.539538830385104e-06, + "loss": 5.1067, + "step": 27155 + }, + { + "epoch": 0.5525716145833334, + "grad_norm": 16.919490814208984, + "learning_rate": 9.53937128308576e-06, + "loss": 4.8147, + "step": 27160 + }, + { + "epoch": 0.55267333984375, + "grad_norm": 12.58271312713623, + "learning_rate": 9.53920370678127e-06, + "loss": 5.1922, + "step": 27165 + }, + { + "epoch": 0.5527750651041666, + "grad_norm": 17.9115047454834, + "learning_rate": 9.539036101472712e-06, + "loss": 5.1182, + "step": 27170 + }, + { + "epoch": 0.5528767903645834, + "grad_norm": 18.979394912719727, + "learning_rate": 9.538868467161153e-06, + "loss": 5.1179, + "step": 27175 + }, + { + "epoch": 0.552978515625, + "grad_norm": 19.46761131286621, + "learning_rate": 9.538700803847668e-06, + "loss": 5.5729, + "step": 27180 + }, + { + "epoch": 0.5530802408854166, + "grad_norm": 11.781542778015137, + "learning_rate": 9.538533111533323e-06, + "loss": 5.0331, + "step": 27185 + }, + { + "epoch": 0.5531819661458334, + "grad_norm": 15.907182693481445, + "learning_rate": 9.538365390219193e-06, + "loss": 5.3107, + "step": 27190 + }, + { + "epoch": 0.55328369140625, + "grad_norm": 16.973440170288086, + "learning_rate": 9.538197639906347e-06, + "loss": 5.0509, + "step": 27195 + }, + { + "epoch": 0.5533854166666666, + "grad_norm": 18.173776626586914, + "learning_rate": 9.538029860595861e-06, + "loss": 5.0682, + "step": 27200 + }, + { + "epoch": 0.5534871419270834, + "grad_norm": 13.891077995300293, + "learning_rate": 9.537862052288804e-06, + "loss": 5.2514, + "step": 27205 + }, + { + "epoch": 0.5535888671875, + "grad_norm": 18.783855438232422, + "learning_rate": 9.537694214986248e-06, + "loss": 5.0563, + "step": 27210 + }, + { + "epoch": 0.5536905924479166, + "grad_norm": 17.027952194213867, + "learning_rate": 9.537526348689267e-06, + "loss": 5.4404, + "step": 27215 + }, + { + "epoch": 0.5537923177083334, + "grad_norm": 22.57168960571289, + "learning_rate": 9.537358453398935e-06, + "loss": 5.3542, + "step": 27220 + }, + { + "epoch": 0.55389404296875, + "grad_norm": 18.03928565979004, + "learning_rate": 9.537190529116322e-06, + "loss": 5.1322, + "step": 27225 + }, + { + "epoch": 0.5539957682291666, + "grad_norm": 16.32562828063965, + "learning_rate": 9.5370225758425e-06, + "loss": 5.1416, + "step": 27230 + }, + { + "epoch": 0.5540974934895834, + "grad_norm": 15.112764358520508, + "learning_rate": 9.536854593578546e-06, + "loss": 5.1893, + "step": 27235 + }, + { + "epoch": 0.55419921875, + "grad_norm": 21.594690322875977, + "learning_rate": 9.536686582325529e-06, + "loss": 4.9743, + "step": 27240 + }, + { + "epoch": 0.5543009440104166, + "grad_norm": 21.825544357299805, + "learning_rate": 9.536518542084527e-06, + "loss": 5.3021, + "step": 27245 + }, + { + "epoch": 0.5544026692708334, + "grad_norm": 17.28818702697754, + "learning_rate": 9.536350472856611e-06, + "loss": 5.3057, + "step": 27250 + }, + { + "epoch": 0.55450439453125, + "grad_norm": 18.047109603881836, + "learning_rate": 9.536182374642855e-06, + "loss": 5.0492, + "step": 27255 + }, + { + "epoch": 0.5546061197916666, + "grad_norm": 15.315347671508789, + "learning_rate": 9.536014247444334e-06, + "loss": 5.3046, + "step": 27260 + }, + { + "epoch": 0.5547078450520834, + "grad_norm": 12.819160461425781, + "learning_rate": 9.535846091262122e-06, + "loss": 5.3735, + "step": 27265 + }, + { + "epoch": 0.5548095703125, + "grad_norm": 14.679365158081055, + "learning_rate": 9.535677906097293e-06, + "loss": 4.9274, + "step": 27270 + }, + { + "epoch": 0.5549112955729166, + "grad_norm": 16.53300666809082, + "learning_rate": 9.535509691950923e-06, + "loss": 5.1565, + "step": 27275 + }, + { + "epoch": 0.5550130208333334, + "grad_norm": 20.922496795654297, + "learning_rate": 9.535341448824086e-06, + "loss": 5.459, + "step": 27280 + }, + { + "epoch": 0.55511474609375, + "grad_norm": 15.094378471374512, + "learning_rate": 9.535173176717855e-06, + "loss": 5.0482, + "step": 27285 + }, + { + "epoch": 0.5552164713541666, + "grad_norm": 18.391326904296875, + "learning_rate": 9.535004875633308e-06, + "loss": 4.9662, + "step": 27290 + }, + { + "epoch": 0.5553181966145834, + "grad_norm": 16.96718978881836, + "learning_rate": 9.53483654557152e-06, + "loss": 5.1191, + "step": 27295 + }, + { + "epoch": 0.555419921875, + "grad_norm": 14.991053581237793, + "learning_rate": 9.534668186533565e-06, + "loss": 5.1426, + "step": 27300 + }, + { + "epoch": 0.5555216471354166, + "grad_norm": 17.272319793701172, + "learning_rate": 9.53449979852052e-06, + "loss": 5.0985, + "step": 27305 + }, + { + "epoch": 0.5556233723958334, + "grad_norm": 18.214313507080078, + "learning_rate": 9.534331381533461e-06, + "loss": 5.105, + "step": 27310 + }, + { + "epoch": 0.55572509765625, + "grad_norm": 22.814537048339844, + "learning_rate": 9.534162935573464e-06, + "loss": 5.4466, + "step": 27315 + }, + { + "epoch": 0.5558268229166666, + "grad_norm": 15.799997329711914, + "learning_rate": 9.533994460641605e-06, + "loss": 5.3829, + "step": 27320 + }, + { + "epoch": 0.5559285481770834, + "grad_norm": 18.909652709960938, + "learning_rate": 9.53382595673896e-06, + "loss": 4.9933, + "step": 27325 + }, + { + "epoch": 0.5560302734375, + "grad_norm": 20.526660919189453, + "learning_rate": 9.533657423866607e-06, + "loss": 4.8974, + "step": 27330 + }, + { + "epoch": 0.5561319986979166, + "grad_norm": 18.46160316467285, + "learning_rate": 9.533488862025623e-06, + "loss": 5.0116, + "step": 27335 + }, + { + "epoch": 0.5562337239583334, + "grad_norm": 18.910091400146484, + "learning_rate": 9.533320271217084e-06, + "loss": 5.1097, + "step": 27340 + }, + { + "epoch": 0.55633544921875, + "grad_norm": 14.7957763671875, + "learning_rate": 9.533151651442067e-06, + "loss": 5.1146, + "step": 27345 + }, + { + "epoch": 0.5564371744791666, + "grad_norm": 13.549795150756836, + "learning_rate": 9.53298300270165e-06, + "loss": 5.0651, + "step": 27350 + }, + { + "epoch": 0.5565388997395834, + "grad_norm": 12.531173706054688, + "learning_rate": 9.53281432499691e-06, + "loss": 5.2026, + "step": 27355 + }, + { + "epoch": 0.556640625, + "grad_norm": 19.41629981994629, + "learning_rate": 9.532645618328926e-06, + "loss": 5.1342, + "step": 27360 + }, + { + "epoch": 0.5567423502604166, + "grad_norm": 18.257915496826172, + "learning_rate": 9.532476882698775e-06, + "loss": 5.1923, + "step": 27365 + }, + { + "epoch": 0.5568440755208334, + "grad_norm": 16.321544647216797, + "learning_rate": 9.532308118107537e-06, + "loss": 5.0458, + "step": 27370 + }, + { + "epoch": 0.55694580078125, + "grad_norm": 15.712728500366211, + "learning_rate": 9.53213932455629e-06, + "loss": 4.784, + "step": 27375 + }, + { + "epoch": 0.5570475260416666, + "grad_norm": 17.417930603027344, + "learning_rate": 9.531970502046107e-06, + "loss": 5.1181, + "step": 27380 + }, + { + "epoch": 0.5571492513020834, + "grad_norm": 13.493609428405762, + "learning_rate": 9.531801650578074e-06, + "loss": 5.1179, + "step": 27385 + }, + { + "epoch": 0.5572509765625, + "grad_norm": 21.312015533447266, + "learning_rate": 9.531632770153267e-06, + "loss": 4.8833, + "step": 27390 + }, + { + "epoch": 0.5573527018229166, + "grad_norm": 13.58987045288086, + "learning_rate": 9.531463860772762e-06, + "loss": 5.2297, + "step": 27395 + }, + { + "epoch": 0.5574544270833334, + "grad_norm": 12.36396598815918, + "learning_rate": 9.531294922437646e-06, + "loss": 5.0186, + "step": 27400 + }, + { + "epoch": 0.55755615234375, + "grad_norm": 23.58307647705078, + "learning_rate": 9.531125955148993e-06, + "loss": 5.3497, + "step": 27405 + }, + { + "epoch": 0.5576578776041666, + "grad_norm": 20.527196884155273, + "learning_rate": 9.53095695890788e-06, + "loss": 4.9658, + "step": 27410 + }, + { + "epoch": 0.5577596028645834, + "grad_norm": 15.360477447509766, + "learning_rate": 9.530787933715393e-06, + "loss": 5.2385, + "step": 27415 + }, + { + "epoch": 0.557861328125, + "grad_norm": 15.043216705322266, + "learning_rate": 9.53061887957261e-06, + "loss": 5.1774, + "step": 27420 + }, + { + "epoch": 0.5579630533854166, + "grad_norm": 11.783447265625, + "learning_rate": 9.53044979648061e-06, + "loss": 4.9732, + "step": 27425 + }, + { + "epoch": 0.5580647786458334, + "grad_norm": 18.694660186767578, + "learning_rate": 9.530280684440473e-06, + "loss": 5.1563, + "step": 27430 + }, + { + "epoch": 0.55816650390625, + "grad_norm": 23.026206970214844, + "learning_rate": 9.530111543453282e-06, + "loss": 5.034, + "step": 27435 + }, + { + "epoch": 0.5582682291666666, + "grad_norm": 14.396161079406738, + "learning_rate": 9.529942373520113e-06, + "loss": 5.223, + "step": 27440 + }, + { + "epoch": 0.5583699544270834, + "grad_norm": 19.363262176513672, + "learning_rate": 9.529773174642054e-06, + "loss": 5.0651, + "step": 27445 + }, + { + "epoch": 0.5584716796875, + "grad_norm": 16.652769088745117, + "learning_rate": 9.52960394682018e-06, + "loss": 5.2532, + "step": 27450 + }, + { + "epoch": 0.5585734049479166, + "grad_norm": 20.595529556274414, + "learning_rate": 9.529434690055577e-06, + "loss": 5.1483, + "step": 27455 + }, + { + "epoch": 0.5586751302083334, + "grad_norm": 19.050506591796875, + "learning_rate": 9.529265404349322e-06, + "loss": 5.2721, + "step": 27460 + }, + { + "epoch": 0.55877685546875, + "grad_norm": 16.3131046295166, + "learning_rate": 9.529096089702499e-06, + "loss": 5.167, + "step": 27465 + }, + { + "epoch": 0.5588785807291666, + "grad_norm": 15.787186622619629, + "learning_rate": 9.528926746116192e-06, + "loss": 4.8443, + "step": 27470 + }, + { + "epoch": 0.5589803059895834, + "grad_norm": 13.009543418884277, + "learning_rate": 9.52875737359148e-06, + "loss": 5.0998, + "step": 27475 + }, + { + "epoch": 0.55908203125, + "grad_norm": 22.33835792541504, + "learning_rate": 9.528587972129444e-06, + "loss": 4.9372, + "step": 27480 + }, + { + "epoch": 0.5591837565104166, + "grad_norm": 17.24803352355957, + "learning_rate": 9.52841854173117e-06, + "loss": 5.2964, + "step": 27485 + }, + { + "epoch": 0.5592854817708334, + "grad_norm": 15.933119773864746, + "learning_rate": 9.528249082397738e-06, + "loss": 5.1274, + "step": 27490 + }, + { + "epoch": 0.55938720703125, + "grad_norm": 16.222925186157227, + "learning_rate": 9.528079594130233e-06, + "loss": 5.1166, + "step": 27495 + }, + { + "epoch": 0.5594889322916666, + "grad_norm": 17.060894012451172, + "learning_rate": 9.527910076929736e-06, + "loss": 5.1266, + "step": 27500 + }, + { + "epoch": 0.5595906575520834, + "grad_norm": 17.06098747253418, + "learning_rate": 9.527740530797331e-06, + "loss": 5.1815, + "step": 27505 + }, + { + "epoch": 0.5596923828125, + "grad_norm": 17.68641471862793, + "learning_rate": 9.527570955734101e-06, + "loss": 4.9192, + "step": 27510 + }, + { + "epoch": 0.5597941080729166, + "grad_norm": 25.170183181762695, + "learning_rate": 9.52740135174113e-06, + "loss": 5.1346, + "step": 27515 + }, + { + "epoch": 0.5598958333333334, + "grad_norm": 19.26778793334961, + "learning_rate": 9.527231718819504e-06, + "loss": 4.9881, + "step": 27520 + }, + { + "epoch": 0.55999755859375, + "grad_norm": 18.577592849731445, + "learning_rate": 9.527062056970301e-06, + "loss": 5.3232, + "step": 27525 + }, + { + "epoch": 0.5600992838541666, + "grad_norm": 13.401984214782715, + "learning_rate": 9.526892366194608e-06, + "loss": 5.1239, + "step": 27530 + }, + { + "epoch": 0.5602010091145834, + "grad_norm": 15.467497825622559, + "learning_rate": 9.526722646493514e-06, + "loss": 5.0661, + "step": 27535 + }, + { + "epoch": 0.560302734375, + "grad_norm": 19.165061950683594, + "learning_rate": 9.526552897868097e-06, + "loss": 5.2584, + "step": 27540 + }, + { + "epoch": 0.5604044596354166, + "grad_norm": 17.92266845703125, + "learning_rate": 9.526383120319442e-06, + "loss": 5.181, + "step": 27545 + }, + { + "epoch": 0.5605061848958334, + "grad_norm": 12.496971130371094, + "learning_rate": 9.526213313848637e-06, + "loss": 4.8993, + "step": 27550 + }, + { + "epoch": 0.56060791015625, + "grad_norm": 13.211152076721191, + "learning_rate": 9.526043478456768e-06, + "loss": 4.993, + "step": 27555 + }, + { + "epoch": 0.5607096354166666, + "grad_norm": 14.68026065826416, + "learning_rate": 9.525873614144915e-06, + "loss": 5.0096, + "step": 27560 + }, + { + "epoch": 0.5608113606770834, + "grad_norm": 15.9718017578125, + "learning_rate": 9.525703720914169e-06, + "loss": 5.0469, + "step": 27565 + }, + { + "epoch": 0.5609130859375, + "grad_norm": 17.70387840270996, + "learning_rate": 9.52553379876561e-06, + "loss": 5.551, + "step": 27570 + }, + { + "epoch": 0.5610148111979166, + "grad_norm": 21.05810546875, + "learning_rate": 9.52536384770033e-06, + "loss": 5.1296, + "step": 27575 + }, + { + "epoch": 0.5611165364583334, + "grad_norm": 13.798822402954102, + "learning_rate": 9.525193867719409e-06, + "loss": 5.0845, + "step": 27580 + }, + { + "epoch": 0.56121826171875, + "grad_norm": 16.200220108032227, + "learning_rate": 9.525023858823936e-06, + "loss": 5.3214, + "step": 27585 + }, + { + "epoch": 0.5613199869791666, + "grad_norm": 15.84520149230957, + "learning_rate": 9.524853821014998e-06, + "loss": 5.0158, + "step": 27590 + }, + { + "epoch": 0.5614217122395834, + "grad_norm": 21.077266693115234, + "learning_rate": 9.52468375429368e-06, + "loss": 5.1168, + "step": 27595 + }, + { + "epoch": 0.5615234375, + "grad_norm": 18.50090789794922, + "learning_rate": 9.524513658661067e-06, + "loss": 5.2815, + "step": 27600 + }, + { + "epoch": 0.5616251627604166, + "grad_norm": 15.369682312011719, + "learning_rate": 9.524343534118252e-06, + "loss": 5.2655, + "step": 27605 + }, + { + "epoch": 0.5617268880208334, + "grad_norm": 20.226200103759766, + "learning_rate": 9.524173380666316e-06, + "loss": 5.0093, + "step": 27610 + }, + { + "epoch": 0.56182861328125, + "grad_norm": 14.719326972961426, + "learning_rate": 9.524003198306346e-06, + "loss": 5.0278, + "step": 27615 + }, + { + "epoch": 0.5619303385416666, + "grad_norm": 20.80523109436035, + "learning_rate": 9.523832987039434e-06, + "loss": 5.2268, + "step": 27620 + }, + { + "epoch": 0.5620320638020834, + "grad_norm": 26.07901382446289, + "learning_rate": 9.523662746866667e-06, + "loss": 4.9723, + "step": 27625 + }, + { + "epoch": 0.5621337890625, + "grad_norm": 16.94738006591797, + "learning_rate": 9.523492477789128e-06, + "loss": 5.074, + "step": 27630 + }, + { + "epoch": 0.5622355143229166, + "grad_norm": 14.360316276550293, + "learning_rate": 9.52332217980791e-06, + "loss": 4.987, + "step": 27635 + }, + { + "epoch": 0.5623372395833334, + "grad_norm": 14.428003311157227, + "learning_rate": 9.523151852924097e-06, + "loss": 5.2608, + "step": 27640 + }, + { + "epoch": 0.56243896484375, + "grad_norm": 16.072067260742188, + "learning_rate": 9.522981497138782e-06, + "loss": 5.2811, + "step": 27645 + }, + { + "epoch": 0.5625406901041666, + "grad_norm": 20.311351776123047, + "learning_rate": 9.52281111245305e-06, + "loss": 5.2066, + "step": 27650 + }, + { + "epoch": 0.5626424153645834, + "grad_norm": 14.209953308105469, + "learning_rate": 9.522640698867991e-06, + "loss": 4.9217, + "step": 27655 + }, + { + "epoch": 0.562744140625, + "grad_norm": 13.359545707702637, + "learning_rate": 9.522470256384693e-06, + "loss": 5.1066, + "step": 27660 + }, + { + "epoch": 0.5628458658854166, + "grad_norm": 18.390905380249023, + "learning_rate": 9.522299785004246e-06, + "loss": 5.1104, + "step": 27665 + }, + { + "epoch": 0.5629475911458334, + "grad_norm": 12.613424301147461, + "learning_rate": 9.522129284727737e-06, + "loss": 5.0775, + "step": 27670 + }, + { + "epoch": 0.56304931640625, + "grad_norm": 16.399682998657227, + "learning_rate": 9.52195875555626e-06, + "loss": 5.2764, + "step": 27675 + }, + { + "epoch": 0.5631510416666666, + "grad_norm": 17.068092346191406, + "learning_rate": 9.521788197490902e-06, + "loss": 5.066, + "step": 27680 + }, + { + "epoch": 0.5632527669270834, + "grad_norm": 23.09635353088379, + "learning_rate": 9.521617610532751e-06, + "loss": 5.0442, + "step": 27685 + }, + { + "epoch": 0.5633544921875, + "grad_norm": 31.060216903686523, + "learning_rate": 9.5214469946829e-06, + "loss": 5.336, + "step": 27690 + }, + { + "epoch": 0.5634562174479166, + "grad_norm": 13.213845252990723, + "learning_rate": 9.521276349942439e-06, + "loss": 5.2287, + "step": 27695 + }, + { + "epoch": 0.5635579427083334, + "grad_norm": 24.92420196533203, + "learning_rate": 9.521105676312455e-06, + "loss": 5.109, + "step": 27700 + }, + { + "epoch": 0.56365966796875, + "grad_norm": 20.04311180114746, + "learning_rate": 9.520934973794042e-06, + "loss": 5.0489, + "step": 27705 + }, + { + "epoch": 0.5637613932291666, + "grad_norm": 23.06500816345215, + "learning_rate": 9.52076424238829e-06, + "loss": 4.9933, + "step": 27710 + }, + { + "epoch": 0.5638631184895834, + "grad_norm": 24.464075088500977, + "learning_rate": 9.52059348209629e-06, + "loss": 5.0501, + "step": 27715 + }, + { + "epoch": 0.56396484375, + "grad_norm": 17.218605041503906, + "learning_rate": 9.520422692919132e-06, + "loss": 5.5548, + "step": 27720 + }, + { + "epoch": 0.5640665690104166, + "grad_norm": 16.627363204956055, + "learning_rate": 9.520251874857909e-06, + "loss": 5.0345, + "step": 27725 + }, + { + "epoch": 0.5641682942708334, + "grad_norm": 18.623842239379883, + "learning_rate": 9.52008102791371e-06, + "loss": 4.9691, + "step": 27730 + }, + { + "epoch": 0.56427001953125, + "grad_norm": 19.46030616760254, + "learning_rate": 9.519910152087628e-06, + "loss": 5.1243, + "step": 27735 + }, + { + "epoch": 0.5643717447916666, + "grad_norm": 19.895246505737305, + "learning_rate": 9.519739247380756e-06, + "loss": 5.1313, + "step": 27740 + }, + { + "epoch": 0.5644734700520834, + "grad_norm": 16.884416580200195, + "learning_rate": 9.519568313794184e-06, + "loss": 5.0485, + "step": 27745 + }, + { + "epoch": 0.5645751953125, + "grad_norm": 14.03328800201416, + "learning_rate": 9.519397351329006e-06, + "loss": 5.3312, + "step": 27750 + }, + { + "epoch": 0.5646769205729166, + "grad_norm": 18.335298538208008, + "learning_rate": 9.519226359986314e-06, + "loss": 5.237, + "step": 27755 + }, + { + "epoch": 0.5647786458333334, + "grad_norm": 20.009490966796875, + "learning_rate": 9.519055339767197e-06, + "loss": 5.1749, + "step": 27760 + }, + { + "epoch": 0.56488037109375, + "grad_norm": 18.166982650756836, + "learning_rate": 9.518884290672755e-06, + "loss": 5.1217, + "step": 27765 + }, + { + "epoch": 0.5649820963541666, + "grad_norm": 14.328213691711426, + "learning_rate": 9.518713212704074e-06, + "loss": 4.9956, + "step": 27770 + }, + { + "epoch": 0.5650838216145834, + "grad_norm": 18.439218521118164, + "learning_rate": 9.518542105862253e-06, + "loss": 5.5086, + "step": 27775 + }, + { + "epoch": 0.565185546875, + "grad_norm": 20.151317596435547, + "learning_rate": 9.518370970148381e-06, + "loss": 5.3744, + "step": 27780 + }, + { + "epoch": 0.5652872721354166, + "grad_norm": 26.905336380004883, + "learning_rate": 9.518199805563551e-06, + "loss": 5.2608, + "step": 27785 + }, + { + "epoch": 0.5653889973958334, + "grad_norm": 14.585002899169922, + "learning_rate": 9.51802861210886e-06, + "loss": 5.3323, + "step": 27790 + }, + { + "epoch": 0.56549072265625, + "grad_norm": 18.942535400390625, + "learning_rate": 9.5178573897854e-06, + "loss": 5.2823, + "step": 27795 + }, + { + "epoch": 0.5655924479166666, + "grad_norm": 14.013269424438477, + "learning_rate": 9.517686138594267e-06, + "loss": 5.1886, + "step": 27800 + }, + { + "epoch": 0.5656941731770834, + "grad_norm": 19.259769439697266, + "learning_rate": 9.517514858536552e-06, + "loss": 5.1346, + "step": 27805 + }, + { + "epoch": 0.5657958984375, + "grad_norm": 18.335493087768555, + "learning_rate": 9.51734354961335e-06, + "loss": 5.0404, + "step": 27810 + }, + { + "epoch": 0.5658976236979166, + "grad_norm": 16.137168884277344, + "learning_rate": 9.51717221182576e-06, + "loss": 5.2374, + "step": 27815 + }, + { + "epoch": 0.5659993489583334, + "grad_norm": 21.159528732299805, + "learning_rate": 9.517000845174871e-06, + "loss": 5.2263, + "step": 27820 + }, + { + "epoch": 0.56610107421875, + "grad_norm": 18.22709083557129, + "learning_rate": 9.516829449661781e-06, + "loss": 5.1047, + "step": 27825 + }, + { + "epoch": 0.5662027994791666, + "grad_norm": 16.245134353637695, + "learning_rate": 9.516658025287586e-06, + "loss": 5.0598, + "step": 27830 + }, + { + "epoch": 0.5663045247395834, + "grad_norm": 18.759105682373047, + "learning_rate": 9.516486572053379e-06, + "loss": 5.0111, + "step": 27835 + }, + { + "epoch": 0.56640625, + "grad_norm": 16.65563201904297, + "learning_rate": 9.516315089960256e-06, + "loss": 5.3862, + "step": 27840 + }, + { + "epoch": 0.5665079752604166, + "grad_norm": 16.603065490722656, + "learning_rate": 9.516143579009313e-06, + "loss": 5.1787, + "step": 27845 + }, + { + "epoch": 0.5666097005208334, + "grad_norm": 21.324420928955078, + "learning_rate": 9.515972039201647e-06, + "loss": 5.3711, + "step": 27850 + }, + { + "epoch": 0.56671142578125, + "grad_norm": 19.568204879760742, + "learning_rate": 9.515800470538352e-06, + "loss": 5.0201, + "step": 27855 + }, + { + "epoch": 0.5668131510416666, + "grad_norm": 16.66727638244629, + "learning_rate": 9.515628873020527e-06, + "loss": 5.0334, + "step": 27860 + }, + { + "epoch": 0.5669148763020834, + "grad_norm": 14.43701457977295, + "learning_rate": 9.515457246649267e-06, + "loss": 5.0052, + "step": 27865 + }, + { + "epoch": 0.5670166015625, + "grad_norm": 23.278430938720703, + "learning_rate": 9.515285591425667e-06, + "loss": 5.1363, + "step": 27870 + }, + { + "epoch": 0.5671183268229166, + "grad_norm": 15.436311721801758, + "learning_rate": 9.515113907350826e-06, + "loss": 5.1277, + "step": 27875 + }, + { + "epoch": 0.5672200520833334, + "grad_norm": 16.07198143005371, + "learning_rate": 9.514942194425843e-06, + "loss": 4.9865, + "step": 27880 + }, + { + "epoch": 0.56732177734375, + "grad_norm": 17.668407440185547, + "learning_rate": 9.514770452651809e-06, + "loss": 4.9792, + "step": 27885 + }, + { + "epoch": 0.5674235026041666, + "grad_norm": 21.959312438964844, + "learning_rate": 9.514598682029828e-06, + "loss": 5.2667, + "step": 27890 + }, + { + "epoch": 0.5675252278645834, + "grad_norm": 20.50105857849121, + "learning_rate": 9.514426882560991e-06, + "loss": 5.3343, + "step": 27895 + }, + { + "epoch": 0.567626953125, + "grad_norm": 13.54539966583252, + "learning_rate": 9.514255054246402e-06, + "loss": 5.1087, + "step": 27900 + }, + { + "epoch": 0.5677286783854166, + "grad_norm": 13.060052871704102, + "learning_rate": 9.514083197087156e-06, + "loss": 5.0808, + "step": 27905 + }, + { + "epoch": 0.5678304036458334, + "grad_norm": 19.206600189208984, + "learning_rate": 9.51391131108435e-06, + "loss": 5.3631, + "step": 27910 + }, + { + "epoch": 0.56793212890625, + "grad_norm": 17.23772430419922, + "learning_rate": 9.513739396239085e-06, + "loss": 4.9842, + "step": 27915 + }, + { + "epoch": 0.5680338541666666, + "grad_norm": 20.01923370361328, + "learning_rate": 9.513567452552456e-06, + "loss": 4.9802, + "step": 27920 + }, + { + "epoch": 0.5681355794270834, + "grad_norm": 19.208585739135742, + "learning_rate": 9.513395480025566e-06, + "loss": 4.917, + "step": 27925 + }, + { + "epoch": 0.5682373046875, + "grad_norm": 17.630924224853516, + "learning_rate": 9.513223478659513e-06, + "loss": 5.0844, + "step": 27930 + }, + { + "epoch": 0.5683390299479166, + "grad_norm": 16.840396881103516, + "learning_rate": 9.513051448455392e-06, + "loss": 5.1948, + "step": 27935 + }, + { + "epoch": 0.5684407552083334, + "grad_norm": 15.232376098632812, + "learning_rate": 9.512879389414305e-06, + "loss": 5.2843, + "step": 27940 + }, + { + "epoch": 0.56854248046875, + "grad_norm": 12.91799259185791, + "learning_rate": 9.512707301537351e-06, + "loss": 5.3237, + "step": 27945 + }, + { + "epoch": 0.5686442057291666, + "grad_norm": 12.5217866897583, + "learning_rate": 9.512535184825632e-06, + "loss": 5.0681, + "step": 27950 + }, + { + "epoch": 0.5687459309895834, + "grad_norm": 15.719836235046387, + "learning_rate": 9.512363039280244e-06, + "loss": 5.2333, + "step": 27955 + }, + { + "epoch": 0.56884765625, + "grad_norm": 17.4040470123291, + "learning_rate": 9.51219086490229e-06, + "loss": 4.8978, + "step": 27960 + }, + { + "epoch": 0.5689493815104166, + "grad_norm": 16.46617317199707, + "learning_rate": 9.512018661692867e-06, + "loss": 5.4968, + "step": 27965 + }, + { + "epoch": 0.5690511067708334, + "grad_norm": 23.70105743408203, + "learning_rate": 9.511846429653078e-06, + "loss": 5.1721, + "step": 27970 + }, + { + "epoch": 0.56915283203125, + "grad_norm": 18.5255184173584, + "learning_rate": 9.511674168784022e-06, + "loss": 5.2014, + "step": 27975 + }, + { + "epoch": 0.5692545572916666, + "grad_norm": 22.390491485595703, + "learning_rate": 9.5115018790868e-06, + "loss": 5.2646, + "step": 27980 + }, + { + "epoch": 0.5693562825520834, + "grad_norm": 19.596961975097656, + "learning_rate": 9.511329560562514e-06, + "loss": 5.1423, + "step": 27985 + }, + { + "epoch": 0.5694580078125, + "grad_norm": 19.702634811401367, + "learning_rate": 9.511157213212262e-06, + "loss": 5.1674, + "step": 27990 + }, + { + "epoch": 0.5695597330729166, + "grad_norm": 15.707798957824707, + "learning_rate": 9.510984837037151e-06, + "loss": 5.3129, + "step": 27995 + }, + { + "epoch": 0.5696614583333334, + "grad_norm": 17.597728729248047, + "learning_rate": 9.510812432038277e-06, + "loss": 5.0927, + "step": 28000 + }, + { + "epoch": 0.56976318359375, + "grad_norm": 17.546327590942383, + "learning_rate": 9.510639998216744e-06, + "loss": 5.1716, + "step": 28005 + }, + { + "epoch": 0.5698649088541666, + "grad_norm": 22.542451858520508, + "learning_rate": 9.510467535573653e-06, + "loss": 5.3094, + "step": 28010 + }, + { + "epoch": 0.5699666341145834, + "grad_norm": 26.98898696899414, + "learning_rate": 9.510295044110107e-06, + "loss": 5.1201, + "step": 28015 + }, + { + "epoch": 0.570068359375, + "grad_norm": 15.377753257751465, + "learning_rate": 9.510122523827205e-06, + "loss": 5.094, + "step": 28020 + }, + { + "epoch": 0.5701700846354166, + "grad_norm": 13.728224754333496, + "learning_rate": 9.509949974726053e-06, + "loss": 5.1276, + "step": 28025 + }, + { + "epoch": 0.5702718098958334, + "grad_norm": 18.313526153564453, + "learning_rate": 9.509777396807753e-06, + "loss": 5.6639, + "step": 28030 + }, + { + "epoch": 0.57037353515625, + "grad_norm": 13.584367752075195, + "learning_rate": 9.509604790073407e-06, + "loss": 5.0946, + "step": 28035 + }, + { + "epoch": 0.5704752604166666, + "grad_norm": 14.840548515319824, + "learning_rate": 9.509432154524119e-06, + "loss": 5.1867, + "step": 28040 + }, + { + "epoch": 0.5705769856770834, + "grad_norm": 14.414995193481445, + "learning_rate": 9.509259490160989e-06, + "loss": 5.4992, + "step": 28045 + }, + { + "epoch": 0.5706787109375, + "grad_norm": 21.079357147216797, + "learning_rate": 9.509086796985123e-06, + "loss": 5.2425, + "step": 28050 + }, + { + "epoch": 0.5707804361979166, + "grad_norm": 16.98733901977539, + "learning_rate": 9.508914074997623e-06, + "loss": 5.2063, + "step": 28055 + }, + { + "epoch": 0.5708821614583334, + "grad_norm": 17.14090919494629, + "learning_rate": 9.508741324199595e-06, + "loss": 4.938, + "step": 28060 + }, + { + "epoch": 0.57098388671875, + "grad_norm": 16.307973861694336, + "learning_rate": 9.50856854459214e-06, + "loss": 5.2338, + "step": 28065 + }, + { + "epoch": 0.5710856119791666, + "grad_norm": 16.412769317626953, + "learning_rate": 9.508395736176365e-06, + "loss": 5.4261, + "step": 28070 + }, + { + "epoch": 0.5711873372395834, + "grad_norm": 21.357221603393555, + "learning_rate": 9.50822289895337e-06, + "loss": 4.9512, + "step": 28075 + }, + { + "epoch": 0.5712890625, + "grad_norm": 14.155309677124023, + "learning_rate": 9.508050032924263e-06, + "loss": 5.1753, + "step": 28080 + }, + { + "epoch": 0.5713907877604166, + "grad_norm": 14.83113956451416, + "learning_rate": 9.507877138090146e-06, + "loss": 5.1737, + "step": 28085 + }, + { + "epoch": 0.5714925130208334, + "grad_norm": 18.378568649291992, + "learning_rate": 9.507704214452127e-06, + "loss": 5.2231, + "step": 28090 + }, + { + "epoch": 0.57159423828125, + "grad_norm": 15.45629596710205, + "learning_rate": 9.507531262011309e-06, + "loss": 4.9771, + "step": 28095 + }, + { + "epoch": 0.5716959635416666, + "grad_norm": 17.60671615600586, + "learning_rate": 9.507358280768796e-06, + "loss": 5.3436, + "step": 28100 + }, + { + "epoch": 0.5717976888020834, + "grad_norm": 14.913180351257324, + "learning_rate": 9.507185270725694e-06, + "loss": 5.2167, + "step": 28105 + }, + { + "epoch": 0.5718994140625, + "grad_norm": 22.102205276489258, + "learning_rate": 9.50701223188311e-06, + "loss": 5.6378, + "step": 28110 + }, + { + "epoch": 0.5720011393229166, + "grad_norm": 17.813968658447266, + "learning_rate": 9.506839164242147e-06, + "loss": 4.9709, + "step": 28115 + }, + { + "epoch": 0.5721028645833334, + "grad_norm": 19.313777923583984, + "learning_rate": 9.506666067803915e-06, + "loss": 5.0791, + "step": 28120 + }, + { + "epoch": 0.57220458984375, + "grad_norm": 19.641298294067383, + "learning_rate": 9.506492942569517e-06, + "loss": 5.2834, + "step": 28125 + }, + { + "epoch": 0.5723063151041666, + "grad_norm": 18.081327438354492, + "learning_rate": 9.506319788540056e-06, + "loss": 5.1834, + "step": 28130 + }, + { + "epoch": 0.5724080403645834, + "grad_norm": 17.127195358276367, + "learning_rate": 9.506146605716647e-06, + "loss": 5.0998, + "step": 28135 + }, + { + "epoch": 0.572509765625, + "grad_norm": 17.480052947998047, + "learning_rate": 9.505973394100388e-06, + "loss": 5.0354, + "step": 28140 + }, + { + "epoch": 0.5726114908854166, + "grad_norm": 17.8472843170166, + "learning_rate": 9.50580015369239e-06, + "loss": 5.0584, + "step": 28145 + }, + { + "epoch": 0.5727132161458334, + "grad_norm": 20.302841186523438, + "learning_rate": 9.505626884493762e-06, + "loss": 4.8398, + "step": 28150 + }, + { + "epoch": 0.57281494140625, + "grad_norm": 15.672582626342773, + "learning_rate": 9.505453586505607e-06, + "loss": 5.2955, + "step": 28155 + }, + { + "epoch": 0.5729166666666666, + "grad_norm": 22.56606674194336, + "learning_rate": 9.505280259729034e-06, + "loss": 5.2505, + "step": 28160 + }, + { + "epoch": 0.5730183919270834, + "grad_norm": 14.870548248291016, + "learning_rate": 9.505106904165148e-06, + "loss": 5.1015, + "step": 28165 + }, + { + "epoch": 0.5731201171875, + "grad_norm": 20.467166900634766, + "learning_rate": 9.504933519815061e-06, + "loss": 5.223, + "step": 28170 + }, + { + "epoch": 0.5732218424479166, + "grad_norm": 14.425257682800293, + "learning_rate": 9.504760106679878e-06, + "loss": 5.2019, + "step": 28175 + }, + { + "epoch": 0.5733235677083334, + "grad_norm": 20.04555892944336, + "learning_rate": 9.504586664760709e-06, + "loss": 4.9433, + "step": 28180 + }, + { + "epoch": 0.57342529296875, + "grad_norm": 15.35773754119873, + "learning_rate": 9.504413194058659e-06, + "loss": 5.3387, + "step": 28185 + }, + { + "epoch": 0.5735270182291666, + "grad_norm": 40.600677490234375, + "learning_rate": 9.504239694574841e-06, + "loss": 5.3624, + "step": 28190 + }, + { + "epoch": 0.5736287434895834, + "grad_norm": 22.99974250793457, + "learning_rate": 9.50406616631036e-06, + "loss": 4.9958, + "step": 28195 + }, + { + "epoch": 0.57373046875, + "grad_norm": 17.563608169555664, + "learning_rate": 9.503892609266327e-06, + "loss": 5.1961, + "step": 28200 + }, + { + "epoch": 0.5738321940104166, + "grad_norm": 17.19901466369629, + "learning_rate": 9.50371902344385e-06, + "loss": 4.8034, + "step": 28205 + }, + { + "epoch": 0.5739339192708334, + "grad_norm": 18.641273498535156, + "learning_rate": 9.503545408844036e-06, + "loss": 5.23, + "step": 28210 + }, + { + "epoch": 0.57403564453125, + "grad_norm": 12.753037452697754, + "learning_rate": 9.503371765468e-06, + "loss": 4.9638, + "step": 28215 + }, + { + "epoch": 0.5741373697916666, + "grad_norm": 14.598785400390625, + "learning_rate": 9.503198093316844e-06, + "loss": 5.1588, + "step": 28220 + }, + { + "epoch": 0.5742390950520834, + "grad_norm": 14.382951736450195, + "learning_rate": 9.503024392391685e-06, + "loss": 5.027, + "step": 28225 + }, + { + "epoch": 0.5743408203125, + "grad_norm": 15.519381523132324, + "learning_rate": 9.502850662693628e-06, + "loss": 5.2747, + "step": 28230 + }, + { + "epoch": 0.5744425455729166, + "grad_norm": 20.591957092285156, + "learning_rate": 9.502676904223784e-06, + "loss": 5.1354, + "step": 28235 + }, + { + "epoch": 0.5745442708333334, + "grad_norm": 14.720667839050293, + "learning_rate": 9.502503116983265e-06, + "loss": 4.9869, + "step": 28240 + }, + { + "epoch": 0.57464599609375, + "grad_norm": 24.54094123840332, + "learning_rate": 9.50232930097318e-06, + "loss": 4.9979, + "step": 28245 + }, + { + "epoch": 0.5747477213541666, + "grad_norm": 19.08615493774414, + "learning_rate": 9.50215545619464e-06, + "loss": 5.2676, + "step": 28250 + }, + { + "epoch": 0.5748494466145834, + "grad_norm": 18.77302360534668, + "learning_rate": 9.501981582648756e-06, + "loss": 4.839, + "step": 28255 + }, + { + "epoch": 0.574951171875, + "grad_norm": 19.989177703857422, + "learning_rate": 9.501807680336638e-06, + "loss": 5.2286, + "step": 28260 + }, + { + "epoch": 0.5750528971354166, + "grad_norm": 15.000138282775879, + "learning_rate": 9.5016337492594e-06, + "loss": 5.225, + "step": 28265 + }, + { + "epoch": 0.5751546223958334, + "grad_norm": 14.822260856628418, + "learning_rate": 9.501459789418149e-06, + "loss": 5.0541, + "step": 28270 + }, + { + "epoch": 0.57525634765625, + "grad_norm": 17.893400192260742, + "learning_rate": 9.501285800813999e-06, + "loss": 5.376, + "step": 28275 + }, + { + "epoch": 0.5753580729166666, + "grad_norm": 16.322355270385742, + "learning_rate": 9.501111783448066e-06, + "loss": 5.3159, + "step": 28280 + }, + { + "epoch": 0.5754597981770834, + "grad_norm": 19.492507934570312, + "learning_rate": 9.500937737321452e-06, + "loss": 5.4682, + "step": 28285 + }, + { + "epoch": 0.5755615234375, + "grad_norm": 22.57615089416504, + "learning_rate": 9.500763662435277e-06, + "loss": 5.0708, + "step": 28290 + }, + { + "epoch": 0.5756632486979166, + "grad_norm": 15.679899215698242, + "learning_rate": 9.50058955879065e-06, + "loss": 5.3581, + "step": 28295 + }, + { + "epoch": 0.5757649739583334, + "grad_norm": 18.471094131469727, + "learning_rate": 9.500415426388687e-06, + "loss": 5.0394, + "step": 28300 + }, + { + "epoch": 0.57586669921875, + "grad_norm": 16.923967361450195, + "learning_rate": 9.500241265230495e-06, + "loss": 5.0544, + "step": 28305 + }, + { + "epoch": 0.5759684244791666, + "grad_norm": 18.16712760925293, + "learning_rate": 9.500067075317191e-06, + "loss": 5.0819, + "step": 28310 + }, + { + "epoch": 0.5760701497395834, + "grad_norm": 14.97790241241455, + "learning_rate": 9.499892856649887e-06, + "loss": 5.085, + "step": 28315 + }, + { + "epoch": 0.576171875, + "grad_norm": 19.667339324951172, + "learning_rate": 9.499718609229695e-06, + "loss": 5.3601, + "step": 28320 + }, + { + "epoch": 0.5762736002604166, + "grad_norm": 21.86824607849121, + "learning_rate": 9.49954433305773e-06, + "loss": 5.1673, + "step": 28325 + }, + { + "epoch": 0.5763753255208334, + "grad_norm": 13.941978454589844, + "learning_rate": 9.499370028135106e-06, + "loss": 5.0973, + "step": 28330 + }, + { + "epoch": 0.57647705078125, + "grad_norm": 18.144641876220703, + "learning_rate": 9.499195694462937e-06, + "loss": 4.8731, + "step": 28335 + }, + { + "epoch": 0.5765787760416666, + "grad_norm": 14.29116153717041, + "learning_rate": 9.499021332042334e-06, + "loss": 5.4038, + "step": 28340 + }, + { + "epoch": 0.5766805013020834, + "grad_norm": 19.24259376525879, + "learning_rate": 9.498846940874413e-06, + "loss": 5.0, + "step": 28345 + }, + { + "epoch": 0.5767822265625, + "grad_norm": 18.770559310913086, + "learning_rate": 9.498672520960289e-06, + "loss": 5.048, + "step": 28350 + }, + { + "epoch": 0.5768839518229166, + "grad_norm": 15.181851387023926, + "learning_rate": 9.498498072301074e-06, + "loss": 5.3414, + "step": 28355 + }, + { + "epoch": 0.5769856770833334, + "grad_norm": 17.083250045776367, + "learning_rate": 9.498323594897885e-06, + "loss": 5.0204, + "step": 28360 + }, + { + "epoch": 0.57708740234375, + "grad_norm": 13.441373825073242, + "learning_rate": 9.498149088751837e-06, + "loss": 4.8758, + "step": 28365 + }, + { + "epoch": 0.5771891276041666, + "grad_norm": 21.28489875793457, + "learning_rate": 9.497974553864043e-06, + "loss": 5.2365, + "step": 28370 + }, + { + "epoch": 0.5772908528645834, + "grad_norm": 16.48379135131836, + "learning_rate": 9.497799990235622e-06, + "loss": 5.1705, + "step": 28375 + }, + { + "epoch": 0.577392578125, + "grad_norm": 18.686447143554688, + "learning_rate": 9.497625397867683e-06, + "loss": 5.2575, + "step": 28380 + }, + { + "epoch": 0.5774943033854166, + "grad_norm": 16.72586441040039, + "learning_rate": 9.497450776761347e-06, + "loss": 5.2105, + "step": 28385 + }, + { + "epoch": 0.5775960286458334, + "grad_norm": 12.989459991455078, + "learning_rate": 9.497276126917729e-06, + "loss": 4.9976, + "step": 28390 + }, + { + "epoch": 0.57769775390625, + "grad_norm": 16.952781677246094, + "learning_rate": 9.497101448337944e-06, + "loss": 5.0654, + "step": 28395 + }, + { + "epoch": 0.5777994791666666, + "grad_norm": 15.935881614685059, + "learning_rate": 9.496926741023108e-06, + "loss": 5.2037, + "step": 28400 + }, + { + "epoch": 0.5779012044270834, + "grad_norm": 20.80965232849121, + "learning_rate": 9.496752004974337e-06, + "loss": 5.0653, + "step": 28405 + }, + { + "epoch": 0.5780029296875, + "grad_norm": 14.401626586914062, + "learning_rate": 9.49657724019275e-06, + "loss": 5.2918, + "step": 28410 + }, + { + "epoch": 0.5781046549479166, + "grad_norm": 12.650864601135254, + "learning_rate": 9.49640244667946e-06, + "loss": 5.3725, + "step": 28415 + }, + { + "epoch": 0.5782063802083334, + "grad_norm": 20.65974235534668, + "learning_rate": 9.496227624435587e-06, + "loss": 5.1598, + "step": 28420 + }, + { + "epoch": 0.57830810546875, + "grad_norm": 17.54054832458496, + "learning_rate": 9.496052773462245e-06, + "loss": 5.0189, + "step": 28425 + }, + { + "epoch": 0.5784098307291666, + "grad_norm": 17.44231414794922, + "learning_rate": 9.495877893760555e-06, + "loss": 4.8714, + "step": 28430 + }, + { + "epoch": 0.5785115559895834, + "grad_norm": 15.414323806762695, + "learning_rate": 9.495702985331632e-06, + "loss": 5.2005, + "step": 28435 + }, + { + "epoch": 0.57861328125, + "grad_norm": 22.970998764038086, + "learning_rate": 9.495528048176594e-06, + "loss": 5.302, + "step": 28440 + }, + { + "epoch": 0.5787150065104166, + "grad_norm": 16.673847198486328, + "learning_rate": 9.495353082296558e-06, + "loss": 5.163, + "step": 28445 + }, + { + "epoch": 0.5788167317708334, + "grad_norm": 24.730567932128906, + "learning_rate": 9.495178087692643e-06, + "loss": 5.4039, + "step": 28450 + }, + { + "epoch": 0.57891845703125, + "grad_norm": 30.081361770629883, + "learning_rate": 9.495003064365969e-06, + "loss": 5.2283, + "step": 28455 + }, + { + "epoch": 0.5790201822916666, + "grad_norm": 18.650619506835938, + "learning_rate": 9.494828012317649e-06, + "loss": 5.0564, + "step": 28460 + }, + { + "epoch": 0.5791219075520834, + "grad_norm": 15.44764232635498, + "learning_rate": 9.494652931548807e-06, + "loss": 5.209, + "step": 28465 + }, + { + "epoch": 0.5792236328125, + "grad_norm": 21.42605209350586, + "learning_rate": 9.49447782206056e-06, + "loss": 5.1134, + "step": 28470 + }, + { + "epoch": 0.5793253580729166, + "grad_norm": 17.517114639282227, + "learning_rate": 9.494302683854024e-06, + "loss": 5.4483, + "step": 28475 + }, + { + "epoch": 0.5794270833333334, + "grad_norm": 15.524322509765625, + "learning_rate": 9.494127516930323e-06, + "loss": 5.248, + "step": 28480 + }, + { + "epoch": 0.57952880859375, + "grad_norm": 19.844383239746094, + "learning_rate": 9.493952321290572e-06, + "loss": 5.0564, + "step": 28485 + }, + { + "epoch": 0.5796305338541666, + "grad_norm": 12.509552001953125, + "learning_rate": 9.493777096935895e-06, + "loss": 5.3661, + "step": 28490 + }, + { + "epoch": 0.5797322591145834, + "grad_norm": 13.6521635055542, + "learning_rate": 9.493601843867407e-06, + "loss": 5.0831, + "step": 28495 + }, + { + "epoch": 0.579833984375, + "grad_norm": 16.386337280273438, + "learning_rate": 9.493426562086228e-06, + "loss": 5.1318, + "step": 28500 + }, + { + "epoch": 0.5799357096354166, + "grad_norm": 11.400629043579102, + "learning_rate": 9.493251251593483e-06, + "loss": 5.106, + "step": 28505 + }, + { + "epoch": 0.5800374348958334, + "grad_norm": 13.0906400680542, + "learning_rate": 9.493075912390286e-06, + "loss": 5.2296, + "step": 28510 + }, + { + "epoch": 0.58013916015625, + "grad_norm": 17.50678062438965, + "learning_rate": 9.492900544477761e-06, + "loss": 5.0689, + "step": 28515 + }, + { + "epoch": 0.5802408854166666, + "grad_norm": 16.424156188964844, + "learning_rate": 9.49272514785703e-06, + "loss": 4.9958, + "step": 28520 + }, + { + "epoch": 0.5803426106770834, + "grad_norm": 18.691089630126953, + "learning_rate": 9.492549722529209e-06, + "loss": 5.0046, + "step": 28525 + }, + { + "epoch": 0.5804443359375, + "grad_norm": 20.803064346313477, + "learning_rate": 9.49237426849542e-06, + "loss": 5.165, + "step": 28530 + }, + { + "epoch": 0.5805460611979166, + "grad_norm": 19.238901138305664, + "learning_rate": 9.492198785756788e-06, + "loss": 5.0596, + "step": 28535 + }, + { + "epoch": 0.5806477864583334, + "grad_norm": 19.924772262573242, + "learning_rate": 9.492023274314433e-06, + "loss": 5.2912, + "step": 28540 + }, + { + "epoch": 0.58074951171875, + "grad_norm": 20.771059036254883, + "learning_rate": 9.491847734169473e-06, + "loss": 5.2729, + "step": 28545 + }, + { + "epoch": 0.5808512369791666, + "grad_norm": 16.656587600708008, + "learning_rate": 9.491672165323031e-06, + "loss": 5.2427, + "step": 28550 + }, + { + "epoch": 0.5809529622395834, + "grad_norm": 18.40635871887207, + "learning_rate": 9.491496567776233e-06, + "loss": 5.1474, + "step": 28555 + }, + { + "epoch": 0.5810546875, + "grad_norm": 20.452491760253906, + "learning_rate": 9.491320941530196e-06, + "loss": 4.9558, + "step": 28560 + }, + { + "epoch": 0.5811564127604166, + "grad_norm": 14.584616661071777, + "learning_rate": 9.491145286586043e-06, + "loss": 4.9314, + "step": 28565 + }, + { + "epoch": 0.5812581380208334, + "grad_norm": 15.778558731079102, + "learning_rate": 9.490969602944899e-06, + "loss": 5.0657, + "step": 28570 + }, + { + "epoch": 0.58135986328125, + "grad_norm": 14.927220344543457, + "learning_rate": 9.490793890607883e-06, + "loss": 5.0775, + "step": 28575 + }, + { + "epoch": 0.5814615885416666, + "grad_norm": 34.09748077392578, + "learning_rate": 9.490618149576121e-06, + "loss": 5.2617, + "step": 28580 + }, + { + "epoch": 0.5815633138020834, + "grad_norm": 11.839200973510742, + "learning_rate": 9.490442379850734e-06, + "loss": 5.4353, + "step": 28585 + }, + { + "epoch": 0.5816650390625, + "grad_norm": 14.8182954788208, + "learning_rate": 9.490266581432846e-06, + "loss": 5.2365, + "step": 28590 + }, + { + "epoch": 0.5817667643229166, + "grad_norm": 17.208402633666992, + "learning_rate": 9.49009075432358e-06, + "loss": 5.0919, + "step": 28595 + }, + { + "epoch": 0.5818684895833334, + "grad_norm": 16.763832092285156, + "learning_rate": 9.489914898524058e-06, + "loss": 5.1953, + "step": 28600 + }, + { + "epoch": 0.58197021484375, + "grad_norm": 14.395819664001465, + "learning_rate": 9.489739014035405e-06, + "loss": 4.9735, + "step": 28605 + }, + { + "epoch": 0.5820719401041666, + "grad_norm": 15.408628463745117, + "learning_rate": 9.489563100858748e-06, + "loss": 5.2062, + "step": 28610 + }, + { + "epoch": 0.5821736653645834, + "grad_norm": 21.473190307617188, + "learning_rate": 9.489387158995206e-06, + "loss": 5.1761, + "step": 28615 + }, + { + "epoch": 0.582275390625, + "grad_norm": 18.07660484313965, + "learning_rate": 9.489211188445905e-06, + "loss": 5.2558, + "step": 28620 + }, + { + "epoch": 0.5823771158854166, + "grad_norm": 17.817277908325195, + "learning_rate": 9.48903518921197e-06, + "loss": 5.0586, + "step": 28625 + }, + { + "epoch": 0.5824788411458334, + "grad_norm": 17.11922836303711, + "learning_rate": 9.488859161294526e-06, + "loss": 4.9566, + "step": 28630 + }, + { + "epoch": 0.58258056640625, + "grad_norm": 20.96281623840332, + "learning_rate": 9.488683104694697e-06, + "loss": 4.9009, + "step": 28635 + }, + { + "epoch": 0.5826822916666666, + "grad_norm": 18.15694808959961, + "learning_rate": 9.488507019413604e-06, + "loss": 4.861, + "step": 28640 + }, + { + "epoch": 0.5827840169270834, + "grad_norm": 16.2524356842041, + "learning_rate": 9.48833090545238e-06, + "loss": 5.0238, + "step": 28645 + }, + { + "epoch": 0.5828857421875, + "grad_norm": 16.480178833007812, + "learning_rate": 9.488154762812145e-06, + "loss": 5.0891, + "step": 28650 + }, + { + "epoch": 0.5829874674479166, + "grad_norm": 15.761170387268066, + "learning_rate": 9.487978591494026e-06, + "loss": 5.0347, + "step": 28655 + }, + { + "epoch": 0.5830891927083334, + "grad_norm": 22.369163513183594, + "learning_rate": 9.487802391499148e-06, + "loss": 4.9777, + "step": 28660 + }, + { + "epoch": 0.58319091796875, + "grad_norm": 22.539182662963867, + "learning_rate": 9.487626162828638e-06, + "loss": 5.0773, + "step": 28665 + }, + { + "epoch": 0.5832926432291666, + "grad_norm": 15.898275375366211, + "learning_rate": 9.48744990548362e-06, + "loss": 5.114, + "step": 28670 + }, + { + "epoch": 0.5833943684895834, + "grad_norm": 18.92156219482422, + "learning_rate": 9.487273619465224e-06, + "loss": 5.0554, + "step": 28675 + }, + { + "epoch": 0.58349609375, + "grad_norm": 13.338627815246582, + "learning_rate": 9.487097304774573e-06, + "loss": 5.0499, + "step": 28680 + }, + { + "epoch": 0.5835978190104166, + "grad_norm": 19.440120697021484, + "learning_rate": 9.486920961412794e-06, + "loss": 5.2217, + "step": 28685 + }, + { + "epoch": 0.5836995442708334, + "grad_norm": 24.164588928222656, + "learning_rate": 9.486744589381014e-06, + "loss": 5.0004, + "step": 28690 + }, + { + "epoch": 0.58380126953125, + "grad_norm": 15.814630508422852, + "learning_rate": 9.486568188680362e-06, + "loss": 5.2157, + "step": 28695 + }, + { + "epoch": 0.5839029947916666, + "grad_norm": 14.877976417541504, + "learning_rate": 9.486391759311962e-06, + "loss": 5.0501, + "step": 28700 + }, + { + "epoch": 0.5840047200520834, + "grad_norm": 20.3271484375, + "learning_rate": 9.486215301276944e-06, + "loss": 4.8029, + "step": 28705 + }, + { + "epoch": 0.5841064453125, + "grad_norm": 13.4552583694458, + "learning_rate": 9.486038814576433e-06, + "loss": 4.8948, + "step": 28710 + }, + { + "epoch": 0.5842081705729166, + "grad_norm": 18.273950576782227, + "learning_rate": 9.485862299211559e-06, + "loss": 4.9979, + "step": 28715 + }, + { + "epoch": 0.5843098958333334, + "grad_norm": 15.405302047729492, + "learning_rate": 9.48568575518345e-06, + "loss": 4.8329, + "step": 28720 + }, + { + "epoch": 0.58441162109375, + "grad_norm": 21.215465545654297, + "learning_rate": 9.48550918249323e-06, + "loss": 5.3477, + "step": 28725 + }, + { + "epoch": 0.5845133463541666, + "grad_norm": 23.423545837402344, + "learning_rate": 9.485332581142032e-06, + "loss": 5.2166, + "step": 28730 + }, + { + "epoch": 0.5846150716145834, + "grad_norm": 18.233108520507812, + "learning_rate": 9.485155951130983e-06, + "loss": 5.1827, + "step": 28735 + }, + { + "epoch": 0.584716796875, + "grad_norm": 17.106609344482422, + "learning_rate": 9.48497929246121e-06, + "loss": 5.2273, + "step": 28740 + }, + { + "epoch": 0.5848185221354166, + "grad_norm": 18.924448013305664, + "learning_rate": 9.484802605133844e-06, + "loss": 5.3746, + "step": 28745 + }, + { + "epoch": 0.5849202473958334, + "grad_norm": 14.531387329101562, + "learning_rate": 9.484625889150015e-06, + "loss": 5.0753, + "step": 28750 + }, + { + "epoch": 0.58502197265625, + "grad_norm": 19.214075088500977, + "learning_rate": 9.484449144510847e-06, + "loss": 5.1766, + "step": 28755 + }, + { + "epoch": 0.5851236979166666, + "grad_norm": 16.68658447265625, + "learning_rate": 9.484272371217473e-06, + "loss": 5.2324, + "step": 28760 + }, + { + "epoch": 0.5852254231770834, + "grad_norm": 20.88003921508789, + "learning_rate": 9.484095569271024e-06, + "loss": 5.3481, + "step": 28765 + }, + { + "epoch": 0.5853271484375, + "grad_norm": 16.113374710083008, + "learning_rate": 9.483918738672625e-06, + "loss": 5.0911, + "step": 28770 + }, + { + "epoch": 0.5854288736979166, + "grad_norm": 17.023035049438477, + "learning_rate": 9.48374187942341e-06, + "loss": 5.3631, + "step": 28775 + }, + { + "epoch": 0.5855305989583334, + "grad_norm": 15.8486328125, + "learning_rate": 9.483564991524508e-06, + "loss": 5.1398, + "step": 28780 + }, + { + "epoch": 0.58563232421875, + "grad_norm": 15.577689170837402, + "learning_rate": 9.48338807497705e-06, + "loss": 4.869, + "step": 28785 + }, + { + "epoch": 0.5857340494791666, + "grad_norm": 21.233047485351562, + "learning_rate": 9.483211129782162e-06, + "loss": 5.0606, + "step": 28790 + }, + { + "epoch": 0.5858357747395834, + "grad_norm": 14.899164199829102, + "learning_rate": 9.483034155940982e-06, + "loss": 5.1909, + "step": 28795 + }, + { + "epoch": 0.5859375, + "grad_norm": 17.311824798583984, + "learning_rate": 9.482857153454634e-06, + "loss": 5.1239, + "step": 28800 + }, + { + "epoch": 0.5860392252604166, + "grad_norm": 18.47419548034668, + "learning_rate": 9.482680122324253e-06, + "loss": 4.9807, + "step": 28805 + }, + { + "epoch": 0.5861409505208334, + "grad_norm": 16.675094604492188, + "learning_rate": 9.482503062550968e-06, + "loss": 5.2128, + "step": 28810 + }, + { + "epoch": 0.58624267578125, + "grad_norm": 16.766340255737305, + "learning_rate": 9.482325974135913e-06, + "loss": 4.9281, + "step": 28815 + }, + { + "epoch": 0.5863444010416666, + "grad_norm": 15.554878234863281, + "learning_rate": 9.482148857080218e-06, + "loss": 4.9822, + "step": 28820 + }, + { + "epoch": 0.5864461263020834, + "grad_norm": 17.281213760375977, + "learning_rate": 9.481971711385012e-06, + "loss": 5.1751, + "step": 28825 + }, + { + "epoch": 0.5865478515625, + "grad_norm": 16.89583396911621, + "learning_rate": 9.481794537051431e-06, + "loss": 4.9632, + "step": 28830 + }, + { + "epoch": 0.5866495768229166, + "grad_norm": 17.427156448364258, + "learning_rate": 9.481617334080607e-06, + "loss": 5.3639, + "step": 28835 + }, + { + "epoch": 0.5867513020833334, + "grad_norm": 24.25047492980957, + "learning_rate": 9.48144010247367e-06, + "loss": 5.178, + "step": 28840 + }, + { + "epoch": 0.58685302734375, + "grad_norm": 23.736602783203125, + "learning_rate": 9.481262842231754e-06, + "loss": 5.2095, + "step": 28845 + }, + { + "epoch": 0.5869547526041666, + "grad_norm": 17.338228225708008, + "learning_rate": 9.48108555335599e-06, + "loss": 4.9517, + "step": 28850 + }, + { + "epoch": 0.5870564778645834, + "grad_norm": 16.162721633911133, + "learning_rate": 9.480908235847511e-06, + "loss": 5.5531, + "step": 28855 + }, + { + "epoch": 0.587158203125, + "grad_norm": 16.22698211669922, + "learning_rate": 9.480730889707452e-06, + "loss": 5.3596, + "step": 28860 + }, + { + "epoch": 0.5872599283854166, + "grad_norm": 18.424501419067383, + "learning_rate": 9.480553514936945e-06, + "loss": 5.252, + "step": 28865 + }, + { + "epoch": 0.5873616536458334, + "grad_norm": 18.017925262451172, + "learning_rate": 9.480376111537123e-06, + "loss": 5.0656, + "step": 28870 + }, + { + "epoch": 0.58746337890625, + "grad_norm": 19.410125732421875, + "learning_rate": 9.480198679509121e-06, + "loss": 5.1962, + "step": 28875 + }, + { + "epoch": 0.5875651041666666, + "grad_norm": 17.98143768310547, + "learning_rate": 9.48002121885407e-06, + "loss": 5.3204, + "step": 28880 + }, + { + "epoch": 0.5876668294270834, + "grad_norm": 15.99759292602539, + "learning_rate": 9.479843729573106e-06, + "loss": 5.2019, + "step": 28885 + }, + { + "epoch": 0.5877685546875, + "grad_norm": 16.656700134277344, + "learning_rate": 9.479666211667362e-06, + "loss": 5.2021, + "step": 28890 + }, + { + "epoch": 0.5878702799479166, + "grad_norm": 16.054964065551758, + "learning_rate": 9.479488665137975e-06, + "loss": 5.089, + "step": 28895 + }, + { + "epoch": 0.5879720052083334, + "grad_norm": 23.817607879638672, + "learning_rate": 9.479311089986076e-06, + "loss": 5.0164, + "step": 28900 + }, + { + "epoch": 0.58807373046875, + "grad_norm": 13.273167610168457, + "learning_rate": 9.479133486212801e-06, + "loss": 5.097, + "step": 28905 + }, + { + "epoch": 0.5881754557291666, + "grad_norm": 15.854758262634277, + "learning_rate": 9.478955853819284e-06, + "loss": 5.526, + "step": 28910 + }, + { + "epoch": 0.5882771809895834, + "grad_norm": 10.897637367248535, + "learning_rate": 9.478778192806664e-06, + "loss": 5.0805, + "step": 28915 + }, + { + "epoch": 0.58837890625, + "grad_norm": 19.048126220703125, + "learning_rate": 9.478600503176071e-06, + "loss": 5.4743, + "step": 28920 + }, + { + "epoch": 0.5884806315104166, + "grad_norm": 16.847593307495117, + "learning_rate": 9.478422784928643e-06, + "loss": 5.2021, + "step": 28925 + }, + { + "epoch": 0.5885823567708334, + "grad_norm": 16.2868709564209, + "learning_rate": 9.478245038065515e-06, + "loss": 5.0489, + "step": 28930 + }, + { + "epoch": 0.58868408203125, + "grad_norm": 14.62964916229248, + "learning_rate": 9.478067262587821e-06, + "loss": 5.1114, + "step": 28935 + }, + { + "epoch": 0.5887858072916666, + "grad_norm": 14.475417137145996, + "learning_rate": 9.477889458496701e-06, + "loss": 4.927, + "step": 28940 + }, + { + "epoch": 0.5888875325520834, + "grad_norm": 15.594334602355957, + "learning_rate": 9.477711625793287e-06, + "loss": 5.1431, + "step": 28945 + }, + { + "epoch": 0.5889892578125, + "grad_norm": 15.76807689666748, + "learning_rate": 9.47753376447872e-06, + "loss": 5.371, + "step": 28950 + }, + { + "epoch": 0.5890909830729166, + "grad_norm": 19.289915084838867, + "learning_rate": 9.47735587455413e-06, + "loss": 5.0027, + "step": 28955 + }, + { + "epoch": 0.5891927083333334, + "grad_norm": 18.90337371826172, + "learning_rate": 9.47717795602066e-06, + "loss": 5.0192, + "step": 28960 + }, + { + "epoch": 0.58929443359375, + "grad_norm": 20.630983352661133, + "learning_rate": 9.477000008879444e-06, + "loss": 5.0465, + "step": 28965 + }, + { + "epoch": 0.5893961588541666, + "grad_norm": 24.26479148864746, + "learning_rate": 9.476822033131618e-06, + "loss": 5.5608, + "step": 28970 + }, + { + "epoch": 0.5894978841145834, + "grad_norm": 17.14590835571289, + "learning_rate": 9.476644028778321e-06, + "loss": 5.3268, + "step": 28975 + }, + { + "epoch": 0.589599609375, + "grad_norm": 20.23741912841797, + "learning_rate": 9.476465995820689e-06, + "loss": 4.9545, + "step": 28980 + }, + { + "epoch": 0.5897013346354166, + "grad_norm": 16.225013732910156, + "learning_rate": 9.47628793425986e-06, + "loss": 5.1419, + "step": 28985 + }, + { + "epoch": 0.5898030598958334, + "grad_norm": 13.56171989440918, + "learning_rate": 9.476109844096972e-06, + "loss": 5.0345, + "step": 28990 + }, + { + "epoch": 0.58990478515625, + "grad_norm": 15.560667991638184, + "learning_rate": 9.475931725333164e-06, + "loss": 5.1948, + "step": 28995 + }, + { + "epoch": 0.5900065104166666, + "grad_norm": 20.538101196289062, + "learning_rate": 9.475753577969574e-06, + "loss": 5.2088, + "step": 29000 + }, + { + "epoch": 0.5901082356770834, + "grad_norm": 16.862751007080078, + "learning_rate": 9.475575402007337e-06, + "loss": 4.9487, + "step": 29005 + }, + { + "epoch": 0.5902099609375, + "grad_norm": 17.98019027709961, + "learning_rate": 9.475397197447596e-06, + "loss": 5.1193, + "step": 29010 + }, + { + "epoch": 0.5903116861979166, + "grad_norm": 18.11831283569336, + "learning_rate": 9.475218964291487e-06, + "loss": 4.8725, + "step": 29015 + }, + { + "epoch": 0.5904134114583334, + "grad_norm": 16.866207122802734, + "learning_rate": 9.475040702540149e-06, + "loss": 5.0105, + "step": 29020 + }, + { + "epoch": 0.59051513671875, + "grad_norm": 16.988269805908203, + "learning_rate": 9.474862412194724e-06, + "loss": 4.9524, + "step": 29025 + }, + { + "epoch": 0.5906168619791666, + "grad_norm": 15.602276802062988, + "learning_rate": 9.474684093256344e-06, + "loss": 5.1333, + "step": 29030 + }, + { + "epoch": 0.5907185872395834, + "grad_norm": 17.09202766418457, + "learning_rate": 9.474505745726158e-06, + "loss": 5.2461, + "step": 29035 + }, + { + "epoch": 0.5908203125, + "grad_norm": 17.369653701782227, + "learning_rate": 9.474327369605295e-06, + "loss": 4.9926, + "step": 29040 + }, + { + "epoch": 0.5909220377604166, + "grad_norm": 13.85522747039795, + "learning_rate": 9.474148964894905e-06, + "loss": 5.0214, + "step": 29045 + }, + { + "epoch": 0.5910237630208334, + "grad_norm": 15.626815795898438, + "learning_rate": 9.473970531596122e-06, + "loss": 4.8863, + "step": 29050 + }, + { + "epoch": 0.59112548828125, + "grad_norm": 18.702285766601562, + "learning_rate": 9.473792069710089e-06, + "loss": 5.0038, + "step": 29055 + }, + { + "epoch": 0.5912272135416666, + "grad_norm": 19.839447021484375, + "learning_rate": 9.473613579237942e-06, + "loss": 5.0821, + "step": 29060 + }, + { + "epoch": 0.5913289388020834, + "grad_norm": 11.578588485717773, + "learning_rate": 9.473435060180827e-06, + "loss": 5.2092, + "step": 29065 + }, + { + "epoch": 0.5914306640625, + "grad_norm": 16.884347915649414, + "learning_rate": 9.47325651253988e-06, + "loss": 5.0634, + "step": 29070 + }, + { + "epoch": 0.5915323893229166, + "grad_norm": 19.692800521850586, + "learning_rate": 9.473077936316245e-06, + "loss": 4.9928, + "step": 29075 + }, + { + "epoch": 0.5916341145833334, + "grad_norm": 25.234697341918945, + "learning_rate": 9.472899331511061e-06, + "loss": 4.894, + "step": 29080 + }, + { + "epoch": 0.59173583984375, + "grad_norm": 15.889342308044434, + "learning_rate": 9.472720698125471e-06, + "loss": 5.0524, + "step": 29085 + }, + { + "epoch": 0.5918375651041666, + "grad_norm": 18.652013778686523, + "learning_rate": 9.472542036160615e-06, + "loss": 5.223, + "step": 29090 + }, + { + "epoch": 0.5919392903645834, + "grad_norm": 14.692255020141602, + "learning_rate": 9.472363345617637e-06, + "loss": 5.2893, + "step": 29095 + }, + { + "epoch": 0.592041015625, + "grad_norm": 18.07635498046875, + "learning_rate": 9.472184626497674e-06, + "loss": 5.1784, + "step": 29100 + }, + { + "epoch": 0.5921427408854166, + "grad_norm": 24.977060317993164, + "learning_rate": 9.472005878801874e-06, + "loss": 5.3172, + "step": 29105 + }, + { + "epoch": 0.5922444661458334, + "grad_norm": 20.824628829956055, + "learning_rate": 9.471827102531373e-06, + "loss": 5.0929, + "step": 29110 + }, + { + "epoch": 0.59234619140625, + "grad_norm": 19.327735900878906, + "learning_rate": 9.47164829768732e-06, + "loss": 5.2501, + "step": 29115 + }, + { + "epoch": 0.5924479166666666, + "grad_norm": 19.2830867767334, + "learning_rate": 9.47146946427085e-06, + "loss": 5.553, + "step": 29120 + }, + { + "epoch": 0.5925496419270834, + "grad_norm": 16.19243812561035, + "learning_rate": 9.471290602283112e-06, + "loss": 4.9225, + "step": 29125 + }, + { + "epoch": 0.5926513671875, + "grad_norm": 13.32673168182373, + "learning_rate": 9.471111711725245e-06, + "loss": 5.1472, + "step": 29130 + }, + { + "epoch": 0.5927530924479166, + "grad_norm": 15.115610122680664, + "learning_rate": 9.470932792598394e-06, + "loss": 5.4092, + "step": 29135 + }, + { + "epoch": 0.5928548177083334, + "grad_norm": 17.601919174194336, + "learning_rate": 9.470753844903702e-06, + "loss": 5.1989, + "step": 29140 + }, + { + "epoch": 0.59295654296875, + "grad_norm": 25.830036163330078, + "learning_rate": 9.470574868642312e-06, + "loss": 5.1522, + "step": 29145 + }, + { + "epoch": 0.5930582682291666, + "grad_norm": 16.667842864990234, + "learning_rate": 9.470395863815367e-06, + "loss": 5.2654, + "step": 29150 + }, + { + "epoch": 0.5931599934895834, + "grad_norm": 15.38748550415039, + "learning_rate": 9.470216830424014e-06, + "loss": 5.2284, + "step": 29155 + }, + { + "epoch": 0.59326171875, + "grad_norm": 14.951435089111328, + "learning_rate": 9.470037768469392e-06, + "loss": 5.2106, + "step": 29160 + }, + { + "epoch": 0.5933634440104166, + "grad_norm": 24.460987091064453, + "learning_rate": 9.469858677952647e-06, + "loss": 5.1082, + "step": 29165 + }, + { + "epoch": 0.5934651692708334, + "grad_norm": 14.595767974853516, + "learning_rate": 9.469679558874926e-06, + "loss": 5.1489, + "step": 29170 + }, + { + "epoch": 0.59356689453125, + "grad_norm": 17.586124420166016, + "learning_rate": 9.46950041123737e-06, + "loss": 4.9281, + "step": 29175 + }, + { + "epoch": 0.5936686197916666, + "grad_norm": 16.957347869873047, + "learning_rate": 9.469321235041126e-06, + "loss": 5.0278, + "step": 29180 + }, + { + "epoch": 0.5937703450520834, + "grad_norm": 18.479074478149414, + "learning_rate": 9.469142030287339e-06, + "loss": 4.9913, + "step": 29185 + }, + { + "epoch": 0.5938720703125, + "grad_norm": 15.8654203414917, + "learning_rate": 9.468962796977152e-06, + "loss": 5.0846, + "step": 29190 + }, + { + "epoch": 0.5939737955729166, + "grad_norm": 14.555302619934082, + "learning_rate": 9.468783535111713e-06, + "loss": 5.1457, + "step": 29195 + }, + { + "epoch": 0.5940755208333334, + "grad_norm": 17.590383529663086, + "learning_rate": 9.468604244692164e-06, + "loss": 5.3174, + "step": 29200 + }, + { + "epoch": 0.59417724609375, + "grad_norm": 13.64831256866455, + "learning_rate": 9.468424925719653e-06, + "loss": 5.0675, + "step": 29205 + }, + { + "epoch": 0.5942789713541666, + "grad_norm": 16.723445892333984, + "learning_rate": 9.468245578195325e-06, + "loss": 5.083, + "step": 29210 + }, + { + "epoch": 0.5943806966145834, + "grad_norm": 17.8790225982666, + "learning_rate": 9.468066202120325e-06, + "loss": 5.4717, + "step": 29215 + }, + { + "epoch": 0.594482421875, + "grad_norm": 14.415895462036133, + "learning_rate": 9.467886797495802e-06, + "loss": 5.1018, + "step": 29220 + }, + { + "epoch": 0.5945841471354166, + "grad_norm": 21.40670394897461, + "learning_rate": 9.467707364322901e-06, + "loss": 5.3703, + "step": 29225 + }, + { + "epoch": 0.5946858723958334, + "grad_norm": 13.32238483428955, + "learning_rate": 9.467527902602767e-06, + "loss": 4.8979, + "step": 29230 + }, + { + "epoch": 0.59478759765625, + "grad_norm": 14.44276237487793, + "learning_rate": 9.467348412336548e-06, + "loss": 5.0291, + "step": 29235 + }, + { + "epoch": 0.5948893229166666, + "grad_norm": 18.29779815673828, + "learning_rate": 9.467168893525392e-06, + "loss": 4.9109, + "step": 29240 + }, + { + "epoch": 0.5949910481770834, + "grad_norm": 19.05511474609375, + "learning_rate": 9.466989346170444e-06, + "loss": 5.3557, + "step": 29245 + }, + { + "epoch": 0.5950927734375, + "grad_norm": 16.262346267700195, + "learning_rate": 9.466809770272852e-06, + "loss": 4.9371, + "step": 29250 + }, + { + "epoch": 0.5951944986979166, + "grad_norm": 20.17840576171875, + "learning_rate": 9.466630165833763e-06, + "loss": 4.9583, + "step": 29255 + }, + { + "epoch": 0.5952962239583334, + "grad_norm": 14.507734298706055, + "learning_rate": 9.466450532854324e-06, + "loss": 5.2575, + "step": 29260 + }, + { + "epoch": 0.59539794921875, + "grad_norm": 20.761672973632812, + "learning_rate": 9.466270871335685e-06, + "loss": 4.8227, + "step": 29265 + }, + { + "epoch": 0.5954996744791666, + "grad_norm": 16.75705337524414, + "learning_rate": 9.466091181278994e-06, + "loss": 5.41, + "step": 29270 + }, + { + "epoch": 0.5956013997395834, + "grad_norm": 18.932798385620117, + "learning_rate": 9.465911462685398e-06, + "loss": 5.1427, + "step": 29275 + }, + { + "epoch": 0.595703125, + "grad_norm": 19.621610641479492, + "learning_rate": 9.465731715556044e-06, + "loss": 4.9222, + "step": 29280 + }, + { + "epoch": 0.5958048502604166, + "grad_norm": 14.218145370483398, + "learning_rate": 9.465551939892084e-06, + "loss": 5.2987, + "step": 29285 + }, + { + "epoch": 0.5959065755208334, + "grad_norm": 20.53316307067871, + "learning_rate": 9.465372135694664e-06, + "loss": 5.1479, + "step": 29290 + }, + { + "epoch": 0.59600830078125, + "grad_norm": 18.40032386779785, + "learning_rate": 9.465192302964932e-06, + "loss": 4.9307, + "step": 29295 + }, + { + "epoch": 0.5961100260416666, + "grad_norm": 16.871227264404297, + "learning_rate": 9.46501244170404e-06, + "loss": 4.9949, + "step": 29300 + }, + { + "epoch": 0.5962117513020834, + "grad_norm": 17.926237106323242, + "learning_rate": 9.464832551913135e-06, + "loss": 5.1453, + "step": 29305 + }, + { + "epoch": 0.5963134765625, + "grad_norm": 22.42436408996582, + "learning_rate": 9.464652633593368e-06, + "loss": 5.0038, + "step": 29310 + }, + { + "epoch": 0.5964152018229166, + "grad_norm": 11.673129081726074, + "learning_rate": 9.46447268674589e-06, + "loss": 5.1851, + "step": 29315 + }, + { + "epoch": 0.5965169270833334, + "grad_norm": 13.493513107299805, + "learning_rate": 9.464292711371845e-06, + "loss": 5.0068, + "step": 29320 + }, + { + "epoch": 0.59661865234375, + "grad_norm": 12.66624641418457, + "learning_rate": 9.464112707472388e-06, + "loss": 5.265, + "step": 29325 + }, + { + "epoch": 0.5967203776041666, + "grad_norm": 20.63812828063965, + "learning_rate": 9.463932675048669e-06, + "loss": 5.1426, + "step": 29330 + }, + { + "epoch": 0.5968221028645834, + "grad_norm": 21.25363540649414, + "learning_rate": 9.463752614101837e-06, + "loss": 5.2093, + "step": 29335 + }, + { + "epoch": 0.596923828125, + "grad_norm": 25.85942268371582, + "learning_rate": 9.463572524633043e-06, + "loss": 5.821, + "step": 29340 + }, + { + "epoch": 0.5970255533854166, + "grad_norm": 16.48832130432129, + "learning_rate": 9.463392406643435e-06, + "loss": 5.0994, + "step": 29345 + }, + { + "epoch": 0.5971272786458334, + "grad_norm": 18.22324562072754, + "learning_rate": 9.46321226013417e-06, + "loss": 5.0419, + "step": 29350 + }, + { + "epoch": 0.59722900390625, + "grad_norm": 17.521703720092773, + "learning_rate": 9.463032085106393e-06, + "loss": 5.1705, + "step": 29355 + }, + { + "epoch": 0.5973307291666666, + "grad_norm": 12.910812377929688, + "learning_rate": 9.462851881561257e-06, + "loss": 4.915, + "step": 29360 + }, + { + "epoch": 0.5974324544270834, + "grad_norm": 14.25730037689209, + "learning_rate": 9.462671649499916e-06, + "loss": 5.136, + "step": 29365 + }, + { + "epoch": 0.5975341796875, + "grad_norm": 20.064258575439453, + "learning_rate": 9.46249138892352e-06, + "loss": 5.1366, + "step": 29370 + }, + { + "epoch": 0.5976359049479166, + "grad_norm": 16.281631469726562, + "learning_rate": 9.462311099833218e-06, + "loss": 4.9219, + "step": 29375 + }, + { + "epoch": 0.5977376302083334, + "grad_norm": 15.533080101013184, + "learning_rate": 9.462130782230167e-06, + "loss": 4.9476, + "step": 29380 + }, + { + "epoch": 0.59783935546875, + "grad_norm": 17.408451080322266, + "learning_rate": 9.461950436115516e-06, + "loss": 5.14, + "step": 29385 + }, + { + "epoch": 0.5979410807291666, + "grad_norm": 20.564483642578125, + "learning_rate": 9.461770061490419e-06, + "loss": 5.0265, + "step": 29390 + }, + { + "epoch": 0.5980428059895834, + "grad_norm": 16.8858699798584, + "learning_rate": 9.461589658356027e-06, + "loss": 5.2022, + "step": 29395 + }, + { + "epoch": 0.59814453125, + "grad_norm": 19.16018295288086, + "learning_rate": 9.461409226713493e-06, + "loss": 4.8063, + "step": 29400 + }, + { + "epoch": 0.5982462565104166, + "grad_norm": 18.14073371887207, + "learning_rate": 9.461228766563972e-06, + "loss": 5.1032, + "step": 29405 + }, + { + "epoch": 0.5983479817708334, + "grad_norm": 18.463220596313477, + "learning_rate": 9.461048277908614e-06, + "loss": 5.0139, + "step": 29410 + }, + { + "epoch": 0.59844970703125, + "grad_norm": 22.190372467041016, + "learning_rate": 9.460867760748573e-06, + "loss": 4.962, + "step": 29415 + }, + { + "epoch": 0.5985514322916666, + "grad_norm": 19.593143463134766, + "learning_rate": 9.460687215085005e-06, + "loss": 5.2828, + "step": 29420 + }, + { + "epoch": 0.5986531575520834, + "grad_norm": 19.17954444885254, + "learning_rate": 9.46050664091906e-06, + "loss": 5.4054, + "step": 29425 + }, + { + "epoch": 0.5987548828125, + "grad_norm": 17.7825927734375, + "learning_rate": 9.460326038251895e-06, + "loss": 5.0959, + "step": 29430 + }, + { + "epoch": 0.5988566080729166, + "grad_norm": 14.41823959350586, + "learning_rate": 9.46014540708466e-06, + "loss": 4.9442, + "step": 29435 + }, + { + "epoch": 0.5989583333333334, + "grad_norm": 17.188737869262695, + "learning_rate": 9.459964747418514e-06, + "loss": 4.9072, + "step": 29440 + }, + { + "epoch": 0.59906005859375, + "grad_norm": 16.31683921813965, + "learning_rate": 9.459784059254608e-06, + "loss": 5.3598, + "step": 29445 + }, + { + "epoch": 0.5991617838541666, + "grad_norm": 27.50450897216797, + "learning_rate": 9.459603342594098e-06, + "loss": 5.6414, + "step": 29450 + }, + { + "epoch": 0.5992635091145834, + "grad_norm": 18.4915828704834, + "learning_rate": 9.45942259743814e-06, + "loss": 5.0971, + "step": 29455 + }, + { + "epoch": 0.599365234375, + "grad_norm": 17.359663009643555, + "learning_rate": 9.459241823787884e-06, + "loss": 5.3928, + "step": 29460 + }, + { + "epoch": 0.5994669596354166, + "grad_norm": 17.303646087646484, + "learning_rate": 9.45906102164449e-06, + "loss": 4.7576, + "step": 29465 + }, + { + "epoch": 0.5995686848958334, + "grad_norm": 15.215665817260742, + "learning_rate": 9.458880191009112e-06, + "loss": 5.1286, + "step": 29470 + }, + { + "epoch": 0.59967041015625, + "grad_norm": 24.24822235107422, + "learning_rate": 9.458699331882903e-06, + "loss": 4.8809, + "step": 29475 + }, + { + "epoch": 0.5997721354166666, + "grad_norm": 17.70667839050293, + "learning_rate": 9.458518444267023e-06, + "loss": 5.2722, + "step": 29480 + }, + { + "epoch": 0.5998738606770834, + "grad_norm": 16.50320053100586, + "learning_rate": 9.458337528162624e-06, + "loss": 5.34, + "step": 29485 + }, + { + "epoch": 0.5999755859375, + "grad_norm": 17.232059478759766, + "learning_rate": 9.458156583570864e-06, + "loss": 5.2514, + "step": 29490 + }, + { + "epoch": 0.6000773111979166, + "grad_norm": 16.101295471191406, + "learning_rate": 9.4579756104929e-06, + "loss": 5.0724, + "step": 29495 + }, + { + "epoch": 0.6001790364583334, + "grad_norm": 19.069639205932617, + "learning_rate": 9.457794608929885e-06, + "loss": 5.1468, + "step": 29500 + }, + { + "epoch": 0.60028076171875, + "grad_norm": 23.85268783569336, + "learning_rate": 9.457613578882977e-06, + "loss": 5.5177, + "step": 29505 + }, + { + "epoch": 0.6003824869791666, + "grad_norm": 14.416375160217285, + "learning_rate": 9.457432520353335e-06, + "loss": 5.3504, + "step": 29510 + }, + { + "epoch": 0.6004842122395834, + "grad_norm": 17.983074188232422, + "learning_rate": 9.457251433342113e-06, + "loss": 5.046, + "step": 29515 + }, + { + "epoch": 0.6005859375, + "grad_norm": 16.935468673706055, + "learning_rate": 9.45707031785047e-06, + "loss": 5.0376, + "step": 29520 + }, + { + "epoch": 0.6006876627604166, + "grad_norm": 14.890877723693848, + "learning_rate": 9.456889173879563e-06, + "loss": 5.1435, + "step": 29525 + }, + { + "epoch": 0.6007893880208334, + "grad_norm": 12.901652336120605, + "learning_rate": 9.456708001430548e-06, + "loss": 5.1835, + "step": 29530 + }, + { + "epoch": 0.60089111328125, + "grad_norm": 18.17858123779297, + "learning_rate": 9.456526800504583e-06, + "loss": 5.443, + "step": 29535 + }, + { + "epoch": 0.6009928385416666, + "grad_norm": 14.849645614624023, + "learning_rate": 9.456345571102828e-06, + "loss": 5.2219, + "step": 29540 + }, + { + "epoch": 0.6010945638020834, + "grad_norm": 15.672321319580078, + "learning_rate": 9.456164313226439e-06, + "loss": 4.8393, + "step": 29545 + }, + { + "epoch": 0.6011962890625, + "grad_norm": 16.567405700683594, + "learning_rate": 9.455983026876575e-06, + "loss": 5.3872, + "step": 29550 + }, + { + "epoch": 0.6012980143229166, + "grad_norm": 23.754438400268555, + "learning_rate": 9.455801712054393e-06, + "loss": 4.9072, + "step": 29555 + }, + { + "epoch": 0.6013997395833334, + "grad_norm": 23.306243896484375, + "learning_rate": 9.455620368761053e-06, + "loss": 5.2992, + "step": 29560 + }, + { + "epoch": 0.60150146484375, + "grad_norm": 20.81386947631836, + "learning_rate": 9.455438996997713e-06, + "loss": 5.1863, + "step": 29565 + }, + { + "epoch": 0.6016031901041666, + "grad_norm": 20.142213821411133, + "learning_rate": 9.455257596765533e-06, + "loss": 4.8743, + "step": 29570 + }, + { + "epoch": 0.6017049153645834, + "grad_norm": 17.432281494140625, + "learning_rate": 9.455076168065668e-06, + "loss": 5.2119, + "step": 29575 + }, + { + "epoch": 0.601806640625, + "grad_norm": 15.983949661254883, + "learning_rate": 9.454894710899284e-06, + "loss": 5.185, + "step": 29580 + }, + { + "epoch": 0.6019083658854166, + "grad_norm": 14.107561111450195, + "learning_rate": 9.454713225267537e-06, + "loss": 5.2187, + "step": 29585 + }, + { + "epoch": 0.6020100911458334, + "grad_norm": 15.188472747802734, + "learning_rate": 9.454531711171585e-06, + "loss": 5.1995, + "step": 29590 + }, + { + "epoch": 0.60211181640625, + "grad_norm": 22.878217697143555, + "learning_rate": 9.45435016861259e-06, + "loss": 5.3296, + "step": 29595 + }, + { + "epoch": 0.6022135416666666, + "grad_norm": 17.25347328186035, + "learning_rate": 9.454168597591711e-06, + "loss": 5.0439, + "step": 29600 + }, + { + "epoch": 0.6023152669270834, + "grad_norm": 15.007644653320312, + "learning_rate": 9.45398699811011e-06, + "loss": 5.2323, + "step": 29605 + }, + { + "epoch": 0.6024169921875, + "grad_norm": 17.394580841064453, + "learning_rate": 9.453805370168944e-06, + "loss": 5.184, + "step": 29610 + }, + { + "epoch": 0.6025187174479166, + "grad_norm": 14.692073822021484, + "learning_rate": 9.453623713769377e-06, + "loss": 4.8685, + "step": 29615 + }, + { + "epoch": 0.6026204427083334, + "grad_norm": 18.025924682617188, + "learning_rate": 9.453442028912568e-06, + "loss": 5.0106, + "step": 29620 + }, + { + "epoch": 0.60272216796875, + "grad_norm": 21.348180770874023, + "learning_rate": 9.453260315599679e-06, + "loss": 5.3509, + "step": 29625 + }, + { + "epoch": 0.6028238932291666, + "grad_norm": 29.551116943359375, + "learning_rate": 9.453078573831868e-06, + "loss": 5.2205, + "step": 29630 + }, + { + "epoch": 0.6029256184895834, + "grad_norm": 14.53877067565918, + "learning_rate": 9.452896803610301e-06, + "loss": 5.2737, + "step": 29635 + }, + { + "epoch": 0.60302734375, + "grad_norm": 17.55487060546875, + "learning_rate": 9.452715004936135e-06, + "loss": 5.1214, + "step": 29640 + }, + { + "epoch": 0.6031290690104166, + "grad_norm": 16.552576065063477, + "learning_rate": 9.452533177810535e-06, + "loss": 4.9567, + "step": 29645 + }, + { + "epoch": 0.6032307942708334, + "grad_norm": 17.145788192749023, + "learning_rate": 9.452351322234661e-06, + "loss": 5.0358, + "step": 29650 + }, + { + "epoch": 0.60333251953125, + "grad_norm": 17.191434860229492, + "learning_rate": 9.452169438209674e-06, + "loss": 5.043, + "step": 29655 + }, + { + "epoch": 0.6034342447916666, + "grad_norm": 14.778326034545898, + "learning_rate": 9.451987525736738e-06, + "loss": 4.9904, + "step": 29660 + }, + { + "epoch": 0.6035359700520834, + "grad_norm": 14.919686317443848, + "learning_rate": 9.451805584817017e-06, + "loss": 5.2545, + "step": 29665 + }, + { + "epoch": 0.6036376953125, + "grad_norm": 17.424047470092773, + "learning_rate": 9.45162361545167e-06, + "loss": 5.1636, + "step": 29670 + }, + { + "epoch": 0.6037394205729166, + "grad_norm": 20.187541961669922, + "learning_rate": 9.45144161764186e-06, + "loss": 5.3417, + "step": 29675 + }, + { + "epoch": 0.6038411458333334, + "grad_norm": 15.385308265686035, + "learning_rate": 9.451259591388754e-06, + "loss": 5.3158, + "step": 29680 + }, + { + "epoch": 0.60394287109375, + "grad_norm": 22.990623474121094, + "learning_rate": 9.451077536693511e-06, + "loss": 5.0233, + "step": 29685 + }, + { + "epoch": 0.6040445963541666, + "grad_norm": 21.615121841430664, + "learning_rate": 9.450895453557294e-06, + "loss": 5.2024, + "step": 29690 + }, + { + "epoch": 0.6041463216145834, + "grad_norm": 21.063190460205078, + "learning_rate": 9.45071334198127e-06, + "loss": 5.0518, + "step": 29695 + }, + { + "epoch": 0.604248046875, + "grad_norm": 19.18638038635254, + "learning_rate": 9.450531201966599e-06, + "loss": 4.9373, + "step": 29700 + }, + { + "epoch": 0.6043497721354166, + "grad_norm": 14.216859817504883, + "learning_rate": 9.450349033514448e-06, + "loss": 5.2525, + "step": 29705 + }, + { + "epoch": 0.6044514973958334, + "grad_norm": 14.132561683654785, + "learning_rate": 9.450166836625979e-06, + "loss": 5.1306, + "step": 29710 + }, + { + "epoch": 0.60455322265625, + "grad_norm": 17.671953201293945, + "learning_rate": 9.449984611302356e-06, + "loss": 5.201, + "step": 29715 + }, + { + "epoch": 0.6046549479166666, + "grad_norm": 13.87009334564209, + "learning_rate": 9.449802357544744e-06, + "loss": 5.1161, + "step": 29720 + }, + { + "epoch": 0.6047566731770834, + "grad_norm": 16.956315994262695, + "learning_rate": 9.449620075354307e-06, + "loss": 5.0799, + "step": 29725 + }, + { + "epoch": 0.6048583984375, + "grad_norm": 19.206558227539062, + "learning_rate": 9.449437764732212e-06, + "loss": 4.8526, + "step": 29730 + }, + { + "epoch": 0.6049601236979166, + "grad_norm": 25.223758697509766, + "learning_rate": 9.449255425679621e-06, + "loss": 5.3083, + "step": 29735 + }, + { + "epoch": 0.6050618489583334, + "grad_norm": 16.82227897644043, + "learning_rate": 9.4490730581977e-06, + "loss": 5.381, + "step": 29740 + }, + { + "epoch": 0.60516357421875, + "grad_norm": 17.46430778503418, + "learning_rate": 9.448890662287616e-06, + "loss": 5.0943, + "step": 29745 + }, + { + "epoch": 0.6052652994791666, + "grad_norm": 19.578628540039062, + "learning_rate": 9.44870823795053e-06, + "loss": 5.3585, + "step": 29750 + }, + { + "epoch": 0.6053670247395834, + "grad_norm": 17.394187927246094, + "learning_rate": 9.448525785187612e-06, + "loss": 5.0987, + "step": 29755 + }, + { + "epoch": 0.60546875, + "grad_norm": 15.015830993652344, + "learning_rate": 9.448343304000026e-06, + "loss": 4.9849, + "step": 29760 + }, + { + "epoch": 0.6055704752604166, + "grad_norm": 18.481998443603516, + "learning_rate": 9.448160794388941e-06, + "loss": 5.1187, + "step": 29765 + }, + { + "epoch": 0.6056722005208334, + "grad_norm": 20.70869255065918, + "learning_rate": 9.447978256355517e-06, + "loss": 5.5711, + "step": 29770 + }, + { + "epoch": 0.60577392578125, + "grad_norm": 16.567066192626953, + "learning_rate": 9.447795689900927e-06, + "loss": 5.5525, + "step": 29775 + }, + { + "epoch": 0.6058756510416666, + "grad_norm": 13.289424896240234, + "learning_rate": 9.447613095026332e-06, + "loss": 5.044, + "step": 29780 + }, + { + "epoch": 0.6059773763020834, + "grad_norm": 14.876283645629883, + "learning_rate": 9.447430471732902e-06, + "loss": 5.2146, + "step": 29785 + }, + { + "epoch": 0.6060791015625, + "grad_norm": 16.087770462036133, + "learning_rate": 9.447247820021804e-06, + "loss": 5.1252, + "step": 29790 + }, + { + "epoch": 0.6061808268229166, + "grad_norm": 14.355435371398926, + "learning_rate": 9.447065139894202e-06, + "loss": 5.0835, + "step": 29795 + }, + { + "epoch": 0.6062825520833334, + "grad_norm": 17.288965225219727, + "learning_rate": 9.446882431351268e-06, + "loss": 4.9878, + "step": 29800 + }, + { + "epoch": 0.60638427734375, + "grad_norm": 21.871946334838867, + "learning_rate": 9.446699694394166e-06, + "loss": 5.2926, + "step": 29805 + }, + { + "epoch": 0.6064860026041666, + "grad_norm": 17.218799591064453, + "learning_rate": 9.446516929024063e-06, + "loss": 5.1119, + "step": 29810 + }, + { + "epoch": 0.6065877278645834, + "grad_norm": 23.95709991455078, + "learning_rate": 9.44633413524213e-06, + "loss": 5.1785, + "step": 29815 + }, + { + "epoch": 0.606689453125, + "grad_norm": 13.81667709350586, + "learning_rate": 9.446151313049534e-06, + "loss": 4.8679, + "step": 29820 + }, + { + "epoch": 0.6067911783854166, + "grad_norm": 15.903095245361328, + "learning_rate": 9.445968462447441e-06, + "loss": 5.0727, + "step": 29825 + }, + { + "epoch": 0.6068929036458334, + "grad_norm": 13.873254776000977, + "learning_rate": 9.445785583437022e-06, + "loss": 4.9256, + "step": 29830 + }, + { + "epoch": 0.60699462890625, + "grad_norm": 15.73450756072998, + "learning_rate": 9.445602676019443e-06, + "loss": 5.0291, + "step": 29835 + }, + { + "epoch": 0.6070963541666666, + "grad_norm": 19.181285858154297, + "learning_rate": 9.445419740195875e-06, + "loss": 5.2027, + "step": 29840 + }, + { + "epoch": 0.6071980794270834, + "grad_norm": 14.833454132080078, + "learning_rate": 9.445236775967485e-06, + "loss": 4.8369, + "step": 29845 + }, + { + "epoch": 0.6072998046875, + "grad_norm": 19.5732364654541, + "learning_rate": 9.445053783335445e-06, + "loss": 5.2842, + "step": 29850 + }, + { + "epoch": 0.6074015299479166, + "grad_norm": 16.198394775390625, + "learning_rate": 9.44487076230092e-06, + "loss": 4.8853, + "step": 29855 + }, + { + "epoch": 0.6075032552083334, + "grad_norm": 19.913394927978516, + "learning_rate": 9.444687712865084e-06, + "loss": 5.1163, + "step": 29860 + }, + { + "epoch": 0.60760498046875, + "grad_norm": 19.396657943725586, + "learning_rate": 9.444504635029103e-06, + "loss": 5.2523, + "step": 29865 + }, + { + "epoch": 0.6077067057291666, + "grad_norm": 14.36654281616211, + "learning_rate": 9.444321528794149e-06, + "loss": 5.092, + "step": 29870 + }, + { + "epoch": 0.6078084309895834, + "grad_norm": 17.03780174255371, + "learning_rate": 9.44413839416139e-06, + "loss": 4.8827, + "step": 29875 + }, + { + "epoch": 0.60791015625, + "grad_norm": 22.540653228759766, + "learning_rate": 9.443955231132001e-06, + "loss": 5.0049, + "step": 29880 + }, + { + "epoch": 0.6080118815104166, + "grad_norm": 26.17487335205078, + "learning_rate": 9.443772039707145e-06, + "loss": 5.2523, + "step": 29885 + }, + { + "epoch": 0.6081136067708334, + "grad_norm": 17.125469207763672, + "learning_rate": 9.443588819887999e-06, + "loss": 4.977, + "step": 29890 + }, + { + "epoch": 0.60821533203125, + "grad_norm": 18.564533233642578, + "learning_rate": 9.443405571675728e-06, + "loss": 5.1591, + "step": 29895 + }, + { + "epoch": 0.6083170572916666, + "grad_norm": 18.553434371948242, + "learning_rate": 9.443222295071508e-06, + "loss": 5.2527, + "step": 29900 + }, + { + "epoch": 0.6084187825520834, + "grad_norm": 14.480544090270996, + "learning_rate": 9.443038990076507e-06, + "loss": 5.0646, + "step": 29905 + }, + { + "epoch": 0.6085205078125, + "grad_norm": 23.033422470092773, + "learning_rate": 9.442855656691899e-06, + "loss": 4.9849, + "step": 29910 + }, + { + "epoch": 0.6086222330729166, + "grad_norm": 24.48007583618164, + "learning_rate": 9.442672294918853e-06, + "loss": 5.1427, + "step": 29915 + }, + { + "epoch": 0.6087239583333334, + "grad_norm": 18.95452308654785, + "learning_rate": 9.44248890475854e-06, + "loss": 5.1007, + "step": 29920 + }, + { + "epoch": 0.60882568359375, + "grad_norm": 16.268390655517578, + "learning_rate": 9.442305486212133e-06, + "loss": 4.9067, + "step": 29925 + }, + { + "epoch": 0.6089274088541666, + "grad_norm": 15.530526161193848, + "learning_rate": 9.442122039280805e-06, + "loss": 4.9932, + "step": 29930 + }, + { + "epoch": 0.6090291341145834, + "grad_norm": 20.603994369506836, + "learning_rate": 9.44193856396573e-06, + "loss": 5.1012, + "step": 29935 + }, + { + "epoch": 0.609130859375, + "grad_norm": 18.108606338500977, + "learning_rate": 9.441755060268075e-06, + "loss": 5.0638, + "step": 29940 + }, + { + "epoch": 0.6092325846354166, + "grad_norm": 13.917309761047363, + "learning_rate": 9.441571528189012e-06, + "loss": 5.1926, + "step": 29945 + }, + { + "epoch": 0.6093343098958334, + "grad_norm": 16.161029815673828, + "learning_rate": 9.441387967729723e-06, + "loss": 4.9626, + "step": 29950 + }, + { + "epoch": 0.60943603515625, + "grad_norm": 15.724774360656738, + "learning_rate": 9.44120437889137e-06, + "loss": 4.8806, + "step": 29955 + }, + { + "epoch": 0.6095377604166666, + "grad_norm": 16.355905532836914, + "learning_rate": 9.441020761675135e-06, + "loss": 5.0068, + "step": 29960 + }, + { + "epoch": 0.6096394856770834, + "grad_norm": 15.561532020568848, + "learning_rate": 9.440837116082183e-06, + "loss": 4.98, + "step": 29965 + }, + { + "epoch": 0.6097412109375, + "grad_norm": 16.127573013305664, + "learning_rate": 9.440653442113694e-06, + "loss": 5.153, + "step": 29970 + }, + { + "epoch": 0.6098429361979166, + "grad_norm": 18.03191375732422, + "learning_rate": 9.440469739770838e-06, + "loss": 4.9314, + "step": 29975 + }, + { + "epoch": 0.6099446614583334, + "grad_norm": 23.534549713134766, + "learning_rate": 9.44028600905479e-06, + "loss": 5.1753, + "step": 29980 + }, + { + "epoch": 0.61004638671875, + "grad_norm": 13.253257751464844, + "learning_rate": 9.440102249966724e-06, + "loss": 5.2094, + "step": 29985 + }, + { + "epoch": 0.6101481119791666, + "grad_norm": 18.71681785583496, + "learning_rate": 9.439918462507816e-06, + "loss": 5.2004, + "step": 29990 + }, + { + "epoch": 0.6102498372395834, + "grad_norm": 16.138835906982422, + "learning_rate": 9.439734646679237e-06, + "loss": 5.0458, + "step": 29995 + }, + { + "epoch": 0.6103515625, + "grad_norm": 17.620332717895508, + "learning_rate": 9.439550802482162e-06, + "loss": 5.1111, + "step": 30000 + }, + { + "epoch": 0.6104532877604166, + "grad_norm": 16.302846908569336, + "learning_rate": 9.439366929917767e-06, + "loss": 5.1794, + "step": 30005 + }, + { + "epoch": 0.6105550130208334, + "grad_norm": 16.625473022460938, + "learning_rate": 9.439183028987227e-06, + "loss": 5.0263, + "step": 30010 + }, + { + "epoch": 0.61065673828125, + "grad_norm": 26.094572067260742, + "learning_rate": 9.438999099691718e-06, + "loss": 5.1588, + "step": 30015 + }, + { + "epoch": 0.6107584635416666, + "grad_norm": 15.999886512756348, + "learning_rate": 9.438815142032412e-06, + "loss": 5.3597, + "step": 30020 + }, + { + "epoch": 0.6108601888020834, + "grad_norm": 15.716045379638672, + "learning_rate": 9.438631156010487e-06, + "loss": 4.9042, + "step": 30025 + }, + { + "epoch": 0.6109619140625, + "grad_norm": 17.17597770690918, + "learning_rate": 9.438447141627119e-06, + "loss": 4.9907, + "step": 30030 + }, + { + "epoch": 0.6110636393229166, + "grad_norm": 16.150800704956055, + "learning_rate": 9.438263098883481e-06, + "loss": 5.1584, + "step": 30035 + }, + { + "epoch": 0.6111653645833334, + "grad_norm": 22.343969345092773, + "learning_rate": 9.43807902778075e-06, + "loss": 5.3936, + "step": 30040 + }, + { + "epoch": 0.61126708984375, + "grad_norm": 18.01499366760254, + "learning_rate": 9.437894928320106e-06, + "loss": 4.9587, + "step": 30045 + }, + { + "epoch": 0.6113688151041666, + "grad_norm": 14.851211547851562, + "learning_rate": 9.437710800502719e-06, + "loss": 5.2508, + "step": 30050 + }, + { + "epoch": 0.6114705403645834, + "grad_norm": 15.653059959411621, + "learning_rate": 9.43752664432977e-06, + "loss": 5.1638, + "step": 30055 + }, + { + "epoch": 0.611572265625, + "grad_norm": 15.424692153930664, + "learning_rate": 9.437342459802435e-06, + "loss": 5.0973, + "step": 30060 + }, + { + "epoch": 0.6116739908854166, + "grad_norm": 16.80962562561035, + "learning_rate": 9.43715824692189e-06, + "loss": 5.1561, + "step": 30065 + }, + { + "epoch": 0.6117757161458334, + "grad_norm": 12.680726051330566, + "learning_rate": 9.436974005689312e-06, + "loss": 5.3372, + "step": 30070 + }, + { + "epoch": 0.61187744140625, + "grad_norm": 20.039262771606445, + "learning_rate": 9.436789736105877e-06, + "loss": 4.9788, + "step": 30075 + }, + { + "epoch": 0.6119791666666666, + "grad_norm": 15.68493366241455, + "learning_rate": 9.436605438172768e-06, + "loss": 5.2617, + "step": 30080 + }, + { + "epoch": 0.6120808919270834, + "grad_norm": 16.59848976135254, + "learning_rate": 9.436421111891156e-06, + "loss": 5.4871, + "step": 30085 + }, + { + "epoch": 0.6121826171875, + "grad_norm": 15.56002140045166, + "learning_rate": 9.436236757262222e-06, + "loss": 4.909, + "step": 30090 + }, + { + "epoch": 0.6122843424479166, + "grad_norm": 19.457883834838867, + "learning_rate": 9.436052374287142e-06, + "loss": 4.9837, + "step": 30095 + }, + { + "epoch": 0.6123860677083334, + "grad_norm": 18.001405715942383, + "learning_rate": 9.435867962967098e-06, + "loss": 5.0411, + "step": 30100 + }, + { + "epoch": 0.61248779296875, + "grad_norm": 17.905866622924805, + "learning_rate": 9.435683523303262e-06, + "loss": 5.1766, + "step": 30105 + }, + { + "epoch": 0.6125895182291666, + "grad_norm": 13.752098083496094, + "learning_rate": 9.43549905529682e-06, + "loss": 5.0506, + "step": 30110 + }, + { + "epoch": 0.6126912434895834, + "grad_norm": 19.333175659179688, + "learning_rate": 9.435314558948945e-06, + "loss": 4.9119, + "step": 30115 + }, + { + "epoch": 0.61279296875, + "grad_norm": 15.183969497680664, + "learning_rate": 9.43513003426082e-06, + "loss": 4.9493, + "step": 30120 + }, + { + "epoch": 0.6128946940104166, + "grad_norm": 17.020259857177734, + "learning_rate": 9.434945481233622e-06, + "loss": 4.9404, + "step": 30125 + }, + { + "epoch": 0.6129964192708334, + "grad_norm": 17.226659774780273, + "learning_rate": 9.434760899868529e-06, + "loss": 5.2473, + "step": 30130 + }, + { + "epoch": 0.61309814453125, + "grad_norm": 16.14830780029297, + "learning_rate": 9.43457629016672e-06, + "loss": 5.0429, + "step": 30135 + }, + { + "epoch": 0.6131998697916666, + "grad_norm": 18.131013870239258, + "learning_rate": 9.434391652129378e-06, + "loss": 5.0992, + "step": 30140 + }, + { + "epoch": 0.6133015950520834, + "grad_norm": 16.722476959228516, + "learning_rate": 9.43420698575768e-06, + "loss": 4.921, + "step": 30145 + }, + { + "epoch": 0.6134033203125, + "grad_norm": 15.93204116821289, + "learning_rate": 9.43402229105281e-06, + "loss": 5.1835, + "step": 30150 + }, + { + "epoch": 0.6135050455729166, + "grad_norm": 15.538992881774902, + "learning_rate": 9.433837568015943e-06, + "loss": 5.0452, + "step": 30155 + }, + { + "epoch": 0.6136067708333334, + "grad_norm": 13.906254768371582, + "learning_rate": 9.43365281664826e-06, + "loss": 5.1692, + "step": 30160 + }, + { + "epoch": 0.61370849609375, + "grad_norm": 15.914956092834473, + "learning_rate": 9.433468036950944e-06, + "loss": 4.9651, + "step": 30165 + }, + { + "epoch": 0.6138102213541666, + "grad_norm": 18.077341079711914, + "learning_rate": 9.433283228925176e-06, + "loss": 5.0032, + "step": 30170 + }, + { + "epoch": 0.6139119466145834, + "grad_norm": 16.759851455688477, + "learning_rate": 9.433098392572133e-06, + "loss": 5.0242, + "step": 30175 + }, + { + "epoch": 0.614013671875, + "grad_norm": 20.015052795410156, + "learning_rate": 9.432913527893001e-06, + "loss": 5.2663, + "step": 30180 + }, + { + "epoch": 0.6141153971354166, + "grad_norm": 15.61217975616455, + "learning_rate": 9.432728634888957e-06, + "loss": 5.2324, + "step": 30185 + }, + { + "epoch": 0.6142171223958334, + "grad_norm": 13.500385284423828, + "learning_rate": 9.432543713561184e-06, + "loss": 5.1745, + "step": 30190 + }, + { + "epoch": 0.61431884765625, + "grad_norm": 14.731355667114258, + "learning_rate": 9.432358763910866e-06, + "loss": 4.9679, + "step": 30195 + }, + { + "epoch": 0.6144205729166666, + "grad_norm": 16.518489837646484, + "learning_rate": 9.43217378593918e-06, + "loss": 5.479, + "step": 30200 + }, + { + "epoch": 0.6145222981770834, + "grad_norm": 16.65255355834961, + "learning_rate": 9.431988779647312e-06, + "loss": 5.1032, + "step": 30205 + }, + { + "epoch": 0.6146240234375, + "grad_norm": 17.616317749023438, + "learning_rate": 9.431803745036443e-06, + "loss": 5.1261, + "step": 30210 + }, + { + "epoch": 0.6147257486979166, + "grad_norm": 19.672822952270508, + "learning_rate": 9.431618682107754e-06, + "loss": 5.2695, + "step": 30215 + }, + { + "epoch": 0.6148274739583334, + "grad_norm": 22.520509719848633, + "learning_rate": 9.43143359086243e-06, + "loss": 5.1248, + "step": 30220 + }, + { + "epoch": 0.61492919921875, + "grad_norm": 16.721330642700195, + "learning_rate": 9.431248471301648e-06, + "loss": 4.9657, + "step": 30225 + }, + { + "epoch": 0.6150309244791666, + "grad_norm": 18.021921157836914, + "learning_rate": 9.431063323426597e-06, + "loss": 5.1136, + "step": 30230 + }, + { + "epoch": 0.6151326497395834, + "grad_norm": 19.182649612426758, + "learning_rate": 9.43087814723846e-06, + "loss": 5.0397, + "step": 30235 + }, + { + "epoch": 0.615234375, + "grad_norm": 17.27953338623047, + "learning_rate": 9.430692942738416e-06, + "loss": 5.0242, + "step": 30240 + }, + { + "epoch": 0.6153361002604166, + "grad_norm": 18.175182342529297, + "learning_rate": 9.430507709927652e-06, + "loss": 4.8609, + "step": 30245 + }, + { + "epoch": 0.6154378255208334, + "grad_norm": 14.878602027893066, + "learning_rate": 9.430322448807348e-06, + "loss": 5.2346, + "step": 30250 + }, + { + "epoch": 0.61553955078125, + "grad_norm": 17.02460289001465, + "learning_rate": 9.430137159378691e-06, + "loss": 4.8691, + "step": 30255 + }, + { + "epoch": 0.6156412760416666, + "grad_norm": 15.625641822814941, + "learning_rate": 9.429951841642865e-06, + "loss": 5.0668, + "step": 30260 + }, + { + "epoch": 0.6157430013020834, + "grad_norm": 21.861268997192383, + "learning_rate": 9.429766495601052e-06, + "loss": 5.1807, + "step": 30265 + }, + { + "epoch": 0.6158447265625, + "grad_norm": 29.93769645690918, + "learning_rate": 9.429581121254438e-06, + "loss": 5.0664, + "step": 30270 + }, + { + "epoch": 0.6159464518229166, + "grad_norm": 17.482017517089844, + "learning_rate": 9.429395718604207e-06, + "loss": 5.0567, + "step": 30275 + }, + { + "epoch": 0.6160481770833334, + "grad_norm": 12.969635009765625, + "learning_rate": 9.429210287651542e-06, + "loss": 5.1112, + "step": 30280 + }, + { + "epoch": 0.61614990234375, + "grad_norm": 20.76346206665039, + "learning_rate": 9.429024828397631e-06, + "loss": 4.9682, + "step": 30285 + }, + { + "epoch": 0.6162516276041666, + "grad_norm": 19.478715896606445, + "learning_rate": 9.428839340843654e-06, + "loss": 5.0892, + "step": 30290 + }, + { + "epoch": 0.6163533528645834, + "grad_norm": 16.787195205688477, + "learning_rate": 9.428653824990803e-06, + "loss": 5.3059, + "step": 30295 + }, + { + "epoch": 0.616455078125, + "grad_norm": 13.372591018676758, + "learning_rate": 9.428468280840259e-06, + "loss": 5.1985, + "step": 30300 + }, + { + "epoch": 0.6165568033854166, + "grad_norm": 17.94586753845215, + "learning_rate": 9.428282708393208e-06, + "loss": 5.0781, + "step": 30305 + }, + { + "epoch": 0.6166585286458334, + "grad_norm": 17.00450325012207, + "learning_rate": 9.428097107650836e-06, + "loss": 5.4053, + "step": 30310 + }, + { + "epoch": 0.61676025390625, + "grad_norm": 20.809017181396484, + "learning_rate": 9.427911478614329e-06, + "loss": 5.1964, + "step": 30315 + }, + { + "epoch": 0.6168619791666666, + "grad_norm": 17.610136032104492, + "learning_rate": 9.427725821284875e-06, + "loss": 5.4131, + "step": 30320 + }, + { + "epoch": 0.6169637044270834, + "grad_norm": 23.69182777404785, + "learning_rate": 9.427540135663658e-06, + "loss": 4.9695, + "step": 30325 + }, + { + "epoch": 0.6170654296875, + "grad_norm": 15.08543586730957, + "learning_rate": 9.427354421751863e-06, + "loss": 5.0081, + "step": 30330 + }, + { + "epoch": 0.6171671549479166, + "grad_norm": 17.244735717773438, + "learning_rate": 9.42716867955068e-06, + "loss": 5.1331, + "step": 30335 + }, + { + "epoch": 0.6172688802083334, + "grad_norm": 16.3637752532959, + "learning_rate": 9.426982909061296e-06, + "loss": 5.2073, + "step": 30340 + }, + { + "epoch": 0.61737060546875, + "grad_norm": 15.682032585144043, + "learning_rate": 9.426797110284894e-06, + "loss": 5.4192, + "step": 30345 + }, + { + "epoch": 0.6174723307291666, + "grad_norm": 18.949501037597656, + "learning_rate": 9.426611283222665e-06, + "loss": 4.9351, + "step": 30350 + }, + { + "epoch": 0.6175740559895834, + "grad_norm": 16.301795959472656, + "learning_rate": 9.426425427875793e-06, + "loss": 5.1846, + "step": 30355 + }, + { + "epoch": 0.61767578125, + "grad_norm": 18.89421844482422, + "learning_rate": 9.426239544245472e-06, + "loss": 5.0157, + "step": 30360 + }, + { + "epoch": 0.6177775065104166, + "grad_norm": 20.573089599609375, + "learning_rate": 9.426053632332883e-06, + "loss": 5.1097, + "step": 30365 + }, + { + "epoch": 0.6178792317708334, + "grad_norm": 16.128103256225586, + "learning_rate": 9.425867692139214e-06, + "loss": 5.025, + "step": 30370 + }, + { + "epoch": 0.61798095703125, + "grad_norm": 16.5623722076416, + "learning_rate": 9.425681723665658e-06, + "loss": 5.1819, + "step": 30375 + }, + { + "epoch": 0.6180826822916666, + "grad_norm": 18.042436599731445, + "learning_rate": 9.4254957269134e-06, + "loss": 5.1769, + "step": 30380 + }, + { + "epoch": 0.6181844075520834, + "grad_norm": 15.289289474487305, + "learning_rate": 9.42530970188363e-06, + "loss": 5.1279, + "step": 30385 + }, + { + "epoch": 0.6182861328125, + "grad_norm": 20.23379898071289, + "learning_rate": 9.425123648577536e-06, + "loss": 5.1819, + "step": 30390 + }, + { + "epoch": 0.6183878580729166, + "grad_norm": 16.294719696044922, + "learning_rate": 9.424937566996306e-06, + "loss": 5.0652, + "step": 30395 + }, + { + "epoch": 0.6184895833333334, + "grad_norm": 17.08517074584961, + "learning_rate": 9.42475145714113e-06, + "loss": 5.0364, + "step": 30400 + }, + { + "epoch": 0.61859130859375, + "grad_norm": 21.042892456054688, + "learning_rate": 9.424565319013196e-06, + "loss": 5.1889, + "step": 30405 + }, + { + "epoch": 0.6186930338541666, + "grad_norm": 24.73333168029785, + "learning_rate": 9.424379152613694e-06, + "loss": 5.036, + "step": 30410 + }, + { + "epoch": 0.6187947591145834, + "grad_norm": 15.383149147033691, + "learning_rate": 9.424192957943814e-06, + "loss": 5.3778, + "step": 30415 + }, + { + "epoch": 0.618896484375, + "grad_norm": 18.38212776184082, + "learning_rate": 9.424006735004745e-06, + "loss": 5.1283, + "step": 30420 + }, + { + "epoch": 0.6189982096354166, + "grad_norm": 15.523595809936523, + "learning_rate": 9.423820483797678e-06, + "loss": 5.2513, + "step": 30425 + }, + { + "epoch": 0.6190999348958334, + "grad_norm": 15.890924453735352, + "learning_rate": 9.423634204323802e-06, + "loss": 5.0109, + "step": 30430 + }, + { + "epoch": 0.61920166015625, + "grad_norm": 16.43634033203125, + "learning_rate": 9.423447896584309e-06, + "loss": 5.3216, + "step": 30435 + }, + { + "epoch": 0.6193033854166666, + "grad_norm": 15.829768180847168, + "learning_rate": 9.423261560580387e-06, + "loss": 5.0471, + "step": 30440 + }, + { + "epoch": 0.6194051106770834, + "grad_norm": 14.70785140991211, + "learning_rate": 9.423075196313228e-06, + "loss": 5.1344, + "step": 30445 + }, + { + "epoch": 0.6195068359375, + "grad_norm": 20.078018188476562, + "learning_rate": 9.422888803784022e-06, + "loss": 5.2394, + "step": 30450 + }, + { + "epoch": 0.6196085611979166, + "grad_norm": 14.914834976196289, + "learning_rate": 9.422702382993961e-06, + "loss": 4.9352, + "step": 30455 + }, + { + "epoch": 0.6197102864583334, + "grad_norm": 17.443466186523438, + "learning_rate": 9.422515933944236e-06, + "loss": 5.0179, + "step": 30460 + }, + { + "epoch": 0.61981201171875, + "grad_norm": 17.81487464904785, + "learning_rate": 9.422329456636038e-06, + "loss": 4.9984, + "step": 30465 + }, + { + "epoch": 0.6199137369791666, + "grad_norm": 13.743915557861328, + "learning_rate": 9.422142951070559e-06, + "loss": 5.1329, + "step": 30470 + }, + { + "epoch": 0.6200154622395834, + "grad_norm": 20.28707504272461, + "learning_rate": 9.421956417248989e-06, + "loss": 5.2074, + "step": 30475 + }, + { + "epoch": 0.6201171875, + "grad_norm": 13.97916030883789, + "learning_rate": 9.421769855172523e-06, + "loss": 5.3118, + "step": 30480 + }, + { + "epoch": 0.6202189127604166, + "grad_norm": 15.628433227539062, + "learning_rate": 9.421583264842349e-06, + "loss": 4.9009, + "step": 30485 + }, + { + "epoch": 0.6203206380208334, + "grad_norm": 17.56291961669922, + "learning_rate": 9.421396646259662e-06, + "loss": 5.1415, + "step": 30490 + }, + { + "epoch": 0.62042236328125, + "grad_norm": 16.891164779663086, + "learning_rate": 9.421209999425655e-06, + "loss": 4.8421, + "step": 30495 + }, + { + "epoch": 0.6205240885416666, + "grad_norm": 17.887493133544922, + "learning_rate": 9.42102332434152e-06, + "loss": 5.2648, + "step": 30500 + }, + { + "epoch": 0.6206258138020834, + "grad_norm": 15.765022277832031, + "learning_rate": 9.42083662100845e-06, + "loss": 5.2604, + "step": 30505 + }, + { + "epoch": 0.6207275390625, + "grad_norm": 17.148576736450195, + "learning_rate": 9.420649889427636e-06, + "loss": 5.1837, + "step": 30510 + }, + { + "epoch": 0.6208292643229166, + "grad_norm": 16.97907829284668, + "learning_rate": 9.42046312960027e-06, + "loss": 5.0385, + "step": 30515 + }, + { + "epoch": 0.6209309895833334, + "grad_norm": 12.027236938476562, + "learning_rate": 9.420276341527551e-06, + "loss": 4.888, + "step": 30520 + }, + { + "epoch": 0.62103271484375, + "grad_norm": 17.837291717529297, + "learning_rate": 9.42008952521067e-06, + "loss": 5.226, + "step": 30525 + }, + { + "epoch": 0.6211344401041666, + "grad_norm": 19.160476684570312, + "learning_rate": 9.41990268065082e-06, + "loss": 5.0705, + "step": 30530 + }, + { + "epoch": 0.6212361653645834, + "grad_norm": 17.23418426513672, + "learning_rate": 9.419715807849193e-06, + "loss": 5.1412, + "step": 30535 + }, + { + "epoch": 0.621337890625, + "grad_norm": 14.976672172546387, + "learning_rate": 9.419528906806985e-06, + "loss": 4.9411, + "step": 30540 + }, + { + "epoch": 0.6214396158854166, + "grad_norm": 21.463348388671875, + "learning_rate": 9.419341977525392e-06, + "loss": 5.0513, + "step": 30545 + }, + { + "epoch": 0.6215413411458334, + "grad_norm": 15.930542945861816, + "learning_rate": 9.419155020005605e-06, + "loss": 4.9078, + "step": 30550 + }, + { + "epoch": 0.62164306640625, + "grad_norm": 17.6920223236084, + "learning_rate": 9.418968034248822e-06, + "loss": 5.3563, + "step": 30555 + }, + { + "epoch": 0.6217447916666666, + "grad_norm": 20.171184539794922, + "learning_rate": 9.418781020256234e-06, + "loss": 4.9898, + "step": 30560 + }, + { + "epoch": 0.6218465169270834, + "grad_norm": 18.797170639038086, + "learning_rate": 9.418593978029038e-06, + "loss": 5.1208, + "step": 30565 + }, + { + "epoch": 0.6219482421875, + "grad_norm": 25.775917053222656, + "learning_rate": 9.418406907568431e-06, + "loss": 5.4624, + "step": 30570 + }, + { + "epoch": 0.6220499674479166, + "grad_norm": 14.768402099609375, + "learning_rate": 9.418219808875605e-06, + "loss": 4.8315, + "step": 30575 + }, + { + "epoch": 0.6221516927083334, + "grad_norm": 22.575767517089844, + "learning_rate": 9.418032681951758e-06, + "loss": 5.0719, + "step": 30580 + }, + { + "epoch": 0.62225341796875, + "grad_norm": 17.816144943237305, + "learning_rate": 9.417845526798084e-06, + "loss": 5.1491, + "step": 30585 + }, + { + "epoch": 0.6223551432291666, + "grad_norm": 14.262429237365723, + "learning_rate": 9.41765834341578e-06, + "loss": 5.0537, + "step": 30590 + }, + { + "epoch": 0.6224568684895834, + "grad_norm": 13.23470401763916, + "learning_rate": 9.41747113180604e-06, + "loss": 5.1029, + "step": 30595 + }, + { + "epoch": 0.62255859375, + "grad_norm": 15.323738098144531, + "learning_rate": 9.417283891970063e-06, + "loss": 5.4409, + "step": 30600 + }, + { + "epoch": 0.6226603190104166, + "grad_norm": 16.77809715270996, + "learning_rate": 9.417096623909045e-06, + "loss": 5.1237, + "step": 30605 + }, + { + "epoch": 0.6227620442708334, + "grad_norm": 17.883195877075195, + "learning_rate": 9.41690932762418e-06, + "loss": 4.9922, + "step": 30610 + }, + { + "epoch": 0.62286376953125, + "grad_norm": 14.441740036010742, + "learning_rate": 9.416722003116667e-06, + "loss": 5.2222, + "step": 30615 + }, + { + "epoch": 0.6229654947916666, + "grad_norm": 20.512426376342773, + "learning_rate": 9.416534650387703e-06, + "loss": 5.1926, + "step": 30620 + }, + { + "epoch": 0.6230672200520834, + "grad_norm": 12.562875747680664, + "learning_rate": 9.416347269438484e-06, + "loss": 5.0817, + "step": 30625 + }, + { + "epoch": 0.6231689453125, + "grad_norm": 16.19154167175293, + "learning_rate": 9.416159860270208e-06, + "loss": 4.8743, + "step": 30630 + }, + { + "epoch": 0.6232706705729166, + "grad_norm": 17.713111877441406, + "learning_rate": 9.415972422884073e-06, + "loss": 5.1023, + "step": 30635 + }, + { + "epoch": 0.6233723958333334, + "grad_norm": 16.127222061157227, + "learning_rate": 9.415784957281275e-06, + "loss": 4.9004, + "step": 30640 + }, + { + "epoch": 0.62347412109375, + "grad_norm": 16.0753231048584, + "learning_rate": 9.415597463463014e-06, + "loss": 5.6429, + "step": 30645 + }, + { + "epoch": 0.6235758463541666, + "grad_norm": 14.375929832458496, + "learning_rate": 9.415409941430486e-06, + "loss": 5.079, + "step": 30650 + }, + { + "epoch": 0.6236775716145834, + "grad_norm": 17.4058837890625, + "learning_rate": 9.415222391184892e-06, + "loss": 5.1111, + "step": 30655 + }, + { + "epoch": 0.623779296875, + "grad_norm": 15.394682884216309, + "learning_rate": 9.415034812727425e-06, + "loss": 5.1154, + "step": 30660 + }, + { + "epoch": 0.6238810221354166, + "grad_norm": 18.11135482788086, + "learning_rate": 9.414847206059291e-06, + "loss": 4.9886, + "step": 30665 + }, + { + "epoch": 0.6239827473958334, + "grad_norm": 13.267203330993652, + "learning_rate": 9.414659571181683e-06, + "loss": 5.1969, + "step": 30670 + }, + { + "epoch": 0.62408447265625, + "grad_norm": 14.284646034240723, + "learning_rate": 9.414471908095802e-06, + "loss": 5.0889, + "step": 30675 + }, + { + "epoch": 0.6241861979166666, + "grad_norm": 14.871561050415039, + "learning_rate": 9.414284216802845e-06, + "loss": 5.0607, + "step": 30680 + }, + { + "epoch": 0.6242879231770834, + "grad_norm": 17.473939895629883, + "learning_rate": 9.414096497304015e-06, + "loss": 4.885, + "step": 30685 + }, + { + "epoch": 0.6243896484375, + "grad_norm": 20.248899459838867, + "learning_rate": 9.413908749600509e-06, + "loss": 5.0022, + "step": 30690 + }, + { + "epoch": 0.6244913736979166, + "grad_norm": 17.476316452026367, + "learning_rate": 9.413720973693527e-06, + "loss": 5.2563, + "step": 30695 + }, + { + "epoch": 0.6245930989583334, + "grad_norm": 16.320554733276367, + "learning_rate": 9.41353316958427e-06, + "loss": 5.0226, + "step": 30700 + }, + { + "epoch": 0.62469482421875, + "grad_norm": 20.125869750976562, + "learning_rate": 9.413345337273939e-06, + "loss": 5.0615, + "step": 30705 + }, + { + "epoch": 0.6247965494791666, + "grad_norm": 32.979515075683594, + "learning_rate": 9.413157476763728e-06, + "loss": 5.4932, + "step": 30710 + }, + { + "epoch": 0.6248982747395834, + "grad_norm": 14.637618064880371, + "learning_rate": 9.412969588054843e-06, + "loss": 4.8565, + "step": 30715 + }, + { + "epoch": 0.625, + "grad_norm": 14.248215675354004, + "learning_rate": 9.412781671148486e-06, + "loss": 5.1445, + "step": 30720 + }, + { + "epoch": 0.6251017252604166, + "grad_norm": 17.346588134765625, + "learning_rate": 9.41259372604585e-06, + "loss": 5.2406, + "step": 30725 + }, + { + "epoch": 0.6252034505208334, + "grad_norm": 24.74590301513672, + "learning_rate": 9.412405752748146e-06, + "loss": 4.962, + "step": 30730 + }, + { + "epoch": 0.62530517578125, + "grad_norm": 14.924407005310059, + "learning_rate": 9.412217751256568e-06, + "loss": 4.892, + "step": 30735 + }, + { + "epoch": 0.6254069010416666, + "grad_norm": 16.628128051757812, + "learning_rate": 9.412029721572318e-06, + "loss": 5.0152, + "step": 30740 + }, + { + "epoch": 0.6255086263020834, + "grad_norm": 15.60901165008545, + "learning_rate": 9.411841663696598e-06, + "loss": 5.1534, + "step": 30745 + }, + { + "epoch": 0.6256103515625, + "grad_norm": 16.136281967163086, + "learning_rate": 9.411653577630613e-06, + "loss": 5.1677, + "step": 30750 + }, + { + "epoch": 0.6257120768229166, + "grad_norm": 16.659765243530273, + "learning_rate": 9.41146546337556e-06, + "loss": 5.0322, + "step": 30755 + }, + { + "epoch": 0.6258138020833334, + "grad_norm": 13.578655242919922, + "learning_rate": 9.411277320932643e-06, + "loss": 5.0823, + "step": 30760 + }, + { + "epoch": 0.62591552734375, + "grad_norm": 19.15943145751953, + "learning_rate": 9.411089150303066e-06, + "loss": 5.1284, + "step": 30765 + }, + { + "epoch": 0.6260172526041666, + "grad_norm": 21.949020385742188, + "learning_rate": 9.410900951488029e-06, + "loss": 4.9386, + "step": 30770 + }, + { + "epoch": 0.6261189778645834, + "grad_norm": 15.91179084777832, + "learning_rate": 9.410712724488734e-06, + "loss": 5.0987, + "step": 30775 + }, + { + "epoch": 0.626220703125, + "grad_norm": 18.950693130493164, + "learning_rate": 9.410524469306385e-06, + "loss": 5.1752, + "step": 30780 + }, + { + "epoch": 0.6263224283854166, + "grad_norm": 19.698713302612305, + "learning_rate": 9.410336185942186e-06, + "loss": 5.1602, + "step": 30785 + }, + { + "epoch": 0.6264241536458334, + "grad_norm": 18.33745574951172, + "learning_rate": 9.410147874397338e-06, + "loss": 4.9959, + "step": 30790 + }, + { + "epoch": 0.62652587890625, + "grad_norm": 16.67675018310547, + "learning_rate": 9.409959534673045e-06, + "loss": 5.1132, + "step": 30795 + }, + { + "epoch": 0.6266276041666666, + "grad_norm": 24.80389976501465, + "learning_rate": 9.409771166770511e-06, + "loss": 5.161, + "step": 30800 + }, + { + "epoch": 0.6267293294270834, + "grad_norm": 16.87621307373047, + "learning_rate": 9.40958277069094e-06, + "loss": 5.0594, + "step": 30805 + }, + { + "epoch": 0.6268310546875, + "grad_norm": 21.20949935913086, + "learning_rate": 9.409394346435532e-06, + "loss": 5.2844, + "step": 30810 + }, + { + "epoch": 0.6269327799479166, + "grad_norm": 15.497191429138184, + "learning_rate": 9.409205894005495e-06, + "loss": 5.2338, + "step": 30815 + }, + { + "epoch": 0.6270345052083334, + "grad_norm": 15.846179962158203, + "learning_rate": 9.409017413402034e-06, + "loss": 5.2387, + "step": 30820 + }, + { + "epoch": 0.62713623046875, + "grad_norm": 16.801908493041992, + "learning_rate": 9.40882890462635e-06, + "loss": 5.0189, + "step": 30825 + }, + { + "epoch": 0.6272379557291666, + "grad_norm": 20.139846801757812, + "learning_rate": 9.408640367679648e-06, + "loss": 4.9079, + "step": 30830 + }, + { + "epoch": 0.6273396809895834, + "grad_norm": 17.467193603515625, + "learning_rate": 9.408451802563134e-06, + "loss": 5.1274, + "step": 30835 + }, + { + "epoch": 0.62744140625, + "grad_norm": 14.398027420043945, + "learning_rate": 9.408263209278012e-06, + "loss": 5.0225, + "step": 30840 + }, + { + "epoch": 0.6275431315104166, + "grad_norm": 18.375289916992188, + "learning_rate": 9.408074587825488e-06, + "loss": 5.2139, + "step": 30845 + }, + { + "epoch": 0.6276448567708334, + "grad_norm": 21.521087646484375, + "learning_rate": 9.407885938206768e-06, + "loss": 5.1601, + "step": 30850 + }, + { + "epoch": 0.62774658203125, + "grad_norm": 17.493661880493164, + "learning_rate": 9.407697260423054e-06, + "loss": 5.1092, + "step": 30855 + }, + { + "epoch": 0.6278483072916666, + "grad_norm": 14.593825340270996, + "learning_rate": 9.407508554475554e-06, + "loss": 5.1079, + "step": 30860 + }, + { + "epoch": 0.6279500325520834, + "grad_norm": 17.208206176757812, + "learning_rate": 9.407319820365474e-06, + "loss": 5.0199, + "step": 30865 + }, + { + "epoch": 0.6280517578125, + "grad_norm": 21.032596588134766, + "learning_rate": 9.40713105809402e-06, + "loss": 5.1742, + "step": 30870 + }, + { + "epoch": 0.6281534830729166, + "grad_norm": 15.523119926452637, + "learning_rate": 9.406942267662397e-06, + "loss": 5.1277, + "step": 30875 + }, + { + "epoch": 0.6282552083333334, + "grad_norm": 16.91335105895996, + "learning_rate": 9.406753449071813e-06, + "loss": 5.1243, + "step": 30880 + }, + { + "epoch": 0.62835693359375, + "grad_norm": 20.111900329589844, + "learning_rate": 9.406564602323471e-06, + "loss": 5.0993, + "step": 30885 + }, + { + "epoch": 0.6284586588541666, + "grad_norm": 12.397068977355957, + "learning_rate": 9.406375727418582e-06, + "loss": 5.0284, + "step": 30890 + }, + { + "epoch": 0.6285603841145834, + "grad_norm": 13.354086875915527, + "learning_rate": 9.40618682435835e-06, + "loss": 5.2079, + "step": 30895 + }, + { + "epoch": 0.628662109375, + "grad_norm": 23.704015731811523, + "learning_rate": 9.405997893143984e-06, + "loss": 5.0178, + "step": 30900 + }, + { + "epoch": 0.6287638346354166, + "grad_norm": 18.298694610595703, + "learning_rate": 9.405808933776688e-06, + "loss": 5.275, + "step": 30905 + }, + { + "epoch": 0.6288655598958334, + "grad_norm": 16.65791893005371, + "learning_rate": 9.405619946257674e-06, + "loss": 5.3297, + "step": 30910 + }, + { + "epoch": 0.62896728515625, + "grad_norm": 20.331642150878906, + "learning_rate": 9.405430930588145e-06, + "loss": 5.2586, + "step": 30915 + }, + { + "epoch": 0.6290690104166666, + "grad_norm": 17.63691520690918, + "learning_rate": 9.405241886769312e-06, + "loss": 4.8368, + "step": 30920 + }, + { + "epoch": 0.6291707356770834, + "grad_norm": 19.05185890197754, + "learning_rate": 9.405052814802381e-06, + "loss": 5.31, + "step": 30925 + }, + { + "epoch": 0.6292724609375, + "grad_norm": 17.860158920288086, + "learning_rate": 9.404863714688561e-06, + "loss": 4.8611, + "step": 30930 + }, + { + "epoch": 0.6293741861979166, + "grad_norm": 15.3659029006958, + "learning_rate": 9.40467458642906e-06, + "loss": 5.0135, + "step": 30935 + }, + { + "epoch": 0.6294759114583334, + "grad_norm": 15.541513442993164, + "learning_rate": 9.404485430025087e-06, + "loss": 5.2223, + "step": 30940 + }, + { + "epoch": 0.62957763671875, + "grad_norm": 13.050470352172852, + "learning_rate": 9.40429624547785e-06, + "loss": 5.3681, + "step": 30945 + }, + { + "epoch": 0.6296793619791666, + "grad_norm": 14.561509132385254, + "learning_rate": 9.404107032788558e-06, + "loss": 4.9789, + "step": 30950 + }, + { + "epoch": 0.6297810872395834, + "grad_norm": 21.911033630371094, + "learning_rate": 9.403917791958421e-06, + "loss": 5.3062, + "step": 30955 + }, + { + "epoch": 0.6298828125, + "grad_norm": 16.76962661743164, + "learning_rate": 9.403728522988646e-06, + "loss": 4.9704, + "step": 30960 + }, + { + "epoch": 0.6299845377604166, + "grad_norm": 11.424452781677246, + "learning_rate": 9.403539225880445e-06, + "loss": 4.9967, + "step": 30965 + }, + { + "epoch": 0.6300862630208334, + "grad_norm": 18.067981719970703, + "learning_rate": 9.403349900635026e-06, + "loss": 4.8542, + "step": 30970 + }, + { + "epoch": 0.63018798828125, + "grad_norm": 19.683746337890625, + "learning_rate": 9.403160547253598e-06, + "loss": 4.9546, + "step": 30975 + }, + { + "epoch": 0.6302897135416666, + "grad_norm": 17.130245208740234, + "learning_rate": 9.402971165737373e-06, + "loss": 5.0928, + "step": 30980 + }, + { + "epoch": 0.6303914388020834, + "grad_norm": 18.437376022338867, + "learning_rate": 9.40278175608756e-06, + "loss": 4.8411, + "step": 30985 + }, + { + "epoch": 0.6304931640625, + "grad_norm": 23.054977416992188, + "learning_rate": 9.402592318305368e-06, + "loss": 5.2732, + "step": 30990 + }, + { + "epoch": 0.6305948893229166, + "grad_norm": 21.922698974609375, + "learning_rate": 9.40240285239201e-06, + "loss": 5.0925, + "step": 30995 + }, + { + "epoch": 0.6306966145833334, + "grad_norm": 17.990793228149414, + "learning_rate": 9.402213358348695e-06, + "loss": 5.0821, + "step": 31000 + }, + { + "epoch": 0.63079833984375, + "grad_norm": 22.96088409423828, + "learning_rate": 9.402023836176633e-06, + "loss": 5.1059, + "step": 31005 + }, + { + "epoch": 0.6309000651041666, + "grad_norm": 16.69580841064453, + "learning_rate": 9.401834285877037e-06, + "loss": 5.0626, + "step": 31010 + }, + { + "epoch": 0.6310017903645834, + "grad_norm": 16.902708053588867, + "learning_rate": 9.401644707451117e-06, + "loss": 4.861, + "step": 31015 + }, + { + "epoch": 0.631103515625, + "grad_norm": 18.12879180908203, + "learning_rate": 9.401455100900082e-06, + "loss": 5.1151, + "step": 31020 + }, + { + "epoch": 0.6312052408854166, + "grad_norm": 12.851465225219727, + "learning_rate": 9.40126546622515e-06, + "loss": 5.1862, + "step": 31025 + }, + { + "epoch": 0.6313069661458334, + "grad_norm": 22.46917724609375, + "learning_rate": 9.401075803427529e-06, + "loss": 4.9483, + "step": 31030 + }, + { + "epoch": 0.63140869140625, + "grad_norm": 16.903108596801758, + "learning_rate": 9.400886112508429e-06, + "loss": 4.9969, + "step": 31035 + }, + { + "epoch": 0.6315104166666666, + "grad_norm": 20.783811569213867, + "learning_rate": 9.400696393469063e-06, + "loss": 4.9817, + "step": 31040 + }, + { + "epoch": 0.6316121419270834, + "grad_norm": 18.69041633605957, + "learning_rate": 9.400506646310646e-06, + "loss": 5.0918, + "step": 31045 + }, + { + "epoch": 0.6317138671875, + "grad_norm": 17.99905776977539, + "learning_rate": 9.400316871034386e-06, + "loss": 4.9577, + "step": 31050 + }, + { + "epoch": 0.6318155924479166, + "grad_norm": 14.852863311767578, + "learning_rate": 9.4001270676415e-06, + "loss": 5.2424, + "step": 31055 + }, + { + "epoch": 0.6319173177083334, + "grad_norm": 19.120702743530273, + "learning_rate": 9.399937236133198e-06, + "loss": 4.9161, + "step": 31060 + }, + { + "epoch": 0.63201904296875, + "grad_norm": 18.192211151123047, + "learning_rate": 9.399747376510695e-06, + "loss": 4.8386, + "step": 31065 + }, + { + "epoch": 0.6321207682291666, + "grad_norm": 16.05440902709961, + "learning_rate": 9.399557488775202e-06, + "loss": 5.2216, + "step": 31070 + }, + { + "epoch": 0.6322224934895834, + "grad_norm": 15.366754531860352, + "learning_rate": 9.399367572927932e-06, + "loss": 5.0031, + "step": 31075 + }, + { + "epoch": 0.63232421875, + "grad_norm": 20.77650260925293, + "learning_rate": 9.399177628970102e-06, + "loss": 5.1805, + "step": 31080 + }, + { + "epoch": 0.6324259440104166, + "grad_norm": 18.732608795166016, + "learning_rate": 9.398987656902921e-06, + "loss": 5.0456, + "step": 31085 + }, + { + "epoch": 0.6325276692708334, + "grad_norm": 16.370464324951172, + "learning_rate": 9.398797656727608e-06, + "loss": 5.1421, + "step": 31090 + }, + { + "epoch": 0.63262939453125, + "grad_norm": 21.423067092895508, + "learning_rate": 9.398607628445373e-06, + "loss": 5.0928, + "step": 31095 + }, + { + "epoch": 0.6327311197916666, + "grad_norm": 17.112276077270508, + "learning_rate": 9.39841757205743e-06, + "loss": 5.1074, + "step": 31100 + }, + { + "epoch": 0.6328328450520834, + "grad_norm": 19.320579528808594, + "learning_rate": 9.398227487564997e-06, + "loss": 4.9809, + "step": 31105 + }, + { + "epoch": 0.6329345703125, + "grad_norm": 21.438594818115234, + "learning_rate": 9.398037374969287e-06, + "loss": 5.1956, + "step": 31110 + }, + { + "epoch": 0.6330362955729166, + "grad_norm": 19.047225952148438, + "learning_rate": 9.397847234271514e-06, + "loss": 5.0376, + "step": 31115 + }, + { + "epoch": 0.6331380208333334, + "grad_norm": 20.18342399597168, + "learning_rate": 9.397657065472891e-06, + "loss": 5.1411, + "step": 31120 + }, + { + "epoch": 0.63323974609375, + "grad_norm": 21.126062393188477, + "learning_rate": 9.397466868574638e-06, + "loss": 5.2377, + "step": 31125 + }, + { + "epoch": 0.6333414713541666, + "grad_norm": 13.747904777526855, + "learning_rate": 9.397276643577966e-06, + "loss": 5.2497, + "step": 31130 + }, + { + "epoch": 0.6334431966145834, + "grad_norm": 13.643434524536133, + "learning_rate": 9.397086390484093e-06, + "loss": 5.4544, + "step": 31135 + }, + { + "epoch": 0.633544921875, + "grad_norm": 19.507078170776367, + "learning_rate": 9.396896109294234e-06, + "loss": 4.9911, + "step": 31140 + }, + { + "epoch": 0.6336466471354166, + "grad_norm": 16.865497589111328, + "learning_rate": 9.396705800009605e-06, + "loss": 5.0464, + "step": 31145 + }, + { + "epoch": 0.6337483723958334, + "grad_norm": 20.194808959960938, + "learning_rate": 9.39651546263142e-06, + "loss": 4.966, + "step": 31150 + }, + { + "epoch": 0.63385009765625, + "grad_norm": 17.361568450927734, + "learning_rate": 9.396325097160898e-06, + "loss": 5.0697, + "step": 31155 + }, + { + "epoch": 0.6339518229166666, + "grad_norm": 17.44618034362793, + "learning_rate": 9.396134703599254e-06, + "loss": 5.33, + "step": 31160 + }, + { + "epoch": 0.6340535481770834, + "grad_norm": 19.344261169433594, + "learning_rate": 9.395944281947705e-06, + "loss": 5.0618, + "step": 31165 + }, + { + "epoch": 0.6341552734375, + "grad_norm": 18.87541961669922, + "learning_rate": 9.395753832207466e-06, + "loss": 5.249, + "step": 31170 + }, + { + "epoch": 0.6342569986979166, + "grad_norm": 15.053781509399414, + "learning_rate": 9.395563354379758e-06, + "loss": 4.8411, + "step": 31175 + }, + { + "epoch": 0.6343587239583334, + "grad_norm": 25.826507568359375, + "learning_rate": 9.395372848465793e-06, + "loss": 5.1911, + "step": 31180 + }, + { + "epoch": 0.63446044921875, + "grad_norm": 17.023841857910156, + "learning_rate": 9.395182314466793e-06, + "loss": 4.9772, + "step": 31185 + }, + { + "epoch": 0.6345621744791666, + "grad_norm": 13.174439430236816, + "learning_rate": 9.39499175238397e-06, + "loss": 5.0715, + "step": 31190 + }, + { + "epoch": 0.6346638997395834, + "grad_norm": 23.745691299438477, + "learning_rate": 9.394801162218547e-06, + "loss": 5.31, + "step": 31195 + }, + { + "epoch": 0.634765625, + "grad_norm": 31.717187881469727, + "learning_rate": 9.39461054397174e-06, + "loss": 4.9106, + "step": 31200 + }, + { + "epoch": 0.6348673502604166, + "grad_norm": 15.71392822265625, + "learning_rate": 9.394419897644766e-06, + "loss": 5.2967, + "step": 31205 + }, + { + "epoch": 0.6349690755208334, + "grad_norm": 19.94091033935547, + "learning_rate": 9.394229223238845e-06, + "loss": 5.2786, + "step": 31210 + }, + { + "epoch": 0.63507080078125, + "grad_norm": 21.66228675842285, + "learning_rate": 9.394038520755193e-06, + "loss": 4.9227, + "step": 31215 + }, + { + "epoch": 0.6351725260416666, + "grad_norm": 14.031932830810547, + "learning_rate": 9.393847790195029e-06, + "loss": 5.2485, + "step": 31220 + }, + { + "epoch": 0.6352742513020834, + "grad_norm": 19.061927795410156, + "learning_rate": 9.393657031559575e-06, + "loss": 5.0873, + "step": 31225 + }, + { + "epoch": 0.6353759765625, + "grad_norm": 16.187849044799805, + "learning_rate": 9.393466244850047e-06, + "loss": 5.1114, + "step": 31230 + }, + { + "epoch": 0.6354777018229166, + "grad_norm": 16.20943832397461, + "learning_rate": 9.39327543006766e-06, + "loss": 5.0481, + "step": 31235 + }, + { + "epoch": 0.6355794270833334, + "grad_norm": 20.068490982055664, + "learning_rate": 9.393084587213642e-06, + "loss": 5.0074, + "step": 31240 + }, + { + "epoch": 0.63568115234375, + "grad_norm": 18.264177322387695, + "learning_rate": 9.392893716289206e-06, + "loss": 4.9745, + "step": 31245 + }, + { + "epoch": 0.6357828776041666, + "grad_norm": 15.391866683959961, + "learning_rate": 9.392702817295577e-06, + "loss": 5.1376, + "step": 31250 + }, + { + "epoch": 0.6358846028645834, + "grad_norm": 14.799602508544922, + "learning_rate": 9.392511890233967e-06, + "loss": 5.2135, + "step": 31255 + }, + { + "epoch": 0.635986328125, + "grad_norm": 21.90947151184082, + "learning_rate": 9.392320935105601e-06, + "loss": 5.1697, + "step": 31260 + }, + { + "epoch": 0.6360880533854166, + "grad_norm": 12.961623191833496, + "learning_rate": 9.392129951911702e-06, + "loss": 5.0002, + "step": 31265 + }, + { + "epoch": 0.6361897786458334, + "grad_norm": 18.122636795043945, + "learning_rate": 9.391938940653484e-06, + "loss": 4.8621, + "step": 31270 + }, + { + "epoch": 0.63629150390625, + "grad_norm": 15.235447883605957, + "learning_rate": 9.391747901332171e-06, + "loss": 5.1104, + "step": 31275 + }, + { + "epoch": 0.6363932291666666, + "grad_norm": 14.285395622253418, + "learning_rate": 9.391556833948983e-06, + "loss": 5.217, + "step": 31280 + }, + { + "epoch": 0.6364949544270834, + "grad_norm": 16.003660202026367, + "learning_rate": 9.39136573850514e-06, + "loss": 4.9345, + "step": 31285 + }, + { + "epoch": 0.6365966796875, + "grad_norm": 17.22077751159668, + "learning_rate": 9.391174615001866e-06, + "loss": 5.2021, + "step": 31290 + }, + { + "epoch": 0.6366984049479166, + "grad_norm": 16.12464714050293, + "learning_rate": 9.390983463440378e-06, + "loss": 5.2929, + "step": 31295 + }, + { + "epoch": 0.6368001302083334, + "grad_norm": 17.98756980895996, + "learning_rate": 9.3907922838219e-06, + "loss": 5.1759, + "step": 31300 + }, + { + "epoch": 0.63690185546875, + "grad_norm": 18.580846786499023, + "learning_rate": 9.390601076147654e-06, + "loss": 5.0522, + "step": 31305 + }, + { + "epoch": 0.6370035807291666, + "grad_norm": 16.11943817138672, + "learning_rate": 9.390409840418862e-06, + "loss": 5.1331, + "step": 31310 + }, + { + "epoch": 0.6371053059895834, + "grad_norm": 15.085432052612305, + "learning_rate": 9.390218576636743e-06, + "loss": 4.9652, + "step": 31315 + }, + { + "epoch": 0.63720703125, + "grad_norm": 21.015262603759766, + "learning_rate": 9.390027284802522e-06, + "loss": 5.0319, + "step": 31320 + }, + { + "epoch": 0.6373087565104166, + "grad_norm": 16.43564796447754, + "learning_rate": 9.389835964917418e-06, + "loss": 5.0336, + "step": 31325 + }, + { + "epoch": 0.6374104817708334, + "grad_norm": 13.067967414855957, + "learning_rate": 9.389644616982659e-06, + "loss": 5.2988, + "step": 31330 + }, + { + "epoch": 0.63751220703125, + "grad_norm": 16.637033462524414, + "learning_rate": 9.389453240999461e-06, + "loss": 4.9065, + "step": 31335 + }, + { + "epoch": 0.6376139322916666, + "grad_norm": 15.072813987731934, + "learning_rate": 9.389261836969052e-06, + "loss": 5.3726, + "step": 31340 + }, + { + "epoch": 0.6377156575520834, + "grad_norm": 17.055736541748047, + "learning_rate": 9.389070404892653e-06, + "loss": 5.0585, + "step": 31345 + }, + { + "epoch": 0.6378173828125, + "grad_norm": 20.9744815826416, + "learning_rate": 9.388878944771487e-06, + "loss": 5.1945, + "step": 31350 + }, + { + "epoch": 0.6379191080729166, + "grad_norm": 12.476515769958496, + "learning_rate": 9.38868745660678e-06, + "loss": 4.9024, + "step": 31355 + }, + { + "epoch": 0.6380208333333334, + "grad_norm": 22.09336280822754, + "learning_rate": 9.38849594039975e-06, + "loss": 5.0771, + "step": 31360 + }, + { + "epoch": 0.63812255859375, + "grad_norm": 16.95551872253418, + "learning_rate": 9.388304396151626e-06, + "loss": 4.8748, + "step": 31365 + }, + { + "epoch": 0.6382242838541666, + "grad_norm": 16.87974739074707, + "learning_rate": 9.388112823863628e-06, + "loss": 5.1635, + "step": 31370 + }, + { + "epoch": 0.6383260091145834, + "grad_norm": 14.498518943786621, + "learning_rate": 9.387921223536985e-06, + "loss": 5.3443, + "step": 31375 + }, + { + "epoch": 0.638427734375, + "grad_norm": 12.605443000793457, + "learning_rate": 9.387729595172916e-06, + "loss": 4.8579, + "step": 31380 + }, + { + "epoch": 0.6385294596354166, + "grad_norm": 16.72337532043457, + "learning_rate": 9.387537938772649e-06, + "loss": 5.0562, + "step": 31385 + }, + { + "epoch": 0.6386311848958334, + "grad_norm": 15.155854225158691, + "learning_rate": 9.387346254337406e-06, + "loss": 5.0762, + "step": 31390 + }, + { + "epoch": 0.63873291015625, + "grad_norm": 16.914939880371094, + "learning_rate": 9.387154541868414e-06, + "loss": 5.2265, + "step": 31395 + }, + { + "epoch": 0.6388346354166666, + "grad_norm": 15.893189430236816, + "learning_rate": 9.386962801366898e-06, + "loss": 5.0754, + "step": 31400 + }, + { + "epoch": 0.6389363606770834, + "grad_norm": 12.42927074432373, + "learning_rate": 9.38677103283408e-06, + "loss": 5.2199, + "step": 31405 + }, + { + "epoch": 0.6390380859375, + "grad_norm": 16.689416885375977, + "learning_rate": 9.38657923627119e-06, + "loss": 5.2665, + "step": 31410 + }, + { + "epoch": 0.6391398111979166, + "grad_norm": 15.05963134765625, + "learning_rate": 9.38638741167945e-06, + "loss": 4.9942, + "step": 31415 + }, + { + "epoch": 0.6392415364583334, + "grad_norm": 17.18674659729004, + "learning_rate": 9.386195559060087e-06, + "loss": 5.142, + "step": 31420 + }, + { + "epoch": 0.63934326171875, + "grad_norm": 19.669309616088867, + "learning_rate": 9.386003678414328e-06, + "loss": 5.5311, + "step": 31425 + }, + { + "epoch": 0.6394449869791666, + "grad_norm": 20.954729080200195, + "learning_rate": 9.385811769743396e-06, + "loss": 5.0266, + "step": 31430 + }, + { + "epoch": 0.6395467122395834, + "grad_norm": 15.877363204956055, + "learning_rate": 9.38561983304852e-06, + "loss": 5.4423, + "step": 31435 + }, + { + "epoch": 0.6396484375, + "grad_norm": 18.972740173339844, + "learning_rate": 9.385427868330925e-06, + "loss": 5.014, + "step": 31440 + }, + { + "epoch": 0.6397501627604166, + "grad_norm": 15.12962532043457, + "learning_rate": 9.385235875591839e-06, + "loss": 5.1157, + "step": 31445 + }, + { + "epoch": 0.6398518880208334, + "grad_norm": 19.010576248168945, + "learning_rate": 9.385043854832488e-06, + "loss": 5.1298, + "step": 31450 + }, + { + "epoch": 0.63995361328125, + "grad_norm": 15.218372344970703, + "learning_rate": 9.384851806054096e-06, + "loss": 5.3089, + "step": 31455 + }, + { + "epoch": 0.6400553385416666, + "grad_norm": 16.222637176513672, + "learning_rate": 9.384659729257896e-06, + "loss": 5.1646, + "step": 31460 + }, + { + "epoch": 0.6401570638020834, + "grad_norm": 15.135822296142578, + "learning_rate": 9.38446762444511e-06, + "loss": 4.9139, + "step": 31465 + }, + { + "epoch": 0.6402587890625, + "grad_norm": 16.360910415649414, + "learning_rate": 9.38427549161697e-06, + "loss": 5.2558, + "step": 31470 + }, + { + "epoch": 0.6403605143229166, + "grad_norm": 18.305156707763672, + "learning_rate": 9.3840833307747e-06, + "loss": 5.2042, + "step": 31475 + }, + { + "epoch": 0.6404622395833334, + "grad_norm": 14.632264137268066, + "learning_rate": 9.383891141919527e-06, + "loss": 5.216, + "step": 31480 + }, + { + "epoch": 0.64056396484375, + "grad_norm": 18.81085968017578, + "learning_rate": 9.383698925052684e-06, + "loss": 5.3743, + "step": 31485 + }, + { + "epoch": 0.6406656901041666, + "grad_norm": 20.131345748901367, + "learning_rate": 9.383506680175396e-06, + "loss": 4.8143, + "step": 31490 + }, + { + "epoch": 0.6407674153645834, + "grad_norm": 16.635995864868164, + "learning_rate": 9.383314407288893e-06, + "loss": 4.7899, + "step": 31495 + }, + { + "epoch": 0.640869140625, + "grad_norm": 14.258245468139648, + "learning_rate": 9.3831221063944e-06, + "loss": 5.3169, + "step": 31500 + }, + { + "epoch": 0.6409708658854166, + "grad_norm": 16.46474838256836, + "learning_rate": 9.382929777493149e-06, + "loss": 5.1148, + "step": 31505 + }, + { + "epoch": 0.6410725911458334, + "grad_norm": 12.633837699890137, + "learning_rate": 9.382737420586368e-06, + "loss": 5.2979, + "step": 31510 + }, + { + "epoch": 0.64117431640625, + "grad_norm": 25.3958797454834, + "learning_rate": 9.382545035675288e-06, + "loss": 5.1346, + "step": 31515 + }, + { + "epoch": 0.6412760416666666, + "grad_norm": 18.45032501220703, + "learning_rate": 9.382352622761134e-06, + "loss": 5.4424, + "step": 31520 + }, + { + "epoch": 0.6413777669270834, + "grad_norm": 17.034996032714844, + "learning_rate": 9.38216018184514e-06, + "loss": 5.0431, + "step": 31525 + }, + { + "epoch": 0.6414794921875, + "grad_norm": 19.245471954345703, + "learning_rate": 9.38196771292853e-06, + "loss": 5.2204, + "step": 31530 + }, + { + "epoch": 0.6415812174479166, + "grad_norm": 20.478179931640625, + "learning_rate": 9.381775216012542e-06, + "loss": 5.1207, + "step": 31535 + }, + { + "epoch": 0.6416829427083334, + "grad_norm": 13.156067848205566, + "learning_rate": 9.381582691098397e-06, + "loss": 4.9599, + "step": 31540 + }, + { + "epoch": 0.64178466796875, + "grad_norm": 25.382261276245117, + "learning_rate": 9.381390138187332e-06, + "loss": 5.1229, + "step": 31545 + }, + { + "epoch": 0.6418863932291666, + "grad_norm": 22.11922264099121, + "learning_rate": 9.381197557280575e-06, + "loss": 5.0283, + "step": 31550 + }, + { + "epoch": 0.6419881184895834, + "grad_norm": 14.814275741577148, + "learning_rate": 9.381004948379355e-06, + "loss": 4.9906, + "step": 31555 + }, + { + "epoch": 0.64208984375, + "grad_norm": 13.51635456085205, + "learning_rate": 9.380812311484906e-06, + "loss": 5.2172, + "step": 31560 + }, + { + "epoch": 0.6421915690104166, + "grad_norm": 17.284528732299805, + "learning_rate": 9.380619646598455e-06, + "loss": 5.2207, + "step": 31565 + }, + { + "epoch": 0.6422932942708334, + "grad_norm": 16.683204650878906, + "learning_rate": 9.380426953721235e-06, + "loss": 5.2755, + "step": 31570 + }, + { + "epoch": 0.64239501953125, + "grad_norm": 19.721046447753906, + "learning_rate": 9.380234232854478e-06, + "loss": 4.7384, + "step": 31575 + }, + { + "epoch": 0.6424967447916666, + "grad_norm": 18.318424224853516, + "learning_rate": 9.380041483999416e-06, + "loss": 5.2314, + "step": 31580 + }, + { + "epoch": 0.6425984700520834, + "grad_norm": 19.588703155517578, + "learning_rate": 9.379848707157278e-06, + "loss": 4.9339, + "step": 31585 + }, + { + "epoch": 0.6427001953125, + "grad_norm": 26.850650787353516, + "learning_rate": 9.379655902329297e-06, + "loss": 4.9958, + "step": 31590 + }, + { + "epoch": 0.6428019205729166, + "grad_norm": 22.268943786621094, + "learning_rate": 9.379463069516705e-06, + "loss": 5.1292, + "step": 31595 + }, + { + "epoch": 0.6429036458333334, + "grad_norm": 17.073768615722656, + "learning_rate": 9.379270208720734e-06, + "loss": 5.1602, + "step": 31600 + }, + { + "epoch": 0.64300537109375, + "grad_norm": 13.909344673156738, + "learning_rate": 9.379077319942618e-06, + "loss": 5.1068, + "step": 31605 + }, + { + "epoch": 0.6431070963541666, + "grad_norm": 33.248905181884766, + "learning_rate": 9.378884403183587e-06, + "loss": 5.5095, + "step": 31610 + }, + { + "epoch": 0.6432088216145834, + "grad_norm": 24.90292739868164, + "learning_rate": 9.378691458444876e-06, + "loss": 4.9062, + "step": 31615 + }, + { + "epoch": 0.643310546875, + "grad_norm": 18.746685028076172, + "learning_rate": 9.378498485727714e-06, + "loss": 4.8905, + "step": 31620 + }, + { + "epoch": 0.6434122721354166, + "grad_norm": 13.847686767578125, + "learning_rate": 9.37830548503334e-06, + "loss": 4.9263, + "step": 31625 + }, + { + "epoch": 0.6435139973958334, + "grad_norm": 12.581063270568848, + "learning_rate": 9.37811245636298e-06, + "loss": 5.1567, + "step": 31630 + }, + { + "epoch": 0.64361572265625, + "grad_norm": 19.28635597229004, + "learning_rate": 9.377919399717872e-06, + "loss": 5.1374, + "step": 31635 + }, + { + "epoch": 0.6437174479166666, + "grad_norm": 17.49570655822754, + "learning_rate": 9.377726315099251e-06, + "loss": 5.3018, + "step": 31640 + }, + { + "epoch": 0.6438191731770834, + "grad_norm": 18.288143157958984, + "learning_rate": 9.377533202508347e-06, + "loss": 4.8936, + "step": 31645 + }, + { + "epoch": 0.6439208984375, + "grad_norm": 18.141117095947266, + "learning_rate": 9.377340061946396e-06, + "loss": 5.1356, + "step": 31650 + }, + { + "epoch": 0.6440226236979166, + "grad_norm": 21.698524475097656, + "learning_rate": 9.377146893414631e-06, + "loss": 4.9714, + "step": 31655 + }, + { + "epoch": 0.6441243489583334, + "grad_norm": 17.68563461303711, + "learning_rate": 9.376953696914289e-06, + "loss": 5.03, + "step": 31660 + }, + { + "epoch": 0.64422607421875, + "grad_norm": 17.63560676574707, + "learning_rate": 9.3767604724466e-06, + "loss": 5.3209, + "step": 31665 + }, + { + "epoch": 0.6443277994791666, + "grad_norm": 20.080245971679688, + "learning_rate": 9.3765672200128e-06, + "loss": 5.0484, + "step": 31670 + }, + { + "epoch": 0.6444295247395834, + "grad_norm": 19.021167755126953, + "learning_rate": 9.376373939614128e-06, + "loss": 5.0682, + "step": 31675 + }, + { + "epoch": 0.64453125, + "grad_norm": 12.570929527282715, + "learning_rate": 9.376180631251813e-06, + "loss": 5.1575, + "step": 31680 + }, + { + "epoch": 0.6446329752604166, + "grad_norm": 16.923171997070312, + "learning_rate": 9.375987294927093e-06, + "loss": 5.1042, + "step": 31685 + }, + { + "epoch": 0.6447347005208334, + "grad_norm": 15.190240859985352, + "learning_rate": 9.375793930641205e-06, + "loss": 5.5002, + "step": 31690 + }, + { + "epoch": 0.64483642578125, + "grad_norm": 22.96544647216797, + "learning_rate": 9.375600538395382e-06, + "loss": 5.1706, + "step": 31695 + }, + { + "epoch": 0.6449381510416666, + "grad_norm": 25.179363250732422, + "learning_rate": 9.37540711819086e-06, + "loss": 5.3985, + "step": 31700 + }, + { + "epoch": 0.6450398763020834, + "grad_norm": 19.444847106933594, + "learning_rate": 9.375213670028877e-06, + "loss": 5.265, + "step": 31705 + }, + { + "epoch": 0.6451416015625, + "grad_norm": 16.342453002929688, + "learning_rate": 9.375020193910667e-06, + "loss": 5.0416, + "step": 31710 + }, + { + "epoch": 0.6452433268229166, + "grad_norm": 14.280284881591797, + "learning_rate": 9.374826689837466e-06, + "loss": 5.0746, + "step": 31715 + }, + { + "epoch": 0.6453450520833334, + "grad_norm": 16.052656173706055, + "learning_rate": 9.37463315781051e-06, + "loss": 5.0707, + "step": 31720 + }, + { + "epoch": 0.64544677734375, + "grad_norm": 16.554107666015625, + "learning_rate": 9.37443959783104e-06, + "loss": 4.9025, + "step": 31725 + }, + { + "epoch": 0.6455485026041666, + "grad_norm": 15.389242172241211, + "learning_rate": 9.374246009900287e-06, + "loss": 5.1105, + "step": 31730 + }, + { + "epoch": 0.6456502278645834, + "grad_norm": 15.098888397216797, + "learning_rate": 9.374052394019492e-06, + "loss": 4.8929, + "step": 31735 + }, + { + "epoch": 0.645751953125, + "grad_norm": 27.231069564819336, + "learning_rate": 9.373858750189892e-06, + "loss": 4.9552, + "step": 31740 + }, + { + "epoch": 0.6458536783854166, + "grad_norm": 20.739707946777344, + "learning_rate": 9.37366507841272e-06, + "loss": 5.2544, + "step": 31745 + }, + { + "epoch": 0.6459554036458334, + "grad_norm": 21.5257625579834, + "learning_rate": 9.373471378689218e-06, + "loss": 5.282, + "step": 31750 + }, + { + "epoch": 0.64605712890625, + "grad_norm": 17.82184410095215, + "learning_rate": 9.373277651020622e-06, + "loss": 5.1266, + "step": 31755 + }, + { + "epoch": 0.6461588541666666, + "grad_norm": 19.01561737060547, + "learning_rate": 9.37308389540817e-06, + "loss": 5.0638, + "step": 31760 + }, + { + "epoch": 0.6462605794270834, + "grad_norm": 18.01209831237793, + "learning_rate": 9.372890111853102e-06, + "loss": 5.1399, + "step": 31765 + }, + { + "epoch": 0.6463623046875, + "grad_norm": 19.600631713867188, + "learning_rate": 9.372696300356651e-06, + "loss": 5.2837, + "step": 31770 + }, + { + "epoch": 0.6464640299479166, + "grad_norm": 18.439409255981445, + "learning_rate": 9.37250246092006e-06, + "loss": 5.0813, + "step": 31775 + }, + { + "epoch": 0.6465657552083334, + "grad_norm": 15.93150520324707, + "learning_rate": 9.372308593544569e-06, + "loss": 5.1896, + "step": 31780 + }, + { + "epoch": 0.64666748046875, + "grad_norm": 19.16075325012207, + "learning_rate": 9.37211469823141e-06, + "loss": 5.0191, + "step": 31785 + }, + { + "epoch": 0.6467692057291666, + "grad_norm": 14.738869667053223, + "learning_rate": 9.371920774981828e-06, + "loss": 4.9938, + "step": 31790 + }, + { + "epoch": 0.6468709309895834, + "grad_norm": 18.335376739501953, + "learning_rate": 9.37172682379706e-06, + "loss": 5.1775, + "step": 31795 + }, + { + "epoch": 0.64697265625, + "grad_norm": 22.861164093017578, + "learning_rate": 9.371532844678346e-06, + "loss": 5.3352, + "step": 31800 + }, + { + "epoch": 0.6470743815104166, + "grad_norm": 14.543967247009277, + "learning_rate": 9.371338837626923e-06, + "loss": 5.1362, + "step": 31805 + }, + { + "epoch": 0.6471761067708334, + "grad_norm": 16.742889404296875, + "learning_rate": 9.371144802644033e-06, + "loss": 5.0807, + "step": 31810 + }, + { + "epoch": 0.64727783203125, + "grad_norm": 14.58894157409668, + "learning_rate": 9.370950739730916e-06, + "loss": 4.7788, + "step": 31815 + }, + { + "epoch": 0.6473795572916666, + "grad_norm": 16.72580909729004, + "learning_rate": 9.37075664888881e-06, + "loss": 5.1194, + "step": 31820 + }, + { + "epoch": 0.6474812825520834, + "grad_norm": 20.273298263549805, + "learning_rate": 9.370562530118959e-06, + "loss": 5.1294, + "step": 31825 + }, + { + "epoch": 0.6475830078125, + "grad_norm": 15.447123527526855, + "learning_rate": 9.3703683834226e-06, + "loss": 5.2688, + "step": 31830 + }, + { + "epoch": 0.6476847330729166, + "grad_norm": 16.909822463989258, + "learning_rate": 9.370174208800973e-06, + "loss": 5.2708, + "step": 31835 + }, + { + "epoch": 0.6477864583333334, + "grad_norm": 30.88945198059082, + "learning_rate": 9.36998000625532e-06, + "loss": 5.202, + "step": 31840 + }, + { + "epoch": 0.64788818359375, + "grad_norm": 14.355341911315918, + "learning_rate": 9.369785775786884e-06, + "loss": 5.1701, + "step": 31845 + }, + { + "epoch": 0.6479899088541666, + "grad_norm": 17.50690269470215, + "learning_rate": 9.369591517396902e-06, + "loss": 5.0591, + "step": 31850 + }, + { + "epoch": 0.6480916341145834, + "grad_norm": 16.855396270751953, + "learning_rate": 9.36939723108662e-06, + "loss": 5.1213, + "step": 31855 + }, + { + "epoch": 0.648193359375, + "grad_norm": 19.824779510498047, + "learning_rate": 9.369202916857274e-06, + "loss": 5.0308, + "step": 31860 + }, + { + "epoch": 0.6482950846354166, + "grad_norm": 14.854310989379883, + "learning_rate": 9.369008574710108e-06, + "loss": 5.2436, + "step": 31865 + }, + { + "epoch": 0.6483968098958334, + "grad_norm": 22.381385803222656, + "learning_rate": 9.368814204646366e-06, + "loss": 5.0767, + "step": 31870 + }, + { + "epoch": 0.64849853515625, + "grad_norm": 17.58449935913086, + "learning_rate": 9.368619806667288e-06, + "loss": 5.1211, + "step": 31875 + }, + { + "epoch": 0.6486002604166666, + "grad_norm": 20.52455711364746, + "learning_rate": 9.368425380774114e-06, + "loss": 5.2309, + "step": 31880 + }, + { + "epoch": 0.6487019856770834, + "grad_norm": 13.858134269714355, + "learning_rate": 9.36823092696809e-06, + "loss": 4.9689, + "step": 31885 + }, + { + "epoch": 0.6488037109375, + "grad_norm": 15.99581527709961, + "learning_rate": 9.368036445250457e-06, + "loss": 5.1994, + "step": 31890 + }, + { + "epoch": 0.6489054361979166, + "grad_norm": 15.641708374023438, + "learning_rate": 9.367841935622459e-06, + "loss": 4.9747, + "step": 31895 + }, + { + "epoch": 0.6490071614583334, + "grad_norm": 25.593788146972656, + "learning_rate": 9.367647398085336e-06, + "loss": 5.317, + "step": 31900 + }, + { + "epoch": 0.64910888671875, + "grad_norm": 15.494670867919922, + "learning_rate": 9.367452832640333e-06, + "loss": 5.0467, + "step": 31905 + }, + { + "epoch": 0.6492106119791666, + "grad_norm": 24.892499923706055, + "learning_rate": 9.367258239288692e-06, + "loss": 4.9799, + "step": 31910 + }, + { + "epoch": 0.6493123372395834, + "grad_norm": 12.69201946258545, + "learning_rate": 9.36706361803166e-06, + "loss": 5.1537, + "step": 31915 + }, + { + "epoch": 0.6494140625, + "grad_norm": 17.69487953186035, + "learning_rate": 9.366868968870476e-06, + "loss": 5.3128, + "step": 31920 + }, + { + "epoch": 0.6495157877604166, + "grad_norm": 18.42898178100586, + "learning_rate": 9.366674291806384e-06, + "loss": 5.0162, + "step": 31925 + }, + { + "epoch": 0.6496175130208334, + "grad_norm": 19.846982955932617, + "learning_rate": 9.366479586840631e-06, + "loss": 5.1628, + "step": 31930 + }, + { + "epoch": 0.64971923828125, + "grad_norm": 14.689205169677734, + "learning_rate": 9.36628485397446e-06, + "loss": 5.0212, + "step": 31935 + }, + { + "epoch": 0.6498209635416666, + "grad_norm": 19.13104820251465, + "learning_rate": 9.366090093209114e-06, + "loss": 4.9644, + "step": 31940 + }, + { + "epoch": 0.6499226888020834, + "grad_norm": 12.935026168823242, + "learning_rate": 9.365895304545839e-06, + "loss": 4.8898, + "step": 31945 + }, + { + "epoch": 0.6500244140625, + "grad_norm": 30.09904670715332, + "learning_rate": 9.365700487985878e-06, + "loss": 5.2216, + "step": 31950 + }, + { + "epoch": 0.6501261393229166, + "grad_norm": 12.922197341918945, + "learning_rate": 9.365505643530477e-06, + "loss": 5.3568, + "step": 31955 + }, + { + "epoch": 0.6502278645833334, + "grad_norm": 20.092294692993164, + "learning_rate": 9.365310771180881e-06, + "loss": 4.8878, + "step": 31960 + }, + { + "epoch": 0.65032958984375, + "grad_norm": 14.165946006774902, + "learning_rate": 9.365115870938335e-06, + "loss": 4.9287, + "step": 31965 + }, + { + "epoch": 0.6504313151041666, + "grad_norm": 15.300055503845215, + "learning_rate": 9.364920942804083e-06, + "loss": 5.2126, + "step": 31970 + }, + { + "epoch": 0.6505330403645834, + "grad_norm": 21.99089813232422, + "learning_rate": 9.364725986779374e-06, + "loss": 5.0731, + "step": 31975 + }, + { + "epoch": 0.650634765625, + "grad_norm": 14.934657096862793, + "learning_rate": 9.36453100286545e-06, + "loss": 5.0901, + "step": 31980 + }, + { + "epoch": 0.6507364908854166, + "grad_norm": 15.056396484375, + "learning_rate": 9.364335991063558e-06, + "loss": 4.817, + "step": 31985 + }, + { + "epoch": 0.6508382161458334, + "grad_norm": 17.0059814453125, + "learning_rate": 9.364140951374947e-06, + "loss": 5.0931, + "step": 31990 + }, + { + "epoch": 0.65093994140625, + "grad_norm": 15.536980628967285, + "learning_rate": 9.363945883800859e-06, + "loss": 5.0642, + "step": 31995 + }, + { + "epoch": 0.6510416666666666, + "grad_norm": 15.093798637390137, + "learning_rate": 9.36375078834254e-06, + "loss": 5.3526, + "step": 32000 + }, + { + "epoch": 0.6511433919270834, + "grad_norm": 15.618889808654785, + "learning_rate": 9.363555665001242e-06, + "loss": 5.1204, + "step": 32005 + }, + { + "epoch": 0.6512451171875, + "grad_norm": 19.405811309814453, + "learning_rate": 9.363360513778205e-06, + "loss": 5.0761, + "step": 32010 + }, + { + "epoch": 0.6513468424479166, + "grad_norm": 19.521831512451172, + "learning_rate": 9.363165334674682e-06, + "loss": 4.9768, + "step": 32015 + }, + { + "epoch": 0.6514485677083334, + "grad_norm": 15.260570526123047, + "learning_rate": 9.362970127691919e-06, + "loss": 5.0205, + "step": 32020 + }, + { + "epoch": 0.65155029296875, + "grad_norm": 16.903160095214844, + "learning_rate": 9.362774892831158e-06, + "loss": 5.1691, + "step": 32025 + }, + { + "epoch": 0.6516520182291666, + "grad_norm": 19.80805778503418, + "learning_rate": 9.362579630093654e-06, + "loss": 5.215, + "step": 32030 + }, + { + "epoch": 0.6517537434895834, + "grad_norm": 25.50945472717285, + "learning_rate": 9.36238433948065e-06, + "loss": 5.085, + "step": 32035 + }, + { + "epoch": 0.65185546875, + "grad_norm": 16.994821548461914, + "learning_rate": 9.362189020993393e-06, + "loss": 5.2576, + "step": 32040 + }, + { + "epoch": 0.6519571940104166, + "grad_norm": 15.358511924743652, + "learning_rate": 9.361993674633133e-06, + "loss": 5.0805, + "step": 32045 + }, + { + "epoch": 0.6520589192708334, + "grad_norm": 17.737714767456055, + "learning_rate": 9.361798300401121e-06, + "loss": 5.3507, + "step": 32050 + }, + { + "epoch": 0.65216064453125, + "grad_norm": 16.136465072631836, + "learning_rate": 9.3616028982986e-06, + "loss": 5.0261, + "step": 32055 + }, + { + "epoch": 0.6522623697916666, + "grad_norm": 18.75303077697754, + "learning_rate": 9.361407468326821e-06, + "loss": 5.323, + "step": 32060 + }, + { + "epoch": 0.6523640950520834, + "grad_norm": 15.296272277832031, + "learning_rate": 9.361212010487036e-06, + "loss": 5.1439, + "step": 32065 + }, + { + "epoch": 0.6524658203125, + "grad_norm": 15.4063138961792, + "learning_rate": 9.361016524780487e-06, + "loss": 5.0981, + "step": 32070 + }, + { + "epoch": 0.6525675455729166, + "grad_norm": 16.670991897583008, + "learning_rate": 9.360821011208429e-06, + "loss": 5.1483, + "step": 32075 + }, + { + "epoch": 0.6526692708333334, + "grad_norm": 12.973590850830078, + "learning_rate": 9.360625469772106e-06, + "loss": 5.0456, + "step": 32080 + }, + { + "epoch": 0.65277099609375, + "grad_norm": 15.111567497253418, + "learning_rate": 9.360429900472775e-06, + "loss": 4.9944, + "step": 32085 + }, + { + "epoch": 0.6528727213541666, + "grad_norm": 15.438923835754395, + "learning_rate": 9.360234303311677e-06, + "loss": 5.214, + "step": 32090 + }, + { + "epoch": 0.6529744466145834, + "grad_norm": 17.926156997680664, + "learning_rate": 9.360038678290068e-06, + "loss": 5.1983, + "step": 32095 + }, + { + "epoch": 0.653076171875, + "grad_norm": 18.08146095275879, + "learning_rate": 9.359843025409197e-06, + "loss": 5.1049, + "step": 32100 + }, + { + "epoch": 0.6531778971354166, + "grad_norm": 31.95272445678711, + "learning_rate": 9.359647344670311e-06, + "loss": 4.848, + "step": 32105 + }, + { + "epoch": 0.6532796223958334, + "grad_norm": 17.123531341552734, + "learning_rate": 9.359451636074664e-06, + "loss": 5.0471, + "step": 32110 + }, + { + "epoch": 0.65338134765625, + "grad_norm": 23.285003662109375, + "learning_rate": 9.359255899623502e-06, + "loss": 5.3275, + "step": 32115 + }, + { + "epoch": 0.6534830729166666, + "grad_norm": 15.990904808044434, + "learning_rate": 9.359060135318081e-06, + "loss": 5.0217, + "step": 32120 + }, + { + "epoch": 0.6535847981770834, + "grad_norm": 17.099428176879883, + "learning_rate": 9.35886434315965e-06, + "loss": 4.9222, + "step": 32125 + }, + { + "epoch": 0.6536865234375, + "grad_norm": 28.57319450378418, + "learning_rate": 9.358668523149457e-06, + "loss": 5.3071, + "step": 32130 + }, + { + "epoch": 0.6537882486979166, + "grad_norm": 12.728937149047852, + "learning_rate": 9.358472675288759e-06, + "loss": 5.0098, + "step": 32135 + }, + { + "epoch": 0.6538899739583334, + "grad_norm": 18.25038719177246, + "learning_rate": 9.358276799578801e-06, + "loss": 5.5677, + "step": 32140 + }, + { + "epoch": 0.65399169921875, + "grad_norm": 23.707149505615234, + "learning_rate": 9.35808089602084e-06, + "loss": 5.3047, + "step": 32145 + }, + { + "epoch": 0.6540934244791666, + "grad_norm": 16.158048629760742, + "learning_rate": 9.357884964616125e-06, + "loss": 4.9151, + "step": 32150 + }, + { + "epoch": 0.6541951497395834, + "grad_norm": 20.851741790771484, + "learning_rate": 9.357689005365908e-06, + "loss": 5.1877, + "step": 32155 + }, + { + "epoch": 0.654296875, + "grad_norm": 17.22917366027832, + "learning_rate": 9.357493018271442e-06, + "loss": 5.1435, + "step": 32160 + }, + { + "epoch": 0.6543986002604166, + "grad_norm": 18.899051666259766, + "learning_rate": 9.357297003333979e-06, + "loss": 5.2115, + "step": 32165 + }, + { + "epoch": 0.6545003255208334, + "grad_norm": 14.868898391723633, + "learning_rate": 9.35710096055477e-06, + "loss": 4.889, + "step": 32170 + }, + { + "epoch": 0.65460205078125, + "grad_norm": 16.201168060302734, + "learning_rate": 9.35690488993507e-06, + "loss": 5.2407, + "step": 32175 + }, + { + "epoch": 0.6547037760416666, + "grad_norm": 14.71911907196045, + "learning_rate": 9.35670879147613e-06, + "loss": 5.2201, + "step": 32180 + }, + { + "epoch": 0.6548055013020834, + "grad_norm": 15.511197090148926, + "learning_rate": 9.356512665179206e-06, + "loss": 5.1526, + "step": 32185 + }, + { + "epoch": 0.6549072265625, + "grad_norm": 19.179201126098633, + "learning_rate": 9.356316511045547e-06, + "loss": 5.1446, + "step": 32190 + }, + { + "epoch": 0.6550089518229166, + "grad_norm": 21.813467025756836, + "learning_rate": 9.35612032907641e-06, + "loss": 5.0855, + "step": 32195 + }, + { + "epoch": 0.6551106770833334, + "grad_norm": 15.791779518127441, + "learning_rate": 9.355924119273046e-06, + "loss": 5.0717, + "step": 32200 + }, + { + "epoch": 0.65521240234375, + "grad_norm": 14.834059715270996, + "learning_rate": 9.35572788163671e-06, + "loss": 5.1948, + "step": 32205 + }, + { + "epoch": 0.6553141276041666, + "grad_norm": 17.650678634643555, + "learning_rate": 9.355531616168654e-06, + "loss": 5.2564, + "step": 32210 + }, + { + "epoch": 0.6554158528645834, + "grad_norm": 18.7623291015625, + "learning_rate": 9.355335322870136e-06, + "loss": 5.1751, + "step": 32215 + }, + { + "epoch": 0.655517578125, + "grad_norm": 17.45766830444336, + "learning_rate": 9.355139001742406e-06, + "loss": 5.0213, + "step": 32220 + }, + { + "epoch": 0.6556193033854166, + "grad_norm": 15.569891929626465, + "learning_rate": 9.354942652786721e-06, + "loss": 4.9875, + "step": 32225 + }, + { + "epoch": 0.6557210286458334, + "grad_norm": 17.937759399414062, + "learning_rate": 9.354746276004334e-06, + "loss": 5.2048, + "step": 32230 + }, + { + "epoch": 0.65582275390625, + "grad_norm": 16.887985229492188, + "learning_rate": 9.3545498713965e-06, + "loss": 4.7451, + "step": 32235 + }, + { + "epoch": 0.6559244791666666, + "grad_norm": 18.512813568115234, + "learning_rate": 9.354353438964479e-06, + "loss": 5.2889, + "step": 32240 + }, + { + "epoch": 0.6560262044270834, + "grad_norm": 17.359336853027344, + "learning_rate": 9.354156978709519e-06, + "loss": 5.0962, + "step": 32245 + }, + { + "epoch": 0.6561279296875, + "grad_norm": 21.34217643737793, + "learning_rate": 9.353960490632877e-06, + "loss": 5.18, + "step": 32250 + }, + { + "epoch": 0.6562296549479166, + "grad_norm": 15.591708183288574, + "learning_rate": 9.353763974735809e-06, + "loss": 5.1898, + "step": 32255 + }, + { + "epoch": 0.6563313802083334, + "grad_norm": 17.280122756958008, + "learning_rate": 9.353567431019574e-06, + "loss": 5.1428, + "step": 32260 + }, + { + "epoch": 0.65643310546875, + "grad_norm": 17.295360565185547, + "learning_rate": 9.353370859485425e-06, + "loss": 4.9398, + "step": 32265 + }, + { + "epoch": 0.6565348307291666, + "grad_norm": 17.71057891845703, + "learning_rate": 9.353174260134617e-06, + "loss": 5.0372, + "step": 32270 + }, + { + "epoch": 0.6566365559895834, + "grad_norm": 18.83734130859375, + "learning_rate": 9.352977632968408e-06, + "loss": 4.952, + "step": 32275 + }, + { + "epoch": 0.65673828125, + "grad_norm": 15.384209632873535, + "learning_rate": 9.352780977988052e-06, + "loss": 5.1112, + "step": 32280 + }, + { + "epoch": 0.6568400065104166, + "grad_norm": 20.6132869720459, + "learning_rate": 9.35258429519481e-06, + "loss": 5.0467, + "step": 32285 + }, + { + "epoch": 0.6569417317708334, + "grad_norm": 19.817201614379883, + "learning_rate": 9.352387584589934e-06, + "loss": 5.4826, + "step": 32290 + }, + { + "epoch": 0.65704345703125, + "grad_norm": 22.55689811706543, + "learning_rate": 9.352190846174682e-06, + "loss": 5.2852, + "step": 32295 + }, + { + "epoch": 0.6571451822916666, + "grad_norm": 15.523427963256836, + "learning_rate": 9.351994079950315e-06, + "loss": 5.1533, + "step": 32300 + }, + { + "epoch": 0.6572469075520834, + "grad_norm": 16.20138931274414, + "learning_rate": 9.351797285918085e-06, + "loss": 5.1477, + "step": 32305 + }, + { + "epoch": 0.6573486328125, + "grad_norm": 16.288331985473633, + "learning_rate": 9.351600464079254e-06, + "loss": 4.9398, + "step": 32310 + }, + { + "epoch": 0.6574503580729166, + "grad_norm": 16.6279296875, + "learning_rate": 9.351403614435076e-06, + "loss": 5.1058, + "step": 32315 + }, + { + "epoch": 0.6575520833333334, + "grad_norm": 23.79439926147461, + "learning_rate": 9.35120673698681e-06, + "loss": 5.1837, + "step": 32320 + }, + { + "epoch": 0.65765380859375, + "grad_norm": 16.153364181518555, + "learning_rate": 9.351009831735714e-06, + "loss": 5.1948, + "step": 32325 + }, + { + "epoch": 0.6577555338541666, + "grad_norm": 24.66461753845215, + "learning_rate": 9.350812898683048e-06, + "loss": 5.2068, + "step": 32330 + }, + { + "epoch": 0.6578572591145834, + "grad_norm": 27.252302169799805, + "learning_rate": 9.350615937830067e-06, + "loss": 5.1373, + "step": 32335 + }, + { + "epoch": 0.657958984375, + "grad_norm": 15.982672691345215, + "learning_rate": 9.350418949178033e-06, + "loss": 5.1382, + "step": 32340 + }, + { + "epoch": 0.6580607096354166, + "grad_norm": 14.148080825805664, + "learning_rate": 9.3502219327282e-06, + "loss": 5.0963, + "step": 32345 + }, + { + "epoch": 0.6581624348958334, + "grad_norm": 17.57377052307129, + "learning_rate": 9.350024888481833e-06, + "loss": 5.4766, + "step": 32350 + }, + { + "epoch": 0.65826416015625, + "grad_norm": 17.973426818847656, + "learning_rate": 9.349827816440186e-06, + "loss": 5.1117, + "step": 32355 + }, + { + "epoch": 0.6583658854166666, + "grad_norm": 13.870417594909668, + "learning_rate": 9.34963071660452e-06, + "loss": 5.0859, + "step": 32360 + }, + { + "epoch": 0.6584676106770834, + "grad_norm": 18.768579483032227, + "learning_rate": 9.349433588976095e-06, + "loss": 4.9218, + "step": 32365 + }, + { + "epoch": 0.6585693359375, + "grad_norm": 19.954084396362305, + "learning_rate": 9.349236433556171e-06, + "loss": 5.1216, + "step": 32370 + }, + { + "epoch": 0.6586710611979166, + "grad_norm": 19.275806427001953, + "learning_rate": 9.349039250346007e-06, + "loss": 5.0607, + "step": 32375 + }, + { + "epoch": 0.6587727864583334, + "grad_norm": 14.72518253326416, + "learning_rate": 9.34884203934686e-06, + "loss": 4.9967, + "step": 32380 + }, + { + "epoch": 0.65887451171875, + "grad_norm": 16.522930145263672, + "learning_rate": 9.348644800559994e-06, + "loss": 5.2174, + "step": 32385 + }, + { + "epoch": 0.6589762369791666, + "grad_norm": 13.20848274230957, + "learning_rate": 9.348447533986669e-06, + "loss": 5.0781, + "step": 32390 + }, + { + "epoch": 0.6590779622395834, + "grad_norm": 13.842391014099121, + "learning_rate": 9.348250239628145e-06, + "loss": 5.2024, + "step": 32395 + }, + { + "epoch": 0.6591796875, + "grad_norm": 16.86635971069336, + "learning_rate": 9.348052917485682e-06, + "loss": 5.2434, + "step": 32400 + }, + { + "epoch": 0.6592814127604166, + "grad_norm": 14.932280540466309, + "learning_rate": 9.34785556756054e-06, + "loss": 4.9289, + "step": 32405 + }, + { + "epoch": 0.6593831380208334, + "grad_norm": 15.8579740524292, + "learning_rate": 9.347658189853984e-06, + "loss": 5.4547, + "step": 32410 + }, + { + "epoch": 0.65948486328125, + "grad_norm": 16.13640022277832, + "learning_rate": 9.347460784367268e-06, + "loss": 5.0031, + "step": 32415 + }, + { + "epoch": 0.6595865885416666, + "grad_norm": 17.048534393310547, + "learning_rate": 9.34726335110166e-06, + "loss": 4.9816, + "step": 32420 + }, + { + "epoch": 0.6596883138020834, + "grad_norm": 17.062070846557617, + "learning_rate": 9.34706589005842e-06, + "loss": 4.8776, + "step": 32425 + }, + { + "epoch": 0.6597900390625, + "grad_norm": 17.85244369506836, + "learning_rate": 9.346868401238807e-06, + "loss": 5.1779, + "step": 32430 + }, + { + "epoch": 0.6598917643229166, + "grad_norm": 14.590566635131836, + "learning_rate": 9.346670884644086e-06, + "loss": 5.0687, + "step": 32435 + }, + { + "epoch": 0.6599934895833334, + "grad_norm": 16.728273391723633, + "learning_rate": 9.346473340275517e-06, + "loss": 5.0408, + "step": 32440 + }, + { + "epoch": 0.66009521484375, + "grad_norm": 19.395627975463867, + "learning_rate": 9.346275768134365e-06, + "loss": 4.9599, + "step": 32445 + }, + { + "epoch": 0.6601969401041666, + "grad_norm": 19.95354461669922, + "learning_rate": 9.346078168221889e-06, + "loss": 4.9687, + "step": 32450 + }, + { + "epoch": 0.6602986653645834, + "grad_norm": 16.805248260498047, + "learning_rate": 9.345880540539352e-06, + "loss": 5.2077, + "step": 32455 + }, + { + "epoch": 0.660400390625, + "grad_norm": 22.60630226135254, + "learning_rate": 9.345682885088022e-06, + "loss": 5.1665, + "step": 32460 + }, + { + "epoch": 0.6605021158854166, + "grad_norm": 19.516544342041016, + "learning_rate": 9.345485201869154e-06, + "loss": 5.5152, + "step": 32465 + }, + { + "epoch": 0.6606038411458334, + "grad_norm": 21.124103546142578, + "learning_rate": 9.345287490884016e-06, + "loss": 5.2871, + "step": 32470 + }, + { + "epoch": 0.66070556640625, + "grad_norm": 17.46218490600586, + "learning_rate": 9.34508975213387e-06, + "loss": 5.2183, + "step": 32475 + }, + { + "epoch": 0.6608072916666666, + "grad_norm": 20.428285598754883, + "learning_rate": 9.34489198561998e-06, + "loss": 5.1523, + "step": 32480 + }, + { + "epoch": 0.6609090169270834, + "grad_norm": 20.239395141601562, + "learning_rate": 9.34469419134361e-06, + "loss": 4.9437, + "step": 32485 + }, + { + "epoch": 0.6610107421875, + "grad_norm": 17.014432907104492, + "learning_rate": 9.344496369306024e-06, + "loss": 5.292, + "step": 32490 + }, + { + "epoch": 0.6611124674479166, + "grad_norm": 13.729925155639648, + "learning_rate": 9.344298519508484e-06, + "loss": 5.4935, + "step": 32495 + }, + { + "epoch": 0.6612141927083334, + "grad_norm": 14.85672664642334, + "learning_rate": 9.344100641952257e-06, + "loss": 5.04, + "step": 32500 + }, + { + "epoch": 0.66131591796875, + "grad_norm": 21.111942291259766, + "learning_rate": 9.343902736638604e-06, + "loss": 5.0941, + "step": 32505 + }, + { + "epoch": 0.6614176432291666, + "grad_norm": 18.3752384185791, + "learning_rate": 9.34370480356879e-06, + "loss": 5.0401, + "step": 32510 + }, + { + "epoch": 0.6615193684895834, + "grad_norm": 13.565577507019043, + "learning_rate": 9.343506842744085e-06, + "loss": 4.8749, + "step": 32515 + }, + { + "epoch": 0.66162109375, + "grad_norm": 21.64328384399414, + "learning_rate": 9.343308854165748e-06, + "loss": 4.9719, + "step": 32520 + }, + { + "epoch": 0.6617228190104166, + "grad_norm": 16.815446853637695, + "learning_rate": 9.343110837835048e-06, + "loss": 4.8923, + "step": 32525 + }, + { + "epoch": 0.6618245442708334, + "grad_norm": 19.471406936645508, + "learning_rate": 9.342912793753248e-06, + "loss": 5.1168, + "step": 32530 + }, + { + "epoch": 0.66192626953125, + "grad_norm": 14.498950004577637, + "learning_rate": 9.342714721921612e-06, + "loss": 5.1017, + "step": 32535 + }, + { + "epoch": 0.6620279947916666, + "grad_norm": 16.449010848999023, + "learning_rate": 9.34251662234141e-06, + "loss": 5.1352, + "step": 32540 + }, + { + "epoch": 0.6621297200520834, + "grad_norm": 15.574583053588867, + "learning_rate": 9.342318495013903e-06, + "loss": 5.3548, + "step": 32545 + }, + { + "epoch": 0.6622314453125, + "grad_norm": 13.667036056518555, + "learning_rate": 9.342120339940359e-06, + "loss": 5.088, + "step": 32550 + }, + { + "epoch": 0.6623331705729166, + "grad_norm": 18.432472229003906, + "learning_rate": 9.341922157122045e-06, + "loss": 5.3828, + "step": 32555 + }, + { + "epoch": 0.6624348958333334, + "grad_norm": 16.113418579101562, + "learning_rate": 9.341723946560229e-06, + "loss": 5.156, + "step": 32560 + }, + { + "epoch": 0.66253662109375, + "grad_norm": 16.192623138427734, + "learning_rate": 9.341525708256172e-06, + "loss": 5.2736, + "step": 32565 + }, + { + "epoch": 0.6626383463541666, + "grad_norm": 15.657552719116211, + "learning_rate": 9.341327442211145e-06, + "loss": 4.9089, + "step": 32570 + }, + { + "epoch": 0.6627400716145834, + "grad_norm": 17.297391891479492, + "learning_rate": 9.341129148426414e-06, + "loss": 5.06, + "step": 32575 + }, + { + "epoch": 0.662841796875, + "grad_norm": 16.469463348388672, + "learning_rate": 9.340930826903246e-06, + "loss": 5.2117, + "step": 32580 + }, + { + "epoch": 0.6629435221354166, + "grad_norm": 12.753962516784668, + "learning_rate": 9.340732477642908e-06, + "loss": 5.0543, + "step": 32585 + }, + { + "epoch": 0.6630452473958334, + "grad_norm": 16.222759246826172, + "learning_rate": 9.340534100646668e-06, + "loss": 5.104, + "step": 32590 + }, + { + "epoch": 0.66314697265625, + "grad_norm": 16.54787826538086, + "learning_rate": 9.340335695915791e-06, + "loss": 5.1337, + "step": 32595 + }, + { + "epoch": 0.6632486979166666, + "grad_norm": 17.57404136657715, + "learning_rate": 9.340137263451548e-06, + "loss": 4.8527, + "step": 32600 + }, + { + "epoch": 0.6633504231770834, + "grad_norm": 14.170412063598633, + "learning_rate": 9.339938803255205e-06, + "loss": 4.9518, + "step": 32605 + }, + { + "epoch": 0.6634521484375, + "grad_norm": 23.183734893798828, + "learning_rate": 9.339740315328033e-06, + "loss": 5.2425, + "step": 32610 + }, + { + "epoch": 0.6635538736979166, + "grad_norm": 15.66765022277832, + "learning_rate": 9.339541799671296e-06, + "loss": 5.0273, + "step": 32615 + }, + { + "epoch": 0.6636555989583334, + "grad_norm": 15.837629318237305, + "learning_rate": 9.339343256286266e-06, + "loss": 4.9509, + "step": 32620 + }, + { + "epoch": 0.66375732421875, + "grad_norm": 17.631166458129883, + "learning_rate": 9.339144685174208e-06, + "loss": 5.121, + "step": 32625 + }, + { + "epoch": 0.6638590494791666, + "grad_norm": 13.432469367980957, + "learning_rate": 9.338946086336393e-06, + "loss": 5.1565, + "step": 32630 + }, + { + "epoch": 0.6639607747395834, + "grad_norm": 12.67055892944336, + "learning_rate": 9.338747459774091e-06, + "loss": 5.0715, + "step": 32635 + }, + { + "epoch": 0.6640625, + "grad_norm": 17.463741302490234, + "learning_rate": 9.33854880548857e-06, + "loss": 5.4558, + "step": 32640 + }, + { + "epoch": 0.6641642252604166, + "grad_norm": 18.484455108642578, + "learning_rate": 9.338350123481098e-06, + "loss": 4.9685, + "step": 32645 + }, + { + "epoch": 0.6642659505208334, + "grad_norm": 14.986715316772461, + "learning_rate": 9.338151413752948e-06, + "loss": 5.1886, + "step": 32650 + }, + { + "epoch": 0.66436767578125, + "grad_norm": 13.093377113342285, + "learning_rate": 9.337952676305386e-06, + "loss": 5.4337, + "step": 32655 + }, + { + "epoch": 0.6644694010416666, + "grad_norm": 16.78972816467285, + "learning_rate": 9.337753911139684e-06, + "loss": 5.3782, + "step": 32660 + }, + { + "epoch": 0.6645711263020834, + "grad_norm": 15.116313934326172, + "learning_rate": 9.337555118257111e-06, + "loss": 4.9703, + "step": 32665 + }, + { + "epoch": 0.6646728515625, + "grad_norm": 16.509916305541992, + "learning_rate": 9.337356297658939e-06, + "loss": 5.2182, + "step": 32670 + }, + { + "epoch": 0.6647745768229166, + "grad_norm": 14.227357864379883, + "learning_rate": 9.337157449346437e-06, + "loss": 5.2021, + "step": 32675 + }, + { + "epoch": 0.6648763020833334, + "grad_norm": 14.513218879699707, + "learning_rate": 9.336958573320875e-06, + "loss": 5.4921, + "step": 32680 + }, + { + "epoch": 0.66497802734375, + "grad_norm": 19.572721481323242, + "learning_rate": 9.336759669583524e-06, + "loss": 5.011, + "step": 32685 + }, + { + "epoch": 0.6650797526041666, + "grad_norm": 14.991767883300781, + "learning_rate": 9.336560738135656e-06, + "loss": 5.2433, + "step": 32690 + }, + { + "epoch": 0.6651814778645834, + "grad_norm": 15.613245010375977, + "learning_rate": 9.336361778978542e-06, + "loss": 5.4961, + "step": 32695 + }, + { + "epoch": 0.665283203125, + "grad_norm": 12.882061004638672, + "learning_rate": 9.336162792113452e-06, + "loss": 5.4078, + "step": 32700 + }, + { + "epoch": 0.6653849283854166, + "grad_norm": 15.7986421585083, + "learning_rate": 9.335963777541657e-06, + "loss": 5.0489, + "step": 32705 + }, + { + "epoch": 0.6654866536458334, + "grad_norm": 15.454224586486816, + "learning_rate": 9.335764735264432e-06, + "loss": 5.0045, + "step": 32710 + }, + { + "epoch": 0.66558837890625, + "grad_norm": 24.476852416992188, + "learning_rate": 9.335565665283047e-06, + "loss": 5.2788, + "step": 32715 + }, + { + "epoch": 0.6656901041666666, + "grad_norm": 20.15134620666504, + "learning_rate": 9.335366567598771e-06, + "loss": 5.0776, + "step": 32720 + }, + { + "epoch": 0.6657918294270834, + "grad_norm": 15.948907852172852, + "learning_rate": 9.33516744221288e-06, + "loss": 5.1429, + "step": 32725 + }, + { + "epoch": 0.6658935546875, + "grad_norm": 18.665531158447266, + "learning_rate": 9.334968289126647e-06, + "loss": 5.2582, + "step": 32730 + }, + { + "epoch": 0.6659952799479166, + "grad_norm": 19.165267944335938, + "learning_rate": 9.334769108341341e-06, + "loss": 5.0019, + "step": 32735 + }, + { + "epoch": 0.6660970052083334, + "grad_norm": 17.431194305419922, + "learning_rate": 9.334569899858237e-06, + "loss": 4.927, + "step": 32740 + }, + { + "epoch": 0.66619873046875, + "grad_norm": 20.832916259765625, + "learning_rate": 9.334370663678606e-06, + "loss": 4.8816, + "step": 32745 + }, + { + "epoch": 0.6663004557291666, + "grad_norm": 13.429802894592285, + "learning_rate": 9.334171399803724e-06, + "loss": 4.856, + "step": 32750 + }, + { + "epoch": 0.6664021809895834, + "grad_norm": 15.896209716796875, + "learning_rate": 9.333972108234864e-06, + "loss": 4.9135, + "step": 32755 + }, + { + "epoch": 0.66650390625, + "grad_norm": 18.52705192565918, + "learning_rate": 9.333772788973295e-06, + "loss": 4.9946, + "step": 32760 + }, + { + "epoch": 0.6666056315104166, + "grad_norm": 15.347861289978027, + "learning_rate": 9.333573442020296e-06, + "loss": 5.1195, + "step": 32765 + }, + { + "epoch": 0.6667073567708334, + "grad_norm": 20.257789611816406, + "learning_rate": 9.333374067377138e-06, + "loss": 5.1905, + "step": 32770 + }, + { + "epoch": 0.66680908203125, + "grad_norm": 17.23021697998047, + "learning_rate": 9.333174665045092e-06, + "loss": 4.9415, + "step": 32775 + }, + { + "epoch": 0.6669108072916666, + "grad_norm": 26.570280075073242, + "learning_rate": 9.33297523502544e-06, + "loss": 5.146, + "step": 32780 + }, + { + "epoch": 0.6670125325520834, + "grad_norm": 18.774015426635742, + "learning_rate": 9.332775777319447e-06, + "loss": 5.1205, + "step": 32785 + }, + { + "epoch": 0.6671142578125, + "grad_norm": 24.157859802246094, + "learning_rate": 9.332576291928396e-06, + "loss": 5.1653, + "step": 32790 + }, + { + "epoch": 0.6672159830729166, + "grad_norm": 17.214920043945312, + "learning_rate": 9.332376778853556e-06, + "loss": 5.1304, + "step": 32795 + }, + { + "epoch": 0.6673177083333334, + "grad_norm": 14.644450187683105, + "learning_rate": 9.332177238096205e-06, + "loss": 4.8219, + "step": 32800 + }, + { + "epoch": 0.66741943359375, + "grad_norm": 17.80490493774414, + "learning_rate": 9.331977669657616e-06, + "loss": 5.0844, + "step": 32805 + }, + { + "epoch": 0.6675211588541666, + "grad_norm": 15.877418518066406, + "learning_rate": 9.331778073539064e-06, + "loss": 5.0128, + "step": 32810 + }, + { + "epoch": 0.6676228841145834, + "grad_norm": 19.210464477539062, + "learning_rate": 9.331578449741827e-06, + "loss": 5.2266, + "step": 32815 + }, + { + "epoch": 0.667724609375, + "grad_norm": 16.218170166015625, + "learning_rate": 9.331378798267177e-06, + "loss": 4.9119, + "step": 32820 + }, + { + "epoch": 0.6678263346354166, + "grad_norm": 15.468350410461426, + "learning_rate": 9.331179119116392e-06, + "loss": 5.0599, + "step": 32825 + }, + { + "epoch": 0.6679280598958334, + "grad_norm": 17.82147979736328, + "learning_rate": 9.330979412290748e-06, + "loss": 5.1171, + "step": 32830 + }, + { + "epoch": 0.66802978515625, + "grad_norm": 14.334609985351562, + "learning_rate": 9.330779677791519e-06, + "loss": 4.842, + "step": 32835 + }, + { + "epoch": 0.6681315104166666, + "grad_norm": 14.687226295471191, + "learning_rate": 9.330579915619984e-06, + "loss": 5.0392, + "step": 32840 + }, + { + "epoch": 0.6682332356770834, + "grad_norm": 12.280237197875977, + "learning_rate": 9.330380125777418e-06, + "loss": 5.177, + "step": 32845 + }, + { + "epoch": 0.6683349609375, + "grad_norm": 17.919784545898438, + "learning_rate": 9.330180308265098e-06, + "loss": 4.9436, + "step": 32850 + }, + { + "epoch": 0.6684366861979166, + "grad_norm": 17.637338638305664, + "learning_rate": 9.329980463084299e-06, + "loss": 5.2668, + "step": 32855 + }, + { + "epoch": 0.6685384114583334, + "grad_norm": 16.70052719116211, + "learning_rate": 9.3297805902363e-06, + "loss": 5.1253, + "step": 32860 + }, + { + "epoch": 0.66864013671875, + "grad_norm": 25.526260375976562, + "learning_rate": 9.329580689722378e-06, + "loss": 4.9345, + "step": 32865 + }, + { + "epoch": 0.6687418619791666, + "grad_norm": 14.283625602722168, + "learning_rate": 9.329380761543809e-06, + "loss": 5.1501, + "step": 32870 + }, + { + "epoch": 0.6688435872395834, + "grad_norm": 14.842408180236816, + "learning_rate": 9.329180805701872e-06, + "loss": 4.8129, + "step": 32875 + }, + { + "epoch": 0.6689453125, + "grad_norm": 16.1677303314209, + "learning_rate": 9.328980822197843e-06, + "loss": 5.4485, + "step": 32880 + }, + { + "epoch": 0.6690470377604166, + "grad_norm": 14.29813289642334, + "learning_rate": 9.328780811033002e-06, + "loss": 4.9852, + "step": 32885 + }, + { + "epoch": 0.6691487630208334, + "grad_norm": 12.207612991333008, + "learning_rate": 9.328580772208624e-06, + "loss": 5.086, + "step": 32890 + }, + { + "epoch": 0.66925048828125, + "grad_norm": 16.98727035522461, + "learning_rate": 9.32838070572599e-06, + "loss": 5.1065, + "step": 32895 + }, + { + "epoch": 0.6693522135416666, + "grad_norm": 18.094449996948242, + "learning_rate": 9.328180611586377e-06, + "loss": 5.154, + "step": 32900 + }, + { + "epoch": 0.6694539388020834, + "grad_norm": 14.547151565551758, + "learning_rate": 9.327980489791064e-06, + "loss": 5.0205, + "step": 32905 + }, + { + "epoch": 0.6695556640625, + "grad_norm": 16.629920959472656, + "learning_rate": 9.32778034034133e-06, + "loss": 5.008, + "step": 32910 + }, + { + "epoch": 0.6696573893229166, + "grad_norm": 19.95133399963379, + "learning_rate": 9.327580163238452e-06, + "loss": 5.0343, + "step": 32915 + }, + { + "epoch": 0.6697591145833334, + "grad_norm": 14.931022644042969, + "learning_rate": 9.327379958483714e-06, + "loss": 5.1569, + "step": 32920 + }, + { + "epoch": 0.66986083984375, + "grad_norm": 15.036190032958984, + "learning_rate": 9.327179726078388e-06, + "loss": 4.8074, + "step": 32925 + }, + { + "epoch": 0.6699625651041666, + "grad_norm": 16.228702545166016, + "learning_rate": 9.326979466023758e-06, + "loss": 5.2467, + "step": 32930 + }, + { + "epoch": 0.6700642903645834, + "grad_norm": 19.224163055419922, + "learning_rate": 9.326779178321104e-06, + "loss": 5.1515, + "step": 32935 + }, + { + "epoch": 0.670166015625, + "grad_norm": 17.970012664794922, + "learning_rate": 9.326578862971703e-06, + "loss": 5.0054, + "step": 32940 + }, + { + "epoch": 0.6702677408854166, + "grad_norm": 16.166507720947266, + "learning_rate": 9.326378519976837e-06, + "loss": 5.3822, + "step": 32945 + }, + { + "epoch": 0.6703694661458334, + "grad_norm": 17.262998580932617, + "learning_rate": 9.326178149337787e-06, + "loss": 5.0145, + "step": 32950 + }, + { + "epoch": 0.67047119140625, + "grad_norm": 18.877992630004883, + "learning_rate": 9.325977751055831e-06, + "loss": 5.2023, + "step": 32955 + }, + { + "epoch": 0.6705729166666666, + "grad_norm": 17.928050994873047, + "learning_rate": 9.32577732513225e-06, + "loss": 5.2409, + "step": 32960 + }, + { + "epoch": 0.6706746419270834, + "grad_norm": 20.918621063232422, + "learning_rate": 9.325576871568326e-06, + "loss": 4.9493, + "step": 32965 + }, + { + "epoch": 0.6707763671875, + "grad_norm": 19.0964412689209, + "learning_rate": 9.325376390365339e-06, + "loss": 4.9789, + "step": 32970 + }, + { + "epoch": 0.6708780924479166, + "grad_norm": 23.949350357055664, + "learning_rate": 9.32517588152457e-06, + "loss": 5.0623, + "step": 32975 + }, + { + "epoch": 0.6709798177083334, + "grad_norm": 17.651294708251953, + "learning_rate": 9.324975345047299e-06, + "loss": 4.9731, + "step": 32980 + }, + { + "epoch": 0.67108154296875, + "grad_norm": 20.262752532958984, + "learning_rate": 9.324774780934807e-06, + "loss": 5.0551, + "step": 32985 + }, + { + "epoch": 0.6711832682291666, + "grad_norm": 17.185239791870117, + "learning_rate": 9.324574189188381e-06, + "loss": 4.9184, + "step": 32990 + }, + { + "epoch": 0.6712849934895834, + "grad_norm": 18.497802734375, + "learning_rate": 9.324373569809295e-06, + "loss": 4.906, + "step": 32995 + }, + { + "epoch": 0.67138671875, + "grad_norm": 17.891672134399414, + "learning_rate": 9.324172922798836e-06, + "loss": 4.9872, + "step": 33000 + }, + { + "epoch": 0.6714884440104166, + "grad_norm": 16.159170150756836, + "learning_rate": 9.323972248158286e-06, + "loss": 5.026, + "step": 33005 + }, + { + "epoch": 0.6715901692708334, + "grad_norm": 16.290008544921875, + "learning_rate": 9.323771545888924e-06, + "loss": 5.0195, + "step": 33010 + }, + { + "epoch": 0.67169189453125, + "grad_norm": 18.87175178527832, + "learning_rate": 9.323570815992035e-06, + "loss": 5.0767, + "step": 33015 + }, + { + "epoch": 0.6717936197916666, + "grad_norm": 17.97630500793457, + "learning_rate": 9.323370058468901e-06, + "loss": 5.3119, + "step": 33020 + }, + { + "epoch": 0.6718953450520834, + "grad_norm": 20.941675186157227, + "learning_rate": 9.323169273320808e-06, + "loss": 5.1119, + "step": 33025 + }, + { + "epoch": 0.6719970703125, + "grad_norm": 17.52509880065918, + "learning_rate": 9.322968460549031e-06, + "loss": 4.9073, + "step": 33030 + }, + { + "epoch": 0.6720987955729166, + "grad_norm": 16.176021575927734, + "learning_rate": 9.322767620154859e-06, + "loss": 5.2729, + "step": 33035 + }, + { + "epoch": 0.6722005208333334, + "grad_norm": 16.350191116333008, + "learning_rate": 9.322566752139575e-06, + "loss": 5.3658, + "step": 33040 + }, + { + "epoch": 0.67230224609375, + "grad_norm": 14.16597843170166, + "learning_rate": 9.322365856504461e-06, + "loss": 4.8043, + "step": 33045 + }, + { + "epoch": 0.6724039713541666, + "grad_norm": 12.622088432312012, + "learning_rate": 9.322164933250801e-06, + "loss": 4.9509, + "step": 33050 + }, + { + "epoch": 0.6725056966145834, + "grad_norm": 22.018644332885742, + "learning_rate": 9.32196398237988e-06, + "loss": 5.3054, + "step": 33055 + }, + { + "epoch": 0.672607421875, + "grad_norm": 14.180121421813965, + "learning_rate": 9.32176300389298e-06, + "loss": 5.0361, + "step": 33060 + }, + { + "epoch": 0.6727091471354166, + "grad_norm": 15.976241111755371, + "learning_rate": 9.321561997791389e-06, + "loss": 5.0235, + "step": 33065 + }, + { + "epoch": 0.6728108723958334, + "grad_norm": 16.050016403198242, + "learning_rate": 9.321360964076386e-06, + "loss": 4.9547, + "step": 33070 + }, + { + "epoch": 0.67291259765625, + "grad_norm": 20.45379638671875, + "learning_rate": 9.32115990274926e-06, + "loss": 5.3725, + "step": 33075 + }, + { + "epoch": 0.6730143229166666, + "grad_norm": 18.903860092163086, + "learning_rate": 9.320958813811291e-06, + "loss": 4.9932, + "step": 33080 + }, + { + "epoch": 0.6731160481770834, + "grad_norm": 18.277708053588867, + "learning_rate": 9.32075769726377e-06, + "loss": 5.25, + "step": 33085 + }, + { + "epoch": 0.6732177734375, + "grad_norm": 16.69390869140625, + "learning_rate": 9.320556553107978e-06, + "loss": 4.9939, + "step": 33090 + }, + { + "epoch": 0.6733194986979166, + "grad_norm": 22.984027862548828, + "learning_rate": 9.3203553813452e-06, + "loss": 5.0652, + "step": 33095 + }, + { + "epoch": 0.6734212239583334, + "grad_norm": 16.260766983032227, + "learning_rate": 9.320154181976725e-06, + "loss": 5.3708, + "step": 33100 + }, + { + "epoch": 0.67352294921875, + "grad_norm": 14.483811378479004, + "learning_rate": 9.319952955003833e-06, + "loss": 5.2171, + "step": 33105 + }, + { + "epoch": 0.6736246744791666, + "grad_norm": 14.699227333068848, + "learning_rate": 9.319751700427815e-06, + "loss": 4.9843, + "step": 33110 + }, + { + "epoch": 0.6737263997395834, + "grad_norm": 17.180585861206055, + "learning_rate": 9.319550418249954e-06, + "loss": 5.3535, + "step": 33115 + }, + { + "epoch": 0.673828125, + "grad_norm": 11.26775074005127, + "learning_rate": 9.319349108471539e-06, + "loss": 5.1798, + "step": 33120 + }, + { + "epoch": 0.6739298502604166, + "grad_norm": 18.949260711669922, + "learning_rate": 9.319147771093852e-06, + "loss": 4.9789, + "step": 33125 + }, + { + "epoch": 0.6740315755208334, + "grad_norm": 19.03073501586914, + "learning_rate": 9.318946406118184e-06, + "loss": 5.203, + "step": 33130 + }, + { + "epoch": 0.67413330078125, + "grad_norm": 13.590119361877441, + "learning_rate": 9.318745013545816e-06, + "loss": 5.0584, + "step": 33135 + }, + { + "epoch": 0.6742350260416666, + "grad_norm": 17.000080108642578, + "learning_rate": 9.318543593378042e-06, + "loss": 5.1807, + "step": 33140 + }, + { + "epoch": 0.6743367513020834, + "grad_norm": 15.84831714630127, + "learning_rate": 9.318342145616145e-06, + "loss": 5.0997, + "step": 33145 + }, + { + "epoch": 0.6744384765625, + "grad_norm": 22.414836883544922, + "learning_rate": 9.318140670261409e-06, + "loss": 5.0776, + "step": 33150 + }, + { + "epoch": 0.6745402018229166, + "grad_norm": 15.283077239990234, + "learning_rate": 9.317939167315129e-06, + "loss": 4.9904, + "step": 33155 + }, + { + "epoch": 0.6746419270833334, + "grad_norm": 20.312145233154297, + "learning_rate": 9.317737636778585e-06, + "loss": 4.9867, + "step": 33160 + }, + { + "epoch": 0.67474365234375, + "grad_norm": 20.927410125732422, + "learning_rate": 9.317536078653069e-06, + "loss": 5.338, + "step": 33165 + }, + { + "epoch": 0.6748453776041666, + "grad_norm": 22.083433151245117, + "learning_rate": 9.31733449293987e-06, + "loss": 5.4982, + "step": 33170 + }, + { + "epoch": 0.6749471028645834, + "grad_norm": 18.670791625976562, + "learning_rate": 9.317132879640272e-06, + "loss": 5.0378, + "step": 33175 + }, + { + "epoch": 0.675048828125, + "grad_norm": 22.712867736816406, + "learning_rate": 9.316931238755566e-06, + "loss": 4.9265, + "step": 33180 + }, + { + "epoch": 0.6751505533854166, + "grad_norm": 17.661218643188477, + "learning_rate": 9.316729570287042e-06, + "loss": 5.0562, + "step": 33185 + }, + { + "epoch": 0.6752522786458334, + "grad_norm": 15.56041431427002, + "learning_rate": 9.316527874235983e-06, + "loss": 5.1964, + "step": 33190 + }, + { + "epoch": 0.67535400390625, + "grad_norm": 14.436470031738281, + "learning_rate": 9.316326150603683e-06, + "loss": 5.0335, + "step": 33195 + }, + { + "epoch": 0.6754557291666666, + "grad_norm": 21.43709373474121, + "learning_rate": 9.316124399391428e-06, + "loss": 4.9241, + "step": 33200 + }, + { + "epoch": 0.6755574544270834, + "grad_norm": 17.34275245666504, + "learning_rate": 9.315922620600509e-06, + "loss": 5.3839, + "step": 33205 + }, + { + "epoch": 0.6756591796875, + "grad_norm": 17.089521408081055, + "learning_rate": 9.315720814232214e-06, + "loss": 4.986, + "step": 33210 + }, + { + "epoch": 0.6757609049479166, + "grad_norm": 20.919111251831055, + "learning_rate": 9.315518980287833e-06, + "loss": 5.1152, + "step": 33215 + }, + { + "epoch": 0.6758626302083334, + "grad_norm": 15.685925483703613, + "learning_rate": 9.315317118768656e-06, + "loss": 5.3973, + "step": 33220 + }, + { + "epoch": 0.67596435546875, + "grad_norm": 23.160724639892578, + "learning_rate": 9.315115229675973e-06, + "loss": 5.0974, + "step": 33225 + }, + { + "epoch": 0.6760660807291666, + "grad_norm": 18.52215003967285, + "learning_rate": 9.314913313011073e-06, + "loss": 5.2406, + "step": 33230 + }, + { + "epoch": 0.6761678059895834, + "grad_norm": 17.208351135253906, + "learning_rate": 9.314711368775247e-06, + "loss": 5.2303, + "step": 33235 + }, + { + "epoch": 0.67626953125, + "grad_norm": 18.303308486938477, + "learning_rate": 9.314509396969783e-06, + "loss": 5.2699, + "step": 33240 + }, + { + "epoch": 0.6763712565104166, + "grad_norm": 17.939769744873047, + "learning_rate": 9.314307397595977e-06, + "loss": 4.9354, + "step": 33245 + }, + { + "epoch": 0.6764729817708334, + "grad_norm": 19.970691680908203, + "learning_rate": 9.314105370655114e-06, + "loss": 5.1032, + "step": 33250 + }, + { + "epoch": 0.67657470703125, + "grad_norm": 15.630949020385742, + "learning_rate": 9.313903316148487e-06, + "loss": 4.9105, + "step": 33255 + }, + { + "epoch": 0.6766764322916666, + "grad_norm": 20.382314682006836, + "learning_rate": 9.313701234077389e-06, + "loss": 4.9859, + "step": 33260 + }, + { + "epoch": 0.6767781575520834, + "grad_norm": 17.939443588256836, + "learning_rate": 9.313499124443108e-06, + "loss": 4.8981, + "step": 33265 + }, + { + "epoch": 0.6768798828125, + "grad_norm": 16.95020294189453, + "learning_rate": 9.313296987246938e-06, + "loss": 4.9108, + "step": 33270 + }, + { + "epoch": 0.6769816080729166, + "grad_norm": 17.747215270996094, + "learning_rate": 9.313094822490167e-06, + "loss": 5.228, + "step": 33275 + }, + { + "epoch": 0.6770833333333334, + "grad_norm": 13.515079498291016, + "learning_rate": 9.312892630174091e-06, + "loss": 5.0665, + "step": 33280 + }, + { + "epoch": 0.67718505859375, + "grad_norm": 22.43291473388672, + "learning_rate": 9.3126904103e-06, + "loss": 5.0089, + "step": 33285 + }, + { + "epoch": 0.6772867838541666, + "grad_norm": 18.761747360229492, + "learning_rate": 9.312488162869187e-06, + "loss": 4.8932, + "step": 33290 + }, + { + "epoch": 0.6773885091145834, + "grad_norm": 15.158099174499512, + "learning_rate": 9.312285887882944e-06, + "loss": 5.0625, + "step": 33295 + }, + { + "epoch": 0.677490234375, + "grad_norm": 18.0142765045166, + "learning_rate": 9.31208358534256e-06, + "loss": 4.9226, + "step": 33300 + }, + { + "epoch": 0.6775919596354166, + "grad_norm": 18.198890686035156, + "learning_rate": 9.311881255249333e-06, + "loss": 5.2764, + "step": 33305 + }, + { + "epoch": 0.6776936848958334, + "grad_norm": 17.16851806640625, + "learning_rate": 9.311678897604556e-06, + "loss": 5.1188, + "step": 33310 + }, + { + "epoch": 0.67779541015625, + "grad_norm": 16.59526824951172, + "learning_rate": 9.311476512409513e-06, + "loss": 5.1133, + "step": 33315 + }, + { + "epoch": 0.6778971354166666, + "grad_norm": 16.76675796508789, + "learning_rate": 9.31127409966551e-06, + "loss": 5.2451, + "step": 33320 + }, + { + "epoch": 0.6779988606770834, + "grad_norm": 17.45156478881836, + "learning_rate": 9.311071659373832e-06, + "loss": 5.1181, + "step": 33325 + }, + { + "epoch": 0.6781005859375, + "grad_norm": 19.312591552734375, + "learning_rate": 9.310869191535774e-06, + "loss": 5.3444, + "step": 33330 + }, + { + "epoch": 0.6782023111979166, + "grad_norm": 15.809503555297852, + "learning_rate": 9.310666696152631e-06, + "loss": 5.0838, + "step": 33335 + }, + { + "epoch": 0.6783040364583334, + "grad_norm": 15.8366060256958, + "learning_rate": 9.310464173225695e-06, + "loss": 5.0362, + "step": 33340 + }, + { + "epoch": 0.67840576171875, + "grad_norm": 18.272560119628906, + "learning_rate": 9.310261622756263e-06, + "loss": 4.9602, + "step": 33345 + }, + { + "epoch": 0.6785074869791666, + "grad_norm": 14.380008697509766, + "learning_rate": 9.310059044745627e-06, + "loss": 4.8961, + "step": 33350 + }, + { + "epoch": 0.6786092122395834, + "grad_norm": 18.85123634338379, + "learning_rate": 9.30985643919508e-06, + "loss": 5.3907, + "step": 33355 + }, + { + "epoch": 0.6787109375, + "grad_norm": 14.456490516662598, + "learning_rate": 9.309653806105921e-06, + "loss": 5.0511, + "step": 33360 + }, + { + "epoch": 0.6788126627604166, + "grad_norm": 17.701139450073242, + "learning_rate": 9.30945114547944e-06, + "loss": 5.1191, + "step": 33365 + }, + { + "epoch": 0.6789143880208334, + "grad_norm": 19.153667449951172, + "learning_rate": 9.309248457316936e-06, + "loss": 5.0501, + "step": 33370 + }, + { + "epoch": 0.67901611328125, + "grad_norm": 16.15332794189453, + "learning_rate": 9.309045741619702e-06, + "loss": 5.0348, + "step": 33375 + }, + { + "epoch": 0.6791178385416666, + "grad_norm": 17.004606246948242, + "learning_rate": 9.308842998389034e-06, + "loss": 4.8901, + "step": 33380 + }, + { + "epoch": 0.6792195638020834, + "grad_norm": 19.078475952148438, + "learning_rate": 9.308640227626225e-06, + "loss": 5.2117, + "step": 33385 + }, + { + "epoch": 0.6793212890625, + "grad_norm": 21.700735092163086, + "learning_rate": 9.308437429332575e-06, + "loss": 4.8628, + "step": 33390 + }, + { + "epoch": 0.6794230143229166, + "grad_norm": 15.788023948669434, + "learning_rate": 9.308234603509375e-06, + "loss": 5.0287, + "step": 33395 + }, + { + "epoch": 0.6795247395833334, + "grad_norm": 18.28249168395996, + "learning_rate": 9.308031750157926e-06, + "loss": 4.8896, + "step": 33400 + }, + { + "epoch": 0.67962646484375, + "grad_norm": 15.753076553344727, + "learning_rate": 9.30782886927952e-06, + "loss": 5.0844, + "step": 33405 + }, + { + "epoch": 0.6797281901041666, + "grad_norm": 14.584300994873047, + "learning_rate": 9.307625960875457e-06, + "loss": 5.0068, + "step": 33410 + }, + { + "epoch": 0.6798299153645834, + "grad_norm": 17.011159896850586, + "learning_rate": 9.30742302494703e-06, + "loss": 4.8629, + "step": 33415 + }, + { + "epoch": 0.679931640625, + "grad_norm": 18.9749755859375, + "learning_rate": 9.307220061495538e-06, + "loss": 4.8937, + "step": 33420 + }, + { + "epoch": 0.6800333658854166, + "grad_norm": 22.66876220703125, + "learning_rate": 9.307017070522277e-06, + "loss": 5.1605, + "step": 33425 + }, + { + "epoch": 0.6801350911458334, + "grad_norm": 15.068416595458984, + "learning_rate": 9.306814052028543e-06, + "loss": 5.0241, + "step": 33430 + }, + { + "epoch": 0.68023681640625, + "grad_norm": 21.46753692626953, + "learning_rate": 9.306611006015635e-06, + "loss": 5.2181, + "step": 33435 + }, + { + "epoch": 0.6803385416666666, + "grad_norm": 14.80542278289795, + "learning_rate": 9.30640793248485e-06, + "loss": 5.1063, + "step": 33440 + }, + { + "epoch": 0.6804402669270834, + "grad_norm": 16.589706420898438, + "learning_rate": 9.306204831437485e-06, + "loss": 5.0318, + "step": 33445 + }, + { + "epoch": 0.6805419921875, + "grad_norm": 17.093420028686523, + "learning_rate": 9.306001702874838e-06, + "loss": 4.9928, + "step": 33450 + }, + { + "epoch": 0.6806437174479166, + "grad_norm": 16.8298282623291, + "learning_rate": 9.305798546798209e-06, + "loss": 5.0189, + "step": 33455 + }, + { + "epoch": 0.6807454427083334, + "grad_norm": 14.732221603393555, + "learning_rate": 9.305595363208892e-06, + "loss": 5.0141, + "step": 33460 + }, + { + "epoch": 0.68084716796875, + "grad_norm": 15.610523223876953, + "learning_rate": 9.305392152108188e-06, + "loss": 5.2842, + "step": 33465 + }, + { + "epoch": 0.6809488932291666, + "grad_norm": 16.49553680419922, + "learning_rate": 9.305188913497394e-06, + "loss": 5.206, + "step": 33470 + }, + { + "epoch": 0.6810506184895834, + "grad_norm": 16.630340576171875, + "learning_rate": 9.30498564737781e-06, + "loss": 5.0076, + "step": 33475 + }, + { + "epoch": 0.68115234375, + "grad_norm": 20.9981632232666, + "learning_rate": 9.304782353750736e-06, + "loss": 5.2091, + "step": 33480 + }, + { + "epoch": 0.6812540690104166, + "grad_norm": 23.13046646118164, + "learning_rate": 9.304579032617467e-06, + "loss": 4.8573, + "step": 33485 + }, + { + "epoch": 0.6813557942708334, + "grad_norm": 17.46484375, + "learning_rate": 9.304375683979305e-06, + "loss": 4.9228, + "step": 33490 + }, + { + "epoch": 0.68145751953125, + "grad_norm": 16.496732711791992, + "learning_rate": 9.30417230783755e-06, + "loss": 5.0522, + "step": 33495 + }, + { + "epoch": 0.6815592447916666, + "grad_norm": 18.389446258544922, + "learning_rate": 9.303968904193499e-06, + "loss": 5.1924, + "step": 33500 + }, + { + "epoch": 0.6816609700520834, + "grad_norm": 14.570801734924316, + "learning_rate": 9.303765473048455e-06, + "loss": 4.8386, + "step": 33505 + }, + { + "epoch": 0.6817626953125, + "grad_norm": 15.165898323059082, + "learning_rate": 9.303562014403712e-06, + "loss": 5.1314, + "step": 33510 + }, + { + "epoch": 0.6818644205729166, + "grad_norm": 14.782651901245117, + "learning_rate": 9.303358528260576e-06, + "loss": 5.3562, + "step": 33515 + }, + { + "epoch": 0.6819661458333334, + "grad_norm": 14.572834968566895, + "learning_rate": 9.303155014620346e-06, + "loss": 4.9324, + "step": 33520 + }, + { + "epoch": 0.68206787109375, + "grad_norm": 18.807064056396484, + "learning_rate": 9.302951473484321e-06, + "loss": 5.0214, + "step": 33525 + }, + { + "epoch": 0.6821695963541666, + "grad_norm": 14.901573181152344, + "learning_rate": 9.3027479048538e-06, + "loss": 5.0447, + "step": 33530 + }, + { + "epoch": 0.6822713216145834, + "grad_norm": 27.433238983154297, + "learning_rate": 9.302544308730089e-06, + "loss": 5.2538, + "step": 33535 + }, + { + "epoch": 0.682373046875, + "grad_norm": 23.39227294921875, + "learning_rate": 9.302340685114483e-06, + "loss": 5.0901, + "step": 33540 + }, + { + "epoch": 0.6824747721354166, + "grad_norm": 15.207314491271973, + "learning_rate": 9.302137034008288e-06, + "loss": 5.1242, + "step": 33545 + }, + { + "epoch": 0.6825764973958334, + "grad_norm": 17.073217391967773, + "learning_rate": 9.3019333554128e-06, + "loss": 4.933, + "step": 33550 + }, + { + "epoch": 0.68267822265625, + "grad_norm": 16.951494216918945, + "learning_rate": 9.301729649329325e-06, + "loss": 5.0911, + "step": 33555 + }, + { + "epoch": 0.6827799479166666, + "grad_norm": 14.947306632995605, + "learning_rate": 9.301525915759165e-06, + "loss": 4.9817, + "step": 33560 + }, + { + "epoch": 0.6828816731770834, + "grad_norm": 24.303985595703125, + "learning_rate": 9.301322154703617e-06, + "loss": 4.8892, + "step": 33565 + }, + { + "epoch": 0.6829833984375, + "grad_norm": 15.917943000793457, + "learning_rate": 9.301118366163989e-06, + "loss": 4.9618, + "step": 33570 + }, + { + "epoch": 0.6830851236979166, + "grad_norm": 15.74440860748291, + "learning_rate": 9.300914550141576e-06, + "loss": 4.9072, + "step": 33575 + }, + { + "epoch": 0.6831868489583334, + "grad_norm": 20.82463836669922, + "learning_rate": 9.300710706637688e-06, + "loss": 5.0403, + "step": 33580 + }, + { + "epoch": 0.68328857421875, + "grad_norm": 20.0827579498291, + "learning_rate": 9.300506835653622e-06, + "loss": 4.9114, + "step": 33585 + }, + { + "epoch": 0.6833902994791666, + "grad_norm": 15.917044639587402, + "learning_rate": 9.300302937190682e-06, + "loss": 4.9245, + "step": 33590 + }, + { + "epoch": 0.6834920247395834, + "grad_norm": 14.231794357299805, + "learning_rate": 9.300099011250173e-06, + "loss": 4.9012, + "step": 33595 + }, + { + "epoch": 0.68359375, + "grad_norm": 17.97599983215332, + "learning_rate": 9.299895057833394e-06, + "loss": 5.0288, + "step": 33600 + }, + { + "epoch": 0.6836954752604166, + "grad_norm": 18.3054256439209, + "learning_rate": 9.299691076941653e-06, + "loss": 5.449, + "step": 33605 + }, + { + "epoch": 0.6837972005208334, + "grad_norm": 17.28217315673828, + "learning_rate": 9.299487068576249e-06, + "loss": 5.3141, + "step": 33610 + }, + { + "epoch": 0.68389892578125, + "grad_norm": 11.357345581054688, + "learning_rate": 9.299283032738489e-06, + "loss": 5.187, + "step": 33615 + }, + { + "epoch": 0.6840006510416666, + "grad_norm": 13.530364990234375, + "learning_rate": 9.299078969429674e-06, + "loss": 4.9629, + "step": 33620 + }, + { + "epoch": 0.6841023763020834, + "grad_norm": 17.080293655395508, + "learning_rate": 9.29887487865111e-06, + "loss": 4.9361, + "step": 33625 + }, + { + "epoch": 0.6842041015625, + "grad_norm": 19.66884422302246, + "learning_rate": 9.2986707604041e-06, + "loss": 5.2232, + "step": 33630 + }, + { + "epoch": 0.6843058268229166, + "grad_norm": 14.543728828430176, + "learning_rate": 9.298466614689948e-06, + "loss": 5.121, + "step": 33635 + }, + { + "epoch": 0.6844075520833334, + "grad_norm": 18.73929786682129, + "learning_rate": 9.29826244150996e-06, + "loss": 4.8757, + "step": 33640 + }, + { + "epoch": 0.68450927734375, + "grad_norm": 19.686420440673828, + "learning_rate": 9.298058240865438e-06, + "loss": 5.142, + "step": 33645 + }, + { + "epoch": 0.6846110026041666, + "grad_norm": 15.694758415222168, + "learning_rate": 9.297854012757689e-06, + "loss": 5.4059, + "step": 33650 + }, + { + "epoch": 0.6847127278645834, + "grad_norm": 17.96283531188965, + "learning_rate": 9.297649757188016e-06, + "loss": 5.2712, + "step": 33655 + }, + { + "epoch": 0.684814453125, + "grad_norm": 16.019676208496094, + "learning_rate": 9.297445474157727e-06, + "loss": 4.942, + "step": 33660 + }, + { + "epoch": 0.6849161783854166, + "grad_norm": 15.829846382141113, + "learning_rate": 9.297241163668124e-06, + "loss": 4.9182, + "step": 33665 + }, + { + "epoch": 0.6850179036458334, + "grad_norm": 14.552128791809082, + "learning_rate": 9.297036825720515e-06, + "loss": 5.0188, + "step": 33670 + }, + { + "epoch": 0.68511962890625, + "grad_norm": 21.006088256835938, + "learning_rate": 9.296832460316206e-06, + "loss": 4.9479, + "step": 33675 + }, + { + "epoch": 0.6852213541666666, + "grad_norm": 15.513442039489746, + "learning_rate": 9.2966280674565e-06, + "loss": 4.9856, + "step": 33680 + }, + { + "epoch": 0.6853230794270834, + "grad_norm": 21.0734920501709, + "learning_rate": 9.296423647142705e-06, + "loss": 4.995, + "step": 33685 + }, + { + "epoch": 0.6854248046875, + "grad_norm": 17.63383674621582, + "learning_rate": 9.296219199376126e-06, + "loss": 5.0122, + "step": 33690 + }, + { + "epoch": 0.6855265299479166, + "grad_norm": 14.852622985839844, + "learning_rate": 9.29601472415807e-06, + "loss": 4.9155, + "step": 33695 + }, + { + "epoch": 0.6856282552083334, + "grad_norm": 18.93818473815918, + "learning_rate": 9.295810221489846e-06, + "loss": 5.0813, + "step": 33700 + }, + { + "epoch": 0.68572998046875, + "grad_norm": 20.565990447998047, + "learning_rate": 9.295605691372757e-06, + "loss": 5.0438, + "step": 33705 + }, + { + "epoch": 0.6858317057291666, + "grad_norm": 14.434097290039062, + "learning_rate": 9.29540113380811e-06, + "loss": 5.3464, + "step": 33710 + }, + { + "epoch": 0.6859334309895834, + "grad_norm": 20.316362380981445, + "learning_rate": 9.295196548797215e-06, + "loss": 5.1162, + "step": 33715 + }, + { + "epoch": 0.68603515625, + "grad_norm": 12.635876655578613, + "learning_rate": 9.294991936341376e-06, + "loss": 4.9966, + "step": 33720 + }, + { + "epoch": 0.6861368815104166, + "grad_norm": 14.149226188659668, + "learning_rate": 9.294787296441903e-06, + "loss": 5.2258, + "step": 33725 + }, + { + "epoch": 0.6862386067708334, + "grad_norm": 17.377092361450195, + "learning_rate": 9.294582629100103e-06, + "loss": 5.5064, + "step": 33730 + }, + { + "epoch": 0.68634033203125, + "grad_norm": 18.62069320678711, + "learning_rate": 9.294377934317283e-06, + "loss": 5.1289, + "step": 33735 + }, + { + "epoch": 0.6864420572916666, + "grad_norm": 40.8613166809082, + "learning_rate": 9.29417321209475e-06, + "loss": 5.4362, + "step": 33740 + }, + { + "epoch": 0.6865437825520834, + "grad_norm": 27.64571762084961, + "learning_rate": 9.293968462433814e-06, + "loss": 5.0267, + "step": 33745 + }, + { + "epoch": 0.6866455078125, + "grad_norm": 13.96644401550293, + "learning_rate": 9.293763685335783e-06, + "loss": 5.2069, + "step": 33750 + }, + { + "epoch": 0.6867472330729166, + "grad_norm": 17.288864135742188, + "learning_rate": 9.293558880801964e-06, + "loss": 5.1108, + "step": 33755 + }, + { + "epoch": 0.6868489583333334, + "grad_norm": 19.925674438476562, + "learning_rate": 9.29335404883367e-06, + "loss": 4.8959, + "step": 33760 + }, + { + "epoch": 0.68695068359375, + "grad_norm": 20.637924194335938, + "learning_rate": 9.293149189432202e-06, + "loss": 4.8573, + "step": 33765 + }, + { + "epoch": 0.6870524088541666, + "grad_norm": 21.094959259033203, + "learning_rate": 9.292944302598876e-06, + "loss": 5.0779, + "step": 33770 + }, + { + "epoch": 0.6871541341145834, + "grad_norm": 18.66257095336914, + "learning_rate": 9.292739388334997e-06, + "loss": 5.0343, + "step": 33775 + }, + { + "epoch": 0.687255859375, + "grad_norm": 18.42866325378418, + "learning_rate": 9.292534446641878e-06, + "loss": 4.9582, + "step": 33780 + }, + { + "epoch": 0.6873575846354166, + "grad_norm": 15.608558654785156, + "learning_rate": 9.292329477520826e-06, + "loss": 5.3085, + "step": 33785 + }, + { + "epoch": 0.6874593098958334, + "grad_norm": 26.130325317382812, + "learning_rate": 9.292124480973152e-06, + "loss": 5.2133, + "step": 33790 + }, + { + "epoch": 0.68756103515625, + "grad_norm": 13.82011890411377, + "learning_rate": 9.291919457000164e-06, + "loss": 5.1189, + "step": 33795 + }, + { + "epoch": 0.6876627604166666, + "grad_norm": 20.410661697387695, + "learning_rate": 9.291714405603173e-06, + "loss": 5.3812, + "step": 33800 + }, + { + "epoch": 0.6877644856770834, + "grad_norm": 23.316801071166992, + "learning_rate": 9.29150932678349e-06, + "loss": 4.9049, + "step": 33805 + }, + { + "epoch": 0.6878662109375, + "grad_norm": 14.82959270477295, + "learning_rate": 9.291304220542424e-06, + "loss": 4.9623, + "step": 33810 + }, + { + "epoch": 0.6879679361979166, + "grad_norm": 18.9868221282959, + "learning_rate": 9.291099086881286e-06, + "loss": 5.0769, + "step": 33815 + }, + { + "epoch": 0.6880696614583334, + "grad_norm": 17.11673927307129, + "learning_rate": 9.290893925801387e-06, + "loss": 5.2481, + "step": 33820 + }, + { + "epoch": 0.68817138671875, + "grad_norm": 15.872008323669434, + "learning_rate": 9.290688737304038e-06, + "loss": 5.0855, + "step": 33825 + }, + { + "epoch": 0.6882731119791666, + "grad_norm": 16.253402709960938, + "learning_rate": 9.290483521390552e-06, + "loss": 5.3156, + "step": 33830 + }, + { + "epoch": 0.6883748372395834, + "grad_norm": 12.302165031433105, + "learning_rate": 9.290278278062236e-06, + "loss": 5.0877, + "step": 33835 + }, + { + "epoch": 0.6884765625, + "grad_norm": 17.100452423095703, + "learning_rate": 9.290073007320403e-06, + "loss": 4.984, + "step": 33840 + }, + { + "epoch": 0.6885782877604166, + "grad_norm": 17.751953125, + "learning_rate": 9.289867709166369e-06, + "loss": 5.0324, + "step": 33845 + }, + { + "epoch": 0.6886800130208334, + "grad_norm": 17.324481964111328, + "learning_rate": 9.289662383601438e-06, + "loss": 5.2809, + "step": 33850 + }, + { + "epoch": 0.68878173828125, + "grad_norm": 15.533482551574707, + "learning_rate": 9.28945703062693e-06, + "loss": 5.2082, + "step": 33855 + }, + { + "epoch": 0.6888834635416666, + "grad_norm": 14.592497825622559, + "learning_rate": 9.28925165024415e-06, + "loss": 5.2011, + "step": 33860 + }, + { + "epoch": 0.6889851888020834, + "grad_norm": 16.24352264404297, + "learning_rate": 9.289046242454416e-06, + "loss": 5.1572, + "step": 33865 + }, + { + "epoch": 0.6890869140625, + "grad_norm": 12.736359596252441, + "learning_rate": 9.288840807259036e-06, + "loss": 4.9405, + "step": 33870 + }, + { + "epoch": 0.6891886393229166, + "grad_norm": 20.192636489868164, + "learning_rate": 9.288635344659324e-06, + "loss": 5.0196, + "step": 33875 + }, + { + "epoch": 0.6892903645833334, + "grad_norm": 22.498071670532227, + "learning_rate": 9.288429854656596e-06, + "loss": 5.2553, + "step": 33880 + }, + { + "epoch": 0.68939208984375, + "grad_norm": 17.79788589477539, + "learning_rate": 9.28822433725216e-06, + "loss": 5.2597, + "step": 33885 + }, + { + "epoch": 0.6894938151041666, + "grad_norm": 17.631452560424805, + "learning_rate": 9.288018792447334e-06, + "loss": 5.084, + "step": 33890 + }, + { + "epoch": 0.6895955403645834, + "grad_norm": 13.762267112731934, + "learning_rate": 9.287813220243428e-06, + "loss": 5.004, + "step": 33895 + }, + { + "epoch": 0.689697265625, + "grad_norm": 14.861443519592285, + "learning_rate": 9.287607620641759e-06, + "loss": 4.99, + "step": 33900 + }, + { + "epoch": 0.6897989908854166, + "grad_norm": 16.70562744140625, + "learning_rate": 9.287401993643637e-06, + "loss": 4.9327, + "step": 33905 + }, + { + "epoch": 0.6899007161458334, + "grad_norm": 17.436748504638672, + "learning_rate": 9.287196339250375e-06, + "loss": 5.4531, + "step": 33910 + }, + { + "epoch": 0.69000244140625, + "grad_norm": 19.761547088623047, + "learning_rate": 9.286990657463291e-06, + "loss": 5.1234, + "step": 33915 + }, + { + "epoch": 0.6901041666666666, + "grad_norm": 18.338115692138672, + "learning_rate": 9.286784948283698e-06, + "loss": 5.0669, + "step": 33920 + }, + { + "epoch": 0.6902058919270834, + "grad_norm": 15.023652076721191, + "learning_rate": 9.28657921171291e-06, + "loss": 5.0707, + "step": 33925 + }, + { + "epoch": 0.6903076171875, + "grad_norm": 15.146169662475586, + "learning_rate": 9.286373447752242e-06, + "loss": 4.8572, + "step": 33930 + }, + { + "epoch": 0.6904093424479166, + "grad_norm": 21.08765983581543, + "learning_rate": 9.286167656403006e-06, + "loss": 5.1396, + "step": 33935 + }, + { + "epoch": 0.6905110677083334, + "grad_norm": 23.44820785522461, + "learning_rate": 9.285961837666522e-06, + "loss": 5.0429, + "step": 33940 + }, + { + "epoch": 0.69061279296875, + "grad_norm": 16.561555862426758, + "learning_rate": 9.2857559915441e-06, + "loss": 5.1966, + "step": 33945 + }, + { + "epoch": 0.6907145182291666, + "grad_norm": 19.684202194213867, + "learning_rate": 9.28555011803706e-06, + "loss": 5.1294, + "step": 33950 + }, + { + "epoch": 0.6908162434895834, + "grad_norm": 18.285364151000977, + "learning_rate": 9.285344217146714e-06, + "loss": 5.1737, + "step": 33955 + }, + { + "epoch": 0.69091796875, + "grad_norm": 19.9045352935791, + "learning_rate": 9.285138288874379e-06, + "loss": 5.139, + "step": 33960 + }, + { + "epoch": 0.6910196940104166, + "grad_norm": 14.116903305053711, + "learning_rate": 9.28493233322137e-06, + "loss": 5.0284, + "step": 33965 + }, + { + "epoch": 0.6911214192708334, + "grad_norm": 18.063695907592773, + "learning_rate": 9.284726350189005e-06, + "loss": 5.4018, + "step": 33970 + }, + { + "epoch": 0.69122314453125, + "grad_norm": 17.29608917236328, + "learning_rate": 9.284520339778597e-06, + "loss": 5.0181, + "step": 33975 + }, + { + "epoch": 0.6913248697916666, + "grad_norm": 19.579833984375, + "learning_rate": 9.284314301991466e-06, + "loss": 4.8016, + "step": 33980 + }, + { + "epoch": 0.6914265950520834, + "grad_norm": 14.003020286560059, + "learning_rate": 9.284108236828926e-06, + "loss": 5.1816, + "step": 33985 + }, + { + "epoch": 0.6915283203125, + "grad_norm": 14.420495986938477, + "learning_rate": 9.283902144292294e-06, + "loss": 4.9152, + "step": 33990 + }, + { + "epoch": 0.6916300455729166, + "grad_norm": 22.106931686401367, + "learning_rate": 9.283696024382887e-06, + "loss": 4.969, + "step": 33995 + }, + { + "epoch": 0.6917317708333334, + "grad_norm": 15.689826965332031, + "learning_rate": 9.283489877102022e-06, + "loss": 4.9799, + "step": 34000 + }, + { + "epoch": 0.69183349609375, + "grad_norm": 15.204407691955566, + "learning_rate": 9.283283702451016e-06, + "loss": 5.2004, + "step": 34005 + }, + { + "epoch": 0.6919352213541666, + "grad_norm": 22.432201385498047, + "learning_rate": 9.283077500431188e-06, + "loss": 5.2978, + "step": 34010 + }, + { + "epoch": 0.6920369466145834, + "grad_norm": 16.133438110351562, + "learning_rate": 9.282871271043854e-06, + "loss": 5.1232, + "step": 34015 + }, + { + "epoch": 0.692138671875, + "grad_norm": 12.707088470458984, + "learning_rate": 9.282665014290333e-06, + "loss": 5.1216, + "step": 34020 + }, + { + "epoch": 0.6922403971354166, + "grad_norm": 19.436847686767578, + "learning_rate": 9.282458730171941e-06, + "loss": 5.1551, + "step": 34025 + }, + { + "epoch": 0.6923421223958334, + "grad_norm": 21.557056427001953, + "learning_rate": 9.282252418689998e-06, + "loss": 4.914, + "step": 34030 + }, + { + "epoch": 0.69244384765625, + "grad_norm": 15.64487361907959, + "learning_rate": 9.282046079845821e-06, + "loss": 4.8255, + "step": 34035 + }, + { + "epoch": 0.6925455729166666, + "grad_norm": 20.248401641845703, + "learning_rate": 9.281839713640728e-06, + "loss": 5.0773, + "step": 34040 + }, + { + "epoch": 0.6926472981770834, + "grad_norm": 17.289226531982422, + "learning_rate": 9.28163332007604e-06, + "loss": 5.1382, + "step": 34045 + }, + { + "epoch": 0.6927490234375, + "grad_norm": 15.05897331237793, + "learning_rate": 9.281426899153073e-06, + "loss": 5.0599, + "step": 34050 + }, + { + "epoch": 0.6928507486979166, + "grad_norm": 17.012929916381836, + "learning_rate": 9.281220450873147e-06, + "loss": 5.0637, + "step": 34055 + }, + { + "epoch": 0.6929524739583334, + "grad_norm": 19.472082138061523, + "learning_rate": 9.281013975237583e-06, + "loss": 5.2721, + "step": 34060 + }, + { + "epoch": 0.69305419921875, + "grad_norm": 16.66843605041504, + "learning_rate": 9.280807472247697e-06, + "loss": 5.1061, + "step": 34065 + }, + { + "epoch": 0.6931559244791666, + "grad_norm": 16.168127059936523, + "learning_rate": 9.280600941904812e-06, + "loss": 5.1861, + "step": 34070 + }, + { + "epoch": 0.6932576497395834, + "grad_norm": 26.872638702392578, + "learning_rate": 9.280394384210246e-06, + "loss": 5.2535, + "step": 34075 + }, + { + "epoch": 0.693359375, + "grad_norm": 19.758237838745117, + "learning_rate": 9.280187799165315e-06, + "loss": 5.4778, + "step": 34080 + }, + { + "epoch": 0.6934611002604166, + "grad_norm": 15.44931411743164, + "learning_rate": 9.279981186771345e-06, + "loss": 5.0043, + "step": 34085 + }, + { + "epoch": 0.6935628255208334, + "grad_norm": 19.09596061706543, + "learning_rate": 9.279774547029654e-06, + "loss": 4.8967, + "step": 34090 + }, + { + "epoch": 0.69366455078125, + "grad_norm": 21.04253387451172, + "learning_rate": 9.279567879941563e-06, + "loss": 5.1406, + "step": 34095 + }, + { + "epoch": 0.6937662760416666, + "grad_norm": 18.21595001220703, + "learning_rate": 9.27936118550839e-06, + "loss": 5.1673, + "step": 34100 + }, + { + "epoch": 0.6938680013020834, + "grad_norm": 15.825023651123047, + "learning_rate": 9.279154463731458e-06, + "loss": 4.9585, + "step": 34105 + }, + { + "epoch": 0.6939697265625, + "grad_norm": 15.693605422973633, + "learning_rate": 9.27894771461209e-06, + "loss": 4.981, + "step": 34110 + }, + { + "epoch": 0.6940714518229166, + "grad_norm": 15.882243156433105, + "learning_rate": 9.278740938151601e-06, + "loss": 5.1658, + "step": 34115 + }, + { + "epoch": 0.6941731770833334, + "grad_norm": 19.204004287719727, + "learning_rate": 9.278534134351317e-06, + "loss": 4.9636, + "step": 34120 + }, + { + "epoch": 0.69427490234375, + "grad_norm": 19.250436782836914, + "learning_rate": 9.278327303212558e-06, + "loss": 4.9453, + "step": 34125 + }, + { + "epoch": 0.6943766276041666, + "grad_norm": 20.49230194091797, + "learning_rate": 9.278120444736644e-06, + "loss": 5.2657, + "step": 34130 + }, + { + "epoch": 0.6944783528645834, + "grad_norm": 16.678733825683594, + "learning_rate": 9.2779135589249e-06, + "loss": 5.1688, + "step": 34135 + }, + { + "epoch": 0.694580078125, + "grad_norm": 21.47434425354004, + "learning_rate": 9.277706645778648e-06, + "loss": 5.0793, + "step": 34140 + }, + { + "epoch": 0.6946818033854166, + "grad_norm": 18.163904190063477, + "learning_rate": 9.277499705299207e-06, + "loss": 5.1846, + "step": 34145 + }, + { + "epoch": 0.6947835286458334, + "grad_norm": 14.656689643859863, + "learning_rate": 9.277292737487899e-06, + "loss": 5.1185, + "step": 34150 + }, + { + "epoch": 0.69488525390625, + "grad_norm": 14.413768768310547, + "learning_rate": 9.27708574234605e-06, + "loss": 5.356, + "step": 34155 + }, + { + "epoch": 0.6949869791666666, + "grad_norm": 24.21296501159668, + "learning_rate": 9.276878719874981e-06, + "loss": 4.8492, + "step": 34160 + }, + { + "epoch": 0.6950887044270834, + "grad_norm": 13.268529891967773, + "learning_rate": 9.276671670076014e-06, + "loss": 5.1677, + "step": 34165 + }, + { + "epoch": 0.6951904296875, + "grad_norm": 18.8922061920166, + "learning_rate": 9.276464592950473e-06, + "loss": 4.9726, + "step": 34170 + }, + { + "epoch": 0.6952921549479166, + "grad_norm": 15.446558952331543, + "learning_rate": 9.27625748849968e-06, + "loss": 5.283, + "step": 34175 + }, + { + "epoch": 0.6953938802083334, + "grad_norm": 15.07085132598877, + "learning_rate": 9.27605035672496e-06, + "loss": 5.0881, + "step": 34180 + }, + { + "epoch": 0.69549560546875, + "grad_norm": 15.990857124328613, + "learning_rate": 9.275843197627636e-06, + "loss": 4.8585, + "step": 34185 + }, + { + "epoch": 0.6955973307291666, + "grad_norm": 17.349117279052734, + "learning_rate": 9.27563601120903e-06, + "loss": 5.3056, + "step": 34190 + }, + { + "epoch": 0.6956990559895834, + "grad_norm": 15.437599182128906, + "learning_rate": 9.275428797470469e-06, + "loss": 5.1749, + "step": 34195 + }, + { + "epoch": 0.69580078125, + "grad_norm": 16.901779174804688, + "learning_rate": 9.275221556413275e-06, + "loss": 5.2174, + "step": 34200 + }, + { + "epoch": 0.6959025065104166, + "grad_norm": 17.877426147460938, + "learning_rate": 9.27501428803877e-06, + "loss": 5.0744, + "step": 34205 + }, + { + "epoch": 0.6960042317708334, + "grad_norm": 15.84272575378418, + "learning_rate": 9.274806992348283e-06, + "loss": 5.1395, + "step": 34210 + }, + { + "epoch": 0.69610595703125, + "grad_norm": 19.190998077392578, + "learning_rate": 9.274599669343134e-06, + "loss": 5.3016, + "step": 34215 + }, + { + "epoch": 0.6962076822916666, + "grad_norm": 18.674842834472656, + "learning_rate": 9.274392319024651e-06, + "loss": 5.1484, + "step": 34220 + }, + { + "epoch": 0.6963094075520834, + "grad_norm": 17.398508071899414, + "learning_rate": 9.27418494139416e-06, + "loss": 5.1411, + "step": 34225 + }, + { + "epoch": 0.6964111328125, + "grad_norm": 18.809856414794922, + "learning_rate": 9.273977536452983e-06, + "loss": 5.2976, + "step": 34230 + }, + { + "epoch": 0.6965128580729166, + "grad_norm": 15.392123222351074, + "learning_rate": 9.273770104202446e-06, + "loss": 5.0751, + "step": 34235 + }, + { + "epoch": 0.6966145833333334, + "grad_norm": 15.337379455566406, + "learning_rate": 9.273562644643874e-06, + "loss": 5.0353, + "step": 34240 + }, + { + "epoch": 0.69671630859375, + "grad_norm": 17.22576904296875, + "learning_rate": 9.273355157778594e-06, + "loss": 5.142, + "step": 34245 + }, + { + "epoch": 0.6968180338541666, + "grad_norm": 29.768388748168945, + "learning_rate": 9.27314764360793e-06, + "loss": 5.2362, + "step": 34250 + }, + { + "epoch": 0.6969197591145834, + "grad_norm": 18.869991302490234, + "learning_rate": 9.27294010213321e-06, + "loss": 5.0427, + "step": 34255 + }, + { + "epoch": 0.697021484375, + "grad_norm": 28.391263961791992, + "learning_rate": 9.27273253335576e-06, + "loss": 5.3036, + "step": 34260 + }, + { + "epoch": 0.6971232096354166, + "grad_norm": 27.52553367614746, + "learning_rate": 9.272524937276905e-06, + "loss": 5.3068, + "step": 34265 + }, + { + "epoch": 0.6972249348958334, + "grad_norm": 16.336214065551758, + "learning_rate": 9.27231731389797e-06, + "loss": 4.8192, + "step": 34270 + }, + { + "epoch": 0.69732666015625, + "grad_norm": 22.056804656982422, + "learning_rate": 9.272109663220286e-06, + "loss": 5.2109, + "step": 34275 + }, + { + "epoch": 0.6974283854166666, + "grad_norm": 21.008859634399414, + "learning_rate": 9.271901985245176e-06, + "loss": 5.0854, + "step": 34280 + }, + { + "epoch": 0.6975301106770834, + "grad_norm": 15.644440650939941, + "learning_rate": 9.271694279973969e-06, + "loss": 5.0951, + "step": 34285 + }, + { + "epoch": 0.6976318359375, + "grad_norm": 18.389917373657227, + "learning_rate": 9.271486547407993e-06, + "loss": 5.1356, + "step": 34290 + }, + { + "epoch": 0.6977335611979166, + "grad_norm": 12.822190284729004, + "learning_rate": 9.271278787548571e-06, + "loss": 5.0108, + "step": 34295 + }, + { + "epoch": 0.6978352864583334, + "grad_norm": 15.321327209472656, + "learning_rate": 9.271071000397036e-06, + "loss": 4.9671, + "step": 34300 + }, + { + "epoch": 0.69793701171875, + "grad_norm": 18.240324020385742, + "learning_rate": 9.27086318595471e-06, + "loss": 5.11, + "step": 34305 + }, + { + "epoch": 0.6980387369791666, + "grad_norm": 15.389630317687988, + "learning_rate": 9.270655344222927e-06, + "loss": 4.9402, + "step": 34310 + }, + { + "epoch": 0.6981404622395834, + "grad_norm": 20.343719482421875, + "learning_rate": 9.27044747520301e-06, + "loss": 4.9095, + "step": 34315 + }, + { + "epoch": 0.6982421875, + "grad_norm": 16.753808975219727, + "learning_rate": 9.270239578896292e-06, + "loss": 5.2025, + "step": 34320 + }, + { + "epoch": 0.6983439127604166, + "grad_norm": 19.259502410888672, + "learning_rate": 9.270031655304096e-06, + "loss": 5.2058, + "step": 34325 + }, + { + "epoch": 0.6984456380208334, + "grad_norm": 13.180285453796387, + "learning_rate": 9.269823704427754e-06, + "loss": 4.9159, + "step": 34330 + }, + { + "epoch": 0.69854736328125, + "grad_norm": 18.492753982543945, + "learning_rate": 9.269615726268595e-06, + "loss": 5.2913, + "step": 34335 + }, + { + "epoch": 0.6986490885416666, + "grad_norm": 15.496081352233887, + "learning_rate": 9.269407720827946e-06, + "loss": 5.2892, + "step": 34340 + }, + { + "epoch": 0.6987508138020834, + "grad_norm": 20.69569969177246, + "learning_rate": 9.269199688107135e-06, + "loss": 5.1431, + "step": 34345 + }, + { + "epoch": 0.6988525390625, + "grad_norm": 22.81890869140625, + "learning_rate": 9.268991628107495e-06, + "loss": 4.9596, + "step": 34350 + }, + { + "epoch": 0.6989542643229166, + "grad_norm": 23.33427619934082, + "learning_rate": 9.268783540830356e-06, + "loss": 5.3641, + "step": 34355 + }, + { + "epoch": 0.6990559895833334, + "grad_norm": 23.16623878479004, + "learning_rate": 9.268575426277042e-06, + "loss": 5.2793, + "step": 34360 + }, + { + "epoch": 0.69915771484375, + "grad_norm": 17.406633377075195, + "learning_rate": 9.268367284448887e-06, + "loss": 5.005, + "step": 34365 + }, + { + "epoch": 0.6992594401041666, + "grad_norm": 17.260910034179688, + "learning_rate": 9.268159115347219e-06, + "loss": 5.0962, + "step": 34370 + }, + { + "epoch": 0.6993611653645834, + "grad_norm": 16.497541427612305, + "learning_rate": 9.267950918973371e-06, + "loss": 5.0833, + "step": 34375 + }, + { + "epoch": 0.699462890625, + "grad_norm": 19.625017166137695, + "learning_rate": 9.267742695328671e-06, + "loss": 4.9915, + "step": 34380 + }, + { + "epoch": 0.6995646158854166, + "grad_norm": 17.36544418334961, + "learning_rate": 9.26753444441445e-06, + "loss": 5.2545, + "step": 34385 + }, + { + "epoch": 0.6996663411458334, + "grad_norm": 18.141590118408203, + "learning_rate": 9.267326166232038e-06, + "loss": 4.9425, + "step": 34390 + }, + { + "epoch": 0.69976806640625, + "grad_norm": 12.982285499572754, + "learning_rate": 9.267117860782767e-06, + "loss": 5.0197, + "step": 34395 + }, + { + "epoch": 0.6998697916666666, + "grad_norm": 17.07708740234375, + "learning_rate": 9.266909528067967e-06, + "loss": 4.7809, + "step": 34400 + }, + { + "epoch": 0.6999715169270834, + "grad_norm": 17.42034339904785, + "learning_rate": 9.26670116808897e-06, + "loss": 5.0022, + "step": 34405 + }, + { + "epoch": 0.7000732421875, + "grad_norm": 17.633867263793945, + "learning_rate": 9.266492780847106e-06, + "loss": 5.0722, + "step": 34410 + }, + { + "epoch": 0.7001749674479166, + "grad_norm": 15.314096450805664, + "learning_rate": 9.266284366343707e-06, + "loss": 5.0733, + "step": 34415 + }, + { + "epoch": 0.7002766927083334, + "grad_norm": 16.823017120361328, + "learning_rate": 9.266075924580107e-06, + "loss": 4.9674, + "step": 34420 + }, + { + "epoch": 0.70037841796875, + "grad_norm": 19.425901412963867, + "learning_rate": 9.265867455557636e-06, + "loss": 4.9559, + "step": 34425 + }, + { + "epoch": 0.7004801432291666, + "grad_norm": 22.341726303100586, + "learning_rate": 9.265658959277625e-06, + "loss": 5.0334, + "step": 34430 + }, + { + "epoch": 0.7005818684895834, + "grad_norm": 18.712148666381836, + "learning_rate": 9.265450435741408e-06, + "loss": 4.9249, + "step": 34435 + }, + { + "epoch": 0.70068359375, + "grad_norm": 16.509441375732422, + "learning_rate": 9.265241884950317e-06, + "loss": 4.8902, + "step": 34440 + }, + { + "epoch": 0.7007853190104166, + "grad_norm": 17.2520694732666, + "learning_rate": 9.265033306905684e-06, + "loss": 5.1593, + "step": 34445 + }, + { + "epoch": 0.7008870442708334, + "grad_norm": 19.924028396606445, + "learning_rate": 9.264824701608843e-06, + "loss": 5.1089, + "step": 34450 + }, + { + "epoch": 0.70098876953125, + "grad_norm": 19.30536651611328, + "learning_rate": 9.264616069061124e-06, + "loss": 5.173, + "step": 34455 + }, + { + "epoch": 0.7010904947916666, + "grad_norm": 19.964202880859375, + "learning_rate": 9.264407409263863e-06, + "loss": 5.1269, + "step": 34460 + }, + { + "epoch": 0.7011922200520834, + "grad_norm": 14.483515739440918, + "learning_rate": 9.264198722218393e-06, + "loss": 4.9677, + "step": 34465 + }, + { + "epoch": 0.7012939453125, + "grad_norm": 17.50124740600586, + "learning_rate": 9.263990007926047e-06, + "loss": 4.932, + "step": 34470 + }, + { + "epoch": 0.7013956705729166, + "grad_norm": 17.250484466552734, + "learning_rate": 9.263781266388156e-06, + "loss": 4.8064, + "step": 34475 + }, + { + "epoch": 0.7014973958333334, + "grad_norm": 21.761850357055664, + "learning_rate": 9.26357249760606e-06, + "loss": 5.1957, + "step": 34480 + }, + { + "epoch": 0.70159912109375, + "grad_norm": 16.463348388671875, + "learning_rate": 9.263363701581085e-06, + "loss": 4.8948, + "step": 34485 + }, + { + "epoch": 0.7017008463541666, + "grad_norm": 16.16075897216797, + "learning_rate": 9.263154878314572e-06, + "loss": 5.234, + "step": 34490 + }, + { + "epoch": 0.7018025716145834, + "grad_norm": 20.116670608520508, + "learning_rate": 9.262946027807853e-06, + "loss": 5.0769, + "step": 34495 + }, + { + "epoch": 0.701904296875, + "grad_norm": 20.57040786743164, + "learning_rate": 9.262737150062261e-06, + "loss": 4.8518, + "step": 34500 + }, + { + "epoch": 0.7020060221354166, + "grad_norm": 16.05486488342285, + "learning_rate": 9.262528245079132e-06, + "loss": 4.9337, + "step": 34505 + }, + { + "epoch": 0.7021077473958334, + "grad_norm": 19.16739273071289, + "learning_rate": 9.2623193128598e-06, + "loss": 5.0069, + "step": 34510 + }, + { + "epoch": 0.70220947265625, + "grad_norm": 16.102184295654297, + "learning_rate": 9.262110353405603e-06, + "loss": 5.0202, + "step": 34515 + }, + { + "epoch": 0.7023111979166666, + "grad_norm": 21.761878967285156, + "learning_rate": 9.261901366717871e-06, + "loss": 4.8433, + "step": 34520 + }, + { + "epoch": 0.7024129231770834, + "grad_norm": 17.190597534179688, + "learning_rate": 9.261692352797943e-06, + "loss": 4.9278, + "step": 34525 + }, + { + "epoch": 0.7025146484375, + "grad_norm": 14.794886589050293, + "learning_rate": 9.261483311647155e-06, + "loss": 5.2076, + "step": 34530 + }, + { + "epoch": 0.7026163736979166, + "grad_norm": 15.294554710388184, + "learning_rate": 9.26127424326684e-06, + "loss": 5.0906, + "step": 34535 + }, + { + "epoch": 0.7027180989583334, + "grad_norm": 18.515214920043945, + "learning_rate": 9.261065147658335e-06, + "loss": 4.838, + "step": 34540 + }, + { + "epoch": 0.70281982421875, + "grad_norm": 16.450048446655273, + "learning_rate": 9.260856024822977e-06, + "loss": 5.2866, + "step": 34545 + }, + { + "epoch": 0.7029215494791666, + "grad_norm": 17.918468475341797, + "learning_rate": 9.260646874762102e-06, + "loss": 4.9136, + "step": 34550 + }, + { + "epoch": 0.7030232747395834, + "grad_norm": 18.882238388061523, + "learning_rate": 9.260437697477046e-06, + "loss": 4.8711, + "step": 34555 + }, + { + "epoch": 0.703125, + "grad_norm": 12.236174583435059, + "learning_rate": 9.260228492969145e-06, + "loss": 5.1106, + "step": 34560 + }, + { + "epoch": 0.7032267252604166, + "grad_norm": 17.422693252563477, + "learning_rate": 9.260019261239736e-06, + "loss": 5.2389, + "step": 34565 + }, + { + "epoch": 0.7033284505208334, + "grad_norm": 18.569095611572266, + "learning_rate": 9.259810002290157e-06, + "loss": 4.9338, + "step": 34570 + }, + { + "epoch": 0.70343017578125, + "grad_norm": 17.202327728271484, + "learning_rate": 9.259600716121744e-06, + "loss": 5.1506, + "step": 34575 + }, + { + "epoch": 0.7035319010416666, + "grad_norm": 18.02789306640625, + "learning_rate": 9.259391402735835e-06, + "loss": 5.3949, + "step": 34580 + }, + { + "epoch": 0.7036336263020834, + "grad_norm": 17.467214584350586, + "learning_rate": 9.259182062133767e-06, + "loss": 5.1572, + "step": 34585 + }, + { + "epoch": 0.7037353515625, + "grad_norm": 15.75724983215332, + "learning_rate": 9.258972694316877e-06, + "loss": 5.1425, + "step": 34590 + }, + { + "epoch": 0.7038370768229166, + "grad_norm": 13.855957984924316, + "learning_rate": 9.258763299286504e-06, + "loss": 5.2486, + "step": 34595 + }, + { + "epoch": 0.7039388020833334, + "grad_norm": 16.487850189208984, + "learning_rate": 9.258553877043985e-06, + "loss": 5.4373, + "step": 34600 + }, + { + "epoch": 0.70404052734375, + "grad_norm": 13.563170433044434, + "learning_rate": 9.258344427590658e-06, + "loss": 5.1813, + "step": 34605 + }, + { + "epoch": 0.7041422526041666, + "grad_norm": 17.875911712646484, + "learning_rate": 9.258134950927862e-06, + "loss": 4.8272, + "step": 34610 + }, + { + "epoch": 0.7042439778645834, + "grad_norm": 13.417071342468262, + "learning_rate": 9.257925447056937e-06, + "loss": 5.1992, + "step": 34615 + }, + { + "epoch": 0.704345703125, + "grad_norm": 15.059382438659668, + "learning_rate": 9.257715915979219e-06, + "loss": 5.0435, + "step": 34620 + }, + { + "epoch": 0.7044474283854166, + "grad_norm": 24.127153396606445, + "learning_rate": 9.25750635769605e-06, + "loss": 4.9925, + "step": 34625 + }, + { + "epoch": 0.7045491536458334, + "grad_norm": 13.811615943908691, + "learning_rate": 9.257296772208763e-06, + "loss": 5.0923, + "step": 34630 + }, + { + "epoch": 0.70465087890625, + "grad_norm": 18.164920806884766, + "learning_rate": 9.257087159518702e-06, + "loss": 5.032, + "step": 34635 + }, + { + "epoch": 0.7047526041666666, + "grad_norm": 13.059317588806152, + "learning_rate": 9.256877519627208e-06, + "loss": 5.1975, + "step": 34640 + }, + { + "epoch": 0.7048543294270834, + "grad_norm": 17.31121826171875, + "learning_rate": 9.256667852535616e-06, + "loss": 5.0611, + "step": 34645 + }, + { + "epoch": 0.7049560546875, + "grad_norm": 13.512758255004883, + "learning_rate": 9.256458158245267e-06, + "loss": 5.1275, + "step": 34650 + }, + { + "epoch": 0.7050577799479166, + "grad_norm": 17.693775177001953, + "learning_rate": 9.256248436757502e-06, + "loss": 4.8492, + "step": 34655 + }, + { + "epoch": 0.7051595052083334, + "grad_norm": 18.929668426513672, + "learning_rate": 9.256038688073664e-06, + "loss": 5.2562, + "step": 34660 + }, + { + "epoch": 0.70526123046875, + "grad_norm": 13.197490692138672, + "learning_rate": 9.255828912195085e-06, + "loss": 5.205, + "step": 34665 + }, + { + "epoch": 0.7053629557291666, + "grad_norm": 14.471574783325195, + "learning_rate": 9.255619109123113e-06, + "loss": 5.0785, + "step": 34670 + }, + { + "epoch": 0.7054646809895834, + "grad_norm": 19.29319953918457, + "learning_rate": 9.255409278859085e-06, + "loss": 4.9915, + "step": 34675 + }, + { + "epoch": 0.70556640625, + "grad_norm": 18.051069259643555, + "learning_rate": 9.255199421404342e-06, + "loss": 5.0739, + "step": 34680 + }, + { + "epoch": 0.7056681315104166, + "grad_norm": 19.580657958984375, + "learning_rate": 9.254989536760225e-06, + "loss": 4.9675, + "step": 34685 + }, + { + "epoch": 0.7057698567708334, + "grad_norm": 16.206655502319336, + "learning_rate": 9.254779624928078e-06, + "loss": 5.1133, + "step": 34690 + }, + { + "epoch": 0.70587158203125, + "grad_norm": 24.35723876953125, + "learning_rate": 9.254569685909239e-06, + "loss": 5.2446, + "step": 34695 + }, + { + "epoch": 0.7059733072916666, + "grad_norm": 15.329630851745605, + "learning_rate": 9.254359719705049e-06, + "loss": 5.155, + "step": 34700 + }, + { + "epoch": 0.7060750325520834, + "grad_norm": 13.338244438171387, + "learning_rate": 9.254149726316852e-06, + "loss": 5.0151, + "step": 34705 + }, + { + "epoch": 0.7061767578125, + "grad_norm": 17.325756072998047, + "learning_rate": 9.25393970574599e-06, + "loss": 5.0495, + "step": 34710 + }, + { + "epoch": 0.7062784830729166, + "grad_norm": 15.57020092010498, + "learning_rate": 9.2537296579938e-06, + "loss": 5.1514, + "step": 34715 + }, + { + "epoch": 0.7063802083333334, + "grad_norm": 15.923577308654785, + "learning_rate": 9.25351958306163e-06, + "loss": 5.2648, + "step": 34720 + }, + { + "epoch": 0.70648193359375, + "grad_norm": 16.02031707763672, + "learning_rate": 9.253309480950821e-06, + "loss": 5.1423, + "step": 34725 + }, + { + "epoch": 0.7065836588541666, + "grad_norm": 16.180065155029297, + "learning_rate": 9.253099351662713e-06, + "loss": 5.2284, + "step": 34730 + }, + { + "epoch": 0.7066853841145834, + "grad_norm": 20.133136749267578, + "learning_rate": 9.25288919519865e-06, + "loss": 4.8201, + "step": 34735 + }, + { + "epoch": 0.706787109375, + "grad_norm": 17.125762939453125, + "learning_rate": 9.252679011559977e-06, + "loss": 5.0556, + "step": 34740 + }, + { + "epoch": 0.7068888346354166, + "grad_norm": 18.847972869873047, + "learning_rate": 9.252468800748034e-06, + "loss": 4.7542, + "step": 34745 + }, + { + "epoch": 0.7069905598958334, + "grad_norm": 17.865657806396484, + "learning_rate": 9.252258562764165e-06, + "loss": 5.2437, + "step": 34750 + }, + { + "epoch": 0.70709228515625, + "grad_norm": 19.961767196655273, + "learning_rate": 9.252048297609714e-06, + "loss": 5.0947, + "step": 34755 + }, + { + "epoch": 0.7071940104166666, + "grad_norm": 16.42935562133789, + "learning_rate": 9.251838005286025e-06, + "loss": 5.3991, + "step": 34760 + }, + { + "epoch": 0.7072957356770834, + "grad_norm": 18.84818458557129, + "learning_rate": 9.251627685794439e-06, + "loss": 4.829, + "step": 34765 + }, + { + "epoch": 0.7073974609375, + "grad_norm": 15.889172554016113, + "learning_rate": 9.251417339136302e-06, + "loss": 5.3129, + "step": 34770 + }, + { + "epoch": 0.7074991861979166, + "grad_norm": 16.68112564086914, + "learning_rate": 9.251206965312959e-06, + "loss": 5.0915, + "step": 34775 + }, + { + "epoch": 0.7076009114583334, + "grad_norm": 19.643468856811523, + "learning_rate": 9.250996564325751e-06, + "loss": 5.3143, + "step": 34780 + }, + { + "epoch": 0.70770263671875, + "grad_norm": 15.219221115112305, + "learning_rate": 9.250786136176026e-06, + "loss": 5.0871, + "step": 34785 + }, + { + "epoch": 0.7078043619791666, + "grad_norm": 20.69162940979004, + "learning_rate": 9.250575680865128e-06, + "loss": 5.0944, + "step": 34790 + }, + { + "epoch": 0.7079060872395834, + "grad_norm": 14.00648021697998, + "learning_rate": 9.2503651983944e-06, + "loss": 5.2133, + "step": 34795 + }, + { + "epoch": 0.7080078125, + "grad_norm": 16.14615821838379, + "learning_rate": 9.250154688765187e-06, + "loss": 4.965, + "step": 34800 + }, + { + "epoch": 0.7081095377604166, + "grad_norm": 13.533339500427246, + "learning_rate": 9.249944151978835e-06, + "loss": 5.2099, + "step": 34805 + }, + { + "epoch": 0.7082112630208334, + "grad_norm": 18.40241050720215, + "learning_rate": 9.249733588036688e-06, + "loss": 5.4753, + "step": 34810 + }, + { + "epoch": 0.70831298828125, + "grad_norm": 15.66211223602295, + "learning_rate": 9.249522996940095e-06, + "loss": 4.9882, + "step": 34815 + }, + { + "epoch": 0.7084147135416666, + "grad_norm": 16.11068344116211, + "learning_rate": 9.249312378690397e-06, + "loss": 4.9964, + "step": 34820 + }, + { + "epoch": 0.7085164388020834, + "grad_norm": 14.647462844848633, + "learning_rate": 9.24910173328894e-06, + "loss": 5.3023, + "step": 34825 + }, + { + "epoch": 0.7086181640625, + "grad_norm": 17.884246826171875, + "learning_rate": 9.248891060737076e-06, + "loss": 5.0055, + "step": 34830 + }, + { + "epoch": 0.7087198893229166, + "grad_norm": 18.264970779418945, + "learning_rate": 9.248680361036144e-06, + "loss": 5.0946, + "step": 34835 + }, + { + "epoch": 0.7088216145833334, + "grad_norm": 24.072484970092773, + "learning_rate": 9.248469634187494e-06, + "loss": 5.0563, + "step": 34840 + }, + { + "epoch": 0.70892333984375, + "grad_norm": 15.367345809936523, + "learning_rate": 9.24825888019247e-06, + "loss": 5.0857, + "step": 34845 + }, + { + "epoch": 0.7090250651041666, + "grad_norm": 23.836231231689453, + "learning_rate": 9.248048099052424e-06, + "loss": 5.1852, + "step": 34850 + }, + { + "epoch": 0.7091267903645834, + "grad_norm": 18.417949676513672, + "learning_rate": 9.247837290768697e-06, + "loss": 5.5514, + "step": 34855 + }, + { + "epoch": 0.709228515625, + "grad_norm": 20.023395538330078, + "learning_rate": 9.24762645534264e-06, + "loss": 5.1269, + "step": 34860 + }, + { + "epoch": 0.7093302408854166, + "grad_norm": 16.342702865600586, + "learning_rate": 9.247415592775595e-06, + "loss": 4.9655, + "step": 34865 + }, + { + "epoch": 0.7094319661458334, + "grad_norm": 14.989740371704102, + "learning_rate": 9.247204703068915e-06, + "loss": 4.9738, + "step": 34870 + }, + { + "epoch": 0.70953369140625, + "grad_norm": 15.028088569641113, + "learning_rate": 9.246993786223943e-06, + "loss": 5.1866, + "step": 34875 + }, + { + "epoch": 0.7096354166666666, + "grad_norm": 24.009708404541016, + "learning_rate": 9.24678284224203e-06, + "loss": 5.1869, + "step": 34880 + }, + { + "epoch": 0.7097371419270834, + "grad_norm": 13.939474105834961, + "learning_rate": 9.246571871124524e-06, + "loss": 5.1133, + "step": 34885 + }, + { + "epoch": 0.7098388671875, + "grad_norm": 15.783243179321289, + "learning_rate": 9.246360872872772e-06, + "loss": 4.9457, + "step": 34890 + }, + { + "epoch": 0.7099405924479166, + "grad_norm": 20.735567092895508, + "learning_rate": 9.246149847488121e-06, + "loss": 5.1607, + "step": 34895 + }, + { + "epoch": 0.7100423177083334, + "grad_norm": 12.985214233398438, + "learning_rate": 9.24593879497192e-06, + "loss": 5.2859, + "step": 34900 + }, + { + "epoch": 0.71014404296875, + "grad_norm": 14.109498977661133, + "learning_rate": 9.245727715325518e-06, + "loss": 5.0972, + "step": 34905 + }, + { + "epoch": 0.7102457682291666, + "grad_norm": 19.337976455688477, + "learning_rate": 9.245516608550264e-06, + "loss": 5.0941, + "step": 34910 + }, + { + "epoch": 0.7103474934895834, + "grad_norm": 15.07957649230957, + "learning_rate": 9.245305474647507e-06, + "loss": 5.0938, + "step": 34915 + }, + { + "epoch": 0.71044921875, + "grad_norm": 16.004199981689453, + "learning_rate": 9.245094313618595e-06, + "loss": 5.0364, + "step": 34920 + }, + { + "epoch": 0.7105509440104166, + "grad_norm": 13.537200927734375, + "learning_rate": 9.244883125464879e-06, + "loss": 4.9399, + "step": 34925 + }, + { + "epoch": 0.7106526692708334, + "grad_norm": 15.5360107421875, + "learning_rate": 9.244671910187706e-06, + "loss": 4.9357, + "step": 34930 + }, + { + "epoch": 0.71075439453125, + "grad_norm": 14.737747192382812, + "learning_rate": 9.244460667788426e-06, + "loss": 5.101, + "step": 34935 + }, + { + "epoch": 0.7108561197916666, + "grad_norm": 16.66351890563965, + "learning_rate": 9.244249398268393e-06, + "loss": 4.9303, + "step": 34940 + }, + { + "epoch": 0.7109578450520834, + "grad_norm": 14.15138053894043, + "learning_rate": 9.244038101628952e-06, + "loss": 4.9758, + "step": 34945 + }, + { + "epoch": 0.7110595703125, + "grad_norm": 17.759737014770508, + "learning_rate": 9.243826777871454e-06, + "loss": 4.8749, + "step": 34950 + }, + { + "epoch": 0.7111612955729166, + "grad_norm": 21.50530433654785, + "learning_rate": 9.24361542699725e-06, + "loss": 5.2428, + "step": 34955 + }, + { + "epoch": 0.7112630208333334, + "grad_norm": 25.56772804260254, + "learning_rate": 9.243404049007691e-06, + "loss": 5.2481, + "step": 34960 + }, + { + "epoch": 0.71136474609375, + "grad_norm": 20.751554489135742, + "learning_rate": 9.243192643904126e-06, + "loss": 4.965, + "step": 34965 + }, + { + "epoch": 0.7114664713541666, + "grad_norm": 15.202127456665039, + "learning_rate": 9.242981211687908e-06, + "loss": 5.1617, + "step": 34970 + }, + { + "epoch": 0.7115681966145834, + "grad_norm": 29.89778709411621, + "learning_rate": 9.242769752360387e-06, + "loss": 5.0272, + "step": 34975 + }, + { + "epoch": 0.711669921875, + "grad_norm": 16.9115047454834, + "learning_rate": 9.242558265922916e-06, + "loss": 4.9194, + "step": 34980 + }, + { + "epoch": 0.7117716471354166, + "grad_norm": 14.19572639465332, + "learning_rate": 9.242346752376843e-06, + "loss": 4.9213, + "step": 34985 + }, + { + "epoch": 0.7118733723958334, + "grad_norm": 15.964802742004395, + "learning_rate": 9.24213521172352e-06, + "loss": 5.1034, + "step": 34990 + }, + { + "epoch": 0.71197509765625, + "grad_norm": 23.961143493652344, + "learning_rate": 9.241923643964301e-06, + "loss": 5.1047, + "step": 34995 + }, + { + "epoch": 0.7120768229166666, + "grad_norm": 18.94865608215332, + "learning_rate": 9.241712049100536e-06, + "loss": 4.5706, + "step": 35000 + }, + { + "epoch": 0.7121785481770834, + "grad_norm": 22.091062545776367, + "learning_rate": 9.241500427133577e-06, + "loss": 5.1007, + "step": 35005 + }, + { + "epoch": 0.7122802734375, + "grad_norm": 16.014564514160156, + "learning_rate": 9.241288778064777e-06, + "loss": 5.0236, + "step": 35010 + }, + { + "epoch": 0.7123819986979166, + "grad_norm": 15.60103702545166, + "learning_rate": 9.241077101895488e-06, + "loss": 5.2606, + "step": 35015 + }, + { + "epoch": 0.7124837239583334, + "grad_norm": 20.24944305419922, + "learning_rate": 9.240865398627064e-06, + "loss": 4.9728, + "step": 35020 + }, + { + "epoch": 0.71258544921875, + "grad_norm": 19.413902282714844, + "learning_rate": 9.240653668260854e-06, + "loss": 5.173, + "step": 35025 + }, + { + "epoch": 0.7126871744791666, + "grad_norm": 16.059064865112305, + "learning_rate": 9.240441910798215e-06, + "loss": 4.7823, + "step": 35030 + }, + { + "epoch": 0.7127888997395834, + "grad_norm": 23.843433380126953, + "learning_rate": 9.240230126240497e-06, + "loss": 5.1885, + "step": 35035 + }, + { + "epoch": 0.712890625, + "grad_norm": 18.710988998413086, + "learning_rate": 9.240018314589054e-06, + "loss": 5.0995, + "step": 35040 + }, + { + "epoch": 0.7129923502604166, + "grad_norm": 13.61864948272705, + "learning_rate": 9.239806475845241e-06, + "loss": 5.2557, + "step": 35045 + }, + { + "epoch": 0.7130940755208334, + "grad_norm": 19.99884033203125, + "learning_rate": 9.239594610010412e-06, + "loss": 5.1948, + "step": 35050 + }, + { + "epoch": 0.71319580078125, + "grad_norm": 15.95564079284668, + "learning_rate": 9.239382717085917e-06, + "loss": 5.0553, + "step": 35055 + }, + { + "epoch": 0.7132975260416666, + "grad_norm": 21.56372833251953, + "learning_rate": 9.239170797073114e-06, + "loss": 4.778, + "step": 35060 + }, + { + "epoch": 0.7133992513020834, + "grad_norm": 20.03286361694336, + "learning_rate": 9.238958849973354e-06, + "loss": 5.0507, + "step": 35065 + }, + { + "epoch": 0.7135009765625, + "grad_norm": 18.319156646728516, + "learning_rate": 9.238746875787992e-06, + "loss": 4.9337, + "step": 35070 + }, + { + "epoch": 0.7136027018229166, + "grad_norm": 19.073781967163086, + "learning_rate": 9.238534874518383e-06, + "loss": 5.281, + "step": 35075 + }, + { + "epoch": 0.7137044270833334, + "grad_norm": 15.609333038330078, + "learning_rate": 9.238322846165883e-06, + "loss": 5.2411, + "step": 35080 + }, + { + "epoch": 0.71380615234375, + "grad_norm": 16.65614891052246, + "learning_rate": 9.238110790731846e-06, + "loss": 5.0518, + "step": 35085 + }, + { + "epoch": 0.7139078776041666, + "grad_norm": 16.54350471496582, + "learning_rate": 9.237898708217625e-06, + "loss": 4.9775, + "step": 35090 + }, + { + "epoch": 0.7140096028645834, + "grad_norm": 11.578980445861816, + "learning_rate": 9.237686598624577e-06, + "loss": 5.3846, + "step": 35095 + }, + { + "epoch": 0.714111328125, + "grad_norm": 17.512836456298828, + "learning_rate": 9.237474461954056e-06, + "loss": 4.845, + "step": 35100 + }, + { + "epoch": 0.7142130533854166, + "grad_norm": 16.968067169189453, + "learning_rate": 9.23726229820742e-06, + "loss": 5.0808, + "step": 35105 + }, + { + "epoch": 0.7143147786458334, + "grad_norm": 15.524112701416016, + "learning_rate": 9.237050107386024e-06, + "loss": 5.0719, + "step": 35110 + }, + { + "epoch": 0.71441650390625, + "grad_norm": 24.18496322631836, + "learning_rate": 9.23683788949122e-06, + "loss": 5.011, + "step": 35115 + }, + { + "epoch": 0.7145182291666666, + "grad_norm": 16.445016860961914, + "learning_rate": 9.236625644524367e-06, + "loss": 5.2887, + "step": 35120 + }, + { + "epoch": 0.7146199544270834, + "grad_norm": 16.495174407958984, + "learning_rate": 9.236413372486822e-06, + "loss": 4.978, + "step": 35125 + }, + { + "epoch": 0.7147216796875, + "grad_norm": 18.381668090820312, + "learning_rate": 9.23620107337994e-06, + "loss": 5.0579, + "step": 35130 + }, + { + "epoch": 0.7148234049479166, + "grad_norm": 18.548847198486328, + "learning_rate": 9.235988747205078e-06, + "loss": 5.0308, + "step": 35135 + }, + { + "epoch": 0.7149251302083334, + "grad_norm": 17.512746810913086, + "learning_rate": 9.235776393963594e-06, + "loss": 5.2924, + "step": 35140 + }, + { + "epoch": 0.71502685546875, + "grad_norm": 13.221979141235352, + "learning_rate": 9.235564013656841e-06, + "loss": 5.0728, + "step": 35145 + }, + { + "epoch": 0.7151285807291666, + "grad_norm": 17.875953674316406, + "learning_rate": 9.23535160628618e-06, + "loss": 5.1637, + "step": 35150 + }, + { + "epoch": 0.7152303059895834, + "grad_norm": 18.276819229125977, + "learning_rate": 9.235139171852965e-06, + "loss": 5.0361, + "step": 35155 + }, + { + "epoch": 0.71533203125, + "grad_norm": 13.301962852478027, + "learning_rate": 9.234926710358558e-06, + "loss": 4.8555, + "step": 35160 + }, + { + "epoch": 0.7154337565104166, + "grad_norm": 19.59075927734375, + "learning_rate": 9.234714221804312e-06, + "loss": 5.2624, + "step": 35165 + }, + { + "epoch": 0.7155354817708334, + "grad_norm": 15.525080680847168, + "learning_rate": 9.234501706191587e-06, + "loss": 5.1158, + "step": 35170 + }, + { + "epoch": 0.71563720703125, + "grad_norm": 18.028263092041016, + "learning_rate": 9.234289163521739e-06, + "loss": 5.0258, + "step": 35175 + }, + { + "epoch": 0.7157389322916666, + "grad_norm": 20.788185119628906, + "learning_rate": 9.234076593796128e-06, + "loss": 5.1602, + "step": 35180 + }, + { + "epoch": 0.7158406575520834, + "grad_norm": 19.176916122436523, + "learning_rate": 9.233863997016113e-06, + "loss": 5.0281, + "step": 35185 + }, + { + "epoch": 0.7159423828125, + "grad_norm": 18.765586853027344, + "learning_rate": 9.233651373183049e-06, + "loss": 4.9536, + "step": 35190 + }, + { + "epoch": 0.7160441080729166, + "grad_norm": 16.181142807006836, + "learning_rate": 9.233438722298299e-06, + "loss": 5.202, + "step": 35195 + }, + { + "epoch": 0.7161458333333334, + "grad_norm": 18.6247615814209, + "learning_rate": 9.233226044363218e-06, + "loss": 5.0059, + "step": 35200 + }, + { + "epoch": 0.71624755859375, + "grad_norm": 19.309417724609375, + "learning_rate": 9.233013339379165e-06, + "loss": 5.0127, + "step": 35205 + }, + { + "epoch": 0.7163492838541666, + "grad_norm": 14.822741508483887, + "learning_rate": 9.232800607347503e-06, + "loss": 5.2951, + "step": 35210 + }, + { + "epoch": 0.7164510091145834, + "grad_norm": 17.788217544555664, + "learning_rate": 9.232587848269588e-06, + "loss": 5.0526, + "step": 35215 + }, + { + "epoch": 0.716552734375, + "grad_norm": 13.837873458862305, + "learning_rate": 9.232375062146778e-06, + "loss": 5.0886, + "step": 35220 + }, + { + "epoch": 0.7166544596354166, + "grad_norm": 17.808300018310547, + "learning_rate": 9.232162248980438e-06, + "loss": 5.3227, + "step": 35225 + }, + { + "epoch": 0.7167561848958334, + "grad_norm": 20.315610885620117, + "learning_rate": 9.231949408771923e-06, + "loss": 4.7359, + "step": 35230 + }, + { + "epoch": 0.71685791015625, + "grad_norm": 22.526718139648438, + "learning_rate": 9.231736541522595e-06, + "loss": 5.1057, + "step": 35235 + }, + { + "epoch": 0.7169596354166666, + "grad_norm": 17.659210205078125, + "learning_rate": 9.231523647233815e-06, + "loss": 5.2608, + "step": 35240 + }, + { + "epoch": 0.7170613606770834, + "grad_norm": 18.47774887084961, + "learning_rate": 9.231310725906941e-06, + "loss": 5.1748, + "step": 35245 + }, + { + "epoch": 0.7171630859375, + "grad_norm": 17.63787269592285, + "learning_rate": 9.231097777543334e-06, + "loss": 5.2943, + "step": 35250 + }, + { + "epoch": 0.7172648111979166, + "grad_norm": 15.319204330444336, + "learning_rate": 9.230884802144358e-06, + "loss": 5.1305, + "step": 35255 + }, + { + "epoch": 0.7173665364583334, + "grad_norm": 15.186155319213867, + "learning_rate": 9.230671799711368e-06, + "loss": 5.0017, + "step": 35260 + }, + { + "epoch": 0.71746826171875, + "grad_norm": 15.99899959564209, + "learning_rate": 9.230458770245729e-06, + "loss": 5.0231, + "step": 35265 + }, + { + "epoch": 0.7175699869791666, + "grad_norm": 20.745784759521484, + "learning_rate": 9.230245713748803e-06, + "loss": 4.8675, + "step": 35270 + }, + { + "epoch": 0.7176717122395834, + "grad_norm": 16.983890533447266, + "learning_rate": 9.23003263022195e-06, + "loss": 4.9169, + "step": 35275 + }, + { + "epoch": 0.7177734375, + "grad_norm": 15.990869522094727, + "learning_rate": 9.229819519666529e-06, + "loss": 5.1253, + "step": 35280 + }, + { + "epoch": 0.7178751627604166, + "grad_norm": 18.611051559448242, + "learning_rate": 9.229606382083905e-06, + "loss": 5.3594, + "step": 35285 + }, + { + "epoch": 0.7179768880208334, + "grad_norm": 18.920499801635742, + "learning_rate": 9.229393217475439e-06, + "loss": 4.9882, + "step": 35290 + }, + { + "epoch": 0.71807861328125, + "grad_norm": 15.53471851348877, + "learning_rate": 9.229180025842492e-06, + "loss": 5.1493, + "step": 35295 + }, + { + "epoch": 0.7181803385416666, + "grad_norm": 24.478984832763672, + "learning_rate": 9.228966807186428e-06, + "loss": 5.5225, + "step": 35300 + }, + { + "epoch": 0.7182820638020834, + "grad_norm": 16.698383331298828, + "learning_rate": 9.22875356150861e-06, + "loss": 4.9559, + "step": 35305 + }, + { + "epoch": 0.7183837890625, + "grad_norm": 19.530277252197266, + "learning_rate": 9.228540288810397e-06, + "loss": 5.2056, + "step": 35310 + }, + { + "epoch": 0.7184855143229166, + "grad_norm": 16.714406967163086, + "learning_rate": 9.228326989093155e-06, + "loss": 4.9762, + "step": 35315 + }, + { + "epoch": 0.7185872395833334, + "grad_norm": 17.73603630065918, + "learning_rate": 9.228113662358245e-06, + "loss": 4.9698, + "step": 35320 + }, + { + "epoch": 0.71868896484375, + "grad_norm": 13.201054573059082, + "learning_rate": 9.22790030860703e-06, + "loss": 5.1631, + "step": 35325 + }, + { + "epoch": 0.7187906901041666, + "grad_norm": 18.494524002075195, + "learning_rate": 9.227686927840876e-06, + "loss": 5.2822, + "step": 35330 + }, + { + "epoch": 0.7188924153645834, + "grad_norm": 20.4510555267334, + "learning_rate": 9.227473520061144e-06, + "loss": 5.4123, + "step": 35335 + }, + { + "epoch": 0.718994140625, + "grad_norm": 12.569093704223633, + "learning_rate": 9.227260085269199e-06, + "loss": 5.1959, + "step": 35340 + }, + { + "epoch": 0.7190958658854166, + "grad_norm": 15.549673080444336, + "learning_rate": 9.227046623466403e-06, + "loss": 5.0282, + "step": 35345 + }, + { + "epoch": 0.7191975911458334, + "grad_norm": 13.880372047424316, + "learning_rate": 9.22683313465412e-06, + "loss": 5.093, + "step": 35350 + }, + { + "epoch": 0.71929931640625, + "grad_norm": 17.93922233581543, + "learning_rate": 9.226619618833717e-06, + "loss": 4.9916, + "step": 35355 + }, + { + "epoch": 0.7194010416666666, + "grad_norm": 18.28989601135254, + "learning_rate": 9.226406076006555e-06, + "loss": 5.0431, + "step": 35360 + }, + { + "epoch": 0.7195027669270834, + "grad_norm": 17.990461349487305, + "learning_rate": 9.226192506174e-06, + "loss": 5.0346, + "step": 35365 + }, + { + "epoch": 0.7196044921875, + "grad_norm": 17.530244827270508, + "learning_rate": 9.225978909337417e-06, + "loss": 5.2036, + "step": 35370 + }, + { + "epoch": 0.7197062174479166, + "grad_norm": 11.618287086486816, + "learning_rate": 9.225765285498169e-06, + "loss": 4.8001, + "step": 35375 + }, + { + "epoch": 0.7198079427083334, + "grad_norm": 16.868194580078125, + "learning_rate": 9.225551634657623e-06, + "loss": 5.0454, + "step": 35380 + }, + { + "epoch": 0.71990966796875, + "grad_norm": 20.400249481201172, + "learning_rate": 9.225337956817144e-06, + "loss": 5.3731, + "step": 35385 + }, + { + "epoch": 0.7200113932291666, + "grad_norm": 15.904505729675293, + "learning_rate": 9.225124251978097e-06, + "loss": 5.1894, + "step": 35390 + }, + { + "epoch": 0.7201131184895834, + "grad_norm": 14.26028823852539, + "learning_rate": 9.224910520141846e-06, + "loss": 5.1155, + "step": 35395 + }, + { + "epoch": 0.72021484375, + "grad_norm": 15.745200157165527, + "learning_rate": 9.22469676130976e-06, + "loss": 5.1036, + "step": 35400 + }, + { + "epoch": 0.7203165690104166, + "grad_norm": 23.10540199279785, + "learning_rate": 9.2244829754832e-06, + "loss": 5.063, + "step": 35405 + }, + { + "epoch": 0.7204182942708334, + "grad_norm": 17.2059326171875, + "learning_rate": 9.224269162663537e-06, + "loss": 4.9427, + "step": 35410 + }, + { + "epoch": 0.72052001953125, + "grad_norm": 17.958906173706055, + "learning_rate": 9.224055322852134e-06, + "loss": 5.0222, + "step": 35415 + }, + { + "epoch": 0.7206217447916666, + "grad_norm": 16.526124954223633, + "learning_rate": 9.223841456050359e-06, + "loss": 4.9018, + "step": 35420 + }, + { + "epoch": 0.7207234700520834, + "grad_norm": 19.434175491333008, + "learning_rate": 9.223627562259577e-06, + "loss": 4.9374, + "step": 35425 + }, + { + "epoch": 0.7208251953125, + "grad_norm": 17.054489135742188, + "learning_rate": 9.223413641481156e-06, + "loss": 5.225, + "step": 35430 + }, + { + "epoch": 0.7209269205729166, + "grad_norm": 18.838945388793945, + "learning_rate": 9.223199693716463e-06, + "loss": 5.0565, + "step": 35435 + }, + { + "epoch": 0.7210286458333334, + "grad_norm": 14.516705513000488, + "learning_rate": 9.222985718966865e-06, + "loss": 5.2554, + "step": 35440 + }, + { + "epoch": 0.72113037109375, + "grad_norm": 14.123825073242188, + "learning_rate": 9.222771717233727e-06, + "loss": 4.949, + "step": 35445 + }, + { + "epoch": 0.7212320963541666, + "grad_norm": 18.700973510742188, + "learning_rate": 9.22255768851842e-06, + "loss": 5.2065, + "step": 35450 + }, + { + "epoch": 0.7213338216145834, + "grad_norm": 19.0125675201416, + "learning_rate": 9.222343632822309e-06, + "loss": 5.2707, + "step": 35455 + }, + { + "epoch": 0.721435546875, + "grad_norm": 14.846098899841309, + "learning_rate": 9.222129550146763e-06, + "loss": 5.3277, + "step": 35460 + }, + { + "epoch": 0.7215372721354166, + "grad_norm": 15.721159934997559, + "learning_rate": 9.221915440493148e-06, + "loss": 5.5936, + "step": 35465 + }, + { + "epoch": 0.7216389973958334, + "grad_norm": 12.280821800231934, + "learning_rate": 9.221701303862836e-06, + "loss": 4.9791, + "step": 35470 + }, + { + "epoch": 0.72174072265625, + "grad_norm": 18.160680770874023, + "learning_rate": 9.22148714025719e-06, + "loss": 5.2283, + "step": 35475 + }, + { + "epoch": 0.7218424479166666, + "grad_norm": 14.291827201843262, + "learning_rate": 9.221272949677585e-06, + "loss": 5.1898, + "step": 35480 + }, + { + "epoch": 0.7219441731770834, + "grad_norm": 35.63920593261719, + "learning_rate": 9.221058732125382e-06, + "loss": 5.4766, + "step": 35485 + }, + { + "epoch": 0.7220458984375, + "grad_norm": 13.01709270477295, + "learning_rate": 9.220844487601955e-06, + "loss": 5.1958, + "step": 35490 + }, + { + "epoch": 0.7221476236979166, + "grad_norm": 15.562185287475586, + "learning_rate": 9.220630216108672e-06, + "loss": 5.0665, + "step": 35495 + }, + { + "epoch": 0.7222493489583334, + "grad_norm": 22.99993133544922, + "learning_rate": 9.220415917646901e-06, + "loss": 4.9275, + "step": 35500 + }, + { + "epoch": 0.72235107421875, + "grad_norm": 14.701028823852539, + "learning_rate": 9.220201592218013e-06, + "loss": 5.0463, + "step": 35505 + }, + { + "epoch": 0.7224527994791666, + "grad_norm": 18.97341537475586, + "learning_rate": 9.219987239823375e-06, + "loss": 5.1574, + "step": 35510 + }, + { + "epoch": 0.7225545247395834, + "grad_norm": 14.5440092086792, + "learning_rate": 9.21977286046436e-06, + "loss": 5.1425, + "step": 35515 + }, + { + "epoch": 0.72265625, + "grad_norm": 17.162641525268555, + "learning_rate": 9.219558454142336e-06, + "loss": 4.8804, + "step": 35520 + }, + { + "epoch": 0.7227579752604166, + "grad_norm": 15.183576583862305, + "learning_rate": 9.219344020858671e-06, + "loss": 5.1896, + "step": 35525 + }, + { + "epoch": 0.7228597005208334, + "grad_norm": 15.809552192687988, + "learning_rate": 9.219129560614738e-06, + "loss": 5.1175, + "step": 35530 + }, + { + "epoch": 0.72296142578125, + "grad_norm": 16.37314224243164, + "learning_rate": 9.218915073411907e-06, + "loss": 5.307, + "step": 35535 + }, + { + "epoch": 0.7230631510416666, + "grad_norm": 21.71489143371582, + "learning_rate": 9.218700559251547e-06, + "loss": 5.5032, + "step": 35540 + }, + { + "epoch": 0.7231648763020834, + "grad_norm": 20.325117111206055, + "learning_rate": 9.218486018135031e-06, + "loss": 5.144, + "step": 35545 + }, + { + "epoch": 0.7232666015625, + "grad_norm": 18.03512191772461, + "learning_rate": 9.218271450063728e-06, + "loss": 5.1893, + "step": 35550 + }, + { + "epoch": 0.7233683268229166, + "grad_norm": 15.90255355834961, + "learning_rate": 9.21805685503901e-06, + "loss": 5.2314, + "step": 35555 + }, + { + "epoch": 0.7234700520833334, + "grad_norm": 17.32933235168457, + "learning_rate": 9.217842233062247e-06, + "loss": 5.0944, + "step": 35560 + }, + { + "epoch": 0.72357177734375, + "grad_norm": 12.962008476257324, + "learning_rate": 9.21762758413481e-06, + "loss": 5.0191, + "step": 35565 + }, + { + "epoch": 0.7236735026041666, + "grad_norm": 17.987489700317383, + "learning_rate": 9.217412908258073e-06, + "loss": 5.1264, + "step": 35570 + }, + { + "epoch": 0.7237752278645834, + "grad_norm": 18.17943572998047, + "learning_rate": 9.217198205433406e-06, + "loss": 5.0091, + "step": 35575 + }, + { + "epoch": 0.723876953125, + "grad_norm": 19.029170989990234, + "learning_rate": 9.21698347566218e-06, + "loss": 5.0459, + "step": 35580 + }, + { + "epoch": 0.7239786783854166, + "grad_norm": 11.989458084106445, + "learning_rate": 9.21676871894577e-06, + "loss": 4.9533, + "step": 35585 + }, + { + "epoch": 0.7240804036458334, + "grad_norm": 21.694168090820312, + "learning_rate": 9.216553935285546e-06, + "loss": 4.8771, + "step": 35590 + }, + { + "epoch": 0.72418212890625, + "grad_norm": 14.656665802001953, + "learning_rate": 9.21633912468288e-06, + "loss": 4.9294, + "step": 35595 + }, + { + "epoch": 0.7242838541666666, + "grad_norm": 16.109161376953125, + "learning_rate": 9.216124287139145e-06, + "loss": 5.5038, + "step": 35600 + }, + { + "epoch": 0.7243855794270834, + "grad_norm": 14.335966110229492, + "learning_rate": 9.215909422655716e-06, + "loss": 5.0416, + "step": 35605 + }, + { + "epoch": 0.7244873046875, + "grad_norm": 20.012474060058594, + "learning_rate": 9.215694531233964e-06, + "loss": 5.0512, + "step": 35610 + }, + { + "epoch": 0.7245890299479166, + "grad_norm": 14.587776184082031, + "learning_rate": 9.215479612875262e-06, + "loss": 4.9245, + "step": 35615 + }, + { + "epoch": 0.7246907552083334, + "grad_norm": 19.683324813842773, + "learning_rate": 9.215264667580984e-06, + "loss": 5.1142, + "step": 35620 + }, + { + "epoch": 0.72479248046875, + "grad_norm": 25.368186950683594, + "learning_rate": 9.2150496953525e-06, + "loss": 4.9949, + "step": 35625 + }, + { + "epoch": 0.7248942057291666, + "grad_norm": 19.23255729675293, + "learning_rate": 9.214834696191189e-06, + "loss": 5.2049, + "step": 35630 + }, + { + "epoch": 0.7249959309895834, + "grad_norm": 18.833356857299805, + "learning_rate": 9.214619670098422e-06, + "loss": 5.0494, + "step": 35635 + }, + { + "epoch": 0.72509765625, + "grad_norm": 17.50802993774414, + "learning_rate": 9.214404617075573e-06, + "loss": 5.1522, + "step": 35640 + }, + { + "epoch": 0.7251993815104166, + "grad_norm": 16.71397590637207, + "learning_rate": 9.214189537124014e-06, + "loss": 5.2367, + "step": 35645 + }, + { + "epoch": 0.7253011067708334, + "grad_norm": 15.823481559753418, + "learning_rate": 9.213974430245125e-06, + "loss": 4.8837, + "step": 35650 + }, + { + "epoch": 0.72540283203125, + "grad_norm": 13.398862838745117, + "learning_rate": 9.213759296440275e-06, + "loss": 5.3744, + "step": 35655 + }, + { + "epoch": 0.7255045572916666, + "grad_norm": 24.229272842407227, + "learning_rate": 9.21354413571084e-06, + "loss": 5.1426, + "step": 35660 + }, + { + "epoch": 0.7256062825520834, + "grad_norm": 13.31906795501709, + "learning_rate": 9.2133289480582e-06, + "loss": 5.1074, + "step": 35665 + }, + { + "epoch": 0.7257080078125, + "grad_norm": 20.043256759643555, + "learning_rate": 9.213113733483722e-06, + "loss": 4.9693, + "step": 35670 + }, + { + "epoch": 0.7258097330729166, + "grad_norm": 17.02350616455078, + "learning_rate": 9.212898491988785e-06, + "loss": 5.1013, + "step": 35675 + }, + { + "epoch": 0.7259114583333334, + "grad_norm": 17.42314910888672, + "learning_rate": 9.212683223574764e-06, + "loss": 4.8597, + "step": 35680 + }, + { + "epoch": 0.72601318359375, + "grad_norm": 19.522188186645508, + "learning_rate": 9.212467928243036e-06, + "loss": 5.024, + "step": 35685 + }, + { + "epoch": 0.7261149088541666, + "grad_norm": 17.613609313964844, + "learning_rate": 9.212252605994974e-06, + "loss": 4.8463, + "step": 35690 + }, + { + "epoch": 0.7262166341145834, + "grad_norm": 13.028299331665039, + "learning_rate": 9.212037256831954e-06, + "loss": 5.0809, + "step": 35695 + }, + { + "epoch": 0.726318359375, + "grad_norm": 16.06103515625, + "learning_rate": 9.211821880755355e-06, + "loss": 5.4283, + "step": 35700 + }, + { + "epoch": 0.7264200846354166, + "grad_norm": 17.646007537841797, + "learning_rate": 9.211606477766548e-06, + "loss": 4.9924, + "step": 35705 + }, + { + "epoch": 0.7265218098958334, + "grad_norm": 20.751699447631836, + "learning_rate": 9.211391047866916e-06, + "loss": 4.998, + "step": 35710 + }, + { + "epoch": 0.72662353515625, + "grad_norm": 15.878271102905273, + "learning_rate": 9.211175591057831e-06, + "loss": 5.1709, + "step": 35715 + }, + { + "epoch": 0.7267252604166666, + "grad_norm": 22.257675170898438, + "learning_rate": 9.21096010734067e-06, + "loss": 4.9423, + "step": 35720 + }, + { + "epoch": 0.7268269856770834, + "grad_norm": 18.12418556213379, + "learning_rate": 9.21074459671681e-06, + "loss": 5.0318, + "step": 35725 + }, + { + "epoch": 0.7269287109375, + "grad_norm": 17.994976043701172, + "learning_rate": 9.21052905918763e-06, + "loss": 5.3065, + "step": 35730 + }, + { + "epoch": 0.7270304361979166, + "grad_norm": 19.771188735961914, + "learning_rate": 9.210313494754505e-06, + "loss": 5.1333, + "step": 35735 + }, + { + "epoch": 0.7271321614583334, + "grad_norm": 17.60995101928711, + "learning_rate": 9.210097903418816e-06, + "loss": 5.0233, + "step": 35740 + }, + { + "epoch": 0.72723388671875, + "grad_norm": 14.111568450927734, + "learning_rate": 9.209882285181935e-06, + "loss": 5.2764, + "step": 35745 + }, + { + "epoch": 0.7273356119791666, + "grad_norm": 12.698616981506348, + "learning_rate": 9.209666640045244e-06, + "loss": 4.942, + "step": 35750 + }, + { + "epoch": 0.7274373372395834, + "grad_norm": 14.428445816040039, + "learning_rate": 9.209450968010118e-06, + "loss": 5.1583, + "step": 35755 + }, + { + "epoch": 0.7275390625, + "grad_norm": 15.878013610839844, + "learning_rate": 9.209235269077938e-06, + "loss": 5.0733, + "step": 35760 + }, + { + "epoch": 0.7276407877604166, + "grad_norm": 19.61509132385254, + "learning_rate": 9.209019543250081e-06, + "loss": 5.2535, + "step": 35765 + }, + { + "epoch": 0.7277425130208334, + "grad_norm": 16.145334243774414, + "learning_rate": 9.208803790527925e-06, + "loss": 5.229, + "step": 35770 + }, + { + "epoch": 0.72784423828125, + "grad_norm": 18.119205474853516, + "learning_rate": 9.208588010912848e-06, + "loss": 5.0227, + "step": 35775 + }, + { + "epoch": 0.7279459635416666, + "grad_norm": 16.443140029907227, + "learning_rate": 9.208372204406231e-06, + "loss": 5.2253, + "step": 35780 + }, + { + "epoch": 0.7280476888020834, + "grad_norm": 14.906098365783691, + "learning_rate": 9.208156371009449e-06, + "loss": 5.0829, + "step": 35785 + }, + { + "epoch": 0.7281494140625, + "grad_norm": 13.400835037231445, + "learning_rate": 9.207940510723886e-06, + "loss": 5.1032, + "step": 35790 + }, + { + "epoch": 0.7282511393229166, + "grad_norm": 12.179228782653809, + "learning_rate": 9.207724623550918e-06, + "loss": 5.2386, + "step": 35795 + }, + { + "epoch": 0.7283528645833334, + "grad_norm": 19.475887298583984, + "learning_rate": 9.207508709491925e-06, + "loss": 5.0709, + "step": 35800 + }, + { + "epoch": 0.72845458984375, + "grad_norm": 18.57087516784668, + "learning_rate": 9.207292768548288e-06, + "loss": 5.144, + "step": 35805 + }, + { + "epoch": 0.7285563151041666, + "grad_norm": 13.225935935974121, + "learning_rate": 9.207076800721385e-06, + "loss": 5.1362, + "step": 35810 + }, + { + "epoch": 0.7286580403645834, + "grad_norm": 15.300822257995605, + "learning_rate": 9.206860806012597e-06, + "loss": 4.9685, + "step": 35815 + }, + { + "epoch": 0.728759765625, + "grad_norm": 17.036752700805664, + "learning_rate": 9.206644784423304e-06, + "loss": 5.0199, + "step": 35820 + }, + { + "epoch": 0.7288614908854166, + "grad_norm": 14.085079193115234, + "learning_rate": 9.206428735954887e-06, + "loss": 5.0481, + "step": 35825 + }, + { + "epoch": 0.7289632161458334, + "grad_norm": 16.892515182495117, + "learning_rate": 9.206212660608725e-06, + "loss": 5.1695, + "step": 35830 + }, + { + "epoch": 0.72906494140625, + "grad_norm": 14.610936164855957, + "learning_rate": 9.205996558386198e-06, + "loss": 5.1826, + "step": 35835 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 13.573888778686523, + "learning_rate": 9.20578042928869e-06, + "loss": 5.0431, + "step": 35840 + }, + { + "epoch": 0.7292683919270834, + "grad_norm": 20.779226303100586, + "learning_rate": 9.20556427331758e-06, + "loss": 5.0915, + "step": 35845 + }, + { + "epoch": 0.7293701171875, + "grad_norm": 18.10480499267578, + "learning_rate": 9.205348090474248e-06, + "loss": 5.0346, + "step": 35850 + }, + { + "epoch": 0.7294718424479166, + "grad_norm": 18.00360679626465, + "learning_rate": 9.205131880760077e-06, + "loss": 4.8506, + "step": 35855 + }, + { + "epoch": 0.7295735677083334, + "grad_norm": 15.412200927734375, + "learning_rate": 9.204915644176449e-06, + "loss": 4.8988, + "step": 35860 + }, + { + "epoch": 0.72967529296875, + "grad_norm": 14.6448335647583, + "learning_rate": 9.204699380724744e-06, + "loss": 4.7585, + "step": 35865 + }, + { + "epoch": 0.7297770182291666, + "grad_norm": 17.882783889770508, + "learning_rate": 9.204483090406345e-06, + "loss": 4.9864, + "step": 35870 + }, + { + "epoch": 0.7298787434895834, + "grad_norm": 18.587005615234375, + "learning_rate": 9.204266773222635e-06, + "loss": 5.0816, + "step": 35875 + }, + { + "epoch": 0.72998046875, + "grad_norm": 23.508169174194336, + "learning_rate": 9.204050429174993e-06, + "loss": 5.1081, + "step": 35880 + }, + { + "epoch": 0.7300821940104166, + "grad_norm": 17.61883544921875, + "learning_rate": 9.203834058264804e-06, + "loss": 5.049, + "step": 35885 + }, + { + "epoch": 0.7301839192708334, + "grad_norm": 15.772636413574219, + "learning_rate": 9.20361766049345e-06, + "loss": 5.0379, + "step": 35890 + }, + { + "epoch": 0.73028564453125, + "grad_norm": 21.347023010253906, + "learning_rate": 9.203401235862314e-06, + "loss": 5.4314, + "step": 35895 + }, + { + "epoch": 0.7303873697916666, + "grad_norm": 15.537729263305664, + "learning_rate": 9.203184784372778e-06, + "loss": 5.1117, + "step": 35900 + }, + { + "epoch": 0.7304890950520834, + "grad_norm": 21.691987991333008, + "learning_rate": 9.202968306026225e-06, + "loss": 5.2839, + "step": 35905 + }, + { + "epoch": 0.7305908203125, + "grad_norm": 15.067819595336914, + "learning_rate": 9.20275180082404e-06, + "loss": 5.2618, + "step": 35910 + }, + { + "epoch": 0.7306925455729166, + "grad_norm": 14.799094200134277, + "learning_rate": 9.202535268767602e-06, + "loss": 5.0371, + "step": 35915 + }, + { + "epoch": 0.7307942708333334, + "grad_norm": 17.7814884185791, + "learning_rate": 9.202318709858302e-06, + "loss": 5.1411, + "step": 35920 + }, + { + "epoch": 0.73089599609375, + "grad_norm": 19.2491397857666, + "learning_rate": 9.202102124097516e-06, + "loss": 5.1319, + "step": 35925 + }, + { + "epoch": 0.7309977213541666, + "grad_norm": 18.829341888427734, + "learning_rate": 9.201885511486635e-06, + "loss": 5.0251, + "step": 35930 + }, + { + "epoch": 0.7310994466145834, + "grad_norm": 20.146928787231445, + "learning_rate": 9.201668872027036e-06, + "loss": 4.7687, + "step": 35935 + }, + { + "epoch": 0.731201171875, + "grad_norm": 21.156293869018555, + "learning_rate": 9.201452205720109e-06, + "loss": 4.9548, + "step": 35940 + }, + { + "epoch": 0.7313028971354166, + "grad_norm": 18.68021583557129, + "learning_rate": 9.201235512567234e-06, + "loss": 5.5517, + "step": 35945 + }, + { + "epoch": 0.7314046223958334, + "grad_norm": 25.685775756835938, + "learning_rate": 9.201018792569799e-06, + "loss": 4.929, + "step": 35950 + }, + { + "epoch": 0.73150634765625, + "grad_norm": 18.872053146362305, + "learning_rate": 9.200802045729187e-06, + "loss": 5.0174, + "step": 35955 + }, + { + "epoch": 0.7316080729166666, + "grad_norm": 16.205020904541016, + "learning_rate": 9.200585272046785e-06, + "loss": 5.0444, + "step": 35960 + }, + { + "epoch": 0.7317097981770834, + "grad_norm": 16.87994384765625, + "learning_rate": 9.200368471523975e-06, + "loss": 5.2174, + "step": 35965 + }, + { + "epoch": 0.7318115234375, + "grad_norm": 20.734588623046875, + "learning_rate": 9.200151644162142e-06, + "loss": 5.0821, + "step": 35970 + }, + { + "epoch": 0.7319132486979166, + "grad_norm": 15.925212860107422, + "learning_rate": 9.199934789962677e-06, + "loss": 4.8102, + "step": 35975 + }, + { + "epoch": 0.7320149739583334, + "grad_norm": 13.923768043518066, + "learning_rate": 9.19971790892696e-06, + "loss": 5.2479, + "step": 35980 + }, + { + "epoch": 0.73211669921875, + "grad_norm": 18.46208381652832, + "learning_rate": 9.199501001056379e-06, + "loss": 5.0673, + "step": 35985 + }, + { + "epoch": 0.7322184244791666, + "grad_norm": 19.124286651611328, + "learning_rate": 9.19928406635232e-06, + "loss": 5.4091, + "step": 35990 + }, + { + "epoch": 0.7323201497395834, + "grad_norm": 12.1830472946167, + "learning_rate": 9.199067104816168e-06, + "loss": 5.0241, + "step": 35995 + }, + { + "epoch": 0.732421875, + "grad_norm": 15.654253005981445, + "learning_rate": 9.198850116449311e-06, + "loss": 5.1137, + "step": 36000 + }, + { + "epoch": 0.7325236002604166, + "grad_norm": 19.138147354125977, + "learning_rate": 9.198633101253135e-06, + "loss": 5.1616, + "step": 36005 + }, + { + "epoch": 0.7326253255208334, + "grad_norm": 21.09530258178711, + "learning_rate": 9.198416059229025e-06, + "loss": 4.9643, + "step": 36010 + }, + { + "epoch": 0.73272705078125, + "grad_norm": 15.077824592590332, + "learning_rate": 9.198198990378372e-06, + "loss": 5.3052, + "step": 36015 + }, + { + "epoch": 0.7328287760416666, + "grad_norm": 18.26338005065918, + "learning_rate": 9.197981894702558e-06, + "loss": 5.2387, + "step": 36020 + }, + { + "epoch": 0.7329305013020834, + "grad_norm": 17.447689056396484, + "learning_rate": 9.19776477220297e-06, + "loss": 4.8468, + "step": 36025 + }, + { + "epoch": 0.7330322265625, + "grad_norm": 18.14933204650879, + "learning_rate": 9.197547622881e-06, + "loss": 5.2943, + "step": 36030 + }, + { + "epoch": 0.7331339518229166, + "grad_norm": 17.01972007751465, + "learning_rate": 9.197330446738034e-06, + "loss": 5.1859, + "step": 36035 + }, + { + "epoch": 0.7332356770833334, + "grad_norm": 19.029335021972656, + "learning_rate": 9.197113243775458e-06, + "loss": 5.1633, + "step": 36040 + }, + { + "epoch": 0.73333740234375, + "grad_norm": 16.650789260864258, + "learning_rate": 9.19689601399466e-06, + "loss": 5.177, + "step": 36045 + }, + { + "epoch": 0.7334391276041666, + "grad_norm": 18.2069149017334, + "learning_rate": 9.19667875739703e-06, + "loss": 5.1089, + "step": 36050 + }, + { + "epoch": 0.7335408528645834, + "grad_norm": 16.498889923095703, + "learning_rate": 9.196461473983956e-06, + "loss": 5.324, + "step": 36055 + }, + { + "epoch": 0.733642578125, + "grad_norm": 19.947113037109375, + "learning_rate": 9.196244163756823e-06, + "loss": 4.8589, + "step": 36060 + }, + { + "epoch": 0.7337443033854166, + "grad_norm": 18.63365364074707, + "learning_rate": 9.196026826717022e-06, + "loss": 5.0282, + "step": 36065 + }, + { + "epoch": 0.7338460286458334, + "grad_norm": 12.928183555603027, + "learning_rate": 9.195809462865944e-06, + "loss": 4.9789, + "step": 36070 + }, + { + "epoch": 0.73394775390625, + "grad_norm": 14.62520980834961, + "learning_rate": 9.195592072204973e-06, + "loss": 5.0427, + "step": 36075 + }, + { + "epoch": 0.7340494791666666, + "grad_norm": 15.953737258911133, + "learning_rate": 9.1953746547355e-06, + "loss": 5.1377, + "step": 36080 + }, + { + "epoch": 0.7341512044270834, + "grad_norm": 25.443578720092773, + "learning_rate": 9.195157210458916e-06, + "loss": 5.3997, + "step": 36085 + }, + { + "epoch": 0.7342529296875, + "grad_norm": 19.4981689453125, + "learning_rate": 9.194939739376609e-06, + "loss": 4.7754, + "step": 36090 + }, + { + "epoch": 0.7343546549479166, + "grad_norm": 22.378013610839844, + "learning_rate": 9.194722241489968e-06, + "loss": 5.0183, + "step": 36095 + }, + { + "epoch": 0.7344563802083334, + "grad_norm": 12.938876152038574, + "learning_rate": 9.194504716800386e-06, + "loss": 4.9011, + "step": 36100 + }, + { + "epoch": 0.73455810546875, + "grad_norm": 17.44036102294922, + "learning_rate": 9.194287165309248e-06, + "loss": 5.5147, + "step": 36105 + }, + { + "epoch": 0.7346598307291666, + "grad_norm": 16.51148223876953, + "learning_rate": 9.194069587017947e-06, + "loss": 4.8866, + "step": 36110 + }, + { + "epoch": 0.7347615559895834, + "grad_norm": 28.979114532470703, + "learning_rate": 9.193851981927873e-06, + "loss": 5.1632, + "step": 36115 + }, + { + "epoch": 0.73486328125, + "grad_norm": 22.963699340820312, + "learning_rate": 9.193634350040415e-06, + "loss": 4.9661, + "step": 36120 + }, + { + "epoch": 0.7349650065104166, + "grad_norm": 14.424213409423828, + "learning_rate": 9.193416691356966e-06, + "loss": 4.9459, + "step": 36125 + }, + { + "epoch": 0.7350667317708334, + "grad_norm": 17.39749526977539, + "learning_rate": 9.193199005878916e-06, + "loss": 5.0816, + "step": 36130 + }, + { + "epoch": 0.73516845703125, + "grad_norm": 14.196438789367676, + "learning_rate": 9.192981293607655e-06, + "loss": 5.0275, + "step": 36135 + }, + { + "epoch": 0.7352701822916666, + "grad_norm": 16.777854919433594, + "learning_rate": 9.192763554544574e-06, + "loss": 4.9741, + "step": 36140 + }, + { + "epoch": 0.7353719075520834, + "grad_norm": 20.344175338745117, + "learning_rate": 9.192545788691065e-06, + "loss": 5.2049, + "step": 36145 + }, + { + "epoch": 0.7354736328125, + "grad_norm": 21.58053970336914, + "learning_rate": 9.192327996048518e-06, + "loss": 5.0181, + "step": 36150 + }, + { + "epoch": 0.7355753580729166, + "grad_norm": 19.011035919189453, + "learning_rate": 9.192110176618328e-06, + "loss": 5.3555, + "step": 36155 + }, + { + "epoch": 0.7356770833333334, + "grad_norm": 24.017271041870117, + "learning_rate": 9.191892330401883e-06, + "loss": 5.2483, + "step": 36160 + }, + { + "epoch": 0.73577880859375, + "grad_norm": 18.53331756591797, + "learning_rate": 9.19167445740058e-06, + "loss": 5.1768, + "step": 36165 + }, + { + "epoch": 0.7358805338541666, + "grad_norm": 13.788512229919434, + "learning_rate": 9.191456557615804e-06, + "loss": 4.8755, + "step": 36170 + }, + { + "epoch": 0.7359822591145834, + "grad_norm": 16.579280853271484, + "learning_rate": 9.191238631048953e-06, + "loss": 4.8773, + "step": 36175 + }, + { + "epoch": 0.736083984375, + "grad_norm": 19.598722457885742, + "learning_rate": 9.191020677701417e-06, + "loss": 5.0012, + "step": 36180 + }, + { + "epoch": 0.7361857096354166, + "grad_norm": 16.96827507019043, + "learning_rate": 9.19080269757459e-06, + "loss": 5.2334, + "step": 36185 + }, + { + "epoch": 0.7362874348958334, + "grad_norm": 15.98430061340332, + "learning_rate": 9.190584690669863e-06, + "loss": 5.0727, + "step": 36190 + }, + { + "epoch": 0.73638916015625, + "grad_norm": 15.488249778747559, + "learning_rate": 9.190366656988632e-06, + "loss": 5.2558, + "step": 36195 + }, + { + "epoch": 0.7364908854166666, + "grad_norm": 13.022311210632324, + "learning_rate": 9.190148596532287e-06, + "loss": 4.9567, + "step": 36200 + }, + { + "epoch": 0.7365926106770834, + "grad_norm": 16.838665008544922, + "learning_rate": 9.189930509302223e-06, + "loss": 5.059, + "step": 36205 + }, + { + "epoch": 0.7366943359375, + "grad_norm": 14.54895305633545, + "learning_rate": 9.189712395299833e-06, + "loss": 5.1641, + "step": 36210 + }, + { + "epoch": 0.7367960611979166, + "grad_norm": 14.889993667602539, + "learning_rate": 9.18949425452651e-06, + "loss": 5.1159, + "step": 36215 + }, + { + "epoch": 0.7368977864583334, + "grad_norm": 12.726398468017578, + "learning_rate": 9.189276086983648e-06, + "loss": 5.0403, + "step": 36220 + }, + { + "epoch": 0.73699951171875, + "grad_norm": 17.100448608398438, + "learning_rate": 9.189057892672643e-06, + "loss": 5.2332, + "step": 36225 + }, + { + "epoch": 0.7371012369791666, + "grad_norm": 16.52337074279785, + "learning_rate": 9.18883967159489e-06, + "loss": 5.0644, + "step": 36230 + }, + { + "epoch": 0.7372029622395834, + "grad_norm": 25.317039489746094, + "learning_rate": 9.188621423751777e-06, + "loss": 5.1775, + "step": 36235 + }, + { + "epoch": 0.7373046875, + "grad_norm": 16.358184814453125, + "learning_rate": 9.188403149144704e-06, + "loss": 4.9027, + "step": 36240 + }, + { + "epoch": 0.7374064127604166, + "grad_norm": 17.99054718017578, + "learning_rate": 9.188184847775066e-06, + "loss": 5.1829, + "step": 36245 + }, + { + "epoch": 0.7375081380208334, + "grad_norm": 19.189197540283203, + "learning_rate": 9.187966519644255e-06, + "loss": 5.5807, + "step": 36250 + }, + { + "epoch": 0.73760986328125, + "grad_norm": 12.34333610534668, + "learning_rate": 9.187748164753668e-06, + "loss": 5.1266, + "step": 36255 + }, + { + "epoch": 0.7377115885416666, + "grad_norm": 20.361589431762695, + "learning_rate": 9.1875297831047e-06, + "loss": 4.9071, + "step": 36260 + }, + { + "epoch": 0.7378133138020834, + "grad_norm": 17.451745986938477, + "learning_rate": 9.187311374698746e-06, + "loss": 4.9652, + "step": 36265 + }, + { + "epoch": 0.7379150390625, + "grad_norm": 17.64006233215332, + "learning_rate": 9.1870929395372e-06, + "loss": 5.1282, + "step": 36270 + }, + { + "epoch": 0.7380167643229166, + "grad_norm": 20.753355026245117, + "learning_rate": 9.186874477621461e-06, + "loss": 4.9342, + "step": 36275 + }, + { + "epoch": 0.7381184895833334, + "grad_norm": 17.431884765625, + "learning_rate": 9.186655988952921e-06, + "loss": 4.9518, + "step": 36280 + }, + { + "epoch": 0.73822021484375, + "grad_norm": 18.749858856201172, + "learning_rate": 9.186437473532982e-06, + "loss": 5.3021, + "step": 36285 + }, + { + "epoch": 0.7383219401041666, + "grad_norm": 19.0269775390625, + "learning_rate": 9.186218931363032e-06, + "loss": 5.6276, + "step": 36290 + }, + { + "epoch": 0.7384236653645834, + "grad_norm": 16.10736656188965, + "learning_rate": 9.186000362444475e-06, + "loss": 5.2293, + "step": 36295 + }, + { + "epoch": 0.738525390625, + "grad_norm": 23.45927619934082, + "learning_rate": 9.185781766778702e-06, + "loss": 4.8984, + "step": 36300 + }, + { + "epoch": 0.7386271158854166, + "grad_norm": 18.773086547851562, + "learning_rate": 9.185563144367116e-06, + "loss": 4.842, + "step": 36305 + }, + { + "epoch": 0.7387288411458334, + "grad_norm": 15.609525680541992, + "learning_rate": 9.185344495211108e-06, + "loss": 5.2177, + "step": 36310 + }, + { + "epoch": 0.73883056640625, + "grad_norm": 20.687332153320312, + "learning_rate": 9.185125819312076e-06, + "loss": 4.9002, + "step": 36315 + }, + { + "epoch": 0.7389322916666666, + "grad_norm": 16.35346794128418, + "learning_rate": 9.18490711667142e-06, + "loss": 5.1088, + "step": 36320 + }, + { + "epoch": 0.7390340169270834, + "grad_norm": 19.51848602294922, + "learning_rate": 9.184688387290536e-06, + "loss": 5.348, + "step": 36325 + }, + { + "epoch": 0.7391357421875, + "grad_norm": 13.092832565307617, + "learning_rate": 9.18446963117082e-06, + "loss": 4.9786, + "step": 36330 + }, + { + "epoch": 0.7392374674479166, + "grad_norm": 16.66027069091797, + "learning_rate": 9.184250848313674e-06, + "loss": 5.2897, + "step": 36335 + }, + { + "epoch": 0.7393391927083334, + "grad_norm": 14.062280654907227, + "learning_rate": 9.184032038720491e-06, + "loss": 5.1373, + "step": 36340 + }, + { + "epoch": 0.73944091796875, + "grad_norm": 14.729297637939453, + "learning_rate": 9.183813202392674e-06, + "loss": 5.0609, + "step": 36345 + }, + { + "epoch": 0.7395426432291666, + "grad_norm": 14.66845703125, + "learning_rate": 9.183594339331617e-06, + "loss": 4.8931, + "step": 36350 + }, + { + "epoch": 0.7396443684895834, + "grad_norm": 20.665836334228516, + "learning_rate": 9.18337544953872e-06, + "loss": 5.1947, + "step": 36355 + }, + { + "epoch": 0.73974609375, + "grad_norm": 14.843277931213379, + "learning_rate": 9.183156533015384e-06, + "loss": 4.8336, + "step": 36360 + }, + { + "epoch": 0.7398478190104166, + "grad_norm": 28.989534378051758, + "learning_rate": 9.182937589763003e-06, + "loss": 5.0466, + "step": 36365 + }, + { + "epoch": 0.7399495442708334, + "grad_norm": 16.542877197265625, + "learning_rate": 9.18271861978298e-06, + "loss": 5.2375, + "step": 36370 + }, + { + "epoch": 0.74005126953125, + "grad_norm": 14.293092727661133, + "learning_rate": 9.182499623076712e-06, + "loss": 5.0022, + "step": 36375 + }, + { + "epoch": 0.7401529947916666, + "grad_norm": 16.264537811279297, + "learning_rate": 9.182280599645602e-06, + "loss": 4.8518, + "step": 36380 + }, + { + "epoch": 0.7402547200520834, + "grad_norm": 18.14327621459961, + "learning_rate": 9.182061549491044e-06, + "loss": 5.0724, + "step": 36385 + }, + { + "epoch": 0.7403564453125, + "grad_norm": 15.091423988342285, + "learning_rate": 9.18184247261444e-06, + "loss": 5.0144, + "step": 36390 + }, + { + "epoch": 0.7404581705729166, + "grad_norm": 17.286611557006836, + "learning_rate": 9.18162336901719e-06, + "loss": 5.1895, + "step": 36395 + }, + { + "epoch": 0.7405598958333334, + "grad_norm": 20.53700828552246, + "learning_rate": 9.181404238700696e-06, + "loss": 5.0521, + "step": 36400 + }, + { + "epoch": 0.74066162109375, + "grad_norm": 17.059349060058594, + "learning_rate": 9.181185081666355e-06, + "loss": 5.0845, + "step": 36405 + }, + { + "epoch": 0.7407633463541666, + "grad_norm": 15.799078941345215, + "learning_rate": 9.180965897915568e-06, + "loss": 5.0674, + "step": 36410 + }, + { + "epoch": 0.7408650716145834, + "grad_norm": 15.248836517333984, + "learning_rate": 9.180746687449737e-06, + "loss": 4.8392, + "step": 36415 + }, + { + "epoch": 0.740966796875, + "grad_norm": 15.12316608428955, + "learning_rate": 9.180527450270262e-06, + "loss": 5.4196, + "step": 36420 + }, + { + "epoch": 0.7410685221354166, + "grad_norm": 15.160975456237793, + "learning_rate": 9.180308186378542e-06, + "loss": 5.0095, + "step": 36425 + }, + { + "epoch": 0.7411702473958334, + "grad_norm": 18.79824447631836, + "learning_rate": 9.180088895775981e-06, + "loss": 5.1845, + "step": 36430 + }, + { + "epoch": 0.74127197265625, + "grad_norm": 14.58831787109375, + "learning_rate": 9.179869578463979e-06, + "loss": 5.217, + "step": 36435 + }, + { + "epoch": 0.7413736979166666, + "grad_norm": 14.207623481750488, + "learning_rate": 9.179650234443937e-06, + "loss": 5.0805, + "step": 36440 + }, + { + "epoch": 0.7414754231770834, + "grad_norm": 18.703079223632812, + "learning_rate": 9.179430863717258e-06, + "loss": 5.5514, + "step": 36445 + }, + { + "epoch": 0.7415771484375, + "grad_norm": 20.818628311157227, + "learning_rate": 9.179211466285341e-06, + "loss": 5.0155, + "step": 36450 + }, + { + "epoch": 0.7416788736979166, + "grad_norm": 21.695087432861328, + "learning_rate": 9.178992042149589e-06, + "loss": 5.196, + "step": 36455 + }, + { + "epoch": 0.7417805989583334, + "grad_norm": 17.22471046447754, + "learning_rate": 9.178772591311404e-06, + "loss": 4.9279, + "step": 36460 + }, + { + "epoch": 0.74188232421875, + "grad_norm": 14.941105842590332, + "learning_rate": 9.178553113772192e-06, + "loss": 5.1022, + "step": 36465 + }, + { + "epoch": 0.7419840494791666, + "grad_norm": 16.122615814208984, + "learning_rate": 9.17833360953335e-06, + "loss": 5.0967, + "step": 36470 + }, + { + "epoch": 0.7420857747395834, + "grad_norm": 18.495084762573242, + "learning_rate": 9.178114078596281e-06, + "loss": 5.209, + "step": 36475 + }, + { + "epoch": 0.7421875, + "grad_norm": 17.7525577545166, + "learning_rate": 9.177894520962392e-06, + "loss": 5.1974, + "step": 36480 + }, + { + "epoch": 0.7422892252604166, + "grad_norm": 16.331045150756836, + "learning_rate": 9.177674936633083e-06, + "loss": 5.0837, + "step": 36485 + }, + { + "epoch": 0.7423909505208334, + "grad_norm": 19.741727828979492, + "learning_rate": 9.177455325609756e-06, + "loss": 4.9037, + "step": 36490 + }, + { + "epoch": 0.74249267578125, + "grad_norm": 15.735695838928223, + "learning_rate": 9.177235687893815e-06, + "loss": 4.8765, + "step": 36495 + }, + { + "epoch": 0.7425944010416666, + "grad_norm": 29.134868621826172, + "learning_rate": 9.177016023486666e-06, + "loss": 5.4263, + "step": 36500 + }, + { + "epoch": 0.7426961263020834, + "grad_norm": 15.970190048217773, + "learning_rate": 9.176796332389709e-06, + "loss": 5.1418, + "step": 36505 + }, + { + "epoch": 0.7427978515625, + "grad_norm": 16.42106056213379, + "learning_rate": 9.17657661460435e-06, + "loss": 5.156, + "step": 36510 + }, + { + "epoch": 0.7428995768229166, + "grad_norm": 18.028079986572266, + "learning_rate": 9.176356870131994e-06, + "loss": 5.0048, + "step": 36515 + }, + { + "epoch": 0.7430013020833334, + "grad_norm": 19.67034912109375, + "learning_rate": 9.176137098974041e-06, + "loss": 5.2474, + "step": 36520 + }, + { + "epoch": 0.74310302734375, + "grad_norm": 16.001298904418945, + "learning_rate": 9.175917301131896e-06, + "loss": 4.9249, + "step": 36525 + }, + { + "epoch": 0.7432047526041666, + "grad_norm": 24.989795684814453, + "learning_rate": 9.175697476606968e-06, + "loss": 5.0737, + "step": 36530 + }, + { + "epoch": 0.7433064778645834, + "grad_norm": 19.516931533813477, + "learning_rate": 9.175477625400659e-06, + "loss": 5.1366, + "step": 36535 + }, + { + "epoch": 0.743408203125, + "grad_norm": 17.6110897064209, + "learning_rate": 9.175257747514373e-06, + "loss": 5.0425, + "step": 36540 + }, + { + "epoch": 0.7435099283854166, + "grad_norm": 20.431440353393555, + "learning_rate": 9.175037842949514e-06, + "loss": 4.8573, + "step": 36545 + }, + { + "epoch": 0.7436116536458334, + "grad_norm": 13.594186782836914, + "learning_rate": 9.17481791170749e-06, + "loss": 5.0415, + "step": 36550 + }, + { + "epoch": 0.74371337890625, + "grad_norm": 19.46485137939453, + "learning_rate": 9.174597953789704e-06, + "loss": 5.2711, + "step": 36555 + }, + { + "epoch": 0.7438151041666666, + "grad_norm": 17.079017639160156, + "learning_rate": 9.174377969197562e-06, + "loss": 4.8793, + "step": 36560 + }, + { + "epoch": 0.7439168294270834, + "grad_norm": 17.911453247070312, + "learning_rate": 9.17415795793247e-06, + "loss": 4.8208, + "step": 36565 + }, + { + "epoch": 0.7440185546875, + "grad_norm": 21.133577346801758, + "learning_rate": 9.173937919995835e-06, + "loss": 5.1245, + "step": 36570 + }, + { + "epoch": 0.7441202799479166, + "grad_norm": 15.159852027893066, + "learning_rate": 9.17371785538906e-06, + "loss": 4.866, + "step": 36575 + }, + { + "epoch": 0.7442220052083334, + "grad_norm": 22.12006378173828, + "learning_rate": 9.173497764113553e-06, + "loss": 4.9179, + "step": 36580 + }, + { + "epoch": 0.74432373046875, + "grad_norm": 11.710184097290039, + "learning_rate": 9.17327764617072e-06, + "loss": 5.2795, + "step": 36585 + }, + { + "epoch": 0.7444254557291666, + "grad_norm": 12.393409729003906, + "learning_rate": 9.173057501561968e-06, + "loss": 5.3545, + "step": 36590 + }, + { + "epoch": 0.7445271809895834, + "grad_norm": 20.76700782775879, + "learning_rate": 9.172837330288705e-06, + "loss": 5.2067, + "step": 36595 + }, + { + "epoch": 0.74462890625, + "grad_norm": 20.114839553833008, + "learning_rate": 9.172617132352334e-06, + "loss": 4.7618, + "step": 36600 + }, + { + "epoch": 0.7447306315104166, + "grad_norm": 19.855459213256836, + "learning_rate": 9.172396907754264e-06, + "loss": 4.9632, + "step": 36605 + }, + { + "epoch": 0.7448323567708334, + "grad_norm": 21.43327522277832, + "learning_rate": 9.172176656495904e-06, + "loss": 5.2123, + "step": 36610 + }, + { + "epoch": 0.74493408203125, + "grad_norm": 17.882692337036133, + "learning_rate": 9.171956378578658e-06, + "loss": 5.4478, + "step": 36615 + }, + { + "epoch": 0.7450358072916666, + "grad_norm": 12.904261589050293, + "learning_rate": 9.171736074003936e-06, + "loss": 5.1737, + "step": 36620 + }, + { + "epoch": 0.7451375325520834, + "grad_norm": 20.7202205657959, + "learning_rate": 9.171515742773145e-06, + "loss": 5.1741, + "step": 36625 + }, + { + "epoch": 0.7452392578125, + "grad_norm": 15.834383964538574, + "learning_rate": 9.171295384887691e-06, + "loss": 4.9143, + "step": 36630 + }, + { + "epoch": 0.7453409830729166, + "grad_norm": 15.168669700622559, + "learning_rate": 9.171075000348985e-06, + "loss": 5.1106, + "step": 36635 + }, + { + "epoch": 0.7454427083333334, + "grad_norm": 16.184864044189453, + "learning_rate": 9.170854589158433e-06, + "loss": 5.1087, + "step": 36640 + }, + { + "epoch": 0.74554443359375, + "grad_norm": 14.807124137878418, + "learning_rate": 9.170634151317445e-06, + "loss": 4.9074, + "step": 36645 + }, + { + "epoch": 0.7456461588541666, + "grad_norm": 14.755478858947754, + "learning_rate": 9.170413686827428e-06, + "loss": 4.8965, + "step": 36650 + }, + { + "epoch": 0.7457478841145834, + "grad_norm": 15.84883975982666, + "learning_rate": 9.170193195689791e-06, + "loss": 4.9696, + "step": 36655 + }, + { + "epoch": 0.745849609375, + "grad_norm": 15.391033172607422, + "learning_rate": 9.169972677905945e-06, + "loss": 4.9932, + "step": 36660 + }, + { + "epoch": 0.7459513346354166, + "grad_norm": 14.601147651672363, + "learning_rate": 9.169752133477297e-06, + "loss": 5.0422, + "step": 36665 + }, + { + "epoch": 0.7460530598958334, + "grad_norm": 30.939626693725586, + "learning_rate": 9.169531562405256e-06, + "loss": 5.0101, + "step": 36670 + }, + { + "epoch": 0.74615478515625, + "grad_norm": 13.973281860351562, + "learning_rate": 9.169310964691232e-06, + "loss": 4.9917, + "step": 36675 + }, + { + "epoch": 0.7462565104166666, + "grad_norm": 16.415855407714844, + "learning_rate": 9.169090340336633e-06, + "loss": 5.2641, + "step": 36680 + }, + { + "epoch": 0.7463582356770834, + "grad_norm": 22.76760482788086, + "learning_rate": 9.168869689342871e-06, + "loss": 5.4978, + "step": 36685 + }, + { + "epoch": 0.7464599609375, + "grad_norm": 15.6393404006958, + "learning_rate": 9.168649011711357e-06, + "loss": 4.8266, + "step": 36690 + }, + { + "epoch": 0.7465616861979166, + "grad_norm": 15.036974906921387, + "learning_rate": 9.168428307443498e-06, + "loss": 4.9987, + "step": 36695 + }, + { + "epoch": 0.7466634114583334, + "grad_norm": 17.457195281982422, + "learning_rate": 9.168207576540704e-06, + "loss": 5.1709, + "step": 36700 + }, + { + "epoch": 0.74676513671875, + "grad_norm": 15.871780395507812, + "learning_rate": 9.16798681900439e-06, + "loss": 5.1224, + "step": 36705 + }, + { + "epoch": 0.7468668619791666, + "grad_norm": 24.477901458740234, + "learning_rate": 9.167766034835959e-06, + "loss": 5.1856, + "step": 36710 + }, + { + "epoch": 0.7469685872395834, + "grad_norm": 16.68035316467285, + "learning_rate": 9.167545224036828e-06, + "loss": 5.2883, + "step": 36715 + }, + { + "epoch": 0.7470703125, + "grad_norm": 20.313692092895508, + "learning_rate": 9.167324386608406e-06, + "loss": 5.1299, + "step": 36720 + }, + { + "epoch": 0.7471720377604166, + "grad_norm": 14.299885749816895, + "learning_rate": 9.167103522552103e-06, + "loss": 5.041, + "step": 36725 + }, + { + "epoch": 0.7472737630208334, + "grad_norm": 17.81897735595703, + "learning_rate": 9.166882631869334e-06, + "loss": 5.1585, + "step": 36730 + }, + { + "epoch": 0.74737548828125, + "grad_norm": 16.55703353881836, + "learning_rate": 9.166661714561505e-06, + "loss": 4.9812, + "step": 36735 + }, + { + "epoch": 0.7474772135416666, + "grad_norm": 21.912317276000977, + "learning_rate": 9.166440770630032e-06, + "loss": 5.1605, + "step": 36740 + }, + { + "epoch": 0.7475789388020834, + "grad_norm": 27.465051651000977, + "learning_rate": 9.166219800076325e-06, + "loss": 5.4124, + "step": 36745 + }, + { + "epoch": 0.7476806640625, + "grad_norm": 15.262548446655273, + "learning_rate": 9.165998802901795e-06, + "loss": 5.1303, + "step": 36750 + }, + { + "epoch": 0.7477823893229166, + "grad_norm": 17.82813835144043, + "learning_rate": 9.165777779107856e-06, + "loss": 5.052, + "step": 36755 + }, + { + "epoch": 0.7478841145833334, + "grad_norm": 17.218631744384766, + "learning_rate": 9.16555672869592e-06, + "loss": 5.2407, + "step": 36760 + }, + { + "epoch": 0.74798583984375, + "grad_norm": 20.290536880493164, + "learning_rate": 9.165335651667398e-06, + "loss": 5.2862, + "step": 36765 + }, + { + "epoch": 0.7480875651041666, + "grad_norm": 15.588096618652344, + "learning_rate": 9.165114548023703e-06, + "loss": 4.9885, + "step": 36770 + }, + { + "epoch": 0.7481892903645834, + "grad_norm": 17.030899047851562, + "learning_rate": 9.164893417766248e-06, + "loss": 5.2907, + "step": 36775 + }, + { + "epoch": 0.748291015625, + "grad_norm": 15.521836280822754, + "learning_rate": 9.164672260896448e-06, + "loss": 4.9797, + "step": 36780 + }, + { + "epoch": 0.7483927408854166, + "grad_norm": 17.150087356567383, + "learning_rate": 9.164451077415712e-06, + "loss": 5.1116, + "step": 36785 + }, + { + "epoch": 0.7484944661458334, + "grad_norm": 20.158946990966797, + "learning_rate": 9.164229867325457e-06, + "loss": 5.3358, + "step": 36790 + }, + { + "epoch": 0.74859619140625, + "grad_norm": 18.272354125976562, + "learning_rate": 9.164008630627094e-06, + "loss": 5.0104, + "step": 36795 + }, + { + "epoch": 0.7486979166666666, + "grad_norm": 18.511089324951172, + "learning_rate": 9.16378736732204e-06, + "loss": 4.8382, + "step": 36800 + }, + { + "epoch": 0.7487996419270834, + "grad_norm": 18.177841186523438, + "learning_rate": 9.163566077411703e-06, + "loss": 5.0413, + "step": 36805 + }, + { + "epoch": 0.7489013671875, + "grad_norm": 13.852638244628906, + "learning_rate": 9.163344760897504e-06, + "loss": 4.9667, + "step": 36810 + }, + { + "epoch": 0.7490030924479166, + "grad_norm": 15.622172355651855, + "learning_rate": 9.163123417780851e-06, + "loss": 5.2103, + "step": 36815 + }, + { + "epoch": 0.7491048177083334, + "grad_norm": 19.15264320373535, + "learning_rate": 9.162902048063162e-06, + "loss": 5.2651, + "step": 36820 + }, + { + "epoch": 0.74920654296875, + "grad_norm": 15.303886413574219, + "learning_rate": 9.16268065174585e-06, + "loss": 5.0331, + "step": 36825 + }, + { + "epoch": 0.7493082682291666, + "grad_norm": 16.772443771362305, + "learning_rate": 9.16245922883033e-06, + "loss": 5.3578, + "step": 36830 + }, + { + "epoch": 0.7494099934895834, + "grad_norm": 15.988479614257812, + "learning_rate": 9.162237779318018e-06, + "loss": 5.1894, + "step": 36835 + }, + { + "epoch": 0.74951171875, + "grad_norm": 16.875856399536133, + "learning_rate": 9.162016303210326e-06, + "loss": 5.1512, + "step": 36840 + }, + { + "epoch": 0.7496134440104166, + "grad_norm": 18.80177116394043, + "learning_rate": 9.161794800508673e-06, + "loss": 5.1804, + "step": 36845 + }, + { + "epoch": 0.7497151692708334, + "grad_norm": 16.359338760375977, + "learning_rate": 9.161573271214471e-06, + "loss": 5.164, + "step": 36850 + }, + { + "epoch": 0.74981689453125, + "grad_norm": 19.86432647705078, + "learning_rate": 9.161351715329137e-06, + "loss": 5.0234, + "step": 36855 + }, + { + "epoch": 0.7499186197916666, + "grad_norm": 18.550922393798828, + "learning_rate": 9.161130132854087e-06, + "loss": 4.8853, + "step": 36860 + }, + { + "epoch": 0.75, + "eval_loss": 5.0802226066589355, + "eval_runtime": 107.2556, + "eval_samples_per_second": 18.712, + "eval_steps_per_second": 9.361, + "step": 36864 + }, + { + "epoch": 0.7500203450520834, + "grad_norm": 15.717974662780762, + "learning_rate": 9.160908523790737e-06, + "loss": 5.2716, + "step": 36865 + }, + { + "epoch": 0.7501220703125, + "grad_norm": 21.470962524414062, + "learning_rate": 9.160686888140502e-06, + "loss": 5.2807, + "step": 36870 + }, + { + "epoch": 0.7502237955729166, + "grad_norm": 13.37724781036377, + "learning_rate": 9.160465225904797e-06, + "loss": 4.9399, + "step": 36875 + }, + { + "epoch": 0.7503255208333334, + "grad_norm": 15.28072452545166, + "learning_rate": 9.16024353708504e-06, + "loss": 5.2532, + "step": 36880 + }, + { + "epoch": 0.75042724609375, + "grad_norm": 16.816984176635742, + "learning_rate": 9.16002182168265e-06, + "loss": 5.0852, + "step": 36885 + }, + { + "epoch": 0.7505289713541666, + "grad_norm": 14.177842140197754, + "learning_rate": 9.15980007969904e-06, + "loss": 5.1196, + "step": 36890 + }, + { + "epoch": 0.7506306966145834, + "grad_norm": 15.167383193969727, + "learning_rate": 9.159578311135626e-06, + "loss": 4.7559, + "step": 36895 + }, + { + "epoch": 0.750732421875, + "grad_norm": 16.433259963989258, + "learning_rate": 9.159356515993829e-06, + "loss": 5.0972, + "step": 36900 + }, + { + "epoch": 0.7508341471354166, + "grad_norm": 15.900951385498047, + "learning_rate": 9.159134694275065e-06, + "loss": 5.1816, + "step": 36905 + }, + { + "epoch": 0.7509358723958334, + "grad_norm": 19.588964462280273, + "learning_rate": 9.158912845980749e-06, + "loss": 5.1655, + "step": 36910 + }, + { + "epoch": 0.75103759765625, + "grad_norm": 17.821395874023438, + "learning_rate": 9.1586909711123e-06, + "loss": 4.9325, + "step": 36915 + }, + { + "epoch": 0.7511393229166666, + "grad_norm": 24.131196975708008, + "learning_rate": 9.158469069671137e-06, + "loss": 4.9975, + "step": 36920 + }, + { + "epoch": 0.7512410481770834, + "grad_norm": 14.133079528808594, + "learning_rate": 9.158247141658676e-06, + "loss": 5.1791, + "step": 36925 + }, + { + "epoch": 0.7513427734375, + "grad_norm": 17.21076202392578, + "learning_rate": 9.158025187076337e-06, + "loss": 4.7687, + "step": 36930 + }, + { + "epoch": 0.7514444986979166, + "grad_norm": 13.554998397827148, + "learning_rate": 9.157803205925535e-06, + "loss": 5.1024, + "step": 36935 + }, + { + "epoch": 0.7515462239583334, + "grad_norm": 22.526395797729492, + "learning_rate": 9.15758119820769e-06, + "loss": 5.5267, + "step": 36940 + }, + { + "epoch": 0.75164794921875, + "grad_norm": 12.59363079071045, + "learning_rate": 9.157359163924222e-06, + "loss": 4.9209, + "step": 36945 + }, + { + "epoch": 0.7517496744791666, + "grad_norm": 19.165348052978516, + "learning_rate": 9.15713710307655e-06, + "loss": 4.897, + "step": 36950 + }, + { + "epoch": 0.7518513997395834, + "grad_norm": 16.731475830078125, + "learning_rate": 9.156915015666091e-06, + "loss": 4.8167, + "step": 36955 + }, + { + "epoch": 0.751953125, + "grad_norm": 16.487009048461914, + "learning_rate": 9.156692901694263e-06, + "loss": 5.1817, + "step": 36960 + }, + { + "epoch": 0.7520548502604166, + "grad_norm": 17.33098793029785, + "learning_rate": 9.156470761162488e-06, + "loss": 5.0819, + "step": 36965 + }, + { + "epoch": 0.7521565755208334, + "grad_norm": 16.780330657958984, + "learning_rate": 9.156248594072184e-06, + "loss": 5.1995, + "step": 36970 + }, + { + "epoch": 0.75225830078125, + "grad_norm": 17.68711280822754, + "learning_rate": 9.156026400424771e-06, + "loss": 5.3312, + "step": 36975 + }, + { + "epoch": 0.7523600260416666, + "grad_norm": 16.173137664794922, + "learning_rate": 9.155804180221668e-06, + "loss": 5.171, + "step": 36980 + }, + { + "epoch": 0.7524617513020834, + "grad_norm": 15.171792030334473, + "learning_rate": 9.155581933464297e-06, + "loss": 5.1228, + "step": 36985 + }, + { + "epoch": 0.7525634765625, + "grad_norm": 19.392986297607422, + "learning_rate": 9.155359660154076e-06, + "loss": 5.067, + "step": 36990 + }, + { + "epoch": 0.7526652018229166, + "grad_norm": 18.615327835083008, + "learning_rate": 9.155137360292426e-06, + "loss": 5.0045, + "step": 36995 + }, + { + "epoch": 0.7527669270833334, + "grad_norm": 18.6655330657959, + "learning_rate": 9.154915033880766e-06, + "loss": 5.0426, + "step": 37000 + }, + { + "epoch": 0.75286865234375, + "grad_norm": 18.05080223083496, + "learning_rate": 9.154692680920518e-06, + "loss": 5.0827, + "step": 37005 + }, + { + "epoch": 0.7529703776041666, + "grad_norm": 16.393539428710938, + "learning_rate": 9.154470301413105e-06, + "loss": 5.0886, + "step": 37010 + }, + { + "epoch": 0.7530721028645834, + "grad_norm": 12.53264331817627, + "learning_rate": 9.154247895359943e-06, + "loss": 4.7973, + "step": 37015 + }, + { + "epoch": 0.753173828125, + "grad_norm": 15.355181694030762, + "learning_rate": 9.154025462762457e-06, + "loss": 4.9212, + "step": 37020 + }, + { + "epoch": 0.7532755533854166, + "grad_norm": 23.829681396484375, + "learning_rate": 9.153803003622066e-06, + "loss": 5.3036, + "step": 37025 + }, + { + "epoch": 0.7533772786458334, + "grad_norm": 15.332387924194336, + "learning_rate": 9.153580517940193e-06, + "loss": 5.2296, + "step": 37030 + }, + { + "epoch": 0.75347900390625, + "grad_norm": 16.79710578918457, + "learning_rate": 9.15335800571826e-06, + "loss": 5.4748, + "step": 37035 + }, + { + "epoch": 0.7535807291666666, + "grad_norm": 16.134990692138672, + "learning_rate": 9.153135466957686e-06, + "loss": 5.8208, + "step": 37040 + }, + { + "epoch": 0.7536824544270834, + "grad_norm": 21.150318145751953, + "learning_rate": 9.152912901659896e-06, + "loss": 4.9825, + "step": 37045 + }, + { + "epoch": 0.7537841796875, + "grad_norm": 15.783079147338867, + "learning_rate": 9.152690309826311e-06, + "loss": 4.9316, + "step": 37050 + }, + { + "epoch": 0.7538859049479166, + "grad_norm": 13.229981422424316, + "learning_rate": 9.152467691458354e-06, + "loss": 4.9457, + "step": 37055 + }, + { + "epoch": 0.7539876302083334, + "grad_norm": 17.150190353393555, + "learning_rate": 9.152245046557444e-06, + "loss": 4.9138, + "step": 37060 + }, + { + "epoch": 0.75408935546875, + "grad_norm": 15.779364585876465, + "learning_rate": 9.152022375125007e-06, + "loss": 5.3377, + "step": 37065 + }, + { + "epoch": 0.7541910807291666, + "grad_norm": 14.835942268371582, + "learning_rate": 9.151799677162465e-06, + "loss": 4.9377, + "step": 37070 + }, + { + "epoch": 0.7542928059895834, + "grad_norm": 18.73131561279297, + "learning_rate": 9.151576952671242e-06, + "loss": 5.0414, + "step": 37075 + }, + { + "epoch": 0.75439453125, + "grad_norm": 12.529745101928711, + "learning_rate": 9.15135420165276e-06, + "loss": 5.267, + "step": 37080 + }, + { + "epoch": 0.7544962565104166, + "grad_norm": 19.441896438598633, + "learning_rate": 9.151131424108442e-06, + "loss": 5.2086, + "step": 37085 + }, + { + "epoch": 0.7545979817708334, + "grad_norm": 17.836814880371094, + "learning_rate": 9.150908620039713e-06, + "loss": 4.8526, + "step": 37090 + }, + { + "epoch": 0.75469970703125, + "grad_norm": 16.72675895690918, + "learning_rate": 9.150685789447994e-06, + "loss": 4.9, + "step": 37095 + }, + { + "epoch": 0.7548014322916666, + "grad_norm": 17.22470474243164, + "learning_rate": 9.150462932334712e-06, + "loss": 5.0329, + "step": 37100 + }, + { + "epoch": 0.7549031575520834, + "grad_norm": 21.65757942199707, + "learning_rate": 9.15024004870129e-06, + "loss": 4.8452, + "step": 37105 + }, + { + "epoch": 0.7550048828125, + "grad_norm": 17.14958953857422, + "learning_rate": 9.150017138549147e-06, + "loss": 5.0433, + "step": 37110 + }, + { + "epoch": 0.7551066080729166, + "grad_norm": 14.127737045288086, + "learning_rate": 9.149794201879715e-06, + "loss": 5.0651, + "step": 37115 + }, + { + "epoch": 0.7552083333333334, + "grad_norm": 15.361345291137695, + "learning_rate": 9.149571238694417e-06, + "loss": 5.0741, + "step": 37120 + }, + { + "epoch": 0.75531005859375, + "grad_norm": 19.14865493774414, + "learning_rate": 9.149348248994675e-06, + "loss": 5.1112, + "step": 37125 + }, + { + "epoch": 0.7554117838541666, + "grad_norm": 13.402814865112305, + "learning_rate": 9.149125232781912e-06, + "loss": 5.0271, + "step": 37130 + }, + { + "epoch": 0.7555135091145834, + "grad_norm": 16.50531578063965, + "learning_rate": 9.14890219005756e-06, + "loss": 5.2344, + "step": 37135 + }, + { + "epoch": 0.755615234375, + "grad_norm": 27.759105682373047, + "learning_rate": 9.148679120823038e-06, + "loss": 5.0043, + "step": 37140 + }, + { + "epoch": 0.7557169596354166, + "grad_norm": 17.65532875061035, + "learning_rate": 9.148456025079773e-06, + "loss": 5.0172, + "step": 37145 + }, + { + "epoch": 0.7558186848958334, + "grad_norm": 13.739950180053711, + "learning_rate": 9.148232902829192e-06, + "loss": 5.2302, + "step": 37150 + }, + { + "epoch": 0.75592041015625, + "grad_norm": 17.289796829223633, + "learning_rate": 9.148009754072719e-06, + "loss": 5.0245, + "step": 37155 + }, + { + "epoch": 0.7560221354166666, + "grad_norm": 14.163284301757812, + "learning_rate": 9.14778657881178e-06, + "loss": 5.0497, + "step": 37160 + }, + { + "epoch": 0.7561238606770834, + "grad_norm": 15.174962043762207, + "learning_rate": 9.147563377047801e-06, + "loss": 5.2508, + "step": 37165 + }, + { + "epoch": 0.7562255859375, + "grad_norm": 18.774858474731445, + "learning_rate": 9.14734014878221e-06, + "loss": 4.7588, + "step": 37170 + }, + { + "epoch": 0.7563273111979166, + "grad_norm": 14.578207969665527, + "learning_rate": 9.147116894016433e-06, + "loss": 4.9172, + "step": 37175 + }, + { + "epoch": 0.7564290364583334, + "grad_norm": 20.5206298828125, + "learning_rate": 9.146893612751892e-06, + "loss": 5.2803, + "step": 37180 + }, + { + "epoch": 0.75653076171875, + "grad_norm": 19.58152961730957, + "learning_rate": 9.146670304990021e-06, + "loss": 5.245, + "step": 37185 + }, + { + "epoch": 0.7566324869791666, + "grad_norm": 15.266900062561035, + "learning_rate": 9.146446970732241e-06, + "loss": 4.919, + "step": 37190 + }, + { + "epoch": 0.7567342122395834, + "grad_norm": 18.98937225341797, + "learning_rate": 9.146223609979981e-06, + "loss": 5.0021, + "step": 37195 + }, + { + "epoch": 0.7568359375, + "grad_norm": 22.211769104003906, + "learning_rate": 9.14600022273467e-06, + "loss": 5.012, + "step": 37200 + }, + { + "epoch": 0.7569376627604166, + "grad_norm": 16.639507293701172, + "learning_rate": 9.145776808997731e-06, + "loss": 5.2709, + "step": 37205 + }, + { + "epoch": 0.7570393880208334, + "grad_norm": 17.89674949645996, + "learning_rate": 9.145553368770597e-06, + "loss": 5.0504, + "step": 37210 + }, + { + "epoch": 0.75714111328125, + "grad_norm": 16.35710334777832, + "learning_rate": 9.145329902054691e-06, + "loss": 5.2139, + "step": 37215 + }, + { + "epoch": 0.7572428385416666, + "grad_norm": 18.924062728881836, + "learning_rate": 9.145106408851443e-06, + "loss": 5.1207, + "step": 37220 + }, + { + "epoch": 0.7573445638020834, + "grad_norm": 16.890331268310547, + "learning_rate": 9.144882889162283e-06, + "loss": 5.031, + "step": 37225 + }, + { + "epoch": 0.7574462890625, + "grad_norm": 18.06820297241211, + "learning_rate": 9.144659342988635e-06, + "loss": 5.0594, + "step": 37230 + }, + { + "epoch": 0.7575480143229166, + "grad_norm": 17.680150985717773, + "learning_rate": 9.14443577033193e-06, + "loss": 5.0031, + "step": 37235 + }, + { + "epoch": 0.7576497395833334, + "grad_norm": 20.575870513916016, + "learning_rate": 9.144212171193596e-06, + "loss": 5.0268, + "step": 37240 + }, + { + "epoch": 0.75775146484375, + "grad_norm": 18.533287048339844, + "learning_rate": 9.143988545575063e-06, + "loss": 5.3269, + "step": 37245 + }, + { + "epoch": 0.7578531901041666, + "grad_norm": 15.216964721679688, + "learning_rate": 9.143764893477756e-06, + "loss": 4.996, + "step": 37250 + }, + { + "epoch": 0.7579549153645834, + "grad_norm": 24.150239944458008, + "learning_rate": 9.143541214903108e-06, + "loss": 5.2559, + "step": 37255 + }, + { + "epoch": 0.758056640625, + "grad_norm": 15.114307403564453, + "learning_rate": 9.143317509852547e-06, + "loss": 5.0311, + "step": 37260 + }, + { + "epoch": 0.7581583658854166, + "grad_norm": 19.744855880737305, + "learning_rate": 9.143093778327503e-06, + "loss": 4.9996, + "step": 37265 + }, + { + "epoch": 0.7582600911458334, + "grad_norm": 16.07987403869629, + "learning_rate": 9.142870020329404e-06, + "loss": 4.9726, + "step": 37270 + }, + { + "epoch": 0.75836181640625, + "grad_norm": 20.685100555419922, + "learning_rate": 9.14264623585968e-06, + "loss": 5.1559, + "step": 37275 + }, + { + "epoch": 0.7584635416666666, + "grad_norm": 20.073341369628906, + "learning_rate": 9.142422424919764e-06, + "loss": 4.8973, + "step": 37280 + }, + { + "epoch": 0.7585652669270834, + "grad_norm": 17.76526641845703, + "learning_rate": 9.142198587511083e-06, + "loss": 4.8759, + "step": 37285 + }, + { + "epoch": 0.7586669921875, + "grad_norm": 24.290531158447266, + "learning_rate": 9.141974723635067e-06, + "loss": 5.3905, + "step": 37290 + }, + { + "epoch": 0.7587687174479166, + "grad_norm": 20.07640838623047, + "learning_rate": 9.141750833293149e-06, + "loss": 4.8258, + "step": 37295 + }, + { + "epoch": 0.7588704427083334, + "grad_norm": 20.623519897460938, + "learning_rate": 9.141526916486755e-06, + "loss": 4.9791, + "step": 37300 + }, + { + "epoch": 0.75897216796875, + "grad_norm": 14.927350044250488, + "learning_rate": 9.141302973217321e-06, + "loss": 5.3049, + "step": 37305 + }, + { + "epoch": 0.7590738932291666, + "grad_norm": 16.336585998535156, + "learning_rate": 9.141079003486275e-06, + "loss": 5.0395, + "step": 37310 + }, + { + "epoch": 0.7591756184895834, + "grad_norm": 13.550013542175293, + "learning_rate": 9.140855007295049e-06, + "loss": 4.9644, + "step": 37315 + }, + { + "epoch": 0.75927734375, + "grad_norm": 15.901202201843262, + "learning_rate": 9.140630984645073e-06, + "loss": 5.3111, + "step": 37320 + }, + { + "epoch": 0.7593790690104166, + "grad_norm": 20.44413185119629, + "learning_rate": 9.140406935537781e-06, + "loss": 5.0152, + "step": 37325 + }, + { + "epoch": 0.7594807942708334, + "grad_norm": 21.38147735595703, + "learning_rate": 9.140182859974603e-06, + "loss": 5.0247, + "step": 37330 + }, + { + "epoch": 0.75958251953125, + "grad_norm": 14.95702838897705, + "learning_rate": 9.13995875795697e-06, + "loss": 5.2904, + "step": 37335 + }, + { + "epoch": 0.7596842447916666, + "grad_norm": 15.836380958557129, + "learning_rate": 9.139734629486314e-06, + "loss": 4.8117, + "step": 37340 + }, + { + "epoch": 0.7597859700520834, + "grad_norm": 16.662620544433594, + "learning_rate": 9.139510474564068e-06, + "loss": 5.3989, + "step": 37345 + }, + { + "epoch": 0.7598876953125, + "grad_norm": 18.051849365234375, + "learning_rate": 9.139286293191665e-06, + "loss": 4.8633, + "step": 37350 + }, + { + "epoch": 0.7599894205729166, + "grad_norm": 14.04649829864502, + "learning_rate": 9.139062085370536e-06, + "loss": 4.9279, + "step": 37355 + }, + { + "epoch": 0.7600911458333334, + "grad_norm": 64.87418365478516, + "learning_rate": 9.138837851102116e-06, + "loss": 5.2768, + "step": 37360 + }, + { + "epoch": 0.76019287109375, + "grad_norm": 20.41949462890625, + "learning_rate": 9.138613590387834e-06, + "loss": 5.0662, + "step": 37365 + }, + { + "epoch": 0.7602945963541666, + "grad_norm": 12.443839073181152, + "learning_rate": 9.138389303229125e-06, + "loss": 5.1003, + "step": 37370 + }, + { + "epoch": 0.7603963216145834, + "grad_norm": 19.218307495117188, + "learning_rate": 9.138164989627424e-06, + "loss": 5.0766, + "step": 37375 + }, + { + "epoch": 0.760498046875, + "grad_norm": 18.763778686523438, + "learning_rate": 9.137940649584159e-06, + "loss": 5.3511, + "step": 37380 + }, + { + "epoch": 0.7605997721354166, + "grad_norm": 16.552038192749023, + "learning_rate": 9.137716283100769e-06, + "loss": 4.8395, + "step": 37385 + }, + { + "epoch": 0.7607014973958334, + "grad_norm": 15.08154296875, + "learning_rate": 9.137491890178685e-06, + "loss": 5.1331, + "step": 37390 + }, + { + "epoch": 0.76080322265625, + "grad_norm": 18.443649291992188, + "learning_rate": 9.137267470819341e-06, + "loss": 4.9673, + "step": 37395 + }, + { + "epoch": 0.7609049479166666, + "grad_norm": 15.43382740020752, + "learning_rate": 9.137043025024172e-06, + "loss": 4.8793, + "step": 37400 + }, + { + "epoch": 0.7610066731770834, + "grad_norm": 15.468587875366211, + "learning_rate": 9.13681855279461e-06, + "loss": 5.0844, + "step": 37405 + }, + { + "epoch": 0.7611083984375, + "grad_norm": 16.379697799682617, + "learning_rate": 9.136594054132092e-06, + "loss": 5.1547, + "step": 37410 + }, + { + "epoch": 0.7612101236979166, + "grad_norm": 12.827725410461426, + "learning_rate": 9.13636952903805e-06, + "loss": 5.1106, + "step": 37415 + }, + { + "epoch": 0.7613118489583334, + "grad_norm": 16.474611282348633, + "learning_rate": 9.136144977513922e-06, + "loss": 5.113, + "step": 37420 + }, + { + "epoch": 0.76141357421875, + "grad_norm": 18.778759002685547, + "learning_rate": 9.135920399561138e-06, + "loss": 4.9612, + "step": 37425 + }, + { + "epoch": 0.7615152994791666, + "grad_norm": 24.91079330444336, + "learning_rate": 9.135695795181137e-06, + "loss": 5.0909, + "step": 37430 + }, + { + "epoch": 0.7616170247395834, + "grad_norm": 15.78696060180664, + "learning_rate": 9.13547116437535e-06, + "loss": 5.1991, + "step": 37435 + }, + { + "epoch": 0.76171875, + "grad_norm": 17.94595718383789, + "learning_rate": 9.135246507145218e-06, + "loss": 5.0705, + "step": 37440 + }, + { + "epoch": 0.7618204752604166, + "grad_norm": 19.209144592285156, + "learning_rate": 9.135021823492174e-06, + "loss": 4.8714, + "step": 37445 + }, + { + "epoch": 0.7619222005208334, + "grad_norm": 24.43460464477539, + "learning_rate": 9.134797113417652e-06, + "loss": 4.9715, + "step": 37450 + }, + { + "epoch": 0.76202392578125, + "grad_norm": 19.54807472229004, + "learning_rate": 9.13457237692309e-06, + "loss": 4.9648, + "step": 37455 + }, + { + "epoch": 0.7621256510416666, + "grad_norm": 21.097747802734375, + "learning_rate": 9.134347614009921e-06, + "loss": 5.0477, + "step": 37460 + }, + { + "epoch": 0.7622273763020834, + "grad_norm": 13.796135902404785, + "learning_rate": 9.134122824679583e-06, + "loss": 4.8882, + "step": 37465 + }, + { + "epoch": 0.7623291015625, + "grad_norm": 18.657747268676758, + "learning_rate": 9.133898008933514e-06, + "loss": 5.1315, + "step": 37470 + }, + { + "epoch": 0.7624308268229166, + "grad_norm": 18.32216453552246, + "learning_rate": 9.133673166773149e-06, + "loss": 5.2625, + "step": 37475 + }, + { + "epoch": 0.7625325520833334, + "grad_norm": 16.44662857055664, + "learning_rate": 9.133448298199925e-06, + "loss": 5.0181, + "step": 37480 + }, + { + "epoch": 0.76263427734375, + "grad_norm": 18.64207649230957, + "learning_rate": 9.133223403215276e-06, + "loss": 5.0322, + "step": 37485 + }, + { + "epoch": 0.7627360026041666, + "grad_norm": 17.452327728271484, + "learning_rate": 9.132998481820645e-06, + "loss": 4.9856, + "step": 37490 + }, + { + "epoch": 0.7628377278645834, + "grad_norm": 16.645891189575195, + "learning_rate": 9.132773534017462e-06, + "loss": 5.0089, + "step": 37495 + }, + { + "epoch": 0.762939453125, + "grad_norm": 13.474742889404297, + "learning_rate": 9.132548559807172e-06, + "loss": 5.0733, + "step": 37500 + }, + { + "epoch": 0.7630411783854166, + "grad_norm": 15.796951293945312, + "learning_rate": 9.132323559191206e-06, + "loss": 4.9734, + "step": 37505 + }, + { + "epoch": 0.7631429036458334, + "grad_norm": 18.407867431640625, + "learning_rate": 9.132098532171005e-06, + "loss": 4.9402, + "step": 37510 + }, + { + "epoch": 0.76324462890625, + "grad_norm": 20.067913055419922, + "learning_rate": 9.131873478748005e-06, + "loss": 5.0918, + "step": 37515 + }, + { + "epoch": 0.7633463541666666, + "grad_norm": 16.411039352416992, + "learning_rate": 9.131648398923647e-06, + "loss": 5.0062, + "step": 37520 + }, + { + "epoch": 0.7634480794270834, + "grad_norm": 14.653005599975586, + "learning_rate": 9.131423292699367e-06, + "loss": 5.0208, + "step": 37525 + }, + { + "epoch": 0.7635498046875, + "grad_norm": 11.10954475402832, + "learning_rate": 9.131198160076602e-06, + "loss": 4.9489, + "step": 37530 + }, + { + "epoch": 0.7636515299479166, + "grad_norm": 15.38349437713623, + "learning_rate": 9.130973001056793e-06, + "loss": 4.9682, + "step": 37535 + }, + { + "epoch": 0.7637532552083334, + "grad_norm": 19.10736083984375, + "learning_rate": 9.130747815641378e-06, + "loss": 5.116, + "step": 37540 + }, + { + "epoch": 0.76385498046875, + "grad_norm": 15.328252792358398, + "learning_rate": 9.130522603831796e-06, + "loss": 5.2061, + "step": 37545 + }, + { + "epoch": 0.7639567057291666, + "grad_norm": 13.639189720153809, + "learning_rate": 9.130297365629486e-06, + "loss": 5.0016, + "step": 37550 + }, + { + "epoch": 0.7640584309895834, + "grad_norm": 18.107210159301758, + "learning_rate": 9.130072101035887e-06, + "loss": 5.076, + "step": 37555 + }, + { + "epoch": 0.76416015625, + "grad_norm": 17.301054000854492, + "learning_rate": 9.129846810052437e-06, + "loss": 4.9823, + "step": 37560 + }, + { + "epoch": 0.7642618815104166, + "grad_norm": 23.42778778076172, + "learning_rate": 9.129621492680579e-06, + "loss": 4.6817, + "step": 37565 + }, + { + "epoch": 0.7643636067708334, + "grad_norm": 21.09479522705078, + "learning_rate": 9.129396148921748e-06, + "loss": 5.1277, + "step": 37570 + }, + { + "epoch": 0.76446533203125, + "grad_norm": 14.55953311920166, + "learning_rate": 9.129170778777388e-06, + "loss": 5.0255, + "step": 37575 + }, + { + "epoch": 0.7645670572916666, + "grad_norm": 12.912190437316895, + "learning_rate": 9.128945382248937e-06, + "loss": 4.7582, + "step": 37580 + }, + { + "epoch": 0.7646687825520834, + "grad_norm": 15.332571029663086, + "learning_rate": 9.128719959337837e-06, + "loss": 5.405, + "step": 37585 + }, + { + "epoch": 0.7647705078125, + "grad_norm": 14.057861328125, + "learning_rate": 9.128494510045525e-06, + "loss": 5.1454, + "step": 37590 + }, + { + "epoch": 0.7648722330729166, + "grad_norm": 20.384767532348633, + "learning_rate": 9.128269034373445e-06, + "loss": 5.1887, + "step": 37595 + }, + { + "epoch": 0.7649739583333334, + "grad_norm": 19.156795501708984, + "learning_rate": 9.128043532323035e-06, + "loss": 5.1448, + "step": 37600 + }, + { + "epoch": 0.76507568359375, + "grad_norm": 22.43840789794922, + "learning_rate": 9.127818003895738e-06, + "loss": 4.9642, + "step": 37605 + }, + { + "epoch": 0.7651774088541666, + "grad_norm": 17.54641342163086, + "learning_rate": 9.127592449092996e-06, + "loss": 5.4594, + "step": 37610 + }, + { + "epoch": 0.7652791341145834, + "grad_norm": 21.044696807861328, + "learning_rate": 9.127366867916247e-06, + "loss": 4.9354, + "step": 37615 + }, + { + "epoch": 0.765380859375, + "grad_norm": 15.373477935791016, + "learning_rate": 9.127141260366933e-06, + "loss": 5.1924, + "step": 37620 + }, + { + "epoch": 0.7654825846354166, + "grad_norm": 13.56985092163086, + "learning_rate": 9.126915626446498e-06, + "loss": 4.9293, + "step": 37625 + }, + { + "epoch": 0.7655843098958334, + "grad_norm": 22.481887817382812, + "learning_rate": 9.126689966156381e-06, + "loss": 5.2135, + "step": 37630 + }, + { + "epoch": 0.76568603515625, + "grad_norm": 21.38691520690918, + "learning_rate": 9.126464279498026e-06, + "loss": 5.005, + "step": 37635 + }, + { + "epoch": 0.7657877604166666, + "grad_norm": 15.777947425842285, + "learning_rate": 9.126238566472875e-06, + "loss": 4.8987, + "step": 37640 + }, + { + "epoch": 0.7658894856770834, + "grad_norm": 15.273459434509277, + "learning_rate": 9.126012827082368e-06, + "loss": 5.3207, + "step": 37645 + }, + { + "epoch": 0.7659912109375, + "grad_norm": 19.32965660095215, + "learning_rate": 9.12578706132795e-06, + "loss": 4.9196, + "step": 37650 + }, + { + "epoch": 0.7660929361979166, + "grad_norm": 18.966615676879883, + "learning_rate": 9.125561269211061e-06, + "loss": 5.1121, + "step": 37655 + }, + { + "epoch": 0.7661946614583334, + "grad_norm": 14.961262702941895, + "learning_rate": 9.125335450733146e-06, + "loss": 4.8349, + "step": 37660 + }, + { + "epoch": 0.76629638671875, + "grad_norm": 20.694904327392578, + "learning_rate": 9.125109605895645e-06, + "loss": 5.1209, + "step": 37665 + }, + { + "epoch": 0.7663981119791666, + "grad_norm": 20.727827072143555, + "learning_rate": 9.124883734700005e-06, + "loss": 4.8278, + "step": 37670 + }, + { + "epoch": 0.7664998372395834, + "grad_norm": 19.367206573486328, + "learning_rate": 9.124657837147668e-06, + "loss": 4.9833, + "step": 37675 + }, + { + "epoch": 0.7666015625, + "grad_norm": 21.855121612548828, + "learning_rate": 9.124431913240077e-06, + "loss": 5.094, + "step": 37680 + }, + { + "epoch": 0.7667032877604166, + "grad_norm": 21.972389221191406, + "learning_rate": 9.124205962978675e-06, + "loss": 4.9273, + "step": 37685 + }, + { + "epoch": 0.7668050130208334, + "grad_norm": 14.686653137207031, + "learning_rate": 9.123979986364904e-06, + "loss": 5.1124, + "step": 37690 + }, + { + "epoch": 0.76690673828125, + "grad_norm": 24.59175682067871, + "learning_rate": 9.12375398340021e-06, + "loss": 5.0792, + "step": 37695 + }, + { + "epoch": 0.7670084635416666, + "grad_norm": 17.463361740112305, + "learning_rate": 9.12352795408604e-06, + "loss": 5.1095, + "step": 37700 + }, + { + "epoch": 0.7671101888020834, + "grad_norm": 20.2195987701416, + "learning_rate": 9.123301898423833e-06, + "loss": 4.9166, + "step": 37705 + }, + { + "epoch": 0.7672119140625, + "grad_norm": 16.504436492919922, + "learning_rate": 9.123075816415036e-06, + "loss": 5.2455, + "step": 37710 + }, + { + "epoch": 0.7673136393229166, + "grad_norm": 20.356224060058594, + "learning_rate": 9.122849708061094e-06, + "loss": 5.1323, + "step": 37715 + }, + { + "epoch": 0.7674153645833334, + "grad_norm": 17.793746948242188, + "learning_rate": 9.122623573363452e-06, + "loss": 4.9403, + "step": 37720 + }, + { + "epoch": 0.76751708984375, + "grad_norm": 15.956522941589355, + "learning_rate": 9.122397412323554e-06, + "loss": 4.9582, + "step": 37725 + }, + { + "epoch": 0.7676188151041666, + "grad_norm": 22.77311897277832, + "learning_rate": 9.122171224942845e-06, + "loss": 5.0539, + "step": 37730 + }, + { + "epoch": 0.7677205403645834, + "grad_norm": 18.764057159423828, + "learning_rate": 9.121945011222768e-06, + "loss": 5.1756, + "step": 37735 + }, + { + "epoch": 0.767822265625, + "grad_norm": 21.43279266357422, + "learning_rate": 9.121718771164774e-06, + "loss": 5.1176, + "step": 37740 + }, + { + "epoch": 0.7679239908854166, + "grad_norm": 20.941390991210938, + "learning_rate": 9.121492504770303e-06, + "loss": 4.9494, + "step": 37745 + }, + { + "epoch": 0.7680257161458334, + "grad_norm": 15.477139472961426, + "learning_rate": 9.121266212040804e-06, + "loss": 5.2531, + "step": 37750 + }, + { + "epoch": 0.76812744140625, + "grad_norm": 15.235673904418945, + "learning_rate": 9.121039892977723e-06, + "loss": 5.1146, + "step": 37755 + }, + { + "epoch": 0.7682291666666666, + "grad_norm": 18.517303466796875, + "learning_rate": 9.120813547582504e-06, + "loss": 5.2029, + "step": 37760 + }, + { + "epoch": 0.7683308919270834, + "grad_norm": 13.560153007507324, + "learning_rate": 9.120587175856596e-06, + "loss": 4.8765, + "step": 37765 + }, + { + "epoch": 0.7684326171875, + "grad_norm": 14.863431930541992, + "learning_rate": 9.120360777801443e-06, + "loss": 4.9768, + "step": 37770 + }, + { + "epoch": 0.7685343424479166, + "grad_norm": 24.94033432006836, + "learning_rate": 9.120134353418492e-06, + "loss": 5.2273, + "step": 37775 + }, + { + "epoch": 0.7686360677083334, + "grad_norm": 34.22269058227539, + "learning_rate": 9.119907902709193e-06, + "loss": 5.3549, + "step": 37780 + }, + { + "epoch": 0.76873779296875, + "grad_norm": 17.9620418548584, + "learning_rate": 9.11968142567499e-06, + "loss": 5.3248, + "step": 37785 + }, + { + "epoch": 0.7688395182291666, + "grad_norm": 20.060245513916016, + "learning_rate": 9.119454922317328e-06, + "loss": 4.8671, + "step": 37790 + }, + { + "epoch": 0.7689412434895834, + "grad_norm": 17.0928955078125, + "learning_rate": 9.119228392637656e-06, + "loss": 4.7449, + "step": 37795 + }, + { + "epoch": 0.76904296875, + "grad_norm": 18.267332077026367, + "learning_rate": 9.119001836637425e-06, + "loss": 4.8583, + "step": 37800 + }, + { + "epoch": 0.7691446940104166, + "grad_norm": 17.270566940307617, + "learning_rate": 9.11877525431808e-06, + "loss": 4.9328, + "step": 37805 + }, + { + "epoch": 0.7692464192708334, + "grad_norm": 14.043905258178711, + "learning_rate": 9.118548645681066e-06, + "loss": 4.7591, + "step": 37810 + }, + { + "epoch": 0.76934814453125, + "grad_norm": 17.328941345214844, + "learning_rate": 9.118322010727836e-06, + "loss": 5.1005, + "step": 37815 + }, + { + "epoch": 0.7694498697916666, + "grad_norm": 19.220962524414062, + "learning_rate": 9.118095349459834e-06, + "loss": 5.0681, + "step": 37820 + }, + { + "epoch": 0.7695515950520834, + "grad_norm": 15.543777465820312, + "learning_rate": 9.117868661878511e-06, + "loss": 5.0477, + "step": 37825 + }, + { + "epoch": 0.7696533203125, + "grad_norm": 20.740407943725586, + "learning_rate": 9.117641947985313e-06, + "loss": 5.5996, + "step": 37830 + }, + { + "epoch": 0.7697550455729166, + "grad_norm": 19.462575912475586, + "learning_rate": 9.117415207781693e-06, + "loss": 5.1777, + "step": 37835 + }, + { + "epoch": 0.7698567708333334, + "grad_norm": 14.320125579833984, + "learning_rate": 9.117188441269094e-06, + "loss": 4.9806, + "step": 37840 + }, + { + "epoch": 0.76995849609375, + "grad_norm": 16.780675888061523, + "learning_rate": 9.11696164844897e-06, + "loss": 4.9969, + "step": 37845 + }, + { + "epoch": 0.7700602213541666, + "grad_norm": 14.334929466247559, + "learning_rate": 9.116734829322766e-06, + "loss": 5.1378, + "step": 37850 + }, + { + "epoch": 0.7701619466145834, + "grad_norm": 15.939923286437988, + "learning_rate": 9.116507983891934e-06, + "loss": 5.1286, + "step": 37855 + }, + { + "epoch": 0.770263671875, + "grad_norm": 15.957082748413086, + "learning_rate": 9.116281112157924e-06, + "loss": 4.9318, + "step": 37860 + }, + { + "epoch": 0.7703653971354166, + "grad_norm": 16.303258895874023, + "learning_rate": 9.116054214122182e-06, + "loss": 5.3764, + "step": 37865 + }, + { + "epoch": 0.7704671223958334, + "grad_norm": 19.321962356567383, + "learning_rate": 9.11582728978616e-06, + "loss": 4.9444, + "step": 37870 + }, + { + "epoch": 0.77056884765625, + "grad_norm": 20.33592414855957, + "learning_rate": 9.11560033915131e-06, + "loss": 4.9123, + "step": 37875 + }, + { + "epoch": 0.7706705729166666, + "grad_norm": 22.50284767150879, + "learning_rate": 9.115373362219081e-06, + "loss": 5.0214, + "step": 37880 + }, + { + "epoch": 0.7707722981770834, + "grad_norm": 18.150392532348633, + "learning_rate": 9.115146358990921e-06, + "loss": 5.1157, + "step": 37885 + }, + { + "epoch": 0.7708740234375, + "grad_norm": 16.95968246459961, + "learning_rate": 9.114919329468283e-06, + "loss": 5.0523, + "step": 37890 + }, + { + "epoch": 0.7709757486979166, + "grad_norm": 17.000015258789062, + "learning_rate": 9.114692273652617e-06, + "loss": 5.269, + "step": 37895 + }, + { + "epoch": 0.7710774739583334, + "grad_norm": 17.833518981933594, + "learning_rate": 9.114465191545373e-06, + "loss": 4.9591, + "step": 37900 + }, + { + "epoch": 0.77117919921875, + "grad_norm": 14.86413288116455, + "learning_rate": 9.114238083148002e-06, + "loss": 5.1815, + "step": 37905 + }, + { + "epoch": 0.7712809244791666, + "grad_norm": 15.240269660949707, + "learning_rate": 9.114010948461955e-06, + "loss": 5.0718, + "step": 37910 + }, + { + "epoch": 0.7713826497395834, + "grad_norm": 14.861196517944336, + "learning_rate": 9.113783787488685e-06, + "loss": 5.4688, + "step": 37915 + }, + { + "epoch": 0.771484375, + "grad_norm": 18.1726131439209, + "learning_rate": 9.113556600229642e-06, + "loss": 5.1331, + "step": 37920 + }, + { + "epoch": 0.7715861002604166, + "grad_norm": 14.361923217773438, + "learning_rate": 9.113329386686279e-06, + "loss": 5.3637, + "step": 37925 + }, + { + "epoch": 0.7716878255208334, + "grad_norm": 16.521224975585938, + "learning_rate": 9.113102146860046e-06, + "loss": 5.0535, + "step": 37930 + }, + { + "epoch": 0.77178955078125, + "grad_norm": 17.018421173095703, + "learning_rate": 9.112874880752398e-06, + "loss": 5.125, + "step": 37935 + }, + { + "epoch": 0.7718912760416666, + "grad_norm": 30.066795349121094, + "learning_rate": 9.112647588364784e-06, + "loss": 5.4135, + "step": 37940 + }, + { + "epoch": 0.7719930013020834, + "grad_norm": 17.953659057617188, + "learning_rate": 9.112420269698656e-06, + "loss": 5.1661, + "step": 37945 + }, + { + "epoch": 0.7720947265625, + "grad_norm": 20.09968376159668, + "learning_rate": 9.11219292475547e-06, + "loss": 5.1505, + "step": 37950 + }, + { + "epoch": 0.7721964518229166, + "grad_norm": 13.607805252075195, + "learning_rate": 9.111965553536674e-06, + "loss": 4.9441, + "step": 37955 + }, + { + "epoch": 0.7722981770833334, + "grad_norm": 22.169553756713867, + "learning_rate": 9.111738156043727e-06, + "loss": 5.0109, + "step": 37960 + }, + { + "epoch": 0.77239990234375, + "grad_norm": 14.163589477539062, + "learning_rate": 9.111510732278077e-06, + "loss": 4.9058, + "step": 37965 + }, + { + "epoch": 0.7725016276041666, + "grad_norm": 18.341644287109375, + "learning_rate": 9.111283282241177e-06, + "loss": 5.2335, + "step": 37970 + }, + { + "epoch": 0.7726033528645834, + "grad_norm": 17.602588653564453, + "learning_rate": 9.111055805934484e-06, + "loss": 5.1177, + "step": 37975 + }, + { + "epoch": 0.772705078125, + "grad_norm": 16.689783096313477, + "learning_rate": 9.110828303359449e-06, + "loss": 5.4778, + "step": 37980 + }, + { + "epoch": 0.7728068033854166, + "grad_norm": 19.28805923461914, + "learning_rate": 9.110600774517524e-06, + "loss": 5.0464, + "step": 37985 + }, + { + "epoch": 0.7729085286458334, + "grad_norm": 12.930512428283691, + "learning_rate": 9.110373219410168e-06, + "loss": 5.0592, + "step": 37990 + }, + { + "epoch": 0.77301025390625, + "grad_norm": 20.678728103637695, + "learning_rate": 9.11014563803883e-06, + "loss": 4.991, + "step": 37995 + }, + { + "epoch": 0.7731119791666666, + "grad_norm": 18.63764762878418, + "learning_rate": 9.109918030404966e-06, + "loss": 5.111, + "step": 38000 + }, + { + "epoch": 0.7732137044270834, + "grad_norm": 22.11285972595215, + "learning_rate": 9.109690396510032e-06, + "loss": 5.1089, + "step": 38005 + }, + { + "epoch": 0.7733154296875, + "grad_norm": 16.413789749145508, + "learning_rate": 9.10946273635548e-06, + "loss": 4.9755, + "step": 38010 + }, + { + "epoch": 0.7734171549479166, + "grad_norm": 17.154348373413086, + "learning_rate": 9.109235049942763e-06, + "loss": 5.0042, + "step": 38015 + }, + { + "epoch": 0.7735188802083334, + "grad_norm": 13.612090110778809, + "learning_rate": 9.109007337273342e-06, + "loss": 4.9357, + "step": 38020 + }, + { + "epoch": 0.77362060546875, + "grad_norm": 19.48721694946289, + "learning_rate": 9.108779598348667e-06, + "loss": 5.076, + "step": 38025 + }, + { + "epoch": 0.7737223307291666, + "grad_norm": 21.939157485961914, + "learning_rate": 9.108551833170196e-06, + "loss": 4.7435, + "step": 38030 + }, + { + "epoch": 0.7738240559895834, + "grad_norm": 16.64963722229004, + "learning_rate": 9.108324041739383e-06, + "loss": 4.9986, + "step": 38035 + }, + { + "epoch": 0.77392578125, + "grad_norm": 17.39975357055664, + "learning_rate": 9.108096224057683e-06, + "loss": 4.7407, + "step": 38040 + }, + { + "epoch": 0.7740275065104166, + "grad_norm": 23.329143524169922, + "learning_rate": 9.10786838012655e-06, + "loss": 4.9504, + "step": 38045 + }, + { + "epoch": 0.7741292317708334, + "grad_norm": 15.647180557250977, + "learning_rate": 9.107640509947446e-06, + "loss": 5.0705, + "step": 38050 + }, + { + "epoch": 0.77423095703125, + "grad_norm": 16.368032455444336, + "learning_rate": 9.10741261352182e-06, + "loss": 4.8991, + "step": 38055 + }, + { + "epoch": 0.7743326822916666, + "grad_norm": 13.763962745666504, + "learning_rate": 9.107184690851132e-06, + "loss": 5.035, + "step": 38060 + }, + { + "epoch": 0.7744344075520834, + "grad_norm": 22.009050369262695, + "learning_rate": 9.106956741936839e-06, + "loss": 5.1579, + "step": 38065 + }, + { + "epoch": 0.7745361328125, + "grad_norm": 18.86267852783203, + "learning_rate": 9.106728766780394e-06, + "loss": 5.1285, + "step": 38070 + }, + { + "epoch": 0.7746378580729166, + "grad_norm": 15.807943344116211, + "learning_rate": 9.106500765383259e-06, + "loss": 5.0078, + "step": 38075 + }, + { + "epoch": 0.7747395833333334, + "grad_norm": 20.470361709594727, + "learning_rate": 9.106272737746884e-06, + "loss": 4.9163, + "step": 38080 + }, + { + "epoch": 0.77484130859375, + "grad_norm": 13.699433326721191, + "learning_rate": 9.106044683872732e-06, + "loss": 5.1043, + "step": 38085 + }, + { + "epoch": 0.7749430338541666, + "grad_norm": 13.150802612304688, + "learning_rate": 9.105816603762258e-06, + "loss": 5.1118, + "step": 38090 + }, + { + "epoch": 0.7750447591145834, + "grad_norm": 12.854818344116211, + "learning_rate": 9.105588497416917e-06, + "loss": 5.3208, + "step": 38095 + }, + { + "epoch": 0.775146484375, + "grad_norm": 16.095144271850586, + "learning_rate": 9.105360364838171e-06, + "loss": 4.8688, + "step": 38100 + }, + { + "epoch": 0.7752482096354166, + "grad_norm": 15.467554092407227, + "learning_rate": 9.105132206027476e-06, + "loss": 4.9953, + "step": 38105 + }, + { + "epoch": 0.7753499348958334, + "grad_norm": 16.440683364868164, + "learning_rate": 9.104904020986287e-06, + "loss": 4.894, + "step": 38110 + }, + { + "epoch": 0.77545166015625, + "grad_norm": 13.221678733825684, + "learning_rate": 9.104675809716066e-06, + "loss": 5.2059, + "step": 38115 + }, + { + "epoch": 0.7755533854166666, + "grad_norm": 12.664115905761719, + "learning_rate": 9.104447572218268e-06, + "loss": 5.0602, + "step": 38120 + }, + { + "epoch": 0.7756551106770834, + "grad_norm": 15.611042976379395, + "learning_rate": 9.104219308494355e-06, + "loss": 5.0773, + "step": 38125 + }, + { + "epoch": 0.7757568359375, + "grad_norm": 14.993188858032227, + "learning_rate": 9.103991018545783e-06, + "loss": 5.3899, + "step": 38130 + }, + { + "epoch": 0.7758585611979166, + "grad_norm": 29.878829956054688, + "learning_rate": 9.103762702374011e-06, + "loss": 4.8102, + "step": 38135 + }, + { + "epoch": 0.7759602864583334, + "grad_norm": 14.924657821655273, + "learning_rate": 9.103534359980497e-06, + "loss": 4.8274, + "step": 38140 + }, + { + "epoch": 0.77606201171875, + "grad_norm": 16.926511764526367, + "learning_rate": 9.103305991366703e-06, + "loss": 4.7389, + "step": 38145 + }, + { + "epoch": 0.7761637369791666, + "grad_norm": 16.3781795501709, + "learning_rate": 9.103077596534085e-06, + "loss": 4.9194, + "step": 38150 + }, + { + "epoch": 0.7762654622395834, + "grad_norm": 17.963682174682617, + "learning_rate": 9.102849175484103e-06, + "loss": 5.1563, + "step": 38155 + }, + { + "epoch": 0.7763671875, + "grad_norm": 19.879966735839844, + "learning_rate": 9.102620728218217e-06, + "loss": 4.9991, + "step": 38160 + }, + { + "epoch": 0.7764689127604166, + "grad_norm": 13.689791679382324, + "learning_rate": 9.102392254737889e-06, + "loss": 4.9609, + "step": 38165 + }, + { + "epoch": 0.7765706380208334, + "grad_norm": 11.970956802368164, + "learning_rate": 9.102163755044576e-06, + "loss": 5.1687, + "step": 38170 + }, + { + "epoch": 0.77667236328125, + "grad_norm": 17.328296661376953, + "learning_rate": 9.101935229139738e-06, + "loss": 5.014, + "step": 38175 + }, + { + "epoch": 0.7767740885416666, + "grad_norm": 15.689697265625, + "learning_rate": 9.101706677024837e-06, + "loss": 5.1682, + "step": 38180 + }, + { + "epoch": 0.7768758138020834, + "grad_norm": 15.350934982299805, + "learning_rate": 9.101478098701331e-06, + "loss": 5.0231, + "step": 38185 + }, + { + "epoch": 0.7769775390625, + "grad_norm": 18.846712112426758, + "learning_rate": 9.101249494170683e-06, + "loss": 5.1677, + "step": 38190 + }, + { + "epoch": 0.7770792643229166, + "grad_norm": 20.836824417114258, + "learning_rate": 9.101020863434352e-06, + "loss": 4.8556, + "step": 38195 + }, + { + "epoch": 0.7771809895833334, + "grad_norm": 17.694576263427734, + "learning_rate": 9.1007922064938e-06, + "loss": 4.7928, + "step": 38200 + }, + { + "epoch": 0.77728271484375, + "grad_norm": 14.94758415222168, + "learning_rate": 9.100563523350487e-06, + "loss": 5.2587, + "step": 38205 + }, + { + "epoch": 0.7773844401041666, + "grad_norm": 12.2636079788208, + "learning_rate": 9.100334814005875e-06, + "loss": 5.0082, + "step": 38210 + }, + { + "epoch": 0.7774861653645834, + "grad_norm": 15.252568244934082, + "learning_rate": 9.100106078461425e-06, + "loss": 5.1903, + "step": 38215 + }, + { + "epoch": 0.777587890625, + "grad_norm": 29.21184730529785, + "learning_rate": 9.0998773167186e-06, + "loss": 5.1063, + "step": 38220 + }, + { + "epoch": 0.7776896158854166, + "grad_norm": 20.01173210144043, + "learning_rate": 9.099648528778859e-06, + "loss": 5.0518, + "step": 38225 + }, + { + "epoch": 0.7777913411458334, + "grad_norm": 16.536169052124023, + "learning_rate": 9.099419714643668e-06, + "loss": 4.8418, + "step": 38230 + }, + { + "epoch": 0.77789306640625, + "grad_norm": 15.37202262878418, + "learning_rate": 9.099190874314483e-06, + "loss": 5.0854, + "step": 38235 + }, + { + "epoch": 0.7779947916666666, + "grad_norm": 18.725112915039062, + "learning_rate": 9.09896200779277e-06, + "loss": 4.6795, + "step": 38240 + }, + { + "epoch": 0.7780965169270834, + "grad_norm": 21.326763153076172, + "learning_rate": 9.098733115079994e-06, + "loss": 4.7653, + "step": 38245 + }, + { + "epoch": 0.7781982421875, + "grad_norm": 17.288869857788086, + "learning_rate": 9.098504196177612e-06, + "loss": 5.0339, + "step": 38250 + }, + { + "epoch": 0.7782999674479166, + "grad_norm": 18.440034866333008, + "learning_rate": 9.098275251087088e-06, + "loss": 5.1811, + "step": 38255 + }, + { + "epoch": 0.7784016927083334, + "grad_norm": 15.33721923828125, + "learning_rate": 9.098046279809888e-06, + "loss": 4.8406, + "step": 38260 + }, + { + "epoch": 0.77850341796875, + "grad_norm": 15.161357879638672, + "learning_rate": 9.097817282347472e-06, + "loss": 4.8696, + "step": 38265 + }, + { + "epoch": 0.7786051432291666, + "grad_norm": 15.962961196899414, + "learning_rate": 9.097588258701307e-06, + "loss": 5.2955, + "step": 38270 + }, + { + "epoch": 0.7787068684895834, + "grad_norm": 28.317365646362305, + "learning_rate": 9.097359208872851e-06, + "loss": 5.0392, + "step": 38275 + }, + { + "epoch": 0.77880859375, + "grad_norm": 13.48157024383545, + "learning_rate": 9.09713013286357e-06, + "loss": 5.1035, + "step": 38280 + }, + { + "epoch": 0.7789103190104166, + "grad_norm": 18.46375846862793, + "learning_rate": 9.09690103067493e-06, + "loss": 4.9129, + "step": 38285 + }, + { + "epoch": 0.7790120442708334, + "grad_norm": 18.122461318969727, + "learning_rate": 9.096671902308392e-06, + "loss": 5.0715, + "step": 38290 + }, + { + "epoch": 0.77911376953125, + "grad_norm": 25.462194442749023, + "learning_rate": 9.096442747765422e-06, + "loss": 5.5375, + "step": 38295 + }, + { + "epoch": 0.7792154947916666, + "grad_norm": 16.760942459106445, + "learning_rate": 9.096213567047482e-06, + "loss": 5.257, + "step": 38300 + }, + { + "epoch": 0.7793172200520834, + "grad_norm": 20.486675262451172, + "learning_rate": 9.095984360156038e-06, + "loss": 4.9515, + "step": 38305 + }, + { + "epoch": 0.7794189453125, + "grad_norm": 21.508628845214844, + "learning_rate": 9.095755127092554e-06, + "loss": 4.9415, + "step": 38310 + }, + { + "epoch": 0.7795206705729166, + "grad_norm": 19.311311721801758, + "learning_rate": 9.095525867858497e-06, + "loss": 5.0152, + "step": 38315 + }, + { + "epoch": 0.7796223958333334, + "grad_norm": 14.585928916931152, + "learning_rate": 9.095296582455326e-06, + "loss": 5.028, + "step": 38320 + }, + { + "epoch": 0.77972412109375, + "grad_norm": 19.414573669433594, + "learning_rate": 9.095067270884513e-06, + "loss": 5.0346, + "step": 38325 + }, + { + "epoch": 0.7798258463541666, + "grad_norm": 21.541364669799805, + "learning_rate": 9.094837933147518e-06, + "loss": 5.0055, + "step": 38330 + }, + { + "epoch": 0.7799275716145834, + "grad_norm": 16.248939514160156, + "learning_rate": 9.09460856924581e-06, + "loss": 5.3575, + "step": 38335 + }, + { + "epoch": 0.780029296875, + "grad_norm": 18.72229766845703, + "learning_rate": 9.094379179180853e-06, + "loss": 5.1752, + "step": 38340 + }, + { + "epoch": 0.7801310221354166, + "grad_norm": 19.57839012145996, + "learning_rate": 9.094149762954111e-06, + "loss": 5.0912, + "step": 38345 + }, + { + "epoch": 0.7802327473958334, + "grad_norm": 18.00303840637207, + "learning_rate": 9.093920320567053e-06, + "loss": 5.0819, + "step": 38350 + }, + { + "epoch": 0.78033447265625, + "grad_norm": 14.766885757446289, + "learning_rate": 9.093690852021145e-06, + "loss": 4.9385, + "step": 38355 + }, + { + "epoch": 0.7804361979166666, + "grad_norm": 21.563867568969727, + "learning_rate": 9.09346135731785e-06, + "loss": 5.159, + "step": 38360 + }, + { + "epoch": 0.7805379231770834, + "grad_norm": 13.745574951171875, + "learning_rate": 9.093231836458636e-06, + "loss": 5.2752, + "step": 38365 + }, + { + "epoch": 0.7806396484375, + "grad_norm": 19.25821876525879, + "learning_rate": 9.093002289444973e-06, + "loss": 4.8154, + "step": 38370 + }, + { + "epoch": 0.7807413736979166, + "grad_norm": 13.964234352111816, + "learning_rate": 9.092772716278322e-06, + "loss": 5.1395, + "step": 38375 + }, + { + "epoch": 0.7808430989583334, + "grad_norm": 15.996054649353027, + "learning_rate": 9.092543116960153e-06, + "loss": 4.9956, + "step": 38380 + }, + { + "epoch": 0.78094482421875, + "grad_norm": 15.35560417175293, + "learning_rate": 9.092313491491933e-06, + "loss": 4.8344, + "step": 38385 + }, + { + "epoch": 0.7810465494791666, + "grad_norm": 14.105667114257812, + "learning_rate": 9.092083839875128e-06, + "loss": 5.1244, + "step": 38390 + }, + { + "epoch": 0.7811482747395834, + "grad_norm": 15.144017219543457, + "learning_rate": 9.091854162111207e-06, + "loss": 4.8785, + "step": 38395 + }, + { + "epoch": 0.78125, + "grad_norm": 19.12779426574707, + "learning_rate": 9.091624458201636e-06, + "loss": 5.0478, + "step": 38400 + }, + { + "epoch": 0.7813517252604166, + "grad_norm": 15.807084083557129, + "learning_rate": 9.091394728147885e-06, + "loss": 4.771, + "step": 38405 + }, + { + "epoch": 0.7814534505208334, + "grad_norm": 18.248300552368164, + "learning_rate": 9.09116497195142e-06, + "loss": 5.0158, + "step": 38410 + }, + { + "epoch": 0.78155517578125, + "grad_norm": 20.51373291015625, + "learning_rate": 9.09093518961371e-06, + "loss": 4.9866, + "step": 38415 + }, + { + "epoch": 0.7816569010416666, + "grad_norm": 17.934276580810547, + "learning_rate": 9.09070538113622e-06, + "loss": 5.1708, + "step": 38420 + }, + { + "epoch": 0.7817586263020834, + "grad_norm": 18.16665267944336, + "learning_rate": 9.090475546520423e-06, + "loss": 5.0951, + "step": 38425 + }, + { + "epoch": 0.7818603515625, + "grad_norm": 15.392996788024902, + "learning_rate": 9.090245685767787e-06, + "loss": 5.1283, + "step": 38430 + }, + { + "epoch": 0.7819620768229166, + "grad_norm": 21.31895637512207, + "learning_rate": 9.090015798879779e-06, + "loss": 5.3423, + "step": 38435 + }, + { + "epoch": 0.7820638020833334, + "grad_norm": 23.8211727142334, + "learning_rate": 9.089785885857867e-06, + "loss": 5.2789, + "step": 38440 + }, + { + "epoch": 0.78216552734375, + "grad_norm": 20.8219051361084, + "learning_rate": 9.089555946703523e-06, + "loss": 5.2344, + "step": 38445 + }, + { + "epoch": 0.7822672526041666, + "grad_norm": 15.374608039855957, + "learning_rate": 9.089325981418214e-06, + "loss": 5.1146, + "step": 38450 + }, + { + "epoch": 0.7823689778645834, + "grad_norm": 24.966163635253906, + "learning_rate": 9.08909599000341e-06, + "loss": 5.3936, + "step": 38455 + }, + { + "epoch": 0.782470703125, + "grad_norm": 19.317153930664062, + "learning_rate": 9.088865972460581e-06, + "loss": 5.0723, + "step": 38460 + }, + { + "epoch": 0.7825724283854166, + "grad_norm": 13.34479808807373, + "learning_rate": 9.088635928791197e-06, + "loss": 5.1963, + "step": 38465 + }, + { + "epoch": 0.7826741536458334, + "grad_norm": 14.919037818908691, + "learning_rate": 9.088405858996726e-06, + "loss": 5.2628, + "step": 38470 + }, + { + "epoch": 0.78277587890625, + "grad_norm": 17.815343856811523, + "learning_rate": 9.08817576307864e-06, + "loss": 5.0832, + "step": 38475 + }, + { + "epoch": 0.7828776041666666, + "grad_norm": 13.469996452331543, + "learning_rate": 9.087945641038408e-06, + "loss": 5.365, + "step": 38480 + }, + { + "epoch": 0.7829793294270834, + "grad_norm": 14.027068138122559, + "learning_rate": 9.087715492877502e-06, + "loss": 5.1163, + "step": 38485 + }, + { + "epoch": 0.7830810546875, + "grad_norm": 21.259265899658203, + "learning_rate": 9.08748531859739e-06, + "loss": 5.1844, + "step": 38490 + }, + { + "epoch": 0.7831827799479166, + "grad_norm": 20.22934341430664, + "learning_rate": 9.087255118199547e-06, + "loss": 5.2659, + "step": 38495 + }, + { + "epoch": 0.7832845052083334, + "grad_norm": 14.072093963623047, + "learning_rate": 9.08702489168544e-06, + "loss": 4.9696, + "step": 38500 + }, + { + "epoch": 0.78338623046875, + "grad_norm": 15.470830917358398, + "learning_rate": 9.08679463905654e-06, + "loss": 4.9156, + "step": 38505 + }, + { + "epoch": 0.7834879557291666, + "grad_norm": 16.484893798828125, + "learning_rate": 9.086564360314323e-06, + "loss": 5.0482, + "step": 38510 + }, + { + "epoch": 0.7835896809895834, + "grad_norm": 16.485942840576172, + "learning_rate": 9.086334055460254e-06, + "loss": 4.9264, + "step": 38515 + }, + { + "epoch": 0.78369140625, + "grad_norm": 17.710905075073242, + "learning_rate": 9.086103724495809e-06, + "loss": 5.1889, + "step": 38520 + }, + { + "epoch": 0.7837931315104166, + "grad_norm": 15.110018730163574, + "learning_rate": 9.085873367422457e-06, + "loss": 5.0547, + "step": 38525 + }, + { + "epoch": 0.7838948567708334, + "grad_norm": 19.210586547851562, + "learning_rate": 9.085642984241671e-06, + "loss": 5.3044, + "step": 38530 + }, + { + "epoch": 0.78399658203125, + "grad_norm": 16.566421508789062, + "learning_rate": 9.085412574954925e-06, + "loss": 5.0126, + "step": 38535 + }, + { + "epoch": 0.7840983072916666, + "grad_norm": 15.764559745788574, + "learning_rate": 9.085182139563689e-06, + "loss": 5.4102, + "step": 38540 + }, + { + "epoch": 0.7842000325520834, + "grad_norm": 19.193954467773438, + "learning_rate": 9.084951678069435e-06, + "loss": 5.14, + "step": 38545 + }, + { + "epoch": 0.7843017578125, + "grad_norm": 14.151999473571777, + "learning_rate": 9.084721190473638e-06, + "loss": 5.0362, + "step": 38550 + }, + { + "epoch": 0.7844034830729166, + "grad_norm": 15.49203872680664, + "learning_rate": 9.084490676777768e-06, + "loss": 5.0523, + "step": 38555 + }, + { + "epoch": 0.7845052083333334, + "grad_norm": 17.473283767700195, + "learning_rate": 9.0842601369833e-06, + "loss": 5.1878, + "step": 38560 + }, + { + "epoch": 0.78460693359375, + "grad_norm": 17.2724609375, + "learning_rate": 9.084029571091704e-06, + "loss": 5.0734, + "step": 38565 + }, + { + "epoch": 0.7847086588541666, + "grad_norm": 14.179994583129883, + "learning_rate": 9.08379897910446e-06, + "loss": 5.0374, + "step": 38570 + }, + { + "epoch": 0.7848103841145834, + "grad_norm": 20.293865203857422, + "learning_rate": 9.083568361023033e-06, + "loss": 4.9298, + "step": 38575 + }, + { + "epoch": 0.784912109375, + "grad_norm": 20.20954132080078, + "learning_rate": 9.0833377168489e-06, + "loss": 5.1229, + "step": 38580 + }, + { + "epoch": 0.7850138346354166, + "grad_norm": 15.886885643005371, + "learning_rate": 9.083107046583538e-06, + "loss": 5.2668, + "step": 38585 + }, + { + "epoch": 0.7851155598958334, + "grad_norm": 16.915000915527344, + "learning_rate": 9.082876350228418e-06, + "loss": 5.0024, + "step": 38590 + }, + { + "epoch": 0.78521728515625, + "grad_norm": 23.225051879882812, + "learning_rate": 9.082645627785011e-06, + "loss": 5.2643, + "step": 38595 + }, + { + "epoch": 0.7853190104166666, + "grad_norm": 27.98716926574707, + "learning_rate": 9.082414879254797e-06, + "loss": 5.1005, + "step": 38600 + }, + { + "epoch": 0.7854207356770834, + "grad_norm": 18.382970809936523, + "learning_rate": 9.082184104639249e-06, + "loss": 4.9964, + "step": 38605 + }, + { + "epoch": 0.7855224609375, + "grad_norm": 19.337980270385742, + "learning_rate": 9.081953303939838e-06, + "loss": 5.0251, + "step": 38610 + }, + { + "epoch": 0.7856241861979166, + "grad_norm": 16.797649383544922, + "learning_rate": 9.081722477158042e-06, + "loss": 4.9417, + "step": 38615 + }, + { + "epoch": 0.7857259114583334, + "grad_norm": 15.971357345581055, + "learning_rate": 9.081491624295335e-06, + "loss": 5.0926, + "step": 38620 + }, + { + "epoch": 0.78582763671875, + "grad_norm": 16.66270637512207, + "learning_rate": 9.081260745353192e-06, + "loss": 4.9308, + "step": 38625 + }, + { + "epoch": 0.7859293619791666, + "grad_norm": 15.529961585998535, + "learning_rate": 9.08102984033309e-06, + "loss": 4.92, + "step": 38630 + }, + { + "epoch": 0.7860310872395834, + "grad_norm": 15.886109352111816, + "learning_rate": 9.080798909236501e-06, + "loss": 5.3651, + "step": 38635 + }, + { + "epoch": 0.7861328125, + "grad_norm": 14.704756736755371, + "learning_rate": 9.080567952064903e-06, + "loss": 5.1131, + "step": 38640 + }, + { + "epoch": 0.7862345377604166, + "grad_norm": 20.753721237182617, + "learning_rate": 9.080336968819772e-06, + "loss": 4.9972, + "step": 38645 + }, + { + "epoch": 0.7863362630208334, + "grad_norm": 20.871702194213867, + "learning_rate": 9.080105959502582e-06, + "loss": 4.7227, + "step": 38650 + }, + { + "epoch": 0.78643798828125, + "grad_norm": 23.519670486450195, + "learning_rate": 9.07987492411481e-06, + "loss": 4.968, + "step": 38655 + }, + { + "epoch": 0.7865397135416666, + "grad_norm": 14.59158706665039, + "learning_rate": 9.079643862657932e-06, + "loss": 4.9881, + "step": 38660 + }, + { + "epoch": 0.7866414388020834, + "grad_norm": 14.012027740478516, + "learning_rate": 9.079412775133426e-06, + "loss": 5.0585, + "step": 38665 + }, + { + "epoch": 0.7867431640625, + "grad_norm": 19.40126609802246, + "learning_rate": 9.079181661542767e-06, + "loss": 5.1969, + "step": 38670 + }, + { + "epoch": 0.7868448893229166, + "grad_norm": 14.660162925720215, + "learning_rate": 9.078950521887432e-06, + "loss": 4.8648, + "step": 38675 + }, + { + "epoch": 0.7869466145833334, + "grad_norm": 16.690988540649414, + "learning_rate": 9.078719356168896e-06, + "loss": 5.1281, + "step": 38680 + }, + { + "epoch": 0.78704833984375, + "grad_norm": 21.63994789123535, + "learning_rate": 9.078488164388641e-06, + "loss": 5.1301, + "step": 38685 + }, + { + "epoch": 0.7871500651041666, + "grad_norm": 19.753286361694336, + "learning_rate": 9.078256946548142e-06, + "loss": 5.046, + "step": 38690 + }, + { + "epoch": 0.7872517903645834, + "grad_norm": 15.46033763885498, + "learning_rate": 9.078025702648872e-06, + "loss": 5.1998, + "step": 38695 + }, + { + "epoch": 0.787353515625, + "grad_norm": 29.430925369262695, + "learning_rate": 9.077794432692315e-06, + "loss": 4.8865, + "step": 38700 + }, + { + "epoch": 0.7874552408854166, + "grad_norm": 17.114364624023438, + "learning_rate": 9.077563136679945e-06, + "loss": 4.8864, + "step": 38705 + }, + { + "epoch": 0.7875569661458334, + "grad_norm": 13.877915382385254, + "learning_rate": 9.077331814613242e-06, + "loss": 4.9917, + "step": 38710 + }, + { + "epoch": 0.78765869140625, + "grad_norm": 15.859269142150879, + "learning_rate": 9.077100466493682e-06, + "loss": 4.8842, + "step": 38715 + }, + { + "epoch": 0.7877604166666666, + "grad_norm": 15.731815338134766, + "learning_rate": 9.076869092322744e-06, + "loss": 4.9493, + "step": 38720 + }, + { + "epoch": 0.7878621419270834, + "grad_norm": 16.2520694732666, + "learning_rate": 9.076637692101907e-06, + "loss": 5.0256, + "step": 38725 + }, + { + "epoch": 0.7879638671875, + "grad_norm": 13.064926147460938, + "learning_rate": 9.076406265832649e-06, + "loss": 4.9428, + "step": 38730 + }, + { + "epoch": 0.7880655924479166, + "grad_norm": 18.680858612060547, + "learning_rate": 9.07617481351645e-06, + "loss": 5.0093, + "step": 38735 + }, + { + "epoch": 0.7881673177083334, + "grad_norm": 17.562480926513672, + "learning_rate": 9.075943335154786e-06, + "loss": 5.0063, + "step": 38740 + }, + { + "epoch": 0.78826904296875, + "grad_norm": 14.056733131408691, + "learning_rate": 9.07571183074914e-06, + "loss": 4.9452, + "step": 38745 + }, + { + "epoch": 0.7883707682291666, + "grad_norm": 17.474607467651367, + "learning_rate": 9.075480300300988e-06, + "loss": 5.0001, + "step": 38750 + }, + { + "epoch": 0.7884724934895834, + "grad_norm": 16.981250762939453, + "learning_rate": 9.075248743811812e-06, + "loss": 5.0978, + "step": 38755 + }, + { + "epoch": 0.78857421875, + "grad_norm": 13.238334655761719, + "learning_rate": 9.075017161283088e-06, + "loss": 4.9056, + "step": 38760 + }, + { + "epoch": 0.7886759440104166, + "grad_norm": 11.94461441040039, + "learning_rate": 9.0747855527163e-06, + "loss": 4.9544, + "step": 38765 + }, + { + "epoch": 0.7887776692708334, + "grad_norm": 17.950883865356445, + "learning_rate": 9.074553918112924e-06, + "loss": 4.837, + "step": 38770 + }, + { + "epoch": 0.78887939453125, + "grad_norm": 26.629737854003906, + "learning_rate": 9.074322257474444e-06, + "loss": 5.1357, + "step": 38775 + }, + { + "epoch": 0.7889811197916666, + "grad_norm": 11.416871070861816, + "learning_rate": 9.074090570802338e-06, + "loss": 4.7647, + "step": 38780 + }, + { + "epoch": 0.7890828450520834, + "grad_norm": 12.79575252532959, + "learning_rate": 9.073858858098086e-06, + "loss": 5.1898, + "step": 38785 + }, + { + "epoch": 0.7891845703125, + "grad_norm": 18.239532470703125, + "learning_rate": 9.073627119363168e-06, + "loss": 5.1813, + "step": 38790 + }, + { + "epoch": 0.7892862955729166, + "grad_norm": 17.225374221801758, + "learning_rate": 9.073395354599066e-06, + "loss": 5.0449, + "step": 38795 + }, + { + "epoch": 0.7893880208333334, + "grad_norm": 23.090614318847656, + "learning_rate": 9.073163563807261e-06, + "loss": 5.0793, + "step": 38800 + }, + { + "epoch": 0.78948974609375, + "grad_norm": 16.67605972290039, + "learning_rate": 9.072931746989236e-06, + "loss": 5.0664, + "step": 38805 + }, + { + "epoch": 0.7895914713541666, + "grad_norm": 17.207490921020508, + "learning_rate": 9.072699904146467e-06, + "loss": 5.2978, + "step": 38810 + }, + { + "epoch": 0.7896931966145834, + "grad_norm": 15.624272346496582, + "learning_rate": 9.07246803528044e-06, + "loss": 5.1946, + "step": 38815 + }, + { + "epoch": 0.789794921875, + "grad_norm": 19.85232162475586, + "learning_rate": 9.072236140392634e-06, + "loss": 4.7935, + "step": 38820 + }, + { + "epoch": 0.7898966471354166, + "grad_norm": 18.415246963500977, + "learning_rate": 9.072004219484532e-06, + "loss": 5.088, + "step": 38825 + }, + { + "epoch": 0.7899983723958334, + "grad_norm": 12.307367324829102, + "learning_rate": 9.071772272557618e-06, + "loss": 5.4572, + "step": 38830 + }, + { + "epoch": 0.79010009765625, + "grad_norm": 18.422378540039062, + "learning_rate": 9.071540299613369e-06, + "loss": 5.0224, + "step": 38835 + }, + { + "epoch": 0.7902018229166666, + "grad_norm": 19.514080047607422, + "learning_rate": 9.071308300653272e-06, + "loss": 4.9354, + "step": 38840 + }, + { + "epoch": 0.7903035481770834, + "grad_norm": 17.962230682373047, + "learning_rate": 9.071076275678805e-06, + "loss": 4.9959, + "step": 38845 + }, + { + "epoch": 0.7904052734375, + "grad_norm": 14.813207626342773, + "learning_rate": 9.070844224691454e-06, + "loss": 5.4256, + "step": 38850 + }, + { + "epoch": 0.7905069986979166, + "grad_norm": 21.803268432617188, + "learning_rate": 9.070612147692702e-06, + "loss": 4.9675, + "step": 38855 + }, + { + "epoch": 0.7906087239583334, + "grad_norm": 25.277524948120117, + "learning_rate": 9.070380044684028e-06, + "loss": 5.031, + "step": 38860 + }, + { + "epoch": 0.79071044921875, + "grad_norm": 15.581558227539062, + "learning_rate": 9.070147915666917e-06, + "loss": 5.1575, + "step": 38865 + }, + { + "epoch": 0.7908121744791666, + "grad_norm": 18.822668075561523, + "learning_rate": 9.069915760642856e-06, + "loss": 4.9291, + "step": 38870 + }, + { + "epoch": 0.7909138997395834, + "grad_norm": 17.014896392822266, + "learning_rate": 9.069683579613323e-06, + "loss": 5.1046, + "step": 38875 + }, + { + "epoch": 0.791015625, + "grad_norm": 21.534637451171875, + "learning_rate": 9.069451372579808e-06, + "loss": 5.0195, + "step": 38880 + }, + { + "epoch": 0.7911173502604166, + "grad_norm": 15.898839950561523, + "learning_rate": 9.069219139543785e-06, + "loss": 5.1655, + "step": 38885 + }, + { + "epoch": 0.7912190755208334, + "grad_norm": 14.435086250305176, + "learning_rate": 9.068986880506747e-06, + "loss": 5.0968, + "step": 38890 + }, + { + "epoch": 0.79132080078125, + "grad_norm": 18.270483016967773, + "learning_rate": 9.068754595470174e-06, + "loss": 4.7592, + "step": 38895 + }, + { + "epoch": 0.7914225260416666, + "grad_norm": 17.243009567260742, + "learning_rate": 9.06852228443555e-06, + "loss": 5.2352, + "step": 38900 + }, + { + "epoch": 0.7915242513020834, + "grad_norm": 15.180118560791016, + "learning_rate": 9.06828994740436e-06, + "loss": 5.0097, + "step": 38905 + }, + { + "epoch": 0.7916259765625, + "grad_norm": 16.538238525390625, + "learning_rate": 9.06805758437809e-06, + "loss": 5.0645, + "step": 38910 + }, + { + "epoch": 0.7917277018229166, + "grad_norm": 13.668008804321289, + "learning_rate": 9.067825195358222e-06, + "loss": 5.141, + "step": 38915 + }, + { + "epoch": 0.7918294270833334, + "grad_norm": 18.465198516845703, + "learning_rate": 9.067592780346242e-06, + "loss": 5.0519, + "step": 38920 + }, + { + "epoch": 0.79193115234375, + "grad_norm": 21.29652976989746, + "learning_rate": 9.067360339343636e-06, + "loss": 4.9899, + "step": 38925 + }, + { + "epoch": 0.7920328776041666, + "grad_norm": 14.960943222045898, + "learning_rate": 9.067127872351889e-06, + "loss": 4.9213, + "step": 38930 + }, + { + "epoch": 0.7921346028645834, + "grad_norm": 16.74181365966797, + "learning_rate": 9.066895379372485e-06, + "loss": 5.2003, + "step": 38935 + }, + { + "epoch": 0.792236328125, + "grad_norm": 19.95344352722168, + "learning_rate": 9.066662860406912e-06, + "loss": 5.2496, + "step": 38940 + }, + { + "epoch": 0.7923380533854166, + "grad_norm": 16.079111099243164, + "learning_rate": 9.066430315456652e-06, + "loss": 4.856, + "step": 38945 + }, + { + "epoch": 0.7924397786458334, + "grad_norm": 12.118106842041016, + "learning_rate": 9.066197744523195e-06, + "loss": 5.0365, + "step": 38950 + }, + { + "epoch": 0.79254150390625, + "grad_norm": 15.761765480041504, + "learning_rate": 9.065965147608024e-06, + "loss": 5.063, + "step": 38955 + }, + { + "epoch": 0.7926432291666666, + "grad_norm": 18.13157844543457, + "learning_rate": 9.065732524712628e-06, + "loss": 4.7877, + "step": 38960 + }, + { + "epoch": 0.7927449544270834, + "grad_norm": 14.14089298248291, + "learning_rate": 9.06549987583849e-06, + "loss": 5.1337, + "step": 38965 + }, + { + "epoch": 0.7928466796875, + "grad_norm": 16.04478645324707, + "learning_rate": 9.0652672009871e-06, + "loss": 5.0856, + "step": 38970 + }, + { + "epoch": 0.7929484049479166, + "grad_norm": 15.54870319366455, + "learning_rate": 9.065034500159943e-06, + "loss": 5.0508, + "step": 38975 + }, + { + "epoch": 0.7930501302083334, + "grad_norm": 40.143184661865234, + "learning_rate": 9.064801773358505e-06, + "loss": 5.0779, + "step": 38980 + }, + { + "epoch": 0.79315185546875, + "grad_norm": 28.359638214111328, + "learning_rate": 9.064569020584274e-06, + "loss": 5.352, + "step": 38985 + }, + { + "epoch": 0.7932535807291666, + "grad_norm": 13.903857231140137, + "learning_rate": 9.064336241838737e-06, + "loss": 5.2251, + "step": 38990 + }, + { + "epoch": 0.7933553059895834, + "grad_norm": 18.654279708862305, + "learning_rate": 9.064103437123382e-06, + "loss": 4.8089, + "step": 38995 + }, + { + "epoch": 0.79345703125, + "grad_norm": 17.612407684326172, + "learning_rate": 9.063870606439697e-06, + "loss": 4.9252, + "step": 39000 + }, + { + "epoch": 0.7935587565104166, + "grad_norm": 18.05344009399414, + "learning_rate": 9.063637749789167e-06, + "loss": 5.089, + "step": 39005 + }, + { + "epoch": 0.7936604817708334, + "grad_norm": 14.949981689453125, + "learning_rate": 9.063404867173285e-06, + "loss": 5.1057, + "step": 39010 + }, + { + "epoch": 0.79376220703125, + "grad_norm": 13.60877799987793, + "learning_rate": 9.063171958593533e-06, + "loss": 4.9649, + "step": 39015 + }, + { + "epoch": 0.7938639322916666, + "grad_norm": 15.406497955322266, + "learning_rate": 9.062939024051403e-06, + "loss": 5.4927, + "step": 39020 + }, + { + "epoch": 0.7939656575520834, + "grad_norm": 22.355318069458008, + "learning_rate": 9.062706063548381e-06, + "loss": 5.2558, + "step": 39025 + }, + { + "epoch": 0.7940673828125, + "grad_norm": 17.420875549316406, + "learning_rate": 9.062473077085957e-06, + "loss": 4.9407, + "step": 39030 + }, + { + "epoch": 0.7941691080729166, + "grad_norm": 14.983186721801758, + "learning_rate": 9.062240064665621e-06, + "loss": 4.9914, + "step": 39035 + }, + { + "epoch": 0.7942708333333334, + "grad_norm": 12.421483993530273, + "learning_rate": 9.062007026288862e-06, + "loss": 5.393, + "step": 39040 + }, + { + "epoch": 0.79437255859375, + "grad_norm": 17.544742584228516, + "learning_rate": 9.061773961957166e-06, + "loss": 4.9195, + "step": 39045 + }, + { + "epoch": 0.7944742838541666, + "grad_norm": 21.1174259185791, + "learning_rate": 9.061540871672023e-06, + "loss": 5.2527, + "step": 39050 + }, + { + "epoch": 0.7945760091145834, + "grad_norm": 21.68458366394043, + "learning_rate": 9.061307755434925e-06, + "loss": 5.2705, + "step": 39055 + }, + { + "epoch": 0.794677734375, + "grad_norm": 15.749639511108398, + "learning_rate": 9.061074613247357e-06, + "loss": 5.1229, + "step": 39060 + }, + { + "epoch": 0.7947794596354166, + "grad_norm": 16.056880950927734, + "learning_rate": 9.060841445110813e-06, + "loss": 5.106, + "step": 39065 + }, + { + "epoch": 0.7948811848958334, + "grad_norm": 17.47373390197754, + "learning_rate": 9.060608251026782e-06, + "loss": 4.9293, + "step": 39070 + }, + { + "epoch": 0.79498291015625, + "grad_norm": 22.136018753051758, + "learning_rate": 9.060375030996751e-06, + "loss": 4.711, + "step": 39075 + }, + { + "epoch": 0.7950846354166666, + "grad_norm": 18.0781307220459, + "learning_rate": 9.060141785022214e-06, + "loss": 5.086, + "step": 39080 + }, + { + "epoch": 0.7951863606770834, + "grad_norm": 18.303848266601562, + "learning_rate": 9.05990851310466e-06, + "loss": 5.1053, + "step": 39085 + }, + { + "epoch": 0.7952880859375, + "grad_norm": 18.69728660583496, + "learning_rate": 9.059675215245579e-06, + "loss": 4.9146, + "step": 39090 + }, + { + "epoch": 0.7953898111979166, + "grad_norm": 19.86650276184082, + "learning_rate": 9.059441891446461e-06, + "loss": 5.0671, + "step": 39095 + }, + { + "epoch": 0.7954915364583334, + "grad_norm": 15.210362434387207, + "learning_rate": 9.0592085417088e-06, + "loss": 5.0645, + "step": 39100 + }, + { + "epoch": 0.79559326171875, + "grad_norm": 20.611337661743164, + "learning_rate": 9.058975166034083e-06, + "loss": 4.8549, + "step": 39105 + }, + { + "epoch": 0.7956949869791666, + "grad_norm": 14.461285591125488, + "learning_rate": 9.058741764423803e-06, + "loss": 4.9676, + "step": 39110 + }, + { + "epoch": 0.7957967122395834, + "grad_norm": 24.43839454650879, + "learning_rate": 9.058508336879455e-06, + "loss": 5.2296, + "step": 39115 + }, + { + "epoch": 0.7958984375, + "grad_norm": 16.358182907104492, + "learning_rate": 9.058274883402524e-06, + "loss": 5.1304, + "step": 39120 + }, + { + "epoch": 0.7960001627604166, + "grad_norm": 31.695470809936523, + "learning_rate": 9.058041403994506e-06, + "loss": 5.0576, + "step": 39125 + }, + { + "epoch": 0.7961018880208334, + "grad_norm": 17.79303741455078, + "learning_rate": 9.05780789865689e-06, + "loss": 4.8595, + "step": 39130 + }, + { + "epoch": 0.79620361328125, + "grad_norm": 14.158602714538574, + "learning_rate": 9.057574367391171e-06, + "loss": 5.345, + "step": 39135 + }, + { + "epoch": 0.7963053385416666, + "grad_norm": 15.200301170349121, + "learning_rate": 9.05734081019884e-06, + "loss": 4.8097, + "step": 39140 + }, + { + "epoch": 0.7964070638020834, + "grad_norm": 16.352876663208008, + "learning_rate": 9.057107227081389e-06, + "loss": 4.8973, + "step": 39145 + }, + { + "epoch": 0.7965087890625, + "grad_norm": 12.239370346069336, + "learning_rate": 9.056873618040311e-06, + "loss": 4.9407, + "step": 39150 + }, + { + "epoch": 0.7966105143229166, + "grad_norm": 16.48160743713379, + "learning_rate": 9.056639983077098e-06, + "loss": 5.1886, + "step": 39155 + }, + { + "epoch": 0.7967122395833334, + "grad_norm": 17.516695022583008, + "learning_rate": 9.056406322193242e-06, + "loss": 5.2441, + "step": 39160 + }, + { + "epoch": 0.79681396484375, + "grad_norm": 16.967220306396484, + "learning_rate": 9.056172635390239e-06, + "loss": 5.0257, + "step": 39165 + }, + { + "epoch": 0.7969156901041666, + "grad_norm": 14.442729949951172, + "learning_rate": 9.05593892266958e-06, + "loss": 4.8595, + "step": 39170 + }, + { + "epoch": 0.7970174153645834, + "grad_norm": 17.007150650024414, + "learning_rate": 9.055705184032759e-06, + "loss": 5.1752, + "step": 39175 + }, + { + "epoch": 0.797119140625, + "grad_norm": 19.29538917541504, + "learning_rate": 9.05547141948127e-06, + "loss": 5.0446, + "step": 39180 + }, + { + "epoch": 0.7972208658854166, + "grad_norm": 13.584966659545898, + "learning_rate": 9.055237629016605e-06, + "loss": 4.8384, + "step": 39185 + }, + { + "epoch": 0.7973225911458334, + "grad_norm": 16.932510375976562, + "learning_rate": 9.055003812640259e-06, + "loss": 4.9984, + "step": 39190 + }, + { + "epoch": 0.79742431640625, + "grad_norm": 16.518911361694336, + "learning_rate": 9.054769970353725e-06, + "loss": 5.0684, + "step": 39195 + }, + { + "epoch": 0.7975260416666666, + "grad_norm": 20.93357276916504, + "learning_rate": 9.0545361021585e-06, + "loss": 4.9875, + "step": 39200 + }, + { + "epoch": 0.7976277669270834, + "grad_norm": 22.037002563476562, + "learning_rate": 9.054302208056076e-06, + "loss": 5.0131, + "step": 39205 + }, + { + "epoch": 0.7977294921875, + "grad_norm": 16.392879486083984, + "learning_rate": 9.054068288047946e-06, + "loss": 5.3331, + "step": 39210 + }, + { + "epoch": 0.7978312174479166, + "grad_norm": 16.634668350219727, + "learning_rate": 9.053834342135607e-06, + "loss": 5.0801, + "step": 39215 + }, + { + "epoch": 0.7979329427083334, + "grad_norm": 17.853710174560547, + "learning_rate": 9.053600370320556e-06, + "loss": 4.9371, + "step": 39220 + }, + { + "epoch": 0.79803466796875, + "grad_norm": 13.782548904418945, + "learning_rate": 9.053366372604283e-06, + "loss": 5.1231, + "step": 39225 + }, + { + "epoch": 0.7981363932291666, + "grad_norm": 16.25218391418457, + "learning_rate": 9.053132348988287e-06, + "loss": 5.2797, + "step": 39230 + }, + { + "epoch": 0.7982381184895834, + "grad_norm": 17.99275779724121, + "learning_rate": 9.05289829947406e-06, + "loss": 4.9337, + "step": 39235 + }, + { + "epoch": 0.79833984375, + "grad_norm": 17.476821899414062, + "learning_rate": 9.052664224063102e-06, + "loss": 5.0185, + "step": 39240 + }, + { + "epoch": 0.7984415690104166, + "grad_norm": 16.36882209777832, + "learning_rate": 9.052430122756904e-06, + "loss": 5.3179, + "step": 39245 + }, + { + "epoch": 0.7985432942708334, + "grad_norm": 22.59345054626465, + "learning_rate": 9.052195995556965e-06, + "loss": 4.9029, + "step": 39250 + }, + { + "epoch": 0.79864501953125, + "grad_norm": 21.7537784576416, + "learning_rate": 9.051961842464777e-06, + "loss": 4.8763, + "step": 39255 + }, + { + "epoch": 0.7987467447916666, + "grad_norm": 18.632482528686523, + "learning_rate": 9.051727663481843e-06, + "loss": 5.2864, + "step": 39260 + }, + { + "epoch": 0.7988484700520834, + "grad_norm": 16.175756454467773, + "learning_rate": 9.051493458609656e-06, + "loss": 5.2017, + "step": 39265 + }, + { + "epoch": 0.7989501953125, + "grad_norm": 12.958712577819824, + "learning_rate": 9.051259227849708e-06, + "loss": 4.9957, + "step": 39270 + }, + { + "epoch": 0.7990519205729166, + "grad_norm": 15.41077995300293, + "learning_rate": 9.0510249712035e-06, + "loss": 5.1433, + "step": 39275 + }, + { + "epoch": 0.7991536458333334, + "grad_norm": 14.36764144897461, + "learning_rate": 9.050790688672532e-06, + "loss": 4.9831, + "step": 39280 + }, + { + "epoch": 0.79925537109375, + "grad_norm": 28.295270919799805, + "learning_rate": 9.050556380258295e-06, + "loss": 5.3458, + "step": 39285 + }, + { + "epoch": 0.7993570963541666, + "grad_norm": 14.827741622924805, + "learning_rate": 9.050322045962289e-06, + "loss": 5.0721, + "step": 39290 + }, + { + "epoch": 0.7994588216145834, + "grad_norm": 19.675174713134766, + "learning_rate": 9.05008768578601e-06, + "loss": 4.998, + "step": 39295 + }, + { + "epoch": 0.799560546875, + "grad_norm": 17.33957290649414, + "learning_rate": 9.049853299730958e-06, + "loss": 5.0823, + "step": 39300 + }, + { + "epoch": 0.7996622721354166, + "grad_norm": 16.94350814819336, + "learning_rate": 9.049618887798629e-06, + "loss": 4.9123, + "step": 39305 + }, + { + "epoch": 0.7997639973958334, + "grad_norm": 18.13597869873047, + "learning_rate": 9.049384449990519e-06, + "loss": 5.1393, + "step": 39310 + }, + { + "epoch": 0.79986572265625, + "grad_norm": 19.886388778686523, + "learning_rate": 9.049149986308129e-06, + "loss": 4.8566, + "step": 39315 + }, + { + "epoch": 0.7999674479166666, + "grad_norm": 20.495912551879883, + "learning_rate": 9.048915496752955e-06, + "loss": 5.0103, + "step": 39320 + }, + { + "epoch": 0.8000691731770834, + "grad_norm": 22.365507125854492, + "learning_rate": 9.048680981326498e-06, + "loss": 5.2934, + "step": 39325 + }, + { + "epoch": 0.8001708984375, + "grad_norm": 15.594839096069336, + "learning_rate": 9.048446440030252e-06, + "loss": 4.8425, + "step": 39330 + }, + { + "epoch": 0.8002726236979166, + "grad_norm": 14.336177825927734, + "learning_rate": 9.04821187286572e-06, + "loss": 4.7354, + "step": 39335 + }, + { + "epoch": 0.8003743489583334, + "grad_norm": 14.827462196350098, + "learning_rate": 9.047977279834401e-06, + "loss": 5.0459, + "step": 39340 + }, + { + "epoch": 0.80047607421875, + "grad_norm": 23.829971313476562, + "learning_rate": 9.04774266093779e-06, + "loss": 5.1267, + "step": 39345 + }, + { + "epoch": 0.8005777994791666, + "grad_norm": 16.92340660095215, + "learning_rate": 9.047508016177389e-06, + "loss": 5.1962, + "step": 39350 + }, + { + "epoch": 0.8006795247395834, + "grad_norm": 15.728913307189941, + "learning_rate": 9.047273345554697e-06, + "loss": 4.9719, + "step": 39355 + }, + { + "epoch": 0.80078125, + "grad_norm": 16.070960998535156, + "learning_rate": 9.04703864907121e-06, + "loss": 4.9568, + "step": 39360 + }, + { + "epoch": 0.8008829752604166, + "grad_norm": 19.84795570373535, + "learning_rate": 9.046803926728435e-06, + "loss": 5.0971, + "step": 39365 + }, + { + "epoch": 0.8009847005208334, + "grad_norm": 14.164381980895996, + "learning_rate": 9.046569178527866e-06, + "loss": 5.2107, + "step": 39370 + }, + { + "epoch": 0.80108642578125, + "grad_norm": 14.655193328857422, + "learning_rate": 9.046334404471004e-06, + "loss": 5.2987, + "step": 39375 + }, + { + "epoch": 0.8011881510416666, + "grad_norm": 17.709697723388672, + "learning_rate": 9.046099604559351e-06, + "loss": 5.1064, + "step": 39380 + }, + { + "epoch": 0.8012898763020834, + "grad_norm": 14.80785846710205, + "learning_rate": 9.045864778794405e-06, + "loss": 5.0388, + "step": 39385 + }, + { + "epoch": 0.8013916015625, + "grad_norm": 17.72690200805664, + "learning_rate": 9.045629927177668e-06, + "loss": 4.8092, + "step": 39390 + }, + { + "epoch": 0.8014933268229166, + "grad_norm": 19.244403839111328, + "learning_rate": 9.045395049710639e-06, + "loss": 5.3136, + "step": 39395 + }, + { + "epoch": 0.8015950520833334, + "grad_norm": 13.517215728759766, + "learning_rate": 9.045160146394818e-06, + "loss": 4.9414, + "step": 39400 + }, + { + "epoch": 0.80169677734375, + "grad_norm": 19.171689987182617, + "learning_rate": 9.04492521723171e-06, + "loss": 4.9695, + "step": 39405 + }, + { + "epoch": 0.8017985026041666, + "grad_norm": 19.53545570373535, + "learning_rate": 9.044690262222814e-06, + "loss": 4.9793, + "step": 39410 + }, + { + "epoch": 0.8019002278645834, + "grad_norm": 14.555130958557129, + "learning_rate": 9.044455281369632e-06, + "loss": 4.8936, + "step": 39415 + }, + { + "epoch": 0.802001953125, + "grad_norm": 20.399511337280273, + "learning_rate": 9.044220274673662e-06, + "loss": 4.9939, + "step": 39420 + }, + { + "epoch": 0.8021036783854166, + "grad_norm": 20.317781448364258, + "learning_rate": 9.043985242136412e-06, + "loss": 5.2139, + "step": 39425 + }, + { + "epoch": 0.8022054036458334, + "grad_norm": 13.976344108581543, + "learning_rate": 9.043750183759376e-06, + "loss": 5.0403, + "step": 39430 + }, + { + "epoch": 0.80230712890625, + "grad_norm": 14.038840293884277, + "learning_rate": 9.043515099544064e-06, + "loss": 5.0497, + "step": 39435 + }, + { + "epoch": 0.8024088541666666, + "grad_norm": 21.065738677978516, + "learning_rate": 9.04327998949197e-06, + "loss": 5.0891, + "step": 39440 + }, + { + "epoch": 0.8025105794270834, + "grad_norm": 16.19965362548828, + "learning_rate": 9.043044853604603e-06, + "loss": 4.9798, + "step": 39445 + }, + { + "epoch": 0.8026123046875, + "grad_norm": 12.86209774017334, + "learning_rate": 9.042809691883463e-06, + "loss": 5.0489, + "step": 39450 + }, + { + "epoch": 0.8027140299479166, + "grad_norm": 21.256940841674805, + "learning_rate": 9.042574504330052e-06, + "loss": 5.1199, + "step": 39455 + }, + { + "epoch": 0.8028157552083334, + "grad_norm": 14.451148986816406, + "learning_rate": 9.042339290945873e-06, + "loss": 4.9888, + "step": 39460 + }, + { + "epoch": 0.80291748046875, + "grad_norm": 19.0195255279541, + "learning_rate": 9.042104051732427e-06, + "loss": 4.9366, + "step": 39465 + }, + { + "epoch": 0.8030192057291666, + "grad_norm": 16.98509407043457, + "learning_rate": 9.041868786691223e-06, + "loss": 5.1257, + "step": 39470 + }, + { + "epoch": 0.8031209309895834, + "grad_norm": 18.77008819580078, + "learning_rate": 9.041633495823758e-06, + "loss": 5.0317, + "step": 39475 + }, + { + "epoch": 0.80322265625, + "grad_norm": 15.777860641479492, + "learning_rate": 9.04139817913154e-06, + "loss": 4.9744, + "step": 39480 + }, + { + "epoch": 0.8033243815104166, + "grad_norm": 19.897098541259766, + "learning_rate": 9.041162836616068e-06, + "loss": 5.1638, + "step": 39485 + }, + { + "epoch": 0.8034261067708334, + "grad_norm": 15.589550971984863, + "learning_rate": 9.04092746827885e-06, + "loss": 5.0456, + "step": 39490 + }, + { + "epoch": 0.80352783203125, + "grad_norm": 19.358198165893555, + "learning_rate": 9.040692074121388e-06, + "loss": 4.9979, + "step": 39495 + }, + { + "epoch": 0.8036295572916666, + "grad_norm": 16.41994857788086, + "learning_rate": 9.040456654145187e-06, + "loss": 4.7496, + "step": 39500 + }, + { + "epoch": 0.8037312825520834, + "grad_norm": 18.20423126220703, + "learning_rate": 9.04022120835175e-06, + "loss": 4.9825, + "step": 39505 + }, + { + "epoch": 0.8038330078125, + "grad_norm": 18.663728713989258, + "learning_rate": 9.03998573674258e-06, + "loss": 5.1194, + "step": 39510 + }, + { + "epoch": 0.8039347330729166, + "grad_norm": 17.563114166259766, + "learning_rate": 9.039750239319186e-06, + "loss": 5.2212, + "step": 39515 + }, + { + "epoch": 0.8040364583333334, + "grad_norm": 20.028282165527344, + "learning_rate": 9.039514716083069e-06, + "loss": 4.8173, + "step": 39520 + }, + { + "epoch": 0.80413818359375, + "grad_norm": 16.018356323242188, + "learning_rate": 9.039279167035737e-06, + "loss": 5.0518, + "step": 39525 + }, + { + "epoch": 0.8042399088541666, + "grad_norm": 17.326570510864258, + "learning_rate": 9.03904359217869e-06, + "loss": 5.137, + "step": 39530 + }, + { + "epoch": 0.8043416341145834, + "grad_norm": 16.545001983642578, + "learning_rate": 9.03880799151344e-06, + "loss": 4.8974, + "step": 39535 + }, + { + "epoch": 0.804443359375, + "grad_norm": 16.940475463867188, + "learning_rate": 9.038572365041488e-06, + "loss": 5.249, + "step": 39540 + }, + { + "epoch": 0.8045450846354166, + "grad_norm": 20.80504608154297, + "learning_rate": 9.038336712764342e-06, + "loss": 5.0896, + "step": 39545 + }, + { + "epoch": 0.8046468098958334, + "grad_norm": 15.148119926452637, + "learning_rate": 9.038101034683504e-06, + "loss": 5.2196, + "step": 39550 + }, + { + "epoch": 0.80474853515625, + "grad_norm": 19.119529724121094, + "learning_rate": 9.037865330800484e-06, + "loss": 5.1705, + "step": 39555 + }, + { + "epoch": 0.8048502604166666, + "grad_norm": 19.750837326049805, + "learning_rate": 9.037629601116784e-06, + "loss": 4.8918, + "step": 39560 + }, + { + "epoch": 0.8049519856770834, + "grad_norm": 15.979229927062988, + "learning_rate": 9.037393845633915e-06, + "loss": 4.9672, + "step": 39565 + }, + { + "epoch": 0.8050537109375, + "grad_norm": 15.76176643371582, + "learning_rate": 9.03715806435338e-06, + "loss": 4.8858, + "step": 39570 + }, + { + "epoch": 0.8051554361979166, + "grad_norm": 17.509933471679688, + "learning_rate": 9.036922257276686e-06, + "loss": 5.0306, + "step": 39575 + }, + { + "epoch": 0.8052571614583334, + "grad_norm": 18.152509689331055, + "learning_rate": 9.036686424405341e-06, + "loss": 5.3465, + "step": 39580 + }, + { + "epoch": 0.80535888671875, + "grad_norm": 16.123411178588867, + "learning_rate": 9.036450565740851e-06, + "loss": 5.0066, + "step": 39585 + }, + { + "epoch": 0.8054606119791666, + "grad_norm": 20.939184188842773, + "learning_rate": 9.036214681284724e-06, + "loss": 5.0059, + "step": 39590 + }, + { + "epoch": 0.8055623372395834, + "grad_norm": 18.32037925720215, + "learning_rate": 9.035978771038465e-06, + "loss": 5.0433, + "step": 39595 + }, + { + "epoch": 0.8056640625, + "grad_norm": 17.005908966064453, + "learning_rate": 9.035742835003582e-06, + "loss": 4.9913, + "step": 39600 + }, + { + "epoch": 0.8057657877604166, + "grad_norm": 11.8641357421875, + "learning_rate": 9.035506873181586e-06, + "loss": 4.9478, + "step": 39605 + }, + { + "epoch": 0.8058675130208334, + "grad_norm": 20.587949752807617, + "learning_rate": 9.03527088557398e-06, + "loss": 5.0548, + "step": 39610 + }, + { + "epoch": 0.80596923828125, + "grad_norm": 21.0520076751709, + "learning_rate": 9.035034872182277e-06, + "loss": 5.4396, + "step": 39615 + }, + { + "epoch": 0.8060709635416666, + "grad_norm": 19.163286209106445, + "learning_rate": 9.034798833007979e-06, + "loss": 4.9902, + "step": 39620 + }, + { + "epoch": 0.8061726888020834, + "grad_norm": 14.2377347946167, + "learning_rate": 9.0345627680526e-06, + "loss": 4.8785, + "step": 39625 + }, + { + "epoch": 0.8062744140625, + "grad_norm": 20.75943374633789, + "learning_rate": 9.034326677317643e-06, + "loss": 5.2415, + "step": 39630 + }, + { + "epoch": 0.8063761393229166, + "grad_norm": 19.685508728027344, + "learning_rate": 9.03409056080462e-06, + "loss": 4.9687, + "step": 39635 + }, + { + "epoch": 0.8064778645833334, + "grad_norm": 17.339672088623047, + "learning_rate": 9.03385441851504e-06, + "loss": 5.0094, + "step": 39640 + }, + { + "epoch": 0.80657958984375, + "grad_norm": 12.933045387268066, + "learning_rate": 9.033618250450409e-06, + "loss": 5.004, + "step": 39645 + }, + { + "epoch": 0.8066813151041666, + "grad_norm": 15.026620864868164, + "learning_rate": 9.03338205661224e-06, + "loss": 4.9896, + "step": 39650 + }, + { + "epoch": 0.8067830403645834, + "grad_norm": 17.21794319152832, + "learning_rate": 9.033145837002037e-06, + "loss": 5.0804, + "step": 39655 + }, + { + "epoch": 0.806884765625, + "grad_norm": 15.225666999816895, + "learning_rate": 9.032909591621315e-06, + "loss": 5.4699, + "step": 39660 + }, + { + "epoch": 0.8069864908854166, + "grad_norm": 18.296972274780273, + "learning_rate": 9.032673320471582e-06, + "loss": 5.2762, + "step": 39665 + }, + { + "epoch": 0.8070882161458334, + "grad_norm": 12.894665718078613, + "learning_rate": 9.032437023554343e-06, + "loss": 5.1296, + "step": 39670 + }, + { + "epoch": 0.80718994140625, + "grad_norm": 19.808156967163086, + "learning_rate": 9.032200700871113e-06, + "loss": 5.0567, + "step": 39675 + }, + { + "epoch": 0.8072916666666666, + "grad_norm": 17.6763858795166, + "learning_rate": 9.031964352423402e-06, + "loss": 5.0379, + "step": 39680 + }, + { + "epoch": 0.8073933919270834, + "grad_norm": 17.30624008178711, + "learning_rate": 9.031727978212717e-06, + "loss": 4.8732, + "step": 39685 + }, + { + "epoch": 0.8074951171875, + "grad_norm": 19.261173248291016, + "learning_rate": 9.03149157824057e-06, + "loss": 5.3018, + "step": 39690 + }, + { + "epoch": 0.8075968424479166, + "grad_norm": 17.855993270874023, + "learning_rate": 9.031255152508472e-06, + "loss": 5.3643, + "step": 39695 + }, + { + "epoch": 0.8076985677083334, + "grad_norm": 14.629176139831543, + "learning_rate": 9.031018701017933e-06, + "loss": 5.1274, + "step": 39700 + }, + { + "epoch": 0.80780029296875, + "grad_norm": 17.031618118286133, + "learning_rate": 9.030782223770464e-06, + "loss": 5.0534, + "step": 39705 + }, + { + "epoch": 0.8079020182291666, + "grad_norm": 18.014307022094727, + "learning_rate": 9.030545720767578e-06, + "loss": 4.9358, + "step": 39710 + }, + { + "epoch": 0.8080037434895834, + "grad_norm": 18.35402488708496, + "learning_rate": 9.030309192010782e-06, + "loss": 4.9076, + "step": 39715 + }, + { + "epoch": 0.80810546875, + "grad_norm": 29.72167205810547, + "learning_rate": 9.03007263750159e-06, + "loss": 5.0743, + "step": 39720 + }, + { + "epoch": 0.8082071940104166, + "grad_norm": 19.323354721069336, + "learning_rate": 9.029836057241514e-06, + "loss": 5.2259, + "step": 39725 + }, + { + "epoch": 0.8083089192708334, + "grad_norm": 20.298627853393555, + "learning_rate": 9.029599451232066e-06, + "loss": 5.1228, + "step": 39730 + }, + { + "epoch": 0.80841064453125, + "grad_norm": 17.207048416137695, + "learning_rate": 9.029362819474754e-06, + "loss": 5.184, + "step": 39735 + }, + { + "epoch": 0.8085123697916666, + "grad_norm": 16.224376678466797, + "learning_rate": 9.029126161971093e-06, + "loss": 5.1841, + "step": 39740 + }, + { + "epoch": 0.8086140950520834, + "grad_norm": 15.583319664001465, + "learning_rate": 9.028889478722597e-06, + "loss": 5.2608, + "step": 39745 + }, + { + "epoch": 0.8087158203125, + "grad_norm": 17.742204666137695, + "learning_rate": 9.028652769730775e-06, + "loss": 5.0109, + "step": 39750 + }, + { + "epoch": 0.8088175455729166, + "grad_norm": 18.09943389892578, + "learning_rate": 9.02841603499714e-06, + "loss": 4.9578, + "step": 39755 + }, + { + "epoch": 0.8089192708333334, + "grad_norm": 16.202756881713867, + "learning_rate": 9.028179274523206e-06, + "loss": 5.0088, + "step": 39760 + }, + { + "epoch": 0.80902099609375, + "grad_norm": 15.239214897155762, + "learning_rate": 9.027942488310486e-06, + "loss": 5.1405, + "step": 39765 + }, + { + "epoch": 0.8091227213541666, + "grad_norm": 20.064945220947266, + "learning_rate": 9.027705676360491e-06, + "loss": 5.3397, + "step": 39770 + }, + { + "epoch": 0.8092244466145834, + "grad_norm": 18.906070709228516, + "learning_rate": 9.027468838674735e-06, + "loss": 5.1128, + "step": 39775 + }, + { + "epoch": 0.809326171875, + "grad_norm": 19.60686683654785, + "learning_rate": 9.027231975254732e-06, + "loss": 5.0501, + "step": 39780 + }, + { + "epoch": 0.8094278971354166, + "grad_norm": 16.8685245513916, + "learning_rate": 9.026995086101995e-06, + "loss": 4.9719, + "step": 39785 + }, + { + "epoch": 0.8095296223958334, + "grad_norm": 19.635929107666016, + "learning_rate": 9.026758171218039e-06, + "loss": 4.9829, + "step": 39790 + }, + { + "epoch": 0.80963134765625, + "grad_norm": 28.031291961669922, + "learning_rate": 9.026521230604374e-06, + "loss": 5.1246, + "step": 39795 + }, + { + "epoch": 0.8097330729166666, + "grad_norm": 13.071540832519531, + "learning_rate": 9.02628426426252e-06, + "loss": 5.2689, + "step": 39800 + }, + { + "epoch": 0.8098347981770834, + "grad_norm": 17.84857177734375, + "learning_rate": 9.026047272193985e-06, + "loss": 5.2906, + "step": 39805 + }, + { + "epoch": 0.8099365234375, + "grad_norm": 14.488429069519043, + "learning_rate": 9.025810254400288e-06, + "loss": 5.3392, + "step": 39810 + }, + { + "epoch": 0.8100382486979166, + "grad_norm": 19.623716354370117, + "learning_rate": 9.025573210882938e-06, + "loss": 5.0028, + "step": 39815 + }, + { + "epoch": 0.8101399739583334, + "grad_norm": 14.41666030883789, + "learning_rate": 9.025336141643455e-06, + "loss": 5.2301, + "step": 39820 + }, + { + "epoch": 0.81024169921875, + "grad_norm": 13.620198249816895, + "learning_rate": 9.025099046683352e-06, + "loss": 4.9689, + "step": 39825 + }, + { + "epoch": 0.8103434244791666, + "grad_norm": 19.537944793701172, + "learning_rate": 9.024861926004145e-06, + "loss": 4.9884, + "step": 39830 + }, + { + "epoch": 0.8104451497395834, + "grad_norm": 16.646366119384766, + "learning_rate": 9.024624779607347e-06, + "loss": 5.308, + "step": 39835 + }, + { + "epoch": 0.810546875, + "grad_norm": 23.151987075805664, + "learning_rate": 9.024387607494473e-06, + "loss": 5.5964, + "step": 39840 + }, + { + "epoch": 0.8106486002604166, + "grad_norm": 15.856104850769043, + "learning_rate": 9.02415040966704e-06, + "loss": 4.7503, + "step": 39845 + }, + { + "epoch": 0.8107503255208334, + "grad_norm": 14.06312084197998, + "learning_rate": 9.023913186126565e-06, + "loss": 5.2916, + "step": 39850 + }, + { + "epoch": 0.81085205078125, + "grad_norm": 15.39443588256836, + "learning_rate": 9.02367593687456e-06, + "loss": 4.9261, + "step": 39855 + }, + { + "epoch": 0.8109537760416666, + "grad_norm": 22.939287185668945, + "learning_rate": 9.023438661912546e-06, + "loss": 5.1608, + "step": 39860 + }, + { + "epoch": 0.8110555013020834, + "grad_norm": 17.518997192382812, + "learning_rate": 9.023201361242035e-06, + "loss": 5.1694, + "step": 39865 + }, + { + "epoch": 0.8111572265625, + "grad_norm": 17.49787139892578, + "learning_rate": 9.022964034864543e-06, + "loss": 5.2293, + "step": 39870 + }, + { + "epoch": 0.8112589518229166, + "grad_norm": 14.428359031677246, + "learning_rate": 9.022726682781589e-06, + "loss": 5.1582, + "step": 39875 + }, + { + "epoch": 0.8113606770833334, + "grad_norm": 16.849712371826172, + "learning_rate": 9.022489304994687e-06, + "loss": 5.1075, + "step": 39880 + }, + { + "epoch": 0.81146240234375, + "grad_norm": 18.177722930908203, + "learning_rate": 9.022251901505356e-06, + "loss": 5.0302, + "step": 39885 + }, + { + "epoch": 0.8115641276041666, + "grad_norm": 14.29599380493164, + "learning_rate": 9.022014472315113e-06, + "loss": 4.9911, + "step": 39890 + }, + { + "epoch": 0.8116658528645834, + "grad_norm": 17.67877960205078, + "learning_rate": 9.021777017425475e-06, + "loss": 4.9728, + "step": 39895 + }, + { + "epoch": 0.811767578125, + "grad_norm": 16.024471282958984, + "learning_rate": 9.021539536837957e-06, + "loss": 5.0937, + "step": 39900 + }, + { + "epoch": 0.8118693033854166, + "grad_norm": 16.91876983642578, + "learning_rate": 9.02130203055408e-06, + "loss": 4.7374, + "step": 39905 + }, + { + "epoch": 0.8119710286458334, + "grad_norm": 20.508628845214844, + "learning_rate": 9.021064498575355e-06, + "loss": 5.2043, + "step": 39910 + }, + { + "epoch": 0.81207275390625, + "grad_norm": 13.038483619689941, + "learning_rate": 9.020826940903308e-06, + "loss": 4.8938, + "step": 39915 + }, + { + "epoch": 0.8121744791666666, + "grad_norm": 19.683652877807617, + "learning_rate": 9.020589357539454e-06, + "loss": 5.3212, + "step": 39920 + }, + { + "epoch": 0.8122762044270834, + "grad_norm": 17.672353744506836, + "learning_rate": 9.02035174848531e-06, + "loss": 5.1276, + "step": 39925 + }, + { + "epoch": 0.8123779296875, + "grad_norm": 15.103602409362793, + "learning_rate": 9.020114113742391e-06, + "loss": 4.9726, + "step": 39930 + }, + { + "epoch": 0.8124796549479166, + "grad_norm": 20.897926330566406, + "learning_rate": 9.019876453312222e-06, + "loss": 5.1748, + "step": 39935 + }, + { + "epoch": 0.8125813802083334, + "grad_norm": 12.270588874816895, + "learning_rate": 9.019638767196317e-06, + "loss": 5.1254, + "step": 39940 + }, + { + "epoch": 0.81268310546875, + "grad_norm": 17.77957534790039, + "learning_rate": 9.019401055396197e-06, + "loss": 5.2117, + "step": 39945 + }, + { + "epoch": 0.8127848307291666, + "grad_norm": 20.111595153808594, + "learning_rate": 9.019163317913382e-06, + "loss": 4.9863, + "step": 39950 + }, + { + "epoch": 0.8128865559895834, + "grad_norm": 17.560741424560547, + "learning_rate": 9.018925554749386e-06, + "loss": 5.1512, + "step": 39955 + }, + { + "epoch": 0.81298828125, + "grad_norm": 17.38470458984375, + "learning_rate": 9.018687765905734e-06, + "loss": 5.3027, + "step": 39960 + }, + { + "epoch": 0.8130900065104166, + "grad_norm": 16.297521591186523, + "learning_rate": 9.018449951383943e-06, + "loss": 5.0225, + "step": 39965 + }, + { + "epoch": 0.8131917317708334, + "grad_norm": 29.502634048461914, + "learning_rate": 9.018212111185531e-06, + "loss": 4.8543, + "step": 39970 + }, + { + "epoch": 0.81329345703125, + "grad_norm": 19.10433006286621, + "learning_rate": 9.01797424531202e-06, + "loss": 5.1073, + "step": 39975 + }, + { + "epoch": 0.8133951822916666, + "grad_norm": 14.608220100402832, + "learning_rate": 9.017736353764927e-06, + "loss": 4.94, + "step": 39980 + }, + { + "epoch": 0.8134969075520834, + "grad_norm": 14.184122085571289, + "learning_rate": 9.017498436545778e-06, + "loss": 4.9627, + "step": 39985 + }, + { + "epoch": 0.8135986328125, + "grad_norm": 18.93905258178711, + "learning_rate": 9.017260493656085e-06, + "loss": 4.9663, + "step": 39990 + }, + { + "epoch": 0.8137003580729166, + "grad_norm": 19.372594833374023, + "learning_rate": 9.017022525097375e-06, + "loss": 4.9535, + "step": 39995 + }, + { + "epoch": 0.8138020833333334, + "grad_norm": 15.311578750610352, + "learning_rate": 9.016784530871167e-06, + "loss": 5.0561, + "step": 40000 + }, + { + "epoch": 0.81390380859375, + "grad_norm": 16.13190269470215, + "learning_rate": 9.01654651097898e-06, + "loss": 4.8346, + "step": 40005 + }, + { + "epoch": 0.8140055338541666, + "grad_norm": 15.990065574645996, + "learning_rate": 9.016308465422335e-06, + "loss": 4.9103, + "step": 40010 + }, + { + "epoch": 0.8141072591145834, + "grad_norm": 17.13196563720703, + "learning_rate": 9.016070394202753e-06, + "loss": 5.0095, + "step": 40015 + }, + { + "epoch": 0.814208984375, + "grad_norm": 14.839542388916016, + "learning_rate": 9.01583229732176e-06, + "loss": 5.1255, + "step": 40020 + }, + { + "epoch": 0.8143107096354166, + "grad_norm": 16.49394989013672, + "learning_rate": 9.01559417478087e-06, + "loss": 5.1062, + "step": 40025 + }, + { + "epoch": 0.8144124348958334, + "grad_norm": 21.404539108276367, + "learning_rate": 9.01535602658161e-06, + "loss": 5.0287, + "step": 40030 + }, + { + "epoch": 0.81451416015625, + "grad_norm": 17.694908142089844, + "learning_rate": 9.015117852725498e-06, + "loss": 4.9939, + "step": 40035 + }, + { + "epoch": 0.8146158854166666, + "grad_norm": 16.16663360595703, + "learning_rate": 9.014879653214058e-06, + "loss": 5.182, + "step": 40040 + }, + { + "epoch": 0.8147176106770834, + "grad_norm": 17.779001235961914, + "learning_rate": 9.014641428048812e-06, + "loss": 4.8863, + "step": 40045 + }, + { + "epoch": 0.8148193359375, + "grad_norm": 14.876154899597168, + "learning_rate": 9.014403177231281e-06, + "loss": 4.8967, + "step": 40050 + }, + { + "epoch": 0.8149210611979166, + "grad_norm": 17.360654830932617, + "learning_rate": 9.014164900762989e-06, + "loss": 5.0281, + "step": 40055 + }, + { + "epoch": 0.8150227864583334, + "grad_norm": 19.116119384765625, + "learning_rate": 9.013926598645456e-06, + "loss": 5.1376, + "step": 40060 + }, + { + "epoch": 0.81512451171875, + "grad_norm": 15.263222694396973, + "learning_rate": 9.013688270880207e-06, + "loss": 4.9893, + "step": 40065 + }, + { + "epoch": 0.8152262369791666, + "grad_norm": 21.203216552734375, + "learning_rate": 9.013449917468765e-06, + "loss": 5.2509, + "step": 40070 + }, + { + "epoch": 0.8153279622395834, + "grad_norm": 14.194693565368652, + "learning_rate": 9.013211538412651e-06, + "loss": 5.125, + "step": 40075 + }, + { + "epoch": 0.8154296875, + "grad_norm": 18.143665313720703, + "learning_rate": 9.01297313371339e-06, + "loss": 5.2042, + "step": 40080 + }, + { + "epoch": 0.8155314127604166, + "grad_norm": 16.968759536743164, + "learning_rate": 9.012734703372503e-06, + "loss": 5.0923, + "step": 40085 + }, + { + "epoch": 0.8156331380208334, + "grad_norm": 18.664960861206055, + "learning_rate": 9.012496247391517e-06, + "loss": 4.8699, + "step": 40090 + }, + { + "epoch": 0.81573486328125, + "grad_norm": 18.62981414794922, + "learning_rate": 9.012257765771953e-06, + "loss": 5.141, + "step": 40095 + }, + { + "epoch": 0.8158365885416666, + "grad_norm": 17.40444564819336, + "learning_rate": 9.012019258515336e-06, + "loss": 5.1518, + "step": 40100 + }, + { + "epoch": 0.8159383138020834, + "grad_norm": 16.47307014465332, + "learning_rate": 9.011780725623188e-06, + "loss": 5.0638, + "step": 40105 + }, + { + "epoch": 0.8160400390625, + "grad_norm": 17.455766677856445, + "learning_rate": 9.011542167097035e-06, + "loss": 5.0187, + "step": 40110 + }, + { + "epoch": 0.8161417643229166, + "grad_norm": 16.271772384643555, + "learning_rate": 9.011303582938402e-06, + "loss": 5.0685, + "step": 40115 + }, + { + "epoch": 0.8162434895833334, + "grad_norm": 14.713613510131836, + "learning_rate": 9.011064973148813e-06, + "loss": 5.0855, + "step": 40120 + }, + { + "epoch": 0.81634521484375, + "grad_norm": 17.771617889404297, + "learning_rate": 9.010826337729791e-06, + "loss": 5.1132, + "step": 40125 + }, + { + "epoch": 0.8164469401041666, + "grad_norm": 21.667938232421875, + "learning_rate": 9.010587676682862e-06, + "loss": 5.1962, + "step": 40130 + }, + { + "epoch": 0.8165486653645834, + "grad_norm": 18.609560012817383, + "learning_rate": 9.010348990009551e-06, + "loss": 5.0426, + "step": 40135 + }, + { + "epoch": 0.816650390625, + "grad_norm": 17.949771881103516, + "learning_rate": 9.010110277711383e-06, + "loss": 4.8988, + "step": 40140 + }, + { + "epoch": 0.8167521158854166, + "grad_norm": 19.69295883178711, + "learning_rate": 9.009871539789883e-06, + "loss": 5.1244, + "step": 40145 + }, + { + "epoch": 0.8168538411458334, + "grad_norm": 17.629070281982422, + "learning_rate": 9.009632776246577e-06, + "loss": 5.2466, + "step": 40150 + }, + { + "epoch": 0.81695556640625, + "grad_norm": 17.78474998474121, + "learning_rate": 9.00939398708299e-06, + "loss": 4.9904, + "step": 40155 + }, + { + "epoch": 0.8170572916666666, + "grad_norm": 18.377307891845703, + "learning_rate": 9.00915517230065e-06, + "loss": 4.8685, + "step": 40160 + }, + { + "epoch": 0.8171590169270834, + "grad_norm": 21.337543487548828, + "learning_rate": 9.008916331901078e-06, + "loss": 5.2975, + "step": 40165 + }, + { + "epoch": 0.8172607421875, + "grad_norm": 20.804595947265625, + "learning_rate": 9.008677465885806e-06, + "loss": 4.9912, + "step": 40170 + }, + { + "epoch": 0.8173624674479166, + "grad_norm": 16.88298225402832, + "learning_rate": 9.008438574256357e-06, + "loss": 4.9482, + "step": 40175 + }, + { + "epoch": 0.8174641927083334, + "grad_norm": 27.434934616088867, + "learning_rate": 9.008199657014257e-06, + "loss": 5.4111, + "step": 40180 + }, + { + "epoch": 0.81756591796875, + "grad_norm": 20.551834106445312, + "learning_rate": 9.007960714161033e-06, + "loss": 5.2299, + "step": 40185 + }, + { + "epoch": 0.8176676432291666, + "grad_norm": 18.06477928161621, + "learning_rate": 9.007721745698214e-06, + "loss": 5.1254, + "step": 40190 + }, + { + "epoch": 0.8177693684895834, + "grad_norm": 18.44793128967285, + "learning_rate": 9.007482751627324e-06, + "loss": 4.8214, + "step": 40195 + }, + { + "epoch": 0.81787109375, + "grad_norm": 15.40019702911377, + "learning_rate": 9.007243731949893e-06, + "loss": 5.1146, + "step": 40200 + }, + { + "epoch": 0.8179728190104166, + "grad_norm": 16.88304901123047, + "learning_rate": 9.007004686667445e-06, + "loss": 5.1827, + "step": 40205 + }, + { + "epoch": 0.8180745442708334, + "grad_norm": 16.294750213623047, + "learning_rate": 9.00676561578151e-06, + "loss": 5.1565, + "step": 40210 + }, + { + "epoch": 0.81817626953125, + "grad_norm": 19.308246612548828, + "learning_rate": 9.006526519293615e-06, + "loss": 4.7489, + "step": 40215 + }, + { + "epoch": 0.8182779947916666, + "grad_norm": 15.1412992477417, + "learning_rate": 9.006287397205286e-06, + "loss": 5.1537, + "step": 40220 + }, + { + "epoch": 0.8183797200520834, + "grad_norm": 16.862722396850586, + "learning_rate": 9.006048249518052e-06, + "loss": 5.0428, + "step": 40225 + }, + { + "epoch": 0.8184814453125, + "grad_norm": 16.142473220825195, + "learning_rate": 9.005809076233442e-06, + "loss": 4.6909, + "step": 40230 + }, + { + "epoch": 0.8185831705729166, + "grad_norm": 19.28382110595703, + "learning_rate": 9.005569877352984e-06, + "loss": 5.1434, + "step": 40235 + }, + { + "epoch": 0.8186848958333334, + "grad_norm": 18.486711502075195, + "learning_rate": 9.005330652878207e-06, + "loss": 4.9219, + "step": 40240 + }, + { + "epoch": 0.81878662109375, + "grad_norm": 17.66242790222168, + "learning_rate": 9.005091402810638e-06, + "loss": 5.2156, + "step": 40245 + }, + { + "epoch": 0.8188883463541666, + "grad_norm": 16.285926818847656, + "learning_rate": 9.004852127151805e-06, + "loss": 5.2798, + "step": 40250 + }, + { + "epoch": 0.8189900716145834, + "grad_norm": 14.551901817321777, + "learning_rate": 9.004612825903239e-06, + "loss": 4.8723, + "step": 40255 + }, + { + "epoch": 0.819091796875, + "grad_norm": 16.301544189453125, + "learning_rate": 9.004373499066469e-06, + "loss": 4.8146, + "step": 40260 + }, + { + "epoch": 0.8191935221354166, + "grad_norm": 14.44806957244873, + "learning_rate": 9.004134146643022e-06, + "loss": 4.9722, + "step": 40265 + }, + { + "epoch": 0.8192952473958334, + "grad_norm": 15.80798625946045, + "learning_rate": 9.003894768634431e-06, + "loss": 5.1718, + "step": 40270 + }, + { + "epoch": 0.81939697265625, + "grad_norm": 18.351221084594727, + "learning_rate": 9.003655365042222e-06, + "loss": 5.1336, + "step": 40275 + }, + { + "epoch": 0.8194986979166666, + "grad_norm": 20.766334533691406, + "learning_rate": 9.003415935867925e-06, + "loss": 5.2757, + "step": 40280 + }, + { + "epoch": 0.8196004231770834, + "grad_norm": 17.65115737915039, + "learning_rate": 9.003176481113074e-06, + "loss": 5.233, + "step": 40285 + }, + { + "epoch": 0.8197021484375, + "grad_norm": 13.95207405090332, + "learning_rate": 9.002937000779193e-06, + "loss": 5.2606, + "step": 40290 + }, + { + "epoch": 0.8198038736979166, + "grad_norm": 17.100679397583008, + "learning_rate": 9.002697494867818e-06, + "loss": 4.9244, + "step": 40295 + }, + { + "epoch": 0.8199055989583334, + "grad_norm": 15.286942481994629, + "learning_rate": 9.002457963380476e-06, + "loss": 5.1197, + "step": 40300 + }, + { + "epoch": 0.82000732421875, + "grad_norm": 18.514986038208008, + "learning_rate": 9.002218406318697e-06, + "loss": 5.2786, + "step": 40305 + }, + { + "epoch": 0.8201090494791666, + "grad_norm": 15.662779808044434, + "learning_rate": 9.001978823684012e-06, + "loss": 5.4064, + "step": 40310 + }, + { + "epoch": 0.8202107747395834, + "grad_norm": 16.297271728515625, + "learning_rate": 9.001739215477955e-06, + "loss": 5.1378, + "step": 40315 + }, + { + "epoch": 0.8203125, + "grad_norm": 16.013076782226562, + "learning_rate": 9.001499581702054e-06, + "loss": 4.9296, + "step": 40320 + }, + { + "epoch": 0.8204142252604166, + "grad_norm": 18.211015701293945, + "learning_rate": 9.001259922357842e-06, + "loss": 4.8203, + "step": 40325 + }, + { + "epoch": 0.8205159505208334, + "grad_norm": 18.09383201599121, + "learning_rate": 9.001020237446847e-06, + "loss": 4.8943, + "step": 40330 + }, + { + "epoch": 0.82061767578125, + "grad_norm": 20.62177276611328, + "learning_rate": 9.000780526970603e-06, + "loss": 5.1423, + "step": 40335 + }, + { + "epoch": 0.8207194010416666, + "grad_norm": 17.937091827392578, + "learning_rate": 9.000540790930642e-06, + "loss": 5.086, + "step": 40340 + }, + { + "epoch": 0.8208211263020834, + "grad_norm": 18.25832748413086, + "learning_rate": 9.000301029328496e-06, + "loss": 5.2011, + "step": 40345 + }, + { + "epoch": 0.8209228515625, + "grad_norm": 18.436946868896484, + "learning_rate": 9.000061242165697e-06, + "loss": 4.9147, + "step": 40350 + }, + { + "epoch": 0.8210245768229166, + "grad_norm": 19.096445083618164, + "learning_rate": 8.999821429443774e-06, + "loss": 4.968, + "step": 40355 + }, + { + "epoch": 0.8211263020833334, + "grad_norm": 17.8380069732666, + "learning_rate": 8.999581591164264e-06, + "loss": 5.1039, + "step": 40360 + }, + { + "epoch": 0.82122802734375, + "grad_norm": 16.815919876098633, + "learning_rate": 8.999341727328695e-06, + "loss": 5.071, + "step": 40365 + }, + { + "epoch": 0.8213297526041666, + "grad_norm": 15.774674415588379, + "learning_rate": 8.999101837938605e-06, + "loss": 5.0428, + "step": 40370 + }, + { + "epoch": 0.8214314778645834, + "grad_norm": 19.62752914428711, + "learning_rate": 8.99886192299552e-06, + "loss": 4.7234, + "step": 40375 + }, + { + "epoch": 0.821533203125, + "grad_norm": 16.284299850463867, + "learning_rate": 8.99862198250098e-06, + "loss": 5.0653, + "step": 40380 + }, + { + "epoch": 0.8216349283854166, + "grad_norm": 14.677477836608887, + "learning_rate": 8.998382016456513e-06, + "loss": 4.9679, + "step": 40385 + }, + { + "epoch": 0.8217366536458334, + "grad_norm": 20.469886779785156, + "learning_rate": 8.998142024863654e-06, + "loss": 4.9143, + "step": 40390 + }, + { + "epoch": 0.82183837890625, + "grad_norm": 17.180540084838867, + "learning_rate": 8.997902007723937e-06, + "loss": 4.8885, + "step": 40395 + }, + { + "epoch": 0.8219401041666666, + "grad_norm": 17.575040817260742, + "learning_rate": 8.997661965038896e-06, + "loss": 5.0695, + "step": 40400 + }, + { + "epoch": 0.8220418294270834, + "grad_norm": 19.7827205657959, + "learning_rate": 8.997421896810064e-06, + "loss": 5.1362, + "step": 40405 + }, + { + "epoch": 0.8221435546875, + "grad_norm": 18.04209327697754, + "learning_rate": 8.997181803038975e-06, + "loss": 4.7798, + "step": 40410 + }, + { + "epoch": 0.8222452799479166, + "grad_norm": 18.898754119873047, + "learning_rate": 8.996941683727162e-06, + "loss": 5.0915, + "step": 40415 + }, + { + "epoch": 0.8223470052083334, + "grad_norm": 23.06604766845703, + "learning_rate": 8.996701538876163e-06, + "loss": 5.0641, + "step": 40420 + }, + { + "epoch": 0.82244873046875, + "grad_norm": 19.17162322998047, + "learning_rate": 8.996461368487509e-06, + "loss": 4.9956, + "step": 40425 + }, + { + "epoch": 0.8225504557291666, + "grad_norm": 17.886638641357422, + "learning_rate": 8.996221172562734e-06, + "loss": 5.0181, + "step": 40430 + }, + { + "epoch": 0.8226521809895834, + "grad_norm": 21.553525924682617, + "learning_rate": 8.995980951103373e-06, + "loss": 5.0032, + "step": 40435 + }, + { + "epoch": 0.82275390625, + "grad_norm": 14.997940063476562, + "learning_rate": 8.995740704110964e-06, + "loss": 5.0577, + "step": 40440 + }, + { + "epoch": 0.8228556315104166, + "grad_norm": 14.370160102844238, + "learning_rate": 8.995500431587041e-06, + "loss": 4.8388, + "step": 40445 + }, + { + "epoch": 0.8229573567708334, + "grad_norm": 26.973121643066406, + "learning_rate": 8.995260133533139e-06, + "loss": 4.9895, + "step": 40450 + }, + { + "epoch": 0.82305908203125, + "grad_norm": 17.001707077026367, + "learning_rate": 8.995019809950792e-06, + "loss": 5.114, + "step": 40455 + }, + { + "epoch": 0.8231608072916666, + "grad_norm": 15.96695327758789, + "learning_rate": 8.994779460841538e-06, + "loss": 5.1264, + "step": 40460 + }, + { + "epoch": 0.8232625325520834, + "grad_norm": 18.063766479492188, + "learning_rate": 8.994539086206909e-06, + "loss": 5.122, + "step": 40465 + }, + { + "epoch": 0.8233642578125, + "grad_norm": 16.534557342529297, + "learning_rate": 8.994298686048444e-06, + "loss": 5.06, + "step": 40470 + }, + { + "epoch": 0.8234659830729166, + "grad_norm": 17.719886779785156, + "learning_rate": 8.99405826036768e-06, + "loss": 5.0891, + "step": 40475 + }, + { + "epoch": 0.8235677083333334, + "grad_norm": 14.389893531799316, + "learning_rate": 8.99381780916615e-06, + "loss": 4.985, + "step": 40480 + }, + { + "epoch": 0.82366943359375, + "grad_norm": 14.66081428527832, + "learning_rate": 8.993577332445392e-06, + "loss": 4.9634, + "step": 40485 + }, + { + "epoch": 0.8237711588541666, + "grad_norm": 20.87640953063965, + "learning_rate": 8.993336830206942e-06, + "loss": 5.1663, + "step": 40490 + }, + { + "epoch": 0.8238728841145834, + "grad_norm": 24.66372299194336, + "learning_rate": 8.99309630245234e-06, + "loss": 4.9459, + "step": 40495 + }, + { + "epoch": 0.823974609375, + "grad_norm": 13.8410005569458, + "learning_rate": 8.992855749183119e-06, + "loss": 4.9295, + "step": 40500 + }, + { + "epoch": 0.8240763346354166, + "grad_norm": 14.93869686126709, + "learning_rate": 8.992615170400815e-06, + "loss": 4.9062, + "step": 40505 + }, + { + "epoch": 0.8241780598958334, + "grad_norm": 39.393882751464844, + "learning_rate": 8.99237456610697e-06, + "loss": 5.0746, + "step": 40510 + }, + { + "epoch": 0.82427978515625, + "grad_norm": 21.064212799072266, + "learning_rate": 8.992133936303118e-06, + "loss": 5.1593, + "step": 40515 + }, + { + "epoch": 0.8243815104166666, + "grad_norm": 22.947673797607422, + "learning_rate": 8.991893280990797e-06, + "loss": 5.5109, + "step": 40520 + }, + { + "epoch": 0.8244832356770834, + "grad_norm": 15.159880638122559, + "learning_rate": 8.991652600171545e-06, + "loss": 4.7968, + "step": 40525 + }, + { + "epoch": 0.8245849609375, + "grad_norm": 15.057247161865234, + "learning_rate": 8.991411893846901e-06, + "loss": 5.2086, + "step": 40530 + }, + { + "epoch": 0.8246866861979166, + "grad_norm": 17.72362518310547, + "learning_rate": 8.991171162018402e-06, + "loss": 5.1107, + "step": 40535 + }, + { + "epoch": 0.8247884114583334, + "grad_norm": 13.359354019165039, + "learning_rate": 8.990930404687586e-06, + "loss": 5.1492, + "step": 40540 + }, + { + "epoch": 0.82489013671875, + "grad_norm": 19.35919761657715, + "learning_rate": 8.99068962185599e-06, + "loss": 4.8714, + "step": 40545 + }, + { + "epoch": 0.8249918619791666, + "grad_norm": 19.285308837890625, + "learning_rate": 8.990448813525156e-06, + "loss": 5.0217, + "step": 40550 + }, + { + "epoch": 0.8250935872395834, + "grad_norm": 20.731489181518555, + "learning_rate": 8.990207979696621e-06, + "loss": 5.0738, + "step": 40555 + }, + { + "epoch": 0.8251953125, + "grad_norm": 14.086581230163574, + "learning_rate": 8.989967120371923e-06, + "loss": 5.1616, + "step": 40560 + }, + { + "epoch": 0.8252970377604166, + "grad_norm": 19.382375717163086, + "learning_rate": 8.989726235552603e-06, + "loss": 5.2139, + "step": 40565 + }, + { + "epoch": 0.8253987630208334, + "grad_norm": 12.645767211914062, + "learning_rate": 8.989485325240197e-06, + "loss": 4.9459, + "step": 40570 + }, + { + "epoch": 0.82550048828125, + "grad_norm": 28.356443405151367, + "learning_rate": 8.989244389436248e-06, + "loss": 5.1971, + "step": 40575 + }, + { + "epoch": 0.8256022135416666, + "grad_norm": 21.89112663269043, + "learning_rate": 8.989003428142293e-06, + "loss": 4.9825, + "step": 40580 + }, + { + "epoch": 0.8257039388020834, + "grad_norm": 18.66221809387207, + "learning_rate": 8.98876244135987e-06, + "loss": 5.0444, + "step": 40585 + }, + { + "epoch": 0.8258056640625, + "grad_norm": 18.795530319213867, + "learning_rate": 8.988521429090524e-06, + "loss": 4.7572, + "step": 40590 + }, + { + "epoch": 0.8259073893229166, + "grad_norm": 23.089378356933594, + "learning_rate": 8.988280391335791e-06, + "loss": 4.9594, + "step": 40595 + }, + { + "epoch": 0.8260091145833334, + "grad_norm": 14.489564895629883, + "learning_rate": 8.988039328097214e-06, + "loss": 5.1749, + "step": 40600 + }, + { + "epoch": 0.82611083984375, + "grad_norm": 18.433351516723633, + "learning_rate": 8.987798239376329e-06, + "loss": 5.1288, + "step": 40605 + }, + { + "epoch": 0.8262125651041666, + "grad_norm": 24.931039810180664, + "learning_rate": 8.98755712517468e-06, + "loss": 5.1301, + "step": 40610 + }, + { + "epoch": 0.8263142903645834, + "grad_norm": 17.351585388183594, + "learning_rate": 8.987315985493807e-06, + "loss": 4.9144, + "step": 40615 + }, + { + "epoch": 0.826416015625, + "grad_norm": 15.995537757873535, + "learning_rate": 8.987074820335253e-06, + "loss": 5.1493, + "step": 40620 + }, + { + "epoch": 0.8265177408854166, + "grad_norm": 19.679357528686523, + "learning_rate": 8.986833629700553e-06, + "loss": 5.1214, + "step": 40625 + }, + { + "epoch": 0.8266194661458334, + "grad_norm": 14.825810432434082, + "learning_rate": 8.986592413591254e-06, + "loss": 5.128, + "step": 40630 + }, + { + "epoch": 0.82672119140625, + "grad_norm": 16.13737678527832, + "learning_rate": 8.986351172008893e-06, + "loss": 5.1349, + "step": 40635 + }, + { + "epoch": 0.8268229166666666, + "grad_norm": 17.048973083496094, + "learning_rate": 8.986109904955017e-06, + "loss": 5.0009, + "step": 40640 + }, + { + "epoch": 0.8269246419270834, + "grad_norm": 17.120840072631836, + "learning_rate": 8.98586861243116e-06, + "loss": 5.1762, + "step": 40645 + }, + { + "epoch": 0.8270263671875, + "grad_norm": 22.369640350341797, + "learning_rate": 8.98562729443887e-06, + "loss": 5.0558, + "step": 40650 + }, + { + "epoch": 0.8271280924479166, + "grad_norm": 17.46108627319336, + "learning_rate": 8.985385950979686e-06, + "loss": 4.946, + "step": 40655 + }, + { + "epoch": 0.8272298177083334, + "grad_norm": 13.874532699584961, + "learning_rate": 8.985144582055153e-06, + "loss": 5.0988, + "step": 40660 + }, + { + "epoch": 0.82733154296875, + "grad_norm": 17.203838348388672, + "learning_rate": 8.984903187666808e-06, + "loss": 4.9986, + "step": 40665 + }, + { + "epoch": 0.8274332682291666, + "grad_norm": 17.78472328186035, + "learning_rate": 8.984661767816199e-06, + "loss": 4.8673, + "step": 40670 + }, + { + "epoch": 0.8275349934895834, + "grad_norm": 31.462230682373047, + "learning_rate": 8.984420322504864e-06, + "loss": 4.9161, + "step": 40675 + }, + { + "epoch": 0.82763671875, + "grad_norm": 15.746337890625, + "learning_rate": 8.98417885173435e-06, + "loss": 5.1171, + "step": 40680 + }, + { + "epoch": 0.8277384440104166, + "grad_norm": 16.282569885253906, + "learning_rate": 8.983937355506196e-06, + "loss": 5.1333, + "step": 40685 + }, + { + "epoch": 0.8278401692708334, + "grad_norm": 20.889686584472656, + "learning_rate": 8.983695833821949e-06, + "loss": 5.052, + "step": 40690 + }, + { + "epoch": 0.82794189453125, + "grad_norm": 14.963558197021484, + "learning_rate": 8.983454286683148e-06, + "loss": 5.0217, + "step": 40695 + }, + { + "epoch": 0.8280436197916666, + "grad_norm": 18.230289459228516, + "learning_rate": 8.98321271409134e-06, + "loss": 5.071, + "step": 40700 + }, + { + "epoch": 0.8281453450520834, + "grad_norm": 21.638507843017578, + "learning_rate": 8.982971116048068e-06, + "loss": 4.9642, + "step": 40705 + }, + { + "epoch": 0.8282470703125, + "grad_norm": 20.86768913269043, + "learning_rate": 8.982729492554875e-06, + "loss": 4.9255, + "step": 40710 + }, + { + "epoch": 0.8283487955729166, + "grad_norm": 18.845884323120117, + "learning_rate": 8.982487843613303e-06, + "loss": 5.1253, + "step": 40715 + }, + { + "epoch": 0.8284505208333334, + "grad_norm": 14.582634925842285, + "learning_rate": 8.9822461692249e-06, + "loss": 5.2015, + "step": 40720 + }, + { + "epoch": 0.82855224609375, + "grad_norm": 18.094125747680664, + "learning_rate": 8.982004469391206e-06, + "loss": 5.3205, + "step": 40725 + }, + { + "epoch": 0.8286539713541666, + "grad_norm": 23.551603317260742, + "learning_rate": 8.981762744113769e-06, + "loss": 5.0054, + "step": 40730 + }, + { + "epoch": 0.8287556966145834, + "grad_norm": 17.356508255004883, + "learning_rate": 8.98152099339413e-06, + "loss": 5.0194, + "step": 40735 + }, + { + "epoch": 0.828857421875, + "grad_norm": 16.611846923828125, + "learning_rate": 8.981279217233838e-06, + "loss": 5.0727, + "step": 40740 + }, + { + "epoch": 0.8289591471354166, + "grad_norm": 20.19108009338379, + "learning_rate": 8.981037415634434e-06, + "loss": 5.013, + "step": 40745 + }, + { + "epoch": 0.8290608723958334, + "grad_norm": 24.5153865814209, + "learning_rate": 8.980795588597466e-06, + "loss": 5.0429, + "step": 40750 + }, + { + "epoch": 0.82916259765625, + "grad_norm": 16.974191665649414, + "learning_rate": 8.980553736124477e-06, + "loss": 4.9795, + "step": 40755 + }, + { + "epoch": 0.8292643229166666, + "grad_norm": 17.818017959594727, + "learning_rate": 8.980311858217014e-06, + "loss": 4.8606, + "step": 40760 + }, + { + "epoch": 0.8293660481770834, + "grad_norm": 18.46076774597168, + "learning_rate": 8.98006995487662e-06, + "loss": 4.95, + "step": 40765 + }, + { + "epoch": 0.8294677734375, + "grad_norm": 17.286836624145508, + "learning_rate": 8.979828026104844e-06, + "loss": 5.1129, + "step": 40770 + }, + { + "epoch": 0.8295694986979166, + "grad_norm": 21.299415588378906, + "learning_rate": 8.97958607190323e-06, + "loss": 4.8892, + "step": 40775 + }, + { + "epoch": 0.8296712239583334, + "grad_norm": 13.971571922302246, + "learning_rate": 8.979344092273326e-06, + "loss": 5.0148, + "step": 40780 + }, + { + "epoch": 0.82977294921875, + "grad_norm": 17.192062377929688, + "learning_rate": 8.979102087216674e-06, + "loss": 4.7977, + "step": 40785 + }, + { + "epoch": 0.8298746744791666, + "grad_norm": 17.344501495361328, + "learning_rate": 8.978860056734824e-06, + "loss": 4.9071, + "step": 40790 + }, + { + "epoch": 0.8299763997395834, + "grad_norm": 16.825265884399414, + "learning_rate": 8.97861800082932e-06, + "loss": 5.1871, + "step": 40795 + }, + { + "epoch": 0.830078125, + "grad_norm": 14.848609924316406, + "learning_rate": 8.978375919501712e-06, + "loss": 4.9049, + "step": 40800 + }, + { + "epoch": 0.8301798502604166, + "grad_norm": 16.191526412963867, + "learning_rate": 8.978133812753542e-06, + "loss": 4.9789, + "step": 40805 + }, + { + "epoch": 0.8302815755208334, + "grad_norm": 13.751249313354492, + "learning_rate": 8.977891680586363e-06, + "loss": 5.068, + "step": 40810 + }, + { + "epoch": 0.83038330078125, + "grad_norm": 18.949129104614258, + "learning_rate": 8.977649523001717e-06, + "loss": 5.0974, + "step": 40815 + }, + { + "epoch": 0.8304850260416666, + "grad_norm": 16.20638084411621, + "learning_rate": 8.977407340001153e-06, + "loss": 4.9899, + "step": 40820 + }, + { + "epoch": 0.8305867513020834, + "grad_norm": 12.799920082092285, + "learning_rate": 8.97716513158622e-06, + "loss": 5.2043, + "step": 40825 + }, + { + "epoch": 0.8306884765625, + "grad_norm": 13.903509140014648, + "learning_rate": 8.976922897758463e-06, + "loss": 5.0578, + "step": 40830 + }, + { + "epoch": 0.8307902018229166, + "grad_norm": 16.26909637451172, + "learning_rate": 8.976680638519431e-06, + "loss": 4.9162, + "step": 40835 + }, + { + "epoch": 0.8308919270833334, + "grad_norm": 14.350013732910156, + "learning_rate": 8.976438353870673e-06, + "loss": 5.0648, + "step": 40840 + }, + { + "epoch": 0.83099365234375, + "grad_norm": 16.492156982421875, + "learning_rate": 8.976196043813736e-06, + "loss": 4.9761, + "step": 40845 + }, + { + "epoch": 0.8310953776041666, + "grad_norm": 13.028374671936035, + "learning_rate": 8.975953708350169e-06, + "loss": 5.1903, + "step": 40850 + }, + { + "epoch": 0.8311971028645834, + "grad_norm": 18.78083038330078, + "learning_rate": 8.975711347481519e-06, + "loss": 5.0703, + "step": 40855 + }, + { + "epoch": 0.831298828125, + "grad_norm": 21.106582641601562, + "learning_rate": 8.975468961209336e-06, + "loss": 5.1019, + "step": 40860 + }, + { + "epoch": 0.8314005533854166, + "grad_norm": 25.032691955566406, + "learning_rate": 8.975226549535169e-06, + "loss": 4.9391, + "step": 40865 + }, + { + "epoch": 0.8315022786458334, + "grad_norm": 13.069761276245117, + "learning_rate": 8.974984112460565e-06, + "loss": 5.0552, + "step": 40870 + }, + { + "epoch": 0.83160400390625, + "grad_norm": 17.694429397583008, + "learning_rate": 8.974741649987076e-06, + "loss": 4.9895, + "step": 40875 + }, + { + "epoch": 0.8317057291666666, + "grad_norm": 16.049591064453125, + "learning_rate": 8.974499162116247e-06, + "loss": 5.1123, + "step": 40880 + }, + { + "epoch": 0.8318074544270834, + "grad_norm": 25.293899536132812, + "learning_rate": 8.974256648849633e-06, + "loss": 5.1393, + "step": 40885 + }, + { + "epoch": 0.8319091796875, + "grad_norm": 13.196322441101074, + "learning_rate": 8.974014110188777e-06, + "loss": 5.365, + "step": 40890 + }, + { + "epoch": 0.8320109049479166, + "grad_norm": 17.457239151000977, + "learning_rate": 8.973771546135234e-06, + "loss": 5.3138, + "step": 40895 + }, + { + "epoch": 0.8321126302083334, + "grad_norm": 22.274837493896484, + "learning_rate": 8.973528956690554e-06, + "loss": 5.2074, + "step": 40900 + }, + { + "epoch": 0.83221435546875, + "grad_norm": 18.321313858032227, + "learning_rate": 8.973286341856285e-06, + "loss": 5.1421, + "step": 40905 + }, + { + "epoch": 0.8323160807291666, + "grad_norm": 16.983417510986328, + "learning_rate": 8.973043701633977e-06, + "loss": 5.0381, + "step": 40910 + }, + { + "epoch": 0.8324178059895834, + "grad_norm": 16.608177185058594, + "learning_rate": 8.972801036025181e-06, + "loss": 5.0154, + "step": 40915 + }, + { + "epoch": 0.83251953125, + "grad_norm": 16.46457290649414, + "learning_rate": 8.972558345031446e-06, + "loss": 5.3962, + "step": 40920 + }, + { + "epoch": 0.8326212565104166, + "grad_norm": 22.193023681640625, + "learning_rate": 8.972315628654324e-06, + "loss": 5.2512, + "step": 40925 + }, + { + "epoch": 0.8327229817708334, + "grad_norm": 18.393592834472656, + "learning_rate": 8.972072886895368e-06, + "loss": 5.2523, + "step": 40930 + }, + { + "epoch": 0.83282470703125, + "grad_norm": 33.11868667602539, + "learning_rate": 8.971830119756127e-06, + "loss": 4.9974, + "step": 40935 + }, + { + "epoch": 0.8329264322916666, + "grad_norm": 22.255477905273438, + "learning_rate": 8.971587327238152e-06, + "loss": 5.0668, + "step": 40940 + }, + { + "epoch": 0.8330281575520834, + "grad_norm": 15.306065559387207, + "learning_rate": 8.971344509342994e-06, + "loss": 4.9296, + "step": 40945 + }, + { + "epoch": 0.8331298828125, + "grad_norm": 24.707101821899414, + "learning_rate": 8.971101666072206e-06, + "loss": 4.9885, + "step": 40950 + }, + { + "epoch": 0.8332316080729166, + "grad_norm": 15.997750282287598, + "learning_rate": 8.97085879742734e-06, + "loss": 5.0478, + "step": 40955 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 20.970064163208008, + "learning_rate": 8.970615903409945e-06, + "loss": 5.1008, + "step": 40960 + }, + { + "epoch": 0.83343505859375, + "grad_norm": 22.55337905883789, + "learning_rate": 8.970372984021576e-06, + "loss": 5.0915, + "step": 40965 + }, + { + "epoch": 0.8335367838541666, + "grad_norm": 14.682839393615723, + "learning_rate": 8.970130039263783e-06, + "loss": 5.3238, + "step": 40970 + }, + { + "epoch": 0.8336385091145834, + "grad_norm": 14.605016708374023, + "learning_rate": 8.96988706913812e-06, + "loss": 4.7745, + "step": 40975 + }, + { + "epoch": 0.833740234375, + "grad_norm": 20.63225746154785, + "learning_rate": 8.96964407364614e-06, + "loss": 4.8026, + "step": 40980 + }, + { + "epoch": 0.8338419596354166, + "grad_norm": 15.875832557678223, + "learning_rate": 8.969401052789392e-06, + "loss": 5.2354, + "step": 40985 + }, + { + "epoch": 0.8339436848958334, + "grad_norm": 17.1268253326416, + "learning_rate": 8.969158006569434e-06, + "loss": 4.8699, + "step": 40990 + }, + { + "epoch": 0.83404541015625, + "grad_norm": 13.267155647277832, + "learning_rate": 8.968914934987813e-06, + "loss": 5.2297, + "step": 40995 + }, + { + "epoch": 0.8341471354166666, + "grad_norm": 20.139671325683594, + "learning_rate": 8.968671838046089e-06, + "loss": 5.0247, + "step": 41000 + }, + { + "epoch": 0.8342488606770834, + "grad_norm": 20.768198013305664, + "learning_rate": 8.968428715745809e-06, + "loss": 4.8428, + "step": 41005 + }, + { + "epoch": 0.8343505859375, + "grad_norm": 17.167570114135742, + "learning_rate": 8.96818556808853e-06, + "loss": 4.9446, + "step": 41010 + }, + { + "epoch": 0.8344523111979166, + "grad_norm": 27.720643997192383, + "learning_rate": 8.967942395075807e-06, + "loss": 5.2157, + "step": 41015 + }, + { + "epoch": 0.8345540364583334, + "grad_norm": 15.220477104187012, + "learning_rate": 8.96769919670919e-06, + "loss": 4.9955, + "step": 41020 + }, + { + "epoch": 0.83465576171875, + "grad_norm": 13.182517051696777, + "learning_rate": 8.967455972990233e-06, + "loss": 4.8932, + "step": 41025 + }, + { + "epoch": 0.8347574869791666, + "grad_norm": 16.43160629272461, + "learning_rate": 8.967212723920494e-06, + "loss": 4.9525, + "step": 41030 + }, + { + "epoch": 0.8348592122395834, + "grad_norm": 19.733110427856445, + "learning_rate": 8.966969449501524e-06, + "loss": 5.0447, + "step": 41035 + }, + { + "epoch": 0.8349609375, + "grad_norm": 15.94453239440918, + "learning_rate": 8.96672614973488e-06, + "loss": 5.1888, + "step": 41040 + }, + { + "epoch": 0.8350626627604166, + "grad_norm": 16.83171844482422, + "learning_rate": 8.966482824622114e-06, + "loss": 5.1745, + "step": 41045 + }, + { + "epoch": 0.8351643880208334, + "grad_norm": 16.287630081176758, + "learning_rate": 8.966239474164782e-06, + "loss": 5.0364, + "step": 41050 + }, + { + "epoch": 0.83526611328125, + "grad_norm": 13.453579902648926, + "learning_rate": 8.965996098364438e-06, + "loss": 5.2151, + "step": 41055 + }, + { + "epoch": 0.8353678385416666, + "grad_norm": 13.746821403503418, + "learning_rate": 8.96575269722264e-06, + "loss": 5.048, + "step": 41060 + }, + { + "epoch": 0.8354695638020834, + "grad_norm": 15.9334716796875, + "learning_rate": 8.96550927074094e-06, + "loss": 4.9579, + "step": 41065 + }, + { + "epoch": 0.8355712890625, + "grad_norm": 16.277488708496094, + "learning_rate": 8.965265818920894e-06, + "loss": 5.0742, + "step": 41070 + }, + { + "epoch": 0.8356730143229166, + "grad_norm": 26.552757263183594, + "learning_rate": 8.965022341764059e-06, + "loss": 4.8849, + "step": 41075 + }, + { + "epoch": 0.8357747395833334, + "grad_norm": 25.100034713745117, + "learning_rate": 8.964778839271988e-06, + "loss": 5.1683, + "step": 41080 + }, + { + "epoch": 0.83587646484375, + "grad_norm": 17.604358673095703, + "learning_rate": 8.96453531144624e-06, + "loss": 4.9866, + "step": 41085 + }, + { + "epoch": 0.8359781901041666, + "grad_norm": 18.004362106323242, + "learning_rate": 8.96429175828837e-06, + "loss": 5.036, + "step": 41090 + }, + { + "epoch": 0.8360799153645834, + "grad_norm": 16.196619033813477, + "learning_rate": 8.964048179799935e-06, + "loss": 5.1462, + "step": 41095 + }, + { + "epoch": 0.836181640625, + "grad_norm": 21.131315231323242, + "learning_rate": 8.96380457598249e-06, + "loss": 5.2289, + "step": 41100 + }, + { + "epoch": 0.8362833658854166, + "grad_norm": 18.82535171508789, + "learning_rate": 8.963560946837592e-06, + "loss": 5.0332, + "step": 41105 + }, + { + "epoch": 0.8363850911458334, + "grad_norm": 24.673221588134766, + "learning_rate": 8.963317292366797e-06, + "loss": 5.1002, + "step": 41110 + }, + { + "epoch": 0.83648681640625, + "grad_norm": 20.559215545654297, + "learning_rate": 8.963073612571664e-06, + "loss": 4.8283, + "step": 41115 + }, + { + "epoch": 0.8365885416666666, + "grad_norm": 13.928614616394043, + "learning_rate": 8.962829907453748e-06, + "loss": 5.2796, + "step": 41120 + }, + { + "epoch": 0.8366902669270834, + "grad_norm": 15.442462921142578, + "learning_rate": 8.962586177014607e-06, + "loss": 5.1647, + "step": 41125 + }, + { + "epoch": 0.8367919921875, + "grad_norm": 16.61680793762207, + "learning_rate": 8.962342421255797e-06, + "loss": 4.9851, + "step": 41130 + }, + { + "epoch": 0.8368937174479166, + "grad_norm": 17.273645401000977, + "learning_rate": 8.962098640178878e-06, + "loss": 4.9951, + "step": 41135 + }, + { + "epoch": 0.8369954427083334, + "grad_norm": 13.699993133544922, + "learning_rate": 8.961854833785407e-06, + "loss": 4.9815, + "step": 41140 + }, + { + "epoch": 0.83709716796875, + "grad_norm": 20.70905113220215, + "learning_rate": 8.96161100207694e-06, + "loss": 5.0474, + "step": 41145 + }, + { + "epoch": 0.8371988932291666, + "grad_norm": 18.28776741027832, + "learning_rate": 8.961367145055036e-06, + "loss": 5.1717, + "step": 41150 + }, + { + "epoch": 0.8373006184895834, + "grad_norm": 21.353313446044922, + "learning_rate": 8.961123262721254e-06, + "loss": 5.0681, + "step": 41155 + }, + { + "epoch": 0.83740234375, + "grad_norm": 20.28192138671875, + "learning_rate": 8.960879355077152e-06, + "loss": 4.9387, + "step": 41160 + }, + { + "epoch": 0.8375040690104166, + "grad_norm": 17.254362106323242, + "learning_rate": 8.960635422124288e-06, + "loss": 5.1463, + "step": 41165 + }, + { + "epoch": 0.8376057942708334, + "grad_norm": 13.40074348449707, + "learning_rate": 8.96039146386422e-06, + "loss": 5.5421, + "step": 41170 + }, + { + "epoch": 0.83770751953125, + "grad_norm": 18.97890281677246, + "learning_rate": 8.96014748029851e-06, + "loss": 4.9046, + "step": 41175 + }, + { + "epoch": 0.8378092447916666, + "grad_norm": 25.04377555847168, + "learning_rate": 8.959903471428713e-06, + "loss": 5.1875, + "step": 41180 + }, + { + "epoch": 0.8379109700520834, + "grad_norm": 20.645944595336914, + "learning_rate": 8.959659437256391e-06, + "loss": 5.0615, + "step": 41185 + }, + { + "epoch": 0.8380126953125, + "grad_norm": 15.502013206481934, + "learning_rate": 8.959415377783102e-06, + "loss": 4.939, + "step": 41190 + }, + { + "epoch": 0.8381144205729166, + "grad_norm": 16.82326889038086, + "learning_rate": 8.959171293010405e-06, + "loss": 4.7725, + "step": 41195 + }, + { + "epoch": 0.8382161458333334, + "grad_norm": 15.798381805419922, + "learning_rate": 8.95892718293986e-06, + "loss": 5.0244, + "step": 41200 + }, + { + "epoch": 0.83831787109375, + "grad_norm": 16.318408966064453, + "learning_rate": 8.958683047573027e-06, + "loss": 5.0984, + "step": 41205 + }, + { + "epoch": 0.8384195963541666, + "grad_norm": 17.109102249145508, + "learning_rate": 8.958438886911466e-06, + "loss": 5.1314, + "step": 41210 + }, + { + "epoch": 0.8385213216145834, + "grad_norm": 17.882722854614258, + "learning_rate": 8.958194700956739e-06, + "loss": 5.1007, + "step": 41215 + }, + { + "epoch": 0.838623046875, + "grad_norm": 18.348669052124023, + "learning_rate": 8.957950489710402e-06, + "loss": 5.2727, + "step": 41220 + }, + { + "epoch": 0.8387247721354166, + "grad_norm": 12.908005714416504, + "learning_rate": 8.957706253174018e-06, + "loss": 5.1512, + "step": 41225 + }, + { + "epoch": 0.8388264973958334, + "grad_norm": 19.023452758789062, + "learning_rate": 8.957461991349149e-06, + "loss": 4.7023, + "step": 41230 + }, + { + "epoch": 0.83892822265625, + "grad_norm": 15.490195274353027, + "learning_rate": 8.957217704237352e-06, + "loss": 5.0697, + "step": 41235 + }, + { + "epoch": 0.8390299479166666, + "grad_norm": 14.568397521972656, + "learning_rate": 8.956973391840193e-06, + "loss": 5.2866, + "step": 41240 + }, + { + "epoch": 0.8391316731770834, + "grad_norm": 13.708386421203613, + "learning_rate": 8.956729054159229e-06, + "loss": 5.0366, + "step": 41245 + }, + { + "epoch": 0.8392333984375, + "grad_norm": 18.64493751525879, + "learning_rate": 8.956484691196021e-06, + "loss": 5.0031, + "step": 41250 + }, + { + "epoch": 0.8393351236979166, + "grad_norm": 15.984405517578125, + "learning_rate": 8.956240302952133e-06, + "loss": 4.8106, + "step": 41255 + }, + { + "epoch": 0.8394368489583334, + "grad_norm": 16.922090530395508, + "learning_rate": 8.955995889429125e-06, + "loss": 5.0186, + "step": 41260 + }, + { + "epoch": 0.83953857421875, + "grad_norm": 14.717543601989746, + "learning_rate": 8.955751450628557e-06, + "loss": 4.9674, + "step": 41265 + }, + { + "epoch": 0.8396402994791666, + "grad_norm": 13.631390571594238, + "learning_rate": 8.955506986551997e-06, + "loss": 5.2505, + "step": 41270 + }, + { + "epoch": 0.8397420247395834, + "grad_norm": 16.764888763427734, + "learning_rate": 8.955262497201001e-06, + "loss": 4.9945, + "step": 41275 + }, + { + "epoch": 0.83984375, + "grad_norm": 14.17052936553955, + "learning_rate": 8.955017982577133e-06, + "loss": 5.1005, + "step": 41280 + }, + { + "epoch": 0.8399454752604166, + "grad_norm": 12.250419616699219, + "learning_rate": 8.954773442681955e-06, + "loss": 5.1137, + "step": 41285 + }, + { + "epoch": 0.8400472005208334, + "grad_norm": 13.303406715393066, + "learning_rate": 8.954528877517031e-06, + "loss": 4.9223, + "step": 41290 + }, + { + "epoch": 0.84014892578125, + "grad_norm": 19.201923370361328, + "learning_rate": 8.954284287083923e-06, + "loss": 5.5676, + "step": 41295 + }, + { + "epoch": 0.8402506510416666, + "grad_norm": 16.78432846069336, + "learning_rate": 8.954039671384194e-06, + "loss": 5.0887, + "step": 41300 + }, + { + "epoch": 0.8403523763020834, + "grad_norm": 14.144844055175781, + "learning_rate": 8.953795030419405e-06, + "loss": 5.0773, + "step": 41305 + }, + { + "epoch": 0.8404541015625, + "grad_norm": 16.389434814453125, + "learning_rate": 8.953550364191121e-06, + "loss": 5.0231, + "step": 41310 + }, + { + "epoch": 0.8405558268229166, + "grad_norm": 19.184871673583984, + "learning_rate": 8.953305672700907e-06, + "loss": 5.2961, + "step": 41315 + }, + { + "epoch": 0.8406575520833334, + "grad_norm": 22.172447204589844, + "learning_rate": 8.953060955950323e-06, + "loss": 5.1389, + "step": 41320 + }, + { + "epoch": 0.84075927734375, + "grad_norm": 22.2673282623291, + "learning_rate": 8.952816213940936e-06, + "loss": 5.2439, + "step": 41325 + }, + { + "epoch": 0.8408610026041666, + "grad_norm": 14.41690731048584, + "learning_rate": 8.952571446674307e-06, + "loss": 5.1307, + "step": 41330 + }, + { + "epoch": 0.8409627278645834, + "grad_norm": 20.83781623840332, + "learning_rate": 8.952326654152002e-06, + "loss": 4.7371, + "step": 41335 + }, + { + "epoch": 0.841064453125, + "grad_norm": 14.196508407592773, + "learning_rate": 8.952081836375584e-06, + "loss": 5.0411, + "step": 41340 + }, + { + "epoch": 0.8411661783854166, + "grad_norm": 21.944040298461914, + "learning_rate": 8.951836993346616e-06, + "loss": 5.2001, + "step": 41345 + }, + { + "epoch": 0.8412679036458334, + "grad_norm": 17.994277954101562, + "learning_rate": 8.951592125066667e-06, + "loss": 5.176, + "step": 41350 + }, + { + "epoch": 0.84136962890625, + "grad_norm": 16.852582931518555, + "learning_rate": 8.951347231537297e-06, + "loss": 5.1309, + "step": 41355 + }, + { + "epoch": 0.8414713541666666, + "grad_norm": 17.056751251220703, + "learning_rate": 8.951102312760072e-06, + "loss": 4.9895, + "step": 41360 + }, + { + "epoch": 0.8415730794270834, + "grad_norm": 16.72382926940918, + "learning_rate": 8.95085736873656e-06, + "loss": 4.939, + "step": 41365 + }, + { + "epoch": 0.8416748046875, + "grad_norm": 19.422950744628906, + "learning_rate": 8.950612399468322e-06, + "loss": 4.7745, + "step": 41370 + }, + { + "epoch": 0.8417765299479166, + "grad_norm": 21.931190490722656, + "learning_rate": 8.950367404956923e-06, + "loss": 4.9479, + "step": 41375 + }, + { + "epoch": 0.8418782552083334, + "grad_norm": 18.683574676513672, + "learning_rate": 8.950122385203933e-06, + "loss": 4.9563, + "step": 41380 + }, + { + "epoch": 0.84197998046875, + "grad_norm": 23.023651123046875, + "learning_rate": 8.949877340210913e-06, + "loss": 5.1146, + "step": 41385 + }, + { + "epoch": 0.8420817057291666, + "grad_norm": 16.401620864868164, + "learning_rate": 8.949632269979432e-06, + "loss": 5.105, + "step": 41390 + }, + { + "epoch": 0.8421834309895834, + "grad_norm": 15.12240219116211, + "learning_rate": 8.949387174511054e-06, + "loss": 5.1859, + "step": 41395 + }, + { + "epoch": 0.84228515625, + "grad_norm": 20.873146057128906, + "learning_rate": 8.949142053807346e-06, + "loss": 5.0693, + "step": 41400 + }, + { + "epoch": 0.8423868815104166, + "grad_norm": 15.4004487991333, + "learning_rate": 8.948896907869873e-06, + "loss": 5.2207, + "step": 41405 + }, + { + "epoch": 0.8424886067708334, + "grad_norm": 15.600560188293457, + "learning_rate": 8.948651736700203e-06, + "loss": 5.1579, + "step": 41410 + }, + { + "epoch": 0.84259033203125, + "grad_norm": 16.060644149780273, + "learning_rate": 8.948406540299902e-06, + "loss": 5.1745, + "step": 41415 + }, + { + "epoch": 0.8426920572916666, + "grad_norm": 21.633037567138672, + "learning_rate": 8.948161318670536e-06, + "loss": 5.1296, + "step": 41420 + }, + { + "epoch": 0.8427937825520834, + "grad_norm": 32.726139068603516, + "learning_rate": 8.947916071813674e-06, + "loss": 4.9143, + "step": 41425 + }, + { + "epoch": 0.8428955078125, + "grad_norm": 18.156076431274414, + "learning_rate": 8.94767079973088e-06, + "loss": 5.0477, + "step": 41430 + }, + { + "epoch": 0.8429972330729166, + "grad_norm": 22.992694854736328, + "learning_rate": 8.947425502423724e-06, + "loss": 5.2665, + "step": 41435 + }, + { + "epoch": 0.8430989583333334, + "grad_norm": 13.063825607299805, + "learning_rate": 8.94718017989377e-06, + "loss": 4.9955, + "step": 41440 + }, + { + "epoch": 0.84320068359375, + "grad_norm": 16.420665740966797, + "learning_rate": 8.94693483214259e-06, + "loss": 5.0562, + "step": 41445 + }, + { + "epoch": 0.8433024088541666, + "grad_norm": 19.821945190429688, + "learning_rate": 8.946689459171748e-06, + "loss": 5.1196, + "step": 41450 + }, + { + "epoch": 0.8434041341145834, + "grad_norm": 21.368642807006836, + "learning_rate": 8.946444060982813e-06, + "loss": 5.11, + "step": 41455 + }, + { + "epoch": 0.843505859375, + "grad_norm": 17.67949867248535, + "learning_rate": 8.946198637577355e-06, + "loss": 5.1243, + "step": 41460 + }, + { + "epoch": 0.8436075846354166, + "grad_norm": 16.45645523071289, + "learning_rate": 8.945953188956938e-06, + "loss": 5.1681, + "step": 41465 + }, + { + "epoch": 0.8437093098958334, + "grad_norm": 20.417665481567383, + "learning_rate": 8.945707715123135e-06, + "loss": 4.8495, + "step": 41470 + }, + { + "epoch": 0.84381103515625, + "grad_norm": 14.10073184967041, + "learning_rate": 8.94546221607751e-06, + "loss": 5.0191, + "step": 41475 + }, + { + "epoch": 0.8439127604166666, + "grad_norm": 21.532453536987305, + "learning_rate": 8.945216691821635e-06, + "loss": 5.0436, + "step": 41480 + }, + { + "epoch": 0.8440144856770834, + "grad_norm": 18.912334442138672, + "learning_rate": 8.944971142357079e-06, + "loss": 5.0679, + "step": 41485 + }, + { + "epoch": 0.8441162109375, + "grad_norm": 14.790243148803711, + "learning_rate": 8.944725567685409e-06, + "loss": 4.9416, + "step": 41490 + }, + { + "epoch": 0.8442179361979166, + "grad_norm": 21.132667541503906, + "learning_rate": 8.944479967808193e-06, + "loss": 5.065, + "step": 41495 + }, + { + "epoch": 0.8443196614583334, + "grad_norm": 17.348501205444336, + "learning_rate": 8.944234342727003e-06, + "loss": 5.0881, + "step": 41500 + }, + { + "epoch": 0.84442138671875, + "grad_norm": 17.676605224609375, + "learning_rate": 8.943988692443409e-06, + "loss": 5.1257, + "step": 41505 + }, + { + "epoch": 0.8445231119791666, + "grad_norm": 16.872177124023438, + "learning_rate": 8.943743016958978e-06, + "loss": 5.0998, + "step": 41510 + }, + { + "epoch": 0.8446248372395834, + "grad_norm": 25.30636215209961, + "learning_rate": 8.94349731627528e-06, + "loss": 4.9816, + "step": 41515 + }, + { + "epoch": 0.8447265625, + "grad_norm": 16.937074661254883, + "learning_rate": 8.94325159039389e-06, + "loss": 5.2955, + "step": 41520 + }, + { + "epoch": 0.8448282877604166, + "grad_norm": 19.32732391357422, + "learning_rate": 8.94300583931637e-06, + "loss": 4.8948, + "step": 41525 + }, + { + "epoch": 0.8449300130208334, + "grad_norm": 15.51845932006836, + "learning_rate": 8.942760063044295e-06, + "loss": 5.0994, + "step": 41530 + }, + { + "epoch": 0.84503173828125, + "grad_norm": 14.197003364562988, + "learning_rate": 8.942514261579235e-06, + "loss": 5.0125, + "step": 41535 + }, + { + "epoch": 0.8451334635416666, + "grad_norm": 18.03932762145996, + "learning_rate": 8.942268434922763e-06, + "loss": 5.2606, + "step": 41540 + }, + { + "epoch": 0.8452351888020834, + "grad_norm": 18.756498336791992, + "learning_rate": 8.942022583076444e-06, + "loss": 4.9262, + "step": 41545 + }, + { + "epoch": 0.8453369140625, + "grad_norm": 16.160261154174805, + "learning_rate": 8.941776706041854e-06, + "loss": 4.8785, + "step": 41550 + }, + { + "epoch": 0.8454386393229166, + "grad_norm": 13.184591293334961, + "learning_rate": 8.941530803820561e-06, + "loss": 5.184, + "step": 41555 + }, + { + "epoch": 0.8455403645833334, + "grad_norm": 13.1394624710083, + "learning_rate": 8.941284876414138e-06, + "loss": 5.0888, + "step": 41560 + }, + { + "epoch": 0.84564208984375, + "grad_norm": 15.644800186157227, + "learning_rate": 8.941038923824155e-06, + "loss": 4.9899, + "step": 41565 + }, + { + "epoch": 0.8457438151041666, + "grad_norm": 22.828840255737305, + "learning_rate": 8.940792946052185e-06, + "loss": 4.9543, + "step": 41570 + }, + { + "epoch": 0.8458455403645834, + "grad_norm": 20.823389053344727, + "learning_rate": 8.9405469430998e-06, + "loss": 4.8593, + "step": 41575 + }, + { + "epoch": 0.845947265625, + "grad_norm": 26.165868759155273, + "learning_rate": 8.940300914968569e-06, + "loss": 4.8934, + "step": 41580 + }, + { + "epoch": 0.8460489908854166, + "grad_norm": 22.81753921508789, + "learning_rate": 8.940054861660066e-06, + "loss": 5.1053, + "step": 41585 + }, + { + "epoch": 0.8461507161458334, + "grad_norm": 15.280860900878906, + "learning_rate": 8.939808783175866e-06, + "loss": 5.1885, + "step": 41590 + }, + { + "epoch": 0.84625244140625, + "grad_norm": 13.507987976074219, + "learning_rate": 8.939562679517534e-06, + "loss": 5.1418, + "step": 41595 + }, + { + "epoch": 0.8463541666666666, + "grad_norm": 13.708534240722656, + "learning_rate": 8.93931655068665e-06, + "loss": 4.9841, + "step": 41600 + }, + { + "epoch": 0.8464558919270834, + "grad_norm": 17.436508178710938, + "learning_rate": 8.939070396684784e-06, + "loss": 5.0661, + "step": 41605 + }, + { + "epoch": 0.8465576171875, + "grad_norm": 13.661314964294434, + "learning_rate": 8.938824217513508e-06, + "loss": 4.9616, + "step": 41610 + }, + { + "epoch": 0.8466593424479166, + "grad_norm": 18.845691680908203, + "learning_rate": 8.938578013174396e-06, + "loss": 5.0519, + "step": 41615 + }, + { + "epoch": 0.8467610677083334, + "grad_norm": 20.987255096435547, + "learning_rate": 8.938331783669018e-06, + "loss": 5.1525, + "step": 41620 + }, + { + "epoch": 0.84686279296875, + "grad_norm": 18.091171264648438, + "learning_rate": 8.938085528998953e-06, + "loss": 4.6543, + "step": 41625 + }, + { + "epoch": 0.8469645182291666, + "grad_norm": 15.673166275024414, + "learning_rate": 8.937839249165771e-06, + "loss": 5.1778, + "step": 41630 + }, + { + "epoch": 0.8470662434895834, + "grad_norm": 19.786134719848633, + "learning_rate": 8.937592944171047e-06, + "loss": 4.8429, + "step": 41635 + }, + { + "epoch": 0.84716796875, + "grad_norm": 14.341763496398926, + "learning_rate": 8.937346614016353e-06, + "loss": 5.2163, + "step": 41640 + }, + { + "epoch": 0.8472696940104166, + "grad_norm": 15.8727445602417, + "learning_rate": 8.937100258703264e-06, + "loss": 5.0215, + "step": 41645 + }, + { + "epoch": 0.8473714192708334, + "grad_norm": 15.38093376159668, + "learning_rate": 8.936853878233354e-06, + "loss": 4.8567, + "step": 41650 + }, + { + "epoch": 0.84747314453125, + "grad_norm": 14.319844245910645, + "learning_rate": 8.936607472608199e-06, + "loss": 5.1164, + "step": 41655 + }, + { + "epoch": 0.8475748697916666, + "grad_norm": 14.879122734069824, + "learning_rate": 8.93636104182937e-06, + "loss": 5.2413, + "step": 41660 + }, + { + "epoch": 0.8476765950520834, + "grad_norm": 16.752275466918945, + "learning_rate": 8.936114585898445e-06, + "loss": 5.1047, + "step": 41665 + }, + { + "epoch": 0.8477783203125, + "grad_norm": 20.05743980407715, + "learning_rate": 8.935868104816996e-06, + "loss": 5.1217, + "step": 41670 + }, + { + "epoch": 0.8478800455729166, + "grad_norm": 13.993396759033203, + "learning_rate": 8.935621598586599e-06, + "loss": 5.0494, + "step": 41675 + }, + { + "epoch": 0.8479817708333334, + "grad_norm": 13.13659954071045, + "learning_rate": 8.935375067208831e-06, + "loss": 4.8865, + "step": 41680 + }, + { + "epoch": 0.84808349609375, + "grad_norm": 13.17566967010498, + "learning_rate": 8.935128510685265e-06, + "loss": 5.226, + "step": 41685 + }, + { + "epoch": 0.8481852213541666, + "grad_norm": 15.76957893371582, + "learning_rate": 8.934881929017476e-06, + "loss": 5.0464, + "step": 41690 + }, + { + "epoch": 0.8482869466145834, + "grad_norm": 25.277854919433594, + "learning_rate": 8.934635322207042e-06, + "loss": 5.0104, + "step": 41695 + }, + { + "epoch": 0.848388671875, + "grad_norm": 19.935436248779297, + "learning_rate": 8.934388690255535e-06, + "loss": 4.839, + "step": 41700 + }, + { + "epoch": 0.8484903971354166, + "grad_norm": 20.403017044067383, + "learning_rate": 8.934142033164536e-06, + "loss": 5.1174, + "step": 41705 + }, + { + "epoch": 0.8485921223958334, + "grad_norm": 16.895729064941406, + "learning_rate": 8.933895350935617e-06, + "loss": 4.9898, + "step": 41710 + }, + { + "epoch": 0.84869384765625, + "grad_norm": 18.79355812072754, + "learning_rate": 8.933648643570355e-06, + "loss": 4.954, + "step": 41715 + }, + { + "epoch": 0.8487955729166666, + "grad_norm": 14.209393501281738, + "learning_rate": 8.933401911070328e-06, + "loss": 4.9186, + "step": 41720 + }, + { + "epoch": 0.8488972981770834, + "grad_norm": 21.137235641479492, + "learning_rate": 8.93315515343711e-06, + "loss": 5.2154, + "step": 41725 + }, + { + "epoch": 0.8489990234375, + "grad_norm": 16.06001091003418, + "learning_rate": 8.93290837067228e-06, + "loss": 5.1903, + "step": 41730 + }, + { + "epoch": 0.8491007486979166, + "grad_norm": 17.75275993347168, + "learning_rate": 8.932661562777415e-06, + "loss": 4.883, + "step": 41735 + }, + { + "epoch": 0.8492024739583334, + "grad_norm": 14.244123458862305, + "learning_rate": 8.932414729754088e-06, + "loss": 4.8564, + "step": 41740 + }, + { + "epoch": 0.84930419921875, + "grad_norm": 15.257214546203613, + "learning_rate": 8.932167871603881e-06, + "loss": 5.0501, + "step": 41745 + }, + { + "epoch": 0.8494059244791666, + "grad_norm": 17.670635223388672, + "learning_rate": 8.931920988328371e-06, + "loss": 5.0606, + "step": 41750 + }, + { + "epoch": 0.8495076497395834, + "grad_norm": 16.84939193725586, + "learning_rate": 8.93167407992913e-06, + "loss": 5.3063, + "step": 41755 + }, + { + "epoch": 0.849609375, + "grad_norm": 16.55073356628418, + "learning_rate": 8.931427146407745e-06, + "loss": 5.2139, + "step": 41760 + }, + { + "epoch": 0.8497111002604166, + "grad_norm": 17.79154396057129, + "learning_rate": 8.931180187765785e-06, + "loss": 5.2308, + "step": 41765 + }, + { + "epoch": 0.8498128255208334, + "grad_norm": 25.496318817138672, + "learning_rate": 8.930933204004832e-06, + "loss": 5.0827, + "step": 41770 + }, + { + "epoch": 0.84991455078125, + "grad_norm": 15.266977310180664, + "learning_rate": 8.930686195126466e-06, + "loss": 5.0808, + "step": 41775 + }, + { + "epoch": 0.8500162760416666, + "grad_norm": 17.015918731689453, + "learning_rate": 8.93043916113226e-06, + "loss": 4.9559, + "step": 41780 + }, + { + "epoch": 0.8501180013020834, + "grad_norm": 15.355846405029297, + "learning_rate": 8.930192102023797e-06, + "loss": 4.9517, + "step": 41785 + }, + { + "epoch": 0.8502197265625, + "grad_norm": 17.346010208129883, + "learning_rate": 8.929945017802654e-06, + "loss": 4.9614, + "step": 41790 + }, + { + "epoch": 0.8503214518229166, + "grad_norm": 17.45450210571289, + "learning_rate": 8.929697908470409e-06, + "loss": 4.9159, + "step": 41795 + }, + { + "epoch": 0.8504231770833334, + "grad_norm": 14.235349655151367, + "learning_rate": 8.929450774028643e-06, + "loss": 4.9749, + "step": 41800 + }, + { + "epoch": 0.85052490234375, + "grad_norm": 17.61083221435547, + "learning_rate": 8.929203614478933e-06, + "loss": 5.0552, + "step": 41805 + }, + { + "epoch": 0.8506266276041666, + "grad_norm": 14.464179039001465, + "learning_rate": 8.92895642982286e-06, + "loss": 4.95, + "step": 41810 + }, + { + "epoch": 0.8507283528645834, + "grad_norm": 16.260271072387695, + "learning_rate": 8.928709220062e-06, + "loss": 4.7411, + "step": 41815 + }, + { + "epoch": 0.850830078125, + "grad_norm": 15.951292991638184, + "learning_rate": 8.928461985197939e-06, + "loss": 4.9998, + "step": 41820 + }, + { + "epoch": 0.8509318033854166, + "grad_norm": 20.735937118530273, + "learning_rate": 8.92821472523225e-06, + "loss": 4.8763, + "step": 41825 + }, + { + "epoch": 0.8510335286458334, + "grad_norm": 17.764570236206055, + "learning_rate": 8.927967440166517e-06, + "loss": 5.1195, + "step": 41830 + }, + { + "epoch": 0.85113525390625, + "grad_norm": 13.05023193359375, + "learning_rate": 8.92772013000232e-06, + "loss": 5.1238, + "step": 41835 + }, + { + "epoch": 0.8512369791666666, + "grad_norm": 15.922545433044434, + "learning_rate": 8.927472794741237e-06, + "loss": 5.0402, + "step": 41840 + }, + { + "epoch": 0.8513387044270834, + "grad_norm": 18.330251693725586, + "learning_rate": 8.92722543438485e-06, + "loss": 5.1182, + "step": 41845 + }, + { + "epoch": 0.8514404296875, + "grad_norm": 14.233848571777344, + "learning_rate": 8.92697804893474e-06, + "loss": 4.9222, + "step": 41850 + }, + { + "epoch": 0.8515421549479166, + "grad_norm": 20.82847023010254, + "learning_rate": 8.926730638392484e-06, + "loss": 4.9943, + "step": 41855 + }, + { + "epoch": 0.8516438802083334, + "grad_norm": 15.15902042388916, + "learning_rate": 8.926483202759669e-06, + "loss": 5.2467, + "step": 41860 + }, + { + "epoch": 0.85174560546875, + "grad_norm": 16.071828842163086, + "learning_rate": 8.92623574203787e-06, + "loss": 5.0484, + "step": 41865 + }, + { + "epoch": 0.8518473307291666, + "grad_norm": 17.14536476135254, + "learning_rate": 8.925988256228672e-06, + "loss": 4.9028, + "step": 41870 + }, + { + "epoch": 0.8519490559895834, + "grad_norm": 15.770079612731934, + "learning_rate": 8.925740745333658e-06, + "loss": 5.1862, + "step": 41875 + }, + { + "epoch": 0.85205078125, + "grad_norm": 13.118361473083496, + "learning_rate": 8.925493209354403e-06, + "loss": 4.9432, + "step": 41880 + }, + { + "epoch": 0.8521525065104166, + "grad_norm": 21.135740280151367, + "learning_rate": 8.925245648292494e-06, + "loss": 5.1416, + "step": 41885 + }, + { + "epoch": 0.8522542317708334, + "grad_norm": 16.75426483154297, + "learning_rate": 8.924998062149512e-06, + "loss": 5.1178, + "step": 41890 + }, + { + "epoch": 0.85235595703125, + "grad_norm": 17.7708740234375, + "learning_rate": 8.924750450927037e-06, + "loss": 4.9579, + "step": 41895 + }, + { + "epoch": 0.8524576822916666, + "grad_norm": 21.16646385192871, + "learning_rate": 8.924502814626654e-06, + "loss": 4.9554, + "step": 41900 + }, + { + "epoch": 0.8525594075520834, + "grad_norm": 18.342857360839844, + "learning_rate": 8.924255153249943e-06, + "loss": 5.1458, + "step": 41905 + }, + { + "epoch": 0.8526611328125, + "grad_norm": 15.319868087768555, + "learning_rate": 8.924007466798487e-06, + "loss": 5.2297, + "step": 41910 + }, + { + "epoch": 0.8527628580729166, + "grad_norm": 16.533689498901367, + "learning_rate": 8.92375975527387e-06, + "loss": 4.8425, + "step": 41915 + }, + { + "epoch": 0.8528645833333334, + "grad_norm": 12.742045402526855, + "learning_rate": 8.923512018677674e-06, + "loss": 5.0214, + "step": 41920 + }, + { + "epoch": 0.85296630859375, + "grad_norm": 15.22391128540039, + "learning_rate": 8.92326425701148e-06, + "loss": 5.0457, + "step": 41925 + }, + { + "epoch": 0.8530680338541666, + "grad_norm": 14.679268836975098, + "learning_rate": 8.923016470276873e-06, + "loss": 5.288, + "step": 41930 + }, + { + "epoch": 0.8531697591145834, + "grad_norm": 15.350794792175293, + "learning_rate": 8.922768658475435e-06, + "loss": 4.9158, + "step": 41935 + }, + { + "epoch": 0.853271484375, + "grad_norm": 19.49604606628418, + "learning_rate": 8.922520821608753e-06, + "loss": 5.3024, + "step": 41940 + }, + { + "epoch": 0.8533732096354166, + "grad_norm": 11.648550987243652, + "learning_rate": 8.922272959678408e-06, + "loss": 5.03, + "step": 41945 + }, + { + "epoch": 0.8534749348958334, + "grad_norm": 22.614540100097656, + "learning_rate": 8.922025072685983e-06, + "loss": 5.1161, + "step": 41950 + }, + { + "epoch": 0.85357666015625, + "grad_norm": 16.600435256958008, + "learning_rate": 8.921777160633063e-06, + "loss": 5.1769, + "step": 41955 + }, + { + "epoch": 0.8536783854166666, + "grad_norm": 30.71523666381836, + "learning_rate": 8.921529223521231e-06, + "loss": 5.2438, + "step": 41960 + }, + { + "epoch": 0.8537801106770834, + "grad_norm": 16.177396774291992, + "learning_rate": 8.921281261352074e-06, + "loss": 4.9345, + "step": 41965 + }, + { + "epoch": 0.8538818359375, + "grad_norm": 18.46438217163086, + "learning_rate": 8.921033274127174e-06, + "loss": 5.0465, + "step": 41970 + }, + { + "epoch": 0.8539835611979166, + "grad_norm": 17.234397888183594, + "learning_rate": 8.920785261848115e-06, + "loss": 4.8482, + "step": 41975 + }, + { + "epoch": 0.8540852864583334, + "grad_norm": 19.203346252441406, + "learning_rate": 8.920537224516484e-06, + "loss": 5.0908, + "step": 41980 + }, + { + "epoch": 0.85418701171875, + "grad_norm": 20.459938049316406, + "learning_rate": 8.920289162133864e-06, + "loss": 4.782, + "step": 41985 + }, + { + "epoch": 0.8542887369791666, + "grad_norm": 15.627167701721191, + "learning_rate": 8.92004107470184e-06, + "loss": 5.0188, + "step": 41990 + }, + { + "epoch": 0.8543904622395834, + "grad_norm": 20.1669921875, + "learning_rate": 8.919792962221998e-06, + "loss": 4.8546, + "step": 41995 + }, + { + "epoch": 0.8544921875, + "grad_norm": 21.350858688354492, + "learning_rate": 8.919544824695925e-06, + "loss": 4.9513, + "step": 42000 + }, + { + "epoch": 0.8545939127604166, + "grad_norm": 14.571099281311035, + "learning_rate": 8.919296662125203e-06, + "loss": 5.1481, + "step": 42005 + }, + { + "epoch": 0.8546956380208334, + "grad_norm": 16.648006439208984, + "learning_rate": 8.91904847451142e-06, + "loss": 5.219, + "step": 42010 + }, + { + "epoch": 0.85479736328125, + "grad_norm": 15.011650085449219, + "learning_rate": 8.918800261856161e-06, + "loss": 5.1201, + "step": 42015 + }, + { + "epoch": 0.8548990885416666, + "grad_norm": 19.417682647705078, + "learning_rate": 8.918552024161013e-06, + "loss": 5.051, + "step": 42020 + }, + { + "epoch": 0.8550008138020834, + "grad_norm": 14.606025695800781, + "learning_rate": 8.918303761427563e-06, + "loss": 4.9007, + "step": 42025 + }, + { + "epoch": 0.8551025390625, + "grad_norm": 17.63313102722168, + "learning_rate": 8.918055473657394e-06, + "loss": 5.2524, + "step": 42030 + }, + { + "epoch": 0.8552042643229166, + "grad_norm": 17.32497215270996, + "learning_rate": 8.917807160852094e-06, + "loss": 5.1402, + "step": 42035 + }, + { + "epoch": 0.8553059895833334, + "grad_norm": 26.003149032592773, + "learning_rate": 8.91755882301325e-06, + "loss": 5.3463, + "step": 42040 + }, + { + "epoch": 0.85540771484375, + "grad_norm": 18.577774047851562, + "learning_rate": 8.91731046014245e-06, + "loss": 5.0382, + "step": 42045 + }, + { + "epoch": 0.8555094401041666, + "grad_norm": 20.95619010925293, + "learning_rate": 8.917062072241278e-06, + "loss": 4.9432, + "step": 42050 + }, + { + "epoch": 0.8556111653645834, + "grad_norm": 19.389936447143555, + "learning_rate": 8.916813659311325e-06, + "loss": 5.0975, + "step": 42055 + }, + { + "epoch": 0.855712890625, + "grad_norm": 20.976341247558594, + "learning_rate": 8.916565221354174e-06, + "loss": 5.0997, + "step": 42060 + }, + { + "epoch": 0.8558146158854166, + "grad_norm": 17.043354034423828, + "learning_rate": 8.916316758371415e-06, + "loss": 5.1527, + "step": 42065 + }, + { + "epoch": 0.8559163411458334, + "grad_norm": 14.933531761169434, + "learning_rate": 8.916068270364635e-06, + "loss": 5.0009, + "step": 42070 + }, + { + "epoch": 0.85601806640625, + "grad_norm": 15.711906433105469, + "learning_rate": 8.915819757335421e-06, + "loss": 5.0772, + "step": 42075 + }, + { + "epoch": 0.8561197916666666, + "grad_norm": 14.62206745147705, + "learning_rate": 8.915571219285363e-06, + "loss": 4.7595, + "step": 42080 + }, + { + "epoch": 0.8562215169270834, + "grad_norm": 17.11756134033203, + "learning_rate": 8.91532265621605e-06, + "loss": 5.037, + "step": 42085 + }, + { + "epoch": 0.8563232421875, + "grad_norm": 22.568456649780273, + "learning_rate": 8.915074068129065e-06, + "loss": 4.9528, + "step": 42090 + }, + { + "epoch": 0.8564249674479166, + "grad_norm": 14.029351234436035, + "learning_rate": 8.914825455025999e-06, + "loss": 4.8761, + "step": 42095 + }, + { + "epoch": 0.8565266927083334, + "grad_norm": 18.955347061157227, + "learning_rate": 8.914576816908444e-06, + "loss": 5.2739, + "step": 42100 + }, + { + "epoch": 0.85662841796875, + "grad_norm": 19.659671783447266, + "learning_rate": 8.914328153777985e-06, + "loss": 5.141, + "step": 42105 + }, + { + "epoch": 0.8567301432291666, + "grad_norm": 20.225666046142578, + "learning_rate": 8.91407946563621e-06, + "loss": 4.9811, + "step": 42110 + }, + { + "epoch": 0.8568318684895834, + "grad_norm": 16.674453735351562, + "learning_rate": 8.913830752484712e-06, + "loss": 4.8909, + "step": 42115 + }, + { + "epoch": 0.85693359375, + "grad_norm": 21.121536254882812, + "learning_rate": 8.913582014325075e-06, + "loss": 5.2855, + "step": 42120 + }, + { + "epoch": 0.8570353190104166, + "grad_norm": 24.29547119140625, + "learning_rate": 8.913333251158893e-06, + "loss": 5.0973, + "step": 42125 + }, + { + "epoch": 0.8571370442708334, + "grad_norm": 15.396759033203125, + "learning_rate": 8.913084462987754e-06, + "loss": 5.2356, + "step": 42130 + }, + { + "epoch": 0.85723876953125, + "grad_norm": 17.045286178588867, + "learning_rate": 8.912835649813248e-06, + "loss": 5.4687, + "step": 42135 + }, + { + "epoch": 0.8573404947916666, + "grad_norm": 15.960103988647461, + "learning_rate": 8.912586811636963e-06, + "loss": 4.991, + "step": 42140 + }, + { + "epoch": 0.8574422200520834, + "grad_norm": 17.507741928100586, + "learning_rate": 8.91233794846049e-06, + "loss": 5.1425, + "step": 42145 + }, + { + "epoch": 0.8575439453125, + "grad_norm": 18.337602615356445, + "learning_rate": 8.91208906028542e-06, + "loss": 4.9643, + "step": 42150 + }, + { + "epoch": 0.8576456705729166, + "grad_norm": 16.266529083251953, + "learning_rate": 8.911840147113344e-06, + "loss": 5.1558, + "step": 42155 + }, + { + "epoch": 0.8577473958333334, + "grad_norm": 19.61992645263672, + "learning_rate": 8.91159120894585e-06, + "loss": 5.0158, + "step": 42160 + }, + { + "epoch": 0.85784912109375, + "grad_norm": 19.799476623535156, + "learning_rate": 8.91134224578453e-06, + "loss": 4.9004, + "step": 42165 + }, + { + "epoch": 0.8579508463541666, + "grad_norm": 18.687763214111328, + "learning_rate": 8.911093257630976e-06, + "loss": 5.0698, + "step": 42170 + }, + { + "epoch": 0.8580525716145834, + "grad_norm": 16.1019287109375, + "learning_rate": 8.910844244486777e-06, + "loss": 5.2201, + "step": 42175 + }, + { + "epoch": 0.858154296875, + "grad_norm": 17.10932731628418, + "learning_rate": 8.910595206353523e-06, + "loss": 4.9405, + "step": 42180 + }, + { + "epoch": 0.8582560221354166, + "grad_norm": 16.21923828125, + "learning_rate": 8.910346143232808e-06, + "loss": 4.8871, + "step": 42185 + }, + { + "epoch": 0.8583577473958334, + "grad_norm": 17.363325119018555, + "learning_rate": 8.910097055126222e-06, + "loss": 5.3507, + "step": 42190 + }, + { + "epoch": 0.85845947265625, + "grad_norm": 20.096704483032227, + "learning_rate": 8.909847942035357e-06, + "loss": 4.7506, + "step": 42195 + }, + { + "epoch": 0.8585611979166666, + "grad_norm": 21.977123260498047, + "learning_rate": 8.909598803961806e-06, + "loss": 4.8918, + "step": 42200 + }, + { + "epoch": 0.8586629231770834, + "grad_norm": 12.90881633758545, + "learning_rate": 8.90934964090716e-06, + "loss": 5.0136, + "step": 42205 + }, + { + "epoch": 0.8587646484375, + "grad_norm": 14.655598640441895, + "learning_rate": 8.90910045287301e-06, + "loss": 5.1151, + "step": 42210 + }, + { + "epoch": 0.8588663736979166, + "grad_norm": 19.082515716552734, + "learning_rate": 8.908851239860948e-06, + "loss": 5.1349, + "step": 42215 + }, + { + "epoch": 0.8589680989583334, + "grad_norm": 20.044408798217773, + "learning_rate": 8.908602001872568e-06, + "loss": 4.9362, + "step": 42220 + }, + { + "epoch": 0.85906982421875, + "grad_norm": 21.187841415405273, + "learning_rate": 8.908352738909462e-06, + "loss": 5.0564, + "step": 42225 + }, + { + "epoch": 0.8591715494791666, + "grad_norm": 25.54913330078125, + "learning_rate": 8.908103450973225e-06, + "loss": 5.1573, + "step": 42230 + }, + { + "epoch": 0.8592732747395834, + "grad_norm": 14.414727210998535, + "learning_rate": 8.907854138065444e-06, + "loss": 5.2415, + "step": 42235 + }, + { + "epoch": 0.859375, + "grad_norm": 16.3396053314209, + "learning_rate": 8.907604800187717e-06, + "loss": 5.0806, + "step": 42240 + }, + { + "epoch": 0.8594767252604166, + "grad_norm": 18.288724899291992, + "learning_rate": 8.907355437341638e-06, + "loss": 4.8811, + "step": 42245 + }, + { + "epoch": 0.8595784505208334, + "grad_norm": 17.362260818481445, + "learning_rate": 8.907106049528796e-06, + "loss": 4.7677, + "step": 42250 + }, + { + "epoch": 0.85968017578125, + "grad_norm": 13.999979019165039, + "learning_rate": 8.906856636750786e-06, + "loss": 5.1149, + "step": 42255 + }, + { + "epoch": 0.8597819010416666, + "grad_norm": 20.58000946044922, + "learning_rate": 8.906607199009203e-06, + "loss": 4.9856, + "step": 42260 + }, + { + "epoch": 0.8598836263020834, + "grad_norm": 23.085983276367188, + "learning_rate": 8.90635773630564e-06, + "loss": 5.5376, + "step": 42265 + }, + { + "epoch": 0.8599853515625, + "grad_norm": 19.58768653869629, + "learning_rate": 8.906108248641691e-06, + "loss": 5.1757, + "step": 42270 + }, + { + "epoch": 0.8600870768229166, + "grad_norm": 20.213865280151367, + "learning_rate": 8.905858736018951e-06, + "loss": 5.3578, + "step": 42275 + }, + { + "epoch": 0.8601888020833334, + "grad_norm": 20.187850952148438, + "learning_rate": 8.905609198439013e-06, + "loss": 4.9927, + "step": 42280 + }, + { + "epoch": 0.86029052734375, + "grad_norm": 19.3273868560791, + "learning_rate": 8.905359635903472e-06, + "loss": 4.9126, + "step": 42285 + }, + { + "epoch": 0.8603922526041666, + "grad_norm": 19.0757999420166, + "learning_rate": 8.905110048413923e-06, + "loss": 5.2326, + "step": 42290 + }, + { + "epoch": 0.8604939778645834, + "grad_norm": 18.137174606323242, + "learning_rate": 8.90486043597196e-06, + "loss": 5.1954, + "step": 42295 + }, + { + "epoch": 0.860595703125, + "grad_norm": 14.20051097869873, + "learning_rate": 8.904610798579178e-06, + "loss": 4.9469, + "step": 42300 + }, + { + "epoch": 0.8606974283854166, + "grad_norm": 24.525333404541016, + "learning_rate": 8.904361136237175e-06, + "loss": 5.0053, + "step": 42305 + }, + { + "epoch": 0.8607991536458334, + "grad_norm": 16.0393123626709, + "learning_rate": 8.90411144894754e-06, + "loss": 5.08, + "step": 42310 + }, + { + "epoch": 0.86090087890625, + "grad_norm": 18.542951583862305, + "learning_rate": 8.903861736711874e-06, + "loss": 5.1709, + "step": 42315 + }, + { + "epoch": 0.8610026041666666, + "grad_norm": 19.422582626342773, + "learning_rate": 8.903611999531772e-06, + "loss": 5.5681, + "step": 42320 + }, + { + "epoch": 0.8611043294270834, + "grad_norm": 19.69454002380371, + "learning_rate": 8.903362237408825e-06, + "loss": 5.1677, + "step": 42325 + }, + { + "epoch": 0.8612060546875, + "grad_norm": 16.08063316345215, + "learning_rate": 8.903112450344635e-06, + "loss": 4.9216, + "step": 42330 + }, + { + "epoch": 0.8613077799479166, + "grad_norm": 23.02671241760254, + "learning_rate": 8.902862638340795e-06, + "loss": 5.1094, + "step": 42335 + }, + { + "epoch": 0.8614095052083334, + "grad_norm": 25.290555953979492, + "learning_rate": 8.902612801398903e-06, + "loss": 5.1439, + "step": 42340 + }, + { + "epoch": 0.86151123046875, + "grad_norm": 16.795930862426758, + "learning_rate": 8.902362939520552e-06, + "loss": 4.9261, + "step": 42345 + }, + { + "epoch": 0.8616129557291666, + "grad_norm": 12.895272254943848, + "learning_rate": 8.902113052707341e-06, + "loss": 4.9262, + "step": 42350 + }, + { + "epoch": 0.8617146809895834, + "grad_norm": 18.159685134887695, + "learning_rate": 8.901863140960867e-06, + "loss": 5.1798, + "step": 42355 + }, + { + "epoch": 0.86181640625, + "grad_norm": 12.494197845458984, + "learning_rate": 8.901613204282726e-06, + "loss": 5.3162, + "step": 42360 + }, + { + "epoch": 0.8619181315104166, + "grad_norm": 20.739057540893555, + "learning_rate": 8.901363242674514e-06, + "loss": 4.8573, + "step": 42365 + }, + { + "epoch": 0.8620198567708334, + "grad_norm": 14.33878231048584, + "learning_rate": 8.901113256137831e-06, + "loss": 4.9322, + "step": 42370 + }, + { + "epoch": 0.86212158203125, + "grad_norm": 22.3004150390625, + "learning_rate": 8.900863244674271e-06, + "loss": 4.9817, + "step": 42375 + }, + { + "epoch": 0.8622233072916666, + "grad_norm": 14.246588706970215, + "learning_rate": 8.900613208285435e-06, + "loss": 5.1158, + "step": 42380 + }, + { + "epoch": 0.8623250325520834, + "grad_norm": 17.592132568359375, + "learning_rate": 8.90036314697292e-06, + "loss": 4.6993, + "step": 42385 + }, + { + "epoch": 0.8624267578125, + "grad_norm": 17.320083618164062, + "learning_rate": 8.900113060738319e-06, + "loss": 4.9613, + "step": 42390 + }, + { + "epoch": 0.8625284830729166, + "grad_norm": 20.19345474243164, + "learning_rate": 8.899862949583235e-06, + "loss": 5.2294, + "step": 42395 + }, + { + "epoch": 0.8626302083333334, + "grad_norm": 20.41764259338379, + "learning_rate": 8.899612813509265e-06, + "loss": 4.7583, + "step": 42400 + }, + { + "epoch": 0.86273193359375, + "grad_norm": 13.183356285095215, + "learning_rate": 8.899362652518008e-06, + "loss": 4.9911, + "step": 42405 + }, + { + "epoch": 0.8628336588541666, + "grad_norm": 19.938196182250977, + "learning_rate": 8.899112466611059e-06, + "loss": 5.1255, + "step": 42410 + }, + { + "epoch": 0.8629353841145834, + "grad_norm": 13.590353012084961, + "learning_rate": 8.898862255790021e-06, + "loss": 5.1693, + "step": 42415 + }, + { + "epoch": 0.863037109375, + "grad_norm": 16.980098724365234, + "learning_rate": 8.89861202005649e-06, + "loss": 5.1982, + "step": 42420 + }, + { + "epoch": 0.8631388346354166, + "grad_norm": 14.390365600585938, + "learning_rate": 8.898361759412066e-06, + "loss": 4.9833, + "step": 42425 + }, + { + "epoch": 0.8632405598958334, + "grad_norm": 16.644289016723633, + "learning_rate": 8.89811147385835e-06, + "loss": 5.1265, + "step": 42430 + }, + { + "epoch": 0.86334228515625, + "grad_norm": 17.689496994018555, + "learning_rate": 8.897861163396936e-06, + "loss": 4.9827, + "step": 42435 + }, + { + "epoch": 0.8634440104166666, + "grad_norm": 17.901777267456055, + "learning_rate": 8.897610828029428e-06, + "loss": 5.1686, + "step": 42440 + }, + { + "epoch": 0.8635457356770834, + "grad_norm": 12.803570747375488, + "learning_rate": 8.897360467757424e-06, + "loss": 4.8427, + "step": 42445 + }, + { + "epoch": 0.8636474609375, + "grad_norm": 30.617515563964844, + "learning_rate": 8.897110082582523e-06, + "loss": 4.8088, + "step": 42450 + }, + { + "epoch": 0.8637491861979166, + "grad_norm": 13.481993675231934, + "learning_rate": 8.896859672506326e-06, + "loss": 4.8499, + "step": 42455 + }, + { + "epoch": 0.8638509114583334, + "grad_norm": 15.173347473144531, + "learning_rate": 8.896609237530434e-06, + "loss": 5.3834, + "step": 42460 + }, + { + "epoch": 0.86395263671875, + "grad_norm": 15.273802757263184, + "learning_rate": 8.896358777656445e-06, + "loss": 5.3696, + "step": 42465 + }, + { + "epoch": 0.8640543619791666, + "grad_norm": 21.603727340698242, + "learning_rate": 8.89610829288596e-06, + "loss": 5.0866, + "step": 42470 + }, + { + "epoch": 0.8641560872395834, + "grad_norm": 14.004937171936035, + "learning_rate": 8.895857783220578e-06, + "loss": 5.1897, + "step": 42475 + }, + { + "epoch": 0.8642578125, + "grad_norm": 19.055160522460938, + "learning_rate": 8.895607248661905e-06, + "loss": 4.8924, + "step": 42480 + }, + { + "epoch": 0.8643595377604166, + "grad_norm": 24.948654174804688, + "learning_rate": 8.895356689211537e-06, + "loss": 4.9621, + "step": 42485 + }, + { + "epoch": 0.8644612630208334, + "grad_norm": 18.369529724121094, + "learning_rate": 8.895106104871073e-06, + "loss": 5.0463, + "step": 42490 + }, + { + "epoch": 0.86456298828125, + "grad_norm": 18.498394012451172, + "learning_rate": 8.894855495642122e-06, + "loss": 4.9046, + "step": 42495 + }, + { + "epoch": 0.8646647135416666, + "grad_norm": 27.85092544555664, + "learning_rate": 8.89460486152628e-06, + "loss": 5.0334, + "step": 42500 + }, + { + "epoch": 0.8647664388020834, + "grad_norm": 19.135604858398438, + "learning_rate": 8.894354202525149e-06, + "loss": 4.9656, + "step": 42505 + }, + { + "epoch": 0.8648681640625, + "grad_norm": 14.632831573486328, + "learning_rate": 8.894103518640328e-06, + "loss": 5.0854, + "step": 42510 + }, + { + "epoch": 0.8649698893229166, + "grad_norm": 17.955533981323242, + "learning_rate": 8.893852809873427e-06, + "loss": 4.8169, + "step": 42515 + }, + { + "epoch": 0.8650716145833334, + "grad_norm": 19.43095588684082, + "learning_rate": 8.893602076226038e-06, + "loss": 4.9905, + "step": 42520 + }, + { + "epoch": 0.86517333984375, + "grad_norm": 13.917204856872559, + "learning_rate": 8.89335131769977e-06, + "loss": 5.373, + "step": 42525 + }, + { + "epoch": 0.8652750651041666, + "grad_norm": 17.122840881347656, + "learning_rate": 8.893100534296221e-06, + "loss": 4.9207, + "step": 42530 + }, + { + "epoch": 0.8653767903645834, + "grad_norm": 13.652422904968262, + "learning_rate": 8.892849726016999e-06, + "loss": 5.0685, + "step": 42535 + }, + { + "epoch": 0.865478515625, + "grad_norm": 29.46314239501953, + "learning_rate": 8.8925988928637e-06, + "loss": 5.5506, + "step": 42540 + }, + { + "epoch": 0.8655802408854166, + "grad_norm": 14.740784645080566, + "learning_rate": 8.89234803483793e-06, + "loss": 5.0362, + "step": 42545 + }, + { + "epoch": 0.8656819661458334, + "grad_norm": 18.367801666259766, + "learning_rate": 8.892097151941293e-06, + "loss": 5.1244, + "step": 42550 + }, + { + "epoch": 0.86578369140625, + "grad_norm": 17.154695510864258, + "learning_rate": 8.89184624417539e-06, + "loss": 4.9429, + "step": 42555 + }, + { + "epoch": 0.8658854166666666, + "grad_norm": 19.254724502563477, + "learning_rate": 8.891595311541826e-06, + "loss": 4.8473, + "step": 42560 + }, + { + "epoch": 0.8659871419270834, + "grad_norm": 16.638349533081055, + "learning_rate": 8.8913443540422e-06, + "loss": 5.1595, + "step": 42565 + }, + { + "epoch": 0.8660888671875, + "grad_norm": 20.5998592376709, + "learning_rate": 8.891093371678121e-06, + "loss": 5.0438, + "step": 42570 + }, + { + "epoch": 0.8661905924479166, + "grad_norm": 20.031930923461914, + "learning_rate": 8.89084236445119e-06, + "loss": 5.1522, + "step": 42575 + }, + { + "epoch": 0.8662923177083334, + "grad_norm": 19.86385726928711, + "learning_rate": 8.890591332363014e-06, + "loss": 5.0433, + "step": 42580 + }, + { + "epoch": 0.86639404296875, + "grad_norm": 20.101016998291016, + "learning_rate": 8.890340275415192e-06, + "loss": 4.8012, + "step": 42585 + }, + { + "epoch": 0.8664957682291666, + "grad_norm": 19.93854522705078, + "learning_rate": 8.89008919360933e-06, + "loss": 5.0258, + "step": 42590 + }, + { + "epoch": 0.8665974934895834, + "grad_norm": 17.96428871154785, + "learning_rate": 8.889838086947036e-06, + "loss": 5.3533, + "step": 42595 + }, + { + "epoch": 0.86669921875, + "grad_norm": 14.419118881225586, + "learning_rate": 8.889586955429909e-06, + "loss": 5.2685, + "step": 42600 + }, + { + "epoch": 0.8668009440104166, + "grad_norm": 18.116792678833008, + "learning_rate": 8.889335799059556e-06, + "loss": 5.2098, + "step": 42605 + }, + { + "epoch": 0.8669026692708334, + "grad_norm": 17.029848098754883, + "learning_rate": 8.889084617837582e-06, + "loss": 5.132, + "step": 42610 + }, + { + "epoch": 0.86700439453125, + "grad_norm": 15.874643325805664, + "learning_rate": 8.888833411765593e-06, + "loss": 4.9055, + "step": 42615 + }, + { + "epoch": 0.8671061197916666, + "grad_norm": 24.236576080322266, + "learning_rate": 8.88858218084519e-06, + "loss": 4.9993, + "step": 42620 + }, + { + "epoch": 0.8672078450520834, + "grad_norm": 16.341184616088867, + "learning_rate": 8.888330925077987e-06, + "loss": 5.0909, + "step": 42625 + }, + { + "epoch": 0.8673095703125, + "grad_norm": 15.717717170715332, + "learning_rate": 8.88807964446558e-06, + "loss": 5.1474, + "step": 42630 + }, + { + "epoch": 0.8674112955729166, + "grad_norm": 14.462894439697266, + "learning_rate": 8.887828339009577e-06, + "loss": 5.511, + "step": 42635 + }, + { + "epoch": 0.8675130208333334, + "grad_norm": 19.96649169921875, + "learning_rate": 8.887577008711586e-06, + "loss": 5.1779, + "step": 42640 + }, + { + "epoch": 0.86761474609375, + "grad_norm": 15.801294326782227, + "learning_rate": 8.887325653573213e-06, + "loss": 5.2614, + "step": 42645 + }, + { + "epoch": 0.8677164713541666, + "grad_norm": 22.51976776123047, + "learning_rate": 8.887074273596064e-06, + "loss": 5.0379, + "step": 42650 + }, + { + "epoch": 0.8678181966145834, + "grad_norm": 16.98371696472168, + "learning_rate": 8.88682286878174e-06, + "loss": 5.1486, + "step": 42655 + }, + { + "epoch": 0.867919921875, + "grad_norm": 16.54132843017578, + "learning_rate": 8.886571439131858e-06, + "loss": 5.1639, + "step": 42660 + }, + { + "epoch": 0.8680216471354166, + "grad_norm": 19.616519927978516, + "learning_rate": 8.886319984648014e-06, + "loss": 4.916, + "step": 42665 + }, + { + "epoch": 0.8681233723958334, + "grad_norm": 18.38568687438965, + "learning_rate": 8.88606850533182e-06, + "loss": 4.999, + "step": 42670 + }, + { + "epoch": 0.86822509765625, + "grad_norm": 18.307170867919922, + "learning_rate": 8.885817001184883e-06, + "loss": 5.1712, + "step": 42675 + }, + { + "epoch": 0.8683268229166666, + "grad_norm": 37.74911880493164, + "learning_rate": 8.885565472208806e-06, + "loss": 5.4843, + "step": 42680 + }, + { + "epoch": 0.8684285481770834, + "grad_norm": 18.92078399658203, + "learning_rate": 8.8853139184052e-06, + "loss": 4.8221, + "step": 42685 + }, + { + "epoch": 0.8685302734375, + "grad_norm": 18.511314392089844, + "learning_rate": 8.885062339775673e-06, + "loss": 5.0789, + "step": 42690 + }, + { + "epoch": 0.8686319986979166, + "grad_norm": 18.87552833557129, + "learning_rate": 8.88481073632183e-06, + "loss": 4.8846, + "step": 42695 + }, + { + "epoch": 0.8687337239583334, + "grad_norm": 17.851898193359375, + "learning_rate": 8.88455910804528e-06, + "loss": 4.8526, + "step": 42700 + }, + { + "epoch": 0.86883544921875, + "grad_norm": 17.935728073120117, + "learning_rate": 8.88430745494763e-06, + "loss": 4.8532, + "step": 42705 + }, + { + "epoch": 0.8689371744791666, + "grad_norm": 15.604446411132812, + "learning_rate": 8.884055777030487e-06, + "loss": 5.2142, + "step": 42710 + }, + { + "epoch": 0.8690388997395834, + "grad_norm": 15.787296295166016, + "learning_rate": 8.883804074295463e-06, + "loss": 5.0677, + "step": 42715 + }, + { + "epoch": 0.869140625, + "grad_norm": 21.677494049072266, + "learning_rate": 8.883552346744163e-06, + "loss": 5.0175, + "step": 42720 + }, + { + "epoch": 0.8692423502604166, + "grad_norm": 28.3038330078125, + "learning_rate": 8.883300594378196e-06, + "loss": 4.9525, + "step": 42725 + }, + { + "epoch": 0.8693440755208334, + "grad_norm": 19.879623413085938, + "learning_rate": 8.883048817199172e-06, + "loss": 5.0584, + "step": 42730 + }, + { + "epoch": 0.86944580078125, + "grad_norm": 19.9832820892334, + "learning_rate": 8.882797015208698e-06, + "loss": 4.849, + "step": 42735 + }, + { + "epoch": 0.8695475260416666, + "grad_norm": 21.91051483154297, + "learning_rate": 8.882545188408384e-06, + "loss": 5.151, + "step": 42740 + }, + { + "epoch": 0.8696492513020834, + "grad_norm": 16.368825912475586, + "learning_rate": 8.882293336799838e-06, + "loss": 4.9976, + "step": 42745 + }, + { + "epoch": 0.8697509765625, + "grad_norm": 20.09807586669922, + "learning_rate": 8.88204146038467e-06, + "loss": 5.0503, + "step": 42750 + }, + { + "epoch": 0.8698527018229166, + "grad_norm": 20.303874969482422, + "learning_rate": 8.881789559164489e-06, + "loss": 5.2726, + "step": 42755 + }, + { + "epoch": 0.8699544270833334, + "grad_norm": 50.511566162109375, + "learning_rate": 8.881537633140907e-06, + "loss": 5.2883, + "step": 42760 + }, + { + "epoch": 0.87005615234375, + "grad_norm": 15.84361457824707, + "learning_rate": 8.881285682315529e-06, + "loss": 5.059, + "step": 42765 + }, + { + "epoch": 0.8701578776041666, + "grad_norm": 18.158538818359375, + "learning_rate": 8.88103370668997e-06, + "loss": 5.0322, + "step": 42770 + }, + { + "epoch": 0.8702596028645834, + "grad_norm": 14.823787689208984, + "learning_rate": 8.880781706265835e-06, + "loss": 4.9787, + "step": 42775 + }, + { + "epoch": 0.870361328125, + "grad_norm": 14.903100967407227, + "learning_rate": 8.880529681044739e-06, + "loss": 4.9778, + "step": 42780 + }, + { + "epoch": 0.8704630533854166, + "grad_norm": 14.432779312133789, + "learning_rate": 8.88027763102829e-06, + "loss": 4.8698, + "step": 42785 + }, + { + "epoch": 0.8705647786458334, + "grad_norm": 17.047521591186523, + "learning_rate": 8.880025556218096e-06, + "loss": 5.2048, + "step": 42790 + }, + { + "epoch": 0.87066650390625, + "grad_norm": 20.04309844970703, + "learning_rate": 8.879773456615774e-06, + "loss": 5.4245, + "step": 42795 + }, + { + "epoch": 0.8707682291666666, + "grad_norm": 14.319525718688965, + "learning_rate": 8.87952133222293e-06, + "loss": 4.8629, + "step": 42800 + }, + { + "epoch": 0.8708699544270834, + "grad_norm": 17.641042709350586, + "learning_rate": 8.879269183041176e-06, + "loss": 5.0137, + "step": 42805 + }, + { + "epoch": 0.8709716796875, + "grad_norm": 15.255274772644043, + "learning_rate": 8.879017009072123e-06, + "loss": 5.0562, + "step": 42810 + }, + { + "epoch": 0.8710734049479166, + "grad_norm": 14.475593566894531, + "learning_rate": 8.878764810317385e-06, + "loss": 5.0605, + "step": 42815 + }, + { + "epoch": 0.8711751302083334, + "grad_norm": 16.408964157104492, + "learning_rate": 8.878512586778567e-06, + "loss": 5.0491, + "step": 42820 + }, + { + "epoch": 0.87127685546875, + "grad_norm": 15.872966766357422, + "learning_rate": 8.878260338457288e-06, + "loss": 5.2196, + "step": 42825 + }, + { + "epoch": 0.8713785807291666, + "grad_norm": 15.895662307739258, + "learning_rate": 8.878008065355155e-06, + "loss": 4.9752, + "step": 42830 + }, + { + "epoch": 0.8714803059895834, + "grad_norm": 20.476478576660156, + "learning_rate": 8.877755767473782e-06, + "loss": 5.1225, + "step": 42835 + }, + { + "epoch": 0.87158203125, + "grad_norm": 17.589426040649414, + "learning_rate": 8.87750344481478e-06, + "loss": 4.9659, + "step": 42840 + }, + { + "epoch": 0.8716837565104166, + "grad_norm": 14.800042152404785, + "learning_rate": 8.877251097379763e-06, + "loss": 5.0211, + "step": 42845 + }, + { + "epoch": 0.8717854817708334, + "grad_norm": 20.990846633911133, + "learning_rate": 8.876998725170342e-06, + "loss": 5.0197, + "step": 42850 + }, + { + "epoch": 0.87188720703125, + "grad_norm": 15.862299919128418, + "learning_rate": 8.87674632818813e-06, + "loss": 4.8827, + "step": 42855 + }, + { + "epoch": 0.8719889322916666, + "grad_norm": 17.10612678527832, + "learning_rate": 8.876493906434739e-06, + "loss": 4.7858, + "step": 42860 + }, + { + "epoch": 0.8720906575520834, + "grad_norm": 20.69097137451172, + "learning_rate": 8.876241459911782e-06, + "loss": 5.0513, + "step": 42865 + }, + { + "epoch": 0.8721923828125, + "grad_norm": 13.789690017700195, + "learning_rate": 8.875988988620874e-06, + "loss": 5.0, + "step": 42870 + }, + { + "epoch": 0.8722941080729166, + "grad_norm": 15.821860313415527, + "learning_rate": 8.875736492563627e-06, + "loss": 5.6941, + "step": 42875 + }, + { + "epoch": 0.8723958333333334, + "grad_norm": 23.64419174194336, + "learning_rate": 8.875483971741652e-06, + "loss": 5.0039, + "step": 42880 + }, + { + "epoch": 0.87249755859375, + "grad_norm": 16.837451934814453, + "learning_rate": 8.875231426156566e-06, + "loss": 5.0047, + "step": 42885 + }, + { + "epoch": 0.8725992838541666, + "grad_norm": 19.176626205444336, + "learning_rate": 8.874978855809981e-06, + "loss": 5.0005, + "step": 42890 + }, + { + "epoch": 0.8727010091145834, + "grad_norm": 13.289956092834473, + "learning_rate": 8.874726260703512e-06, + "loss": 5.0372, + "step": 42895 + }, + { + "epoch": 0.872802734375, + "grad_norm": 12.944452285766602, + "learning_rate": 8.874473640838772e-06, + "loss": 5.1902, + "step": 42900 + }, + { + "epoch": 0.8729044596354166, + "grad_norm": 13.075714111328125, + "learning_rate": 8.874220996217376e-06, + "loss": 5.2462, + "step": 42905 + }, + { + "epoch": 0.8730061848958334, + "grad_norm": 17.807920455932617, + "learning_rate": 8.873968326840937e-06, + "loss": 5.2438, + "step": 42910 + }, + { + "epoch": 0.87310791015625, + "grad_norm": 15.699905395507812, + "learning_rate": 8.873715632711069e-06, + "loss": 5.1398, + "step": 42915 + }, + { + "epoch": 0.8732096354166666, + "grad_norm": 20.115562438964844, + "learning_rate": 8.873462913829388e-06, + "loss": 5.1002, + "step": 42920 + }, + { + "epoch": 0.8733113606770834, + "grad_norm": 14.171952247619629, + "learning_rate": 8.873210170197509e-06, + "loss": 5.121, + "step": 42925 + }, + { + "epoch": 0.8734130859375, + "grad_norm": 21.827783584594727, + "learning_rate": 8.872957401817047e-06, + "loss": 5.1512, + "step": 42930 + }, + { + "epoch": 0.8735148111979166, + "grad_norm": 36.94054412841797, + "learning_rate": 8.872704608689616e-06, + "loss": 5.3579, + "step": 42935 + }, + { + "epoch": 0.8736165364583334, + "grad_norm": 18.967193603515625, + "learning_rate": 8.872451790816834e-06, + "loss": 5.1415, + "step": 42940 + }, + { + "epoch": 0.87371826171875, + "grad_norm": 20.12362289428711, + "learning_rate": 8.872198948200312e-06, + "loss": 5.1543, + "step": 42945 + }, + { + "epoch": 0.8738199869791666, + "grad_norm": 13.444326400756836, + "learning_rate": 8.871946080841668e-06, + "loss": 5.0547, + "step": 42950 + }, + { + "epoch": 0.8739217122395834, + "grad_norm": 16.974903106689453, + "learning_rate": 8.871693188742518e-06, + "loss": 5.0324, + "step": 42955 + }, + { + "epoch": 0.8740234375, + "grad_norm": 22.590944290161133, + "learning_rate": 8.871440271904478e-06, + "loss": 5.0033, + "step": 42960 + }, + { + "epoch": 0.8741251627604166, + "grad_norm": 14.876791000366211, + "learning_rate": 8.871187330329163e-06, + "loss": 4.773, + "step": 42965 + }, + { + "epoch": 0.8742268880208334, + "grad_norm": 21.824432373046875, + "learning_rate": 8.87093436401819e-06, + "loss": 5.022, + "step": 42970 + }, + { + "epoch": 0.87432861328125, + "grad_norm": 19.861244201660156, + "learning_rate": 8.870681372973175e-06, + "loss": 5.1165, + "step": 42975 + }, + { + "epoch": 0.8744303385416666, + "grad_norm": 13.894518852233887, + "learning_rate": 8.870428357195736e-06, + "loss": 5.1132, + "step": 42980 + }, + { + "epoch": 0.8745320638020834, + "grad_norm": 15.536883354187012, + "learning_rate": 8.870175316687488e-06, + "loss": 4.9679, + "step": 42985 + }, + { + "epoch": 0.8746337890625, + "grad_norm": 24.765172958374023, + "learning_rate": 8.869922251450046e-06, + "loss": 5.0085, + "step": 42990 + }, + { + "epoch": 0.8747355143229166, + "grad_norm": 23.165048599243164, + "learning_rate": 8.869669161485033e-06, + "loss": 5.4189, + "step": 42995 + }, + { + "epoch": 0.8748372395833334, + "grad_norm": 18.301124572753906, + "learning_rate": 8.869416046794061e-06, + "loss": 5.1082, + "step": 43000 + }, + { + "epoch": 0.87493896484375, + "grad_norm": 21.58678436279297, + "learning_rate": 8.869162907378748e-06, + "loss": 4.986, + "step": 43005 + }, + { + "epoch": 0.8750406901041666, + "grad_norm": 16.47013282775879, + "learning_rate": 8.868909743240712e-06, + "loss": 5.0251, + "step": 43010 + }, + { + "epoch": 0.8751424153645834, + "grad_norm": 16.03070640563965, + "learning_rate": 8.868656554381573e-06, + "loss": 5.0779, + "step": 43015 + }, + { + "epoch": 0.875244140625, + "grad_norm": 21.524694442749023, + "learning_rate": 8.868403340802945e-06, + "loss": 4.721, + "step": 43020 + }, + { + "epoch": 0.8753458658854166, + "grad_norm": 16.771284103393555, + "learning_rate": 8.868150102506447e-06, + "loss": 5.0418, + "step": 43025 + }, + { + "epoch": 0.8754475911458334, + "grad_norm": 21.191877365112305, + "learning_rate": 8.8678968394937e-06, + "loss": 4.8633, + "step": 43030 + }, + { + "epoch": 0.87554931640625, + "grad_norm": 14.690362930297852, + "learning_rate": 8.867643551766319e-06, + "loss": 4.975, + "step": 43035 + }, + { + "epoch": 0.8756510416666666, + "grad_norm": 23.24401092529297, + "learning_rate": 8.867390239325922e-06, + "loss": 4.9123, + "step": 43040 + }, + { + "epoch": 0.8757527669270834, + "grad_norm": 16.717239379882812, + "learning_rate": 8.867136902174132e-06, + "loss": 4.97, + "step": 43045 + }, + { + "epoch": 0.8758544921875, + "grad_norm": 16.897470474243164, + "learning_rate": 8.866883540312563e-06, + "loss": 5.0408, + "step": 43050 + }, + { + "epoch": 0.8759562174479166, + "grad_norm": 20.13373374938965, + "learning_rate": 8.866630153742834e-06, + "loss": 5.0873, + "step": 43055 + }, + { + "epoch": 0.8760579427083334, + "grad_norm": 16.884462356567383, + "learning_rate": 8.866376742466569e-06, + "loss": 4.9589, + "step": 43060 + }, + { + "epoch": 0.87615966796875, + "grad_norm": 14.73864459991455, + "learning_rate": 8.866123306485381e-06, + "loss": 5.0895, + "step": 43065 + }, + { + "epoch": 0.8762613932291666, + "grad_norm": 17.15338134765625, + "learning_rate": 8.865869845800893e-06, + "loss": 4.9, + "step": 43070 + }, + { + "epoch": 0.8763631184895834, + "grad_norm": 15.508557319641113, + "learning_rate": 8.865616360414725e-06, + "loss": 5.1345, + "step": 43075 + }, + { + "epoch": 0.87646484375, + "grad_norm": 16.72393226623535, + "learning_rate": 8.865362850328496e-06, + "loss": 5.2931, + "step": 43080 + }, + { + "epoch": 0.8765665690104166, + "grad_norm": 25.003341674804688, + "learning_rate": 8.865109315543826e-06, + "loss": 5.0021, + "step": 43085 + }, + { + "epoch": 0.8766682942708334, + "grad_norm": 13.754127502441406, + "learning_rate": 8.864855756062331e-06, + "loss": 4.8647, + "step": 43090 + }, + { + "epoch": 0.87677001953125, + "grad_norm": 20.31143569946289, + "learning_rate": 8.864602171885638e-06, + "loss": 4.7364, + "step": 43095 + }, + { + "epoch": 0.8768717447916666, + "grad_norm": 16.66619110107422, + "learning_rate": 8.864348563015362e-06, + "loss": 5.0878, + "step": 43100 + }, + { + "epoch": 0.8769734700520834, + "grad_norm": 14.886104583740234, + "learning_rate": 8.864094929453126e-06, + "loss": 5.4109, + "step": 43105 + }, + { + "epoch": 0.8770751953125, + "grad_norm": 16.335376739501953, + "learning_rate": 8.863841271200551e-06, + "loss": 5.0733, + "step": 43110 + }, + { + "epoch": 0.8771769205729166, + "grad_norm": 20.503271102905273, + "learning_rate": 8.863587588259254e-06, + "loss": 5.0515, + "step": 43115 + }, + { + "epoch": 0.8772786458333334, + "grad_norm": 19.92795753479004, + "learning_rate": 8.863333880630862e-06, + "loss": 5.0542, + "step": 43120 + }, + { + "epoch": 0.87738037109375, + "grad_norm": 13.61466121673584, + "learning_rate": 8.863080148316992e-06, + "loss": 5.1089, + "step": 43125 + }, + { + "epoch": 0.8774820963541666, + "grad_norm": 17.783294677734375, + "learning_rate": 8.862826391319266e-06, + "loss": 4.8666, + "step": 43130 + }, + { + "epoch": 0.8775838216145834, + "grad_norm": 18.34406280517578, + "learning_rate": 8.862572609639305e-06, + "loss": 4.8553, + "step": 43135 + }, + { + "epoch": 0.877685546875, + "grad_norm": 16.58452796936035, + "learning_rate": 8.862318803278731e-06, + "loss": 5.4529, + "step": 43140 + }, + { + "epoch": 0.8777872721354166, + "grad_norm": 14.94102668762207, + "learning_rate": 8.862064972239166e-06, + "loss": 5.1386, + "step": 43145 + }, + { + "epoch": 0.8778889973958334, + "grad_norm": 15.67796802520752, + "learning_rate": 8.861811116522232e-06, + "loss": 5.1407, + "step": 43150 + }, + { + "epoch": 0.87799072265625, + "grad_norm": 26.826356887817383, + "learning_rate": 8.86155723612955e-06, + "loss": 5.2306, + "step": 43155 + }, + { + "epoch": 0.8780924479166666, + "grad_norm": 21.15692138671875, + "learning_rate": 8.861303331062745e-06, + "loss": 5.1635, + "step": 43160 + }, + { + "epoch": 0.8781941731770834, + "grad_norm": 17.619159698486328, + "learning_rate": 8.861049401323436e-06, + "loss": 4.656, + "step": 43165 + }, + { + "epoch": 0.8782958984375, + "grad_norm": 14.08676528930664, + "learning_rate": 8.860795446913248e-06, + "loss": 5.199, + "step": 43170 + }, + { + "epoch": 0.8783976236979166, + "grad_norm": 18.83004379272461, + "learning_rate": 8.860541467833803e-06, + "loss": 4.8929, + "step": 43175 + }, + { + "epoch": 0.8784993489583334, + "grad_norm": 15.802373886108398, + "learning_rate": 8.860287464086723e-06, + "loss": 4.9257, + "step": 43180 + }, + { + "epoch": 0.87860107421875, + "grad_norm": 16.453533172607422, + "learning_rate": 8.860033435673631e-06, + "loss": 5.0465, + "step": 43185 + }, + { + "epoch": 0.8787027994791666, + "grad_norm": 19.300634384155273, + "learning_rate": 8.859779382596151e-06, + "loss": 5.0227, + "step": 43190 + }, + { + "epoch": 0.8788045247395834, + "grad_norm": 14.674081802368164, + "learning_rate": 8.859525304855905e-06, + "loss": 5.1928, + "step": 43195 + }, + { + "epoch": 0.87890625, + "grad_norm": 23.306177139282227, + "learning_rate": 8.859271202454519e-06, + "loss": 4.91, + "step": 43200 + }, + { + "epoch": 0.8790079752604166, + "grad_norm": 13.839393615722656, + "learning_rate": 8.859017075393614e-06, + "loss": 4.9589, + "step": 43205 + }, + { + "epoch": 0.8791097005208334, + "grad_norm": 16.97818374633789, + "learning_rate": 8.858762923674816e-06, + "loss": 5.2546, + "step": 43210 + }, + { + "epoch": 0.87921142578125, + "grad_norm": 16.935958862304688, + "learning_rate": 8.858508747299748e-06, + "loss": 4.9475, + "step": 43215 + }, + { + "epoch": 0.8793131510416666, + "grad_norm": 15.924129486083984, + "learning_rate": 8.858254546270034e-06, + "loss": 5.0988, + "step": 43220 + }, + { + "epoch": 0.8794148763020834, + "grad_norm": 15.531250953674316, + "learning_rate": 8.858000320587297e-06, + "loss": 5.1207, + "step": 43225 + }, + { + "epoch": 0.8795166015625, + "grad_norm": 14.92564582824707, + "learning_rate": 8.85774607025316e-06, + "loss": 5.0749, + "step": 43230 + }, + { + "epoch": 0.8796183268229166, + "grad_norm": 21.05592918395996, + "learning_rate": 8.857491795269256e-06, + "loss": 5.1291, + "step": 43235 + }, + { + "epoch": 0.8797200520833334, + "grad_norm": 14.879911422729492, + "learning_rate": 8.8572374956372e-06, + "loss": 4.9984, + "step": 43240 + }, + { + "epoch": 0.87982177734375, + "grad_norm": 14.340715408325195, + "learning_rate": 8.856983171358623e-06, + "loss": 5.2179, + "step": 43245 + }, + { + "epoch": 0.8799235026041666, + "grad_norm": 16.813234329223633, + "learning_rate": 8.856728822435147e-06, + "loss": 4.9506, + "step": 43250 + }, + { + "epoch": 0.8800252278645834, + "grad_norm": 17.15883445739746, + "learning_rate": 8.856474448868398e-06, + "loss": 4.7865, + "step": 43255 + }, + { + "epoch": 0.880126953125, + "grad_norm": 22.386655807495117, + "learning_rate": 8.856220050660002e-06, + "loss": 5.1302, + "step": 43260 + }, + { + "epoch": 0.8802286783854166, + "grad_norm": 23.604948043823242, + "learning_rate": 8.855965627811585e-06, + "loss": 5.0757, + "step": 43265 + }, + { + "epoch": 0.8803304036458334, + "grad_norm": 18.568761825561523, + "learning_rate": 8.855711180324769e-06, + "loss": 4.9235, + "step": 43270 + }, + { + "epoch": 0.88043212890625, + "grad_norm": 17.745853424072266, + "learning_rate": 8.855456708201184e-06, + "loss": 5.0539, + "step": 43275 + }, + { + "epoch": 0.8805338541666666, + "grad_norm": 15.840141296386719, + "learning_rate": 8.855202211442454e-06, + "loss": 5.0369, + "step": 43280 + }, + { + "epoch": 0.8806355794270834, + "grad_norm": 18.25118637084961, + "learning_rate": 8.854947690050206e-06, + "loss": 5.1209, + "step": 43285 + }, + { + "epoch": 0.8807373046875, + "grad_norm": 19.835289001464844, + "learning_rate": 8.854693144026066e-06, + "loss": 5.2511, + "step": 43290 + }, + { + "epoch": 0.8808390299479166, + "grad_norm": 21.370914459228516, + "learning_rate": 8.85443857337166e-06, + "loss": 5.0885, + "step": 43295 + }, + { + "epoch": 0.8809407552083334, + "grad_norm": 18.529329299926758, + "learning_rate": 8.854183978088617e-06, + "loss": 4.9927, + "step": 43300 + }, + { + "epoch": 0.88104248046875, + "grad_norm": 21.313030242919922, + "learning_rate": 8.85392935817856e-06, + "loss": 5.2037, + "step": 43305 + }, + { + "epoch": 0.8811442057291666, + "grad_norm": 16.584238052368164, + "learning_rate": 8.853674713643119e-06, + "loss": 4.9847, + "step": 43310 + }, + { + "epoch": 0.8812459309895834, + "grad_norm": 20.816940307617188, + "learning_rate": 8.853420044483918e-06, + "loss": 4.8284, + "step": 43315 + }, + { + "epoch": 0.88134765625, + "grad_norm": 17.534570693969727, + "learning_rate": 8.853165350702589e-06, + "loss": 5.1406, + "step": 43320 + }, + { + "epoch": 0.8814493815104166, + "grad_norm": 14.431397438049316, + "learning_rate": 8.852910632300754e-06, + "loss": 5.1512, + "step": 43325 + }, + { + "epoch": 0.8815511067708334, + "grad_norm": 16.41152572631836, + "learning_rate": 8.852655889280045e-06, + "loss": 4.9137, + "step": 43330 + }, + { + "epoch": 0.88165283203125, + "grad_norm": 23.277835845947266, + "learning_rate": 8.852401121642086e-06, + "loss": 5.0337, + "step": 43335 + }, + { + "epoch": 0.8817545572916666, + "grad_norm": 17.28369903564453, + "learning_rate": 8.852146329388506e-06, + "loss": 5.0696, + "step": 43340 + }, + { + "epoch": 0.8818562825520834, + "grad_norm": 18.581008911132812, + "learning_rate": 8.851891512520937e-06, + "loss": 4.9642, + "step": 43345 + }, + { + "epoch": 0.8819580078125, + "grad_norm": 18.27910614013672, + "learning_rate": 8.851636671041002e-06, + "loss": 4.9527, + "step": 43350 + }, + { + "epoch": 0.8820597330729166, + "grad_norm": 17.850893020629883, + "learning_rate": 8.851381804950332e-06, + "loss": 5.1957, + "step": 43355 + }, + { + "epoch": 0.8821614583333334, + "grad_norm": 19.223196029663086, + "learning_rate": 8.851126914250555e-06, + "loss": 5.0166, + "step": 43360 + }, + { + "epoch": 0.88226318359375, + "grad_norm": 20.0583438873291, + "learning_rate": 8.8508719989433e-06, + "loss": 4.9536, + "step": 43365 + }, + { + "epoch": 0.8823649088541666, + "grad_norm": 20.515249252319336, + "learning_rate": 8.850617059030193e-06, + "loss": 4.9156, + "step": 43370 + }, + { + "epoch": 0.8824666341145834, + "grad_norm": 14.12017822265625, + "learning_rate": 8.850362094512868e-06, + "loss": 5.1583, + "step": 43375 + }, + { + "epoch": 0.882568359375, + "grad_norm": 19.50937843322754, + "learning_rate": 8.85010710539295e-06, + "loss": 5.171, + "step": 43380 + }, + { + "epoch": 0.8826700846354166, + "grad_norm": 14.903800964355469, + "learning_rate": 8.84985209167207e-06, + "loss": 5.2457, + "step": 43385 + }, + { + "epoch": 0.8827718098958334, + "grad_norm": 18.560569763183594, + "learning_rate": 8.849597053351857e-06, + "loss": 5.1629, + "step": 43390 + }, + { + "epoch": 0.88287353515625, + "grad_norm": 23.286943435668945, + "learning_rate": 8.849341990433942e-06, + "loss": 5.2262, + "step": 43395 + }, + { + "epoch": 0.8829752604166666, + "grad_norm": 21.039440155029297, + "learning_rate": 8.849086902919952e-06, + "loss": 5.1238, + "step": 43400 + }, + { + "epoch": 0.8830769856770834, + "grad_norm": 16.1473388671875, + "learning_rate": 8.84883179081152e-06, + "loss": 4.8796, + "step": 43405 + }, + { + "epoch": 0.8831787109375, + "grad_norm": 16.729177474975586, + "learning_rate": 8.848576654110273e-06, + "loss": 5.0818, + "step": 43410 + }, + { + "epoch": 0.8832804361979166, + "grad_norm": 17.777381896972656, + "learning_rate": 8.848321492817843e-06, + "loss": 4.9699, + "step": 43415 + }, + { + "epoch": 0.8833821614583334, + "grad_norm": 17.252328872680664, + "learning_rate": 8.848066306935863e-06, + "loss": 4.8707, + "step": 43420 + }, + { + "epoch": 0.88348388671875, + "grad_norm": 15.509208679199219, + "learning_rate": 8.847811096465959e-06, + "loss": 5.1121, + "step": 43425 + }, + { + "epoch": 0.8835856119791666, + "grad_norm": 17.677640914916992, + "learning_rate": 8.847555861409763e-06, + "loss": 5.1069, + "step": 43430 + }, + { + "epoch": 0.8836873372395834, + "grad_norm": 14.529900550842285, + "learning_rate": 8.847300601768907e-06, + "loss": 4.8605, + "step": 43435 + }, + { + "epoch": 0.8837890625, + "grad_norm": 14.316545486450195, + "learning_rate": 8.847045317545022e-06, + "loss": 4.9399, + "step": 43440 + }, + { + "epoch": 0.8838907877604166, + "grad_norm": 15.073031425476074, + "learning_rate": 8.84679000873974e-06, + "loss": 5.053, + "step": 43445 + }, + { + "epoch": 0.8839925130208334, + "grad_norm": 16.586397171020508, + "learning_rate": 8.846534675354687e-06, + "loss": 5.1449, + "step": 43450 + }, + { + "epoch": 0.88409423828125, + "grad_norm": 15.834929466247559, + "learning_rate": 8.8462793173915e-06, + "loss": 4.9066, + "step": 43455 + }, + { + "epoch": 0.8841959635416666, + "grad_norm": 15.22117805480957, + "learning_rate": 8.846023934851811e-06, + "loss": 5.2954, + "step": 43460 + }, + { + "epoch": 0.8842976888020834, + "grad_norm": 13.203136444091797, + "learning_rate": 8.845768527737249e-06, + "loss": 5.3222, + "step": 43465 + }, + { + "epoch": 0.8843994140625, + "grad_norm": 13.989424705505371, + "learning_rate": 8.845513096049446e-06, + "loss": 5.0419, + "step": 43470 + }, + { + "epoch": 0.8845011393229166, + "grad_norm": 17.009462356567383, + "learning_rate": 8.845257639790035e-06, + "loss": 4.9235, + "step": 43475 + }, + { + "epoch": 0.8846028645833334, + "grad_norm": 13.738597869873047, + "learning_rate": 8.84500215896065e-06, + "loss": 5.0915, + "step": 43480 + }, + { + "epoch": 0.88470458984375, + "grad_norm": 14.339741706848145, + "learning_rate": 8.84474665356292e-06, + "loss": 5.1714, + "step": 43485 + }, + { + "epoch": 0.8848063151041666, + "grad_norm": 16.953460693359375, + "learning_rate": 8.84449112359848e-06, + "loss": 4.8055, + "step": 43490 + }, + { + "epoch": 0.8849080403645834, + "grad_norm": 21.544858932495117, + "learning_rate": 8.844235569068962e-06, + "loss": 4.9449, + "step": 43495 + }, + { + "epoch": 0.885009765625, + "grad_norm": 18.54915428161621, + "learning_rate": 8.843979989976e-06, + "loss": 4.9372, + "step": 43500 + }, + { + "epoch": 0.8851114908854166, + "grad_norm": 14.941094398498535, + "learning_rate": 8.843724386321224e-06, + "loss": 4.9701, + "step": 43505 + }, + { + "epoch": 0.8852132161458334, + "grad_norm": 19.67859649658203, + "learning_rate": 8.843468758106272e-06, + "loss": 4.7706, + "step": 43510 + }, + { + "epoch": 0.88531494140625, + "grad_norm": 15.795574188232422, + "learning_rate": 8.843213105332773e-06, + "loss": 5.0671, + "step": 43515 + }, + { + "epoch": 0.8854166666666666, + "grad_norm": 18.464181900024414, + "learning_rate": 8.842957428002364e-06, + "loss": 5.0662, + "step": 43520 + }, + { + "epoch": 0.8855183919270834, + "grad_norm": 18.357845306396484, + "learning_rate": 8.842701726116675e-06, + "loss": 4.9828, + "step": 43525 + }, + { + "epoch": 0.8856201171875, + "grad_norm": 19.23867416381836, + "learning_rate": 8.842445999677343e-06, + "loss": 5.0232, + "step": 43530 + }, + { + "epoch": 0.8857218424479166, + "grad_norm": 16.598224639892578, + "learning_rate": 8.842190248686e-06, + "loss": 4.8694, + "step": 43535 + }, + { + "epoch": 0.8858235677083334, + "grad_norm": 19.199607849121094, + "learning_rate": 8.84193447314428e-06, + "loss": 5.001, + "step": 43540 + }, + { + "epoch": 0.88592529296875, + "grad_norm": 14.221386909484863, + "learning_rate": 8.84167867305382e-06, + "loss": 5.0704, + "step": 43545 + }, + { + "epoch": 0.8860270182291666, + "grad_norm": 17.471925735473633, + "learning_rate": 8.841422848416252e-06, + "loss": 4.8706, + "step": 43550 + }, + { + "epoch": 0.8861287434895834, + "grad_norm": 22.17996597290039, + "learning_rate": 8.841166999233212e-06, + "loss": 4.819, + "step": 43555 + }, + { + "epoch": 0.88623046875, + "grad_norm": 15.015336990356445, + "learning_rate": 8.840911125506334e-06, + "loss": 5.2007, + "step": 43560 + }, + { + "epoch": 0.8863321940104166, + "grad_norm": 15.549728393554688, + "learning_rate": 8.840655227237254e-06, + "loss": 4.887, + "step": 43565 + }, + { + "epoch": 0.8864339192708334, + "grad_norm": 16.339868545532227, + "learning_rate": 8.840399304427605e-06, + "loss": 5.0613, + "step": 43570 + }, + { + "epoch": 0.88653564453125, + "grad_norm": 16.520967483520508, + "learning_rate": 8.840143357079024e-06, + "loss": 5.1026, + "step": 43575 + }, + { + "epoch": 0.8866373697916666, + "grad_norm": 20.055057525634766, + "learning_rate": 8.839887385193147e-06, + "loss": 5.0611, + "step": 43580 + }, + { + "epoch": 0.8867390950520834, + "grad_norm": 18.373689651489258, + "learning_rate": 8.839631388771606e-06, + "loss": 4.6856, + "step": 43585 + }, + { + "epoch": 0.8868408203125, + "grad_norm": 17.762706756591797, + "learning_rate": 8.839375367816041e-06, + "loss": 4.9028, + "step": 43590 + }, + { + "epoch": 0.8869425455729166, + "grad_norm": 19.754465103149414, + "learning_rate": 8.839119322328087e-06, + "loss": 5.1975, + "step": 43595 + }, + { + "epoch": 0.8870442708333334, + "grad_norm": 22.61135482788086, + "learning_rate": 8.838863252309378e-06, + "loss": 5.0153, + "step": 43600 + }, + { + "epoch": 0.88714599609375, + "grad_norm": 22.965160369873047, + "learning_rate": 8.838607157761551e-06, + "loss": 4.8689, + "step": 43605 + }, + { + "epoch": 0.8872477213541666, + "grad_norm": 18.441572189331055, + "learning_rate": 8.838351038686244e-06, + "loss": 4.9348, + "step": 43610 + }, + { + "epoch": 0.8873494466145834, + "grad_norm": 16.894760131835938, + "learning_rate": 8.838094895085092e-06, + "loss": 4.9278, + "step": 43615 + }, + { + "epoch": 0.887451171875, + "grad_norm": 17.944318771362305, + "learning_rate": 8.837838726959731e-06, + "loss": 4.8502, + "step": 43620 + }, + { + "epoch": 0.8875528971354166, + "grad_norm": 20.14411735534668, + "learning_rate": 8.837582534311799e-06, + "loss": 5.2477, + "step": 43625 + }, + { + "epoch": 0.8876546223958334, + "grad_norm": 20.384418487548828, + "learning_rate": 8.837326317142934e-06, + "loss": 5.0245, + "step": 43630 + }, + { + "epoch": 0.88775634765625, + "grad_norm": 16.123287200927734, + "learning_rate": 8.837070075454772e-06, + "loss": 4.9369, + "step": 43635 + }, + { + "epoch": 0.8878580729166666, + "grad_norm": 15.937771797180176, + "learning_rate": 8.83681380924895e-06, + "loss": 5.0792, + "step": 43640 + }, + { + "epoch": 0.8879597981770834, + "grad_norm": 19.37684440612793, + "learning_rate": 8.836557518527104e-06, + "loss": 5.1685, + "step": 43645 + }, + { + "epoch": 0.8880615234375, + "grad_norm": 16.12508773803711, + "learning_rate": 8.836301203290875e-06, + "loss": 5.2402, + "step": 43650 + }, + { + "epoch": 0.8881632486979166, + "grad_norm": 12.314973831176758, + "learning_rate": 8.836044863541899e-06, + "loss": 5.5571, + "step": 43655 + }, + { + "epoch": 0.8882649739583334, + "grad_norm": 19.256206512451172, + "learning_rate": 8.835788499281814e-06, + "loss": 4.9603, + "step": 43660 + }, + { + "epoch": 0.88836669921875, + "grad_norm": 17.809324264526367, + "learning_rate": 8.83553211051226e-06, + "loss": 4.9396, + "step": 43665 + }, + { + "epoch": 0.8884684244791666, + "grad_norm": 16.7783260345459, + "learning_rate": 8.835275697234869e-06, + "loss": 5.2354, + "step": 43670 + }, + { + "epoch": 0.8885701497395834, + "grad_norm": 19.431888580322266, + "learning_rate": 8.835019259451287e-06, + "loss": 5.2328, + "step": 43675 + }, + { + "epoch": 0.888671875, + "grad_norm": 17.43482208251953, + "learning_rate": 8.834762797163147e-06, + "loss": 5.3085, + "step": 43680 + }, + { + "epoch": 0.8887736002604166, + "grad_norm": 15.572704315185547, + "learning_rate": 8.834506310372092e-06, + "loss": 4.9359, + "step": 43685 + }, + { + "epoch": 0.8888753255208334, + "grad_norm": 25.043502807617188, + "learning_rate": 8.834249799079759e-06, + "loss": 5.237, + "step": 43690 + }, + { + "epoch": 0.88897705078125, + "grad_norm": 22.535654067993164, + "learning_rate": 8.833993263287787e-06, + "loss": 5.0626, + "step": 43695 + }, + { + "epoch": 0.8890787760416666, + "grad_norm": 18.26309585571289, + "learning_rate": 8.833736702997815e-06, + "loss": 5.0901, + "step": 43700 + }, + { + "epoch": 0.8891805013020834, + "grad_norm": 16.420917510986328, + "learning_rate": 8.83348011821148e-06, + "loss": 5.3538, + "step": 43705 + }, + { + "epoch": 0.8892822265625, + "grad_norm": 14.094341278076172, + "learning_rate": 8.833223508930427e-06, + "loss": 5.0865, + "step": 43710 + }, + { + "epoch": 0.8893839518229166, + "grad_norm": 16.382322311401367, + "learning_rate": 8.832966875156293e-06, + "loss": 5.0073, + "step": 43715 + }, + { + "epoch": 0.8894856770833334, + "grad_norm": 18.764591217041016, + "learning_rate": 8.832710216890714e-06, + "loss": 4.7952, + "step": 43720 + }, + { + "epoch": 0.88958740234375, + "grad_norm": 16.274511337280273, + "learning_rate": 8.832453534135336e-06, + "loss": 5.2775, + "step": 43725 + }, + { + "epoch": 0.8896891276041666, + "grad_norm": 18.26470375061035, + "learning_rate": 8.832196826891797e-06, + "loss": 4.982, + "step": 43730 + }, + { + "epoch": 0.8897908528645834, + "grad_norm": 16.99710464477539, + "learning_rate": 8.831940095161735e-06, + "loss": 4.8676, + "step": 43735 + }, + { + "epoch": 0.889892578125, + "grad_norm": 16.426069259643555, + "learning_rate": 8.831683338946792e-06, + "loss": 5.238, + "step": 43740 + }, + { + "epoch": 0.8899943033854166, + "grad_norm": 18.00581932067871, + "learning_rate": 8.83142655824861e-06, + "loss": 4.9932, + "step": 43745 + }, + { + "epoch": 0.8900960286458334, + "grad_norm": 16.996177673339844, + "learning_rate": 8.83116975306883e-06, + "loss": 5.0197, + "step": 43750 + }, + { + "epoch": 0.89019775390625, + "grad_norm": 15.403931617736816, + "learning_rate": 8.830912923409088e-06, + "loss": 5.3586, + "step": 43755 + }, + { + "epoch": 0.8902994791666666, + "grad_norm": 11.899918556213379, + "learning_rate": 8.83065606927103e-06, + "loss": 5.119, + "step": 43760 + }, + { + "epoch": 0.8904012044270834, + "grad_norm": 23.315542221069336, + "learning_rate": 8.830399190656296e-06, + "loss": 4.8886, + "step": 43765 + }, + { + "epoch": 0.8905029296875, + "grad_norm": 15.885437965393066, + "learning_rate": 8.830142287566527e-06, + "loss": 4.7078, + "step": 43770 + }, + { + "epoch": 0.8906046549479166, + "grad_norm": 18.878049850463867, + "learning_rate": 8.829885360003365e-06, + "loss": 4.982, + "step": 43775 + }, + { + "epoch": 0.8907063802083334, + "grad_norm": 17.191532135009766, + "learning_rate": 8.829628407968451e-06, + "loss": 5.0186, + "step": 43780 + }, + { + "epoch": 0.89080810546875, + "grad_norm": 18.1351261138916, + "learning_rate": 8.829371431463428e-06, + "loss": 5.039, + "step": 43785 + }, + { + "epoch": 0.8909098307291666, + "grad_norm": 48.80992126464844, + "learning_rate": 8.829114430489936e-06, + "loss": 5.0373, + "step": 43790 + }, + { + "epoch": 0.8910115559895834, + "grad_norm": 18.137210845947266, + "learning_rate": 8.828857405049617e-06, + "loss": 5.1545, + "step": 43795 + }, + { + "epoch": 0.89111328125, + "grad_norm": 12.852038383483887, + "learning_rate": 8.828600355144117e-06, + "loss": 4.7983, + "step": 43800 + }, + { + "epoch": 0.8912150065104166, + "grad_norm": 13.131843566894531, + "learning_rate": 8.828343280775075e-06, + "loss": 5.1019, + "step": 43805 + }, + { + "epoch": 0.8913167317708334, + "grad_norm": 16.04221534729004, + "learning_rate": 8.828086181944135e-06, + "loss": 5.021, + "step": 43810 + }, + { + "epoch": 0.89141845703125, + "grad_norm": 18.21012306213379, + "learning_rate": 8.827829058652938e-06, + "loss": 5.0323, + "step": 43815 + }, + { + "epoch": 0.8915201822916666, + "grad_norm": 15.249977111816406, + "learning_rate": 8.827571910903132e-06, + "loss": 5.1116, + "step": 43820 + }, + { + "epoch": 0.8916219075520834, + "grad_norm": 16.31293487548828, + "learning_rate": 8.827314738696353e-06, + "loss": 5.2947, + "step": 43825 + }, + { + "epoch": 0.8917236328125, + "grad_norm": 18.216583251953125, + "learning_rate": 8.82705754203425e-06, + "loss": 4.833, + "step": 43830 + }, + { + "epoch": 0.8918253580729166, + "grad_norm": 14.062454223632812, + "learning_rate": 8.826800320918464e-06, + "loss": 4.8073, + "step": 43835 + }, + { + "epoch": 0.8919270833333334, + "grad_norm": 19.791261672973633, + "learning_rate": 8.826543075350638e-06, + "loss": 5.0135, + "step": 43840 + }, + { + "epoch": 0.89202880859375, + "grad_norm": 16.0094051361084, + "learning_rate": 8.826285805332417e-06, + "loss": 5.0448, + "step": 43845 + }, + { + "epoch": 0.8921305338541666, + "grad_norm": 14.087653160095215, + "learning_rate": 8.826028510865443e-06, + "loss": 5.065, + "step": 43850 + }, + { + "epoch": 0.8922322591145834, + "grad_norm": 14.951184272766113, + "learning_rate": 8.825771191951362e-06, + "loss": 4.897, + "step": 43855 + }, + { + "epoch": 0.892333984375, + "grad_norm": 15.768499374389648, + "learning_rate": 8.82551384859182e-06, + "loss": 4.8002, + "step": 43860 + }, + { + "epoch": 0.8924357096354166, + "grad_norm": 20.58837890625, + "learning_rate": 8.825256480788455e-06, + "loss": 5.117, + "step": 43865 + }, + { + "epoch": 0.8925374348958334, + "grad_norm": 34.882102966308594, + "learning_rate": 8.824999088542919e-06, + "loss": 5.3063, + "step": 43870 + }, + { + "epoch": 0.89263916015625, + "grad_norm": 20.6192569732666, + "learning_rate": 8.82474167185685e-06, + "loss": 5.2496, + "step": 43875 + }, + { + "epoch": 0.8927408854166666, + "grad_norm": 19.18145751953125, + "learning_rate": 8.824484230731898e-06, + "loss": 5.0536, + "step": 43880 + }, + { + "epoch": 0.8928426106770834, + "grad_norm": 15.75452995300293, + "learning_rate": 8.824226765169704e-06, + "loss": 4.9485, + "step": 43885 + }, + { + "epoch": 0.8929443359375, + "grad_norm": 17.815906524658203, + "learning_rate": 8.823969275171915e-06, + "loss": 5.1787, + "step": 43890 + }, + { + "epoch": 0.8930460611979166, + "grad_norm": 16.237850189208984, + "learning_rate": 8.823711760740177e-06, + "loss": 5.0768, + "step": 43895 + }, + { + "epoch": 0.8931477864583334, + "grad_norm": 18.75973892211914, + "learning_rate": 8.823454221876135e-06, + "loss": 4.8068, + "step": 43900 + }, + { + "epoch": 0.89324951171875, + "grad_norm": 18.08612823486328, + "learning_rate": 8.823196658581435e-06, + "loss": 5.1272, + "step": 43905 + }, + { + "epoch": 0.8933512369791666, + "grad_norm": 17.677949905395508, + "learning_rate": 8.82293907085772e-06, + "loss": 5.1753, + "step": 43910 + }, + { + "epoch": 0.8934529622395834, + "grad_norm": 20.03943634033203, + "learning_rate": 8.82268145870664e-06, + "loss": 5.0166, + "step": 43915 + }, + { + "epoch": 0.8935546875, + "grad_norm": 15.189835548400879, + "learning_rate": 8.822423822129836e-06, + "loss": 4.8593, + "step": 43920 + }, + { + "epoch": 0.8936564127604166, + "grad_norm": 19.18446159362793, + "learning_rate": 8.82216616112896e-06, + "loss": 4.8274, + "step": 43925 + }, + { + "epoch": 0.8937581380208334, + "grad_norm": 19.14366912841797, + "learning_rate": 8.821908475705654e-06, + "loss": 5.2363, + "step": 43930 + }, + { + "epoch": 0.89385986328125, + "grad_norm": 17.006399154663086, + "learning_rate": 8.821650765861567e-06, + "loss": 5.1203, + "step": 43935 + }, + { + "epoch": 0.8939615885416666, + "grad_norm": 20.625137329101562, + "learning_rate": 8.821393031598345e-06, + "loss": 5.0023, + "step": 43940 + }, + { + "epoch": 0.8940633138020834, + "grad_norm": 18.584531784057617, + "learning_rate": 8.821135272917634e-06, + "loss": 5.0976, + "step": 43945 + }, + { + "epoch": 0.8941650390625, + "grad_norm": 17.97655487060547, + "learning_rate": 8.82087748982108e-06, + "loss": 4.9914, + "step": 43950 + }, + { + "epoch": 0.8942667643229166, + "grad_norm": 16.112058639526367, + "learning_rate": 8.820619682310333e-06, + "loss": 5.047, + "step": 43955 + }, + { + "epoch": 0.8943684895833334, + "grad_norm": 13.876602172851562, + "learning_rate": 8.82036185038704e-06, + "loss": 5.0592, + "step": 43960 + }, + { + "epoch": 0.89447021484375, + "grad_norm": 22.62623405456543, + "learning_rate": 8.820103994052846e-06, + "loss": 5.0375, + "step": 43965 + }, + { + "epoch": 0.8945719401041666, + "grad_norm": 18.205001831054688, + "learning_rate": 8.819846113309402e-06, + "loss": 4.9724, + "step": 43970 + }, + { + "epoch": 0.8946736653645834, + "grad_norm": 16.563724517822266, + "learning_rate": 8.819588208158353e-06, + "loss": 5.0031, + "step": 43975 + }, + { + "epoch": 0.894775390625, + "grad_norm": 19.34146499633789, + "learning_rate": 8.819330278601348e-06, + "loss": 4.8112, + "step": 43980 + }, + { + "epoch": 0.8948771158854166, + "grad_norm": 18.761892318725586, + "learning_rate": 8.819072324640034e-06, + "loss": 5.0539, + "step": 43985 + }, + { + "epoch": 0.8949788411458334, + "grad_norm": 16.245933532714844, + "learning_rate": 8.818814346276062e-06, + "loss": 4.8342, + "step": 43990 + }, + { + "epoch": 0.89508056640625, + "grad_norm": 22.90732192993164, + "learning_rate": 8.818556343511078e-06, + "loss": 4.9492, + "step": 43995 + }, + { + "epoch": 0.8951822916666666, + "grad_norm": 17.41977310180664, + "learning_rate": 8.818298316346729e-06, + "loss": 4.9423, + "step": 44000 + }, + { + "epoch": 0.8952840169270834, + "grad_norm": 16.071121215820312, + "learning_rate": 8.818040264784668e-06, + "loss": 4.9391, + "step": 44005 + }, + { + "epoch": 0.8953857421875, + "grad_norm": 12.441767692565918, + "learning_rate": 8.81778218882654e-06, + "loss": 4.8547, + "step": 44010 + }, + { + "epoch": 0.8954874674479166, + "grad_norm": 16.719533920288086, + "learning_rate": 8.817524088473998e-06, + "loss": 4.787, + "step": 44015 + }, + { + "epoch": 0.8955891927083334, + "grad_norm": 20.205461502075195, + "learning_rate": 8.817265963728687e-06, + "loss": 5.341, + "step": 44020 + }, + { + "epoch": 0.89569091796875, + "grad_norm": 19.02943992614746, + "learning_rate": 8.81700781459226e-06, + "loss": 4.933, + "step": 44025 + }, + { + "epoch": 0.8957926432291666, + "grad_norm": 17.110353469848633, + "learning_rate": 8.816749641066363e-06, + "loss": 5.1332, + "step": 44030 + }, + { + "epoch": 0.8958943684895834, + "grad_norm": 19.408464431762695, + "learning_rate": 8.816491443152649e-06, + "loss": 5.062, + "step": 44035 + }, + { + "epoch": 0.89599609375, + "grad_norm": 14.951727867126465, + "learning_rate": 8.816233220852766e-06, + "loss": 5.3047, + "step": 44040 + }, + { + "epoch": 0.8960978190104166, + "grad_norm": 16.878910064697266, + "learning_rate": 8.815974974168363e-06, + "loss": 5.0974, + "step": 44045 + }, + { + "epoch": 0.8961995442708334, + "grad_norm": 18.19757080078125, + "learning_rate": 8.815716703101092e-06, + "loss": 5.1524, + "step": 44050 + }, + { + "epoch": 0.89630126953125, + "grad_norm": 19.7820987701416, + "learning_rate": 8.815458407652601e-06, + "loss": 4.9838, + "step": 44055 + }, + { + "epoch": 0.8964029947916666, + "grad_norm": 12.776026725769043, + "learning_rate": 8.815200087824545e-06, + "loss": 4.9085, + "step": 44060 + }, + { + "epoch": 0.8965047200520834, + "grad_norm": 16.545961380004883, + "learning_rate": 8.81494174361857e-06, + "loss": 4.918, + "step": 44065 + }, + { + "epoch": 0.8966064453125, + "grad_norm": 17.11878204345703, + "learning_rate": 8.814683375036328e-06, + "loss": 5.1841, + "step": 44070 + }, + { + "epoch": 0.8967081705729166, + "grad_norm": 19.741987228393555, + "learning_rate": 8.81442498207947e-06, + "loss": 5.2288, + "step": 44075 + }, + { + "epoch": 0.8968098958333334, + "grad_norm": 12.684208869934082, + "learning_rate": 8.814166564749648e-06, + "loss": 5.2406, + "step": 44080 + }, + { + "epoch": 0.89691162109375, + "grad_norm": 14.23024845123291, + "learning_rate": 8.81390812304851e-06, + "loss": 4.9521, + "step": 44085 + }, + { + "epoch": 0.8970133463541666, + "grad_norm": 15.683372497558594, + "learning_rate": 8.813649656977713e-06, + "loss": 5.1354, + "step": 44090 + }, + { + "epoch": 0.8971150716145834, + "grad_norm": 19.441524505615234, + "learning_rate": 8.813391166538905e-06, + "loss": 4.7343, + "step": 44095 + }, + { + "epoch": 0.897216796875, + "grad_norm": 17.920862197875977, + "learning_rate": 8.813132651733738e-06, + "loss": 5.1325, + "step": 44100 + }, + { + "epoch": 0.8973185221354166, + "grad_norm": 22.013103485107422, + "learning_rate": 8.812874112563862e-06, + "loss": 5.173, + "step": 44105 + }, + { + "epoch": 0.8974202473958334, + "grad_norm": 14.910172462463379, + "learning_rate": 8.812615549030932e-06, + "loss": 5.3033, + "step": 44110 + }, + { + "epoch": 0.89752197265625, + "grad_norm": 18.413862228393555, + "learning_rate": 8.812356961136598e-06, + "loss": 4.8896, + "step": 44115 + }, + { + "epoch": 0.8976236979166666, + "grad_norm": 12.538565635681152, + "learning_rate": 8.812098348882515e-06, + "loss": 5.2266, + "step": 44120 + }, + { + "epoch": 0.8977254231770834, + "grad_norm": 14.207859992980957, + "learning_rate": 8.811839712270332e-06, + "loss": 5.1909, + "step": 44125 + }, + { + "epoch": 0.8978271484375, + "grad_norm": 20.395051956176758, + "learning_rate": 8.811581051301703e-06, + "loss": 4.9025, + "step": 44130 + }, + { + "epoch": 0.8979288736979166, + "grad_norm": 12.256073951721191, + "learning_rate": 8.811322365978283e-06, + "loss": 5.3112, + "step": 44135 + }, + { + "epoch": 0.8980305989583334, + "grad_norm": 20.275794982910156, + "learning_rate": 8.81106365630172e-06, + "loss": 5.4515, + "step": 44140 + }, + { + "epoch": 0.89813232421875, + "grad_norm": 17.700637817382812, + "learning_rate": 8.810804922273672e-06, + "loss": 5.1934, + "step": 44145 + }, + { + "epoch": 0.8982340494791666, + "grad_norm": 20.066287994384766, + "learning_rate": 8.81054616389579e-06, + "loss": 4.9637, + "step": 44150 + }, + { + "epoch": 0.8983357747395834, + "grad_norm": 13.670787811279297, + "learning_rate": 8.81028738116973e-06, + "loss": 5.0384, + "step": 44155 + }, + { + "epoch": 0.8984375, + "grad_norm": 16.84861183166504, + "learning_rate": 8.81002857409714e-06, + "loss": 4.7753, + "step": 44160 + }, + { + "epoch": 0.8985392252604166, + "grad_norm": 19.268718719482422, + "learning_rate": 8.809769742679678e-06, + "loss": 5.2125, + "step": 44165 + }, + { + "epoch": 0.8986409505208334, + "grad_norm": 13.888301849365234, + "learning_rate": 8.809510886918995e-06, + "loss": 4.9393, + "step": 44170 + }, + { + "epoch": 0.89874267578125, + "grad_norm": 13.316596031188965, + "learning_rate": 8.80925200681675e-06, + "loss": 5.1397, + "step": 44175 + }, + { + "epoch": 0.8988444010416666, + "grad_norm": 18.665145874023438, + "learning_rate": 8.80899310237459e-06, + "loss": 5.0867, + "step": 44180 + }, + { + "epoch": 0.8989461263020834, + "grad_norm": 14.23328971862793, + "learning_rate": 8.808734173594175e-06, + "loss": 4.864, + "step": 44185 + }, + { + "epoch": 0.8990478515625, + "grad_norm": 16.794658660888672, + "learning_rate": 8.808475220477159e-06, + "loss": 5.0639, + "step": 44190 + }, + { + "epoch": 0.8991495768229166, + "grad_norm": 17.08638572692871, + "learning_rate": 8.808216243025195e-06, + "loss": 4.9166, + "step": 44195 + }, + { + "epoch": 0.8992513020833334, + "grad_norm": 14.119484901428223, + "learning_rate": 8.807957241239937e-06, + "loss": 5.0069, + "step": 44200 + }, + { + "epoch": 0.89935302734375, + "grad_norm": 19.882802963256836, + "learning_rate": 8.807698215123041e-06, + "loss": 5.361, + "step": 44205 + }, + { + "epoch": 0.8994547526041666, + "grad_norm": 15.813698768615723, + "learning_rate": 8.807439164676164e-06, + "loss": 4.9973, + "step": 44210 + }, + { + "epoch": 0.8995564778645834, + "grad_norm": 20.30816078186035, + "learning_rate": 8.807180089900958e-06, + "loss": 4.999, + "step": 44215 + }, + { + "epoch": 0.899658203125, + "grad_norm": 18.788715362548828, + "learning_rate": 8.806920990799079e-06, + "loss": 5.0028, + "step": 44220 + }, + { + "epoch": 0.8997599283854166, + "grad_norm": 16.304288864135742, + "learning_rate": 8.806661867372183e-06, + "loss": 5.0087, + "step": 44225 + }, + { + "epoch": 0.8998616536458334, + "grad_norm": 15.654781341552734, + "learning_rate": 8.806402719621927e-06, + "loss": 4.989, + "step": 44230 + }, + { + "epoch": 0.89996337890625, + "grad_norm": 20.56938362121582, + "learning_rate": 8.806143547549967e-06, + "loss": 4.9797, + "step": 44235 + }, + { + "epoch": 0.9000651041666666, + "grad_norm": 18.6163272857666, + "learning_rate": 8.805884351157957e-06, + "loss": 5.3035, + "step": 44240 + }, + { + "epoch": 0.9001668294270834, + "grad_norm": 18.186315536499023, + "learning_rate": 8.805625130447556e-06, + "loss": 4.9098, + "step": 44245 + }, + { + "epoch": 0.9002685546875, + "grad_norm": 16.194679260253906, + "learning_rate": 8.805365885420416e-06, + "loss": 5.2689, + "step": 44250 + }, + { + "epoch": 0.9003702799479166, + "grad_norm": 17.493282318115234, + "learning_rate": 8.805106616078196e-06, + "loss": 5.1789, + "step": 44255 + }, + { + "epoch": 0.9004720052083334, + "grad_norm": 16.074832916259766, + "learning_rate": 8.804847322422554e-06, + "loss": 4.9072, + "step": 44260 + }, + { + "epoch": 0.90057373046875, + "grad_norm": 21.419713973999023, + "learning_rate": 8.804588004455145e-06, + "loss": 5.1061, + "step": 44265 + }, + { + "epoch": 0.9006754557291666, + "grad_norm": 17.866411209106445, + "learning_rate": 8.804328662177625e-06, + "loss": 4.9797, + "step": 44270 + }, + { + "epoch": 0.9007771809895834, + "grad_norm": 14.709637641906738, + "learning_rate": 8.804069295591655e-06, + "loss": 5.006, + "step": 44275 + }, + { + "epoch": 0.90087890625, + "grad_norm": 16.229732513427734, + "learning_rate": 8.803809904698888e-06, + "loss": 5.2344, + "step": 44280 + }, + { + "epoch": 0.9009806315104166, + "grad_norm": 16.75119972229004, + "learning_rate": 8.803550489500982e-06, + "loss": 4.9963, + "step": 44285 + }, + { + "epoch": 0.9010823567708334, + "grad_norm": 16.196382522583008, + "learning_rate": 8.803291049999599e-06, + "loss": 5.0725, + "step": 44290 + }, + { + "epoch": 0.90118408203125, + "grad_norm": 17.172008514404297, + "learning_rate": 8.803031586196392e-06, + "loss": 5.4394, + "step": 44295 + }, + { + "epoch": 0.9012858072916666, + "grad_norm": 17.112449645996094, + "learning_rate": 8.80277209809302e-06, + "loss": 4.9605, + "step": 44300 + }, + { + "epoch": 0.9013875325520834, + "grad_norm": 18.63500213623047, + "learning_rate": 8.80251258569114e-06, + "loss": 4.9652, + "step": 44305 + }, + { + "epoch": 0.9014892578125, + "grad_norm": 15.732176780700684, + "learning_rate": 8.802253048992415e-06, + "loss": 4.9275, + "step": 44310 + }, + { + "epoch": 0.9015909830729166, + "grad_norm": 15.07666015625, + "learning_rate": 8.801993487998499e-06, + "loss": 4.8674, + "step": 44315 + }, + { + "epoch": 0.9016927083333334, + "grad_norm": 13.291590690612793, + "learning_rate": 8.801733902711051e-06, + "loss": 4.9442, + "step": 44320 + }, + { + "epoch": 0.90179443359375, + "grad_norm": 15.094367980957031, + "learning_rate": 8.801474293131729e-06, + "loss": 5.0161, + "step": 44325 + }, + { + "epoch": 0.9018961588541666, + "grad_norm": 18.980144500732422, + "learning_rate": 8.801214659262195e-06, + "loss": 4.8517, + "step": 44330 + }, + { + "epoch": 0.9019978841145834, + "grad_norm": 14.527191162109375, + "learning_rate": 8.800955001104105e-06, + "loss": 5.0326, + "step": 44335 + }, + { + "epoch": 0.902099609375, + "grad_norm": 12.272083282470703, + "learning_rate": 8.80069531865912e-06, + "loss": 5.145, + "step": 44340 + }, + { + "epoch": 0.9022013346354166, + "grad_norm": 11.864821434020996, + "learning_rate": 8.800435611928897e-06, + "loss": 5.2428, + "step": 44345 + }, + { + "epoch": 0.9023030598958334, + "grad_norm": 18.797161102294922, + "learning_rate": 8.800175880915098e-06, + "loss": 4.6781, + "step": 44350 + }, + { + "epoch": 0.90240478515625, + "grad_norm": 21.3400821685791, + "learning_rate": 8.799916125619381e-06, + "loss": 5.0023, + "step": 44355 + }, + { + "epoch": 0.9025065104166666, + "grad_norm": 17.97197151184082, + "learning_rate": 8.799656346043406e-06, + "loss": 5.1287, + "step": 44360 + }, + { + "epoch": 0.9026082356770834, + "grad_norm": 11.918844223022461, + "learning_rate": 8.799396542188834e-06, + "loss": 5.0109, + "step": 44365 + }, + { + "epoch": 0.9027099609375, + "grad_norm": 13.677571296691895, + "learning_rate": 8.799136714057323e-06, + "loss": 5.0191, + "step": 44370 + }, + { + "epoch": 0.9028116861979166, + "grad_norm": 23.358488082885742, + "learning_rate": 8.798876861650534e-06, + "loss": 4.7902, + "step": 44375 + }, + { + "epoch": 0.9029134114583334, + "grad_norm": 19.370054244995117, + "learning_rate": 8.79861698497013e-06, + "loss": 5.1157, + "step": 44380 + }, + { + "epoch": 0.90301513671875, + "grad_norm": 13.700504302978516, + "learning_rate": 8.798357084017768e-06, + "loss": 4.9973, + "step": 44385 + }, + { + "epoch": 0.9031168619791666, + "grad_norm": 22.280174255371094, + "learning_rate": 8.79809715879511e-06, + "loss": 4.9504, + "step": 44390 + }, + { + "epoch": 0.9032185872395834, + "grad_norm": 17.497047424316406, + "learning_rate": 8.797837209303817e-06, + "loss": 5.213, + "step": 44395 + }, + { + "epoch": 0.9033203125, + "grad_norm": 13.826454162597656, + "learning_rate": 8.797577235545548e-06, + "loss": 5.027, + "step": 44400 + }, + { + "epoch": 0.9034220377604166, + "grad_norm": 20.173969268798828, + "learning_rate": 8.79731723752197e-06, + "loss": 5.1116, + "step": 44405 + }, + { + "epoch": 0.9035237630208334, + "grad_norm": 17.12943458557129, + "learning_rate": 8.797057215234738e-06, + "loss": 4.8938, + "step": 44410 + }, + { + "epoch": 0.90362548828125, + "grad_norm": 18.873371124267578, + "learning_rate": 8.796797168685516e-06, + "loss": 4.8041, + "step": 44415 + }, + { + "epoch": 0.9037272135416666, + "grad_norm": 27.324459075927734, + "learning_rate": 8.796537097875964e-06, + "loss": 5.0126, + "step": 44420 + }, + { + "epoch": 0.9038289388020834, + "grad_norm": 18.648466110229492, + "learning_rate": 8.796277002807748e-06, + "loss": 5.0379, + "step": 44425 + }, + { + "epoch": 0.9039306640625, + "grad_norm": 17.132343292236328, + "learning_rate": 8.796016883482524e-06, + "loss": 4.866, + "step": 44430 + }, + { + "epoch": 0.9040323893229166, + "grad_norm": 13.003429412841797, + "learning_rate": 8.795756739901958e-06, + "loss": 5.1183, + "step": 44435 + }, + { + "epoch": 0.9041341145833334, + "grad_norm": 15.98914623260498, + "learning_rate": 8.795496572067712e-06, + "loss": 4.8397, + "step": 44440 + }, + { + "epoch": 0.90423583984375, + "grad_norm": 14.429542541503906, + "learning_rate": 8.795236379981447e-06, + "loss": 4.93, + "step": 44445 + }, + { + "epoch": 0.9043375651041666, + "grad_norm": 16.672252655029297, + "learning_rate": 8.794976163644828e-06, + "loss": 4.9617, + "step": 44450 + }, + { + "epoch": 0.9044392903645834, + "grad_norm": 14.816722869873047, + "learning_rate": 8.794715923059514e-06, + "loss": 5.0721, + "step": 44455 + }, + { + "epoch": 0.904541015625, + "grad_norm": 17.063947677612305, + "learning_rate": 8.794455658227171e-06, + "loss": 4.9962, + "step": 44460 + }, + { + "epoch": 0.9046427408854166, + "grad_norm": 17.301557540893555, + "learning_rate": 8.794195369149462e-06, + "loss": 5.0453, + "step": 44465 + }, + { + "epoch": 0.9047444661458334, + "grad_norm": 16.18663215637207, + "learning_rate": 8.793935055828046e-06, + "loss": 5.0757, + "step": 44470 + }, + { + "epoch": 0.90484619140625, + "grad_norm": 19.635818481445312, + "learning_rate": 8.79367471826459e-06, + "loss": 5.0289, + "step": 44475 + }, + { + "epoch": 0.9049479166666666, + "grad_norm": 16.935272216796875, + "learning_rate": 8.79341435646076e-06, + "loss": 4.975, + "step": 44480 + }, + { + "epoch": 0.9050496419270834, + "grad_norm": 48.03938674926758, + "learning_rate": 8.793153970418213e-06, + "loss": 5.1057, + "step": 44485 + }, + { + "epoch": 0.9051513671875, + "grad_norm": 11.542248725891113, + "learning_rate": 8.792893560138618e-06, + "loss": 4.9753, + "step": 44490 + }, + { + "epoch": 0.9052530924479166, + "grad_norm": 17.196304321289062, + "learning_rate": 8.792633125623636e-06, + "loss": 4.97, + "step": 44495 + }, + { + "epoch": 0.9053548177083334, + "grad_norm": 21.367258071899414, + "learning_rate": 8.792372666874934e-06, + "loss": 4.679, + "step": 44500 + }, + { + "epoch": 0.90545654296875, + "grad_norm": 19.352176666259766, + "learning_rate": 8.792112183894173e-06, + "loss": 4.8965, + "step": 44505 + }, + { + "epoch": 0.9055582682291666, + "grad_norm": 25.23930549621582, + "learning_rate": 8.791851676683018e-06, + "loss": 5.2472, + "step": 44510 + }, + { + "epoch": 0.9056599934895834, + "grad_norm": 16.9487247467041, + "learning_rate": 8.791591145243138e-06, + "loss": 5.0901, + "step": 44515 + }, + { + "epoch": 0.90576171875, + "grad_norm": 23.91531753540039, + "learning_rate": 8.79133058957619e-06, + "loss": 4.7932, + "step": 44520 + }, + { + "epoch": 0.9058634440104166, + "grad_norm": 17.927824020385742, + "learning_rate": 8.791070009683846e-06, + "loss": 5.1728, + "step": 44525 + }, + { + "epoch": 0.9059651692708334, + "grad_norm": 14.097804069519043, + "learning_rate": 8.790809405567766e-06, + "loss": 4.9045, + "step": 44530 + }, + { + "epoch": 0.90606689453125, + "grad_norm": 15.424575805664062, + "learning_rate": 8.790548777229619e-06, + "loss": 4.9075, + "step": 44535 + }, + { + "epoch": 0.9061686197916666, + "grad_norm": 17.169254302978516, + "learning_rate": 8.79028812467107e-06, + "loss": 5.0547, + "step": 44540 + }, + { + "epoch": 0.9062703450520834, + "grad_norm": 14.736417770385742, + "learning_rate": 8.79002744789378e-06, + "loss": 5.0231, + "step": 44545 + }, + { + "epoch": 0.9063720703125, + "grad_norm": 16.90686798095703, + "learning_rate": 8.789766746899418e-06, + "loss": 4.9742, + "step": 44550 + }, + { + "epoch": 0.9064737955729166, + "grad_norm": 15.50202465057373, + "learning_rate": 8.789506021689652e-06, + "loss": 5.2418, + "step": 44555 + }, + { + "epoch": 0.9065755208333334, + "grad_norm": 19.109216690063477, + "learning_rate": 8.789245272266142e-06, + "loss": 5.02, + "step": 44560 + }, + { + "epoch": 0.90667724609375, + "grad_norm": 14.010442733764648, + "learning_rate": 8.788984498630559e-06, + "loss": 4.9951, + "step": 44565 + }, + { + "epoch": 0.9067789713541666, + "grad_norm": 17.068143844604492, + "learning_rate": 8.78872370078457e-06, + "loss": 4.9387, + "step": 44570 + }, + { + "epoch": 0.9068806966145834, + "grad_norm": 17.85623550415039, + "learning_rate": 8.788462878729837e-06, + "loss": 5.003, + "step": 44575 + }, + { + "epoch": 0.906982421875, + "grad_norm": 16.167890548706055, + "learning_rate": 8.788202032468029e-06, + "loss": 5.3012, + "step": 44580 + }, + { + "epoch": 0.9070841471354166, + "grad_norm": 15.29325008392334, + "learning_rate": 8.787941162000813e-06, + "loss": 5.1202, + "step": 44585 + }, + { + "epoch": 0.9071858723958334, + "grad_norm": 17.43918800354004, + "learning_rate": 8.787680267329855e-06, + "loss": 5.2255, + "step": 44590 + }, + { + "epoch": 0.90728759765625, + "grad_norm": 11.717183113098145, + "learning_rate": 8.787419348456822e-06, + "loss": 5.0088, + "step": 44595 + }, + { + "epoch": 0.9073893229166666, + "grad_norm": 22.995113372802734, + "learning_rate": 8.787158405383383e-06, + "loss": 4.8997, + "step": 44600 + }, + { + "epoch": 0.9074910481770834, + "grad_norm": 14.824771881103516, + "learning_rate": 8.786897438111204e-06, + "loss": 5.2061, + "step": 44605 + }, + { + "epoch": 0.9075927734375, + "grad_norm": 21.638195037841797, + "learning_rate": 8.786636446641952e-06, + "loss": 5.2962, + "step": 44610 + }, + { + "epoch": 0.9076944986979166, + "grad_norm": 18.956317901611328, + "learning_rate": 8.786375430977293e-06, + "loss": 5.4006, + "step": 44615 + }, + { + "epoch": 0.9077962239583334, + "grad_norm": 15.916585922241211, + "learning_rate": 8.7861143911189e-06, + "loss": 4.9905, + "step": 44620 + }, + { + "epoch": 0.90789794921875, + "grad_norm": 19.353355407714844, + "learning_rate": 8.785853327068438e-06, + "loss": 4.7025, + "step": 44625 + }, + { + "epoch": 0.9079996744791666, + "grad_norm": 14.249920845031738, + "learning_rate": 8.785592238827574e-06, + "loss": 4.8871, + "step": 44630 + }, + { + "epoch": 0.9081013997395834, + "grad_norm": 16.192733764648438, + "learning_rate": 8.785331126397977e-06, + "loss": 5.205, + "step": 44635 + }, + { + "epoch": 0.908203125, + "grad_norm": 19.9262752532959, + "learning_rate": 8.785069989781316e-06, + "loss": 5.0495, + "step": 44640 + }, + { + "epoch": 0.9083048502604166, + "grad_norm": 15.008155822753906, + "learning_rate": 8.78480882897926e-06, + "loss": 4.7515, + "step": 44645 + }, + { + "epoch": 0.9084065755208334, + "grad_norm": 16.4153995513916, + "learning_rate": 8.784547643993475e-06, + "loss": 5.0227, + "step": 44650 + }, + { + "epoch": 0.90850830078125, + "grad_norm": 18.08397102355957, + "learning_rate": 8.784286434825632e-06, + "loss": 4.9833, + "step": 44655 + }, + { + "epoch": 0.9086100260416666, + "grad_norm": 18.156705856323242, + "learning_rate": 8.784025201477403e-06, + "loss": 5.0503, + "step": 44660 + }, + { + "epoch": 0.9087117513020834, + "grad_norm": 15.393243789672852, + "learning_rate": 8.783763943950451e-06, + "loss": 4.9456, + "step": 44665 + }, + { + "epoch": 0.9088134765625, + "grad_norm": 19.66738510131836, + "learning_rate": 8.783502662246452e-06, + "loss": 4.9464, + "step": 44670 + }, + { + "epoch": 0.9089152018229166, + "grad_norm": 16.636564254760742, + "learning_rate": 8.783241356367069e-06, + "loss": 5.1296, + "step": 44675 + }, + { + "epoch": 0.9090169270833334, + "grad_norm": 19.291227340698242, + "learning_rate": 8.782980026313977e-06, + "loss": 4.7987, + "step": 44680 + }, + { + "epoch": 0.90911865234375, + "grad_norm": 19.425207138061523, + "learning_rate": 8.782718672088842e-06, + "loss": 5.4206, + "step": 44685 + }, + { + "epoch": 0.9092203776041666, + "grad_norm": 15.833382606506348, + "learning_rate": 8.782457293693333e-06, + "loss": 5.0054, + "step": 44690 + }, + { + "epoch": 0.9093221028645834, + "grad_norm": 19.150379180908203, + "learning_rate": 8.782195891129127e-06, + "loss": 5.0666, + "step": 44695 + }, + { + "epoch": 0.909423828125, + "grad_norm": 20.430973052978516, + "learning_rate": 8.781934464397888e-06, + "loss": 5.1227, + "step": 44700 + }, + { + "epoch": 0.9095255533854166, + "grad_norm": 14.028791427612305, + "learning_rate": 8.781673013501289e-06, + "loss": 5.2639, + "step": 44705 + }, + { + "epoch": 0.9096272786458334, + "grad_norm": 15.383533477783203, + "learning_rate": 8.781411538440998e-06, + "loss": 5.0867, + "step": 44710 + }, + { + "epoch": 0.90972900390625, + "grad_norm": 18.03864288330078, + "learning_rate": 8.781150039218689e-06, + "loss": 4.9465, + "step": 44715 + }, + { + "epoch": 0.9098307291666666, + "grad_norm": 17.560476303100586, + "learning_rate": 8.780888515836033e-06, + "loss": 5.0202, + "step": 44720 + }, + { + "epoch": 0.9099324544270834, + "grad_norm": 15.446413040161133, + "learning_rate": 8.780626968294697e-06, + "loss": 5.0187, + "step": 44725 + }, + { + "epoch": 0.9100341796875, + "grad_norm": 16.965986251831055, + "learning_rate": 8.780365396596356e-06, + "loss": 5.1572, + "step": 44730 + }, + { + "epoch": 0.9101359049479166, + "grad_norm": 17.25484275817871, + "learning_rate": 8.78010380074268e-06, + "loss": 5.1492, + "step": 44735 + }, + { + "epoch": 0.9102376302083334, + "grad_norm": 16.576065063476562, + "learning_rate": 8.779842180735342e-06, + "loss": 5.1604, + "step": 44740 + }, + { + "epoch": 0.91033935546875, + "grad_norm": 16.864511489868164, + "learning_rate": 8.77958053657601e-06, + "loss": 4.8184, + "step": 44745 + }, + { + "epoch": 0.9104410807291666, + "grad_norm": 14.637031555175781, + "learning_rate": 8.77931886826636e-06, + "loss": 4.9448, + "step": 44750 + }, + { + "epoch": 0.9105428059895834, + "grad_norm": 21.056119918823242, + "learning_rate": 8.77905717580806e-06, + "loss": 5.05, + "step": 44755 + }, + { + "epoch": 0.91064453125, + "grad_norm": 13.28573226928711, + "learning_rate": 8.778795459202788e-06, + "loss": 4.9004, + "step": 44760 + }, + { + "epoch": 0.9107462565104166, + "grad_norm": 16.446943283081055, + "learning_rate": 8.778533718452208e-06, + "loss": 5.0302, + "step": 44765 + }, + { + "epoch": 0.9108479817708334, + "grad_norm": 18.731098175048828, + "learning_rate": 8.778271953558e-06, + "loss": 4.9508, + "step": 44770 + }, + { + "epoch": 0.91094970703125, + "grad_norm": 18.365245819091797, + "learning_rate": 8.778010164521834e-06, + "loss": 4.7831, + "step": 44775 + }, + { + "epoch": 0.9110514322916666, + "grad_norm": 15.327624320983887, + "learning_rate": 8.77774835134538e-06, + "loss": 5.1022, + "step": 44780 + }, + { + "epoch": 0.9111531575520834, + "grad_norm": 21.47565269470215, + "learning_rate": 8.777486514030316e-06, + "loss": 5.2137, + "step": 44785 + }, + { + "epoch": 0.9112548828125, + "grad_norm": 18.902734756469727, + "learning_rate": 8.777224652578311e-06, + "loss": 5.1682, + "step": 44790 + }, + { + "epoch": 0.9113566080729166, + "grad_norm": 15.194536209106445, + "learning_rate": 8.77696276699104e-06, + "loss": 5.2931, + "step": 44795 + }, + { + "epoch": 0.9114583333333334, + "grad_norm": 27.728107452392578, + "learning_rate": 8.776700857270174e-06, + "loss": 4.8794, + "step": 44800 + }, + { + "epoch": 0.91156005859375, + "grad_norm": 20.570232391357422, + "learning_rate": 8.77643892341739e-06, + "loss": 5.1867, + "step": 44805 + }, + { + "epoch": 0.9116617838541666, + "grad_norm": 15.562506675720215, + "learning_rate": 8.77617696543436e-06, + "loss": 4.7092, + "step": 44810 + }, + { + "epoch": 0.9117635091145834, + "grad_norm": 13.868577003479004, + "learning_rate": 8.775914983322758e-06, + "loss": 5.0878, + "step": 44815 + }, + { + "epoch": 0.911865234375, + "grad_norm": 24.267663955688477, + "learning_rate": 8.775652977084257e-06, + "loss": 5.2736, + "step": 44820 + }, + { + "epoch": 0.9119669596354166, + "grad_norm": 15.962109565734863, + "learning_rate": 8.775390946720533e-06, + "loss": 5.0627, + "step": 44825 + }, + { + "epoch": 0.9120686848958334, + "grad_norm": 15.354581832885742, + "learning_rate": 8.775128892233258e-06, + "loss": 5.2165, + "step": 44830 + }, + { + "epoch": 0.91217041015625, + "grad_norm": 19.345186233520508, + "learning_rate": 8.77486681362411e-06, + "loss": 4.7622, + "step": 44835 + }, + { + "epoch": 0.9122721354166666, + "grad_norm": 23.6007022857666, + "learning_rate": 8.77460471089476e-06, + "loss": 4.9023, + "step": 44840 + }, + { + "epoch": 0.9123738606770834, + "grad_norm": 16.3542423248291, + "learning_rate": 8.774342584046883e-06, + "loss": 5.1678, + "step": 44845 + }, + { + "epoch": 0.9124755859375, + "grad_norm": 17.616344451904297, + "learning_rate": 8.774080433082157e-06, + "loss": 5.3728, + "step": 44850 + }, + { + "epoch": 0.9125773111979166, + "grad_norm": 18.449748992919922, + "learning_rate": 8.773818258002253e-06, + "loss": 5.0018, + "step": 44855 + }, + { + "epoch": 0.9126790364583334, + "grad_norm": 14.173205375671387, + "learning_rate": 8.77355605880885e-06, + "loss": 4.9872, + "step": 44860 + }, + { + "epoch": 0.91278076171875, + "grad_norm": 13.90081787109375, + "learning_rate": 8.773293835503623e-06, + "loss": 5.2407, + "step": 44865 + }, + { + "epoch": 0.9128824869791666, + "grad_norm": 16.24280548095703, + "learning_rate": 8.773031588088243e-06, + "loss": 4.7567, + "step": 44870 + }, + { + "epoch": 0.9129842122395834, + "grad_norm": 19.14404296875, + "learning_rate": 8.772769316564392e-06, + "loss": 5.002, + "step": 44875 + }, + { + "epoch": 0.9130859375, + "grad_norm": 17.291959762573242, + "learning_rate": 8.77250702093374e-06, + "loss": 5.4921, + "step": 44880 + }, + { + "epoch": 0.9131876627604166, + "grad_norm": 13.461915969848633, + "learning_rate": 8.772244701197966e-06, + "loss": 5.0206, + "step": 44885 + }, + { + "epoch": 0.9132893880208334, + "grad_norm": 14.739304542541504, + "learning_rate": 8.771982357358748e-06, + "loss": 5.1313, + "step": 44890 + }, + { + "epoch": 0.91339111328125, + "grad_norm": 23.167875289916992, + "learning_rate": 8.771719989417757e-06, + "loss": 5.3301, + "step": 44895 + }, + { + "epoch": 0.9134928385416666, + "grad_norm": 16.340904235839844, + "learning_rate": 8.771457597376674e-06, + "loss": 4.9027, + "step": 44900 + }, + { + "epoch": 0.9135945638020834, + "grad_norm": 16.065025329589844, + "learning_rate": 8.771195181237176e-06, + "loss": 5.1104, + "step": 44905 + }, + { + "epoch": 0.9136962890625, + "grad_norm": 19.250368118286133, + "learning_rate": 8.770932741000935e-06, + "loss": 5.0841, + "step": 44910 + }, + { + "epoch": 0.9137980143229166, + "grad_norm": 16.899778366088867, + "learning_rate": 8.770670276669633e-06, + "loss": 5.2393, + "step": 44915 + }, + { + "epoch": 0.9138997395833334, + "grad_norm": 15.299036979675293, + "learning_rate": 8.770407788244943e-06, + "loss": 4.7837, + "step": 44920 + }, + { + "epoch": 0.91400146484375, + "grad_norm": 21.456953048706055, + "learning_rate": 8.770145275728546e-06, + "loss": 5.0741, + "step": 44925 + }, + { + "epoch": 0.9141031901041666, + "grad_norm": 16.509660720825195, + "learning_rate": 8.769882739122117e-06, + "loss": 4.7856, + "step": 44930 + }, + { + "epoch": 0.9142049153645834, + "grad_norm": 16.596906661987305, + "learning_rate": 8.769620178427334e-06, + "loss": 5.0025, + "step": 44935 + }, + { + "epoch": 0.914306640625, + "grad_norm": 16.99668312072754, + "learning_rate": 8.769357593645874e-06, + "loss": 5.2085, + "step": 44940 + }, + { + "epoch": 0.9144083658854166, + "grad_norm": 15.447405815124512, + "learning_rate": 8.769094984779416e-06, + "loss": 4.9127, + "step": 44945 + }, + { + "epoch": 0.9145100911458334, + "grad_norm": 18.872661590576172, + "learning_rate": 8.768832351829637e-06, + "loss": 5.0725, + "step": 44950 + }, + { + "epoch": 0.91461181640625, + "grad_norm": 17.378028869628906, + "learning_rate": 8.768569694798217e-06, + "loss": 5.0498, + "step": 44955 + }, + { + "epoch": 0.9147135416666666, + "grad_norm": 23.152748107910156, + "learning_rate": 8.768307013686832e-06, + "loss": 5.0158, + "step": 44960 + }, + { + "epoch": 0.9148152669270834, + "grad_norm": 19.15017318725586, + "learning_rate": 8.768044308497162e-06, + "loss": 4.8218, + "step": 44965 + }, + { + "epoch": 0.9149169921875, + "grad_norm": 15.351704597473145, + "learning_rate": 8.767781579230883e-06, + "loss": 4.9055, + "step": 44970 + }, + { + "epoch": 0.9150187174479166, + "grad_norm": 18.051063537597656, + "learning_rate": 8.767518825889678e-06, + "loss": 5.2058, + "step": 44975 + }, + { + "epoch": 0.9151204427083334, + "grad_norm": 19.396291732788086, + "learning_rate": 8.767256048475224e-06, + "loss": 4.8514, + "step": 44980 + }, + { + "epoch": 0.91522216796875, + "grad_norm": 18.13347053527832, + "learning_rate": 8.766993246989197e-06, + "loss": 4.9719, + "step": 44985 + }, + { + "epoch": 0.9153238932291666, + "grad_norm": 18.837751388549805, + "learning_rate": 8.76673042143328e-06, + "loss": 5.2556, + "step": 44990 + }, + { + "epoch": 0.9154256184895834, + "grad_norm": 15.87172794342041, + "learning_rate": 8.766467571809154e-06, + "loss": 4.8952, + "step": 44995 + }, + { + "epoch": 0.91552734375, + "grad_norm": 20.308496475219727, + "learning_rate": 8.766204698118492e-06, + "loss": 5.0193, + "step": 45000 + }, + { + "epoch": 0.9156290690104166, + "grad_norm": 22.08003807067871, + "learning_rate": 8.765941800362981e-06, + "loss": 5.2992, + "step": 45005 + }, + { + "epoch": 0.9157307942708334, + "grad_norm": 14.443645477294922, + "learning_rate": 8.765678878544295e-06, + "loss": 5.0956, + "step": 45010 + }, + { + "epoch": 0.91583251953125, + "grad_norm": 17.276330947875977, + "learning_rate": 8.765415932664116e-06, + "loss": 5.1866, + "step": 45015 + }, + { + "epoch": 0.9159342447916666, + "grad_norm": 22.018770217895508, + "learning_rate": 8.765152962724127e-06, + "loss": 4.7202, + "step": 45020 + }, + { + "epoch": 0.9160359700520834, + "grad_norm": 17.683107376098633, + "learning_rate": 8.764889968726004e-06, + "loss": 5.0192, + "step": 45025 + }, + { + "epoch": 0.9161376953125, + "grad_norm": 15.059403419494629, + "learning_rate": 8.764626950671428e-06, + "loss": 5.0546, + "step": 45030 + }, + { + "epoch": 0.9162394205729166, + "grad_norm": 21.00143051147461, + "learning_rate": 8.764363908562084e-06, + "loss": 4.8177, + "step": 45035 + }, + { + "epoch": 0.9163411458333334, + "grad_norm": 12.08775806427002, + "learning_rate": 8.764100842399648e-06, + "loss": 5.1451, + "step": 45040 + }, + { + "epoch": 0.91644287109375, + "grad_norm": 20.52021598815918, + "learning_rate": 8.763837752185801e-06, + "loss": 5.0012, + "step": 45045 + }, + { + "epoch": 0.9165445963541666, + "grad_norm": 30.782718658447266, + "learning_rate": 8.763574637922228e-06, + "loss": 5.2476, + "step": 45050 + }, + { + "epoch": 0.9166463216145834, + "grad_norm": 15.647490501403809, + "learning_rate": 8.763311499610606e-06, + "loss": 5.112, + "step": 45055 + }, + { + "epoch": 0.916748046875, + "grad_norm": 13.637089729309082, + "learning_rate": 8.76304833725262e-06, + "loss": 5.0107, + "step": 45060 + }, + { + "epoch": 0.9168497721354166, + "grad_norm": 15.153265953063965, + "learning_rate": 8.762785150849948e-06, + "loss": 5.0999, + "step": 45065 + }, + { + "epoch": 0.9169514973958334, + "grad_norm": 17.492219924926758, + "learning_rate": 8.762521940404274e-06, + "loss": 5.3416, + "step": 45070 + }, + { + "epoch": 0.91705322265625, + "grad_norm": 15.177302360534668, + "learning_rate": 8.762258705917281e-06, + "loss": 4.8755, + "step": 45075 + }, + { + "epoch": 0.9171549479166666, + "grad_norm": 20.050344467163086, + "learning_rate": 8.761995447390646e-06, + "loss": 5.3827, + "step": 45080 + }, + { + "epoch": 0.9172566731770834, + "grad_norm": 17.299339294433594, + "learning_rate": 8.761732164826056e-06, + "loss": 5.3551, + "step": 45085 + }, + { + "epoch": 0.9173583984375, + "grad_norm": 18.111474990844727, + "learning_rate": 8.761468858225192e-06, + "loss": 5.1529, + "step": 45090 + }, + { + "epoch": 0.9174601236979166, + "grad_norm": 16.601940155029297, + "learning_rate": 8.761205527589735e-06, + "loss": 4.8779, + "step": 45095 + }, + { + "epoch": 0.9175618489583334, + "grad_norm": 15.469865798950195, + "learning_rate": 8.760942172921369e-06, + "loss": 4.8027, + "step": 45100 + }, + { + "epoch": 0.91766357421875, + "grad_norm": 18.953704833984375, + "learning_rate": 8.760678794221777e-06, + "loss": 5.1515, + "step": 45105 + }, + { + "epoch": 0.9177652994791666, + "grad_norm": 17.18794822692871, + "learning_rate": 8.76041539149264e-06, + "loss": 5.4633, + "step": 45110 + }, + { + "epoch": 0.9178670247395834, + "grad_norm": 16.069475173950195, + "learning_rate": 8.760151964735643e-06, + "loss": 4.7574, + "step": 45115 + }, + { + "epoch": 0.91796875, + "grad_norm": 17.10263442993164, + "learning_rate": 8.759888513952468e-06, + "loss": 5.0831, + "step": 45120 + }, + { + "epoch": 0.9180704752604166, + "grad_norm": 18.24595069885254, + "learning_rate": 8.759625039144801e-06, + "loss": 4.8087, + "step": 45125 + }, + { + "epoch": 0.9181722005208334, + "grad_norm": 18.23316764831543, + "learning_rate": 8.759361540314322e-06, + "loss": 4.9008, + "step": 45130 + }, + { + "epoch": 0.91827392578125, + "grad_norm": 27.913978576660156, + "learning_rate": 8.759098017462716e-06, + "loss": 4.9385, + "step": 45135 + }, + { + "epoch": 0.9183756510416666, + "grad_norm": 16.818021774291992, + "learning_rate": 8.758834470591668e-06, + "loss": 4.7684, + "step": 45140 + }, + { + "epoch": 0.9184773763020834, + "grad_norm": 15.202385902404785, + "learning_rate": 8.758570899702858e-06, + "loss": 5.0586, + "step": 45145 + }, + { + "epoch": 0.9185791015625, + "grad_norm": 16.98111915588379, + "learning_rate": 8.758307304797977e-06, + "loss": 5.1355, + "step": 45150 + }, + { + "epoch": 0.9186808268229166, + "grad_norm": 20.956340789794922, + "learning_rate": 8.758043685878702e-06, + "loss": 4.9307, + "step": 45155 + }, + { + "epoch": 0.9187825520833334, + "grad_norm": 18.043638229370117, + "learning_rate": 8.757780042946723e-06, + "loss": 5.0546, + "step": 45160 + }, + { + "epoch": 0.91888427734375, + "grad_norm": 18.590078353881836, + "learning_rate": 8.757516376003722e-06, + "loss": 5.0301, + "step": 45165 + }, + { + "epoch": 0.9189860026041666, + "grad_norm": 15.9541015625, + "learning_rate": 8.757252685051385e-06, + "loss": 4.9174, + "step": 45170 + }, + { + "epoch": 0.9190877278645834, + "grad_norm": 20.352771759033203, + "learning_rate": 8.756988970091393e-06, + "loss": 4.8875, + "step": 45175 + }, + { + "epoch": 0.919189453125, + "grad_norm": 15.295975685119629, + "learning_rate": 8.756725231125437e-06, + "loss": 5.0903, + "step": 45180 + }, + { + "epoch": 0.9192911783854166, + "grad_norm": 16.56306266784668, + "learning_rate": 8.756461468155198e-06, + "loss": 4.9194, + "step": 45185 + }, + { + "epoch": 0.9193929036458334, + "grad_norm": 23.675373077392578, + "learning_rate": 8.756197681182364e-06, + "loss": 5.5658, + "step": 45190 + }, + { + "epoch": 0.91949462890625, + "grad_norm": 13.796375274658203, + "learning_rate": 8.755933870208618e-06, + "loss": 5.0919, + "step": 45195 + }, + { + "epoch": 0.9195963541666666, + "grad_norm": 16.00160026550293, + "learning_rate": 8.755670035235647e-06, + "loss": 4.8273, + "step": 45200 + }, + { + "epoch": 0.9196980794270834, + "grad_norm": 16.338560104370117, + "learning_rate": 8.755406176265138e-06, + "loss": 5.2478, + "step": 45205 + }, + { + "epoch": 0.9197998046875, + "grad_norm": 15.614837646484375, + "learning_rate": 8.755142293298773e-06, + "loss": 4.9811, + "step": 45210 + }, + { + "epoch": 0.9199015299479166, + "grad_norm": 19.07928466796875, + "learning_rate": 8.754878386338242e-06, + "loss": 5.2413, + "step": 45215 + }, + { + "epoch": 0.9200032552083334, + "grad_norm": 23.409133911132812, + "learning_rate": 8.75461445538523e-06, + "loss": 5.104, + "step": 45220 + }, + { + "epoch": 0.92010498046875, + "grad_norm": 15.96032428741455, + "learning_rate": 8.754350500441424e-06, + "loss": 5.092, + "step": 45225 + }, + { + "epoch": 0.9202067057291666, + "grad_norm": 16.3989315032959, + "learning_rate": 8.75408652150851e-06, + "loss": 4.9606, + "step": 45230 + }, + { + "epoch": 0.9203084309895834, + "grad_norm": 19.525146484375, + "learning_rate": 8.753822518588176e-06, + "loss": 5.242, + "step": 45235 + }, + { + "epoch": 0.92041015625, + "grad_norm": 16.58430290222168, + "learning_rate": 8.753558491682106e-06, + "loss": 5.3284, + "step": 45240 + }, + { + "epoch": 0.9205118815104166, + "grad_norm": 16.426958084106445, + "learning_rate": 8.75329444079199e-06, + "loss": 4.7979, + "step": 45245 + }, + { + "epoch": 0.9206136067708334, + "grad_norm": 15.91930103302002, + "learning_rate": 8.753030365919513e-06, + "loss": 5.2774, + "step": 45250 + }, + { + "epoch": 0.92071533203125, + "grad_norm": 20.300010681152344, + "learning_rate": 8.752766267066363e-06, + "loss": 5.0434, + "step": 45255 + }, + { + "epoch": 0.9208170572916666, + "grad_norm": 14.128829002380371, + "learning_rate": 8.752502144234229e-06, + "loss": 5.3257, + "step": 45260 + }, + { + "epoch": 0.9209187825520834, + "grad_norm": 18.968090057373047, + "learning_rate": 8.752237997424798e-06, + "loss": 5.0343, + "step": 45265 + }, + { + "epoch": 0.9210205078125, + "grad_norm": 18.117877960205078, + "learning_rate": 8.751973826639755e-06, + "loss": 4.9947, + "step": 45270 + }, + { + "epoch": 0.9211222330729166, + "grad_norm": 17.040544509887695, + "learning_rate": 8.751709631880791e-06, + "loss": 4.9786, + "step": 45275 + }, + { + "epoch": 0.9212239583333334, + "grad_norm": 15.584954261779785, + "learning_rate": 8.751445413149595e-06, + "loss": 4.7015, + "step": 45280 + }, + { + "epoch": 0.92132568359375, + "grad_norm": 17.02669906616211, + "learning_rate": 8.751181170447852e-06, + "loss": 5.1387, + "step": 45285 + }, + { + "epoch": 0.9214274088541666, + "grad_norm": 19.790149688720703, + "learning_rate": 8.750916903777253e-06, + "loss": 4.938, + "step": 45290 + }, + { + "epoch": 0.9215291341145834, + "grad_norm": 16.028345108032227, + "learning_rate": 8.750652613139486e-06, + "loss": 4.9094, + "step": 45295 + }, + { + "epoch": 0.921630859375, + "grad_norm": 17.443824768066406, + "learning_rate": 8.75038829853624e-06, + "loss": 4.9468, + "step": 45300 + }, + { + "epoch": 0.9217325846354166, + "grad_norm": 20.17779541015625, + "learning_rate": 8.7501239599692e-06, + "loss": 5.0204, + "step": 45305 + }, + { + "epoch": 0.9218343098958334, + "grad_norm": 16.405162811279297, + "learning_rate": 8.74985959744006e-06, + "loss": 4.9544, + "step": 45310 + }, + { + "epoch": 0.92193603515625, + "grad_norm": 25.37957000732422, + "learning_rate": 8.74959521095051e-06, + "loss": 5.3948, + "step": 45315 + }, + { + "epoch": 0.9220377604166666, + "grad_norm": 17.708084106445312, + "learning_rate": 8.749330800502234e-06, + "loss": 5.1431, + "step": 45320 + }, + { + "epoch": 0.9221394856770834, + "grad_norm": 16.31627655029297, + "learning_rate": 8.749066366096926e-06, + "loss": 5.1881, + "step": 45325 + }, + { + "epoch": 0.9222412109375, + "grad_norm": 12.770536422729492, + "learning_rate": 8.748801907736273e-06, + "loss": 5.0745, + "step": 45330 + }, + { + "epoch": 0.9223429361979166, + "grad_norm": 21.50994300842285, + "learning_rate": 8.748537425421966e-06, + "loss": 4.923, + "step": 45335 + }, + { + "epoch": 0.9224446614583334, + "grad_norm": 23.427183151245117, + "learning_rate": 8.748272919155694e-06, + "loss": 5.3436, + "step": 45340 + }, + { + "epoch": 0.92254638671875, + "grad_norm": 19.329442977905273, + "learning_rate": 8.748008388939149e-06, + "loss": 5.091, + "step": 45345 + }, + { + "epoch": 0.9226481119791666, + "grad_norm": 17.263782501220703, + "learning_rate": 8.747743834774021e-06, + "loss": 5.2334, + "step": 45350 + }, + { + "epoch": 0.9227498372395834, + "grad_norm": 13.84416675567627, + "learning_rate": 8.747479256662e-06, + "loss": 5.0029, + "step": 45355 + }, + { + "epoch": 0.9228515625, + "grad_norm": 15.69725227355957, + "learning_rate": 8.747214654604773e-06, + "loss": 5.1356, + "step": 45360 + }, + { + "epoch": 0.9229532877604166, + "grad_norm": 31.378957748413086, + "learning_rate": 8.746950028604036e-06, + "loss": 5.2639, + "step": 45365 + }, + { + "epoch": 0.9230550130208334, + "grad_norm": 21.94241714477539, + "learning_rate": 8.746685378661477e-06, + "loss": 4.9296, + "step": 45370 + }, + { + "epoch": 0.92315673828125, + "grad_norm": 27.880311965942383, + "learning_rate": 8.746420704778788e-06, + "loss": 5.0959, + "step": 45375 + }, + { + "epoch": 0.9232584635416666, + "grad_norm": 21.407058715820312, + "learning_rate": 8.74615600695766e-06, + "loss": 5.1603, + "step": 45380 + }, + { + "epoch": 0.9233601888020834, + "grad_norm": 18.509035110473633, + "learning_rate": 8.745891285199785e-06, + "loss": 5.1814, + "step": 45385 + }, + { + "epoch": 0.9234619140625, + "grad_norm": 21.81510353088379, + "learning_rate": 8.745626539506852e-06, + "loss": 5.0689, + "step": 45390 + }, + { + "epoch": 0.9235636393229166, + "grad_norm": 12.490840911865234, + "learning_rate": 8.745361769880558e-06, + "loss": 5.0497, + "step": 45395 + }, + { + "epoch": 0.9236653645833334, + "grad_norm": 20.292858123779297, + "learning_rate": 8.745096976322589e-06, + "loss": 4.9253, + "step": 45400 + }, + { + "epoch": 0.92376708984375, + "grad_norm": 19.554140090942383, + "learning_rate": 8.744832158834638e-06, + "loss": 5.0469, + "step": 45405 + }, + { + "epoch": 0.9238688151041666, + "grad_norm": 23.197315216064453, + "learning_rate": 8.7445673174184e-06, + "loss": 5.2429, + "step": 45410 + }, + { + "epoch": 0.9239705403645834, + "grad_norm": 15.319231033325195, + "learning_rate": 8.744302452075564e-06, + "loss": 5.2029, + "step": 45415 + }, + { + "epoch": 0.924072265625, + "grad_norm": 12.963876724243164, + "learning_rate": 8.744037562807826e-06, + "loss": 5.0895, + "step": 45420 + }, + { + "epoch": 0.9241739908854166, + "grad_norm": 15.987841606140137, + "learning_rate": 8.743772649616875e-06, + "loss": 4.8718, + "step": 45425 + }, + { + "epoch": 0.9242757161458334, + "grad_norm": 15.009998321533203, + "learning_rate": 8.743507712504406e-06, + "loss": 5.2043, + "step": 45430 + }, + { + "epoch": 0.92437744140625, + "grad_norm": 19.395872116088867, + "learning_rate": 8.74324275147211e-06, + "loss": 5.1684, + "step": 45435 + }, + { + "epoch": 0.9244791666666666, + "grad_norm": 16.177736282348633, + "learning_rate": 8.74297776652168e-06, + "loss": 5.1626, + "step": 45440 + }, + { + "epoch": 0.9245808919270834, + "grad_norm": 14.507593154907227, + "learning_rate": 8.742712757654815e-06, + "loss": 5.2394, + "step": 45445 + }, + { + "epoch": 0.9246826171875, + "grad_norm": 16.608827590942383, + "learning_rate": 8.7424477248732e-06, + "loss": 5.0758, + "step": 45450 + }, + { + "epoch": 0.9247843424479166, + "grad_norm": 17.24175262451172, + "learning_rate": 8.742182668178532e-06, + "loss": 5.1311, + "step": 45455 + }, + { + "epoch": 0.9248860677083334, + "grad_norm": 19.118335723876953, + "learning_rate": 8.741917587572505e-06, + "loss": 4.9906, + "step": 45460 + }, + { + "epoch": 0.92498779296875, + "grad_norm": 18.71849250793457, + "learning_rate": 8.741652483056813e-06, + "loss": 4.9835, + "step": 45465 + }, + { + "epoch": 0.9250895182291666, + "grad_norm": 19.760663986206055, + "learning_rate": 8.741387354633149e-06, + "loss": 4.9005, + "step": 45470 + }, + { + "epoch": 0.9251912434895834, + "grad_norm": 23.710166931152344, + "learning_rate": 8.741122202303206e-06, + "loss": 5.1649, + "step": 45475 + }, + { + "epoch": 0.92529296875, + "grad_norm": 16.924833297729492, + "learning_rate": 8.740857026068683e-06, + "loss": 4.8053, + "step": 45480 + }, + { + "epoch": 0.9253946940104166, + "grad_norm": 15.988739967346191, + "learning_rate": 8.74059182593127e-06, + "loss": 5.2476, + "step": 45485 + }, + { + "epoch": 0.9254964192708334, + "grad_norm": 17.693828582763672, + "learning_rate": 8.740326601892662e-06, + "loss": 4.7984, + "step": 45490 + }, + { + "epoch": 0.92559814453125, + "grad_norm": 17.999435424804688, + "learning_rate": 8.740061353954555e-06, + "loss": 4.9671, + "step": 45495 + }, + { + "epoch": 0.9256998697916666, + "grad_norm": 14.185702323913574, + "learning_rate": 8.739796082118641e-06, + "loss": 4.7248, + "step": 45500 + }, + { + "epoch": 0.9258015950520834, + "grad_norm": 17.526121139526367, + "learning_rate": 8.739530786386618e-06, + "loss": 4.7465, + "step": 45505 + }, + { + "epoch": 0.9259033203125, + "grad_norm": 19.108888626098633, + "learning_rate": 8.739265466760183e-06, + "loss": 5.1994, + "step": 45510 + }, + { + "epoch": 0.9260050455729166, + "grad_norm": 18.396806716918945, + "learning_rate": 8.739000123241026e-06, + "loss": 4.9015, + "step": 45515 + }, + { + "epoch": 0.9261067708333334, + "grad_norm": 18.52408218383789, + "learning_rate": 8.738734755830846e-06, + "loss": 4.887, + "step": 45520 + }, + { + "epoch": 0.92620849609375, + "grad_norm": 18.151439666748047, + "learning_rate": 8.738469364531338e-06, + "loss": 4.7729, + "step": 45525 + }, + { + "epoch": 0.9263102213541666, + "grad_norm": 19.618274688720703, + "learning_rate": 8.738203949344197e-06, + "loss": 4.8866, + "step": 45530 + }, + { + "epoch": 0.9264119466145834, + "grad_norm": 18.890880584716797, + "learning_rate": 8.73793851027112e-06, + "loss": 5.0586, + "step": 45535 + }, + { + "epoch": 0.926513671875, + "grad_norm": 13.971052169799805, + "learning_rate": 8.737673047313802e-06, + "loss": 5.0307, + "step": 45540 + }, + { + "epoch": 0.9266153971354166, + "grad_norm": 20.066099166870117, + "learning_rate": 8.73740756047394e-06, + "loss": 5.3516, + "step": 45545 + }, + { + "epoch": 0.9267171223958334, + "grad_norm": 16.05860710144043, + "learning_rate": 8.737142049753231e-06, + "loss": 5.0328, + "step": 45550 + }, + { + "epoch": 0.92681884765625, + "grad_norm": 15.811692237854004, + "learning_rate": 8.73687651515337e-06, + "loss": 5.0307, + "step": 45555 + }, + { + "epoch": 0.9269205729166666, + "grad_norm": 15.696101188659668, + "learning_rate": 8.736610956676052e-06, + "loss": 4.8203, + "step": 45560 + }, + { + "epoch": 0.9270222981770834, + "grad_norm": 22.557710647583008, + "learning_rate": 8.736345374322978e-06, + "loss": 5.2527, + "step": 45565 + }, + { + "epoch": 0.9271240234375, + "grad_norm": 21.73860740661621, + "learning_rate": 8.736079768095845e-06, + "loss": 5.1662, + "step": 45570 + }, + { + "epoch": 0.9272257486979166, + "grad_norm": 19.91318702697754, + "learning_rate": 8.735814137996346e-06, + "loss": 5.0384, + "step": 45575 + }, + { + "epoch": 0.9273274739583334, + "grad_norm": 21.86736488342285, + "learning_rate": 8.735548484026182e-06, + "loss": 4.8405, + "step": 45580 + }, + { + "epoch": 0.92742919921875, + "grad_norm": 17.721174240112305, + "learning_rate": 8.735282806187047e-06, + "loss": 5.0871, + "step": 45585 + }, + { + "epoch": 0.9275309244791666, + "grad_norm": 16.07786750793457, + "learning_rate": 8.735017104480642e-06, + "loss": 5.0031, + "step": 45590 + }, + { + "epoch": 0.9276326497395834, + "grad_norm": 14.869708061218262, + "learning_rate": 8.734751378908663e-06, + "loss": 5.0064, + "step": 45595 + }, + { + "epoch": 0.927734375, + "grad_norm": 14.914709091186523, + "learning_rate": 8.734485629472809e-06, + "loss": 5.1857, + "step": 45600 + }, + { + "epoch": 0.9278361002604166, + "grad_norm": 16.2762393951416, + "learning_rate": 8.734219856174776e-06, + "loss": 5.0841, + "step": 45605 + }, + { + "epoch": 0.9279378255208334, + "grad_norm": 18.09320831298828, + "learning_rate": 8.733954059016264e-06, + "loss": 5.0409, + "step": 45610 + }, + { + "epoch": 0.92803955078125, + "grad_norm": 14.228349685668945, + "learning_rate": 8.733688237998972e-06, + "loss": 5.0479, + "step": 45615 + }, + { + "epoch": 0.9281412760416666, + "grad_norm": 15.610493659973145, + "learning_rate": 8.733422393124595e-06, + "loss": 5.1868, + "step": 45620 + }, + { + "epoch": 0.9282430013020834, + "grad_norm": 21.721900939941406, + "learning_rate": 8.733156524394838e-06, + "loss": 4.9346, + "step": 45625 + }, + { + "epoch": 0.9283447265625, + "grad_norm": 16.293630599975586, + "learning_rate": 8.732890631811391e-06, + "loss": 5.1521, + "step": 45630 + }, + { + "epoch": 0.9284464518229166, + "grad_norm": 18.454750061035156, + "learning_rate": 8.73262471537596e-06, + "loss": 4.7414, + "step": 45635 + }, + { + "epoch": 0.9285481770833334, + "grad_norm": 18.99530792236328, + "learning_rate": 8.732358775090243e-06, + "loss": 5.0729, + "step": 45640 + }, + { + "epoch": 0.92864990234375, + "grad_norm": 17.0201473236084, + "learning_rate": 8.732092810955937e-06, + "loss": 5.0876, + "step": 45645 + }, + { + "epoch": 0.9287516276041666, + "grad_norm": 17.516572952270508, + "learning_rate": 8.731826822974743e-06, + "loss": 4.8775, + "step": 45650 + }, + { + "epoch": 0.9288533528645834, + "grad_norm": 19.403234481811523, + "learning_rate": 8.731560811148361e-06, + "loss": 5.0986, + "step": 45655 + }, + { + "epoch": 0.928955078125, + "grad_norm": 15.768342018127441, + "learning_rate": 8.73129477547849e-06, + "loss": 5.1822, + "step": 45660 + }, + { + "epoch": 0.9290568033854166, + "grad_norm": 24.6158504486084, + "learning_rate": 8.731028715966828e-06, + "loss": 4.8298, + "step": 45665 + }, + { + "epoch": 0.9291585286458334, + "grad_norm": 29.8992862701416, + "learning_rate": 8.730762632615077e-06, + "loss": 5.271, + "step": 45670 + }, + { + "epoch": 0.92926025390625, + "grad_norm": 21.47937774658203, + "learning_rate": 8.730496525424938e-06, + "loss": 5.5898, + "step": 45675 + }, + { + "epoch": 0.9293619791666666, + "grad_norm": 18.12157440185547, + "learning_rate": 8.730230394398112e-06, + "loss": 4.7912, + "step": 45680 + }, + { + "epoch": 0.9294637044270834, + "grad_norm": 23.15785026550293, + "learning_rate": 8.729964239536295e-06, + "loss": 5.0326, + "step": 45685 + }, + { + "epoch": 0.9295654296875, + "grad_norm": 16.976285934448242, + "learning_rate": 8.729698060841192e-06, + "loss": 4.8027, + "step": 45690 + }, + { + "epoch": 0.9296671549479166, + "grad_norm": 18.8576717376709, + "learning_rate": 8.729431858314502e-06, + "loss": 4.7992, + "step": 45695 + }, + { + "epoch": 0.9297688802083334, + "grad_norm": 14.784289360046387, + "learning_rate": 8.729165631957928e-06, + "loss": 5.1369, + "step": 45700 + }, + { + "epoch": 0.92987060546875, + "grad_norm": 20.926010131835938, + "learning_rate": 8.728899381773168e-06, + "loss": 5.0599, + "step": 45705 + }, + { + "epoch": 0.9299723307291666, + "grad_norm": 16.6568660736084, + "learning_rate": 8.728633107761925e-06, + "loss": 5.1065, + "step": 45710 + }, + { + "epoch": 0.9300740559895834, + "grad_norm": 21.66792106628418, + "learning_rate": 8.728366809925899e-06, + "loss": 5.1381, + "step": 45715 + }, + { + "epoch": 0.93017578125, + "grad_norm": 21.002086639404297, + "learning_rate": 8.728100488266795e-06, + "loss": 5.3268, + "step": 45720 + }, + { + "epoch": 0.9302775065104166, + "grad_norm": 20.734861373901367, + "learning_rate": 8.727834142786308e-06, + "loss": 5.1548, + "step": 45725 + }, + { + "epoch": 0.9303792317708334, + "grad_norm": 15.777372360229492, + "learning_rate": 8.727567773486148e-06, + "loss": 5.1808, + "step": 45730 + }, + { + "epoch": 0.93048095703125, + "grad_norm": 12.608663558959961, + "learning_rate": 8.727301380368014e-06, + "loss": 4.7859, + "step": 45735 + }, + { + "epoch": 0.9305826822916666, + "grad_norm": 20.939455032348633, + "learning_rate": 8.727034963433605e-06, + "loss": 5.233, + "step": 45740 + }, + { + "epoch": 0.9306844075520834, + "grad_norm": 13.449736595153809, + "learning_rate": 8.726768522684626e-06, + "loss": 5.0508, + "step": 45745 + }, + { + "epoch": 0.9307861328125, + "grad_norm": 16.478919982910156, + "learning_rate": 8.726502058122778e-06, + "loss": 4.8894, + "step": 45750 + }, + { + "epoch": 0.9308878580729166, + "grad_norm": 21.119646072387695, + "learning_rate": 8.726235569749768e-06, + "loss": 4.9912, + "step": 45755 + }, + { + "epoch": 0.9309895833333334, + "grad_norm": 20.068279266357422, + "learning_rate": 8.725969057567294e-06, + "loss": 4.9915, + "step": 45760 + }, + { + "epoch": 0.93109130859375, + "grad_norm": 15.183531761169434, + "learning_rate": 8.72570252157706e-06, + "loss": 5.0789, + "step": 45765 + }, + { + "epoch": 0.9311930338541666, + "grad_norm": 19.515256881713867, + "learning_rate": 8.725435961780769e-06, + "loss": 4.794, + "step": 45770 + }, + { + "epoch": 0.9312947591145834, + "grad_norm": 16.058347702026367, + "learning_rate": 8.725169378180125e-06, + "loss": 4.9636, + "step": 45775 + }, + { + "epoch": 0.931396484375, + "grad_norm": 17.140941619873047, + "learning_rate": 8.724902770776833e-06, + "loss": 4.9993, + "step": 45780 + }, + { + "epoch": 0.9314982096354166, + "grad_norm": 22.900575637817383, + "learning_rate": 8.724636139572593e-06, + "loss": 5.1534, + "step": 45785 + }, + { + "epoch": 0.9315999348958334, + "grad_norm": 19.82611846923828, + "learning_rate": 8.724369484569112e-06, + "loss": 4.9082, + "step": 45790 + }, + { + "epoch": 0.93170166015625, + "grad_norm": 18.842723846435547, + "learning_rate": 8.724102805768091e-06, + "loss": 5.0397, + "step": 45795 + }, + { + "epoch": 0.9318033854166666, + "grad_norm": 14.694660186767578, + "learning_rate": 8.723836103171237e-06, + "loss": 5.0251, + "step": 45800 + }, + { + "epoch": 0.9319051106770834, + "grad_norm": 19.153837203979492, + "learning_rate": 8.723569376780249e-06, + "loss": 5.1327, + "step": 45805 + }, + { + "epoch": 0.9320068359375, + "grad_norm": 21.07360076904297, + "learning_rate": 8.723302626596836e-06, + "loss": 4.9712, + "step": 45810 + }, + { + "epoch": 0.9321085611979166, + "grad_norm": 20.11358070373535, + "learning_rate": 8.723035852622702e-06, + "loss": 5.1133, + "step": 45815 + }, + { + "epoch": 0.9322102864583334, + "grad_norm": 19.07341766357422, + "learning_rate": 8.722769054859552e-06, + "loss": 5.1084, + "step": 45820 + }, + { + "epoch": 0.93231201171875, + "grad_norm": 15.028763771057129, + "learning_rate": 8.722502233309088e-06, + "loss": 4.7799, + "step": 45825 + }, + { + "epoch": 0.9324137369791666, + "grad_norm": 17.24458122253418, + "learning_rate": 8.722235387973016e-06, + "loss": 5.1358, + "step": 45830 + }, + { + "epoch": 0.9325154622395834, + "grad_norm": 21.87672233581543, + "learning_rate": 8.721968518853044e-06, + "loss": 5.083, + "step": 45835 + }, + { + "epoch": 0.9326171875, + "grad_norm": 18.226604461669922, + "learning_rate": 8.721701625950871e-06, + "loss": 5.1814, + "step": 45840 + }, + { + "epoch": 0.9327189127604166, + "grad_norm": 14.841641426086426, + "learning_rate": 8.721434709268208e-06, + "loss": 4.9684, + "step": 45845 + }, + { + "epoch": 0.9328206380208334, + "grad_norm": 16.00997543334961, + "learning_rate": 8.721167768806758e-06, + "loss": 4.8665, + "step": 45850 + }, + { + "epoch": 0.93292236328125, + "grad_norm": 18.490158081054688, + "learning_rate": 8.720900804568228e-06, + "loss": 5.0344, + "step": 45855 + }, + { + "epoch": 0.9330240885416666, + "grad_norm": 21.74952507019043, + "learning_rate": 8.720633816554323e-06, + "loss": 4.8037, + "step": 45860 + }, + { + "epoch": 0.9331258138020834, + "grad_norm": 19.554025650024414, + "learning_rate": 8.720366804766748e-06, + "loss": 4.8914, + "step": 45865 + }, + { + "epoch": 0.9332275390625, + "grad_norm": 18.752172470092773, + "learning_rate": 8.720099769207211e-06, + "loss": 5.4162, + "step": 45870 + }, + { + "epoch": 0.9333292643229166, + "grad_norm": 17.089534759521484, + "learning_rate": 8.719832709877417e-06, + "loss": 4.8816, + "step": 45875 + }, + { + "epoch": 0.9334309895833334, + "grad_norm": 15.328987121582031, + "learning_rate": 8.719565626779071e-06, + "loss": 4.9618, + "step": 45880 + }, + { + "epoch": 0.93353271484375, + "grad_norm": 19.073087692260742, + "learning_rate": 8.719298519913884e-06, + "loss": 4.7767, + "step": 45885 + }, + { + "epoch": 0.9336344401041666, + "grad_norm": 18.676429748535156, + "learning_rate": 8.719031389283561e-06, + "loss": 5.1498, + "step": 45890 + }, + { + "epoch": 0.9337361653645834, + "grad_norm": 21.108015060424805, + "learning_rate": 8.718764234889805e-06, + "loss": 4.9847, + "step": 45895 + }, + { + "epoch": 0.933837890625, + "grad_norm": 21.375812530517578, + "learning_rate": 8.718497056734328e-06, + "loss": 4.8502, + "step": 45900 + }, + { + "epoch": 0.9339396158854166, + "grad_norm": 13.194903373718262, + "learning_rate": 8.718229854818833e-06, + "loss": 4.9319, + "step": 45905 + }, + { + "epoch": 0.9340413411458334, + "grad_norm": 17.404541015625, + "learning_rate": 8.717962629145032e-06, + "loss": 5.1723, + "step": 45910 + }, + { + "epoch": 0.93414306640625, + "grad_norm": 16.849794387817383, + "learning_rate": 8.71769537971463e-06, + "loss": 5.0581, + "step": 45915 + }, + { + "epoch": 0.9342447916666666, + "grad_norm": 14.376570701599121, + "learning_rate": 8.717428106529334e-06, + "loss": 4.8666, + "step": 45920 + }, + { + "epoch": 0.9343465169270834, + "grad_norm": 14.775856018066406, + "learning_rate": 8.717160809590853e-06, + "loss": 5.2992, + "step": 45925 + }, + { + "epoch": 0.9344482421875, + "grad_norm": 29.134620666503906, + "learning_rate": 8.716893488900892e-06, + "loss": 5.0407, + "step": 45930 + }, + { + "epoch": 0.9345499674479166, + "grad_norm": 21.61101531982422, + "learning_rate": 8.716626144461164e-06, + "loss": 4.7569, + "step": 45935 + }, + { + "epoch": 0.9346516927083334, + "grad_norm": 21.251129150390625, + "learning_rate": 8.716358776273375e-06, + "loss": 5.1185, + "step": 45940 + }, + { + "epoch": 0.93475341796875, + "grad_norm": 21.739404678344727, + "learning_rate": 8.716091384339231e-06, + "loss": 4.9169, + "step": 45945 + }, + { + "epoch": 0.9348551432291666, + "grad_norm": 19.873300552368164, + "learning_rate": 8.715823968660444e-06, + "loss": 5.0616, + "step": 45950 + }, + { + "epoch": 0.9349568684895834, + "grad_norm": 16.223054885864258, + "learning_rate": 8.715556529238723e-06, + "loss": 5.2174, + "step": 45955 + }, + { + "epoch": 0.93505859375, + "grad_norm": 18.172592163085938, + "learning_rate": 8.715289066075772e-06, + "loss": 5.0621, + "step": 45960 + }, + { + "epoch": 0.9351603190104166, + "grad_norm": 16.040834426879883, + "learning_rate": 8.715021579173306e-06, + "loss": 5.0898, + "step": 45965 + }, + { + "epoch": 0.9352620442708334, + "grad_norm": 11.76721477508545, + "learning_rate": 8.714754068533028e-06, + "loss": 5.0524, + "step": 45970 + }, + { + "epoch": 0.93536376953125, + "grad_norm": 16.15619468688965, + "learning_rate": 8.714486534156653e-06, + "loss": 4.9872, + "step": 45975 + }, + { + "epoch": 0.9354654947916666, + "grad_norm": 15.126753807067871, + "learning_rate": 8.71421897604589e-06, + "loss": 4.9583, + "step": 45980 + }, + { + "epoch": 0.9355672200520834, + "grad_norm": 20.158143997192383, + "learning_rate": 8.713951394202445e-06, + "loss": 4.9645, + "step": 45985 + }, + { + "epoch": 0.9356689453125, + "grad_norm": 16.085533142089844, + "learning_rate": 8.713683788628028e-06, + "loss": 4.9663, + "step": 45990 + }, + { + "epoch": 0.9357706705729166, + "grad_norm": 19.265514373779297, + "learning_rate": 8.713416159324352e-06, + "loss": 4.9227, + "step": 45995 + }, + { + "epoch": 0.9358723958333334, + "grad_norm": 14.493550300598145, + "learning_rate": 8.713148506293125e-06, + "loss": 4.9663, + "step": 46000 + }, + { + "epoch": 0.93597412109375, + "grad_norm": 18.310440063476562, + "learning_rate": 8.712880829536058e-06, + "loss": 5.1179, + "step": 46005 + }, + { + "epoch": 0.9360758463541666, + "grad_norm": 14.40590763092041, + "learning_rate": 8.712613129054862e-06, + "loss": 4.9837, + "step": 46010 + }, + { + "epoch": 0.9361775716145834, + "grad_norm": 17.173128128051758, + "learning_rate": 8.712345404851246e-06, + "loss": 5.1742, + "step": 46015 + }, + { + "epoch": 0.936279296875, + "grad_norm": 13.704706192016602, + "learning_rate": 8.712077656926922e-06, + "loss": 4.8662, + "step": 46020 + }, + { + "epoch": 0.9363810221354166, + "grad_norm": 24.07213020324707, + "learning_rate": 8.7118098852836e-06, + "loss": 5.0827, + "step": 46025 + }, + { + "epoch": 0.9364827473958334, + "grad_norm": 15.3742094039917, + "learning_rate": 8.711542089922989e-06, + "loss": 4.9411, + "step": 46030 + }, + { + "epoch": 0.93658447265625, + "grad_norm": 16.705623626708984, + "learning_rate": 8.711274270846804e-06, + "loss": 5.0361, + "step": 46035 + }, + { + "epoch": 0.9366861979166666, + "grad_norm": 23.41046142578125, + "learning_rate": 8.711006428056755e-06, + "loss": 4.9385, + "step": 46040 + }, + { + "epoch": 0.9367879231770834, + "grad_norm": 21.270639419555664, + "learning_rate": 8.710738561554553e-06, + "loss": 5.179, + "step": 46045 + }, + { + "epoch": 0.9368896484375, + "grad_norm": 16.000476837158203, + "learning_rate": 8.710470671341909e-06, + "loss": 5.0108, + "step": 46050 + }, + { + "epoch": 0.9369913736979166, + "grad_norm": 15.216069221496582, + "learning_rate": 8.710202757420535e-06, + "loss": 5.0414, + "step": 46055 + }, + { + "epoch": 0.9370930989583334, + "grad_norm": 15.575425148010254, + "learning_rate": 8.709934819792143e-06, + "loss": 5.2124, + "step": 46060 + }, + { + "epoch": 0.93719482421875, + "grad_norm": 20.14999771118164, + "learning_rate": 8.709666858458445e-06, + "loss": 5.2127, + "step": 46065 + }, + { + "epoch": 0.9372965494791666, + "grad_norm": 15.001408576965332, + "learning_rate": 8.709398873421154e-06, + "loss": 4.9067, + "step": 46070 + }, + { + "epoch": 0.9373982747395834, + "grad_norm": 18.61107063293457, + "learning_rate": 8.709130864681981e-06, + "loss": 4.8849, + "step": 46075 + }, + { + "epoch": 0.9375, + "grad_norm": 20.770261764526367, + "learning_rate": 8.70886283224264e-06, + "loss": 5.108, + "step": 46080 + }, + { + "epoch": 0.9376017252604166, + "grad_norm": 24.121248245239258, + "learning_rate": 8.708594776104843e-06, + "loss": 5.0095, + "step": 46085 + }, + { + "epoch": 0.9377034505208334, + "grad_norm": 18.312671661376953, + "learning_rate": 8.708326696270302e-06, + "loss": 5.2266, + "step": 46090 + }, + { + "epoch": 0.93780517578125, + "grad_norm": 17.878568649291992, + "learning_rate": 8.708058592740731e-06, + "loss": 5.1354, + "step": 46095 + }, + { + "epoch": 0.9379069010416666, + "grad_norm": 20.123952865600586, + "learning_rate": 8.707790465517843e-06, + "loss": 4.7686, + "step": 46100 + }, + { + "epoch": 0.9380086263020834, + "grad_norm": 19.522228240966797, + "learning_rate": 8.707522314603349e-06, + "loss": 5.1942, + "step": 46105 + }, + { + "epoch": 0.9381103515625, + "grad_norm": 19.2492733001709, + "learning_rate": 8.707254139998965e-06, + "loss": 5.0121, + "step": 46110 + }, + { + "epoch": 0.9382120768229166, + "grad_norm": 14.766759872436523, + "learning_rate": 8.706985941706408e-06, + "loss": 5.2619, + "step": 46115 + }, + { + "epoch": 0.9383138020833334, + "grad_norm": 14.767952919006348, + "learning_rate": 8.706717719727382e-06, + "loss": 5.1983, + "step": 46120 + }, + { + "epoch": 0.93841552734375, + "grad_norm": 17.523061752319336, + "learning_rate": 8.70644947406361e-06, + "loss": 4.995, + "step": 46125 + }, + { + "epoch": 0.9385172526041666, + "grad_norm": 16.628358840942383, + "learning_rate": 8.706181204716802e-06, + "loss": 5.4164, + "step": 46130 + }, + { + "epoch": 0.9386189778645834, + "grad_norm": 19.330066680908203, + "learning_rate": 8.705912911688672e-06, + "loss": 5.4239, + "step": 46135 + }, + { + "epoch": 0.938720703125, + "grad_norm": 18.48580551147461, + "learning_rate": 8.705644594980934e-06, + "loss": 5.0911, + "step": 46140 + }, + { + "epoch": 0.9388224283854166, + "grad_norm": 16.569467544555664, + "learning_rate": 8.705376254595304e-06, + "loss": 5.1277, + "step": 46145 + }, + { + "epoch": 0.9389241536458334, + "grad_norm": 16.951431274414062, + "learning_rate": 8.705107890533495e-06, + "loss": 5.1042, + "step": 46150 + }, + { + "epoch": 0.93902587890625, + "grad_norm": 19.104900360107422, + "learning_rate": 8.704839502797224e-06, + "loss": 4.7895, + "step": 46155 + }, + { + "epoch": 0.9391276041666666, + "grad_norm": 18.756196975708008, + "learning_rate": 8.704571091388206e-06, + "loss": 4.9979, + "step": 46160 + }, + { + "epoch": 0.9392293294270834, + "grad_norm": 16.291610717773438, + "learning_rate": 8.704302656308152e-06, + "loss": 4.7759, + "step": 46165 + }, + { + "epoch": 0.9393310546875, + "grad_norm": 19.288835525512695, + "learning_rate": 8.704034197558781e-06, + "loss": 4.9207, + "step": 46170 + }, + { + "epoch": 0.9394327799479166, + "grad_norm": 21.552825927734375, + "learning_rate": 8.703765715141808e-06, + "loss": 4.948, + "step": 46175 + }, + { + "epoch": 0.9395345052083334, + "grad_norm": 18.691638946533203, + "learning_rate": 8.703497209058947e-06, + "loss": 5.1427, + "step": 46180 + }, + { + "epoch": 0.93963623046875, + "grad_norm": 15.355193138122559, + "learning_rate": 8.703228679311914e-06, + "loss": 4.7981, + "step": 46185 + }, + { + "epoch": 0.9397379557291666, + "grad_norm": 12.204293251037598, + "learning_rate": 8.702960125902427e-06, + "loss": 5.0416, + "step": 46190 + }, + { + "epoch": 0.9398396809895834, + "grad_norm": 17.45279884338379, + "learning_rate": 8.702691548832199e-06, + "loss": 4.977, + "step": 46195 + }, + { + "epoch": 0.93994140625, + "grad_norm": 13.161735534667969, + "learning_rate": 8.702422948102948e-06, + "loss": 4.9954, + "step": 46200 + }, + { + "epoch": 0.9400431315104166, + "grad_norm": 15.075936317443848, + "learning_rate": 8.702154323716389e-06, + "loss": 5.024, + "step": 46205 + }, + { + "epoch": 0.9401448567708334, + "grad_norm": 17.024843215942383, + "learning_rate": 8.70188567567424e-06, + "loss": 4.9344, + "step": 46210 + }, + { + "epoch": 0.94024658203125, + "grad_norm": 16.327898025512695, + "learning_rate": 8.701617003978215e-06, + "loss": 4.8877, + "step": 46215 + }, + { + "epoch": 0.9403483072916666, + "grad_norm": 16.78786277770996, + "learning_rate": 8.701348308630033e-06, + "loss": 4.9268, + "step": 46220 + }, + { + "epoch": 0.9404500325520834, + "grad_norm": 19.60451316833496, + "learning_rate": 8.701079589631411e-06, + "loss": 5.0557, + "step": 46225 + }, + { + "epoch": 0.9405517578125, + "grad_norm": 28.390361785888672, + "learning_rate": 8.700810846984065e-06, + "loss": 5.1711, + "step": 46230 + }, + { + "epoch": 0.9406534830729166, + "grad_norm": 12.592581748962402, + "learning_rate": 8.700542080689713e-06, + "loss": 4.9267, + "step": 46235 + }, + { + "epoch": 0.9407552083333334, + "grad_norm": 13.643098831176758, + "learning_rate": 8.700273290750071e-06, + "loss": 4.9192, + "step": 46240 + }, + { + "epoch": 0.94085693359375, + "grad_norm": 15.880212783813477, + "learning_rate": 8.700004477166858e-06, + "loss": 5.2593, + "step": 46245 + }, + { + "epoch": 0.9409586588541666, + "grad_norm": 14.672184944152832, + "learning_rate": 8.69973563994179e-06, + "loss": 4.8505, + "step": 46250 + }, + { + "epoch": 0.9410603841145834, + "grad_norm": 21.686235427856445, + "learning_rate": 8.699466779076586e-06, + "loss": 4.9663, + "step": 46255 + }, + { + "epoch": 0.941162109375, + "grad_norm": 16.915555953979492, + "learning_rate": 8.699197894572964e-06, + "loss": 4.7266, + "step": 46260 + }, + { + "epoch": 0.9412638346354166, + "grad_norm": 18.455549240112305, + "learning_rate": 8.698928986432641e-06, + "loss": 5.0147, + "step": 46265 + }, + { + "epoch": 0.9413655598958334, + "grad_norm": 18.774642944335938, + "learning_rate": 8.698660054657338e-06, + "loss": 4.8368, + "step": 46270 + }, + { + "epoch": 0.94146728515625, + "grad_norm": 21.03612518310547, + "learning_rate": 8.698391099248771e-06, + "loss": 4.9731, + "step": 46275 + }, + { + "epoch": 0.9415690104166666, + "grad_norm": 17.650388717651367, + "learning_rate": 8.698122120208657e-06, + "loss": 5.1591, + "step": 46280 + }, + { + "epoch": 0.9416707356770834, + "grad_norm": 19.53095054626465, + "learning_rate": 8.697853117538717e-06, + "loss": 5.0057, + "step": 46285 + }, + { + "epoch": 0.9417724609375, + "grad_norm": 18.213787078857422, + "learning_rate": 8.69758409124067e-06, + "loss": 5.2447, + "step": 46290 + }, + { + "epoch": 0.9418741861979166, + "grad_norm": 16.22197151184082, + "learning_rate": 8.697315041316234e-06, + "loss": 4.7856, + "step": 46295 + }, + { + "epoch": 0.9419759114583334, + "grad_norm": 15.943615913391113, + "learning_rate": 8.697045967767131e-06, + "loss": 4.9648, + "step": 46300 + }, + { + "epoch": 0.94207763671875, + "grad_norm": 18.625333786010742, + "learning_rate": 8.696776870595073e-06, + "loss": 4.7397, + "step": 46305 + }, + { + "epoch": 0.9421793619791666, + "grad_norm": 23.945655822753906, + "learning_rate": 8.696507749801788e-06, + "loss": 5.1467, + "step": 46310 + }, + { + "epoch": 0.9422810872395834, + "grad_norm": 20.22397232055664, + "learning_rate": 8.696238605388991e-06, + "loss": 5.2915, + "step": 46315 + }, + { + "epoch": 0.9423828125, + "grad_norm": 14.383116722106934, + "learning_rate": 8.695969437358403e-06, + "loss": 4.9643, + "step": 46320 + }, + { + "epoch": 0.9424845377604166, + "grad_norm": 13.002197265625, + "learning_rate": 8.695700245711744e-06, + "loss": 5.0401, + "step": 46325 + }, + { + "epoch": 0.9425862630208334, + "grad_norm": 15.98853874206543, + "learning_rate": 8.695431030450732e-06, + "loss": 5.1104, + "step": 46330 + }, + { + "epoch": 0.94268798828125, + "grad_norm": 23.355209350585938, + "learning_rate": 8.69516179157709e-06, + "loss": 5.0885, + "step": 46335 + }, + { + "epoch": 0.9427897135416666, + "grad_norm": 17.412858963012695, + "learning_rate": 8.694892529092537e-06, + "loss": 4.7163, + "step": 46340 + }, + { + "epoch": 0.9428914388020834, + "grad_norm": 14.098752975463867, + "learning_rate": 8.694623242998795e-06, + "loss": 4.9457, + "step": 46345 + }, + { + "epoch": 0.9429931640625, + "grad_norm": 21.100399017333984, + "learning_rate": 8.694353933297582e-06, + "loss": 4.8515, + "step": 46350 + }, + { + "epoch": 0.9430948893229166, + "grad_norm": 22.43006706237793, + "learning_rate": 8.69408459999062e-06, + "loss": 5.0376, + "step": 46355 + }, + { + "epoch": 0.9431966145833334, + "grad_norm": 19.862884521484375, + "learning_rate": 8.693815243079629e-06, + "loss": 5.4066, + "step": 46360 + }, + { + "epoch": 0.94329833984375, + "grad_norm": 18.444622039794922, + "learning_rate": 8.693545862566331e-06, + "loss": 5.3708, + "step": 46365 + }, + { + "epoch": 0.9434000651041666, + "grad_norm": 13.216726303100586, + "learning_rate": 8.693276458452449e-06, + "loss": 5.0377, + "step": 46370 + }, + { + "epoch": 0.9435017903645834, + "grad_norm": 19.39557456970215, + "learning_rate": 8.693007030739702e-06, + "loss": 4.9841, + "step": 46375 + }, + { + "epoch": 0.943603515625, + "grad_norm": 21.804609298706055, + "learning_rate": 8.692737579429813e-06, + "loss": 5.0184, + "step": 46380 + }, + { + "epoch": 0.9437052408854166, + "grad_norm": 22.1883602142334, + "learning_rate": 8.692468104524504e-06, + "loss": 5.1344, + "step": 46385 + }, + { + "epoch": 0.9438069661458334, + "grad_norm": 17.37307357788086, + "learning_rate": 8.692198606025493e-06, + "loss": 5.0621, + "step": 46390 + }, + { + "epoch": 0.94390869140625, + "grad_norm": 18.314748764038086, + "learning_rate": 8.691929083934507e-06, + "loss": 5.2994, + "step": 46395 + }, + { + "epoch": 0.9440104166666666, + "grad_norm": 20.88266372680664, + "learning_rate": 8.691659538253266e-06, + "loss": 5.1847, + "step": 46400 + }, + { + "epoch": 0.9441121419270834, + "grad_norm": 18.704917907714844, + "learning_rate": 8.691389968983492e-06, + "loss": 5.0693, + "step": 46405 + }, + { + "epoch": 0.9442138671875, + "grad_norm": 13.01060676574707, + "learning_rate": 8.69112037612691e-06, + "loss": 4.7903, + "step": 46410 + }, + { + "epoch": 0.9443155924479166, + "grad_norm": 15.234101295471191, + "learning_rate": 8.69085075968524e-06, + "loss": 4.8469, + "step": 46415 + }, + { + "epoch": 0.9444173177083334, + "grad_norm": 12.430242538452148, + "learning_rate": 8.690581119660201e-06, + "loss": 4.9033, + "step": 46420 + }, + { + "epoch": 0.94451904296875, + "grad_norm": 12.617941856384277, + "learning_rate": 8.690311456053524e-06, + "loss": 5.0187, + "step": 46425 + }, + { + "epoch": 0.9446207682291666, + "grad_norm": 15.51794719696045, + "learning_rate": 8.690041768866928e-06, + "loss": 5.0069, + "step": 46430 + }, + { + "epoch": 0.9447224934895834, + "grad_norm": 17.01591682434082, + "learning_rate": 8.689772058102135e-06, + "loss": 5.0389, + "step": 46435 + }, + { + "epoch": 0.94482421875, + "grad_norm": 17.323593139648438, + "learning_rate": 8.689502323760872e-06, + "loss": 5.1008, + "step": 46440 + }, + { + "epoch": 0.9449259440104166, + "grad_norm": 15.73176097869873, + "learning_rate": 8.689232565844859e-06, + "loss": 5.1464, + "step": 46445 + }, + { + "epoch": 0.9450276692708334, + "grad_norm": 18.792098999023438, + "learning_rate": 8.688962784355821e-06, + "loss": 4.9886, + "step": 46450 + }, + { + "epoch": 0.94512939453125, + "grad_norm": 10.650278091430664, + "learning_rate": 8.688692979295482e-06, + "loss": 5.0325, + "step": 46455 + }, + { + "epoch": 0.9452311197916666, + "grad_norm": 18.397565841674805, + "learning_rate": 8.688423150665566e-06, + "loss": 4.9996, + "step": 46460 + }, + { + "epoch": 0.9453328450520834, + "grad_norm": 18.12125015258789, + "learning_rate": 8.688153298467796e-06, + "loss": 5.1361, + "step": 46465 + }, + { + "epoch": 0.9454345703125, + "grad_norm": 15.434760093688965, + "learning_rate": 8.687883422703898e-06, + "loss": 5.009, + "step": 46470 + }, + { + "epoch": 0.9455362955729166, + "grad_norm": 27.667335510253906, + "learning_rate": 8.687613523375597e-06, + "loss": 5.0214, + "step": 46475 + }, + { + "epoch": 0.9456380208333334, + "grad_norm": 14.2490234375, + "learning_rate": 8.687343600484614e-06, + "loss": 5.0312, + "step": 46480 + }, + { + "epoch": 0.94573974609375, + "grad_norm": 16.767967224121094, + "learning_rate": 8.687073654032677e-06, + "loss": 5.1954, + "step": 46485 + }, + { + "epoch": 0.9458414713541666, + "grad_norm": 21.98805046081543, + "learning_rate": 8.68680368402151e-06, + "loss": 5.0096, + "step": 46490 + }, + { + "epoch": 0.9459431966145834, + "grad_norm": 13.527399063110352, + "learning_rate": 8.686533690452836e-06, + "loss": 4.8778, + "step": 46495 + }, + { + "epoch": 0.946044921875, + "grad_norm": 15.678458213806152, + "learning_rate": 8.686263673328383e-06, + "loss": 5.1215, + "step": 46500 + }, + { + "epoch": 0.9461466471354166, + "grad_norm": 20.849430084228516, + "learning_rate": 8.685993632649876e-06, + "loss": 5.0035, + "step": 46505 + }, + { + "epoch": 0.9462483723958334, + "grad_norm": 21.25079917907715, + "learning_rate": 8.68572356841904e-06, + "loss": 4.8046, + "step": 46510 + }, + { + "epoch": 0.94635009765625, + "grad_norm": 15.2230806350708, + "learning_rate": 8.6854534806376e-06, + "loss": 5.1022, + "step": 46515 + }, + { + "epoch": 0.9464518229166666, + "grad_norm": 18.585311889648438, + "learning_rate": 8.685183369307281e-06, + "loss": 4.9361, + "step": 46520 + }, + { + "epoch": 0.9465535481770834, + "grad_norm": 13.658872604370117, + "learning_rate": 8.684913234429813e-06, + "loss": 4.9949, + "step": 46525 + }, + { + "epoch": 0.9466552734375, + "grad_norm": 14.085484504699707, + "learning_rate": 8.684643076006915e-06, + "loss": 5.0369, + "step": 46530 + }, + { + "epoch": 0.9467569986979166, + "grad_norm": 16.628986358642578, + "learning_rate": 8.68437289404032e-06, + "loss": 5.0615, + "step": 46535 + }, + { + "epoch": 0.9468587239583334, + "grad_norm": 20.36720085144043, + "learning_rate": 8.684102688531752e-06, + "loss": 4.9759, + "step": 46540 + }, + { + "epoch": 0.94696044921875, + "grad_norm": 14.344942092895508, + "learning_rate": 8.683832459482937e-06, + "loss": 4.9447, + "step": 46545 + }, + { + "epoch": 0.9470621744791666, + "grad_norm": 15.94544792175293, + "learning_rate": 8.683562206895602e-06, + "loss": 5.0, + "step": 46550 + }, + { + "epoch": 0.9471638997395834, + "grad_norm": 16.15205955505371, + "learning_rate": 8.683291930771474e-06, + "loss": 5.4642, + "step": 46555 + }, + { + "epoch": 0.947265625, + "grad_norm": 19.133813858032227, + "learning_rate": 8.683021631112278e-06, + "loss": 4.9694, + "step": 46560 + }, + { + "epoch": 0.9473673502604166, + "grad_norm": 17.254535675048828, + "learning_rate": 8.682751307919746e-06, + "loss": 4.8816, + "step": 46565 + }, + { + "epoch": 0.9474690755208334, + "grad_norm": 17.42214012145996, + "learning_rate": 8.682480961195598e-06, + "loss": 5.212, + "step": 46570 + }, + { + "epoch": 0.94757080078125, + "grad_norm": 16.333356857299805, + "learning_rate": 8.682210590941569e-06, + "loss": 5.0847, + "step": 46575 + }, + { + "epoch": 0.9476725260416666, + "grad_norm": 13.849227905273438, + "learning_rate": 8.681940197159382e-06, + "loss": 5.2341, + "step": 46580 + }, + { + "epoch": 0.9477742513020834, + "grad_norm": 17.535778045654297, + "learning_rate": 8.681669779850768e-06, + "loss": 5.1921, + "step": 46585 + }, + { + "epoch": 0.9478759765625, + "grad_norm": 23.445724487304688, + "learning_rate": 8.68139933901745e-06, + "loss": 5.1438, + "step": 46590 + }, + { + "epoch": 0.9479777018229166, + "grad_norm": 17.836200714111328, + "learning_rate": 8.68112887466116e-06, + "loss": 5.1075, + "step": 46595 + }, + { + "epoch": 0.9480794270833334, + "grad_norm": 15.65931224822998, + "learning_rate": 8.680858386783625e-06, + "loss": 4.8341, + "step": 46600 + }, + { + "epoch": 0.94818115234375, + "grad_norm": 16.76944923400879, + "learning_rate": 8.680587875386573e-06, + "loss": 5.1282, + "step": 46605 + }, + { + "epoch": 0.9482828776041666, + "grad_norm": 12.642790794372559, + "learning_rate": 8.680317340471734e-06, + "loss": 5.0641, + "step": 46610 + }, + { + "epoch": 0.9483846028645834, + "grad_norm": 19.61115074157715, + "learning_rate": 8.680046782040833e-06, + "loss": 4.8541, + "step": 46615 + }, + { + "epoch": 0.948486328125, + "grad_norm": 24.344066619873047, + "learning_rate": 8.679776200095605e-06, + "loss": 5.2134, + "step": 46620 + }, + { + "epoch": 0.9485880533854166, + "grad_norm": 13.686387062072754, + "learning_rate": 8.679505594637772e-06, + "loss": 5.029, + "step": 46625 + }, + { + "epoch": 0.9486897786458334, + "grad_norm": 12.111117362976074, + "learning_rate": 8.679234965669067e-06, + "loss": 4.8351, + "step": 46630 + }, + { + "epoch": 0.94879150390625, + "grad_norm": 14.788562774658203, + "learning_rate": 8.678964313191221e-06, + "loss": 4.9866, + "step": 46635 + }, + { + "epoch": 0.9488932291666666, + "grad_norm": 13.758666038513184, + "learning_rate": 8.678693637205957e-06, + "loss": 4.9315, + "step": 46640 + }, + { + "epoch": 0.9489949544270834, + "grad_norm": 16.45059585571289, + "learning_rate": 8.67842293771501e-06, + "loss": 4.9537, + "step": 46645 + }, + { + "epoch": 0.9490966796875, + "grad_norm": 23.175901412963867, + "learning_rate": 8.678152214720109e-06, + "loss": 5.1134, + "step": 46650 + }, + { + "epoch": 0.9491984049479166, + "grad_norm": 16.97336769104004, + "learning_rate": 8.677881468222983e-06, + "loss": 4.9116, + "step": 46655 + }, + { + "epoch": 0.9493001302083334, + "grad_norm": 20.258119583129883, + "learning_rate": 8.677610698225361e-06, + "loss": 4.8543, + "step": 46660 + }, + { + "epoch": 0.94940185546875, + "grad_norm": 12.634359359741211, + "learning_rate": 8.677339904728973e-06, + "loss": 5.0822, + "step": 46665 + }, + { + "epoch": 0.9495035807291666, + "grad_norm": 14.238597869873047, + "learning_rate": 8.677069087735553e-06, + "loss": 5.2819, + "step": 46670 + }, + { + "epoch": 0.9496053059895834, + "grad_norm": 18.72743034362793, + "learning_rate": 8.676798247246828e-06, + "loss": 4.6496, + "step": 46675 + }, + { + "epoch": 0.94970703125, + "grad_norm": 16.791906356811523, + "learning_rate": 8.676527383264529e-06, + "loss": 5.1515, + "step": 46680 + }, + { + "epoch": 0.9498087565104166, + "grad_norm": 17.40926170349121, + "learning_rate": 8.676256495790386e-06, + "loss": 4.8147, + "step": 46685 + }, + { + "epoch": 0.9499104817708334, + "grad_norm": 17.241836547851562, + "learning_rate": 8.675985584826131e-06, + "loss": 5.1219, + "step": 46690 + }, + { + "epoch": 0.95001220703125, + "grad_norm": 16.65614128112793, + "learning_rate": 8.675714650373499e-06, + "loss": 4.9325, + "step": 46695 + }, + { + "epoch": 0.9501139322916666, + "grad_norm": 16.452808380126953, + "learning_rate": 8.675443692434214e-06, + "loss": 5.2768, + "step": 46700 + }, + { + "epoch": 0.9502156575520834, + "grad_norm": 18.646656036376953, + "learning_rate": 8.67517271101001e-06, + "loss": 5.1148, + "step": 46705 + }, + { + "epoch": 0.9503173828125, + "grad_norm": 17.172632217407227, + "learning_rate": 8.674901706102618e-06, + "loss": 4.9856, + "step": 46710 + }, + { + "epoch": 0.9504191080729166, + "grad_norm": 12.749005317687988, + "learning_rate": 8.674630677713774e-06, + "loss": 4.864, + "step": 46715 + }, + { + "epoch": 0.9505208333333334, + "grad_norm": 17.951107025146484, + "learning_rate": 8.674359625845205e-06, + "loss": 5.2354, + "step": 46720 + }, + { + "epoch": 0.95062255859375, + "grad_norm": 21.65696144104004, + "learning_rate": 8.674088550498645e-06, + "loss": 5.0753, + "step": 46725 + }, + { + "epoch": 0.9507242838541666, + "grad_norm": 17.712419509887695, + "learning_rate": 8.673817451675823e-06, + "loss": 4.8403, + "step": 46730 + }, + { + "epoch": 0.9508260091145834, + "grad_norm": 17.766176223754883, + "learning_rate": 8.673546329378477e-06, + "loss": 4.8669, + "step": 46735 + }, + { + "epoch": 0.950927734375, + "grad_norm": 14.156540870666504, + "learning_rate": 8.673275183608334e-06, + "loss": 4.9994, + "step": 46740 + }, + { + "epoch": 0.9510294596354166, + "grad_norm": 19.903057098388672, + "learning_rate": 8.67300401436713e-06, + "loss": 5.1839, + "step": 46745 + }, + { + "epoch": 0.9511311848958334, + "grad_norm": 18.128562927246094, + "learning_rate": 8.672732821656596e-06, + "loss": 4.9027, + "step": 46750 + }, + { + "epoch": 0.95123291015625, + "grad_norm": 21.59538459777832, + "learning_rate": 8.672461605478463e-06, + "loss": 5.0349, + "step": 46755 + }, + { + "epoch": 0.9513346354166666, + "grad_norm": 19.384809494018555, + "learning_rate": 8.67219036583447e-06, + "loss": 4.9778, + "step": 46760 + }, + { + "epoch": 0.9514363606770834, + "grad_norm": 17.041170120239258, + "learning_rate": 8.671919102726343e-06, + "loss": 4.7843, + "step": 46765 + }, + { + "epoch": 0.9515380859375, + "grad_norm": 19.318817138671875, + "learning_rate": 8.67164781615582e-06, + "loss": 4.9811, + "step": 46770 + }, + { + "epoch": 0.9516398111979166, + "grad_norm": 17.41954231262207, + "learning_rate": 8.671376506124631e-06, + "loss": 4.9353, + "step": 46775 + }, + { + "epoch": 0.9517415364583334, + "grad_norm": 15.566298484802246, + "learning_rate": 8.671105172634513e-06, + "loss": 5.1631, + "step": 46780 + }, + { + "epoch": 0.95184326171875, + "grad_norm": 16.46381950378418, + "learning_rate": 8.670833815687199e-06, + "loss": 5.1307, + "step": 46785 + }, + { + "epoch": 0.9519449869791666, + "grad_norm": 22.875343322753906, + "learning_rate": 8.670562435284422e-06, + "loss": 4.8234, + "step": 46790 + }, + { + "epoch": 0.9520467122395834, + "grad_norm": 15.273146629333496, + "learning_rate": 8.670291031427917e-06, + "loss": 4.9845, + "step": 46795 + }, + { + "epoch": 0.9521484375, + "grad_norm": 19.725351333618164, + "learning_rate": 8.670019604119416e-06, + "loss": 5.0784, + "step": 46800 + }, + { + "epoch": 0.9522501627604166, + "grad_norm": 18.364721298217773, + "learning_rate": 8.669748153360655e-06, + "loss": 5.1318, + "step": 46805 + }, + { + "epoch": 0.9523518880208334, + "grad_norm": 19.765010833740234, + "learning_rate": 8.669476679153368e-06, + "loss": 5.2126, + "step": 46810 + }, + { + "epoch": 0.95245361328125, + "grad_norm": 20.686214447021484, + "learning_rate": 8.669205181499291e-06, + "loss": 5.2497, + "step": 46815 + }, + { + "epoch": 0.9525553385416666, + "grad_norm": 14.595255851745605, + "learning_rate": 8.668933660400157e-06, + "loss": 5.2672, + "step": 46820 + }, + { + "epoch": 0.9526570638020834, + "grad_norm": 16.219810485839844, + "learning_rate": 8.668662115857702e-06, + "loss": 4.9139, + "step": 46825 + }, + { + "epoch": 0.9527587890625, + "grad_norm": 19.44927978515625, + "learning_rate": 8.668390547873659e-06, + "loss": 5.2248, + "step": 46830 + }, + { + "epoch": 0.9528605143229166, + "grad_norm": 14.203231811523438, + "learning_rate": 8.668118956449767e-06, + "loss": 4.9747, + "step": 46835 + }, + { + "epoch": 0.9529622395833334, + "grad_norm": 20.46356964111328, + "learning_rate": 8.667847341587758e-06, + "loss": 4.9295, + "step": 46840 + }, + { + "epoch": 0.95306396484375, + "grad_norm": 18.857107162475586, + "learning_rate": 8.667575703289369e-06, + "loss": 5.21, + "step": 46845 + }, + { + "epoch": 0.9531656901041666, + "grad_norm": 17.54464340209961, + "learning_rate": 8.667304041556334e-06, + "loss": 5.2603, + "step": 46850 + }, + { + "epoch": 0.9532674153645834, + "grad_norm": 17.738426208496094, + "learning_rate": 8.667032356390393e-06, + "loss": 5.3415, + "step": 46855 + }, + { + "epoch": 0.953369140625, + "grad_norm": 16.31195640563965, + "learning_rate": 8.666760647793278e-06, + "loss": 5.0534, + "step": 46860 + }, + { + "epoch": 0.9534708658854166, + "grad_norm": 19.210617065429688, + "learning_rate": 8.666488915766726e-06, + "loss": 5.0566, + "step": 46865 + }, + { + "epoch": 0.9535725911458334, + "grad_norm": 16.865886688232422, + "learning_rate": 8.666217160312476e-06, + "loss": 5.082, + "step": 46870 + }, + { + "epoch": 0.95367431640625, + "grad_norm": 16.46302032470703, + "learning_rate": 8.665945381432261e-06, + "loss": 5.0001, + "step": 46875 + }, + { + "epoch": 0.9537760416666666, + "grad_norm": 18.22321319580078, + "learning_rate": 8.665673579127817e-06, + "loss": 4.8662, + "step": 46880 + }, + { + "epoch": 0.9538777669270834, + "grad_norm": 15.739984512329102, + "learning_rate": 8.665401753400885e-06, + "loss": 5.2236, + "step": 46885 + }, + { + "epoch": 0.9539794921875, + "grad_norm": 17.63494110107422, + "learning_rate": 8.665129904253198e-06, + "loss": 5.3078, + "step": 46890 + }, + { + "epoch": 0.9540812174479166, + "grad_norm": 13.490073204040527, + "learning_rate": 8.664858031686495e-06, + "loss": 5.5114, + "step": 46895 + }, + { + "epoch": 0.9541829427083334, + "grad_norm": 16.106529235839844, + "learning_rate": 8.664586135702512e-06, + "loss": 4.9999, + "step": 46900 + }, + { + "epoch": 0.95428466796875, + "grad_norm": 19.618267059326172, + "learning_rate": 8.664314216302986e-06, + "loss": 5.1443, + "step": 46905 + }, + { + "epoch": 0.9543863932291666, + "grad_norm": 21.011110305786133, + "learning_rate": 8.664042273489656e-06, + "loss": 5.1069, + "step": 46910 + }, + { + "epoch": 0.9544881184895834, + "grad_norm": 17.80475425720215, + "learning_rate": 8.66377030726426e-06, + "loss": 5.056, + "step": 46915 + }, + { + "epoch": 0.95458984375, + "grad_norm": 17.019582748413086, + "learning_rate": 8.663498317628535e-06, + "loss": 4.8978, + "step": 46920 + }, + { + "epoch": 0.9546915690104166, + "grad_norm": 15.508393287658691, + "learning_rate": 8.663226304584218e-06, + "loss": 5.0946, + "step": 46925 + }, + { + "epoch": 0.9547932942708334, + "grad_norm": 17.446897506713867, + "learning_rate": 8.662954268133047e-06, + "loss": 4.9383, + "step": 46930 + }, + { + "epoch": 0.95489501953125, + "grad_norm": 19.68635368347168, + "learning_rate": 8.662682208276762e-06, + "loss": 4.7744, + "step": 46935 + }, + { + "epoch": 0.9549967447916666, + "grad_norm": 16.7099666595459, + "learning_rate": 8.6624101250171e-06, + "loss": 5.1853, + "step": 46940 + }, + { + "epoch": 0.9550984700520834, + "grad_norm": 20.903575897216797, + "learning_rate": 8.662138018355799e-06, + "loss": 5.1053, + "step": 46945 + }, + { + "epoch": 0.9552001953125, + "grad_norm": 13.083740234375, + "learning_rate": 8.6618658882946e-06, + "loss": 4.993, + "step": 46950 + }, + { + "epoch": 0.9553019205729166, + "grad_norm": 17.30512046813965, + "learning_rate": 8.66159373483524e-06, + "loss": 5.0342, + "step": 46955 + }, + { + "epoch": 0.9554036458333334, + "grad_norm": 16.64785385131836, + "learning_rate": 8.661321557979458e-06, + "loss": 4.9473, + "step": 46960 + }, + { + "epoch": 0.95550537109375, + "grad_norm": 21.24454116821289, + "learning_rate": 8.661049357728995e-06, + "loss": 5.3216, + "step": 46965 + }, + { + "epoch": 0.9556070963541666, + "grad_norm": 17.062002182006836, + "learning_rate": 8.660777134085587e-06, + "loss": 4.9303, + "step": 46970 + }, + { + "epoch": 0.9557088216145834, + "grad_norm": 23.813627243041992, + "learning_rate": 8.660504887050975e-06, + "loss": 5.1467, + "step": 46975 + }, + { + "epoch": 0.955810546875, + "grad_norm": 15.3272123336792, + "learning_rate": 8.6602326166269e-06, + "loss": 4.8107, + "step": 46980 + }, + { + "epoch": 0.9559122721354166, + "grad_norm": 18.101221084594727, + "learning_rate": 8.659960322815099e-06, + "loss": 4.923, + "step": 46985 + }, + { + "epoch": 0.9560139973958334, + "grad_norm": 12.130484580993652, + "learning_rate": 8.659688005617314e-06, + "loss": 5.3075, + "step": 46990 + }, + { + "epoch": 0.95611572265625, + "grad_norm": 32.808631896972656, + "learning_rate": 8.659415665035287e-06, + "loss": 4.8933, + "step": 46995 + }, + { + "epoch": 0.9562174479166666, + "grad_norm": 17.190265655517578, + "learning_rate": 8.659143301070752e-06, + "loss": 5.1831, + "step": 47000 + }, + { + "epoch": 0.9563191731770834, + "grad_norm": 20.452117919921875, + "learning_rate": 8.658870913725454e-06, + "loss": 4.8796, + "step": 47005 + }, + { + "epoch": 0.9564208984375, + "grad_norm": 13.131489753723145, + "learning_rate": 8.658598503001134e-06, + "loss": 4.9129, + "step": 47010 + }, + { + "epoch": 0.9565226236979166, + "grad_norm": 16.567710876464844, + "learning_rate": 8.65832606889953e-06, + "loss": 5.0488, + "step": 47015 + }, + { + "epoch": 0.9566243489583334, + "grad_norm": 20.72747230529785, + "learning_rate": 8.658053611422382e-06, + "loss": 5.0145, + "step": 47020 + }, + { + "epoch": 0.95672607421875, + "grad_norm": 15.766063690185547, + "learning_rate": 8.657781130571435e-06, + "loss": 5.2109, + "step": 47025 + }, + { + "epoch": 0.9568277994791666, + "grad_norm": 15.407547950744629, + "learning_rate": 8.657508626348426e-06, + "loss": 4.7682, + "step": 47030 + }, + { + "epoch": 0.9569295247395834, + "grad_norm": 21.514583587646484, + "learning_rate": 8.657236098755098e-06, + "loss": 5.4945, + "step": 47035 + }, + { + "epoch": 0.95703125, + "grad_norm": 24.01378059387207, + "learning_rate": 8.656963547793194e-06, + "loss": 5.1845, + "step": 47040 + }, + { + "epoch": 0.9571329752604166, + "grad_norm": 15.318079948425293, + "learning_rate": 8.656690973464452e-06, + "loss": 4.8961, + "step": 47045 + }, + { + "epoch": 0.9572347005208334, + "grad_norm": 13.4182767868042, + "learning_rate": 8.656418375770617e-06, + "loss": 5.1786, + "step": 47050 + }, + { + "epoch": 0.95733642578125, + "grad_norm": 19.362167358398438, + "learning_rate": 8.656145754713427e-06, + "loss": 4.9128, + "step": 47055 + }, + { + "epoch": 0.9574381510416666, + "grad_norm": 16.25887107849121, + "learning_rate": 8.655873110294627e-06, + "loss": 4.817, + "step": 47060 + }, + { + "epoch": 0.9575398763020834, + "grad_norm": 18.658124923706055, + "learning_rate": 8.65560044251596e-06, + "loss": 4.8787, + "step": 47065 + }, + { + "epoch": 0.9576416015625, + "grad_norm": 19.23420524597168, + "learning_rate": 8.655327751379165e-06, + "loss": 5.2188, + "step": 47070 + }, + { + "epoch": 0.9577433268229166, + "grad_norm": 16.695531845092773, + "learning_rate": 8.655055036885987e-06, + "loss": 4.9899, + "step": 47075 + }, + { + "epoch": 0.9578450520833334, + "grad_norm": 15.953563690185547, + "learning_rate": 8.654782299038166e-06, + "loss": 4.9489, + "step": 47080 + }, + { + "epoch": 0.95794677734375, + "grad_norm": 18.487892150878906, + "learning_rate": 8.65450953783745e-06, + "loss": 4.9767, + "step": 47085 + }, + { + "epoch": 0.9580485026041666, + "grad_norm": 16.450536727905273, + "learning_rate": 8.654236753285575e-06, + "loss": 5.0769, + "step": 47090 + }, + { + "epoch": 0.9581502278645834, + "grad_norm": 20.430007934570312, + "learning_rate": 8.653963945384287e-06, + "loss": 5.0416, + "step": 47095 + }, + { + "epoch": 0.958251953125, + "grad_norm": 20.481121063232422, + "learning_rate": 8.65369111413533e-06, + "loss": 4.9205, + "step": 47100 + }, + { + "epoch": 0.9583536783854166, + "grad_norm": 21.031667709350586, + "learning_rate": 8.653418259540446e-06, + "loss": 5.07, + "step": 47105 + }, + { + "epoch": 0.9584554036458334, + "grad_norm": 21.534893035888672, + "learning_rate": 8.65314538160138e-06, + "loss": 5.3777, + "step": 47110 + }, + { + "epoch": 0.95855712890625, + "grad_norm": 15.696374893188477, + "learning_rate": 8.652872480319877e-06, + "loss": 5.0753, + "step": 47115 + }, + { + "epoch": 0.9586588541666666, + "grad_norm": 18.863840103149414, + "learning_rate": 8.652599555697676e-06, + "loss": 5.0065, + "step": 47120 + }, + { + "epoch": 0.9587605794270834, + "grad_norm": 16.453569412231445, + "learning_rate": 8.652326607736522e-06, + "loss": 4.9518, + "step": 47125 + }, + { + "epoch": 0.9588623046875, + "grad_norm": 13.963264465332031, + "learning_rate": 8.652053636438162e-06, + "loss": 4.9452, + "step": 47130 + }, + { + "epoch": 0.9589640299479166, + "grad_norm": 14.441951751708984, + "learning_rate": 8.651780641804338e-06, + "loss": 4.8957, + "step": 47135 + }, + { + "epoch": 0.9590657552083334, + "grad_norm": 17.091259002685547, + "learning_rate": 8.651507623836795e-06, + "loss": 5.085, + "step": 47140 + }, + { + "epoch": 0.95916748046875, + "grad_norm": 19.565799713134766, + "learning_rate": 8.651234582537278e-06, + "loss": 5.3206, + "step": 47145 + }, + { + "epoch": 0.9592692057291666, + "grad_norm": 17.25426483154297, + "learning_rate": 8.65096151790753e-06, + "loss": 5.1902, + "step": 47150 + }, + { + "epoch": 0.9593709309895834, + "grad_norm": 21.855144500732422, + "learning_rate": 8.650688429949298e-06, + "loss": 4.9413, + "step": 47155 + }, + { + "epoch": 0.95947265625, + "grad_norm": 16.055435180664062, + "learning_rate": 8.650415318664325e-06, + "loss": 5.099, + "step": 47160 + }, + { + "epoch": 0.9595743815104166, + "grad_norm": 20.82807159423828, + "learning_rate": 8.650142184054358e-06, + "loss": 4.912, + "step": 47165 + }, + { + "epoch": 0.9596761067708334, + "grad_norm": 15.542463302612305, + "learning_rate": 8.649869026121141e-06, + "loss": 5.0746, + "step": 47170 + }, + { + "epoch": 0.95977783203125, + "grad_norm": 18.234188079833984, + "learning_rate": 8.649595844866417e-06, + "loss": 5.1191, + "step": 47175 + }, + { + "epoch": 0.9598795572916666, + "grad_norm": 19.391281127929688, + "learning_rate": 8.649322640291938e-06, + "loss": 4.9579, + "step": 47180 + }, + { + "epoch": 0.9599812825520834, + "grad_norm": 18.215595245361328, + "learning_rate": 8.649049412399443e-06, + "loss": 4.9211, + "step": 47185 + }, + { + "epoch": 0.9600830078125, + "grad_norm": 14.569546699523926, + "learning_rate": 8.648776161190682e-06, + "loss": 5.3078, + "step": 47190 + }, + { + "epoch": 0.9601847330729166, + "grad_norm": 22.138504028320312, + "learning_rate": 8.648502886667398e-06, + "loss": 5.1471, + "step": 47195 + }, + { + "epoch": 0.9602864583333334, + "grad_norm": 14.58286190032959, + "learning_rate": 8.64822958883134e-06, + "loss": 5.0346, + "step": 47200 + }, + { + "epoch": 0.96038818359375, + "grad_norm": 16.261117935180664, + "learning_rate": 8.647956267684253e-06, + "loss": 5.0386, + "step": 47205 + }, + { + "epoch": 0.9604899088541666, + "grad_norm": 14.172650337219238, + "learning_rate": 8.647682923227882e-06, + "loss": 5.0024, + "step": 47210 + }, + { + "epoch": 0.9605916341145834, + "grad_norm": 17.57910919189453, + "learning_rate": 8.647409555463977e-06, + "loss": 5.0321, + "step": 47215 + }, + { + "epoch": 0.960693359375, + "grad_norm": 19.140954971313477, + "learning_rate": 8.647136164394283e-06, + "loss": 5.1963, + "step": 47220 + }, + { + "epoch": 0.9607950846354166, + "grad_norm": 22.740436553955078, + "learning_rate": 8.646862750020546e-06, + "loss": 5.1399, + "step": 47225 + }, + { + "epoch": 0.9608968098958334, + "grad_norm": 25.187915802001953, + "learning_rate": 8.646589312344513e-06, + "loss": 5.2627, + "step": 47230 + }, + { + "epoch": 0.96099853515625, + "grad_norm": 18.49036407470703, + "learning_rate": 8.646315851367933e-06, + "loss": 5.0586, + "step": 47235 + }, + { + "epoch": 0.9611002604166666, + "grad_norm": 16.751089096069336, + "learning_rate": 8.646042367092551e-06, + "loss": 5.2063, + "step": 47240 + }, + { + "epoch": 0.9612019856770834, + "grad_norm": 20.238466262817383, + "learning_rate": 8.645768859520116e-06, + "loss": 5.0861, + "step": 47245 + }, + { + "epoch": 0.9613037109375, + "grad_norm": 18.466615676879883, + "learning_rate": 8.645495328652375e-06, + "loss": 5.1624, + "step": 47250 + }, + { + "epoch": 0.9614054361979166, + "grad_norm": 17.18227767944336, + "learning_rate": 8.645221774491078e-06, + "loss": 5.059, + "step": 47255 + }, + { + "epoch": 0.9615071614583334, + "grad_norm": 17.557817459106445, + "learning_rate": 8.644948197037969e-06, + "loss": 5.2173, + "step": 47260 + }, + { + "epoch": 0.96160888671875, + "grad_norm": 21.368404388427734, + "learning_rate": 8.644674596294799e-06, + "loss": 4.6428, + "step": 47265 + }, + { + "epoch": 0.9617106119791666, + "grad_norm": 17.08077049255371, + "learning_rate": 8.644400972263314e-06, + "loss": 5.1129, + "step": 47270 + }, + { + "epoch": 0.9618123372395834, + "grad_norm": 18.80756378173828, + "learning_rate": 8.644127324945265e-06, + "loss": 4.902, + "step": 47275 + }, + { + "epoch": 0.9619140625, + "grad_norm": 15.801793098449707, + "learning_rate": 8.6438536543424e-06, + "loss": 4.8937, + "step": 47280 + }, + { + "epoch": 0.9620157877604166, + "grad_norm": 17.179189682006836, + "learning_rate": 8.643579960456465e-06, + "loss": 5.0459, + "step": 47285 + }, + { + "epoch": 0.9621175130208334, + "grad_norm": 16.283674240112305, + "learning_rate": 8.643306243289212e-06, + "loss": 5.0673, + "step": 47290 + }, + { + "epoch": 0.96221923828125, + "grad_norm": 17.12388801574707, + "learning_rate": 8.643032502842387e-06, + "loss": 5.2136, + "step": 47295 + }, + { + "epoch": 0.9623209635416666, + "grad_norm": 17.95309829711914, + "learning_rate": 8.642758739117743e-06, + "loss": 4.9729, + "step": 47300 + }, + { + "epoch": 0.9624226888020834, + "grad_norm": 14.276808738708496, + "learning_rate": 8.642484952117026e-06, + "loss": 5.0483, + "step": 47305 + }, + { + "epoch": 0.9625244140625, + "grad_norm": 21.121179580688477, + "learning_rate": 8.642211141841986e-06, + "loss": 5.4245, + "step": 47310 + }, + { + "epoch": 0.9626261393229166, + "grad_norm": 18.17770004272461, + "learning_rate": 8.641937308294372e-06, + "loss": 5.2441, + "step": 47315 + }, + { + "epoch": 0.9627278645833334, + "grad_norm": 15.974089622497559, + "learning_rate": 8.641663451475936e-06, + "loss": 5.4071, + "step": 47320 + }, + { + "epoch": 0.96282958984375, + "grad_norm": 24.108003616333008, + "learning_rate": 8.641389571388428e-06, + "loss": 5.2131, + "step": 47325 + }, + { + "epoch": 0.9629313151041666, + "grad_norm": 13.798370361328125, + "learning_rate": 8.641115668033595e-06, + "loss": 5.3122, + "step": 47330 + }, + { + "epoch": 0.9630330403645834, + "grad_norm": 16.29169464111328, + "learning_rate": 8.640841741413188e-06, + "loss": 4.9829, + "step": 47335 + }, + { + "epoch": 0.963134765625, + "grad_norm": 16.070817947387695, + "learning_rate": 8.64056779152896e-06, + "loss": 4.7949, + "step": 47340 + }, + { + "epoch": 0.9632364908854166, + "grad_norm": 16.4368839263916, + "learning_rate": 8.64029381838266e-06, + "loss": 5.0901, + "step": 47345 + }, + { + "epoch": 0.9633382161458334, + "grad_norm": 16.02420425415039, + "learning_rate": 8.640019821976034e-06, + "loss": 4.8561, + "step": 47350 + }, + { + "epoch": 0.96343994140625, + "grad_norm": 19.607341766357422, + "learning_rate": 8.63974580231084e-06, + "loss": 4.7718, + "step": 47355 + }, + { + "epoch": 0.9635416666666666, + "grad_norm": 18.932863235473633, + "learning_rate": 8.639471759388825e-06, + "loss": 5.1392, + "step": 47360 + }, + { + "epoch": 0.9636433919270834, + "grad_norm": 19.980716705322266, + "learning_rate": 8.63919769321174e-06, + "loss": 5.0944, + "step": 47365 + }, + { + "epoch": 0.9637451171875, + "grad_norm": 20.725187301635742, + "learning_rate": 8.63892360378134e-06, + "loss": 5.0703, + "step": 47370 + }, + { + "epoch": 0.9638468424479166, + "grad_norm": 18.048009872436523, + "learning_rate": 8.63864949109937e-06, + "loss": 5.0931, + "step": 47375 + }, + { + "epoch": 0.9639485677083334, + "grad_norm": 17.270587921142578, + "learning_rate": 8.638375355167586e-06, + "loss": 4.9566, + "step": 47380 + }, + { + "epoch": 0.96405029296875, + "grad_norm": 15.883404731750488, + "learning_rate": 8.638101195987738e-06, + "loss": 5.1413, + "step": 47385 + }, + { + "epoch": 0.9641520182291666, + "grad_norm": 14.8269624710083, + "learning_rate": 8.63782701356158e-06, + "loss": 4.9595, + "step": 47390 + }, + { + "epoch": 0.9642537434895834, + "grad_norm": 15.820860862731934, + "learning_rate": 8.63755280789086e-06, + "loss": 5.1888, + "step": 47395 + }, + { + "epoch": 0.96435546875, + "grad_norm": 17.757568359375, + "learning_rate": 8.637278578977332e-06, + "loss": 5.0302, + "step": 47400 + }, + { + "epoch": 0.9644571940104166, + "grad_norm": 16.82150650024414, + "learning_rate": 8.637004326822751e-06, + "loss": 5.0648, + "step": 47405 + }, + { + "epoch": 0.9645589192708334, + "grad_norm": 15.127779960632324, + "learning_rate": 8.636730051428866e-06, + "loss": 4.9787, + "step": 47410 + }, + { + "epoch": 0.96466064453125, + "grad_norm": 15.205060005187988, + "learning_rate": 8.63645575279743e-06, + "loss": 4.8729, + "step": 47415 + }, + { + "epoch": 0.9647623697916666, + "grad_norm": 16.137161254882812, + "learning_rate": 8.636181430930196e-06, + "loss": 5.005, + "step": 47420 + }, + { + "epoch": 0.9648640950520834, + "grad_norm": 15.75645923614502, + "learning_rate": 8.635907085828916e-06, + "loss": 4.8812, + "step": 47425 + }, + { + "epoch": 0.9649658203125, + "grad_norm": 20.2498779296875, + "learning_rate": 8.635632717495347e-06, + "loss": 5.1492, + "step": 47430 + }, + { + "epoch": 0.9650675455729166, + "grad_norm": 15.356266021728516, + "learning_rate": 8.635358325931236e-06, + "loss": 4.9715, + "step": 47435 + }, + { + "epoch": 0.9651692708333334, + "grad_norm": 16.640911102294922, + "learning_rate": 8.63508391113834e-06, + "loss": 4.9544, + "step": 47440 + }, + { + "epoch": 0.96527099609375, + "grad_norm": 15.614727973937988, + "learning_rate": 8.63480947311841e-06, + "loss": 5.2148, + "step": 47445 + }, + { + "epoch": 0.9653727213541666, + "grad_norm": 16.59054946899414, + "learning_rate": 8.634535011873204e-06, + "loss": 5.5675, + "step": 47450 + }, + { + "epoch": 0.9654744466145834, + "grad_norm": 13.007445335388184, + "learning_rate": 8.634260527404472e-06, + "loss": 5.0556, + "step": 47455 + }, + { + "epoch": 0.965576171875, + "grad_norm": 20.86238670349121, + "learning_rate": 8.63398601971397e-06, + "loss": 5.2942, + "step": 47460 + }, + { + "epoch": 0.9656778971354166, + "grad_norm": 19.75774383544922, + "learning_rate": 8.63371148880345e-06, + "loss": 4.9888, + "step": 47465 + }, + { + "epoch": 0.9657796223958334, + "grad_norm": 15.92248249053955, + "learning_rate": 8.633436934674668e-06, + "loss": 5.1869, + "step": 47470 + }, + { + "epoch": 0.96588134765625, + "grad_norm": 17.902524948120117, + "learning_rate": 8.633162357329377e-06, + "loss": 5.0859, + "step": 47475 + }, + { + "epoch": 0.9659830729166666, + "grad_norm": 12.790898323059082, + "learning_rate": 8.63288775676933e-06, + "loss": 5.0555, + "step": 47480 + }, + { + "epoch": 0.9660847981770834, + "grad_norm": 21.425952911376953, + "learning_rate": 8.632613132996285e-06, + "loss": 4.902, + "step": 47485 + }, + { + "epoch": 0.9661865234375, + "grad_norm": 17.206775665283203, + "learning_rate": 8.632338486011995e-06, + "loss": 4.9339, + "step": 47490 + }, + { + "epoch": 0.9662882486979166, + "grad_norm": 16.45416831970215, + "learning_rate": 8.632063815818217e-06, + "loss": 5.0795, + "step": 47495 + }, + { + "epoch": 0.9663899739583334, + "grad_norm": 16.544193267822266, + "learning_rate": 8.631789122416702e-06, + "loss": 5.0562, + "step": 47500 + }, + { + "epoch": 0.96649169921875, + "grad_norm": 19.950756072998047, + "learning_rate": 8.631514405809208e-06, + "loss": 5.1598, + "step": 47505 + }, + { + "epoch": 0.9665934244791666, + "grad_norm": 15.756986618041992, + "learning_rate": 8.631239665997488e-06, + "loss": 4.9475, + "step": 47510 + }, + { + "epoch": 0.9666951497395834, + "grad_norm": 14.31860637664795, + "learning_rate": 8.630964902983303e-06, + "loss": 4.9088, + "step": 47515 + }, + { + "epoch": 0.966796875, + "grad_norm": 19.997364044189453, + "learning_rate": 8.6306901167684e-06, + "loss": 4.9726, + "step": 47520 + }, + { + "epoch": 0.9668986002604166, + "grad_norm": 26.920368194580078, + "learning_rate": 8.630415307354544e-06, + "loss": 4.9065, + "step": 47525 + }, + { + "epoch": 0.9670003255208334, + "grad_norm": 19.664608001708984, + "learning_rate": 8.630140474743485e-06, + "loss": 4.9162, + "step": 47530 + }, + { + "epoch": 0.96710205078125, + "grad_norm": 16.6022891998291, + "learning_rate": 8.62986561893698e-06, + "loss": 5.2091, + "step": 47535 + }, + { + "epoch": 0.9672037760416666, + "grad_norm": 16.590837478637695, + "learning_rate": 8.629590739936789e-06, + "loss": 5.1148, + "step": 47540 + }, + { + "epoch": 0.9673055013020834, + "grad_norm": 17.663402557373047, + "learning_rate": 8.629315837744662e-06, + "loss": 4.8528, + "step": 47545 + }, + { + "epoch": 0.9674072265625, + "grad_norm": 14.134257316589355, + "learning_rate": 8.62904091236236e-06, + "loss": 5.1313, + "step": 47550 + }, + { + "epoch": 0.9675089518229166, + "grad_norm": 18.24213981628418, + "learning_rate": 8.628765963791638e-06, + "loss": 4.9372, + "step": 47555 + }, + { + "epoch": 0.9676106770833334, + "grad_norm": 24.00977897644043, + "learning_rate": 8.628490992034256e-06, + "loss": 4.9172, + "step": 47560 + }, + { + "epoch": 0.96771240234375, + "grad_norm": 13.586995124816895, + "learning_rate": 8.628215997091965e-06, + "loss": 5.2948, + "step": 47565 + }, + { + "epoch": 0.9678141276041666, + "grad_norm": 20.406021118164062, + "learning_rate": 8.627940978966527e-06, + "loss": 4.9555, + "step": 47570 + }, + { + "epoch": 0.9679158528645834, + "grad_norm": 14.494266510009766, + "learning_rate": 8.627665937659697e-06, + "loss": 4.903, + "step": 47575 + }, + { + "epoch": 0.968017578125, + "grad_norm": 15.25130558013916, + "learning_rate": 8.627390873173235e-06, + "loss": 4.9185, + "step": 47580 + }, + { + "epoch": 0.9681193033854166, + "grad_norm": 19.009016036987305, + "learning_rate": 8.627115785508896e-06, + "loss": 5.1056, + "step": 47585 + }, + { + "epoch": 0.9682210286458334, + "grad_norm": 11.948423385620117, + "learning_rate": 8.626840674668437e-06, + "loss": 4.863, + "step": 47590 + }, + { + "epoch": 0.96832275390625, + "grad_norm": 21.88857650756836, + "learning_rate": 8.62656554065362e-06, + "loss": 5.0557, + "step": 47595 + }, + { + "epoch": 0.9684244791666666, + "grad_norm": 18.30642318725586, + "learning_rate": 8.6262903834662e-06, + "loss": 4.8786, + "step": 47600 + }, + { + "epoch": 0.9685262044270834, + "grad_norm": 19.31427001953125, + "learning_rate": 8.626015203107934e-06, + "loss": 4.8605, + "step": 47605 + }, + { + "epoch": 0.9686279296875, + "grad_norm": 16.239696502685547, + "learning_rate": 8.625739999580582e-06, + "loss": 4.7415, + "step": 47610 + }, + { + "epoch": 0.9687296549479166, + "grad_norm": 29.32090187072754, + "learning_rate": 8.625464772885903e-06, + "loss": 4.7186, + "step": 47615 + }, + { + "epoch": 0.9688313802083334, + "grad_norm": 16.425012588500977, + "learning_rate": 8.625189523025657e-06, + "loss": 5.0357, + "step": 47620 + }, + { + "epoch": 0.96893310546875, + "grad_norm": 16.233444213867188, + "learning_rate": 8.624914250001598e-06, + "loss": 5.0834, + "step": 47625 + }, + { + "epoch": 0.9690348307291666, + "grad_norm": 16.213825225830078, + "learning_rate": 8.624638953815487e-06, + "loss": 5.2392, + "step": 47630 + }, + { + "epoch": 0.9691365559895834, + "grad_norm": 15.814722061157227, + "learning_rate": 8.624363634469086e-06, + "loss": 4.9524, + "step": 47635 + }, + { + "epoch": 0.96923828125, + "grad_norm": 18.665069580078125, + "learning_rate": 8.62408829196415e-06, + "loss": 5.0353, + "step": 47640 + }, + { + "epoch": 0.9693400065104166, + "grad_norm": 19.12314224243164, + "learning_rate": 8.623812926302442e-06, + "loss": 4.9548, + "step": 47645 + }, + { + "epoch": 0.9694417317708334, + "grad_norm": 14.975316047668457, + "learning_rate": 8.623537537485718e-06, + "loss": 4.9163, + "step": 47650 + }, + { + "epoch": 0.96954345703125, + "grad_norm": 18.75388526916504, + "learning_rate": 8.623262125515741e-06, + "loss": 4.9561, + "step": 47655 + }, + { + "epoch": 0.9696451822916666, + "grad_norm": 18.12784194946289, + "learning_rate": 8.622986690394268e-06, + "loss": 4.9278, + "step": 47660 + }, + { + "epoch": 0.9697469075520834, + "grad_norm": 16.25618553161621, + "learning_rate": 8.622711232123062e-06, + "loss": 5.1049, + "step": 47665 + }, + { + "epoch": 0.9698486328125, + "grad_norm": 13.293037414550781, + "learning_rate": 8.62243575070388e-06, + "loss": 5.1047, + "step": 47670 + }, + { + "epoch": 0.9699503580729166, + "grad_norm": 17.623443603515625, + "learning_rate": 8.622160246138483e-06, + "loss": 4.9939, + "step": 47675 + }, + { + "epoch": 0.9700520833333334, + "grad_norm": 18.225627899169922, + "learning_rate": 8.621884718428632e-06, + "loss": 4.9976, + "step": 47680 + }, + { + "epoch": 0.97015380859375, + "grad_norm": 17.206146240234375, + "learning_rate": 8.621609167576089e-06, + "loss": 4.8631, + "step": 47685 + }, + { + "epoch": 0.9702555338541666, + "grad_norm": 22.549917221069336, + "learning_rate": 8.621333593582612e-06, + "loss": 5.0409, + "step": 47690 + }, + { + "epoch": 0.9703572591145834, + "grad_norm": 18.568618774414062, + "learning_rate": 8.621057996449963e-06, + "loss": 5.0969, + "step": 47695 + }, + { + "epoch": 0.970458984375, + "grad_norm": 16.182159423828125, + "learning_rate": 8.620782376179901e-06, + "loss": 4.7361, + "step": 47700 + }, + { + "epoch": 0.9705607096354166, + "grad_norm": 17.954668045043945, + "learning_rate": 8.62050673277419e-06, + "loss": 4.8638, + "step": 47705 + }, + { + "epoch": 0.9706624348958334, + "grad_norm": 11.490238189697266, + "learning_rate": 8.620231066234592e-06, + "loss": 5.0847, + "step": 47710 + }, + { + "epoch": 0.97076416015625, + "grad_norm": 24.71381187438965, + "learning_rate": 8.619955376562865e-06, + "loss": 4.7597, + "step": 47715 + }, + { + "epoch": 0.9708658854166666, + "grad_norm": 24.324234008789062, + "learning_rate": 8.619679663760773e-06, + "loss": 5.1504, + "step": 47720 + }, + { + "epoch": 0.9709676106770834, + "grad_norm": 18.237749099731445, + "learning_rate": 8.619403927830077e-06, + "loss": 5.0379, + "step": 47725 + }, + { + "epoch": 0.9710693359375, + "grad_norm": 15.128486633300781, + "learning_rate": 8.61912816877254e-06, + "loss": 4.9381, + "step": 47730 + }, + { + "epoch": 0.9711710611979166, + "grad_norm": 16.387222290039062, + "learning_rate": 8.61885238658992e-06, + "loss": 5.1714, + "step": 47735 + }, + { + "epoch": 0.9712727864583334, + "grad_norm": 24.838491439819336, + "learning_rate": 8.618576581283983e-06, + "loss": 5.0819, + "step": 47740 + }, + { + "epoch": 0.97137451171875, + "grad_norm": 21.441768646240234, + "learning_rate": 8.618300752856492e-06, + "loss": 4.9046, + "step": 47745 + }, + { + "epoch": 0.9714762369791666, + "grad_norm": 14.206487655639648, + "learning_rate": 8.618024901309206e-06, + "loss": 4.9386, + "step": 47750 + }, + { + "epoch": 0.9715779622395834, + "grad_norm": 18.309350967407227, + "learning_rate": 8.617749026643892e-06, + "loss": 5.1894, + "step": 47755 + }, + { + "epoch": 0.9716796875, + "grad_norm": 18.76384735107422, + "learning_rate": 8.617473128862307e-06, + "loss": 5.0716, + "step": 47760 + }, + { + "epoch": 0.9717814127604166, + "grad_norm": 16.43670082092285, + "learning_rate": 8.617197207966217e-06, + "loss": 4.9821, + "step": 47765 + }, + { + "epoch": 0.9718831380208334, + "grad_norm": 14.364315032958984, + "learning_rate": 8.616921263957387e-06, + "loss": 5.1188, + "step": 47770 + }, + { + "epoch": 0.97198486328125, + "grad_norm": 21.393306732177734, + "learning_rate": 8.616645296837578e-06, + "loss": 5.2092, + "step": 47775 + }, + { + "epoch": 0.9720865885416666, + "grad_norm": 15.536739349365234, + "learning_rate": 8.616369306608552e-06, + "loss": 4.9029, + "step": 47780 + }, + { + "epoch": 0.9721883138020834, + "grad_norm": 18.200546264648438, + "learning_rate": 8.616093293272076e-06, + "loss": 5.2359, + "step": 47785 + }, + { + "epoch": 0.9722900390625, + "grad_norm": 17.95948600769043, + "learning_rate": 8.615817256829912e-06, + "loss": 4.9186, + "step": 47790 + }, + { + "epoch": 0.9723917643229166, + "grad_norm": 15.869820594787598, + "learning_rate": 8.615541197283824e-06, + "loss": 4.8266, + "step": 47795 + }, + { + "epoch": 0.9724934895833334, + "grad_norm": 13.48196029663086, + "learning_rate": 8.615265114635572e-06, + "loss": 5.0991, + "step": 47800 + }, + { + "epoch": 0.97259521484375, + "grad_norm": 15.35083293914795, + "learning_rate": 8.614989008886927e-06, + "loss": 5.086, + "step": 47805 + }, + { + "epoch": 0.9726969401041666, + "grad_norm": 13.864507675170898, + "learning_rate": 8.614712880039648e-06, + "loss": 5.0477, + "step": 47810 + }, + { + "epoch": 0.9727986653645834, + "grad_norm": 16.146514892578125, + "learning_rate": 8.614436728095502e-06, + "loss": 5.2936, + "step": 47815 + }, + { + "epoch": 0.972900390625, + "grad_norm": 17.701894760131836, + "learning_rate": 8.614160553056251e-06, + "loss": 5.3143, + "step": 47820 + }, + { + "epoch": 0.9730021158854166, + "grad_norm": 15.18911075592041, + "learning_rate": 8.613884354923664e-06, + "loss": 5.0779, + "step": 47825 + }, + { + "epoch": 0.9731038411458334, + "grad_norm": 13.478972434997559, + "learning_rate": 8.6136081336995e-06, + "loss": 5.0012, + "step": 47830 + }, + { + "epoch": 0.97320556640625, + "grad_norm": 20.784061431884766, + "learning_rate": 8.61333188938553e-06, + "loss": 4.767, + "step": 47835 + }, + { + "epoch": 0.9733072916666666, + "grad_norm": 23.921445846557617, + "learning_rate": 8.613055621983516e-06, + "loss": 4.9788, + "step": 47840 + }, + { + "epoch": 0.9734090169270834, + "grad_norm": 17.004701614379883, + "learning_rate": 8.612779331495222e-06, + "loss": 4.8863, + "step": 47845 + }, + { + "epoch": 0.9735107421875, + "grad_norm": 15.631455421447754, + "learning_rate": 8.612503017922415e-06, + "loss": 5.0236, + "step": 47850 + }, + { + "epoch": 0.9736124674479166, + "grad_norm": 16.01015853881836, + "learning_rate": 8.612226681266863e-06, + "loss": 5.1512, + "step": 47855 + }, + { + "epoch": 0.9737141927083334, + "grad_norm": 19.166725158691406, + "learning_rate": 8.611950321530327e-06, + "loss": 4.9563, + "step": 47860 + }, + { + "epoch": 0.97381591796875, + "grad_norm": 14.222599029541016, + "learning_rate": 8.611673938714576e-06, + "loss": 5.0326, + "step": 47865 + }, + { + "epoch": 0.9739176432291666, + "grad_norm": 14.192057609558105, + "learning_rate": 8.611397532821373e-06, + "loss": 4.7801, + "step": 47870 + }, + { + "epoch": 0.9740193684895834, + "grad_norm": 22.753084182739258, + "learning_rate": 8.611121103852489e-06, + "loss": 5.2415, + "step": 47875 + }, + { + "epoch": 0.97412109375, + "grad_norm": 14.329468727111816, + "learning_rate": 8.610844651809686e-06, + "loss": 5.1323, + "step": 47880 + }, + { + "epoch": 0.9742228190104166, + "grad_norm": 20.680084228515625, + "learning_rate": 8.610568176694731e-06, + "loss": 5.0139, + "step": 47885 + }, + { + "epoch": 0.9743245442708334, + "grad_norm": 17.30199432373047, + "learning_rate": 8.610291678509393e-06, + "loss": 5.2238, + "step": 47890 + }, + { + "epoch": 0.97442626953125, + "grad_norm": 17.627986907958984, + "learning_rate": 8.610015157255438e-06, + "loss": 4.9507, + "step": 47895 + }, + { + "epoch": 0.9745279947916666, + "grad_norm": 12.587896347045898, + "learning_rate": 8.60973861293463e-06, + "loss": 5.1613, + "step": 47900 + }, + { + "epoch": 0.9746297200520834, + "grad_norm": 15.520218849182129, + "learning_rate": 8.609462045548738e-06, + "loss": 5.2874, + "step": 47905 + }, + { + "epoch": 0.9747314453125, + "grad_norm": 16.983911514282227, + "learning_rate": 8.609185455099531e-06, + "loss": 4.9577, + "step": 47910 + }, + { + "epoch": 0.9748331705729166, + "grad_norm": 20.318058013916016, + "learning_rate": 8.608908841588775e-06, + "loss": 5.2258, + "step": 47915 + }, + { + "epoch": 0.9749348958333334, + "grad_norm": 14.944866180419922, + "learning_rate": 8.608632205018235e-06, + "loss": 4.9994, + "step": 47920 + }, + { + "epoch": 0.97503662109375, + "grad_norm": 17.289003372192383, + "learning_rate": 8.608355545389682e-06, + "loss": 5.169, + "step": 47925 + }, + { + "epoch": 0.9751383463541666, + "grad_norm": 15.275737762451172, + "learning_rate": 8.608078862704883e-06, + "loss": 4.8666, + "step": 47930 + }, + { + "epoch": 0.9752400716145834, + "grad_norm": 25.068113327026367, + "learning_rate": 8.607802156965603e-06, + "loss": 5.0096, + "step": 47935 + }, + { + "epoch": 0.975341796875, + "grad_norm": 17.659042358398438, + "learning_rate": 8.607525428173615e-06, + "loss": 5.1772, + "step": 47940 + }, + { + "epoch": 0.9754435221354166, + "grad_norm": 16.913240432739258, + "learning_rate": 8.607248676330685e-06, + "loss": 4.8793, + "step": 47945 + }, + { + "epoch": 0.9755452473958334, + "grad_norm": 15.044074058532715, + "learning_rate": 8.60697190143858e-06, + "loss": 4.9845, + "step": 47950 + }, + { + "epoch": 0.97564697265625, + "grad_norm": 18.013328552246094, + "learning_rate": 8.606695103499069e-06, + "loss": 5.2351, + "step": 47955 + }, + { + "epoch": 0.9757486979166666, + "grad_norm": 19.628738403320312, + "learning_rate": 8.606418282513921e-06, + "loss": 5.1079, + "step": 47960 + }, + { + "epoch": 0.9758504231770834, + "grad_norm": 19.525157928466797, + "learning_rate": 8.606141438484907e-06, + "loss": 5.0058, + "step": 47965 + }, + { + "epoch": 0.9759521484375, + "grad_norm": 17.376611709594727, + "learning_rate": 8.605864571413793e-06, + "loss": 5.1724, + "step": 47970 + }, + { + "epoch": 0.9760538736979166, + "grad_norm": 14.289008140563965, + "learning_rate": 8.605587681302347e-06, + "loss": 5.0795, + "step": 47975 + }, + { + "epoch": 0.9761555989583334, + "grad_norm": 14.451820373535156, + "learning_rate": 8.605310768152343e-06, + "loss": 5.0184, + "step": 47980 + }, + { + "epoch": 0.97625732421875, + "grad_norm": 14.795913696289062, + "learning_rate": 8.605033831965547e-06, + "loss": 5.2098, + "step": 47985 + }, + { + "epoch": 0.9763590494791666, + "grad_norm": 15.251574516296387, + "learning_rate": 8.604756872743728e-06, + "loss": 4.9282, + "step": 47990 + }, + { + "epoch": 0.9764607747395834, + "grad_norm": 16.250568389892578, + "learning_rate": 8.60447989048866e-06, + "loss": 4.8132, + "step": 47995 + }, + { + "epoch": 0.9765625, + "grad_norm": 16.180849075317383, + "learning_rate": 8.604202885202107e-06, + "loss": 5.1332, + "step": 48000 + }, + { + "epoch": 0.9766642252604166, + "grad_norm": 19.792095184326172, + "learning_rate": 8.603925856885842e-06, + "loss": 5.0108, + "step": 48005 + }, + { + "epoch": 0.9767659505208334, + "grad_norm": 20.676713943481445, + "learning_rate": 8.603648805541634e-06, + "loss": 5.0429, + "step": 48010 + }, + { + "epoch": 0.97686767578125, + "grad_norm": 18.098581314086914, + "learning_rate": 8.603371731171256e-06, + "loss": 4.9953, + "step": 48015 + }, + { + "epoch": 0.9769694010416666, + "grad_norm": 17.066219329833984, + "learning_rate": 8.603094633776476e-06, + "loss": 4.9976, + "step": 48020 + }, + { + "epoch": 0.9770711263020834, + "grad_norm": 15.654402732849121, + "learning_rate": 8.602817513359064e-06, + "loss": 4.8989, + "step": 48025 + }, + { + "epoch": 0.9771728515625, + "grad_norm": 19.57740592956543, + "learning_rate": 8.602540369920793e-06, + "loss": 4.9349, + "step": 48030 + }, + { + "epoch": 0.9772745768229166, + "grad_norm": 17.65016746520996, + "learning_rate": 8.602263203463432e-06, + "loss": 5.1575, + "step": 48035 + }, + { + "epoch": 0.9773763020833334, + "grad_norm": 16.83289337158203, + "learning_rate": 8.601986013988753e-06, + "loss": 4.7464, + "step": 48040 + }, + { + "epoch": 0.97747802734375, + "grad_norm": 16.774520874023438, + "learning_rate": 8.601708801498526e-06, + "loss": 4.8298, + "step": 48045 + }, + { + "epoch": 0.9775797526041666, + "grad_norm": 19.246662139892578, + "learning_rate": 8.601431565994524e-06, + "loss": 5.0458, + "step": 48050 + }, + { + "epoch": 0.9776814778645834, + "grad_norm": 17.460216522216797, + "learning_rate": 8.601154307478517e-06, + "loss": 4.9457, + "step": 48055 + }, + { + "epoch": 0.977783203125, + "grad_norm": 21.582183837890625, + "learning_rate": 8.600877025952276e-06, + "loss": 5.1306, + "step": 48060 + }, + { + "epoch": 0.9778849283854166, + "grad_norm": 16.89261245727539, + "learning_rate": 8.600599721417576e-06, + "loss": 5.0695, + "step": 48065 + }, + { + "epoch": 0.9779866536458334, + "grad_norm": 15.822879791259766, + "learning_rate": 8.600322393876185e-06, + "loss": 4.8451, + "step": 48070 + }, + { + "epoch": 0.97808837890625, + "grad_norm": 15.746513366699219, + "learning_rate": 8.60004504332988e-06, + "loss": 5.0496, + "step": 48075 + }, + { + "epoch": 0.9781901041666666, + "grad_norm": 19.222917556762695, + "learning_rate": 8.599767669780427e-06, + "loss": 5.3149, + "step": 48080 + }, + { + "epoch": 0.9782918294270834, + "grad_norm": 16.821910858154297, + "learning_rate": 8.599490273229602e-06, + "loss": 5.004, + "step": 48085 + }, + { + "epoch": 0.9783935546875, + "grad_norm": 14.243106842041016, + "learning_rate": 8.599212853679178e-06, + "loss": 5.2702, + "step": 48090 + }, + { + "epoch": 0.9784952799479166, + "grad_norm": 14.419164657592773, + "learning_rate": 8.598935411130925e-06, + "loss": 4.816, + "step": 48095 + }, + { + "epoch": 0.9785970052083334, + "grad_norm": 18.010141372680664, + "learning_rate": 8.598657945586617e-06, + "loss": 5.1226, + "step": 48100 + }, + { + "epoch": 0.97869873046875, + "grad_norm": 20.692602157592773, + "learning_rate": 8.598380457048027e-06, + "loss": 4.9333, + "step": 48105 + }, + { + "epoch": 0.9788004557291666, + "grad_norm": 20.44984245300293, + "learning_rate": 8.598102945516928e-06, + "loss": 5.1309, + "step": 48110 + }, + { + "epoch": 0.9789021809895834, + "grad_norm": 17.45133399963379, + "learning_rate": 8.597825410995095e-06, + "loss": 4.9836, + "step": 48115 + }, + { + "epoch": 0.97900390625, + "grad_norm": 17.51549530029297, + "learning_rate": 8.597547853484298e-06, + "loss": 4.8322, + "step": 48120 + }, + { + "epoch": 0.9791056315104166, + "grad_norm": 22.133041381835938, + "learning_rate": 8.597270272986313e-06, + "loss": 5.17, + "step": 48125 + }, + { + "epoch": 0.9792073567708334, + "grad_norm": 13.939135551452637, + "learning_rate": 8.596992669502913e-06, + "loss": 5.132, + "step": 48130 + }, + { + "epoch": 0.97930908203125, + "grad_norm": 12.797228813171387, + "learning_rate": 8.596715043035873e-06, + "loss": 5.1133, + "step": 48135 + }, + { + "epoch": 0.9794108072916666, + "grad_norm": 16.605308532714844, + "learning_rate": 8.596437393586963e-06, + "loss": 5.0591, + "step": 48140 + }, + { + "epoch": 0.9795125325520834, + "grad_norm": 22.022796630859375, + "learning_rate": 8.596159721157961e-06, + "loss": 5.1371, + "step": 48145 + }, + { + "epoch": 0.9796142578125, + "grad_norm": 14.949295997619629, + "learning_rate": 8.595882025750641e-06, + "loss": 5.1331, + "step": 48150 + }, + { + "epoch": 0.9797159830729166, + "grad_norm": 22.875045776367188, + "learning_rate": 8.595604307366775e-06, + "loss": 4.9916, + "step": 48155 + }, + { + "epoch": 0.9798177083333334, + "grad_norm": 19.577211380004883, + "learning_rate": 8.595326566008137e-06, + "loss": 4.9288, + "step": 48160 + }, + { + "epoch": 0.97991943359375, + "grad_norm": 15.640681266784668, + "learning_rate": 8.595048801676506e-06, + "loss": 5.006, + "step": 48165 + }, + { + "epoch": 0.9800211588541666, + "grad_norm": 14.377607345581055, + "learning_rate": 8.594771014373655e-06, + "loss": 4.9875, + "step": 48170 + }, + { + "epoch": 0.9801228841145834, + "grad_norm": 16.34181785583496, + "learning_rate": 8.594493204101357e-06, + "loss": 5.1334, + "step": 48175 + }, + { + "epoch": 0.980224609375, + "grad_norm": 18.48345375061035, + "learning_rate": 8.59421537086139e-06, + "loss": 4.8288, + "step": 48180 + }, + { + "epoch": 0.9803263346354166, + "grad_norm": 17.775325775146484, + "learning_rate": 8.593937514655527e-06, + "loss": 5.0168, + "step": 48185 + }, + { + "epoch": 0.9804280598958334, + "grad_norm": 13.297990798950195, + "learning_rate": 8.593659635485544e-06, + "loss": 5.0731, + "step": 48190 + }, + { + "epoch": 0.98052978515625, + "grad_norm": 15.682979583740234, + "learning_rate": 8.593381733353214e-06, + "loss": 4.7323, + "step": 48195 + }, + { + "epoch": 0.9806315104166666, + "grad_norm": 15.588661193847656, + "learning_rate": 8.59310380826032e-06, + "loss": 4.8786, + "step": 48200 + }, + { + "epoch": 0.9807332356770834, + "grad_norm": 17.32611083984375, + "learning_rate": 8.59282586020863e-06, + "loss": 5.289, + "step": 48205 + }, + { + "epoch": 0.9808349609375, + "grad_norm": 16.609392166137695, + "learning_rate": 8.592547889199924e-06, + "loss": 5.1798, + "step": 48210 + }, + { + "epoch": 0.9809366861979166, + "grad_norm": 18.68968963623047, + "learning_rate": 8.592269895235979e-06, + "loss": 4.9669, + "step": 48215 + }, + { + "epoch": 0.9810384114583334, + "grad_norm": 17.000167846679688, + "learning_rate": 8.591991878318569e-06, + "loss": 5.1429, + "step": 48220 + }, + { + "epoch": 0.98114013671875, + "grad_norm": 16.36261558532715, + "learning_rate": 8.59171383844947e-06, + "loss": 5.3064, + "step": 48225 + }, + { + "epoch": 0.9812418619791666, + "grad_norm": 13.26115894317627, + "learning_rate": 8.59143577563046e-06, + "loss": 4.9516, + "step": 48230 + }, + { + "epoch": 0.9813435872395834, + "grad_norm": 14.614829063415527, + "learning_rate": 8.591157689863316e-06, + "loss": 5.0258, + "step": 48235 + }, + { + "epoch": 0.9814453125, + "grad_norm": 15.819396018981934, + "learning_rate": 8.590879581149814e-06, + "loss": 4.9629, + "step": 48240 + }, + { + "epoch": 0.9815470377604166, + "grad_norm": 13.339287757873535, + "learning_rate": 8.590601449491731e-06, + "loss": 5.0329, + "step": 48245 + }, + { + "epoch": 0.9816487630208334, + "grad_norm": 19.991914749145508, + "learning_rate": 8.590323294890845e-06, + "loss": 5.0776, + "step": 48250 + }, + { + "epoch": 0.98175048828125, + "grad_norm": 19.38823127746582, + "learning_rate": 8.590045117348933e-06, + "loss": 5.2512, + "step": 48255 + }, + { + "epoch": 0.9818522135416666, + "grad_norm": 22.173141479492188, + "learning_rate": 8.589766916867774e-06, + "loss": 5.0809, + "step": 48260 + }, + { + "epoch": 0.9819539388020834, + "grad_norm": 13.00747013092041, + "learning_rate": 8.589488693449142e-06, + "loss": 4.9306, + "step": 48265 + }, + { + "epoch": 0.9820556640625, + "grad_norm": 15.463179588317871, + "learning_rate": 8.589210447094816e-06, + "loss": 5.0567, + "step": 48270 + }, + { + "epoch": 0.9821573893229166, + "grad_norm": 15.999430656433105, + "learning_rate": 8.588932177806575e-06, + "loss": 4.9748, + "step": 48275 + }, + { + "epoch": 0.9822591145833334, + "grad_norm": 18.75946044921875, + "learning_rate": 8.588653885586198e-06, + "loss": 5.2275, + "step": 48280 + }, + { + "epoch": 0.98236083984375, + "grad_norm": 18.816761016845703, + "learning_rate": 8.58837557043546e-06, + "loss": 4.888, + "step": 48285 + }, + { + "epoch": 0.9824625651041666, + "grad_norm": 13.929689407348633, + "learning_rate": 8.588097232356144e-06, + "loss": 4.9801, + "step": 48290 + }, + { + "epoch": 0.9825642903645834, + "grad_norm": 15.878829002380371, + "learning_rate": 8.587818871350025e-06, + "loss": 5.0754, + "step": 48295 + }, + { + "epoch": 0.982666015625, + "grad_norm": 17.089750289916992, + "learning_rate": 8.58754048741888e-06, + "loss": 5.1405, + "step": 48300 + }, + { + "epoch": 0.9827677408854166, + "grad_norm": 15.435402870178223, + "learning_rate": 8.587262080564491e-06, + "loss": 4.8705, + "step": 48305 + }, + { + "epoch": 0.9828694661458334, + "grad_norm": 22.29397201538086, + "learning_rate": 8.586983650788637e-06, + "loss": 4.9781, + "step": 48310 + }, + { + "epoch": 0.98297119140625, + "grad_norm": 13.164569854736328, + "learning_rate": 8.586705198093096e-06, + "loss": 5.0561, + "step": 48315 + }, + { + "epoch": 0.9830729166666666, + "grad_norm": 16.15287208557129, + "learning_rate": 8.586426722479647e-06, + "loss": 5.1941, + "step": 48320 + }, + { + "epoch": 0.9831746419270834, + "grad_norm": 18.255855560302734, + "learning_rate": 8.58614822395007e-06, + "loss": 5.187, + "step": 48325 + }, + { + "epoch": 0.9832763671875, + "grad_norm": 16.320743560791016, + "learning_rate": 8.585869702506145e-06, + "loss": 4.8491, + "step": 48330 + }, + { + "epoch": 0.9833780924479166, + "grad_norm": 18.013917922973633, + "learning_rate": 8.585591158149649e-06, + "loss": 4.7892, + "step": 48335 + }, + { + "epoch": 0.9834798177083334, + "grad_norm": 20.711654663085938, + "learning_rate": 8.585312590882364e-06, + "loss": 4.6835, + "step": 48340 + }, + { + "epoch": 0.98358154296875, + "grad_norm": 21.310176849365234, + "learning_rate": 8.585034000706071e-06, + "loss": 5.0614, + "step": 48345 + }, + { + "epoch": 0.9836832682291666, + "grad_norm": 19.158296585083008, + "learning_rate": 8.584755387622548e-06, + "loss": 5.1192, + "step": 48350 + }, + { + "epoch": 0.9837849934895834, + "grad_norm": 16.35950469970703, + "learning_rate": 8.584476751633576e-06, + "loss": 4.8699, + "step": 48355 + }, + { + "epoch": 0.98388671875, + "grad_norm": 13.358195304870605, + "learning_rate": 8.584198092740935e-06, + "loss": 5.043, + "step": 48360 + }, + { + "epoch": 0.9839884440104166, + "grad_norm": 16.256473541259766, + "learning_rate": 8.583919410946406e-06, + "loss": 5.2204, + "step": 48365 + }, + { + "epoch": 0.9840901692708334, + "grad_norm": 12.77108383178711, + "learning_rate": 8.583640706251772e-06, + "loss": 4.6554, + "step": 48370 + }, + { + "epoch": 0.98419189453125, + "grad_norm": 18.56621742248535, + "learning_rate": 8.58336197865881e-06, + "loss": 5.0843, + "step": 48375 + }, + { + "epoch": 0.9842936197916666, + "grad_norm": 16.902462005615234, + "learning_rate": 8.583083228169301e-06, + "loss": 4.8181, + "step": 48380 + }, + { + "epoch": 0.9843953450520834, + "grad_norm": 21.515079498291016, + "learning_rate": 8.582804454785028e-06, + "loss": 5.1672, + "step": 48385 + }, + { + "epoch": 0.9844970703125, + "grad_norm": 22.632240295410156, + "learning_rate": 8.582525658507773e-06, + "loss": 5.0244, + "step": 48390 + }, + { + "epoch": 0.9845987955729166, + "grad_norm": 17.210006713867188, + "learning_rate": 8.582246839339316e-06, + "loss": 5.1042, + "step": 48395 + }, + { + "epoch": 0.9847005208333334, + "grad_norm": 20.883289337158203, + "learning_rate": 8.58196799728144e-06, + "loss": 5.1849, + "step": 48400 + }, + { + "epoch": 0.98480224609375, + "grad_norm": 16.0712833404541, + "learning_rate": 8.581689132335923e-06, + "loss": 4.8478, + "step": 48405 + }, + { + "epoch": 0.9849039713541666, + "grad_norm": 17.38435173034668, + "learning_rate": 8.58141024450455e-06, + "loss": 4.9801, + "step": 48410 + }, + { + "epoch": 0.9850056966145834, + "grad_norm": 16.22496795654297, + "learning_rate": 8.581131333789104e-06, + "loss": 4.9508, + "step": 48415 + }, + { + "epoch": 0.985107421875, + "grad_norm": 13.973748207092285, + "learning_rate": 8.580852400191365e-06, + "loss": 5.1335, + "step": 48420 + }, + { + "epoch": 0.9852091471354166, + "grad_norm": 19.30055046081543, + "learning_rate": 8.580573443713115e-06, + "loss": 5.1104, + "step": 48425 + }, + { + "epoch": 0.9853108723958334, + "grad_norm": 20.015560150146484, + "learning_rate": 8.580294464356138e-06, + "loss": 5.0141, + "step": 48430 + }, + { + "epoch": 0.98541259765625, + "grad_norm": 19.80840301513672, + "learning_rate": 8.580015462122218e-06, + "loss": 4.9852, + "step": 48435 + }, + { + "epoch": 0.9855143229166666, + "grad_norm": 16.322261810302734, + "learning_rate": 8.579736437013133e-06, + "loss": 4.6724, + "step": 48440 + }, + { + "epoch": 0.9856160481770834, + "grad_norm": 17.68556785583496, + "learning_rate": 8.579457389030668e-06, + "loss": 4.9644, + "step": 48445 + }, + { + "epoch": 0.9857177734375, + "grad_norm": 27.968402862548828, + "learning_rate": 8.579178318176608e-06, + "loss": 5.0592, + "step": 48450 + }, + { + "epoch": 0.9858194986979166, + "grad_norm": 18.399112701416016, + "learning_rate": 8.578899224452734e-06, + "loss": 5.0041, + "step": 48455 + }, + { + "epoch": 0.9859212239583334, + "grad_norm": 14.329474449157715, + "learning_rate": 8.57862010786083e-06, + "loss": 4.8216, + "step": 48460 + }, + { + "epoch": 0.98602294921875, + "grad_norm": 20.66102409362793, + "learning_rate": 8.57834096840268e-06, + "loss": 5.1294, + "step": 48465 + }, + { + "epoch": 0.9861246744791666, + "grad_norm": 19.324031829833984, + "learning_rate": 8.578061806080066e-06, + "loss": 4.7413, + "step": 48470 + }, + { + "epoch": 0.9862263997395834, + "grad_norm": 16.12556266784668, + "learning_rate": 8.577782620894775e-06, + "loss": 4.9753, + "step": 48475 + }, + { + "epoch": 0.986328125, + "grad_norm": 26.275325775146484, + "learning_rate": 8.577503412848589e-06, + "loss": 5.0684, + "step": 48480 + }, + { + "epoch": 0.9864298502604166, + "grad_norm": 17.495737075805664, + "learning_rate": 8.577224181943289e-06, + "loss": 5.1703, + "step": 48485 + }, + { + "epoch": 0.9865315755208334, + "grad_norm": 25.04590606689453, + "learning_rate": 8.576944928180663e-06, + "loss": 4.8142, + "step": 48490 + }, + { + "epoch": 0.98663330078125, + "grad_norm": 13.589649200439453, + "learning_rate": 8.576665651562496e-06, + "loss": 5.0997, + "step": 48495 + }, + { + "epoch": 0.9867350260416666, + "grad_norm": 21.160282135009766, + "learning_rate": 8.57638635209057e-06, + "loss": 4.968, + "step": 48500 + }, + { + "epoch": 0.9868367513020834, + "grad_norm": 23.064176559448242, + "learning_rate": 8.57610702976667e-06, + "loss": 4.9051, + "step": 48505 + }, + { + "epoch": 0.9869384765625, + "grad_norm": 13.44221019744873, + "learning_rate": 8.575827684592581e-06, + "loss": 4.8711, + "step": 48510 + }, + { + "epoch": 0.9870402018229166, + "grad_norm": 13.303998947143555, + "learning_rate": 8.57554831657009e-06, + "loss": 4.838, + "step": 48515 + }, + { + "epoch": 0.9871419270833334, + "grad_norm": 20.062419891357422, + "learning_rate": 8.57526892570098e-06, + "loss": 4.7644, + "step": 48520 + }, + { + "epoch": 0.98724365234375, + "grad_norm": 13.35070514678955, + "learning_rate": 8.574989511987035e-06, + "loss": 4.9522, + "step": 48525 + }, + { + "epoch": 0.9873453776041666, + "grad_norm": 18.8725528717041, + "learning_rate": 8.57471007543004e-06, + "loss": 5.166, + "step": 48530 + }, + { + "epoch": 0.9874471028645834, + "grad_norm": 18.79351043701172, + "learning_rate": 8.574430616031786e-06, + "loss": 5.0887, + "step": 48535 + }, + { + "epoch": 0.987548828125, + "grad_norm": 18.280136108398438, + "learning_rate": 8.574151133794056e-06, + "loss": 5.156, + "step": 48540 + }, + { + "epoch": 0.9876505533854166, + "grad_norm": 17.429096221923828, + "learning_rate": 8.573871628718631e-06, + "loss": 4.8611, + "step": 48545 + }, + { + "epoch": 0.9877522786458334, + "grad_norm": 12.448982238769531, + "learning_rate": 8.573592100807304e-06, + "loss": 5.0447, + "step": 48550 + }, + { + "epoch": 0.98785400390625, + "grad_norm": 15.132949829101562, + "learning_rate": 8.573312550061857e-06, + "loss": 4.9473, + "step": 48555 + }, + { + "epoch": 0.9879557291666666, + "grad_norm": 14.896417617797852, + "learning_rate": 8.573032976484074e-06, + "loss": 4.9322, + "step": 48560 + }, + { + "epoch": 0.9880574544270834, + "grad_norm": 17.252986907958984, + "learning_rate": 8.572753380075749e-06, + "loss": 4.9284, + "step": 48565 + }, + { + "epoch": 0.9881591796875, + "grad_norm": 18.381006240844727, + "learning_rate": 8.572473760838662e-06, + "loss": 4.9098, + "step": 48570 + }, + { + "epoch": 0.9882609049479166, + "grad_norm": 17.327102661132812, + "learning_rate": 8.572194118774601e-06, + "loss": 4.9366, + "step": 48575 + }, + { + "epoch": 0.9883626302083334, + "grad_norm": 17.353179931640625, + "learning_rate": 8.571914453885354e-06, + "loss": 5.1293, + "step": 48580 + }, + { + "epoch": 0.98846435546875, + "grad_norm": 16.711349487304688, + "learning_rate": 8.571634766172708e-06, + "loss": 4.775, + "step": 48585 + }, + { + "epoch": 0.9885660807291666, + "grad_norm": 16.298568725585938, + "learning_rate": 8.571355055638451e-06, + "loss": 4.8698, + "step": 48590 + }, + { + "epoch": 0.9886678059895834, + "grad_norm": 15.778936386108398, + "learning_rate": 8.571075322284368e-06, + "loss": 4.8619, + "step": 48595 + }, + { + "epoch": 0.98876953125, + "grad_norm": 12.92691707611084, + "learning_rate": 8.570795566112245e-06, + "loss": 4.9325, + "step": 48600 + }, + { + "epoch": 0.9888712565104166, + "grad_norm": 20.552162170410156, + "learning_rate": 8.570515787123874e-06, + "loss": 5.1152, + "step": 48605 + }, + { + "epoch": 0.9889729817708334, + "grad_norm": 21.989990234375, + "learning_rate": 8.57023598532104e-06, + "loss": 5.0976, + "step": 48610 + }, + { + "epoch": 0.98907470703125, + "grad_norm": 18.947681427001953, + "learning_rate": 8.569956160705532e-06, + "loss": 4.9484, + "step": 48615 + }, + { + "epoch": 0.9891764322916666, + "grad_norm": 17.64944076538086, + "learning_rate": 8.569676313279137e-06, + "loss": 4.8971, + "step": 48620 + }, + { + "epoch": 0.9892781575520834, + "grad_norm": 23.45442008972168, + "learning_rate": 8.569396443043643e-06, + "loss": 4.977, + "step": 48625 + }, + { + "epoch": 0.9893798828125, + "grad_norm": 21.956850051879883, + "learning_rate": 8.56911655000084e-06, + "loss": 5.2568, + "step": 48630 + }, + { + "epoch": 0.9894816080729166, + "grad_norm": 19.147899627685547, + "learning_rate": 8.568836634152516e-06, + "loss": 4.8621, + "step": 48635 + }, + { + "epoch": 0.9895833333333334, + "grad_norm": 23.147754669189453, + "learning_rate": 8.568556695500456e-06, + "loss": 5.2636, + "step": 48640 + }, + { + "epoch": 0.98968505859375, + "grad_norm": 13.837553977966309, + "learning_rate": 8.568276734046452e-06, + "loss": 5.0291, + "step": 48645 + }, + { + "epoch": 0.9897867838541666, + "grad_norm": 17.85003662109375, + "learning_rate": 8.567996749792293e-06, + "loss": 4.9108, + "step": 48650 + }, + { + "epoch": 0.9898885091145834, + "grad_norm": 17.80352020263672, + "learning_rate": 8.567716742739769e-06, + "loss": 4.9022, + "step": 48655 + }, + { + "epoch": 0.989990234375, + "grad_norm": 15.873879432678223, + "learning_rate": 8.567436712890666e-06, + "loss": 4.8728, + "step": 48660 + }, + { + "epoch": 0.9900919596354166, + "grad_norm": 23.215435028076172, + "learning_rate": 8.567156660246774e-06, + "loss": 5.1023, + "step": 48665 + }, + { + "epoch": 0.9901936848958334, + "grad_norm": 14.607815742492676, + "learning_rate": 8.566876584809885e-06, + "loss": 5.0509, + "step": 48670 + }, + { + "epoch": 0.99029541015625, + "grad_norm": 18.6451358795166, + "learning_rate": 8.566596486581786e-06, + "loss": 5.0569, + "step": 48675 + }, + { + "epoch": 0.9903971354166666, + "grad_norm": 16.561294555664062, + "learning_rate": 8.566316365564268e-06, + "loss": 4.9881, + "step": 48680 + }, + { + "epoch": 0.9904988606770834, + "grad_norm": 17.135711669921875, + "learning_rate": 8.56603622175912e-06, + "loss": 4.9009, + "step": 48685 + }, + { + "epoch": 0.9906005859375, + "grad_norm": 14.603614807128906, + "learning_rate": 8.565756055168133e-06, + "loss": 5.0473, + "step": 48690 + }, + { + "epoch": 0.9907023111979166, + "grad_norm": 17.98837661743164, + "learning_rate": 8.565475865793099e-06, + "loss": 5.2812, + "step": 48695 + }, + { + "epoch": 0.9908040364583334, + "grad_norm": 15.750791549682617, + "learning_rate": 8.565195653635803e-06, + "loss": 4.7652, + "step": 48700 + }, + { + "epoch": 0.99090576171875, + "grad_norm": 18.963499069213867, + "learning_rate": 8.56491541869804e-06, + "loss": 5.0241, + "step": 48705 + }, + { + "epoch": 0.9910074869791666, + "grad_norm": 16.98630142211914, + "learning_rate": 8.564635160981598e-06, + "loss": 4.8139, + "step": 48710 + }, + { + "epoch": 0.9911092122395834, + "grad_norm": 14.660598754882812, + "learning_rate": 8.564354880488268e-06, + "loss": 5.0576, + "step": 48715 + }, + { + "epoch": 0.9912109375, + "grad_norm": 15.064656257629395, + "learning_rate": 8.564074577219843e-06, + "loss": 4.9542, + "step": 48720 + }, + { + "epoch": 0.9913126627604166, + "grad_norm": 14.018280029296875, + "learning_rate": 8.563794251178113e-06, + "loss": 4.8399, + "step": 48725 + }, + { + "epoch": 0.9914143880208334, + "grad_norm": 16.307640075683594, + "learning_rate": 8.563513902364869e-06, + "loss": 5.1267, + "step": 48730 + }, + { + "epoch": 0.99151611328125, + "grad_norm": 18.788022994995117, + "learning_rate": 8.563233530781902e-06, + "loss": 5.2723, + "step": 48735 + }, + { + "epoch": 0.9916178385416666, + "grad_norm": 16.4036865234375, + "learning_rate": 8.562953136431005e-06, + "loss": 5.3003, + "step": 48740 + }, + { + "epoch": 0.9917195638020834, + "grad_norm": 21.712017059326172, + "learning_rate": 8.562672719313969e-06, + "loss": 5.0066, + "step": 48745 + }, + { + "epoch": 0.9918212890625, + "grad_norm": 15.559101104736328, + "learning_rate": 8.562392279432582e-06, + "loss": 4.886, + "step": 48750 + }, + { + "epoch": 0.9919230143229166, + "grad_norm": 18.546049118041992, + "learning_rate": 8.562111816788644e-06, + "loss": 4.8036, + "step": 48755 + }, + { + "epoch": 0.9920247395833334, + "grad_norm": 11.709075927734375, + "learning_rate": 8.561831331383939e-06, + "loss": 4.9319, + "step": 48760 + }, + { + "epoch": 0.99212646484375, + "grad_norm": 15.541711807250977, + "learning_rate": 8.561550823220262e-06, + "loss": 4.8656, + "step": 48765 + }, + { + "epoch": 0.9922281901041666, + "grad_norm": 17.204082489013672, + "learning_rate": 8.561270292299407e-06, + "loss": 5.0799, + "step": 48770 + }, + { + "epoch": 0.9923299153645834, + "grad_norm": 18.66228675842285, + "learning_rate": 8.560989738623165e-06, + "loss": 4.8302, + "step": 48775 + }, + { + "epoch": 0.992431640625, + "grad_norm": 14.958258628845215, + "learning_rate": 8.560709162193329e-06, + "loss": 4.9895, + "step": 48780 + }, + { + "epoch": 0.9925333658854166, + "grad_norm": 19.640729904174805, + "learning_rate": 8.56042856301169e-06, + "loss": 4.878, + "step": 48785 + }, + { + "epoch": 0.9926350911458334, + "grad_norm": 20.860342025756836, + "learning_rate": 8.560147941080045e-06, + "loss": 5.0259, + "step": 48790 + }, + { + "epoch": 0.99273681640625, + "grad_norm": 16.937456130981445, + "learning_rate": 8.559867296400187e-06, + "loss": 4.9431, + "step": 48795 + }, + { + "epoch": 0.9928385416666666, + "grad_norm": 17.035947799682617, + "learning_rate": 8.559586628973902e-06, + "loss": 5.1314, + "step": 48800 + }, + { + "epoch": 0.9929402669270834, + "grad_norm": 18.34052848815918, + "learning_rate": 8.55930593880299e-06, + "loss": 5.1426, + "step": 48805 + }, + { + "epoch": 0.9930419921875, + "grad_norm": 17.27220344543457, + "learning_rate": 8.559025225889246e-06, + "loss": 5.1135, + "step": 48810 + }, + { + "epoch": 0.9931437174479166, + "grad_norm": 18.973041534423828, + "learning_rate": 8.558744490234457e-06, + "loss": 5.0338, + "step": 48815 + }, + { + "epoch": 0.9932454427083334, + "grad_norm": 18.596817016601562, + "learning_rate": 8.558463731840424e-06, + "loss": 4.8067, + "step": 48820 + }, + { + "epoch": 0.99334716796875, + "grad_norm": 15.979645729064941, + "learning_rate": 8.558182950708933e-06, + "loss": 4.8518, + "step": 48825 + }, + { + "epoch": 0.9934488932291666, + "grad_norm": 16.716829299926758, + "learning_rate": 8.557902146841786e-06, + "loss": 5.1263, + "step": 48830 + }, + { + "epoch": 0.9935506184895834, + "grad_norm": 15.54236125946045, + "learning_rate": 8.557621320240772e-06, + "loss": 4.9893, + "step": 48835 + }, + { + "epoch": 0.99365234375, + "grad_norm": 20.30299949645996, + "learning_rate": 8.557340470907687e-06, + "loss": 4.8279, + "step": 48840 + }, + { + "epoch": 0.9937540690104166, + "grad_norm": 19.49803352355957, + "learning_rate": 8.557059598844326e-06, + "loss": 5.4106, + "step": 48845 + }, + { + "epoch": 0.9938557942708334, + "grad_norm": 17.324195861816406, + "learning_rate": 8.556778704052484e-06, + "loss": 5.2643, + "step": 48850 + }, + { + "epoch": 0.99395751953125, + "grad_norm": 15.156524658203125, + "learning_rate": 8.556497786533953e-06, + "loss": 5.1295, + "step": 48855 + }, + { + "epoch": 0.9940592447916666, + "grad_norm": 16.79237174987793, + "learning_rate": 8.556216846290533e-06, + "loss": 5.2387, + "step": 48860 + }, + { + "epoch": 0.9941609700520834, + "grad_norm": 13.890887260437012, + "learning_rate": 8.555935883324014e-06, + "loss": 4.9823, + "step": 48865 + }, + { + "epoch": 0.9942626953125, + "grad_norm": 15.640703201293945, + "learning_rate": 8.555654897636194e-06, + "loss": 4.8919, + "step": 48870 + }, + { + "epoch": 0.9943644205729166, + "grad_norm": 13.569287300109863, + "learning_rate": 8.555373889228868e-06, + "loss": 5.2486, + "step": 48875 + }, + { + "epoch": 0.9944661458333334, + "grad_norm": 15.541230201721191, + "learning_rate": 8.555092858103832e-06, + "loss": 5.0031, + "step": 48880 + }, + { + "epoch": 0.99456787109375, + "grad_norm": 17.761735916137695, + "learning_rate": 8.55481180426288e-06, + "loss": 4.7512, + "step": 48885 + }, + { + "epoch": 0.9946695963541666, + "grad_norm": 16.22867774963379, + "learning_rate": 8.55453072770781e-06, + "loss": 4.9644, + "step": 48890 + }, + { + "epoch": 0.9947713216145834, + "grad_norm": 18.81041717529297, + "learning_rate": 8.55424962844042e-06, + "loss": 5.1738, + "step": 48895 + }, + { + "epoch": 0.994873046875, + "grad_norm": 18.7274169921875, + "learning_rate": 8.553968506462497e-06, + "loss": 5.306, + "step": 48900 + }, + { + "epoch": 0.9949747721354166, + "grad_norm": 18.671131134033203, + "learning_rate": 8.55368736177585e-06, + "loss": 4.687, + "step": 48905 + }, + { + "epoch": 0.9950764973958334, + "grad_norm": 19.687074661254883, + "learning_rate": 8.553406194382264e-06, + "loss": 5.0543, + "step": 48910 + }, + { + "epoch": 0.99517822265625, + "grad_norm": 13.864228248596191, + "learning_rate": 8.553125004283542e-06, + "loss": 5.4767, + "step": 48915 + }, + { + "epoch": 0.9952799479166666, + "grad_norm": 17.501798629760742, + "learning_rate": 8.552843791481477e-06, + "loss": 5.094, + "step": 48920 + }, + { + "epoch": 0.9953816731770834, + "grad_norm": 18.237943649291992, + "learning_rate": 8.55256255597787e-06, + "loss": 5.0493, + "step": 48925 + }, + { + "epoch": 0.9954833984375, + "grad_norm": 15.646058082580566, + "learning_rate": 8.552281297774516e-06, + "loss": 4.9917, + "step": 48930 + }, + { + "epoch": 0.9955851236979166, + "grad_norm": 19.371009826660156, + "learning_rate": 8.552000016873212e-06, + "loss": 4.917, + "step": 48935 + }, + { + "epoch": 0.9956868489583334, + "grad_norm": 17.801597595214844, + "learning_rate": 8.551718713275756e-06, + "loss": 4.8394, + "step": 48940 + }, + { + "epoch": 0.99578857421875, + "grad_norm": 28.305051803588867, + "learning_rate": 8.551437386983945e-06, + "loss": 4.8575, + "step": 48945 + }, + { + "epoch": 0.9958902994791666, + "grad_norm": 15.72253131866455, + "learning_rate": 8.551156037999575e-06, + "loss": 5.161, + "step": 48950 + }, + { + "epoch": 0.9959920247395834, + "grad_norm": 17.38005256652832, + "learning_rate": 8.550874666324447e-06, + "loss": 5.1453, + "step": 48955 + }, + { + "epoch": 0.99609375, + "grad_norm": 22.96340560913086, + "learning_rate": 8.550593271960356e-06, + "loss": 5.0933, + "step": 48960 + }, + { + "epoch": 0.9961954752604166, + "grad_norm": 20.62730598449707, + "learning_rate": 8.550311854909102e-06, + "loss": 4.8335, + "step": 48965 + }, + { + "epoch": 0.9962972005208334, + "grad_norm": 19.472442626953125, + "learning_rate": 8.55003041517248e-06, + "loss": 4.9681, + "step": 48970 + }, + { + "epoch": 0.99639892578125, + "grad_norm": 24.282272338867188, + "learning_rate": 8.549748952752293e-06, + "loss": 5.1072, + "step": 48975 + }, + { + "epoch": 0.9965006510416666, + "grad_norm": 25.24115562438965, + "learning_rate": 8.549467467650336e-06, + "loss": 5.1337, + "step": 48980 + }, + { + "epoch": 0.9966023763020834, + "grad_norm": 14.42555046081543, + "learning_rate": 8.549185959868408e-06, + "loss": 5.0608, + "step": 48985 + }, + { + "epoch": 0.9967041015625, + "grad_norm": 21.554380416870117, + "learning_rate": 8.54890442940831e-06, + "loss": 5.2409, + "step": 48990 + }, + { + "epoch": 0.9968058268229166, + "grad_norm": 16.537879943847656, + "learning_rate": 8.548622876271838e-06, + "loss": 5.2111, + "step": 48995 + }, + { + "epoch": 0.9969075520833334, + "grad_norm": 22.28121566772461, + "learning_rate": 8.548341300460792e-06, + "loss": 5.0873, + "step": 49000 + }, + { + "epoch": 0.99700927734375, + "grad_norm": 16.434223175048828, + "learning_rate": 8.548059701976974e-06, + "loss": 5.0063, + "step": 49005 + }, + { + "epoch": 0.9971110026041666, + "grad_norm": 15.080023765563965, + "learning_rate": 8.547778080822179e-06, + "loss": 4.9634, + "step": 49010 + }, + { + "epoch": 0.9972127278645834, + "grad_norm": 18.47478675842285, + "learning_rate": 8.547496436998209e-06, + "loss": 5.4138, + "step": 49015 + }, + { + "epoch": 0.997314453125, + "grad_norm": 17.202173233032227, + "learning_rate": 8.547214770506861e-06, + "loss": 4.7124, + "step": 49020 + }, + { + "epoch": 0.9974161783854166, + "grad_norm": 17.15684700012207, + "learning_rate": 8.546933081349937e-06, + "loss": 5.0117, + "step": 49025 + }, + { + "epoch": 0.9975179036458334, + "grad_norm": 27.229270935058594, + "learning_rate": 8.546651369529238e-06, + "loss": 5.1623, + "step": 49030 + }, + { + "epoch": 0.99761962890625, + "grad_norm": 21.359464645385742, + "learning_rate": 8.546369635046562e-06, + "loss": 4.8993, + "step": 49035 + }, + { + "epoch": 0.9977213541666666, + "grad_norm": 16.130889892578125, + "learning_rate": 8.54608787790371e-06, + "loss": 5.1255, + "step": 49040 + }, + { + "epoch": 0.9978230794270834, + "grad_norm": 19.079959869384766, + "learning_rate": 8.545806098102482e-06, + "loss": 5.1798, + "step": 49045 + }, + { + "epoch": 0.9979248046875, + "grad_norm": 19.37075424194336, + "learning_rate": 8.545524295644679e-06, + "loss": 5.1654, + "step": 49050 + }, + { + "epoch": 0.9980265299479166, + "grad_norm": 16.996009826660156, + "learning_rate": 8.5452424705321e-06, + "loss": 5.1046, + "step": 49055 + }, + { + "epoch": 0.9981282552083334, + "grad_norm": 14.378419876098633, + "learning_rate": 8.544960622766549e-06, + "loss": 5.0613, + "step": 49060 + }, + { + "epoch": 0.99822998046875, + "grad_norm": 14.848052978515625, + "learning_rate": 8.544678752349823e-06, + "loss": 5.0399, + "step": 49065 + }, + { + "epoch": 0.9983317057291666, + "grad_norm": 16.69135284423828, + "learning_rate": 8.544396859283725e-06, + "loss": 4.928, + "step": 49070 + }, + { + "epoch": 0.9984334309895834, + "grad_norm": 19.27393341064453, + "learning_rate": 8.544114943570058e-06, + "loss": 5.0431, + "step": 49075 + }, + { + "epoch": 0.99853515625, + "grad_norm": 18.154239654541016, + "learning_rate": 8.543833005210621e-06, + "loss": 4.7243, + "step": 49080 + }, + { + "epoch": 0.9986368815104166, + "grad_norm": 21.03568458557129, + "learning_rate": 8.543551044207215e-06, + "loss": 4.9446, + "step": 49085 + }, + { + "epoch": 0.9987386067708334, + "grad_norm": 19.796348571777344, + "learning_rate": 8.543269060561645e-06, + "loss": 4.9211, + "step": 49090 + }, + { + "epoch": 0.99884033203125, + "grad_norm": 18.540363311767578, + "learning_rate": 8.542987054275708e-06, + "loss": 5.2683, + "step": 49095 + }, + { + "epoch": 0.9989420572916666, + "grad_norm": 19.497875213623047, + "learning_rate": 8.54270502535121e-06, + "loss": 5.086, + "step": 49100 + }, + { + "epoch": 0.9990437825520834, + "grad_norm": 16.816776275634766, + "learning_rate": 8.54242297378995e-06, + "loss": 4.8604, + "step": 49105 + }, + { + "epoch": 0.9991455078125, + "grad_norm": 21.188087463378906, + "learning_rate": 8.542140899593732e-06, + "loss": 4.9446, + "step": 49110 + }, + { + "epoch": 0.9992472330729166, + "grad_norm": 15.380881309509277, + "learning_rate": 8.54185880276436e-06, + "loss": 4.9014, + "step": 49115 + }, + { + "epoch": 0.9993489583333334, + "grad_norm": 14.98224925994873, + "learning_rate": 8.541576683303632e-06, + "loss": 5.0166, + "step": 49120 + }, + { + "epoch": 0.99945068359375, + "grad_norm": 15.580916404724121, + "learning_rate": 8.541294541213354e-06, + "loss": 4.8632, + "step": 49125 + }, + { + "epoch": 0.9995524088541666, + "grad_norm": 17.692411422729492, + "learning_rate": 8.54101237649533e-06, + "loss": 4.7933, + "step": 49130 + }, + { + "epoch": 0.9996541341145834, + "grad_norm": 25.09701919555664, + "learning_rate": 8.540730189151358e-06, + "loss": 4.9296, + "step": 49135 + }, + { + "epoch": 0.999755859375, + "grad_norm": 19.373004913330078, + "learning_rate": 8.540447979183246e-06, + "loss": 5.2483, + "step": 49140 + }, + { + "epoch": 0.9998575846354166, + "grad_norm": 18.908618927001953, + "learning_rate": 8.540165746592794e-06, + "loss": 5.3886, + "step": 49145 + }, + { + "epoch": 0.9999593098958334, + "grad_norm": 17.063941955566406, + "learning_rate": 8.539883491381808e-06, + "loss": 4.9427, + "step": 49150 + }, + { + "epoch": 1.0, + "eval_loss": 5.023543357849121, + "eval_runtime": 107.6279, + "eval_samples_per_second": 18.648, + "eval_steps_per_second": 9.328, + "step": 49152 + }, + { + "epoch": 1.00006103515625, + "grad_norm": 13.893716812133789, + "learning_rate": 8.53960121355209e-06, + "loss": 4.8282, + "step": 49155 + }, + { + "epoch": 1.0001627604166667, + "grad_norm": 14.390314102172852, + "learning_rate": 8.539318913105443e-06, + "loss": 5.0357, + "step": 49160 + }, + { + "epoch": 1.0002644856770833, + "grad_norm": 12.713805198669434, + "learning_rate": 8.539036590043672e-06, + "loss": 5.0363, + "step": 49165 + }, + { + "epoch": 1.0003662109375, + "grad_norm": 20.17597198486328, + "learning_rate": 8.538754244368581e-06, + "loss": 4.9002, + "step": 49170 + }, + { + "epoch": 1.0004679361979167, + "grad_norm": 28.101938247680664, + "learning_rate": 8.538471876081975e-06, + "loss": 5.0234, + "step": 49175 + }, + { + "epoch": 1.0005696614583333, + "grad_norm": 18.052257537841797, + "learning_rate": 8.538189485185655e-06, + "loss": 4.96, + "step": 49180 + }, + { + "epoch": 1.00067138671875, + "grad_norm": 11.292994499206543, + "learning_rate": 8.537907071681428e-06, + "loss": 4.8141, + "step": 49185 + }, + { + "epoch": 1.0007731119791667, + "grad_norm": 14.956276893615723, + "learning_rate": 8.537624635571097e-06, + "loss": 4.9044, + "step": 49190 + }, + { + "epoch": 1.0008748372395833, + "grad_norm": 17.070533752441406, + "learning_rate": 8.53734217685647e-06, + "loss": 4.9289, + "step": 49195 + }, + { + "epoch": 1.0009765625, + "grad_norm": 15.736486434936523, + "learning_rate": 8.537059695539347e-06, + "loss": 5.0092, + "step": 49200 + }, + { + "epoch": 1.0010782877604167, + "grad_norm": 20.379661560058594, + "learning_rate": 8.536777191621536e-06, + "loss": 4.8131, + "step": 49205 + }, + { + "epoch": 1.0011800130208333, + "grad_norm": 20.185291290283203, + "learning_rate": 8.536494665104841e-06, + "loss": 5.1022, + "step": 49210 + }, + { + "epoch": 1.00128173828125, + "grad_norm": 16.50724983215332, + "learning_rate": 8.536212115991068e-06, + "loss": 5.2319, + "step": 49215 + }, + { + "epoch": 1.0013834635416667, + "grad_norm": 15.085100173950195, + "learning_rate": 8.535929544282022e-06, + "loss": 4.664, + "step": 49220 + }, + { + "epoch": 1.0014851888020833, + "grad_norm": 15.049492835998535, + "learning_rate": 8.53564694997951e-06, + "loss": 5.115, + "step": 49225 + }, + { + "epoch": 1.0015869140625, + "grad_norm": 16.701820373535156, + "learning_rate": 8.535364333085335e-06, + "loss": 5.0642, + "step": 49230 + }, + { + "epoch": 1.0016886393229167, + "grad_norm": 17.55550193786621, + "learning_rate": 8.535081693601306e-06, + "loss": 4.7406, + "step": 49235 + }, + { + "epoch": 1.0017903645833333, + "grad_norm": 15.600825309753418, + "learning_rate": 8.534799031529225e-06, + "loss": 4.8105, + "step": 49240 + }, + { + "epoch": 1.00189208984375, + "grad_norm": 25.049448013305664, + "learning_rate": 8.5345163468709e-06, + "loss": 5.1503, + "step": 49245 + }, + { + "epoch": 1.0019938151041667, + "grad_norm": 20.873207092285156, + "learning_rate": 8.53423363962814e-06, + "loss": 4.9672, + "step": 49250 + }, + { + "epoch": 1.0020955403645833, + "grad_norm": 17.45471954345703, + "learning_rate": 8.533950909802747e-06, + "loss": 5.2129, + "step": 49255 + }, + { + "epoch": 1.002197265625, + "grad_norm": 13.443032264709473, + "learning_rate": 8.533668157396531e-06, + "loss": 4.9655, + "step": 49260 + }, + { + "epoch": 1.0022989908854167, + "grad_norm": 15.539262771606445, + "learning_rate": 8.533385382411297e-06, + "loss": 4.941, + "step": 49265 + }, + { + "epoch": 1.0024007161458333, + "grad_norm": 15.405316352844238, + "learning_rate": 8.533102584848849e-06, + "loss": 5.0377, + "step": 49270 + }, + { + "epoch": 1.00250244140625, + "grad_norm": 13.563315391540527, + "learning_rate": 8.532819764711e-06, + "loss": 4.9738, + "step": 49275 + }, + { + "epoch": 1.0026041666666667, + "grad_norm": 17.922588348388672, + "learning_rate": 8.532536921999554e-06, + "loss": 4.9064, + "step": 49280 + }, + { + "epoch": 1.0027058919270833, + "grad_norm": 17.871089935302734, + "learning_rate": 8.532254056716319e-06, + "loss": 5.0936, + "step": 49285 + }, + { + "epoch": 1.0028076171875, + "grad_norm": 19.82862091064453, + "learning_rate": 8.5319711688631e-06, + "loss": 4.8366, + "step": 49290 + }, + { + "epoch": 1.0029093424479167, + "grad_norm": 16.566661834716797, + "learning_rate": 8.531688258441708e-06, + "loss": 4.879, + "step": 49295 + }, + { + "epoch": 1.0030110677083333, + "grad_norm": 15.03042984008789, + "learning_rate": 8.531405325453948e-06, + "loss": 4.7961, + "step": 49300 + }, + { + "epoch": 1.00311279296875, + "grad_norm": 19.27758026123047, + "learning_rate": 8.53112236990163e-06, + "loss": 4.6992, + "step": 49305 + }, + { + "epoch": 1.0032145182291667, + "grad_norm": 15.573705673217773, + "learning_rate": 8.53083939178656e-06, + "loss": 5.0639, + "step": 49310 + }, + { + "epoch": 1.0033162434895833, + "grad_norm": 20.24220085144043, + "learning_rate": 8.530556391110547e-06, + "loss": 4.6978, + "step": 49315 + }, + { + "epoch": 1.00341796875, + "grad_norm": 17.000974655151367, + "learning_rate": 8.530273367875401e-06, + "loss": 4.9174, + "step": 49320 + }, + { + "epoch": 1.0035196940104167, + "grad_norm": 17.91844367980957, + "learning_rate": 8.529990322082927e-06, + "loss": 5.1852, + "step": 49325 + }, + { + "epoch": 1.0036214192708333, + "grad_norm": 21.165977478027344, + "learning_rate": 8.529707253734936e-06, + "loss": 5.1242, + "step": 49330 + }, + { + "epoch": 1.00372314453125, + "grad_norm": 16.247726440429688, + "learning_rate": 8.529424162833237e-06, + "loss": 5.1877, + "step": 49335 + }, + { + "epoch": 1.0038248697916667, + "grad_norm": 16.79099464416504, + "learning_rate": 8.529141049379637e-06, + "loss": 4.8549, + "step": 49340 + }, + { + "epoch": 1.0039265950520833, + "grad_norm": 16.799592971801758, + "learning_rate": 8.528857913375945e-06, + "loss": 4.9552, + "step": 49345 + }, + { + "epoch": 1.0040283203125, + "grad_norm": 15.099892616271973, + "learning_rate": 8.528574754823974e-06, + "loss": 4.9765, + "step": 49350 + }, + { + "epoch": 1.0041300455729167, + "grad_norm": 14.0614595413208, + "learning_rate": 8.528291573725529e-06, + "loss": 4.7067, + "step": 49355 + }, + { + "epoch": 1.0042317708333333, + "grad_norm": 18.00037384033203, + "learning_rate": 8.52800837008242e-06, + "loss": 4.7841, + "step": 49360 + }, + { + "epoch": 1.00433349609375, + "grad_norm": 16.71462631225586, + "learning_rate": 8.527725143896459e-06, + "loss": 4.717, + "step": 49365 + }, + { + "epoch": 1.0044352213541667, + "grad_norm": 19.917139053344727, + "learning_rate": 8.527441895169453e-06, + "loss": 4.7216, + "step": 49370 + }, + { + "epoch": 1.0045369466145833, + "grad_norm": 19.68880844116211, + "learning_rate": 8.527158623903214e-06, + "loss": 5.0429, + "step": 49375 + }, + { + "epoch": 1.004638671875, + "grad_norm": 18.9014949798584, + "learning_rate": 8.52687533009955e-06, + "loss": 4.971, + "step": 49380 + }, + { + "epoch": 1.0047403971354167, + "grad_norm": 17.53072738647461, + "learning_rate": 8.526592013760273e-06, + "loss": 5.1676, + "step": 49385 + }, + { + "epoch": 1.0048421223958333, + "grad_norm": 13.856674194335938, + "learning_rate": 8.52630867488719e-06, + "loss": 4.7893, + "step": 49390 + }, + { + "epoch": 1.00494384765625, + "grad_norm": 30.497241973876953, + "learning_rate": 8.52602531348212e-06, + "loss": 5.0765, + "step": 49395 + }, + { + "epoch": 1.0050455729166667, + "grad_norm": 20.201038360595703, + "learning_rate": 8.525741929546863e-06, + "loss": 5.1686, + "step": 49400 + }, + { + "epoch": 1.0051472981770833, + "grad_norm": 19.371675491333008, + "learning_rate": 8.525458523083235e-06, + "loss": 5.0377, + "step": 49405 + }, + { + "epoch": 1.0052490234375, + "grad_norm": 21.920114517211914, + "learning_rate": 8.525175094093046e-06, + "loss": 5.1229, + "step": 49410 + }, + { + "epoch": 1.0053507486979167, + "grad_norm": 15.455580711364746, + "learning_rate": 8.524891642578109e-06, + "loss": 4.9261, + "step": 49415 + }, + { + "epoch": 1.0054524739583333, + "grad_norm": 16.196401596069336, + "learning_rate": 8.52460816854023e-06, + "loss": 5.2597, + "step": 49420 + }, + { + "epoch": 1.00555419921875, + "grad_norm": 17.020315170288086, + "learning_rate": 8.524324671981225e-06, + "loss": 4.957, + "step": 49425 + }, + { + "epoch": 1.0056559244791667, + "grad_norm": 20.785566329956055, + "learning_rate": 8.524041152902905e-06, + "loss": 4.9799, + "step": 49430 + }, + { + "epoch": 1.0057576497395833, + "grad_norm": 17.63185691833496, + "learning_rate": 8.52375761130708e-06, + "loss": 4.9451, + "step": 49435 + }, + { + "epoch": 1.005859375, + "grad_norm": 12.132659912109375, + "learning_rate": 8.523474047195564e-06, + "loss": 5.1057, + "step": 49440 + }, + { + "epoch": 1.0059611002604167, + "grad_norm": 18.72160530090332, + "learning_rate": 8.523190460570165e-06, + "loss": 4.9145, + "step": 49445 + }, + { + "epoch": 1.0060628255208333, + "grad_norm": 15.63821029663086, + "learning_rate": 8.5229068514327e-06, + "loss": 4.8629, + "step": 49450 + }, + { + "epoch": 1.00616455078125, + "grad_norm": 15.38923454284668, + "learning_rate": 8.522623219784975e-06, + "loss": 4.8284, + "step": 49455 + }, + { + "epoch": 1.0062662760416667, + "grad_norm": 14.242500305175781, + "learning_rate": 8.522339565628808e-06, + "loss": 4.6041, + "step": 49460 + }, + { + "epoch": 1.0063680013020833, + "grad_norm": 14.852119445800781, + "learning_rate": 8.522055888966009e-06, + "loss": 4.9454, + "step": 49465 + }, + { + "epoch": 1.0064697265625, + "grad_norm": 17.325441360473633, + "learning_rate": 8.521772189798391e-06, + "loss": 5.0084, + "step": 49470 + }, + { + "epoch": 1.0065714518229167, + "grad_norm": 18.869951248168945, + "learning_rate": 8.521488468127768e-06, + "loss": 4.86, + "step": 49475 + }, + { + "epoch": 1.0066731770833333, + "grad_norm": 22.30113983154297, + "learning_rate": 8.52120472395595e-06, + "loss": 4.6794, + "step": 49480 + }, + { + "epoch": 1.00677490234375, + "grad_norm": 17.177967071533203, + "learning_rate": 8.520920957284753e-06, + "loss": 4.7315, + "step": 49485 + }, + { + "epoch": 1.0068766276041667, + "grad_norm": 20.797565460205078, + "learning_rate": 8.520637168115986e-06, + "loss": 4.8145, + "step": 49490 + }, + { + "epoch": 1.0069783528645833, + "grad_norm": 18.488372802734375, + "learning_rate": 8.520353356451468e-06, + "loss": 5.0979, + "step": 49495 + }, + { + "epoch": 1.007080078125, + "grad_norm": 25.087467193603516, + "learning_rate": 8.52006952229301e-06, + "loss": 4.993, + "step": 49500 + }, + { + "epoch": 1.0071818033854167, + "grad_norm": 22.37204933166504, + "learning_rate": 8.519785665642422e-06, + "loss": 4.881, + "step": 49505 + }, + { + "epoch": 1.0072835286458333, + "grad_norm": 17.55752944946289, + "learning_rate": 8.519501786501523e-06, + "loss": 4.8919, + "step": 49510 + }, + { + "epoch": 1.00738525390625, + "grad_norm": 17.727861404418945, + "learning_rate": 8.519217884872124e-06, + "loss": 4.7447, + "step": 49515 + }, + { + "epoch": 1.0074869791666667, + "grad_norm": 15.307409286499023, + "learning_rate": 8.518933960756039e-06, + "loss": 5.1027, + "step": 49520 + }, + { + "epoch": 1.0075887044270833, + "grad_norm": 16.537288665771484, + "learning_rate": 8.518650014155086e-06, + "loss": 5.1982, + "step": 49525 + }, + { + "epoch": 1.0076904296875, + "grad_norm": 16.17396354675293, + "learning_rate": 8.518366045071074e-06, + "loss": 5.0167, + "step": 49530 + }, + { + "epoch": 1.0077921549479167, + "grad_norm": 17.306936264038086, + "learning_rate": 8.51808205350582e-06, + "loss": 5.2276, + "step": 49535 + }, + { + "epoch": 1.0078938802083333, + "grad_norm": 20.478059768676758, + "learning_rate": 8.517798039461138e-06, + "loss": 5.0132, + "step": 49540 + }, + { + "epoch": 1.00799560546875, + "grad_norm": 15.301987648010254, + "learning_rate": 8.517514002938844e-06, + "loss": 4.7471, + "step": 49545 + }, + { + "epoch": 1.0080973307291667, + "grad_norm": 16.924522399902344, + "learning_rate": 8.517229943940753e-06, + "loss": 4.8997, + "step": 49550 + }, + { + "epoch": 1.0081990559895833, + "grad_norm": 17.771224975585938, + "learning_rate": 8.516945862468679e-06, + "loss": 5.2149, + "step": 49555 + }, + { + "epoch": 1.00830078125, + "grad_norm": 15.035135269165039, + "learning_rate": 8.516661758524436e-06, + "loss": 4.8389, + "step": 49560 + }, + { + "epoch": 1.0084025065104167, + "grad_norm": 17.338329315185547, + "learning_rate": 8.516377632109842e-06, + "loss": 5.0761, + "step": 49565 + }, + { + "epoch": 1.0085042317708333, + "grad_norm": 18.041772842407227, + "learning_rate": 8.51609348322671e-06, + "loss": 5.0208, + "step": 49570 + }, + { + "epoch": 1.00860595703125, + "grad_norm": 15.133748054504395, + "learning_rate": 8.515809311876859e-06, + "loss": 4.814, + "step": 49575 + }, + { + "epoch": 1.0087076822916667, + "grad_norm": 16.966766357421875, + "learning_rate": 8.5155251180621e-06, + "loss": 4.9319, + "step": 49580 + }, + { + "epoch": 1.0088094075520833, + "grad_norm": 15.94857406616211, + "learning_rate": 8.515240901784251e-06, + "loss": 4.9125, + "step": 49585 + }, + { + "epoch": 1.0089111328125, + "grad_norm": 14.697163581848145, + "learning_rate": 8.514956663045131e-06, + "loss": 4.858, + "step": 49590 + }, + { + "epoch": 1.0090128580729167, + "grad_norm": 19.311771392822266, + "learning_rate": 8.514672401846554e-06, + "loss": 5.0081, + "step": 49595 + }, + { + "epoch": 1.0091145833333333, + "grad_norm": 15.380820274353027, + "learning_rate": 8.514388118190334e-06, + "loss": 5.0712, + "step": 49600 + }, + { + "epoch": 1.00921630859375, + "grad_norm": 16.411523818969727, + "learning_rate": 8.51410381207829e-06, + "loss": 4.9905, + "step": 49605 + }, + { + "epoch": 1.0093180338541667, + "grad_norm": 13.25225830078125, + "learning_rate": 8.513819483512239e-06, + "loss": 5.1346, + "step": 49610 + }, + { + "epoch": 1.0094197591145833, + "grad_norm": 20.04414176940918, + "learning_rate": 8.513535132493998e-06, + "loss": 5.0025, + "step": 49615 + }, + { + "epoch": 1.009521484375, + "grad_norm": 14.995129585266113, + "learning_rate": 8.51325075902538e-06, + "loss": 4.9858, + "step": 49620 + }, + { + "epoch": 1.0096232096354167, + "grad_norm": 17.680208206176758, + "learning_rate": 8.512966363108206e-06, + "loss": 4.9749, + "step": 49625 + }, + { + "epoch": 1.0097249348958333, + "grad_norm": 18.935827255249023, + "learning_rate": 8.512681944744292e-06, + "loss": 5.3489, + "step": 49630 + }, + { + "epoch": 1.00982666015625, + "grad_norm": 15.774500846862793, + "learning_rate": 8.512397503935457e-06, + "loss": 5.1732, + "step": 49635 + }, + { + "epoch": 1.0099283854166667, + "grad_norm": 22.5977783203125, + "learning_rate": 8.512113040683516e-06, + "loss": 4.8062, + "step": 49640 + }, + { + "epoch": 1.0100301106770833, + "grad_norm": 17.601362228393555, + "learning_rate": 8.511828554990287e-06, + "loss": 5.1959, + "step": 49645 + }, + { + "epoch": 1.0101318359375, + "grad_norm": 29.856718063354492, + "learning_rate": 8.511544046857587e-06, + "loss": 5.3633, + "step": 49650 + }, + { + "epoch": 1.0102335611979167, + "grad_norm": 17.023548126220703, + "learning_rate": 8.511259516287238e-06, + "loss": 5.0112, + "step": 49655 + }, + { + "epoch": 1.0103352864583333, + "grad_norm": 13.240806579589844, + "learning_rate": 8.510974963281054e-06, + "loss": 5.0228, + "step": 49660 + }, + { + "epoch": 1.01043701171875, + "grad_norm": 19.84905433654785, + "learning_rate": 8.510690387840854e-06, + "loss": 5.0099, + "step": 49665 + }, + { + "epoch": 1.0105387369791667, + "grad_norm": 16.87331199645996, + "learning_rate": 8.510405789968457e-06, + "loss": 5.0907, + "step": 49670 + }, + { + "epoch": 1.0106404622395833, + "grad_norm": 18.69076919555664, + "learning_rate": 8.510121169665684e-06, + "loss": 4.901, + "step": 49675 + }, + { + "epoch": 1.0107421875, + "grad_norm": 18.399892807006836, + "learning_rate": 8.50983652693435e-06, + "loss": 4.8755, + "step": 49680 + }, + { + "epoch": 1.0108439127604167, + "grad_norm": 15.868056297302246, + "learning_rate": 8.509551861776272e-06, + "loss": 4.9356, + "step": 49685 + }, + { + "epoch": 1.0109456380208333, + "grad_norm": 24.488128662109375, + "learning_rate": 8.509267174193274e-06, + "loss": 4.9157, + "step": 49690 + }, + { + "epoch": 1.01104736328125, + "grad_norm": 18.30144691467285, + "learning_rate": 8.50898246418717e-06, + "loss": 5.1366, + "step": 49695 + }, + { + "epoch": 1.0111490885416667, + "grad_norm": 19.37835693359375, + "learning_rate": 8.508697731759785e-06, + "loss": 4.9143, + "step": 49700 + }, + { + "epoch": 1.0112508138020833, + "grad_norm": 23.06700325012207, + "learning_rate": 8.508412976912935e-06, + "loss": 5.0863, + "step": 49705 + }, + { + "epoch": 1.0113525390625, + "grad_norm": 17.252859115600586, + "learning_rate": 8.50812819964844e-06, + "loss": 4.6452, + "step": 49710 + }, + { + "epoch": 1.0114542643229167, + "grad_norm": 18.12398338317871, + "learning_rate": 8.507843399968117e-06, + "loss": 5.0327, + "step": 49715 + }, + { + "epoch": 1.0115559895833333, + "grad_norm": 16.189477920532227, + "learning_rate": 8.507558577873791e-06, + "loss": 4.7534, + "step": 49720 + }, + { + "epoch": 1.01165771484375, + "grad_norm": 16.940584182739258, + "learning_rate": 8.507273733367277e-06, + "loss": 5.0869, + "step": 49725 + }, + { + "epoch": 1.0117594401041667, + "grad_norm": 16.269041061401367, + "learning_rate": 8.506988866450399e-06, + "loss": 4.8535, + "step": 49730 + }, + { + "epoch": 1.0118611653645833, + "grad_norm": 20.659730911254883, + "learning_rate": 8.506703977124974e-06, + "loss": 4.8706, + "step": 49735 + }, + { + "epoch": 1.011962890625, + "grad_norm": 16.234037399291992, + "learning_rate": 8.506419065392823e-06, + "loss": 4.9264, + "step": 49740 + }, + { + "epoch": 1.0120646158854167, + "grad_norm": 17.137901306152344, + "learning_rate": 8.50613413125577e-06, + "loss": 5.0693, + "step": 49745 + }, + { + "epoch": 1.0121663411458333, + "grad_norm": 14.101688385009766, + "learning_rate": 8.50584917471563e-06, + "loss": 4.8943, + "step": 49750 + }, + { + "epoch": 1.01226806640625, + "grad_norm": 18.581478118896484, + "learning_rate": 8.505564195774229e-06, + "loss": 5.0886, + "step": 49755 + }, + { + "epoch": 1.0123697916666667, + "grad_norm": 15.582416534423828, + "learning_rate": 8.505279194433385e-06, + "loss": 4.721, + "step": 49760 + }, + { + "epoch": 1.0124715169270833, + "grad_norm": 20.67353630065918, + "learning_rate": 8.50499417069492e-06, + "loss": 4.8076, + "step": 49765 + }, + { + "epoch": 1.0125732421875, + "grad_norm": 13.932839393615723, + "learning_rate": 8.504709124560655e-06, + "loss": 4.7925, + "step": 49770 + }, + { + "epoch": 1.0126749674479167, + "grad_norm": 15.922731399536133, + "learning_rate": 8.50442405603241e-06, + "loss": 5.1111, + "step": 49775 + }, + { + "epoch": 1.0127766927083333, + "grad_norm": 13.011545181274414, + "learning_rate": 8.504138965112008e-06, + "loss": 5.0996, + "step": 49780 + }, + { + "epoch": 1.01287841796875, + "grad_norm": 17.157686233520508, + "learning_rate": 8.50385385180127e-06, + "loss": 4.9385, + "step": 49785 + }, + { + "epoch": 1.0129801432291667, + "grad_norm": 16.86151123046875, + "learning_rate": 8.50356871610202e-06, + "loss": 4.78, + "step": 49790 + }, + { + "epoch": 1.0130818684895833, + "grad_norm": 13.08370590209961, + "learning_rate": 8.503283558016077e-06, + "loss": 5.1061, + "step": 49795 + }, + { + "epoch": 1.01318359375, + "grad_norm": 19.063425064086914, + "learning_rate": 8.502998377545262e-06, + "loss": 4.8986, + "step": 49800 + }, + { + "epoch": 1.0132853190104167, + "grad_norm": 16.271358489990234, + "learning_rate": 8.502713174691402e-06, + "loss": 4.9219, + "step": 49805 + }, + { + "epoch": 1.0133870442708333, + "grad_norm": 13.294010162353516, + "learning_rate": 8.502427949456318e-06, + "loss": 5.0919, + "step": 49810 + }, + { + "epoch": 1.01348876953125, + "grad_norm": 15.592976570129395, + "learning_rate": 8.50214270184183e-06, + "loss": 4.7328, + "step": 49815 + }, + { + "epoch": 1.0135904947916667, + "grad_norm": 14.473162651062012, + "learning_rate": 8.50185743184976e-06, + "loss": 4.8559, + "step": 49820 + }, + { + "epoch": 1.0136922200520833, + "grad_norm": 33.657474517822266, + "learning_rate": 8.501572139481932e-06, + "loss": 4.9657, + "step": 49825 + }, + { + "epoch": 1.0137939453125, + "grad_norm": 16.30073356628418, + "learning_rate": 8.501286824740173e-06, + "loss": 4.6563, + "step": 49830 + }, + { + "epoch": 1.0138956705729167, + "grad_norm": 22.40778923034668, + "learning_rate": 8.501001487626302e-06, + "loss": 5.1206, + "step": 49835 + }, + { + "epoch": 1.0139973958333333, + "grad_norm": 17.32355499267578, + "learning_rate": 8.500716128142141e-06, + "loss": 5.018, + "step": 49840 + }, + { + "epoch": 1.01409912109375, + "grad_norm": 22.192628860473633, + "learning_rate": 8.500430746289516e-06, + "loss": 5.1388, + "step": 49845 + }, + { + "epoch": 1.0142008463541667, + "grad_norm": 16.103466033935547, + "learning_rate": 8.50014534207025e-06, + "loss": 4.7608, + "step": 49850 + }, + { + "epoch": 1.0143025716145833, + "grad_norm": 21.573461532592773, + "learning_rate": 8.499859915486167e-06, + "loss": 4.8056, + "step": 49855 + }, + { + "epoch": 1.014404296875, + "grad_norm": 14.587327003479004, + "learning_rate": 8.49957446653909e-06, + "loss": 4.9924, + "step": 49860 + }, + { + "epoch": 1.0145060221354167, + "grad_norm": 27.845645904541016, + "learning_rate": 8.49928899523084e-06, + "loss": 5.3517, + "step": 49865 + }, + { + "epoch": 1.0146077473958333, + "grad_norm": 19.91553497314453, + "learning_rate": 8.499003501563247e-06, + "loss": 5.071, + "step": 49870 + }, + { + "epoch": 1.01470947265625, + "grad_norm": 18.903675079345703, + "learning_rate": 8.498717985538134e-06, + "loss": 4.9603, + "step": 49875 + }, + { + "epoch": 1.0148111979166667, + "grad_norm": 16.808311462402344, + "learning_rate": 8.498432447157321e-06, + "loss": 4.8549, + "step": 49880 + }, + { + "epoch": 1.0149129231770833, + "grad_norm": 17.50374412536621, + "learning_rate": 8.498146886422636e-06, + "loss": 5.2383, + "step": 49885 + }, + { + "epoch": 1.0150146484375, + "grad_norm": 11.460244178771973, + "learning_rate": 8.497861303335902e-06, + "loss": 4.8124, + "step": 49890 + }, + { + "epoch": 1.0151163736979167, + "grad_norm": 16.88950538635254, + "learning_rate": 8.497575697898946e-06, + "loss": 4.9265, + "step": 49895 + }, + { + "epoch": 1.0152180989583333, + "grad_norm": 17.509862899780273, + "learning_rate": 8.497290070113591e-06, + "loss": 5.0451, + "step": 49900 + }, + { + "epoch": 1.01531982421875, + "grad_norm": 13.934195518493652, + "learning_rate": 8.497004419981662e-06, + "loss": 5.0492, + "step": 49905 + }, + { + "epoch": 1.0154215494791667, + "grad_norm": 18.021583557128906, + "learning_rate": 8.496718747504986e-06, + "loss": 4.9792, + "step": 49910 + }, + { + "epoch": 1.0155232747395833, + "grad_norm": 13.819695472717285, + "learning_rate": 8.496433052685387e-06, + "loss": 4.7657, + "step": 49915 + }, + { + "epoch": 1.015625, + "grad_norm": 30.618568420410156, + "learning_rate": 8.49614733552469e-06, + "loss": 5.202, + "step": 49920 + }, + { + "epoch": 1.0157267252604167, + "grad_norm": 14.584504127502441, + "learning_rate": 8.495861596024723e-06, + "loss": 4.8392, + "step": 49925 + }, + { + "epoch": 1.0158284505208333, + "grad_norm": 21.64369773864746, + "learning_rate": 8.495575834187308e-06, + "loss": 4.8831, + "step": 49930 + }, + { + "epoch": 1.01593017578125, + "grad_norm": 19.867137908935547, + "learning_rate": 8.495290050014274e-06, + "loss": 4.8426, + "step": 49935 + }, + { + "epoch": 1.0160319010416667, + "grad_norm": 29.255626678466797, + "learning_rate": 8.495004243507446e-06, + "loss": 4.9356, + "step": 49940 + }, + { + "epoch": 1.0161336263020833, + "grad_norm": 15.688980102539062, + "learning_rate": 8.49471841466865e-06, + "loss": 5.0749, + "step": 49945 + }, + { + "epoch": 1.0162353515625, + "grad_norm": 12.321012496948242, + "learning_rate": 8.494432563499716e-06, + "loss": 4.8386, + "step": 49950 + }, + { + "epoch": 1.0163370768229167, + "grad_norm": 19.47052764892578, + "learning_rate": 8.494146690002463e-06, + "loss": 4.809, + "step": 49955 + }, + { + "epoch": 1.0164388020833333, + "grad_norm": 21.205921173095703, + "learning_rate": 8.493860794178724e-06, + "loss": 4.9267, + "step": 49960 + }, + { + "epoch": 1.01654052734375, + "grad_norm": 19.79601287841797, + "learning_rate": 8.493574876030322e-06, + "loss": 4.8729, + "step": 49965 + }, + { + "epoch": 1.0166422526041667, + "grad_norm": 19.157133102416992, + "learning_rate": 8.493288935559086e-06, + "loss": 4.7695, + "step": 49970 + }, + { + "epoch": 1.0167439778645833, + "grad_norm": 18.452350616455078, + "learning_rate": 8.493002972766844e-06, + "loss": 4.8989, + "step": 49975 + }, + { + "epoch": 1.016845703125, + "grad_norm": 16.575366973876953, + "learning_rate": 8.492716987655419e-06, + "loss": 5.1621, + "step": 49980 + }, + { + "epoch": 1.0169474283854167, + "grad_norm": 17.79438018798828, + "learning_rate": 8.492430980226644e-06, + "loss": 4.7906, + "step": 49985 + }, + { + "epoch": 1.0170491536458333, + "grad_norm": 18.603708267211914, + "learning_rate": 8.492144950482341e-06, + "loss": 4.9188, + "step": 49990 + }, + { + "epoch": 1.01715087890625, + "grad_norm": 21.567413330078125, + "learning_rate": 8.491858898424343e-06, + "loss": 4.9384, + "step": 49995 + }, + { + "epoch": 1.0172526041666667, + "grad_norm": 15.1708984375, + "learning_rate": 8.491572824054475e-06, + "loss": 4.8546, + "step": 50000 + }, + { + "epoch": 1.0173543294270833, + "grad_norm": 16.98476219177246, + "learning_rate": 8.491286727374564e-06, + "loss": 4.9242, + "step": 50005 + }, + { + "epoch": 1.0174560546875, + "grad_norm": 19.049951553344727, + "learning_rate": 8.49100060838644e-06, + "loss": 4.9418, + "step": 50010 + }, + { + "epoch": 1.0175577799479167, + "grad_norm": 18.684585571289062, + "learning_rate": 8.490714467091928e-06, + "loss": 4.8681, + "step": 50015 + }, + { + "epoch": 1.0176595052083333, + "grad_norm": 17.793312072753906, + "learning_rate": 8.49042830349286e-06, + "loss": 4.8609, + "step": 50020 + }, + { + "epoch": 1.01776123046875, + "grad_norm": 16.108102798461914, + "learning_rate": 8.490142117591065e-06, + "loss": 5.013, + "step": 50025 + }, + { + "epoch": 1.0178629557291667, + "grad_norm": 17.655847549438477, + "learning_rate": 8.489855909388367e-06, + "loss": 4.8867, + "step": 50030 + }, + { + "epoch": 1.0179646809895833, + "grad_norm": 18.544189453125, + "learning_rate": 8.489569678886598e-06, + "loss": 4.7038, + "step": 50035 + }, + { + "epoch": 1.01806640625, + "grad_norm": 16.216930389404297, + "learning_rate": 8.489283426087588e-06, + "loss": 4.9427, + "step": 50040 + }, + { + "epoch": 1.0181681315104167, + "grad_norm": 22.85780143737793, + "learning_rate": 8.488997150993163e-06, + "loss": 4.8277, + "step": 50045 + }, + { + "epoch": 1.0182698567708333, + "grad_norm": 18.028968811035156, + "learning_rate": 8.488710853605155e-06, + "loss": 4.784, + "step": 50050 + }, + { + "epoch": 1.01837158203125, + "grad_norm": 18.441694259643555, + "learning_rate": 8.488424533925393e-06, + "loss": 4.8474, + "step": 50055 + }, + { + "epoch": 1.0184733072916667, + "grad_norm": 15.80014419555664, + "learning_rate": 8.488138191955702e-06, + "loss": 5.0815, + "step": 50060 + }, + { + "epoch": 1.0185750325520833, + "grad_norm": 19.246566772460938, + "learning_rate": 8.487851827697919e-06, + "loss": 4.9523, + "step": 50065 + }, + { + "epoch": 1.0186767578125, + "grad_norm": 20.751331329345703, + "learning_rate": 8.487565441153867e-06, + "loss": 4.9973, + "step": 50070 + }, + { + "epoch": 1.0187784830729167, + "grad_norm": 15.652628898620605, + "learning_rate": 8.487279032325381e-06, + "loss": 5.0734, + "step": 50075 + }, + { + "epoch": 1.0188802083333333, + "grad_norm": 17.980440139770508, + "learning_rate": 8.48699260121429e-06, + "loss": 4.9033, + "step": 50080 + }, + { + "epoch": 1.01898193359375, + "grad_norm": 14.645204544067383, + "learning_rate": 8.48670614782242e-06, + "loss": 5.0037, + "step": 50085 + }, + { + "epoch": 1.0190836588541667, + "grad_norm": 24.362083435058594, + "learning_rate": 8.486419672151607e-06, + "loss": 5.1385, + "step": 50090 + }, + { + "epoch": 1.0191853841145833, + "grad_norm": 24.87851905822754, + "learning_rate": 8.48613317420368e-06, + "loss": 5.1847, + "step": 50095 + }, + { + "epoch": 1.019287109375, + "grad_norm": 19.743253707885742, + "learning_rate": 8.485846653980466e-06, + "loss": 4.9515, + "step": 50100 + }, + { + "epoch": 1.0193888346354167, + "grad_norm": 20.25563621520996, + "learning_rate": 8.4855601114838e-06, + "loss": 4.8853, + "step": 50105 + }, + { + "epoch": 1.0194905598958333, + "grad_norm": 15.988036155700684, + "learning_rate": 8.48527354671551e-06, + "loss": 4.8531, + "step": 50110 + }, + { + "epoch": 1.01959228515625, + "grad_norm": 13.394937515258789, + "learning_rate": 8.484986959677429e-06, + "loss": 4.8166, + "step": 50115 + }, + { + "epoch": 1.0196940104166667, + "grad_norm": 14.221681594848633, + "learning_rate": 8.484700350371386e-06, + "loss": 5.01, + "step": 50120 + }, + { + "epoch": 1.0197957356770833, + "grad_norm": 17.538129806518555, + "learning_rate": 8.484413718799217e-06, + "loss": 4.9994, + "step": 50125 + }, + { + "epoch": 1.0198974609375, + "grad_norm": 20.12438201904297, + "learning_rate": 8.484127064962748e-06, + "loss": 4.8792, + "step": 50130 + }, + { + "epoch": 1.0199991861979167, + "grad_norm": 15.36622142791748, + "learning_rate": 8.483840388863813e-06, + "loss": 4.968, + "step": 50135 + }, + { + "epoch": 1.0201009114583333, + "grad_norm": 11.072412490844727, + "learning_rate": 8.483553690504246e-06, + "loss": 5.0678, + "step": 50140 + }, + { + "epoch": 1.02020263671875, + "grad_norm": 18.056638717651367, + "learning_rate": 8.483266969885877e-06, + "loss": 5.1928, + "step": 50145 + }, + { + "epoch": 1.0203043619791667, + "grad_norm": 19.226184844970703, + "learning_rate": 8.482980227010536e-06, + "loss": 4.9701, + "step": 50150 + }, + { + "epoch": 1.0204060872395833, + "grad_norm": 17.500944137573242, + "learning_rate": 8.482693461880058e-06, + "loss": 4.9578, + "step": 50155 + }, + { + "epoch": 1.0205078125, + "grad_norm": 16.634706497192383, + "learning_rate": 8.482406674496273e-06, + "loss": 4.8281, + "step": 50160 + }, + { + "epoch": 1.0206095377604167, + "grad_norm": 17.106151580810547, + "learning_rate": 8.482119864861015e-06, + "loss": 5.1066, + "step": 50165 + }, + { + "epoch": 1.0207112630208333, + "grad_norm": 16.542871475219727, + "learning_rate": 8.481833032976118e-06, + "loss": 5.0473, + "step": 50170 + }, + { + "epoch": 1.02081298828125, + "grad_norm": 26.573991775512695, + "learning_rate": 8.481546178843411e-06, + "loss": 4.9957, + "step": 50175 + }, + { + "epoch": 1.0209147135416667, + "grad_norm": 14.208905220031738, + "learning_rate": 8.481259302464731e-06, + "loss": 4.9442, + "step": 50180 + }, + { + "epoch": 1.0210164388020833, + "grad_norm": 23.2220401763916, + "learning_rate": 8.48097240384191e-06, + "loss": 4.9751, + "step": 50185 + }, + { + "epoch": 1.0211181640625, + "grad_norm": 15.141679763793945, + "learning_rate": 8.480685482976778e-06, + "loss": 5.0284, + "step": 50190 + }, + { + "epoch": 1.0212198893229167, + "grad_norm": 18.963472366333008, + "learning_rate": 8.480398539871171e-06, + "loss": 4.7754, + "step": 50195 + }, + { + "epoch": 1.0213216145833333, + "grad_norm": 15.799792289733887, + "learning_rate": 8.480111574526924e-06, + "loss": 4.8649, + "step": 50200 + }, + { + "epoch": 1.02142333984375, + "grad_norm": 23.20758056640625, + "learning_rate": 8.479824586945867e-06, + "loss": 4.9656, + "step": 50205 + }, + { + "epoch": 1.0215250651041667, + "grad_norm": 17.56075668334961, + "learning_rate": 8.479537577129836e-06, + "loss": 5.2197, + "step": 50210 + }, + { + "epoch": 1.0216267903645833, + "grad_norm": 17.3813533782959, + "learning_rate": 8.479250545080664e-06, + "loss": 5.3566, + "step": 50215 + }, + { + "epoch": 1.021728515625, + "grad_norm": 17.513221740722656, + "learning_rate": 8.478963490800187e-06, + "loss": 5.2234, + "step": 50220 + }, + { + "epoch": 1.0218302408854167, + "grad_norm": 17.9853515625, + "learning_rate": 8.478676414290235e-06, + "loss": 4.9181, + "step": 50225 + }, + { + "epoch": 1.0219319661458333, + "grad_norm": 16.91400718688965, + "learning_rate": 8.478389315552648e-06, + "loss": 5.0402, + "step": 50230 + }, + { + "epoch": 1.02203369140625, + "grad_norm": 18.69678497314453, + "learning_rate": 8.478102194589257e-06, + "loss": 5.0911, + "step": 50235 + }, + { + "epoch": 1.0221354166666667, + "grad_norm": 15.955687522888184, + "learning_rate": 8.477815051401894e-06, + "loss": 5.2091, + "step": 50240 + }, + { + "epoch": 1.0222371419270833, + "grad_norm": 18.389963150024414, + "learning_rate": 8.4775278859924e-06, + "loss": 5.0305, + "step": 50245 + }, + { + "epoch": 1.0223388671875, + "grad_norm": 17.14676284790039, + "learning_rate": 8.477240698362605e-06, + "loss": 4.8675, + "step": 50250 + }, + { + "epoch": 1.0224405924479167, + "grad_norm": 14.684617042541504, + "learning_rate": 8.476953488514348e-06, + "loss": 4.7418, + "step": 50255 + }, + { + "epoch": 1.0225423177083333, + "grad_norm": 18.794336318969727, + "learning_rate": 8.47666625644946e-06, + "loss": 4.9753, + "step": 50260 + }, + { + "epoch": 1.02264404296875, + "grad_norm": 14.155050277709961, + "learning_rate": 8.476379002169778e-06, + "loss": 5.1444, + "step": 50265 + }, + { + "epoch": 1.0227457682291667, + "grad_norm": 16.263891220092773, + "learning_rate": 8.476091725677138e-06, + "loss": 4.9665, + "step": 50270 + }, + { + "epoch": 1.0228474934895833, + "grad_norm": 15.166207313537598, + "learning_rate": 8.475804426973378e-06, + "loss": 4.8048, + "step": 50275 + }, + { + "epoch": 1.02294921875, + "grad_norm": 16.467308044433594, + "learning_rate": 8.475517106060326e-06, + "loss": 4.9713, + "step": 50280 + }, + { + "epoch": 1.0230509440104167, + "grad_norm": 20.363569259643555, + "learning_rate": 8.475229762939827e-06, + "loss": 4.7622, + "step": 50285 + }, + { + "epoch": 1.0231526692708333, + "grad_norm": 21.27765655517578, + "learning_rate": 8.474942397613712e-06, + "loss": 5.0601, + "step": 50290 + }, + { + "epoch": 1.02325439453125, + "grad_norm": 16.804853439331055, + "learning_rate": 8.474655010083818e-06, + "loss": 4.9823, + "step": 50295 + }, + { + "epoch": 1.0233561197916667, + "grad_norm": 16.758529663085938, + "learning_rate": 8.474367600351982e-06, + "loss": 4.8728, + "step": 50300 + }, + { + "epoch": 1.0234578450520833, + "grad_norm": 18.127689361572266, + "learning_rate": 8.474080168420038e-06, + "loss": 4.9299, + "step": 50305 + }, + { + "epoch": 1.0235595703125, + "grad_norm": 16.43207550048828, + "learning_rate": 8.473792714289826e-06, + "loss": 4.7323, + "step": 50310 + }, + { + "epoch": 1.0236612955729167, + "grad_norm": 17.029706954956055, + "learning_rate": 8.473505237963183e-06, + "loss": 5.0906, + "step": 50315 + }, + { + "epoch": 1.0237630208333333, + "grad_norm": 21.818471908569336, + "learning_rate": 8.473217739441942e-06, + "loss": 4.694, + "step": 50320 + }, + { + "epoch": 1.02386474609375, + "grad_norm": 29.193775177001953, + "learning_rate": 8.472930218727941e-06, + "loss": 4.8579, + "step": 50325 + }, + { + "epoch": 1.0239664713541667, + "grad_norm": 13.668070793151855, + "learning_rate": 8.47264267582302e-06, + "loss": 4.7987, + "step": 50330 + }, + { + "epoch": 1.0240681966145833, + "grad_norm": 16.26969337463379, + "learning_rate": 8.472355110729016e-06, + "loss": 5.089, + "step": 50335 + }, + { + "epoch": 1.024169921875, + "grad_norm": 18.017047882080078, + "learning_rate": 8.472067523447763e-06, + "loss": 4.9982, + "step": 50340 + }, + { + "epoch": 1.0242716471354167, + "grad_norm": 17.82901954650879, + "learning_rate": 8.471779913981102e-06, + "loss": 4.6977, + "step": 50345 + }, + { + "epoch": 1.0243733723958333, + "grad_norm": 15.139969825744629, + "learning_rate": 8.471492282330869e-06, + "loss": 5.0432, + "step": 50350 + }, + { + "epoch": 1.02447509765625, + "grad_norm": 14.872218132019043, + "learning_rate": 8.471204628498901e-06, + "loss": 4.8563, + "step": 50355 + }, + { + "epoch": 1.0245768229166667, + "grad_norm": 21.333486557006836, + "learning_rate": 8.470916952487039e-06, + "loss": 4.9867, + "step": 50360 + }, + { + "epoch": 1.0246785481770833, + "grad_norm": 16.14594268798828, + "learning_rate": 8.47062925429712e-06, + "loss": 5.1913, + "step": 50365 + }, + { + "epoch": 1.0247802734375, + "grad_norm": 15.080728530883789, + "learning_rate": 8.47034153393098e-06, + "loss": 5.0551, + "step": 50370 + }, + { + "epoch": 1.0248819986979167, + "grad_norm": 20.9852294921875, + "learning_rate": 8.47005379139046e-06, + "loss": 4.9934, + "step": 50375 + }, + { + "epoch": 1.0249837239583333, + "grad_norm": 21.57513999938965, + "learning_rate": 8.469766026677398e-06, + "loss": 4.74, + "step": 50380 + }, + { + "epoch": 1.02508544921875, + "grad_norm": 17.084468841552734, + "learning_rate": 8.469478239793632e-06, + "loss": 5.1599, + "step": 50385 + }, + { + "epoch": 1.0251871744791667, + "grad_norm": 17.348608016967773, + "learning_rate": 8.469190430741002e-06, + "loss": 5.1223, + "step": 50390 + }, + { + "epoch": 1.0252888997395833, + "grad_norm": 15.724925994873047, + "learning_rate": 8.468902599521346e-06, + "loss": 4.9258, + "step": 50395 + }, + { + "epoch": 1.025390625, + "grad_norm": 13.557586669921875, + "learning_rate": 8.468614746136504e-06, + "loss": 4.8715, + "step": 50400 + }, + { + "epoch": 1.0254923502604167, + "grad_norm": 17.440595626831055, + "learning_rate": 8.468326870588315e-06, + "loss": 4.8098, + "step": 50405 + }, + { + "epoch": 1.0255940755208333, + "grad_norm": 13.587381362915039, + "learning_rate": 8.468038972878618e-06, + "loss": 4.9333, + "step": 50410 + }, + { + "epoch": 1.02569580078125, + "grad_norm": 13.663537979125977, + "learning_rate": 8.467751053009253e-06, + "loss": 4.984, + "step": 50415 + }, + { + "epoch": 1.0257975260416667, + "grad_norm": 14.789412498474121, + "learning_rate": 8.467463110982059e-06, + "loss": 4.7645, + "step": 50420 + }, + { + "epoch": 1.0258992513020833, + "grad_norm": 17.039400100708008, + "learning_rate": 8.467175146798878e-06, + "loss": 4.8184, + "step": 50425 + }, + { + "epoch": 1.0260009765625, + "grad_norm": 20.987043380737305, + "learning_rate": 8.466887160461547e-06, + "loss": 5.0687, + "step": 50430 + }, + { + "epoch": 1.0261027018229167, + "grad_norm": 15.772207260131836, + "learning_rate": 8.466599151971907e-06, + "loss": 4.7854, + "step": 50435 + }, + { + "epoch": 1.0262044270833333, + "grad_norm": 16.909954071044922, + "learning_rate": 8.4663111213318e-06, + "loss": 4.7886, + "step": 50440 + }, + { + "epoch": 1.02630615234375, + "grad_norm": 18.570432662963867, + "learning_rate": 8.466023068543066e-06, + "loss": 4.9674, + "step": 50445 + }, + { + "epoch": 1.0264078776041667, + "grad_norm": 16.774734497070312, + "learning_rate": 8.465734993607542e-06, + "loss": 4.8864, + "step": 50450 + }, + { + "epoch": 1.0265096028645833, + "grad_norm": 13.691143035888672, + "learning_rate": 8.465446896527074e-06, + "loss": 5.1149, + "step": 50455 + }, + { + "epoch": 1.026611328125, + "grad_norm": 14.954242706298828, + "learning_rate": 8.4651587773035e-06, + "loss": 4.9652, + "step": 50460 + }, + { + "epoch": 1.0267130533854167, + "grad_norm": 21.602983474731445, + "learning_rate": 8.464870635938661e-06, + "loss": 5.002, + "step": 50465 + }, + { + "epoch": 1.0268147786458333, + "grad_norm": 12.980751991271973, + "learning_rate": 8.4645824724344e-06, + "loss": 4.9127, + "step": 50470 + }, + { + "epoch": 1.02691650390625, + "grad_norm": 15.72696304321289, + "learning_rate": 8.464294286792555e-06, + "loss": 5.0429, + "step": 50475 + }, + { + "epoch": 1.0270182291666667, + "grad_norm": 15.569068908691406, + "learning_rate": 8.46400607901497e-06, + "loss": 4.9728, + "step": 50480 + }, + { + "epoch": 1.0271199544270833, + "grad_norm": 16.159074783325195, + "learning_rate": 8.463717849103485e-06, + "loss": 5.0161, + "step": 50485 + }, + { + "epoch": 1.0272216796875, + "grad_norm": 15.934042930603027, + "learning_rate": 8.463429597059944e-06, + "loss": 4.8048, + "step": 50490 + }, + { + "epoch": 1.0273234049479167, + "grad_norm": 14.167491912841797, + "learning_rate": 8.463141322886185e-06, + "loss": 4.6692, + "step": 50495 + }, + { + "epoch": 1.0274251302083333, + "grad_norm": 16.29657554626465, + "learning_rate": 8.462853026584054e-06, + "loss": 4.813, + "step": 50500 + }, + { + "epoch": 1.02752685546875, + "grad_norm": 20.067781448364258, + "learning_rate": 8.462564708155391e-06, + "loss": 5.0454, + "step": 50505 + }, + { + "epoch": 1.0276285807291667, + "grad_norm": 16.62909507751465, + "learning_rate": 8.46227636760204e-06, + "loss": 4.7897, + "step": 50510 + }, + { + "epoch": 1.0277303059895833, + "grad_norm": 24.04147720336914, + "learning_rate": 8.46198800492584e-06, + "loss": 4.9592, + "step": 50515 + }, + { + "epoch": 1.02783203125, + "grad_norm": 16.4671688079834, + "learning_rate": 8.461699620128636e-06, + "loss": 5.3302, + "step": 50520 + }, + { + "epoch": 1.0279337565104167, + "grad_norm": 18.908491134643555, + "learning_rate": 8.461411213212272e-06, + "loss": 4.8563, + "step": 50525 + }, + { + "epoch": 1.0280354817708333, + "grad_norm": 16.905685424804688, + "learning_rate": 8.461122784178587e-06, + "loss": 5.0138, + "step": 50530 + }, + { + "epoch": 1.02813720703125, + "grad_norm": 15.746245384216309, + "learning_rate": 8.460834333029428e-06, + "loss": 5.0018, + "step": 50535 + }, + { + "epoch": 1.0282389322916667, + "grad_norm": 17.672597885131836, + "learning_rate": 8.460545859766635e-06, + "loss": 5.0137, + "step": 50540 + }, + { + "epoch": 1.0283406575520833, + "grad_norm": 15.159749031066895, + "learning_rate": 8.460257364392053e-06, + "loss": 4.835, + "step": 50545 + }, + { + "epoch": 1.0284423828125, + "grad_norm": 15.405731201171875, + "learning_rate": 8.459968846907525e-06, + "loss": 4.853, + "step": 50550 + }, + { + "epoch": 1.0285441080729167, + "grad_norm": 17.96040916442871, + "learning_rate": 8.459680307314893e-06, + "loss": 5.1449, + "step": 50555 + }, + { + "epoch": 1.0286458333333333, + "grad_norm": 21.018516540527344, + "learning_rate": 8.459391745616003e-06, + "loss": 4.8527, + "step": 50560 + }, + { + "epoch": 1.02874755859375, + "grad_norm": 18.90149688720703, + "learning_rate": 8.459103161812698e-06, + "loss": 5.0084, + "step": 50565 + }, + { + "epoch": 1.0288492838541667, + "grad_norm": 16.25439453125, + "learning_rate": 8.458814555906823e-06, + "loss": 4.8728, + "step": 50570 + }, + { + "epoch": 1.0289510091145833, + "grad_norm": 24.355546951293945, + "learning_rate": 8.458525927900218e-06, + "loss": 5.0905, + "step": 50575 + }, + { + "epoch": 1.029052734375, + "grad_norm": 19.803178787231445, + "learning_rate": 8.458237277794734e-06, + "loss": 5.1041, + "step": 50580 + }, + { + "epoch": 1.0291544596354167, + "grad_norm": 20.515216827392578, + "learning_rate": 8.45794860559221e-06, + "loss": 5.0839, + "step": 50585 + }, + { + "epoch": 1.0292561848958333, + "grad_norm": 18.5421142578125, + "learning_rate": 8.45765991129449e-06, + "loss": 4.9496, + "step": 50590 + }, + { + "epoch": 1.02935791015625, + "grad_norm": 20.7968692779541, + "learning_rate": 8.457371194903423e-06, + "loss": 5.2412, + "step": 50595 + }, + { + "epoch": 1.0294596354166667, + "grad_norm": 18.425586700439453, + "learning_rate": 8.45708245642085e-06, + "loss": 4.9356, + "step": 50600 + }, + { + "epoch": 1.0295613606770833, + "grad_norm": 23.818584442138672, + "learning_rate": 8.45679369584862e-06, + "loss": 4.9266, + "step": 50605 + }, + { + "epoch": 1.0296630859375, + "grad_norm": 16.227811813354492, + "learning_rate": 8.456504913188574e-06, + "loss": 4.8074, + "step": 50610 + }, + { + "epoch": 1.0297648111979167, + "grad_norm": 17.680307388305664, + "learning_rate": 8.45621610844256e-06, + "loss": 5.2202, + "step": 50615 + }, + { + "epoch": 1.0298665364583333, + "grad_norm": 20.051313400268555, + "learning_rate": 8.455927281612421e-06, + "loss": 4.9143, + "step": 50620 + }, + { + "epoch": 1.02996826171875, + "grad_norm": 13.945127487182617, + "learning_rate": 8.455638432700005e-06, + "loss": 4.9313, + "step": 50625 + }, + { + "epoch": 1.0300699869791667, + "grad_norm": 19.90728759765625, + "learning_rate": 8.455349561707154e-06, + "loss": 4.9905, + "step": 50630 + }, + { + "epoch": 1.0301717122395833, + "grad_norm": 15.208710670471191, + "learning_rate": 8.455060668635718e-06, + "loss": 5.1681, + "step": 50635 + }, + { + "epoch": 1.0302734375, + "grad_norm": 14.96871566772461, + "learning_rate": 8.454771753487541e-06, + "loss": 4.9746, + "step": 50640 + }, + { + "epoch": 1.0303751627604167, + "grad_norm": 18.930017471313477, + "learning_rate": 8.45448281626447e-06, + "loss": 4.9523, + "step": 50645 + }, + { + "epoch": 1.0304768880208333, + "grad_norm": 20.981077194213867, + "learning_rate": 8.45419385696835e-06, + "loss": 5.1897, + "step": 50650 + }, + { + "epoch": 1.03057861328125, + "grad_norm": 20.211299896240234, + "learning_rate": 8.453904875601025e-06, + "loss": 5.2145, + "step": 50655 + }, + { + "epoch": 1.0306803385416667, + "grad_norm": 17.926105499267578, + "learning_rate": 8.453615872164346e-06, + "loss": 5.0759, + "step": 50660 + }, + { + "epoch": 1.0307820638020833, + "grad_norm": 19.964519500732422, + "learning_rate": 8.45332684666016e-06, + "loss": 4.9397, + "step": 50665 + }, + { + "epoch": 1.0308837890625, + "grad_norm": 17.6419677734375, + "learning_rate": 8.45303779909031e-06, + "loss": 5.1658, + "step": 50670 + }, + { + "epoch": 1.0309855143229167, + "grad_norm": 15.45552921295166, + "learning_rate": 8.452748729456644e-06, + "loss": 4.8848, + "step": 50675 + }, + { + "epoch": 1.0310872395833333, + "grad_norm": 24.35243034362793, + "learning_rate": 8.45245963776101e-06, + "loss": 5.1074, + "step": 50680 + }, + { + "epoch": 1.03118896484375, + "grad_norm": 20.111614227294922, + "learning_rate": 8.452170524005254e-06, + "loss": 5.133, + "step": 50685 + }, + { + "epoch": 1.0312906901041667, + "grad_norm": 15.172468185424805, + "learning_rate": 8.451881388191226e-06, + "loss": 4.9254, + "step": 50690 + }, + { + "epoch": 1.0313924153645833, + "grad_norm": 17.7161865234375, + "learning_rate": 8.451592230320771e-06, + "loss": 5.1141, + "step": 50695 + }, + { + "epoch": 1.031494140625, + "grad_norm": 14.017176628112793, + "learning_rate": 8.451303050395738e-06, + "loss": 4.8657, + "step": 50700 + }, + { + "epoch": 1.0315958658854167, + "grad_norm": 15.67011833190918, + "learning_rate": 8.451013848417973e-06, + "loss": 4.9216, + "step": 50705 + }, + { + "epoch": 1.0316975911458333, + "grad_norm": 17.162010192871094, + "learning_rate": 8.450724624389327e-06, + "loss": 4.9331, + "step": 50710 + }, + { + "epoch": 1.03179931640625, + "grad_norm": 15.271934509277344, + "learning_rate": 8.450435378311644e-06, + "loss": 4.9662, + "step": 50715 + }, + { + "epoch": 1.0319010416666667, + "grad_norm": 20.216753005981445, + "learning_rate": 8.450146110186774e-06, + "loss": 5.0724, + "step": 50720 + }, + { + "epoch": 1.0320027669270833, + "grad_norm": 17.521663665771484, + "learning_rate": 8.449856820016567e-06, + "loss": 5.085, + "step": 50725 + }, + { + "epoch": 1.0321044921875, + "grad_norm": 18.8655948638916, + "learning_rate": 8.44956750780287e-06, + "loss": 5.1022, + "step": 50730 + }, + { + "epoch": 1.0322062174479167, + "grad_norm": 14.336759567260742, + "learning_rate": 8.449278173547529e-06, + "loss": 5.0269, + "step": 50735 + }, + { + "epoch": 1.0323079427083333, + "grad_norm": 16.649532318115234, + "learning_rate": 8.448988817252398e-06, + "loss": 4.9406, + "step": 50740 + }, + { + "epoch": 1.03240966796875, + "grad_norm": 19.969518661499023, + "learning_rate": 8.448699438919324e-06, + "loss": 4.8873, + "step": 50745 + }, + { + "epoch": 1.0325113932291667, + "grad_norm": 13.506816864013672, + "learning_rate": 8.448410038550153e-06, + "loss": 5.3292, + "step": 50750 + }, + { + "epoch": 1.0326131184895833, + "grad_norm": 24.7177677154541, + "learning_rate": 8.448120616146737e-06, + "loss": 4.7549, + "step": 50755 + }, + { + "epoch": 1.03271484375, + "grad_norm": 17.67327117919922, + "learning_rate": 8.447831171710926e-06, + "loss": 5.1658, + "step": 50760 + }, + { + "epoch": 1.0328165690104167, + "grad_norm": 19.578262329101562, + "learning_rate": 8.447541705244569e-06, + "loss": 5.0949, + "step": 50765 + }, + { + "epoch": 1.0329182942708333, + "grad_norm": 18.33977508544922, + "learning_rate": 8.447252216749512e-06, + "loss": 4.9409, + "step": 50770 + }, + { + "epoch": 1.03302001953125, + "grad_norm": 17.087018966674805, + "learning_rate": 8.446962706227611e-06, + "loss": 4.9012, + "step": 50775 + }, + { + "epoch": 1.0331217447916667, + "grad_norm": 18.220266342163086, + "learning_rate": 8.44667317368071e-06, + "loss": 4.9513, + "step": 50780 + }, + { + "epoch": 1.0332234700520833, + "grad_norm": 18.81464958190918, + "learning_rate": 8.446383619110663e-06, + "loss": 5.1239, + "step": 50785 + }, + { + "epoch": 1.0333251953125, + "grad_norm": 13.006145477294922, + "learning_rate": 8.446094042519316e-06, + "loss": 4.8958, + "step": 50790 + }, + { + "epoch": 1.0334269205729167, + "grad_norm": 20.106842041015625, + "learning_rate": 8.445804443908525e-06, + "loss": 4.7499, + "step": 50795 + }, + { + "epoch": 1.0335286458333333, + "grad_norm": 17.048175811767578, + "learning_rate": 8.445514823280137e-06, + "loss": 4.8269, + "step": 50800 + }, + { + "epoch": 1.03363037109375, + "grad_norm": 17.213891983032227, + "learning_rate": 8.445225180636003e-06, + "loss": 5.0625, + "step": 50805 + }, + { + "epoch": 1.0337320963541667, + "grad_norm": 22.511728286743164, + "learning_rate": 8.444935515977974e-06, + "loss": 4.9067, + "step": 50810 + }, + { + "epoch": 1.0338338216145833, + "grad_norm": 20.563230514526367, + "learning_rate": 8.444645829307899e-06, + "loss": 4.8054, + "step": 50815 + }, + { + "epoch": 1.033935546875, + "grad_norm": 20.425954818725586, + "learning_rate": 8.444356120627632e-06, + "loss": 4.7521, + "step": 50820 + }, + { + "epoch": 1.0340372721354167, + "grad_norm": 19.81227684020996, + "learning_rate": 8.444066389939023e-06, + "loss": 4.7715, + "step": 50825 + }, + { + "epoch": 1.0341389973958333, + "grad_norm": 14.0562105178833, + "learning_rate": 8.443776637243923e-06, + "loss": 4.7465, + "step": 50830 + }, + { + "epoch": 1.03424072265625, + "grad_norm": 17.391883850097656, + "learning_rate": 8.443486862544182e-06, + "loss": 4.7448, + "step": 50835 + }, + { + "epoch": 1.0343424479166667, + "grad_norm": 16.59539794921875, + "learning_rate": 8.443197065841654e-06, + "loss": 4.9901, + "step": 50840 + }, + { + "epoch": 1.0344441731770833, + "grad_norm": 14.491026878356934, + "learning_rate": 8.44290724713819e-06, + "loss": 5.0378, + "step": 50845 + }, + { + "epoch": 1.0345458984375, + "grad_norm": 22.231870651245117, + "learning_rate": 8.44261740643564e-06, + "loss": 4.8727, + "step": 50850 + }, + { + "epoch": 1.0346476236979167, + "grad_norm": 16.840166091918945, + "learning_rate": 8.442327543735859e-06, + "loss": 5.0741, + "step": 50855 + }, + { + "epoch": 1.0347493489583333, + "grad_norm": 15.062009811401367, + "learning_rate": 8.442037659040698e-06, + "loss": 4.7649, + "step": 50860 + }, + { + "epoch": 1.03485107421875, + "grad_norm": 17.391155242919922, + "learning_rate": 8.44174775235201e-06, + "loss": 4.874, + "step": 50865 + }, + { + "epoch": 1.0349527994791667, + "grad_norm": 31.7540340423584, + "learning_rate": 8.441457823671644e-06, + "loss": 4.9367, + "step": 50870 + }, + { + "epoch": 1.0350545247395833, + "grad_norm": 22.13982391357422, + "learning_rate": 8.441167873001457e-06, + "loss": 4.94, + "step": 50875 + }, + { + "epoch": 1.03515625, + "grad_norm": 14.521327018737793, + "learning_rate": 8.440877900343298e-06, + "loss": 4.8398, + "step": 50880 + }, + { + "epoch": 1.0352579752604167, + "grad_norm": 18.047916412353516, + "learning_rate": 8.440587905699023e-06, + "loss": 5.232, + "step": 50885 + }, + { + "epoch": 1.0353597005208333, + "grad_norm": 35.007240295410156, + "learning_rate": 8.440297889070482e-06, + "loss": 5.0136, + "step": 50890 + }, + { + "epoch": 1.03546142578125, + "grad_norm": 19.475011825561523, + "learning_rate": 8.44000785045953e-06, + "loss": 5.1837, + "step": 50895 + }, + { + "epoch": 1.0355631510416667, + "grad_norm": 20.886127471923828, + "learning_rate": 8.43971778986802e-06, + "loss": 4.9282, + "step": 50900 + }, + { + "epoch": 1.0356648763020833, + "grad_norm": 15.77023983001709, + "learning_rate": 8.439427707297806e-06, + "loss": 4.7835, + "step": 50905 + }, + { + "epoch": 1.0357666015625, + "grad_norm": 16.913372039794922, + "learning_rate": 8.43913760275074e-06, + "loss": 4.9056, + "step": 50910 + }, + { + "epoch": 1.0358683268229167, + "grad_norm": 16.521459579467773, + "learning_rate": 8.438847476228677e-06, + "loss": 4.9851, + "step": 50915 + }, + { + "epoch": 1.0359700520833333, + "grad_norm": 17.970203399658203, + "learning_rate": 8.43855732773347e-06, + "loss": 5.0604, + "step": 50920 + }, + { + "epoch": 1.03607177734375, + "grad_norm": 23.25864028930664, + "learning_rate": 8.438267157266972e-06, + "loss": 5.048, + "step": 50925 + }, + { + "epoch": 1.0361735026041667, + "grad_norm": 17.74666404724121, + "learning_rate": 8.437976964831041e-06, + "loss": 4.8965, + "step": 50930 + }, + { + "epoch": 1.0362752278645833, + "grad_norm": 15.917464256286621, + "learning_rate": 8.437686750427527e-06, + "loss": 4.9476, + "step": 50935 + }, + { + "epoch": 1.036376953125, + "grad_norm": 15.053730010986328, + "learning_rate": 8.437396514058286e-06, + "loss": 4.8924, + "step": 50940 + }, + { + "epoch": 1.0364786783854167, + "grad_norm": 15.307760238647461, + "learning_rate": 8.437106255725172e-06, + "loss": 4.8097, + "step": 50945 + }, + { + "epoch": 1.0365804036458333, + "grad_norm": 11.704116821289062, + "learning_rate": 8.43681597543004e-06, + "loss": 5.0539, + "step": 50950 + }, + { + "epoch": 1.03668212890625, + "grad_norm": 18.995113372802734, + "learning_rate": 8.436525673174747e-06, + "loss": 4.9586, + "step": 50955 + }, + { + "epoch": 1.0367838541666667, + "grad_norm": 18.63156509399414, + "learning_rate": 8.436235348961143e-06, + "loss": 5.113, + "step": 50960 + }, + { + "epoch": 1.0368855794270833, + "grad_norm": 15.840622901916504, + "learning_rate": 8.435945002791087e-06, + "loss": 4.7851, + "step": 50965 + }, + { + "epoch": 1.0369873046875, + "grad_norm": 15.284492492675781, + "learning_rate": 8.435654634666434e-06, + "loss": 5.0334, + "step": 50970 + }, + { + "epoch": 1.0370890299479167, + "grad_norm": 15.624507904052734, + "learning_rate": 8.435364244589038e-06, + "loss": 4.791, + "step": 50975 + }, + { + "epoch": 1.0371907552083333, + "grad_norm": 14.616540908813477, + "learning_rate": 8.435073832560753e-06, + "loss": 4.9671, + "step": 50980 + }, + { + "epoch": 1.03729248046875, + "grad_norm": 20.022443771362305, + "learning_rate": 8.43478339858344e-06, + "loss": 4.9804, + "step": 50985 + }, + { + "epoch": 1.0373942057291667, + "grad_norm": 15.110490798950195, + "learning_rate": 8.434492942658949e-06, + "loss": 5.2454, + "step": 50990 + }, + { + "epoch": 1.0374959309895833, + "grad_norm": 19.361295700073242, + "learning_rate": 8.43420246478914e-06, + "loss": 4.9386, + "step": 50995 + }, + { + "epoch": 1.03759765625, + "grad_norm": 15.260002136230469, + "learning_rate": 8.433911964975867e-06, + "loss": 5.1776, + "step": 51000 + }, + { + "epoch": 1.0376993815104167, + "grad_norm": 16.489959716796875, + "learning_rate": 8.433621443220987e-06, + "loss": 4.9274, + "step": 51005 + }, + { + "epoch": 1.0378011067708333, + "grad_norm": 18.49524688720703, + "learning_rate": 8.433330899526355e-06, + "loss": 4.9893, + "step": 51010 + }, + { + "epoch": 1.03790283203125, + "grad_norm": 25.583778381347656, + "learning_rate": 8.433040333893827e-06, + "loss": 4.8663, + "step": 51015 + }, + { + "epoch": 1.0380045572916667, + "grad_norm": 17.25176239013672, + "learning_rate": 8.432749746325264e-06, + "loss": 4.6946, + "step": 51020 + }, + { + "epoch": 1.0381062825520833, + "grad_norm": 13.811149597167969, + "learning_rate": 8.43245913682252e-06, + "loss": 4.9662, + "step": 51025 + }, + { + "epoch": 1.0382080078125, + "grad_norm": 18.266992568969727, + "learning_rate": 8.432168505387447e-06, + "loss": 4.8992, + "step": 51030 + }, + { + "epoch": 1.0383097330729167, + "grad_norm": 15.137153625488281, + "learning_rate": 8.431877852021912e-06, + "loss": 4.7166, + "step": 51035 + }, + { + "epoch": 1.0384114583333333, + "grad_norm": 30.113279342651367, + "learning_rate": 8.431587176727765e-06, + "loss": 4.9533, + "step": 51040 + }, + { + "epoch": 1.03851318359375, + "grad_norm": 18.929950714111328, + "learning_rate": 8.431296479506865e-06, + "loss": 5.2574, + "step": 51045 + }, + { + "epoch": 1.0386149088541667, + "grad_norm": 17.804492950439453, + "learning_rate": 8.431005760361071e-06, + "loss": 4.8236, + "step": 51050 + }, + { + "epoch": 1.0387166341145833, + "grad_norm": 16.892786026000977, + "learning_rate": 8.430715019292237e-06, + "loss": 4.8916, + "step": 51055 + }, + { + "epoch": 1.038818359375, + "grad_norm": 20.203569412231445, + "learning_rate": 8.430424256302226e-06, + "loss": 4.8418, + "step": 51060 + }, + { + "epoch": 1.0389200846354167, + "grad_norm": 24.99413299560547, + "learning_rate": 8.430133471392891e-06, + "loss": 4.7578, + "step": 51065 + }, + { + "epoch": 1.0390218098958333, + "grad_norm": 18.319217681884766, + "learning_rate": 8.429842664566092e-06, + "loss": 4.8731, + "step": 51070 + }, + { + "epoch": 1.03912353515625, + "grad_norm": 19.04087257385254, + "learning_rate": 8.429551835823688e-06, + "loss": 4.9405, + "step": 51075 + }, + { + "epoch": 1.0392252604166667, + "grad_norm": 15.932788848876953, + "learning_rate": 8.429260985167536e-06, + "loss": 4.9262, + "step": 51080 + }, + { + "epoch": 1.0393269856770833, + "grad_norm": 17.69917869567871, + "learning_rate": 8.428970112599494e-06, + "loss": 5.0077, + "step": 51085 + }, + { + "epoch": 1.0394287109375, + "grad_norm": 19.884296417236328, + "learning_rate": 8.428679218121423e-06, + "loss": 4.8123, + "step": 51090 + }, + { + "epoch": 1.0395304361979167, + "grad_norm": 15.637608528137207, + "learning_rate": 8.42838830173518e-06, + "loss": 4.8988, + "step": 51095 + }, + { + "epoch": 1.0396321614583333, + "grad_norm": 18.72841453552246, + "learning_rate": 8.428097363442624e-06, + "loss": 4.8833, + "step": 51100 + }, + { + "epoch": 1.03973388671875, + "grad_norm": 20.26716423034668, + "learning_rate": 8.427806403245614e-06, + "loss": 4.9729, + "step": 51105 + }, + { + "epoch": 1.0398356119791667, + "grad_norm": 17.584552764892578, + "learning_rate": 8.427515421146009e-06, + "loss": 4.9169, + "step": 51110 + }, + { + "epoch": 1.0399373372395833, + "grad_norm": 17.870288848876953, + "learning_rate": 8.427224417145668e-06, + "loss": 4.9834, + "step": 51115 + }, + { + "epoch": 1.0400390625, + "grad_norm": 17.185155868530273, + "learning_rate": 8.42693339124645e-06, + "loss": 5.1492, + "step": 51120 + }, + { + "epoch": 1.0401407877604167, + "grad_norm": 15.56647777557373, + "learning_rate": 8.426642343450218e-06, + "loss": 5.2565, + "step": 51125 + }, + { + "epoch": 1.0402425130208333, + "grad_norm": 16.8125, + "learning_rate": 8.426351273758828e-06, + "loss": 4.9422, + "step": 51130 + }, + { + "epoch": 1.04034423828125, + "grad_norm": 34.45158767700195, + "learning_rate": 8.42606018217414e-06, + "loss": 4.8617, + "step": 51135 + }, + { + "epoch": 1.0404459635416667, + "grad_norm": 19.924816131591797, + "learning_rate": 8.425769068698018e-06, + "loss": 4.9505, + "step": 51140 + }, + { + "epoch": 1.0405476888020833, + "grad_norm": 21.83172607421875, + "learning_rate": 8.425477933332316e-06, + "loss": 4.9659, + "step": 51145 + }, + { + "epoch": 1.0406494140625, + "grad_norm": 28.883729934692383, + "learning_rate": 8.425186776078898e-06, + "loss": 5.0396, + "step": 51150 + }, + { + "epoch": 1.0407511393229167, + "grad_norm": 13.926584243774414, + "learning_rate": 8.424895596939625e-06, + "loss": 5.1241, + "step": 51155 + }, + { + "epoch": 1.0408528645833333, + "grad_norm": 25.59903907775879, + "learning_rate": 8.424604395916356e-06, + "loss": 4.9247, + "step": 51160 + }, + { + "epoch": 1.04095458984375, + "grad_norm": 13.334944725036621, + "learning_rate": 8.424313173010952e-06, + "loss": 4.9869, + "step": 51165 + }, + { + "epoch": 1.0410563151041667, + "grad_norm": 19.2703914642334, + "learning_rate": 8.424021928225272e-06, + "loss": 4.821, + "step": 51170 + }, + { + "epoch": 1.0411580403645833, + "grad_norm": 18.056549072265625, + "learning_rate": 8.423730661561183e-06, + "loss": 5.0095, + "step": 51175 + }, + { + "epoch": 1.041259765625, + "grad_norm": 14.570122718811035, + "learning_rate": 8.423439373020538e-06, + "loss": 4.8369, + "step": 51180 + }, + { + "epoch": 1.0413614908854167, + "grad_norm": 21.179500579833984, + "learning_rate": 8.423148062605205e-06, + "loss": 4.8713, + "step": 51185 + }, + { + "epoch": 1.0414632161458333, + "grad_norm": 27.539384841918945, + "learning_rate": 8.42285673031704e-06, + "loss": 4.7654, + "step": 51190 + }, + { + "epoch": 1.04156494140625, + "grad_norm": 15.907326698303223, + "learning_rate": 8.422565376157907e-06, + "loss": 4.8158, + "step": 51195 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 18.924724578857422, + "learning_rate": 8.422274000129668e-06, + "loss": 4.8443, + "step": 51200 + }, + { + "epoch": 1.0417683919270833, + "grad_norm": 19.93748664855957, + "learning_rate": 8.421982602234187e-06, + "loss": 5.0484, + "step": 51205 + }, + { + "epoch": 1.0418701171875, + "grad_norm": 15.952412605285645, + "learning_rate": 8.421691182473321e-06, + "loss": 4.8629, + "step": 51210 + }, + { + "epoch": 1.0419718424479167, + "grad_norm": 19.057985305786133, + "learning_rate": 8.421399740848935e-06, + "loss": 4.9658, + "step": 51215 + }, + { + "epoch": 1.0420735677083333, + "grad_norm": 14.311602592468262, + "learning_rate": 8.421108277362893e-06, + "loss": 4.9541, + "step": 51220 + }, + { + "epoch": 1.04217529296875, + "grad_norm": 20.31086540222168, + "learning_rate": 8.420816792017052e-06, + "loss": 5.1587, + "step": 51225 + }, + { + "epoch": 1.0422770182291667, + "grad_norm": 16.410097122192383, + "learning_rate": 8.420525284813279e-06, + "loss": 4.9749, + "step": 51230 + }, + { + "epoch": 1.0423787434895833, + "grad_norm": 14.963788032531738, + "learning_rate": 8.420233755753436e-06, + "loss": 4.9818, + "step": 51235 + }, + { + "epoch": 1.04248046875, + "grad_norm": 21.170513153076172, + "learning_rate": 8.419942204839382e-06, + "loss": 4.8636, + "step": 51240 + }, + { + "epoch": 1.0425821940104167, + "grad_norm": 21.15716552734375, + "learning_rate": 8.419650632072987e-06, + "loss": 5.0065, + "step": 51245 + }, + { + "epoch": 1.0426839192708333, + "grad_norm": 15.931116104125977, + "learning_rate": 8.419359037456108e-06, + "loss": 4.5675, + "step": 51250 + }, + { + "epoch": 1.04278564453125, + "grad_norm": 17.318872451782227, + "learning_rate": 8.41906742099061e-06, + "loss": 5.0346, + "step": 51255 + }, + { + "epoch": 1.0428873697916667, + "grad_norm": 18.92283821105957, + "learning_rate": 8.418775782678357e-06, + "loss": 4.858, + "step": 51260 + }, + { + "epoch": 1.0429890950520833, + "grad_norm": 14.288225173950195, + "learning_rate": 8.41848412252121e-06, + "loss": 4.883, + "step": 51265 + }, + { + "epoch": 1.0430908203125, + "grad_norm": 32.15504455566406, + "learning_rate": 8.418192440521038e-06, + "loss": 5.0312, + "step": 51270 + }, + { + "epoch": 1.0431925455729167, + "grad_norm": 16.640178680419922, + "learning_rate": 8.417900736679699e-06, + "loss": 4.7846, + "step": 51275 + }, + { + "epoch": 1.0432942708333333, + "grad_norm": 14.491278648376465, + "learning_rate": 8.41760901099906e-06, + "loss": 4.9168, + "step": 51280 + }, + { + "epoch": 1.04339599609375, + "grad_norm": 20.5165958404541, + "learning_rate": 8.417317263480985e-06, + "loss": 4.9299, + "step": 51285 + }, + { + "epoch": 1.0434977213541667, + "grad_norm": 20.225248336791992, + "learning_rate": 8.417025494127337e-06, + "loss": 4.6735, + "step": 51290 + }, + { + "epoch": 1.0435994466145833, + "grad_norm": 23.539846420288086, + "learning_rate": 8.41673370293998e-06, + "loss": 5.0271, + "step": 51295 + }, + { + "epoch": 1.043701171875, + "grad_norm": 14.71236515045166, + "learning_rate": 8.41644188992078e-06, + "loss": 4.965, + "step": 51300 + }, + { + "epoch": 1.0438028971354167, + "grad_norm": 20.470212936401367, + "learning_rate": 8.416150055071601e-06, + "loss": 4.8814, + "step": 51305 + }, + { + "epoch": 1.0439046223958333, + "grad_norm": 14.132301330566406, + "learning_rate": 8.415858198394307e-06, + "loss": 4.9868, + "step": 51310 + }, + { + "epoch": 1.04400634765625, + "grad_norm": 21.407909393310547, + "learning_rate": 8.415566319890764e-06, + "loss": 5.0466, + "step": 51315 + }, + { + "epoch": 1.0441080729166667, + "grad_norm": 15.080955505371094, + "learning_rate": 8.415274419562837e-06, + "loss": 5.0179, + "step": 51320 + }, + { + "epoch": 1.0442097981770833, + "grad_norm": 20.033266067504883, + "learning_rate": 8.41498249741239e-06, + "loss": 4.9988, + "step": 51325 + }, + { + "epoch": 1.0443115234375, + "grad_norm": 16.794097900390625, + "learning_rate": 8.414690553441287e-06, + "loss": 5.0315, + "step": 51330 + }, + { + "epoch": 1.0444132486979167, + "grad_norm": 19.305082321166992, + "learning_rate": 8.414398587651397e-06, + "loss": 5.0571, + "step": 51335 + }, + { + "epoch": 1.0445149739583333, + "grad_norm": 19.973777770996094, + "learning_rate": 8.414106600044585e-06, + "loss": 4.8417, + "step": 51340 + }, + { + "epoch": 1.04461669921875, + "grad_norm": 22.50524139404297, + "learning_rate": 8.413814590622715e-06, + "loss": 4.9694, + "step": 51345 + }, + { + "epoch": 1.0447184244791667, + "grad_norm": 20.302156448364258, + "learning_rate": 8.413522559387653e-06, + "loss": 4.9327, + "step": 51350 + }, + { + "epoch": 1.0448201497395833, + "grad_norm": 16.270610809326172, + "learning_rate": 8.413230506341266e-06, + "loss": 5.2979, + "step": 51355 + }, + { + "epoch": 1.044921875, + "grad_norm": 47.70444107055664, + "learning_rate": 8.412938431485418e-06, + "loss": 5.0079, + "step": 51360 + }, + { + "epoch": 1.0450236002604167, + "grad_norm": 14.860913276672363, + "learning_rate": 8.412646334821979e-06, + "loss": 4.8271, + "step": 51365 + }, + { + "epoch": 1.0451253255208333, + "grad_norm": 14.140523910522461, + "learning_rate": 8.412354216352811e-06, + "loss": 4.9433, + "step": 51370 + }, + { + "epoch": 1.04522705078125, + "grad_norm": 14.622159004211426, + "learning_rate": 8.412062076079786e-06, + "loss": 4.6248, + "step": 51375 + }, + { + "epoch": 1.0453287760416667, + "grad_norm": 22.18695068359375, + "learning_rate": 8.411769914004764e-06, + "loss": 4.9458, + "step": 51380 + }, + { + "epoch": 1.0454305013020833, + "grad_norm": 16.85041618347168, + "learning_rate": 8.411477730129618e-06, + "loss": 4.912, + "step": 51385 + }, + { + "epoch": 1.0455322265625, + "grad_norm": 14.845355987548828, + "learning_rate": 8.41118552445621e-06, + "loss": 4.9049, + "step": 51390 + }, + { + "epoch": 1.0456339518229167, + "grad_norm": 16.842660903930664, + "learning_rate": 8.41089329698641e-06, + "loss": 5.036, + "step": 51395 + }, + { + "epoch": 1.0457356770833333, + "grad_norm": 14.967358589172363, + "learning_rate": 8.410601047722085e-06, + "loss": 4.74, + "step": 51400 + }, + { + "epoch": 1.04583740234375, + "grad_norm": 13.1415376663208, + "learning_rate": 8.4103087766651e-06, + "loss": 4.9031, + "step": 51405 + }, + { + "epoch": 1.0459391276041667, + "grad_norm": 23.194887161254883, + "learning_rate": 8.410016483817328e-06, + "loss": 4.95, + "step": 51410 + }, + { + "epoch": 1.0460408528645833, + "grad_norm": 14.474971771240234, + "learning_rate": 8.409724169180632e-06, + "loss": 4.7696, + "step": 51415 + }, + { + "epoch": 1.046142578125, + "grad_norm": 16.321502685546875, + "learning_rate": 8.40943183275688e-06, + "loss": 4.722, + "step": 51420 + }, + { + "epoch": 1.0462443033854167, + "grad_norm": 18.59343147277832, + "learning_rate": 8.409139474547943e-06, + "loss": 4.8704, + "step": 51425 + }, + { + "epoch": 1.0463460286458333, + "grad_norm": 12.829129219055176, + "learning_rate": 8.408847094555684e-06, + "loss": 4.7954, + "step": 51430 + }, + { + "epoch": 1.04644775390625, + "grad_norm": 22.280628204345703, + "learning_rate": 8.408554692781976e-06, + "loss": 4.9249, + "step": 51435 + }, + { + "epoch": 1.0465494791666667, + "grad_norm": 27.994726181030273, + "learning_rate": 8.408262269228685e-06, + "loss": 5.0595, + "step": 51440 + }, + { + "epoch": 1.0466512044270833, + "grad_norm": 14.197497367858887, + "learning_rate": 8.407969823897679e-06, + "loss": 5.1823, + "step": 51445 + }, + { + "epoch": 1.0467529296875, + "grad_norm": 19.345605850219727, + "learning_rate": 8.40767735679083e-06, + "loss": 5.0187, + "step": 51450 + }, + { + "epoch": 1.0468546549479167, + "grad_norm": 17.535341262817383, + "learning_rate": 8.407384867910002e-06, + "loss": 4.9799, + "step": 51455 + }, + { + "epoch": 1.0469563802083333, + "grad_norm": 22.594282150268555, + "learning_rate": 8.407092357257067e-06, + "loss": 4.7381, + "step": 51460 + }, + { + "epoch": 1.04705810546875, + "grad_norm": 24.91754913330078, + "learning_rate": 8.406799824833895e-06, + "loss": 4.9382, + "step": 51465 + }, + { + "epoch": 1.0471598307291667, + "grad_norm": 14.685212135314941, + "learning_rate": 8.406507270642352e-06, + "loss": 4.8605, + "step": 51470 + }, + { + "epoch": 1.0472615559895833, + "grad_norm": 23.338634490966797, + "learning_rate": 8.40621469468431e-06, + "loss": 5.0842, + "step": 51475 + }, + { + "epoch": 1.04736328125, + "grad_norm": 19.782182693481445, + "learning_rate": 8.405922096961636e-06, + "loss": 5.1022, + "step": 51480 + }, + { + "epoch": 1.0474650065104167, + "grad_norm": 18.66299057006836, + "learning_rate": 8.405629477476203e-06, + "loss": 5.2343, + "step": 51485 + }, + { + "epoch": 1.0475667317708333, + "grad_norm": 17.711740493774414, + "learning_rate": 8.405336836229878e-06, + "loss": 4.8427, + "step": 51490 + }, + { + "epoch": 1.04766845703125, + "grad_norm": 16.559680938720703, + "learning_rate": 8.40504417322453e-06, + "loss": 4.9188, + "step": 51495 + }, + { + "epoch": 1.0477701822916667, + "grad_norm": 20.137840270996094, + "learning_rate": 8.404751488462033e-06, + "loss": 5.2228, + "step": 51500 + }, + { + "epoch": 1.0478719075520833, + "grad_norm": 18.648595809936523, + "learning_rate": 8.404458781944253e-06, + "loss": 4.9708, + "step": 51505 + }, + { + "epoch": 1.0479736328125, + "grad_norm": 19.82358169555664, + "learning_rate": 8.404166053673061e-06, + "loss": 4.7063, + "step": 51510 + }, + { + "epoch": 1.0480753580729167, + "grad_norm": 18.81534767150879, + "learning_rate": 8.403873303650331e-06, + "loss": 4.8537, + "step": 51515 + }, + { + "epoch": 1.0481770833333333, + "grad_norm": 16.597074508666992, + "learning_rate": 8.403580531877931e-06, + "loss": 5.1318, + "step": 51520 + }, + { + "epoch": 1.04827880859375, + "grad_norm": 19.82201385498047, + "learning_rate": 8.403287738357732e-06, + "loss": 4.9348, + "step": 51525 + }, + { + "epoch": 1.0483805338541667, + "grad_norm": 15.05232048034668, + "learning_rate": 8.402994923091603e-06, + "loss": 5.0026, + "step": 51530 + }, + { + "epoch": 1.0484822591145833, + "grad_norm": 13.191256523132324, + "learning_rate": 8.402702086081418e-06, + "loss": 4.9173, + "step": 51535 + }, + { + "epoch": 1.048583984375, + "grad_norm": 16.67408561706543, + "learning_rate": 8.402409227329046e-06, + "loss": 4.9136, + "step": 51540 + }, + { + "epoch": 1.0486857096354167, + "grad_norm": 16.325523376464844, + "learning_rate": 8.40211634683636e-06, + "loss": 4.9997, + "step": 51545 + }, + { + "epoch": 1.0487874348958333, + "grad_norm": 17.050426483154297, + "learning_rate": 8.40182344460523e-06, + "loss": 4.9099, + "step": 51550 + }, + { + "epoch": 1.04888916015625, + "grad_norm": 16.506839752197266, + "learning_rate": 8.401530520637528e-06, + "loss": 4.9522, + "step": 51555 + }, + { + "epoch": 1.0489908854166667, + "grad_norm": 15.787609100341797, + "learning_rate": 8.401237574935125e-06, + "loss": 4.7283, + "step": 51560 + }, + { + "epoch": 1.0490926106770833, + "grad_norm": 17.147640228271484, + "learning_rate": 8.400944607499895e-06, + "loss": 4.9571, + "step": 51565 + }, + { + "epoch": 1.0491943359375, + "grad_norm": 21.619813919067383, + "learning_rate": 8.400651618333708e-06, + "loss": 4.8387, + "step": 51570 + }, + { + "epoch": 1.0492960611979167, + "grad_norm": 18.29776382446289, + "learning_rate": 8.400358607438436e-06, + "loss": 4.9914, + "step": 51575 + }, + { + "epoch": 1.0493977864583333, + "grad_norm": 22.173328399658203, + "learning_rate": 8.400065574815954e-06, + "loss": 4.8493, + "step": 51580 + }, + { + "epoch": 1.04949951171875, + "grad_norm": 20.994218826293945, + "learning_rate": 8.39977252046813e-06, + "loss": 5.0908, + "step": 51585 + }, + { + "epoch": 1.0496012369791667, + "grad_norm": 18.3663272857666, + "learning_rate": 8.399479444396841e-06, + "loss": 4.8148, + "step": 51590 + }, + { + "epoch": 1.0497029622395833, + "grad_norm": 17.371234893798828, + "learning_rate": 8.399186346603955e-06, + "loss": 4.8422, + "step": 51595 + }, + { + "epoch": 1.0498046875, + "grad_norm": 20.39690399169922, + "learning_rate": 8.398893227091349e-06, + "loss": 4.8414, + "step": 51600 + }, + { + "epoch": 1.0499064127604167, + "grad_norm": 17.385496139526367, + "learning_rate": 8.398600085860894e-06, + "loss": 4.7991, + "step": 51605 + }, + { + "epoch": 1.0500081380208333, + "grad_norm": 22.03078842163086, + "learning_rate": 8.398306922914462e-06, + "loss": 5.1643, + "step": 51610 + }, + { + "epoch": 1.05010986328125, + "grad_norm": 18.889741897583008, + "learning_rate": 8.398013738253929e-06, + "loss": 5.193, + "step": 51615 + }, + { + "epoch": 1.0502115885416667, + "grad_norm": 13.280324935913086, + "learning_rate": 8.397720531881167e-06, + "loss": 4.9452, + "step": 51620 + }, + { + "epoch": 1.0503133138020833, + "grad_norm": 19.967634201049805, + "learning_rate": 8.397427303798049e-06, + "loss": 5.1107, + "step": 51625 + }, + { + "epoch": 1.0504150390625, + "grad_norm": 15.976757049560547, + "learning_rate": 8.397134054006449e-06, + "loss": 5.2296, + "step": 51630 + }, + { + "epoch": 1.0505167643229167, + "grad_norm": 13.320701599121094, + "learning_rate": 8.39684078250824e-06, + "loss": 4.7131, + "step": 51635 + }, + { + "epoch": 1.0506184895833333, + "grad_norm": 19.345500946044922, + "learning_rate": 8.396547489305298e-06, + "loss": 4.8732, + "step": 51640 + }, + { + "epoch": 1.05072021484375, + "grad_norm": 15.845565795898438, + "learning_rate": 8.396254174399495e-06, + "loss": 4.935, + "step": 51645 + }, + { + "epoch": 1.0508219401041667, + "grad_norm": 14.881945610046387, + "learning_rate": 8.395960837792707e-06, + "loss": 5.0099, + "step": 51650 + }, + { + "epoch": 1.0509236653645833, + "grad_norm": 20.443819046020508, + "learning_rate": 8.395667479486806e-06, + "loss": 4.9191, + "step": 51655 + }, + { + "epoch": 1.051025390625, + "grad_norm": 15.610960960388184, + "learning_rate": 8.395374099483667e-06, + "loss": 4.9042, + "step": 51660 + }, + { + "epoch": 1.0511271158854167, + "grad_norm": 16.011571884155273, + "learning_rate": 8.395080697785167e-06, + "loss": 5.3431, + "step": 51665 + }, + { + "epoch": 1.0512288411458333, + "grad_norm": 12.847662925720215, + "learning_rate": 8.39478727439318e-06, + "loss": 5.0418, + "step": 51670 + }, + { + "epoch": 1.05133056640625, + "grad_norm": 14.530843734741211, + "learning_rate": 8.394493829309576e-06, + "loss": 4.8615, + "step": 51675 + }, + { + "epoch": 1.0514322916666667, + "grad_norm": 14.082489013671875, + "learning_rate": 8.394200362536237e-06, + "loss": 5.3197, + "step": 51680 + }, + { + "epoch": 1.0515340169270833, + "grad_norm": 15.555440902709961, + "learning_rate": 8.393906874075036e-06, + "loss": 5.1008, + "step": 51685 + }, + { + "epoch": 1.0516357421875, + "grad_norm": 17.85211753845215, + "learning_rate": 8.393613363927846e-06, + "loss": 4.5648, + "step": 51690 + }, + { + "epoch": 1.0517374674479167, + "grad_norm": 14.728729248046875, + "learning_rate": 8.393319832096543e-06, + "loss": 4.7637, + "step": 51695 + }, + { + "epoch": 1.0518391927083333, + "grad_norm": 19.88884735107422, + "learning_rate": 8.393026278583004e-06, + "loss": 5.0315, + "step": 51700 + }, + { + "epoch": 1.05194091796875, + "grad_norm": 17.862802505493164, + "learning_rate": 8.392732703389105e-06, + "loss": 4.9542, + "step": 51705 + }, + { + "epoch": 1.0520426432291667, + "grad_norm": 13.805960655212402, + "learning_rate": 8.39243910651672e-06, + "loss": 5.1584, + "step": 51710 + }, + { + "epoch": 1.0521443684895833, + "grad_norm": 22.040206909179688, + "learning_rate": 8.392145487967726e-06, + "loss": 5.2641, + "step": 51715 + }, + { + "epoch": 1.05224609375, + "grad_norm": 15.900613784790039, + "learning_rate": 8.391851847744002e-06, + "loss": 5.2428, + "step": 51720 + }, + { + "epoch": 1.0523478190104167, + "grad_norm": 20.406999588012695, + "learning_rate": 8.391558185847419e-06, + "loss": 5.1062, + "step": 51725 + }, + { + "epoch": 1.0524495442708333, + "grad_norm": 21.446279525756836, + "learning_rate": 8.391264502279855e-06, + "loss": 4.7913, + "step": 51730 + }, + { + "epoch": 1.05255126953125, + "grad_norm": 17.389293670654297, + "learning_rate": 8.390970797043188e-06, + "loss": 4.8705, + "step": 51735 + }, + { + "epoch": 1.0526529947916667, + "grad_norm": 18.29146385192871, + "learning_rate": 8.390677070139295e-06, + "loss": 4.8715, + "step": 51740 + }, + { + "epoch": 1.0527547200520833, + "grad_norm": 19.361759185791016, + "learning_rate": 8.39038332157005e-06, + "loss": 4.8548, + "step": 51745 + }, + { + "epoch": 1.0528564453125, + "grad_norm": 16.82459831237793, + "learning_rate": 8.390089551337335e-06, + "loss": 4.9412, + "step": 51750 + }, + { + "epoch": 1.0529581705729167, + "grad_norm": 18.235952377319336, + "learning_rate": 8.38979575944302e-06, + "loss": 5.0382, + "step": 51755 + }, + { + "epoch": 1.0530598958333333, + "grad_norm": 19.573503494262695, + "learning_rate": 8.38950194588899e-06, + "loss": 4.952, + "step": 51760 + }, + { + "epoch": 1.05316162109375, + "grad_norm": 26.669750213623047, + "learning_rate": 8.389208110677116e-06, + "loss": 5.3011, + "step": 51765 + }, + { + "epoch": 1.0532633463541667, + "grad_norm": 15.658990859985352, + "learning_rate": 8.38891425380928e-06, + "loss": 4.9872, + "step": 51770 + }, + { + "epoch": 1.0533650716145833, + "grad_norm": 15.294325828552246, + "learning_rate": 8.388620375287355e-06, + "loss": 4.715, + "step": 51775 + }, + { + "epoch": 1.053466796875, + "grad_norm": 15.839608192443848, + "learning_rate": 8.388326475113223e-06, + "loss": 4.8891, + "step": 51780 + }, + { + "epoch": 1.0535685221354167, + "grad_norm": 20.881364822387695, + "learning_rate": 8.388032553288761e-06, + "loss": 4.9103, + "step": 51785 + }, + { + "epoch": 1.0536702473958333, + "grad_norm": 17.952911376953125, + "learning_rate": 8.387738609815846e-06, + "loss": 4.8603, + "step": 51790 + }, + { + "epoch": 1.05377197265625, + "grad_norm": 17.774446487426758, + "learning_rate": 8.387444644696357e-06, + "loss": 4.8106, + "step": 51795 + }, + { + "epoch": 1.0538736979166667, + "grad_norm": 16.377975463867188, + "learning_rate": 8.38715065793217e-06, + "loss": 4.9606, + "step": 51800 + }, + { + "epoch": 1.0539754231770833, + "grad_norm": 19.58979606628418, + "learning_rate": 8.386856649525168e-06, + "loss": 5.0099, + "step": 51805 + }, + { + "epoch": 1.0540771484375, + "grad_norm": 17.911026000976562, + "learning_rate": 8.386562619477225e-06, + "loss": 4.9813, + "step": 51810 + }, + { + "epoch": 1.0541788736979167, + "grad_norm": 16.306425094604492, + "learning_rate": 8.386268567790224e-06, + "loss": 5.1072, + "step": 51815 + }, + { + "epoch": 1.0542805989583333, + "grad_norm": 15.297706604003906, + "learning_rate": 8.38597449446604e-06, + "loss": 4.764, + "step": 51820 + }, + { + "epoch": 1.05438232421875, + "grad_norm": 14.606778144836426, + "learning_rate": 8.385680399506556e-06, + "loss": 4.8595, + "step": 51825 + }, + { + "epoch": 1.0544840494791667, + "grad_norm": 15.495267868041992, + "learning_rate": 8.385386282913647e-06, + "loss": 4.8975, + "step": 51830 + }, + { + "epoch": 1.0545857747395833, + "grad_norm": 22.768573760986328, + "learning_rate": 8.385092144689196e-06, + "loss": 4.8298, + "step": 51835 + }, + { + "epoch": 1.0546875, + "grad_norm": 19.292282104492188, + "learning_rate": 8.38479798483508e-06, + "loss": 4.9096, + "step": 51840 + }, + { + "epoch": 1.0547892252604167, + "grad_norm": 22.355724334716797, + "learning_rate": 8.384503803353178e-06, + "loss": 4.8594, + "step": 51845 + }, + { + "epoch": 1.0548909505208333, + "grad_norm": 12.661928176879883, + "learning_rate": 8.384209600245375e-06, + "loss": 4.8325, + "step": 51850 + }, + { + "epoch": 1.05499267578125, + "grad_norm": 18.84069061279297, + "learning_rate": 8.383915375513543e-06, + "loss": 4.9123, + "step": 51855 + }, + { + "epoch": 1.0550944010416667, + "grad_norm": 13.204700469970703, + "learning_rate": 8.383621129159568e-06, + "loss": 4.7595, + "step": 51860 + }, + { + "epoch": 1.0551961263020833, + "grad_norm": 25.290441513061523, + "learning_rate": 8.383326861185325e-06, + "loss": 4.7531, + "step": 51865 + }, + { + "epoch": 1.0552978515625, + "grad_norm": 15.09814167022705, + "learning_rate": 8.3830325715927e-06, + "loss": 4.8185, + "step": 51870 + }, + { + "epoch": 1.0553995768229167, + "grad_norm": 21.15576171875, + "learning_rate": 8.382738260383572e-06, + "loss": 4.9539, + "step": 51875 + }, + { + "epoch": 1.0555013020833333, + "grad_norm": 24.734621047973633, + "learning_rate": 8.382443927559818e-06, + "loss": 5.3611, + "step": 51880 + }, + { + "epoch": 1.05560302734375, + "grad_norm": 15.035401344299316, + "learning_rate": 8.382149573123322e-06, + "loss": 4.8023, + "step": 51885 + }, + { + "epoch": 1.0557047526041667, + "grad_norm": 21.385900497436523, + "learning_rate": 8.381855197075963e-06, + "loss": 4.955, + "step": 51890 + }, + { + "epoch": 1.0558064778645833, + "grad_norm": 16.47530746459961, + "learning_rate": 8.381560799419623e-06, + "loss": 5.1084, + "step": 51895 + }, + { + "epoch": 1.055908203125, + "grad_norm": 17.747377395629883, + "learning_rate": 8.381266380156185e-06, + "loss": 4.8937, + "step": 51900 + }, + { + "epoch": 1.0560099283854167, + "grad_norm": 16.994958877563477, + "learning_rate": 8.380971939287528e-06, + "loss": 4.9189, + "step": 51905 + }, + { + "epoch": 1.0561116536458333, + "grad_norm": 18.547731399536133, + "learning_rate": 8.38067747681553e-06, + "loss": 5.0142, + "step": 51910 + }, + { + "epoch": 1.05621337890625, + "grad_norm": 16.69068717956543, + "learning_rate": 8.38038299274208e-06, + "loss": 4.9188, + "step": 51915 + }, + { + "epoch": 1.0563151041666667, + "grad_norm": 18.9122371673584, + "learning_rate": 8.380088487069054e-06, + "loss": 5.0002, + "step": 51920 + }, + { + "epoch": 1.0564168294270833, + "grad_norm": 17.23124122619629, + "learning_rate": 8.379793959798335e-06, + "loss": 4.947, + "step": 51925 + }, + { + "epoch": 1.0565185546875, + "grad_norm": 15.271464347839355, + "learning_rate": 8.379499410931806e-06, + "loss": 4.9728, + "step": 51930 + }, + { + "epoch": 1.0566202799479167, + "grad_norm": 18.204235076904297, + "learning_rate": 8.37920484047135e-06, + "loss": 4.9933, + "step": 51935 + }, + { + "epoch": 1.0567220052083333, + "grad_norm": 19.8541202545166, + "learning_rate": 8.378910248418845e-06, + "loss": 4.9515, + "step": 51940 + }, + { + "epoch": 1.05682373046875, + "grad_norm": 13.965619087219238, + "learning_rate": 8.378615634776178e-06, + "loss": 4.7683, + "step": 51945 + }, + { + "epoch": 1.0569254557291667, + "grad_norm": 13.326964378356934, + "learning_rate": 8.378320999545228e-06, + "loss": 5.0857, + "step": 51950 + }, + { + "epoch": 1.0570271809895833, + "grad_norm": 19.15492820739746, + "learning_rate": 8.378026342727882e-06, + "loss": 5.0065, + "step": 51955 + }, + { + "epoch": 1.05712890625, + "grad_norm": 20.07196617126465, + "learning_rate": 8.377731664326017e-06, + "loss": 5.1867, + "step": 51960 + }, + { + "epoch": 1.0572306315104167, + "grad_norm": 19.905611038208008, + "learning_rate": 8.37743696434152e-06, + "loss": 5.0331, + "step": 51965 + }, + { + "epoch": 1.0573323567708333, + "grad_norm": 15.04259967803955, + "learning_rate": 8.377142242776275e-06, + "loss": 5.0352, + "step": 51970 + }, + { + "epoch": 1.05743408203125, + "grad_norm": 14.270405769348145, + "learning_rate": 8.376847499632161e-06, + "loss": 4.9203, + "step": 51975 + }, + { + "epoch": 1.0575358072916667, + "grad_norm": 16.72032928466797, + "learning_rate": 8.376552734911065e-06, + "loss": 4.9681, + "step": 51980 + }, + { + "epoch": 1.0576375325520833, + "grad_norm": 15.83079719543457, + "learning_rate": 8.376257948614866e-06, + "loss": 5.0466, + "step": 51985 + }, + { + "epoch": 1.0577392578125, + "grad_norm": 19.952131271362305, + "learning_rate": 8.375963140745454e-06, + "loss": 5.1275, + "step": 51990 + }, + { + "epoch": 1.0578409830729167, + "grad_norm": 18.651966094970703, + "learning_rate": 8.375668311304706e-06, + "loss": 5.3154, + "step": 51995 + }, + { + "epoch": 1.0579427083333333, + "grad_norm": 13.046945571899414, + "learning_rate": 8.375373460294511e-06, + "loss": 4.9033, + "step": 52000 + }, + { + "epoch": 1.05804443359375, + "grad_norm": 16.05242347717285, + "learning_rate": 8.37507858771675e-06, + "loss": 4.9678, + "step": 52005 + }, + { + "epoch": 1.0581461588541667, + "grad_norm": 17.61176109313965, + "learning_rate": 8.37478369357331e-06, + "loss": 4.8333, + "step": 52010 + }, + { + "epoch": 1.0582478841145833, + "grad_norm": 20.5650634765625, + "learning_rate": 8.374488777866071e-06, + "loss": 5.0093, + "step": 52015 + }, + { + "epoch": 1.058349609375, + "grad_norm": 15.49235725402832, + "learning_rate": 8.374193840596923e-06, + "loss": 4.8124, + "step": 52020 + }, + { + "epoch": 1.0584513346354167, + "grad_norm": 17.756465911865234, + "learning_rate": 8.373898881767743e-06, + "loss": 5.1823, + "step": 52025 + }, + { + "epoch": 1.0585530598958333, + "grad_norm": 17.407806396484375, + "learning_rate": 8.373603901380425e-06, + "loss": 5.0562, + "step": 52030 + }, + { + "epoch": 1.05865478515625, + "grad_norm": 14.520013809204102, + "learning_rate": 8.373308899436846e-06, + "loss": 4.8651, + "step": 52035 + }, + { + "epoch": 1.0587565104166667, + "grad_norm": 17.2132511138916, + "learning_rate": 8.373013875938895e-06, + "loss": 4.9194, + "step": 52040 + }, + { + "epoch": 1.0588582356770833, + "grad_norm": 15.808133125305176, + "learning_rate": 8.372718830888455e-06, + "loss": 4.8243, + "step": 52045 + }, + { + "epoch": 1.0589599609375, + "grad_norm": 20.411691665649414, + "learning_rate": 8.372423764287414e-06, + "loss": 4.5881, + "step": 52050 + }, + { + "epoch": 1.0590616861979167, + "grad_norm": 17.9958438873291, + "learning_rate": 8.372128676137654e-06, + "loss": 5.0598, + "step": 52055 + }, + { + "epoch": 1.0591634114583333, + "grad_norm": 14.549033164978027, + "learning_rate": 8.371833566441063e-06, + "loss": 5.1456, + "step": 52060 + }, + { + "epoch": 1.05926513671875, + "grad_norm": 17.911895751953125, + "learning_rate": 8.371538435199526e-06, + "loss": 5.026, + "step": 52065 + }, + { + "epoch": 1.0593668619791667, + "grad_norm": 23.09992218017578, + "learning_rate": 8.37124328241493e-06, + "loss": 4.752, + "step": 52070 + }, + { + "epoch": 1.0594685872395833, + "grad_norm": 24.518543243408203, + "learning_rate": 8.370948108089159e-06, + "loss": 4.7015, + "step": 52075 + }, + { + "epoch": 1.0595703125, + "grad_norm": 19.425737380981445, + "learning_rate": 8.370652912224097e-06, + "loss": 4.9871, + "step": 52080 + }, + { + "epoch": 1.0596720377604167, + "grad_norm": 14.987024307250977, + "learning_rate": 8.370357694821636e-06, + "loss": 4.7497, + "step": 52085 + }, + { + "epoch": 1.0597737630208333, + "grad_norm": 19.11042022705078, + "learning_rate": 8.370062455883658e-06, + "loss": 4.9467, + "step": 52090 + }, + { + "epoch": 1.05987548828125, + "grad_norm": 16.27131462097168, + "learning_rate": 8.369767195412051e-06, + "loss": 4.6174, + "step": 52095 + }, + { + "epoch": 1.0599772135416667, + "grad_norm": 16.961498260498047, + "learning_rate": 8.3694719134087e-06, + "loss": 5.1207, + "step": 52100 + }, + { + "epoch": 1.0600789388020833, + "grad_norm": 11.847372055053711, + "learning_rate": 8.369176609875495e-06, + "loss": 4.8813, + "step": 52105 + }, + { + "epoch": 1.0601806640625, + "grad_norm": 21.216899871826172, + "learning_rate": 8.36888128481432e-06, + "loss": 5.0136, + "step": 52110 + }, + { + "epoch": 1.0602823893229167, + "grad_norm": 21.44137954711914, + "learning_rate": 8.368585938227063e-06, + "loss": 4.8891, + "step": 52115 + }, + { + "epoch": 1.0603841145833333, + "grad_norm": 19.54468536376953, + "learning_rate": 8.368290570115612e-06, + "loss": 5.0276, + "step": 52120 + }, + { + "epoch": 1.06048583984375, + "grad_norm": 17.11756706237793, + "learning_rate": 8.367995180481852e-06, + "loss": 4.9449, + "step": 52125 + }, + { + "epoch": 1.0605875651041667, + "grad_norm": 28.72748374938965, + "learning_rate": 8.367699769327673e-06, + "loss": 5.3103, + "step": 52130 + }, + { + "epoch": 1.0606892903645833, + "grad_norm": 15.79369831085205, + "learning_rate": 8.36740433665496e-06, + "loss": 5.0586, + "step": 52135 + }, + { + "epoch": 1.060791015625, + "grad_norm": 14.423141479492188, + "learning_rate": 8.367108882465604e-06, + "loss": 5.0992, + "step": 52140 + }, + { + "epoch": 1.0608927408854167, + "grad_norm": 21.273731231689453, + "learning_rate": 8.366813406761491e-06, + "loss": 4.9329, + "step": 52145 + }, + { + "epoch": 1.0609944661458333, + "grad_norm": 15.70129108428955, + "learning_rate": 8.366517909544509e-06, + "loss": 4.7951, + "step": 52150 + }, + { + "epoch": 1.06109619140625, + "grad_norm": 15.159929275512695, + "learning_rate": 8.366222390816545e-06, + "loss": 4.7708, + "step": 52155 + }, + { + "epoch": 1.0611979166666667, + "grad_norm": 20.671184539794922, + "learning_rate": 8.365926850579491e-06, + "loss": 4.8521, + "step": 52160 + }, + { + "epoch": 1.0612996419270833, + "grad_norm": 19.23441505432129, + "learning_rate": 8.365631288835231e-06, + "loss": 5.0035, + "step": 52165 + }, + { + "epoch": 1.0614013671875, + "grad_norm": 18.22010612487793, + "learning_rate": 8.365335705585655e-06, + "loss": 4.8738, + "step": 52170 + }, + { + "epoch": 1.0615030924479167, + "grad_norm": 17.662561416625977, + "learning_rate": 8.365040100832653e-06, + "loss": 4.9903, + "step": 52175 + }, + { + "epoch": 1.0616048177083333, + "grad_norm": 13.013257026672363, + "learning_rate": 8.364744474578112e-06, + "loss": 4.9217, + "step": 52180 + }, + { + "epoch": 1.06170654296875, + "grad_norm": 21.983749389648438, + "learning_rate": 8.364448826823924e-06, + "loss": 4.8499, + "step": 52185 + }, + { + "epoch": 1.0618082682291667, + "grad_norm": 20.840118408203125, + "learning_rate": 8.364153157571974e-06, + "loss": 4.9936, + "step": 52190 + }, + { + "epoch": 1.0619099934895833, + "grad_norm": 19.962804794311523, + "learning_rate": 8.363857466824156e-06, + "loss": 4.9965, + "step": 52195 + }, + { + "epoch": 1.06201171875, + "grad_norm": 17.089645385742188, + "learning_rate": 8.363561754582354e-06, + "loss": 4.9813, + "step": 52200 + }, + { + "epoch": 1.0621134440104167, + "grad_norm": 18.924684524536133, + "learning_rate": 8.363266020848461e-06, + "loss": 5.1388, + "step": 52205 + }, + { + "epoch": 1.0622151692708333, + "grad_norm": 24.14250946044922, + "learning_rate": 8.362970265624366e-06, + "loss": 5.1756, + "step": 52210 + }, + { + "epoch": 1.06231689453125, + "grad_norm": 18.960412979125977, + "learning_rate": 8.362674488911958e-06, + "loss": 4.7603, + "step": 52215 + }, + { + "epoch": 1.0624186197916667, + "grad_norm": 19.942611694335938, + "learning_rate": 8.362378690713127e-06, + "loss": 4.7432, + "step": 52220 + }, + { + "epoch": 1.0625203450520833, + "grad_norm": 21.75023651123047, + "learning_rate": 8.362082871029765e-06, + "loss": 4.9125, + "step": 52225 + }, + { + "epoch": 1.0626220703125, + "grad_norm": 16.705551147460938, + "learning_rate": 8.36178702986376e-06, + "loss": 4.9485, + "step": 52230 + }, + { + "epoch": 1.0627237955729167, + "grad_norm": 16.750282287597656, + "learning_rate": 8.361491167217002e-06, + "loss": 4.8783, + "step": 52235 + }, + { + "epoch": 1.0628255208333333, + "grad_norm": 15.983569145202637, + "learning_rate": 8.361195283091384e-06, + "loss": 4.8271, + "step": 52240 + }, + { + "epoch": 1.06292724609375, + "grad_norm": 18.85069465637207, + "learning_rate": 8.360899377488795e-06, + "loss": 5.2281, + "step": 52245 + }, + { + "epoch": 1.0630289713541667, + "grad_norm": 14.88808536529541, + "learning_rate": 8.360603450411126e-06, + "loss": 4.8027, + "step": 52250 + }, + { + "epoch": 1.0631306966145833, + "grad_norm": 23.33976936340332, + "learning_rate": 8.360307501860268e-06, + "loss": 5.0859, + "step": 52255 + }, + { + "epoch": 1.063232421875, + "grad_norm": 15.548725128173828, + "learning_rate": 8.360011531838111e-06, + "loss": 5.1503, + "step": 52260 + }, + { + "epoch": 1.0633341471354167, + "grad_norm": 13.376704216003418, + "learning_rate": 8.359715540346546e-06, + "loss": 4.9845, + "step": 52265 + }, + { + "epoch": 1.0634358723958333, + "grad_norm": 17.702255249023438, + "learning_rate": 8.359419527387465e-06, + "loss": 4.7401, + "step": 52270 + }, + { + "epoch": 1.06353759765625, + "grad_norm": 22.251789093017578, + "learning_rate": 8.359123492962762e-06, + "loss": 5.2987, + "step": 52275 + }, + { + "epoch": 1.0636393229166667, + "grad_norm": 16.086584091186523, + "learning_rate": 8.358827437074324e-06, + "loss": 4.9908, + "step": 52280 + }, + { + "epoch": 1.0637410481770833, + "grad_norm": 15.528725624084473, + "learning_rate": 8.358531359724047e-06, + "loss": 5.3025, + "step": 52285 + }, + { + "epoch": 1.0638427734375, + "grad_norm": 14.86195182800293, + "learning_rate": 8.35823526091382e-06, + "loss": 4.9911, + "step": 52290 + }, + { + "epoch": 1.0639444986979167, + "grad_norm": 14.832035064697266, + "learning_rate": 8.357939140645535e-06, + "loss": 4.9523, + "step": 52295 + }, + { + "epoch": 1.0640462239583333, + "grad_norm": 15.941704750061035, + "learning_rate": 8.357642998921085e-06, + "loss": 4.9631, + "step": 52300 + }, + { + "epoch": 1.06414794921875, + "grad_norm": 13.755990028381348, + "learning_rate": 8.357346835742362e-06, + "loss": 4.9176, + "step": 52305 + }, + { + "epoch": 1.0642496744791667, + "grad_norm": 17.42641258239746, + "learning_rate": 8.357050651111258e-06, + "loss": 4.972, + "step": 52310 + }, + { + "epoch": 1.0643513997395833, + "grad_norm": 18.84827423095703, + "learning_rate": 8.356754445029669e-06, + "loss": 5.1084, + "step": 52315 + }, + { + "epoch": 1.064453125, + "grad_norm": 26.369335174560547, + "learning_rate": 8.35645821749948e-06, + "loss": 5.0399, + "step": 52320 + }, + { + "epoch": 1.0645548502604167, + "grad_norm": 16.826377868652344, + "learning_rate": 8.356161968522593e-06, + "loss": 4.9524, + "step": 52325 + }, + { + "epoch": 1.0646565755208333, + "grad_norm": 14.586939811706543, + "learning_rate": 8.355865698100894e-06, + "loss": 4.9205, + "step": 52330 + }, + { + "epoch": 1.06475830078125, + "grad_norm": 22.11830711364746, + "learning_rate": 8.355569406236278e-06, + "loss": 4.8059, + "step": 52335 + }, + { + "epoch": 1.0648600260416667, + "grad_norm": 17.64168357849121, + "learning_rate": 8.35527309293064e-06, + "loss": 4.7268, + "step": 52340 + }, + { + "epoch": 1.0649617513020833, + "grad_norm": 18.400848388671875, + "learning_rate": 8.354976758185874e-06, + "loss": 5.106, + "step": 52345 + }, + { + "epoch": 1.0650634765625, + "grad_norm": 25.225719451904297, + "learning_rate": 8.354680402003868e-06, + "loss": 5.0323, + "step": 52350 + }, + { + "epoch": 1.0651652018229167, + "grad_norm": 14.784774780273438, + "learning_rate": 8.35438402438652e-06, + "loss": 5.1306, + "step": 52355 + }, + { + "epoch": 1.0652669270833333, + "grad_norm": 14.315848350524902, + "learning_rate": 8.354087625335722e-06, + "loss": 5.0358, + "step": 52360 + }, + { + "epoch": 1.06536865234375, + "grad_norm": 22.17462730407715, + "learning_rate": 8.353791204853372e-06, + "loss": 5.0819, + "step": 52365 + }, + { + "epoch": 1.0654703776041667, + "grad_norm": 14.11983585357666, + "learning_rate": 8.353494762941359e-06, + "loss": 5.012, + "step": 52370 + }, + { + "epoch": 1.0655721028645833, + "grad_norm": 25.1555233001709, + "learning_rate": 8.353198299601577e-06, + "loss": 4.9282, + "step": 52375 + }, + { + "epoch": 1.065673828125, + "grad_norm": 15.616385459899902, + "learning_rate": 8.352901814835926e-06, + "loss": 4.9819, + "step": 52380 + }, + { + "epoch": 1.0657755533854167, + "grad_norm": 20.738283157348633, + "learning_rate": 8.352605308646293e-06, + "loss": 4.6282, + "step": 52385 + }, + { + "epoch": 1.0658772786458333, + "grad_norm": 18.334911346435547, + "learning_rate": 8.35230878103458e-06, + "loss": 5.0894, + "step": 52390 + }, + { + "epoch": 1.06597900390625, + "grad_norm": 20.117294311523438, + "learning_rate": 8.352012232002675e-06, + "loss": 5.0641, + "step": 52395 + }, + { + "epoch": 1.0660807291666667, + "grad_norm": 15.087897300720215, + "learning_rate": 8.351715661552477e-06, + "loss": 4.8538, + "step": 52400 + }, + { + "epoch": 1.0661824544270833, + "grad_norm": 13.910131454467773, + "learning_rate": 8.35141906968588e-06, + "loss": 4.8979, + "step": 52405 + }, + { + "epoch": 1.0662841796875, + "grad_norm": 17.707014083862305, + "learning_rate": 8.35112245640478e-06, + "loss": 5.2931, + "step": 52410 + }, + { + "epoch": 1.0663859049479167, + "grad_norm": 20.144603729248047, + "learning_rate": 8.35082582171107e-06, + "loss": 4.7847, + "step": 52415 + }, + { + "epoch": 1.0664876302083333, + "grad_norm": 26.868303298950195, + "learning_rate": 8.350529165606647e-06, + "loss": 4.8397, + "step": 52420 + }, + { + "epoch": 1.06658935546875, + "grad_norm": 18.38013458251953, + "learning_rate": 8.350232488093406e-06, + "loss": 5.054, + "step": 52425 + }, + { + "epoch": 1.0666910807291667, + "grad_norm": 15.80346965789795, + "learning_rate": 8.349935789173245e-06, + "loss": 4.7403, + "step": 52430 + }, + { + "epoch": 1.0667928059895833, + "grad_norm": 15.684500694274902, + "learning_rate": 8.349639068848055e-06, + "loss": 4.8188, + "step": 52435 + }, + { + "epoch": 1.06689453125, + "grad_norm": 19.239885330200195, + "learning_rate": 8.349342327119736e-06, + "loss": 4.865, + "step": 52440 + }, + { + "epoch": 1.0669962565104167, + "grad_norm": 17.081602096557617, + "learning_rate": 8.349045563990182e-06, + "loss": 4.9813, + "step": 52445 + }, + { + "epoch": 1.0670979817708333, + "grad_norm": 15.36328125, + "learning_rate": 8.34874877946129e-06, + "loss": 4.827, + "step": 52450 + }, + { + "epoch": 1.06719970703125, + "grad_norm": 27.82834243774414, + "learning_rate": 8.348451973534957e-06, + "loss": 4.7512, + "step": 52455 + }, + { + "epoch": 1.0673014322916667, + "grad_norm": 15.943964958190918, + "learning_rate": 8.34815514621308e-06, + "loss": 4.944, + "step": 52460 + }, + { + "epoch": 1.0674031575520833, + "grad_norm": 18.83327865600586, + "learning_rate": 8.347858297497551e-06, + "loss": 4.7253, + "step": 52465 + }, + { + "epoch": 1.0675048828125, + "grad_norm": 15.94711685180664, + "learning_rate": 8.347561427390273e-06, + "loss": 4.7581, + "step": 52470 + }, + { + "epoch": 1.0676066080729167, + "grad_norm": 16.82832908630371, + "learning_rate": 8.34726453589314e-06, + "loss": 4.9453, + "step": 52475 + }, + { + "epoch": 1.0677083333333333, + "grad_norm": 12.77479362487793, + "learning_rate": 8.346967623008048e-06, + "loss": 5.0565, + "step": 52480 + }, + { + "epoch": 1.06781005859375, + "grad_norm": 19.737276077270508, + "learning_rate": 8.346670688736896e-06, + "loss": 5.0732, + "step": 52485 + }, + { + "epoch": 1.0679117838541667, + "grad_norm": 20.057802200317383, + "learning_rate": 8.346373733081581e-06, + "loss": 4.9617, + "step": 52490 + }, + { + "epoch": 1.0680135091145833, + "grad_norm": 20.1363525390625, + "learning_rate": 8.346076756044e-06, + "loss": 4.8752, + "step": 52495 + }, + { + "epoch": 1.068115234375, + "grad_norm": 15.813024520874023, + "learning_rate": 8.345779757626051e-06, + "loss": 4.6546, + "step": 52500 + }, + { + "epoch": 1.0682169596354167, + "grad_norm": 24.59486961364746, + "learning_rate": 8.34548273782963e-06, + "loss": 4.9009, + "step": 52505 + }, + { + "epoch": 1.0683186848958333, + "grad_norm": 24.568824768066406, + "learning_rate": 8.345185696656637e-06, + "loss": 4.9618, + "step": 52510 + }, + { + "epoch": 1.06842041015625, + "grad_norm": 18.086734771728516, + "learning_rate": 8.34488863410897e-06, + "loss": 5.052, + "step": 52515 + }, + { + "epoch": 1.0685221354166667, + "grad_norm": 16.852615356445312, + "learning_rate": 8.344591550188525e-06, + "loss": 4.9016, + "step": 52520 + }, + { + "epoch": 1.0686238606770833, + "grad_norm": 23.13271713256836, + "learning_rate": 8.344294444897202e-06, + "loss": 4.98, + "step": 52525 + }, + { + "epoch": 1.0687255859375, + "grad_norm": 16.574871063232422, + "learning_rate": 8.3439973182369e-06, + "loss": 4.9429, + "step": 52530 + }, + { + "epoch": 1.0688273111979167, + "grad_norm": 19.553115844726562, + "learning_rate": 8.343700170209519e-06, + "loss": 4.9908, + "step": 52535 + }, + { + "epoch": 1.0689290364583333, + "grad_norm": 12.475449562072754, + "learning_rate": 8.343403000816952e-06, + "loss": 5.2412, + "step": 52540 + }, + { + "epoch": 1.06903076171875, + "grad_norm": 16.80788803100586, + "learning_rate": 8.343105810061101e-06, + "loss": 4.8338, + "step": 52545 + }, + { + "epoch": 1.0691324869791667, + "grad_norm": 23.526437759399414, + "learning_rate": 8.342808597943867e-06, + "loss": 4.8517, + "step": 52550 + }, + { + "epoch": 1.0692342122395833, + "grad_norm": 18.2193603515625, + "learning_rate": 8.342511364467146e-06, + "loss": 5.0335, + "step": 52555 + }, + { + "epoch": 1.0693359375, + "grad_norm": 18.934249877929688, + "learning_rate": 8.342214109632838e-06, + "loss": 4.8553, + "step": 52560 + }, + { + "epoch": 1.0694376627604167, + "grad_norm": 17.229455947875977, + "learning_rate": 8.341916833442845e-06, + "loss": 4.8909, + "step": 52565 + }, + { + "epoch": 1.0695393880208333, + "grad_norm": 14.451704025268555, + "learning_rate": 8.341619535899062e-06, + "loss": 4.9737, + "step": 52570 + }, + { + "epoch": 1.06964111328125, + "grad_norm": 34.487606048583984, + "learning_rate": 8.341322217003391e-06, + "loss": 5.1715, + "step": 52575 + }, + { + "epoch": 1.0697428385416667, + "grad_norm": 18.8770751953125, + "learning_rate": 8.341024876757733e-06, + "loss": 5.0775, + "step": 52580 + }, + { + "epoch": 1.0698445638020833, + "grad_norm": 16.569732666015625, + "learning_rate": 8.340727515163987e-06, + "loss": 4.837, + "step": 52585 + }, + { + "epoch": 1.0699462890625, + "grad_norm": 14.637929916381836, + "learning_rate": 8.340430132224051e-06, + "loss": 4.9144, + "step": 52590 + }, + { + "epoch": 1.0700480143229167, + "grad_norm": 17.00057029724121, + "learning_rate": 8.340132727939828e-06, + "loss": 5.2912, + "step": 52595 + }, + { + "epoch": 1.0701497395833333, + "grad_norm": 15.071603775024414, + "learning_rate": 8.339835302313218e-06, + "loss": 4.7389, + "step": 52600 + }, + { + "epoch": 1.07025146484375, + "grad_norm": 18.225177764892578, + "learning_rate": 8.339537855346118e-06, + "loss": 5.1136, + "step": 52605 + }, + { + "epoch": 1.0703531901041667, + "grad_norm": 18.357547760009766, + "learning_rate": 8.339240387040432e-06, + "loss": 5.0667, + "step": 52610 + }, + { + "epoch": 1.0704549153645833, + "grad_norm": 15.171685218811035, + "learning_rate": 8.33894289739806e-06, + "loss": 4.9274, + "step": 52615 + }, + { + "epoch": 1.070556640625, + "grad_norm": 16.522729873657227, + "learning_rate": 8.338645386420904e-06, + "loss": 4.9174, + "step": 52620 + }, + { + "epoch": 1.0706583658854167, + "grad_norm": 17.781707763671875, + "learning_rate": 8.338347854110864e-06, + "loss": 4.9971, + "step": 52625 + }, + { + "epoch": 1.0707600911458333, + "grad_norm": 22.613920211791992, + "learning_rate": 8.33805030046984e-06, + "loss": 4.9603, + "step": 52630 + }, + { + "epoch": 1.07086181640625, + "grad_norm": 18.841022491455078, + "learning_rate": 8.337752725499735e-06, + "loss": 4.9195, + "step": 52635 + }, + { + "epoch": 1.0709635416666667, + "grad_norm": 16.42896270751953, + "learning_rate": 8.337455129202448e-06, + "loss": 5.1656, + "step": 52640 + }, + { + "epoch": 1.0710652669270833, + "grad_norm": 18.55830955505371, + "learning_rate": 8.337157511579883e-06, + "loss": 4.7384, + "step": 52645 + }, + { + "epoch": 1.0711669921875, + "grad_norm": 18.3997802734375, + "learning_rate": 8.336859872633943e-06, + "loss": 5.0362, + "step": 52650 + }, + { + "epoch": 1.0712687174479167, + "grad_norm": 16.5651798248291, + "learning_rate": 8.336562212366525e-06, + "loss": 5.0197, + "step": 52655 + }, + { + "epoch": 1.0713704427083333, + "grad_norm": 20.300649642944336, + "learning_rate": 8.336264530779534e-06, + "loss": 5.3308, + "step": 52660 + }, + { + "epoch": 1.07147216796875, + "grad_norm": 14.641271591186523, + "learning_rate": 8.335966827874871e-06, + "loss": 4.8294, + "step": 52665 + }, + { + "epoch": 1.0715738932291667, + "grad_norm": 15.016786575317383, + "learning_rate": 8.33566910365444e-06, + "loss": 4.8603, + "step": 52670 + }, + { + "epoch": 1.0716756184895833, + "grad_norm": 18.683717727661133, + "learning_rate": 8.335371358120142e-06, + "loss": 4.9769, + "step": 52675 + }, + { + "epoch": 1.07177734375, + "grad_norm": 19.22532844543457, + "learning_rate": 8.335073591273882e-06, + "loss": 4.9554, + "step": 52680 + }, + { + "epoch": 1.0718790690104167, + "grad_norm": 16.830076217651367, + "learning_rate": 8.334775803117557e-06, + "loss": 5.1659, + "step": 52685 + }, + { + "epoch": 1.0719807942708333, + "grad_norm": 16.881214141845703, + "learning_rate": 8.334477993653075e-06, + "loss": 5.0098, + "step": 52690 + }, + { + "epoch": 1.07208251953125, + "grad_norm": 13.337115287780762, + "learning_rate": 8.334180162882338e-06, + "loss": 5.0054, + "step": 52695 + }, + { + "epoch": 1.0721842447916667, + "grad_norm": 22.819440841674805, + "learning_rate": 8.333882310807246e-06, + "loss": 5.1046, + "step": 52700 + }, + { + "epoch": 1.0722859700520833, + "grad_norm": 21.21221160888672, + "learning_rate": 8.333584437429705e-06, + "loss": 4.6411, + "step": 52705 + }, + { + "epoch": 1.0723876953125, + "grad_norm": 16.765146255493164, + "learning_rate": 8.333286542751618e-06, + "loss": 4.8228, + "step": 52710 + }, + { + "epoch": 1.0724894205729167, + "grad_norm": 15.022711753845215, + "learning_rate": 8.33298862677489e-06, + "loss": 4.8747, + "step": 52715 + }, + { + "epoch": 1.0725911458333333, + "grad_norm": 15.218716621398926, + "learning_rate": 8.332690689501421e-06, + "loss": 5.0852, + "step": 52720 + }, + { + "epoch": 1.07269287109375, + "grad_norm": 20.45473289489746, + "learning_rate": 8.332392730933114e-06, + "loss": 5.1455, + "step": 52725 + }, + { + "epoch": 1.0727945963541667, + "grad_norm": 14.909536361694336, + "learning_rate": 8.332094751071878e-06, + "loss": 4.7525, + "step": 52730 + }, + { + "epoch": 1.0728963216145833, + "grad_norm": 15.559723854064941, + "learning_rate": 8.331796749919615e-06, + "loss": 4.74, + "step": 52735 + }, + { + "epoch": 1.072998046875, + "grad_norm": 22.40888214111328, + "learning_rate": 8.331498727478227e-06, + "loss": 5.0329, + "step": 52740 + }, + { + "epoch": 1.0730997721354167, + "grad_norm": 15.284919738769531, + "learning_rate": 8.33120068374962e-06, + "loss": 4.9765, + "step": 52745 + }, + { + "epoch": 1.0732014973958333, + "grad_norm": 14.623363494873047, + "learning_rate": 8.330902618735698e-06, + "loss": 4.9019, + "step": 52750 + }, + { + "epoch": 1.07330322265625, + "grad_norm": 16.99552345275879, + "learning_rate": 8.330604532438367e-06, + "loss": 4.7641, + "step": 52755 + }, + { + "epoch": 1.0734049479166667, + "grad_norm": 14.758151054382324, + "learning_rate": 8.33030642485953e-06, + "loss": 4.7725, + "step": 52760 + }, + { + "epoch": 1.0735066731770833, + "grad_norm": 21.96367835998535, + "learning_rate": 8.330008296001088e-06, + "loss": 5.0908, + "step": 52765 + }, + { + "epoch": 1.0736083984375, + "grad_norm": 15.260730743408203, + "learning_rate": 8.329710145864955e-06, + "loss": 4.9238, + "step": 52770 + }, + { + "epoch": 1.0737101236979167, + "grad_norm": 15.7594633102417, + "learning_rate": 8.329411974453029e-06, + "loss": 4.7901, + "step": 52775 + }, + { + "epoch": 1.0738118489583333, + "grad_norm": 15.22178840637207, + "learning_rate": 8.329113781767217e-06, + "loss": 4.8889, + "step": 52780 + }, + { + "epoch": 1.07391357421875, + "grad_norm": 16.14510726928711, + "learning_rate": 8.328815567809427e-06, + "loss": 4.938, + "step": 52785 + }, + { + "epoch": 1.0740152994791667, + "grad_norm": 14.612584114074707, + "learning_rate": 8.32851733258156e-06, + "loss": 4.9461, + "step": 52790 + }, + { + "epoch": 1.0741170247395833, + "grad_norm": 18.162553787231445, + "learning_rate": 8.328219076085524e-06, + "loss": 4.9927, + "step": 52795 + }, + { + "epoch": 1.07421875, + "grad_norm": 16.264142990112305, + "learning_rate": 8.327920798323225e-06, + "loss": 4.9199, + "step": 52800 + }, + { + "epoch": 1.0743204752604167, + "grad_norm": 17.85926055908203, + "learning_rate": 8.327622499296571e-06, + "loss": 4.9551, + "step": 52805 + }, + { + "epoch": 1.0744222005208333, + "grad_norm": 17.59687614440918, + "learning_rate": 8.327324179007464e-06, + "loss": 5.002, + "step": 52810 + }, + { + "epoch": 1.07452392578125, + "grad_norm": 15.091007232666016, + "learning_rate": 8.327025837457809e-06, + "loss": 4.9797, + "step": 52815 + }, + { + "epoch": 1.0746256510416667, + "grad_norm": 20.193613052368164, + "learning_rate": 8.326727474649518e-06, + "loss": 5.1007, + "step": 52820 + }, + { + "epoch": 1.0747273763020833, + "grad_norm": 15.221306800842285, + "learning_rate": 8.326429090584493e-06, + "loss": 4.9162, + "step": 52825 + }, + { + "epoch": 1.0748291015625, + "grad_norm": 20.44710922241211, + "learning_rate": 8.326130685264644e-06, + "loss": 4.9982, + "step": 52830 + }, + { + "epoch": 1.0749308268229167, + "grad_norm": 18.594038009643555, + "learning_rate": 8.325832258691873e-06, + "loss": 4.9898, + "step": 52835 + }, + { + "epoch": 1.0750325520833333, + "grad_norm": 16.31365394592285, + "learning_rate": 8.325533810868092e-06, + "loss": 5.1293, + "step": 52840 + }, + { + "epoch": 1.07513427734375, + "grad_norm": 19.25638198852539, + "learning_rate": 8.325235341795204e-06, + "loss": 4.8989, + "step": 52845 + }, + { + "epoch": 1.0752360026041667, + "grad_norm": 16.407440185546875, + "learning_rate": 8.324936851475119e-06, + "loss": 5.083, + "step": 52850 + }, + { + "epoch": 1.0753377278645833, + "grad_norm": 18.14568328857422, + "learning_rate": 8.324638339909741e-06, + "loss": 5.0268, + "step": 52855 + }, + { + "epoch": 1.075439453125, + "grad_norm": 13.612451553344727, + "learning_rate": 8.324339807100983e-06, + "loss": 4.9615, + "step": 52860 + }, + { + "epoch": 1.0755411783854167, + "grad_norm": 16.52218246459961, + "learning_rate": 8.324041253050745e-06, + "loss": 5.0083, + "step": 52865 + }, + { + "epoch": 1.0756429036458333, + "grad_norm": 14.549381256103516, + "learning_rate": 8.323742677760939e-06, + "loss": 4.7435, + "step": 52870 + }, + { + "epoch": 1.07574462890625, + "grad_norm": 17.81694793701172, + "learning_rate": 8.323444081233476e-06, + "loss": 4.8488, + "step": 52875 + }, + { + "epoch": 1.0758463541666667, + "grad_norm": 20.22174072265625, + "learning_rate": 8.323145463470257e-06, + "loss": 4.9657, + "step": 52880 + }, + { + "epoch": 1.0759480794270833, + "grad_norm": 16.11979866027832, + "learning_rate": 8.322846824473196e-06, + "loss": 4.5954, + "step": 52885 + }, + { + "epoch": 1.0760498046875, + "grad_norm": 17.523405075073242, + "learning_rate": 8.322548164244196e-06, + "loss": 4.9754, + "step": 52890 + }, + { + "epoch": 1.0761515299479167, + "grad_norm": 14.619656562805176, + "learning_rate": 8.32224948278517e-06, + "loss": 4.9044, + "step": 52895 + }, + { + "epoch": 1.0762532552083333, + "grad_norm": 16.583942413330078, + "learning_rate": 8.321950780098023e-06, + "loss": 5.1859, + "step": 52900 + }, + { + "epoch": 1.07635498046875, + "grad_norm": 17.271419525146484, + "learning_rate": 8.321652056184665e-06, + "loss": 4.9221, + "step": 52905 + }, + { + "epoch": 1.0764567057291667, + "grad_norm": 18.401010513305664, + "learning_rate": 8.321353311047005e-06, + "loss": 4.7053, + "step": 52910 + }, + { + "epoch": 1.0765584309895833, + "grad_norm": 19.337032318115234, + "learning_rate": 8.321054544686952e-06, + "loss": 5.1822, + "step": 52915 + }, + { + "epoch": 1.07666015625, + "grad_norm": 18.20515251159668, + "learning_rate": 8.320755757106414e-06, + "loss": 5.04, + "step": 52920 + }, + { + "epoch": 1.0767618815104167, + "grad_norm": 17.671384811401367, + "learning_rate": 8.320456948307302e-06, + "loss": 4.8899, + "step": 52925 + }, + { + "epoch": 1.0768636067708333, + "grad_norm": 18.760454177856445, + "learning_rate": 8.320158118291524e-06, + "loss": 4.7855, + "step": 52930 + }, + { + "epoch": 1.07696533203125, + "grad_norm": 19.703659057617188, + "learning_rate": 8.319859267060988e-06, + "loss": 5.1659, + "step": 52935 + }, + { + "epoch": 1.0770670572916667, + "grad_norm": 13.807249069213867, + "learning_rate": 8.319560394617606e-06, + "loss": 4.9315, + "step": 52940 + }, + { + "epoch": 1.0771687825520833, + "grad_norm": 14.635214805603027, + "learning_rate": 8.319261500963288e-06, + "loss": 5.1456, + "step": 52945 + }, + { + "epoch": 1.0772705078125, + "grad_norm": 13.937771797180176, + "learning_rate": 8.318962586099944e-06, + "loss": 5.148, + "step": 52950 + }, + { + "epoch": 1.0773722330729167, + "grad_norm": 14.973244667053223, + "learning_rate": 8.31866365002948e-06, + "loss": 4.9272, + "step": 52955 + }, + { + "epoch": 1.0774739583333333, + "grad_norm": 15.407692909240723, + "learning_rate": 8.318364692753809e-06, + "loss": 5.0174, + "step": 52960 + }, + { + "epoch": 1.07757568359375, + "grad_norm": 19.97748565673828, + "learning_rate": 8.318065714274841e-06, + "loss": 5.033, + "step": 52965 + }, + { + "epoch": 1.0776774088541667, + "grad_norm": 19.648006439208984, + "learning_rate": 8.317766714594486e-06, + "loss": 4.7697, + "step": 52970 + }, + { + "epoch": 1.0777791341145833, + "grad_norm": 15.894782066345215, + "learning_rate": 8.317467693714655e-06, + "loss": 4.7642, + "step": 52975 + }, + { + "epoch": 1.077880859375, + "grad_norm": 18.75507354736328, + "learning_rate": 8.31716865163726e-06, + "loss": 5.1718, + "step": 52980 + }, + { + "epoch": 1.0779825846354167, + "grad_norm": 17.788217544555664, + "learning_rate": 8.31686958836421e-06, + "loss": 5.0479, + "step": 52985 + }, + { + "epoch": 1.0780843098958333, + "grad_norm": 15.187555313110352, + "learning_rate": 8.316570503897416e-06, + "loss": 5.1609, + "step": 52990 + }, + { + "epoch": 1.07818603515625, + "grad_norm": 16.2204647064209, + "learning_rate": 8.316271398238789e-06, + "loss": 4.8332, + "step": 52995 + }, + { + "epoch": 1.0782877604166667, + "grad_norm": 13.94418716430664, + "learning_rate": 8.31597227139024e-06, + "loss": 5.6536, + "step": 53000 + }, + { + "epoch": 1.0783894856770833, + "grad_norm": 18.99245262145996, + "learning_rate": 8.315673123353683e-06, + "loss": 4.9733, + "step": 53005 + }, + { + "epoch": 1.0784912109375, + "grad_norm": 12.987677574157715, + "learning_rate": 8.315373954131024e-06, + "loss": 5.2106, + "step": 53010 + }, + { + "epoch": 1.0785929361979167, + "grad_norm": 18.33221435546875, + "learning_rate": 8.31507476372418e-06, + "loss": 4.8504, + "step": 53015 + }, + { + "epoch": 1.0786946614583333, + "grad_norm": 17.95985221862793, + "learning_rate": 8.31477555213506e-06, + "loss": 4.8773, + "step": 53020 + }, + { + "epoch": 1.07879638671875, + "grad_norm": 21.145002365112305, + "learning_rate": 8.314476319365576e-06, + "loss": 4.8575, + "step": 53025 + }, + { + "epoch": 1.0788981119791667, + "grad_norm": 18.99636459350586, + "learning_rate": 8.314177065417642e-06, + "loss": 5.2062, + "step": 53030 + }, + { + "epoch": 1.0789998372395833, + "grad_norm": 11.457725524902344, + "learning_rate": 8.313877790293167e-06, + "loss": 5.0093, + "step": 53035 + }, + { + "epoch": 1.0791015625, + "grad_norm": 14.392315864562988, + "learning_rate": 8.313578493994066e-06, + "loss": 5.0962, + "step": 53040 + }, + { + "epoch": 1.0792032877604167, + "grad_norm": 19.306245803833008, + "learning_rate": 8.31327917652225e-06, + "loss": 4.6514, + "step": 53045 + }, + { + "epoch": 1.0793050130208333, + "grad_norm": 15.875314712524414, + "learning_rate": 8.31297983787963e-06, + "loss": 5.1905, + "step": 53050 + }, + { + "epoch": 1.07940673828125, + "grad_norm": 14.887837409973145, + "learning_rate": 8.312680478068124e-06, + "loss": 5.2492, + "step": 53055 + }, + { + "epoch": 1.0795084635416667, + "grad_norm": 15.983718872070312, + "learning_rate": 8.312381097089639e-06, + "loss": 5.2486, + "step": 53060 + }, + { + "epoch": 1.0796101888020833, + "grad_norm": 15.662854194641113, + "learning_rate": 8.31208169494609e-06, + "loss": 5.0743, + "step": 53065 + }, + { + "epoch": 1.0797119140625, + "grad_norm": 16.74164390563965, + "learning_rate": 8.311782271639393e-06, + "loss": 5.0685, + "step": 53070 + }, + { + "epoch": 1.0798136393229167, + "grad_norm": 21.60484504699707, + "learning_rate": 8.311482827171455e-06, + "loss": 5.3227, + "step": 53075 + }, + { + "epoch": 1.0799153645833333, + "grad_norm": 17.730510711669922, + "learning_rate": 8.311183361544195e-06, + "loss": 4.8958, + "step": 53080 + }, + { + "epoch": 1.08001708984375, + "grad_norm": 16.585485458374023, + "learning_rate": 8.310883874759523e-06, + "loss": 4.8203, + "step": 53085 + }, + { + "epoch": 1.0801188151041667, + "grad_norm": 21.424848556518555, + "learning_rate": 8.310584366819357e-06, + "loss": 5.2946, + "step": 53090 + }, + { + "epoch": 1.0802205403645833, + "grad_norm": 15.678078651428223, + "learning_rate": 8.310284837725606e-06, + "loss": 4.8889, + "step": 53095 + }, + { + "epoch": 1.080322265625, + "grad_norm": 19.20667839050293, + "learning_rate": 8.309985287480184e-06, + "loss": 5.0251, + "step": 53100 + }, + { + "epoch": 1.0804239908854167, + "grad_norm": 19.64582633972168, + "learning_rate": 8.30968571608501e-06, + "loss": 4.9354, + "step": 53105 + }, + { + "epoch": 1.0805257161458333, + "grad_norm": 20.082674026489258, + "learning_rate": 8.309386123541994e-06, + "loss": 5.195, + "step": 53110 + }, + { + "epoch": 1.08062744140625, + "grad_norm": 20.835023880004883, + "learning_rate": 8.309086509853051e-06, + "loss": 4.8767, + "step": 53115 + }, + { + "epoch": 1.0807291666666667, + "grad_norm": 23.4842529296875, + "learning_rate": 8.308786875020094e-06, + "loss": 4.8372, + "step": 53120 + }, + { + "epoch": 1.0808308919270833, + "grad_norm": 21.64080047607422, + "learning_rate": 8.30848721904504e-06, + "loss": 5.1325, + "step": 53125 + }, + { + "epoch": 1.0809326171875, + "grad_norm": 19.77172088623047, + "learning_rate": 8.308187541929805e-06, + "loss": 4.806, + "step": 53130 + }, + { + "epoch": 1.0810343424479167, + "grad_norm": 15.947271347045898, + "learning_rate": 8.3078878436763e-06, + "loss": 4.968, + "step": 53135 + }, + { + "epoch": 1.0811360677083333, + "grad_norm": 17.146032333374023, + "learning_rate": 8.30758812428644e-06, + "loss": 4.9036, + "step": 53140 + }, + { + "epoch": 1.08123779296875, + "grad_norm": 16.401954650878906, + "learning_rate": 8.307288383762144e-06, + "loss": 4.8683, + "step": 53145 + }, + { + "epoch": 1.0813395182291667, + "grad_norm": 19.766860961914062, + "learning_rate": 8.306988622105326e-06, + "loss": 4.9595, + "step": 53150 + }, + { + "epoch": 1.0814412434895833, + "grad_norm": 17.690282821655273, + "learning_rate": 8.306688839317899e-06, + "loss": 4.9146, + "step": 53155 + }, + { + "epoch": 1.08154296875, + "grad_norm": 20.192119598388672, + "learning_rate": 8.30638903540178e-06, + "loss": 5.1055, + "step": 53160 + }, + { + "epoch": 1.0816446940104167, + "grad_norm": 19.9058837890625, + "learning_rate": 8.306089210358886e-06, + "loss": 4.7931, + "step": 53165 + }, + { + "epoch": 1.0817464192708333, + "grad_norm": 14.389153480529785, + "learning_rate": 8.30578936419113e-06, + "loss": 4.8904, + "step": 53170 + }, + { + "epoch": 1.08184814453125, + "grad_norm": 18.456701278686523, + "learning_rate": 8.30548949690043e-06, + "loss": 4.9743, + "step": 53175 + }, + { + "epoch": 1.0819498697916667, + "grad_norm": 16.18947410583496, + "learning_rate": 8.3051896084887e-06, + "loss": 4.9747, + "step": 53180 + }, + { + "epoch": 1.0820515950520833, + "grad_norm": 18.42300796508789, + "learning_rate": 8.304889698957858e-06, + "loss": 4.7647, + "step": 53185 + }, + { + "epoch": 1.0821533203125, + "grad_norm": 15.903430938720703, + "learning_rate": 8.304589768309822e-06, + "loss": 5.2307, + "step": 53190 + }, + { + "epoch": 1.0822550455729167, + "grad_norm": 19.48758316040039, + "learning_rate": 8.304289816546505e-06, + "loss": 4.8314, + "step": 53195 + }, + { + "epoch": 1.0823567708333333, + "grad_norm": 18.315113067626953, + "learning_rate": 8.303989843669825e-06, + "loss": 5.0954, + "step": 53200 + }, + { + "epoch": 1.08245849609375, + "grad_norm": 16.006587982177734, + "learning_rate": 8.3036898496817e-06, + "loss": 5.0103, + "step": 53205 + }, + { + "epoch": 1.0825602213541667, + "grad_norm": 19.79611587524414, + "learning_rate": 8.303389834584044e-06, + "loss": 5.0594, + "step": 53210 + }, + { + "epoch": 1.0826619466145833, + "grad_norm": 16.99493980407715, + "learning_rate": 8.303089798378775e-06, + "loss": 4.9818, + "step": 53215 + }, + { + "epoch": 1.082763671875, + "grad_norm": 16.744998931884766, + "learning_rate": 8.302789741067812e-06, + "loss": 5.1297, + "step": 53220 + }, + { + "epoch": 1.0828653971354167, + "grad_norm": 23.69385528564453, + "learning_rate": 8.302489662653072e-06, + "loss": 5.079, + "step": 53225 + }, + { + "epoch": 1.0829671223958333, + "grad_norm": 16.914888381958008, + "learning_rate": 8.302189563136468e-06, + "loss": 4.8111, + "step": 53230 + }, + { + "epoch": 1.08306884765625, + "grad_norm": 19.667724609375, + "learning_rate": 8.301889442519925e-06, + "loss": 5.0323, + "step": 53235 + }, + { + "epoch": 1.0831705729166667, + "grad_norm": 18.239974975585938, + "learning_rate": 8.301589300805354e-06, + "loss": 5.0198, + "step": 53240 + }, + { + "epoch": 1.0832722981770833, + "grad_norm": 19.357284545898438, + "learning_rate": 8.301289137994677e-06, + "loss": 5.0419, + "step": 53245 + }, + { + "epoch": 1.0833740234375, + "grad_norm": 19.81806755065918, + "learning_rate": 8.300988954089809e-06, + "loss": 4.8719, + "step": 53250 + }, + { + "epoch": 1.0834757486979167, + "grad_norm": 23.556943893432617, + "learning_rate": 8.30068874909267e-06, + "loss": 5.0252, + "step": 53255 + }, + { + "epoch": 1.0835774739583333, + "grad_norm": 15.988277435302734, + "learning_rate": 8.300388523005179e-06, + "loss": 5.0855, + "step": 53260 + }, + { + "epoch": 1.08367919921875, + "grad_norm": 15.707179069519043, + "learning_rate": 8.30008827582925e-06, + "loss": 4.8814, + "step": 53265 + }, + { + "epoch": 1.0837809244791667, + "grad_norm": 17.796672821044922, + "learning_rate": 8.299788007566806e-06, + "loss": 5.1297, + "step": 53270 + }, + { + "epoch": 1.0838826497395833, + "grad_norm": 16.931081771850586, + "learning_rate": 8.299487718219765e-06, + "loss": 5.1348, + "step": 53275 + }, + { + "epoch": 1.083984375, + "grad_norm": 14.984935760498047, + "learning_rate": 8.299187407790044e-06, + "loss": 5.0803, + "step": 53280 + }, + { + "epoch": 1.0840861002604167, + "grad_norm": 18.36372184753418, + "learning_rate": 8.298887076279563e-06, + "loss": 4.8057, + "step": 53285 + }, + { + "epoch": 1.0841878255208333, + "grad_norm": 19.321393966674805, + "learning_rate": 8.298586723690241e-06, + "loss": 4.7402, + "step": 53290 + }, + { + "epoch": 1.08428955078125, + "grad_norm": 15.338457107543945, + "learning_rate": 8.298286350023997e-06, + "loss": 4.8795, + "step": 53295 + }, + { + "epoch": 1.0843912760416667, + "grad_norm": 17.263784408569336, + "learning_rate": 8.29798595528275e-06, + "loss": 5.0356, + "step": 53300 + }, + { + "epoch": 1.0844930013020833, + "grad_norm": 16.861875534057617, + "learning_rate": 8.297685539468419e-06, + "loss": 5.0114, + "step": 53305 + }, + { + "epoch": 1.0845947265625, + "grad_norm": 16.46442985534668, + "learning_rate": 8.297385102582924e-06, + "loss": 4.8757, + "step": 53310 + }, + { + "epoch": 1.0846964518229167, + "grad_norm": 16.805015563964844, + "learning_rate": 8.297084644628187e-06, + "loss": 4.9566, + "step": 53315 + }, + { + "epoch": 1.0847981770833333, + "grad_norm": 14.255967140197754, + "learning_rate": 8.296784165606125e-06, + "loss": 4.9141, + "step": 53320 + }, + { + "epoch": 1.08489990234375, + "grad_norm": 20.179847717285156, + "learning_rate": 8.296483665518658e-06, + "loss": 4.9562, + "step": 53325 + }, + { + "epoch": 1.0850016276041667, + "grad_norm": 18.48834800720215, + "learning_rate": 8.296183144367706e-06, + "loss": 4.8116, + "step": 53330 + }, + { + "epoch": 1.0851033528645833, + "grad_norm": 14.853571891784668, + "learning_rate": 8.295882602155192e-06, + "loss": 5.1564, + "step": 53335 + }, + { + "epoch": 1.085205078125, + "grad_norm": 18.35362434387207, + "learning_rate": 8.295582038883034e-06, + "loss": 5.0267, + "step": 53340 + }, + { + "epoch": 1.0853068033854167, + "grad_norm": 16.535011291503906, + "learning_rate": 8.295281454553153e-06, + "loss": 4.8416, + "step": 53345 + }, + { + "epoch": 1.0854085286458333, + "grad_norm": 15.909886360168457, + "learning_rate": 8.294980849167468e-06, + "loss": 5.2676, + "step": 53350 + }, + { + "epoch": 1.08551025390625, + "grad_norm": 16.004669189453125, + "learning_rate": 8.294680222727902e-06, + "loss": 4.931, + "step": 53355 + }, + { + "epoch": 1.0856119791666667, + "grad_norm": 15.594894409179688, + "learning_rate": 8.294379575236375e-06, + "loss": 4.9473, + "step": 53360 + }, + { + "epoch": 1.0857137044270833, + "grad_norm": 14.767251968383789, + "learning_rate": 8.29407890669481e-06, + "loss": 4.7305, + "step": 53365 + }, + { + "epoch": 1.0858154296875, + "grad_norm": 16.63031578063965, + "learning_rate": 8.293778217105125e-06, + "loss": 4.9292, + "step": 53370 + }, + { + "epoch": 1.0859171549479167, + "grad_norm": 22.189701080322266, + "learning_rate": 8.293477506469244e-06, + "loss": 5.0809, + "step": 53375 + }, + { + "epoch": 1.0860188802083333, + "grad_norm": 18.729854583740234, + "learning_rate": 8.293176774789087e-06, + "loss": 4.7337, + "step": 53380 + }, + { + "epoch": 1.08612060546875, + "grad_norm": 20.23584747314453, + "learning_rate": 8.292876022066575e-06, + "loss": 4.8663, + "step": 53385 + }, + { + "epoch": 1.0862223307291667, + "grad_norm": 20.492414474487305, + "learning_rate": 8.29257524830363e-06, + "loss": 5.0388, + "step": 53390 + }, + { + "epoch": 1.0863240559895833, + "grad_norm": 18.338043212890625, + "learning_rate": 8.292274453502176e-06, + "loss": 4.8099, + "step": 53395 + }, + { + "epoch": 1.08642578125, + "grad_norm": 16.007299423217773, + "learning_rate": 8.291973637664132e-06, + "loss": 4.9034, + "step": 53400 + }, + { + "epoch": 1.0865275065104167, + "grad_norm": 19.652843475341797, + "learning_rate": 8.291672800791422e-06, + "loss": 4.826, + "step": 53405 + }, + { + "epoch": 1.0866292317708333, + "grad_norm": 17.227582931518555, + "learning_rate": 8.291371942885968e-06, + "loss": 4.9581, + "step": 53410 + }, + { + "epoch": 1.08673095703125, + "grad_norm": 16.63295555114746, + "learning_rate": 8.291071063949692e-06, + "loss": 5.0534, + "step": 53415 + }, + { + "epoch": 1.0868326822916667, + "grad_norm": 16.50734519958496, + "learning_rate": 8.290770163984516e-06, + "loss": 4.8556, + "step": 53420 + }, + { + "epoch": 1.0869344075520833, + "grad_norm": 16.98623275756836, + "learning_rate": 8.290469242992363e-06, + "loss": 5.0598, + "step": 53425 + }, + { + "epoch": 1.0870361328125, + "grad_norm": 21.34958267211914, + "learning_rate": 8.290168300975156e-06, + "loss": 4.9713, + "step": 53430 + }, + { + "epoch": 1.0871378580729167, + "grad_norm": 16.300270080566406, + "learning_rate": 8.289867337934819e-06, + "loss": 5.5213, + "step": 53435 + }, + { + "epoch": 1.0872395833333333, + "grad_norm": 16.004480361938477, + "learning_rate": 8.289566353873275e-06, + "loss": 4.8901, + "step": 53440 + }, + { + "epoch": 1.08734130859375, + "grad_norm": 19.80897331237793, + "learning_rate": 8.289265348792446e-06, + "loss": 4.6974, + "step": 53445 + }, + { + "epoch": 1.0874430338541667, + "grad_norm": 16.510602951049805, + "learning_rate": 8.288964322694253e-06, + "loss": 4.8802, + "step": 53450 + }, + { + "epoch": 1.0875447591145833, + "grad_norm": 19.36961555480957, + "learning_rate": 8.288663275580624e-06, + "loss": 4.7481, + "step": 53455 + }, + { + "epoch": 1.087646484375, + "grad_norm": 18.911117553710938, + "learning_rate": 8.28836220745348e-06, + "loss": 5.013, + "step": 53460 + }, + { + "epoch": 1.0877482096354167, + "grad_norm": 17.614639282226562, + "learning_rate": 8.288061118314746e-06, + "loss": 4.8026, + "step": 53465 + }, + { + "epoch": 1.0878499348958333, + "grad_norm": 12.812925338745117, + "learning_rate": 8.287760008166344e-06, + "loss": 5.0787, + "step": 53470 + }, + { + "epoch": 1.08795166015625, + "grad_norm": 22.636693954467773, + "learning_rate": 8.2874588770102e-06, + "loss": 5.1183, + "step": 53475 + }, + { + "epoch": 1.0880533854166667, + "grad_norm": 16.311527252197266, + "learning_rate": 8.287157724848238e-06, + "loss": 5.1448, + "step": 53480 + }, + { + "epoch": 1.0881551106770833, + "grad_norm": 17.58592414855957, + "learning_rate": 8.28685655168238e-06, + "loss": 5.13, + "step": 53485 + }, + { + "epoch": 1.0882568359375, + "grad_norm": 21.899137496948242, + "learning_rate": 8.286555357514555e-06, + "loss": 5.3904, + "step": 53490 + }, + { + "epoch": 1.0883585611979167, + "grad_norm": 17.000524520874023, + "learning_rate": 8.286254142346682e-06, + "loss": 5.0993, + "step": 53495 + }, + { + "epoch": 1.0884602864583333, + "grad_norm": 18.52937889099121, + "learning_rate": 8.285952906180689e-06, + "loss": 4.911, + "step": 53500 + }, + { + "epoch": 1.08856201171875, + "grad_norm": 17.295387268066406, + "learning_rate": 8.2856516490185e-06, + "loss": 4.7504, + "step": 53505 + }, + { + "epoch": 1.0886637369791667, + "grad_norm": 16.242956161499023, + "learning_rate": 8.28535037086204e-06, + "loss": 4.9518, + "step": 53510 + }, + { + "epoch": 1.0887654622395833, + "grad_norm": 21.913867950439453, + "learning_rate": 8.285049071713233e-06, + "loss": 4.8541, + "step": 53515 + }, + { + "epoch": 1.0888671875, + "grad_norm": 25.436405181884766, + "learning_rate": 8.284747751574007e-06, + "loss": 5.0643, + "step": 53520 + }, + { + "epoch": 1.0889689127604167, + "grad_norm": 22.258363723754883, + "learning_rate": 8.284446410446283e-06, + "loss": 5.0868, + "step": 53525 + }, + { + "epoch": 1.0890706380208333, + "grad_norm": 16.534757614135742, + "learning_rate": 8.284145048331994e-06, + "loss": 4.9357, + "step": 53530 + }, + { + "epoch": 1.08917236328125, + "grad_norm": 17.79094696044922, + "learning_rate": 8.283843665233056e-06, + "loss": 5.1118, + "step": 53535 + }, + { + "epoch": 1.0892740885416667, + "grad_norm": 15.984225273132324, + "learning_rate": 8.2835422611514e-06, + "loss": 4.96, + "step": 53540 + }, + { + "epoch": 1.0893758138020833, + "grad_norm": 16.95253562927246, + "learning_rate": 8.283240836088953e-06, + "loss": 4.8519, + "step": 53545 + }, + { + "epoch": 1.0894775390625, + "grad_norm": 14.907760620117188, + "learning_rate": 8.282939390047639e-06, + "loss": 4.979, + "step": 53550 + }, + { + "epoch": 1.0895792643229167, + "grad_norm": 19.524267196655273, + "learning_rate": 8.282637923029382e-06, + "loss": 5.0422, + "step": 53555 + }, + { + "epoch": 1.0896809895833333, + "grad_norm": 23.19914436340332, + "learning_rate": 8.282336435036114e-06, + "loss": 5.0863, + "step": 53560 + }, + { + "epoch": 1.08978271484375, + "grad_norm": 18.368547439575195, + "learning_rate": 8.282034926069757e-06, + "loss": 4.736, + "step": 53565 + }, + { + "epoch": 1.0898844401041667, + "grad_norm": 17.26194953918457, + "learning_rate": 8.281733396132238e-06, + "loss": 5.0342, + "step": 53570 + }, + { + "epoch": 1.0899861653645833, + "grad_norm": 20.376127243041992, + "learning_rate": 8.281431845225485e-06, + "loss": 4.878, + "step": 53575 + }, + { + "epoch": 1.090087890625, + "grad_norm": 29.111730575561523, + "learning_rate": 8.281130273351423e-06, + "loss": 4.978, + "step": 53580 + }, + { + "epoch": 1.0901896158854167, + "grad_norm": 17.813249588012695, + "learning_rate": 8.280828680511983e-06, + "loss": 4.6993, + "step": 53585 + }, + { + "epoch": 1.0902913411458333, + "grad_norm": 28.02440643310547, + "learning_rate": 8.280527066709085e-06, + "loss": 5.3551, + "step": 53590 + }, + { + "epoch": 1.09039306640625, + "grad_norm": 17.13641357421875, + "learning_rate": 8.280225431944664e-06, + "loss": 4.932, + "step": 53595 + }, + { + "epoch": 1.0904947916666667, + "grad_norm": 17.38331413269043, + "learning_rate": 8.279923776220642e-06, + "loss": 4.916, + "step": 53600 + }, + { + "epoch": 1.0905965169270833, + "grad_norm": 18.146526336669922, + "learning_rate": 8.27962209953895e-06, + "loss": 5.1686, + "step": 53605 + }, + { + "epoch": 1.0906982421875, + "grad_norm": 19.049907684326172, + "learning_rate": 8.279320401901511e-06, + "loss": 5.2385, + "step": 53610 + }, + { + "epoch": 1.0907999674479167, + "grad_norm": 15.697101593017578, + "learning_rate": 8.279018683310258e-06, + "loss": 5.128, + "step": 53615 + }, + { + "epoch": 1.0909016927083333, + "grad_norm": 17.81991195678711, + "learning_rate": 8.278716943767115e-06, + "loss": 4.9774, + "step": 53620 + }, + { + "epoch": 1.09100341796875, + "grad_norm": 15.736581802368164, + "learning_rate": 8.278415183274013e-06, + "loss": 4.6036, + "step": 53625 + }, + { + "epoch": 1.0911051432291667, + "grad_norm": 18.90018653869629, + "learning_rate": 8.278113401832878e-06, + "loss": 4.7852, + "step": 53630 + }, + { + "epoch": 1.0912068684895833, + "grad_norm": 20.43570899963379, + "learning_rate": 8.27781159944564e-06, + "loss": 5.0833, + "step": 53635 + }, + { + "epoch": 1.09130859375, + "grad_norm": 16.020671844482422, + "learning_rate": 8.277509776114226e-06, + "loss": 4.8067, + "step": 53640 + }, + { + "epoch": 1.0914103190104167, + "grad_norm": 14.210065841674805, + "learning_rate": 8.277207931840563e-06, + "loss": 4.9175, + "step": 53645 + }, + { + "epoch": 1.0915120442708333, + "grad_norm": 19.50562286376953, + "learning_rate": 8.276906066626583e-06, + "loss": 4.9373, + "step": 53650 + }, + { + "epoch": 1.09161376953125, + "grad_norm": 19.37808609008789, + "learning_rate": 8.276604180474214e-06, + "loss": 5.1673, + "step": 53655 + }, + { + "epoch": 1.0917154947916667, + "grad_norm": 19.93374252319336, + "learning_rate": 8.276302273385384e-06, + "loss": 5.3407, + "step": 53660 + }, + { + "epoch": 1.0918172200520833, + "grad_norm": 16.00514793395996, + "learning_rate": 8.276000345362023e-06, + "loss": 4.7639, + "step": 53665 + }, + { + "epoch": 1.0919189453125, + "grad_norm": 16.94774055480957, + "learning_rate": 8.275698396406058e-06, + "loss": 4.9379, + "step": 53670 + }, + { + "epoch": 1.0920206705729167, + "grad_norm": 16.229597091674805, + "learning_rate": 8.27539642651942e-06, + "loss": 5.0831, + "step": 53675 + }, + { + "epoch": 1.0921223958333333, + "grad_norm": 19.04464340209961, + "learning_rate": 8.275094435704041e-06, + "loss": 5.0043, + "step": 53680 + }, + { + "epoch": 1.09222412109375, + "grad_norm": 17.846147537231445, + "learning_rate": 8.274792423961846e-06, + "loss": 4.7366, + "step": 53685 + }, + { + "epoch": 1.0923258463541667, + "grad_norm": 18.901290893554688, + "learning_rate": 8.274490391294769e-06, + "loss": 5.0408, + "step": 53690 + }, + { + "epoch": 1.0924275716145833, + "grad_norm": 17.47467613220215, + "learning_rate": 8.274188337704736e-06, + "loss": 4.9911, + "step": 53695 + }, + { + "epoch": 1.092529296875, + "grad_norm": 14.411445617675781, + "learning_rate": 8.27388626319368e-06, + "loss": 4.998, + "step": 53700 + }, + { + "epoch": 1.0926310221354167, + "grad_norm": 18.01923179626465, + "learning_rate": 8.273584167763531e-06, + "loss": 4.8511, + "step": 53705 + }, + { + "epoch": 1.0927327473958333, + "grad_norm": 13.688859939575195, + "learning_rate": 8.273282051416216e-06, + "loss": 4.8556, + "step": 53710 + }, + { + "epoch": 1.09283447265625, + "grad_norm": 14.925070762634277, + "learning_rate": 8.272979914153669e-06, + "loss": 4.9246, + "step": 53715 + }, + { + "epoch": 1.0929361979166667, + "grad_norm": 13.478147506713867, + "learning_rate": 8.272677755977819e-06, + "loss": 4.9926, + "step": 53720 + }, + { + "epoch": 1.0930379231770833, + "grad_norm": 22.84379768371582, + "learning_rate": 8.272375576890599e-06, + "loss": 4.9602, + "step": 53725 + }, + { + "epoch": 1.0931396484375, + "grad_norm": 14.847111701965332, + "learning_rate": 8.272073376893935e-06, + "loss": 4.9551, + "step": 53730 + }, + { + "epoch": 1.0932413736979167, + "grad_norm": 17.138704299926758, + "learning_rate": 8.271771155989762e-06, + "loss": 5.1046, + "step": 53735 + }, + { + "epoch": 1.0933430989583333, + "grad_norm": 16.817224502563477, + "learning_rate": 8.27146891418001e-06, + "loss": 5.1301, + "step": 53740 + }, + { + "epoch": 1.09344482421875, + "grad_norm": 15.838083267211914, + "learning_rate": 8.27116665146661e-06, + "loss": 4.7701, + "step": 53745 + }, + { + "epoch": 1.0935465494791667, + "grad_norm": 14.602056503295898, + "learning_rate": 8.270864367851492e-06, + "loss": 4.9287, + "step": 53750 + }, + { + "epoch": 1.0936482747395833, + "grad_norm": 16.56187629699707, + "learning_rate": 8.270562063336592e-06, + "loss": 5.2848, + "step": 53755 + }, + { + "epoch": 1.09375, + "grad_norm": 13.32974910736084, + "learning_rate": 8.270259737923835e-06, + "loss": 5.0297, + "step": 53760 + }, + { + "epoch": 1.0938517252604167, + "grad_norm": 17.08775520324707, + "learning_rate": 8.269957391615159e-06, + "loss": 5.0172, + "step": 53765 + }, + { + "epoch": 1.0939534505208333, + "grad_norm": 18.559473037719727, + "learning_rate": 8.269655024412496e-06, + "loss": 5.1567, + "step": 53770 + }, + { + "epoch": 1.09405517578125, + "grad_norm": 16.18183708190918, + "learning_rate": 8.26935263631777e-06, + "loss": 5.2639, + "step": 53775 + }, + { + "epoch": 1.0941569010416667, + "grad_norm": 16.00288200378418, + "learning_rate": 8.269050227332921e-06, + "loss": 4.73, + "step": 53780 + }, + { + "epoch": 1.0942586263020833, + "grad_norm": 17.9433536529541, + "learning_rate": 8.268747797459878e-06, + "loss": 5.0221, + "step": 53785 + }, + { + "epoch": 1.0943603515625, + "grad_norm": 18.271360397338867, + "learning_rate": 8.268445346700575e-06, + "loss": 5.2323, + "step": 53790 + }, + { + "epoch": 1.0944620768229167, + "grad_norm": 15.061139106750488, + "learning_rate": 8.268142875056944e-06, + "loss": 5.077, + "step": 53795 + }, + { + "epoch": 1.0945638020833333, + "grad_norm": 19.18107032775879, + "learning_rate": 8.267840382530916e-06, + "loss": 4.7714, + "step": 53800 + }, + { + "epoch": 1.09466552734375, + "grad_norm": 21.123903274536133, + "learning_rate": 8.267537869124427e-06, + "loss": 4.908, + "step": 53805 + }, + { + "epoch": 1.0947672526041667, + "grad_norm": 19.62960433959961, + "learning_rate": 8.267235334839407e-06, + "loss": 5.0487, + "step": 53810 + }, + { + "epoch": 1.0948689778645833, + "grad_norm": 20.116792678833008, + "learning_rate": 8.26693277967779e-06, + "loss": 4.9685, + "step": 53815 + }, + { + "epoch": 1.094970703125, + "grad_norm": 18.1890926361084, + "learning_rate": 8.26663020364151e-06, + "loss": 5.0147, + "step": 53820 + }, + { + "epoch": 1.0950724283854167, + "grad_norm": 17.744836807250977, + "learning_rate": 8.2663276067325e-06, + "loss": 4.607, + "step": 53825 + }, + { + "epoch": 1.0951741536458333, + "grad_norm": 18.616300582885742, + "learning_rate": 8.266024988952695e-06, + "loss": 4.6066, + "step": 53830 + }, + { + "epoch": 1.09527587890625, + "grad_norm": 17.78786849975586, + "learning_rate": 8.265722350304026e-06, + "loss": 4.8139, + "step": 53835 + }, + { + "epoch": 1.0953776041666667, + "grad_norm": 22.5281925201416, + "learning_rate": 8.265419690788427e-06, + "loss": 4.8491, + "step": 53840 + }, + { + "epoch": 1.0954793294270833, + "grad_norm": 22.727346420288086, + "learning_rate": 8.265117010407836e-06, + "loss": 5.0359, + "step": 53845 + }, + { + "epoch": 1.0955810546875, + "grad_norm": 16.61709976196289, + "learning_rate": 8.264814309164179e-06, + "loss": 4.9438, + "step": 53850 + }, + { + "epoch": 1.0956827799479167, + "grad_norm": 20.405044555664062, + "learning_rate": 8.264511587059398e-06, + "loss": 5.057, + "step": 53855 + }, + { + "epoch": 1.0957845052083333, + "grad_norm": 15.564959526062012, + "learning_rate": 8.264208844095423e-06, + "loss": 4.9175, + "step": 53860 + }, + { + "epoch": 1.09588623046875, + "grad_norm": 26.048847198486328, + "learning_rate": 8.26390608027419e-06, + "loss": 4.9797, + "step": 53865 + }, + { + "epoch": 1.0959879557291667, + "grad_norm": 18.902542114257812, + "learning_rate": 8.263603295597634e-06, + "loss": 5.1209, + "step": 53870 + }, + { + "epoch": 1.0960896809895833, + "grad_norm": 28.075817108154297, + "learning_rate": 8.263300490067689e-06, + "loss": 5.0112, + "step": 53875 + }, + { + "epoch": 1.09619140625, + "grad_norm": 20.482065200805664, + "learning_rate": 8.262997663686288e-06, + "loss": 5.026, + "step": 53880 + }, + { + "epoch": 1.0962931315104167, + "grad_norm": 20.858327865600586, + "learning_rate": 8.26269481645537e-06, + "loss": 4.8476, + "step": 53885 + }, + { + "epoch": 1.0963948567708333, + "grad_norm": 15.705907821655273, + "learning_rate": 8.262391948376866e-06, + "loss": 5.1538, + "step": 53890 + }, + { + "epoch": 1.09649658203125, + "grad_norm": 13.167104721069336, + "learning_rate": 8.262089059452714e-06, + "loss": 4.5714, + "step": 53895 + }, + { + "epoch": 1.0965983072916667, + "grad_norm": 13.836445808410645, + "learning_rate": 8.261786149684848e-06, + "loss": 5.0075, + "step": 53900 + }, + { + "epoch": 1.0967000325520833, + "grad_norm": 16.455902099609375, + "learning_rate": 8.261483219075204e-06, + "loss": 5.0435, + "step": 53905 + }, + { + "epoch": 1.0968017578125, + "grad_norm": 15.186219215393066, + "learning_rate": 8.26118026762572e-06, + "loss": 5.1172, + "step": 53910 + }, + { + "epoch": 1.0969034830729167, + "grad_norm": 17.12296485900879, + "learning_rate": 8.260877295338325e-06, + "loss": 4.7182, + "step": 53915 + }, + { + "epoch": 1.0970052083333333, + "grad_norm": 16.833534240722656, + "learning_rate": 8.260574302214963e-06, + "loss": 4.7694, + "step": 53920 + }, + { + "epoch": 1.09710693359375, + "grad_norm": 18.261112213134766, + "learning_rate": 8.260271288257564e-06, + "loss": 5.0702, + "step": 53925 + }, + { + "epoch": 1.0972086588541667, + "grad_norm": 26.912799835205078, + "learning_rate": 8.259968253468067e-06, + "loss": 5.166, + "step": 53930 + }, + { + "epoch": 1.0973103841145833, + "grad_norm": 15.912737846374512, + "learning_rate": 8.259665197848408e-06, + "loss": 5.1589, + "step": 53935 + }, + { + "epoch": 1.097412109375, + "grad_norm": 16.899185180664062, + "learning_rate": 8.259362121400523e-06, + "loss": 5.1327, + "step": 53940 + }, + { + "epoch": 1.0975138346354167, + "grad_norm": 17.99809455871582, + "learning_rate": 8.259059024126348e-06, + "loss": 5.1346, + "step": 53945 + }, + { + "epoch": 1.0976155598958333, + "grad_norm": 18.85715675354004, + "learning_rate": 8.258755906027823e-06, + "loss": 4.852, + "step": 53950 + }, + { + "epoch": 1.09771728515625, + "grad_norm": 13.681702613830566, + "learning_rate": 8.258452767106879e-06, + "loss": 4.9824, + "step": 53955 + }, + { + "epoch": 1.0978190104166667, + "grad_norm": 16.04336166381836, + "learning_rate": 8.258149607365458e-06, + "loss": 5.0181, + "step": 53960 + }, + { + "epoch": 1.0979207356770833, + "grad_norm": 19.323436737060547, + "learning_rate": 8.257846426805494e-06, + "loss": 4.8183, + "step": 53965 + }, + { + "epoch": 1.0980224609375, + "grad_norm": 18.41128158569336, + "learning_rate": 8.257543225428926e-06, + "loss": 4.7514, + "step": 53970 + }, + { + "epoch": 1.0981241861979167, + "grad_norm": 16.518003463745117, + "learning_rate": 8.257240003237691e-06, + "loss": 4.8236, + "step": 53975 + }, + { + "epoch": 1.0982259114583333, + "grad_norm": 23.82954978942871, + "learning_rate": 8.256936760233728e-06, + "loss": 5.5269, + "step": 53980 + }, + { + "epoch": 1.09832763671875, + "grad_norm": 12.234626770019531, + "learning_rate": 8.25663349641897e-06, + "loss": 4.8597, + "step": 53985 + }, + { + "epoch": 1.0984293619791667, + "grad_norm": 18.44965934753418, + "learning_rate": 8.25633021179536e-06, + "loss": 4.8758, + "step": 53990 + }, + { + "epoch": 1.0985310872395833, + "grad_norm": 19.2401123046875, + "learning_rate": 8.256026906364832e-06, + "loss": 5.2635, + "step": 53995 + }, + { + "epoch": 1.0986328125, + "grad_norm": 17.128080368041992, + "learning_rate": 8.255723580129328e-06, + "loss": 5.1625, + "step": 54000 + }, + { + "epoch": 1.0987345377604167, + "grad_norm": 19.87021827697754, + "learning_rate": 8.25542023309078e-06, + "loss": 4.6514, + "step": 54005 + }, + { + "epoch": 1.0988362630208333, + "grad_norm": 20.310544967651367, + "learning_rate": 8.255116865251134e-06, + "loss": 4.9605, + "step": 54010 + }, + { + "epoch": 1.09893798828125, + "grad_norm": 22.56101417541504, + "learning_rate": 8.254813476612322e-06, + "loss": 4.8053, + "step": 54015 + }, + { + "epoch": 1.0990397135416667, + "grad_norm": 12.938146591186523, + "learning_rate": 8.254510067176285e-06, + "loss": 5.2466, + "step": 54020 + }, + { + "epoch": 1.0991414388020833, + "grad_norm": 15.276867866516113, + "learning_rate": 8.254206636944964e-06, + "loss": 5.0127, + "step": 54025 + }, + { + "epoch": 1.0992431640625, + "grad_norm": 14.79013442993164, + "learning_rate": 8.253903185920294e-06, + "loss": 4.9945, + "step": 54030 + }, + { + "epoch": 1.0993448893229167, + "grad_norm": 17.590576171875, + "learning_rate": 8.253599714104217e-06, + "loss": 4.9252, + "step": 54035 + }, + { + "epoch": 1.0994466145833333, + "grad_norm": 14.33618450164795, + "learning_rate": 8.25329622149867e-06, + "loss": 5.0812, + "step": 54040 + }, + { + "epoch": 1.09954833984375, + "grad_norm": 17.88554573059082, + "learning_rate": 8.252992708105592e-06, + "loss": 4.8019, + "step": 54045 + }, + { + "epoch": 1.0996500651041667, + "grad_norm": 11.98546314239502, + "learning_rate": 8.252689173926924e-06, + "loss": 4.8931, + "step": 54050 + }, + { + "epoch": 1.0997517903645833, + "grad_norm": 15.303669929504395, + "learning_rate": 8.252385618964605e-06, + "loss": 4.7572, + "step": 54055 + }, + { + "epoch": 1.099853515625, + "grad_norm": 18.582796096801758, + "learning_rate": 8.252082043220574e-06, + "loss": 5.1077, + "step": 54060 + }, + { + "epoch": 1.0999552408854167, + "grad_norm": 25.221181869506836, + "learning_rate": 8.25177844669677e-06, + "loss": 4.8791, + "step": 54065 + }, + { + "epoch": 1.1000569661458333, + "grad_norm": 18.74992561340332, + "learning_rate": 8.251474829395136e-06, + "loss": 4.8131, + "step": 54070 + }, + { + "epoch": 1.10015869140625, + "grad_norm": 18.597270965576172, + "learning_rate": 8.251171191317608e-06, + "loss": 5.2506, + "step": 54075 + }, + { + "epoch": 1.1002604166666667, + "grad_norm": 17.39882469177246, + "learning_rate": 8.25086753246613e-06, + "loss": 4.658, + "step": 54080 + }, + { + "epoch": 1.1003621419270833, + "grad_norm": 15.735751152038574, + "learning_rate": 8.250563852842639e-06, + "loss": 4.869, + "step": 54085 + }, + { + "epoch": 1.1004638671875, + "grad_norm": 25.11513328552246, + "learning_rate": 8.250260152449078e-06, + "loss": 5.1308, + "step": 54090 + }, + { + "epoch": 1.1005655924479167, + "grad_norm": 15.594578742980957, + "learning_rate": 8.249956431287385e-06, + "loss": 5.0356, + "step": 54095 + }, + { + "epoch": 1.1006673177083333, + "grad_norm": 15.469430923461914, + "learning_rate": 8.249652689359503e-06, + "loss": 4.9298, + "step": 54100 + }, + { + "epoch": 1.10076904296875, + "grad_norm": 20.137706756591797, + "learning_rate": 8.249348926667372e-06, + "loss": 4.7861, + "step": 54105 + }, + { + "epoch": 1.1008707682291667, + "grad_norm": 17.116241455078125, + "learning_rate": 8.249045143212932e-06, + "loss": 4.9743, + "step": 54110 + }, + { + "epoch": 1.1009724934895833, + "grad_norm": 23.454769134521484, + "learning_rate": 8.248741338998127e-06, + "loss": 5.0095, + "step": 54115 + }, + { + "epoch": 1.10107421875, + "grad_norm": 19.664350509643555, + "learning_rate": 8.248437514024893e-06, + "loss": 4.9858, + "step": 54120 + }, + { + "epoch": 1.1011759440104167, + "grad_norm": 14.687652587890625, + "learning_rate": 8.248133668295177e-06, + "loss": 4.8978, + "step": 54125 + }, + { + "epoch": 1.1012776692708333, + "grad_norm": 18.54550552368164, + "learning_rate": 8.247829801810919e-06, + "loss": 4.5939, + "step": 54130 + }, + { + "epoch": 1.10137939453125, + "grad_norm": 17.64936637878418, + "learning_rate": 8.247525914574055e-06, + "loss": 4.7791, + "step": 54135 + }, + { + "epoch": 1.1014811197916667, + "grad_norm": 16.314659118652344, + "learning_rate": 8.247222006586535e-06, + "loss": 4.998, + "step": 54140 + }, + { + "epoch": 1.1015828450520833, + "grad_norm": 15.862951278686523, + "learning_rate": 8.246918077850297e-06, + "loss": 5.2088, + "step": 54145 + }, + { + "epoch": 1.1016845703125, + "grad_norm": 16.676198959350586, + "learning_rate": 8.24661412836728e-06, + "loss": 5.1781, + "step": 54150 + }, + { + "epoch": 1.1017862955729167, + "grad_norm": 17.251436233520508, + "learning_rate": 8.246310158139432e-06, + "loss": 4.7913, + "step": 54155 + }, + { + "epoch": 1.1018880208333333, + "grad_norm": 19.926122665405273, + "learning_rate": 8.246006167168691e-06, + "loss": 4.8278, + "step": 54160 + }, + { + "epoch": 1.10198974609375, + "grad_norm": 16.676227569580078, + "learning_rate": 8.245702155457006e-06, + "loss": 4.9157, + "step": 54165 + }, + { + "epoch": 1.1020914713541667, + "grad_norm": 15.807001113891602, + "learning_rate": 8.24539812300631e-06, + "loss": 4.8709, + "step": 54170 + }, + { + "epoch": 1.1021931966145833, + "grad_norm": 15.332677841186523, + "learning_rate": 8.24509406981855e-06, + "loss": 4.8859, + "step": 54175 + }, + { + "epoch": 1.102294921875, + "grad_norm": 13.129786491394043, + "learning_rate": 8.24478999589567e-06, + "loss": 4.873, + "step": 54180 + }, + { + "epoch": 1.1023966471354167, + "grad_norm": 15.839837074279785, + "learning_rate": 8.244485901239611e-06, + "loss": 5.1968, + "step": 54185 + }, + { + "epoch": 1.1024983723958333, + "grad_norm": 18.230615615844727, + "learning_rate": 8.24418178585232e-06, + "loss": 4.8935, + "step": 54190 + }, + { + "epoch": 1.10260009765625, + "grad_norm": 20.861724853515625, + "learning_rate": 8.243877649735736e-06, + "loss": 4.763, + "step": 54195 + }, + { + "epoch": 1.1027018229166667, + "grad_norm": 15.326332092285156, + "learning_rate": 8.2435734928918e-06, + "loss": 4.7638, + "step": 54200 + }, + { + "epoch": 1.1028035481770833, + "grad_norm": 15.35434627532959, + "learning_rate": 8.243269315322465e-06, + "loss": 4.7857, + "step": 54205 + }, + { + "epoch": 1.1029052734375, + "grad_norm": 18.3499755859375, + "learning_rate": 8.242965117029665e-06, + "loss": 4.8681, + "step": 54210 + }, + { + "epoch": 1.1030069986979167, + "grad_norm": 22.57895851135254, + "learning_rate": 8.242660898015346e-06, + "loss": 4.729, + "step": 54215 + }, + { + "epoch": 1.1031087239583333, + "grad_norm": 18.92702865600586, + "learning_rate": 8.242356658281456e-06, + "loss": 5.0937, + "step": 54220 + }, + { + "epoch": 1.10321044921875, + "grad_norm": 17.80980682373047, + "learning_rate": 8.242052397829936e-06, + "loss": 4.9293, + "step": 54225 + }, + { + "epoch": 1.1033121744791667, + "grad_norm": 19.928516387939453, + "learning_rate": 8.24174811666273e-06, + "loss": 4.9236, + "step": 54230 + }, + { + "epoch": 1.1034138997395833, + "grad_norm": 20.05200958251953, + "learning_rate": 8.241443814781783e-06, + "loss": 4.8409, + "step": 54235 + }, + { + "epoch": 1.103515625, + "grad_norm": 21.937335968017578, + "learning_rate": 8.241139492189036e-06, + "loss": 5.0139, + "step": 54240 + }, + { + "epoch": 1.1036173502604167, + "grad_norm": 18.24981689453125, + "learning_rate": 8.24083514888644e-06, + "loss": 4.8836, + "step": 54245 + }, + { + "epoch": 1.1037190755208333, + "grad_norm": 14.80862045288086, + "learning_rate": 8.240530784875935e-06, + "loss": 4.8636, + "step": 54250 + }, + { + "epoch": 1.10382080078125, + "grad_norm": 12.181798934936523, + "learning_rate": 8.240226400159466e-06, + "loss": 5.0465, + "step": 54255 + }, + { + "epoch": 1.1039225260416667, + "grad_norm": 14.0560884475708, + "learning_rate": 8.23992199473898e-06, + "loss": 4.7734, + "step": 54260 + }, + { + "epoch": 1.1040242513020833, + "grad_norm": 14.76064395904541, + "learning_rate": 8.23961756861642e-06, + "loss": 4.5652, + "step": 54265 + }, + { + "epoch": 1.1041259765625, + "grad_norm": 22.50926971435547, + "learning_rate": 8.239313121793734e-06, + "loss": 5.0878, + "step": 54270 + }, + { + "epoch": 1.1042277018229167, + "grad_norm": 15.407154083251953, + "learning_rate": 8.239008654272864e-06, + "loss": 4.9658, + "step": 54275 + }, + { + "epoch": 1.1043294270833333, + "grad_norm": 13.428374290466309, + "learning_rate": 8.238704166055758e-06, + "loss": 5.2607, + "step": 54280 + }, + { + "epoch": 1.10443115234375, + "grad_norm": 18.61833381652832, + "learning_rate": 8.23839965714436e-06, + "loss": 5.0951, + "step": 54285 + }, + { + "epoch": 1.1045328776041667, + "grad_norm": 14.969680786132812, + "learning_rate": 8.238095127540615e-06, + "loss": 4.9785, + "step": 54290 + }, + { + "epoch": 1.1046346028645833, + "grad_norm": 17.847267150878906, + "learning_rate": 8.237790577246472e-06, + "loss": 5.0839, + "step": 54295 + }, + { + "epoch": 1.104736328125, + "grad_norm": 21.5604305267334, + "learning_rate": 8.237486006263873e-06, + "loss": 5.4782, + "step": 54300 + }, + { + "epoch": 1.1048380533854167, + "grad_norm": 19.674766540527344, + "learning_rate": 8.237181414594768e-06, + "loss": 5.0888, + "step": 54305 + }, + { + "epoch": 1.1049397786458333, + "grad_norm": 20.18660545349121, + "learning_rate": 8.236876802241101e-06, + "loss": 5.0504, + "step": 54310 + }, + { + "epoch": 1.10504150390625, + "grad_norm": 20.055978775024414, + "learning_rate": 8.236572169204818e-06, + "loss": 5.0282, + "step": 54315 + }, + { + "epoch": 1.1051432291666667, + "grad_norm": 14.598179817199707, + "learning_rate": 8.236267515487866e-06, + "loss": 5.1172, + "step": 54320 + }, + { + "epoch": 1.1052449544270833, + "grad_norm": 16.874073028564453, + "learning_rate": 8.235962841092193e-06, + "loss": 5.0133, + "step": 54325 + }, + { + "epoch": 1.1053466796875, + "grad_norm": 16.7474365234375, + "learning_rate": 8.235658146019744e-06, + "loss": 4.9482, + "step": 54330 + }, + { + "epoch": 1.1054484049479167, + "grad_norm": 19.101274490356445, + "learning_rate": 8.235353430272467e-06, + "loss": 4.787, + "step": 54335 + }, + { + "epoch": 1.1055501302083333, + "grad_norm": 15.959355354309082, + "learning_rate": 8.235048693852309e-06, + "loss": 5.0445, + "step": 54340 + }, + { + "epoch": 1.10565185546875, + "grad_norm": 19.98863983154297, + "learning_rate": 8.234743936761214e-06, + "loss": 4.8577, + "step": 54345 + }, + { + "epoch": 1.1057535807291667, + "grad_norm": 16.496356964111328, + "learning_rate": 8.234439159001136e-06, + "loss": 5.1682, + "step": 54350 + }, + { + "epoch": 1.1058553059895833, + "grad_norm": 20.255361557006836, + "learning_rate": 8.234134360574015e-06, + "loss": 5.1497, + "step": 54355 + }, + { + "epoch": 1.10595703125, + "grad_norm": 16.919145584106445, + "learning_rate": 8.233829541481804e-06, + "loss": 4.8271, + "step": 54360 + }, + { + "epoch": 1.1060587565104167, + "grad_norm": 20.264345169067383, + "learning_rate": 8.23352470172645e-06, + "loss": 4.9072, + "step": 54365 + }, + { + "epoch": 1.1061604817708333, + "grad_norm": 18.975446701049805, + "learning_rate": 8.233219841309898e-06, + "loss": 5.0695, + "step": 54370 + }, + { + "epoch": 1.10626220703125, + "grad_norm": 16.9041805267334, + "learning_rate": 8.232914960234097e-06, + "loss": 5.1103, + "step": 54375 + }, + { + "epoch": 1.1063639322916667, + "grad_norm": 15.529908180236816, + "learning_rate": 8.232610058500997e-06, + "loss": 5.0276, + "step": 54380 + }, + { + "epoch": 1.1064656575520833, + "grad_norm": 18.190872192382812, + "learning_rate": 8.232305136112544e-06, + "loss": 4.962, + "step": 54385 + }, + { + "epoch": 1.1065673828125, + "grad_norm": 15.374537467956543, + "learning_rate": 8.232000193070688e-06, + "loss": 5.061, + "step": 54390 + }, + { + "epoch": 1.1066691080729167, + "grad_norm": 18.481258392333984, + "learning_rate": 8.231695229377377e-06, + "loss": 4.9255, + "step": 54395 + }, + { + "epoch": 1.1067708333333333, + "grad_norm": 14.685301780700684, + "learning_rate": 8.231390245034558e-06, + "loss": 5.0414, + "step": 54400 + }, + { + "epoch": 1.10687255859375, + "grad_norm": 15.836023330688477, + "learning_rate": 8.231085240044181e-06, + "loss": 4.9181, + "step": 54405 + }, + { + "epoch": 1.1069742838541667, + "grad_norm": 17.055131912231445, + "learning_rate": 8.230780214408196e-06, + "loss": 4.9025, + "step": 54410 + }, + { + "epoch": 1.1070760091145833, + "grad_norm": 19.361549377441406, + "learning_rate": 8.23047516812855e-06, + "loss": 5.0984, + "step": 54415 + }, + { + "epoch": 1.107177734375, + "grad_norm": 17.521623611450195, + "learning_rate": 8.230170101207196e-06, + "loss": 5.0592, + "step": 54420 + }, + { + "epoch": 1.1072794596354167, + "grad_norm": 16.398637771606445, + "learning_rate": 8.229865013646078e-06, + "loss": 4.9528, + "step": 54425 + }, + { + "epoch": 1.1073811848958333, + "grad_norm": 16.551572799682617, + "learning_rate": 8.22955990544715e-06, + "loss": 4.8559, + "step": 54430 + }, + { + "epoch": 1.10748291015625, + "grad_norm": 18.862041473388672, + "learning_rate": 8.229254776612356e-06, + "loss": 4.9116, + "step": 54435 + }, + { + "epoch": 1.1075846354166667, + "grad_norm": 16.23636245727539, + "learning_rate": 8.22894962714365e-06, + "loss": 4.9319, + "step": 54440 + }, + { + "epoch": 1.1076863606770833, + "grad_norm": 15.333661079406738, + "learning_rate": 8.228644457042983e-06, + "loss": 4.951, + "step": 54445 + }, + { + "epoch": 1.1077880859375, + "grad_norm": 19.813907623291016, + "learning_rate": 8.2283392663123e-06, + "loss": 4.9521, + "step": 54450 + }, + { + "epoch": 1.1078898111979167, + "grad_norm": 16.592832565307617, + "learning_rate": 8.228034054953556e-06, + "loss": 4.915, + "step": 54455 + }, + { + "epoch": 1.1079915364583333, + "grad_norm": 15.250937461853027, + "learning_rate": 8.2277288229687e-06, + "loss": 4.6716, + "step": 54460 + }, + { + "epoch": 1.10809326171875, + "grad_norm": 25.076942443847656, + "learning_rate": 8.227423570359679e-06, + "loss": 4.8895, + "step": 54465 + }, + { + "epoch": 1.1081949869791667, + "grad_norm": 18.660564422607422, + "learning_rate": 8.227118297128447e-06, + "loss": 4.9385, + "step": 54470 + }, + { + "epoch": 1.1082967122395833, + "grad_norm": 19.236051559448242, + "learning_rate": 8.226813003276951e-06, + "loss": 5.0035, + "step": 54475 + }, + { + "epoch": 1.1083984375, + "grad_norm": 19.750343322753906, + "learning_rate": 8.226507688807148e-06, + "loss": 4.8241, + "step": 54480 + }, + { + "epoch": 1.1085001627604167, + "grad_norm": 14.482620239257812, + "learning_rate": 8.226202353720984e-06, + "loss": 5.1736, + "step": 54485 + }, + { + "epoch": 1.1086018880208333, + "grad_norm": 19.007291793823242, + "learning_rate": 8.22589699802041e-06, + "loss": 5.1949, + "step": 54490 + }, + { + "epoch": 1.10870361328125, + "grad_norm": 12.720458030700684, + "learning_rate": 8.22559162170738e-06, + "loss": 5.0628, + "step": 54495 + }, + { + "epoch": 1.1088053385416667, + "grad_norm": 16.255842208862305, + "learning_rate": 8.225286224783842e-06, + "loss": 5.0705, + "step": 54500 + }, + { + "epoch": 1.1089070638020833, + "grad_norm": 17.675195693969727, + "learning_rate": 8.22498080725175e-06, + "loss": 4.9167, + "step": 54505 + }, + { + "epoch": 1.1090087890625, + "grad_norm": 15.390296936035156, + "learning_rate": 8.224675369113053e-06, + "loss": 5.0498, + "step": 54510 + }, + { + "epoch": 1.1091105143229167, + "grad_norm": 19.195974349975586, + "learning_rate": 8.224369910369704e-06, + "loss": 4.9454, + "step": 54515 + }, + { + "epoch": 1.1092122395833333, + "grad_norm": 28.343914031982422, + "learning_rate": 8.224064431023657e-06, + "loss": 4.7183, + "step": 54520 + }, + { + "epoch": 1.10931396484375, + "grad_norm": 32.77104568481445, + "learning_rate": 8.22375893107686e-06, + "loss": 4.9376, + "step": 54525 + }, + { + "epoch": 1.1094156901041667, + "grad_norm": 21.35913848876953, + "learning_rate": 8.223453410531269e-06, + "loss": 4.9176, + "step": 54530 + }, + { + "epoch": 1.1095174153645833, + "grad_norm": 20.187013626098633, + "learning_rate": 8.22314786938883e-06, + "loss": 4.863, + "step": 54535 + }, + { + "epoch": 1.109619140625, + "grad_norm": 20.15289878845215, + "learning_rate": 8.222842307651501e-06, + "loss": 4.9091, + "step": 54540 + }, + { + "epoch": 1.1097208658854167, + "grad_norm": 16.527538299560547, + "learning_rate": 8.222536725321234e-06, + "loss": 4.8939, + "step": 54545 + }, + { + "epoch": 1.1098225911458333, + "grad_norm": 17.51457405090332, + "learning_rate": 8.22223112239998e-06, + "loss": 4.7292, + "step": 54550 + }, + { + "epoch": 1.10992431640625, + "grad_norm": 21.35895347595215, + "learning_rate": 8.221925498889692e-06, + "loss": 5.026, + "step": 54555 + }, + { + "epoch": 1.1100260416666667, + "grad_norm": 16.93853187561035, + "learning_rate": 8.221619854792321e-06, + "loss": 5.2669, + "step": 54560 + }, + { + "epoch": 1.1101277669270833, + "grad_norm": 16.790050506591797, + "learning_rate": 8.221314190109823e-06, + "loss": 4.8229, + "step": 54565 + }, + { + "epoch": 1.1102294921875, + "grad_norm": 16.285484313964844, + "learning_rate": 8.22100850484415e-06, + "loss": 4.9118, + "step": 54570 + }, + { + "epoch": 1.1103312174479167, + "grad_norm": 18.223634719848633, + "learning_rate": 8.220702798997256e-06, + "loss": 4.9654, + "step": 54575 + }, + { + "epoch": 1.1104329427083333, + "grad_norm": 14.611148834228516, + "learning_rate": 8.220397072571093e-06, + "loss": 5.1448, + "step": 54580 + }, + { + "epoch": 1.11053466796875, + "grad_norm": 25.256004333496094, + "learning_rate": 8.220091325567615e-06, + "loss": 4.9156, + "step": 54585 + }, + { + "epoch": 1.1106363932291667, + "grad_norm": 19.792470932006836, + "learning_rate": 8.219785557988774e-06, + "loss": 4.8801, + "step": 54590 + }, + { + "epoch": 1.1107381184895833, + "grad_norm": 16.465076446533203, + "learning_rate": 8.219479769836527e-06, + "loss": 4.9574, + "step": 54595 + }, + { + "epoch": 1.11083984375, + "grad_norm": 34.116233825683594, + "learning_rate": 8.219173961112826e-06, + "loss": 5.2166, + "step": 54600 + }, + { + "epoch": 1.1109415690104167, + "grad_norm": 14.508183479309082, + "learning_rate": 8.218868131819625e-06, + "loss": 4.9025, + "step": 54605 + }, + { + "epoch": 1.1110432942708333, + "grad_norm": 15.333868026733398, + "learning_rate": 8.218562281958878e-06, + "loss": 5.0919, + "step": 54610 + }, + { + "epoch": 1.11114501953125, + "grad_norm": 13.6915864944458, + "learning_rate": 8.21825641153254e-06, + "loss": 4.9433, + "step": 54615 + }, + { + "epoch": 1.1112467447916667, + "grad_norm": 13.815673828125, + "learning_rate": 8.217950520542566e-06, + "loss": 4.9203, + "step": 54620 + }, + { + "epoch": 1.1113484700520833, + "grad_norm": 16.869848251342773, + "learning_rate": 8.217644608990908e-06, + "loss": 4.872, + "step": 54625 + }, + { + "epoch": 1.1114501953125, + "grad_norm": 16.32183265686035, + "learning_rate": 8.217338676879524e-06, + "loss": 4.9282, + "step": 54630 + }, + { + "epoch": 1.1115519205729167, + "grad_norm": 13.346619606018066, + "learning_rate": 8.217032724210368e-06, + "loss": 4.8494, + "step": 54635 + }, + { + "epoch": 1.1116536458333333, + "grad_norm": 15.397651672363281, + "learning_rate": 8.216726750985393e-06, + "loss": 5.1249, + "step": 54640 + }, + { + "epoch": 1.11175537109375, + "grad_norm": 17.48270606994629, + "learning_rate": 8.216420757206555e-06, + "loss": 4.728, + "step": 54645 + }, + { + "epoch": 1.1118570963541667, + "grad_norm": 19.872936248779297, + "learning_rate": 8.216114742875809e-06, + "loss": 4.9993, + "step": 54650 + }, + { + "epoch": 1.1119588216145833, + "grad_norm": 17.249832153320312, + "learning_rate": 8.215808707995112e-06, + "loss": 4.8497, + "step": 54655 + }, + { + "epoch": 1.112060546875, + "grad_norm": 16.705087661743164, + "learning_rate": 8.215502652566417e-06, + "loss": 5.1827, + "step": 54660 + }, + { + "epoch": 1.1121622721354167, + "grad_norm": 16.15424346923828, + "learning_rate": 8.215196576591681e-06, + "loss": 5.0381, + "step": 54665 + }, + { + "epoch": 1.1122639973958333, + "grad_norm": 18.954986572265625, + "learning_rate": 8.21489048007286e-06, + "loss": 4.957, + "step": 54670 + }, + { + "epoch": 1.11236572265625, + "grad_norm": 21.099760055541992, + "learning_rate": 8.214584363011908e-06, + "loss": 4.8301, + "step": 54675 + }, + { + "epoch": 1.1124674479166667, + "grad_norm": 15.790409088134766, + "learning_rate": 8.214278225410783e-06, + "loss": 4.9542, + "step": 54680 + }, + { + "epoch": 1.1125691731770833, + "grad_norm": 13.303177833557129, + "learning_rate": 8.213972067271441e-06, + "loss": 4.7659, + "step": 54685 + }, + { + "epoch": 1.1126708984375, + "grad_norm": 21.141820907592773, + "learning_rate": 8.213665888595837e-06, + "loss": 5.0989, + "step": 54690 + }, + { + "epoch": 1.1127726236979167, + "grad_norm": 15.642363548278809, + "learning_rate": 8.21335968938593e-06, + "loss": 4.9913, + "step": 54695 + }, + { + "epoch": 1.1128743489583333, + "grad_norm": 19.266592025756836, + "learning_rate": 8.213053469643673e-06, + "loss": 4.8633, + "step": 54700 + }, + { + "epoch": 1.11297607421875, + "grad_norm": 19.529672622680664, + "learning_rate": 8.212747229371025e-06, + "loss": 4.8898, + "step": 54705 + }, + { + "epoch": 1.1130777994791667, + "grad_norm": 21.841394424438477, + "learning_rate": 8.212440968569941e-06, + "loss": 4.9414, + "step": 54710 + }, + { + "epoch": 1.1131795247395833, + "grad_norm": 15.376835823059082, + "learning_rate": 8.212134687242381e-06, + "loss": 5.0028, + "step": 54715 + }, + { + "epoch": 1.11328125, + "grad_norm": 19.149961471557617, + "learning_rate": 8.211828385390297e-06, + "loss": 4.8103, + "step": 54720 + }, + { + "epoch": 1.1133829752604167, + "grad_norm": 14.024057388305664, + "learning_rate": 8.211522063015654e-06, + "loss": 5.0385, + "step": 54725 + }, + { + "epoch": 1.1134847005208333, + "grad_norm": 24.21324348449707, + "learning_rate": 8.2112157201204e-06, + "loss": 5.0646, + "step": 54730 + }, + { + "epoch": 1.11358642578125, + "grad_norm": 16.814023971557617, + "learning_rate": 8.210909356706498e-06, + "loss": 5.1614, + "step": 54735 + }, + { + "epoch": 1.1136881510416667, + "grad_norm": 14.639111518859863, + "learning_rate": 8.210602972775906e-06, + "loss": 4.9814, + "step": 54740 + }, + { + "epoch": 1.1137898763020833, + "grad_norm": 17.481369018554688, + "learning_rate": 8.210296568330579e-06, + "loss": 4.6444, + "step": 54745 + }, + { + "epoch": 1.1138916015625, + "grad_norm": 15.261207580566406, + "learning_rate": 8.209990143372475e-06, + "loss": 4.903, + "step": 54750 + }, + { + "epoch": 1.1139933268229167, + "grad_norm": 15.805668830871582, + "learning_rate": 8.209683697903553e-06, + "loss": 5.1723, + "step": 54755 + }, + { + "epoch": 1.1140950520833333, + "grad_norm": 23.23613929748535, + "learning_rate": 8.209377231925773e-06, + "loss": 4.8188, + "step": 54760 + }, + { + "epoch": 1.11419677734375, + "grad_norm": 19.799610137939453, + "learning_rate": 8.20907074544109e-06, + "loss": 5.0057, + "step": 54765 + }, + { + "epoch": 1.1142985026041667, + "grad_norm": 21.261594772338867, + "learning_rate": 8.208764238451464e-06, + "loss": 4.9616, + "step": 54770 + }, + { + "epoch": 1.1144002278645833, + "grad_norm": 19.30398941040039, + "learning_rate": 8.208457710958854e-06, + "loss": 4.8375, + "step": 54775 + }, + { + "epoch": 1.114501953125, + "grad_norm": 15.247200965881348, + "learning_rate": 8.208151162965217e-06, + "loss": 4.7947, + "step": 54780 + }, + { + "epoch": 1.1146036783854167, + "grad_norm": 17.577285766601562, + "learning_rate": 8.207844594472512e-06, + "loss": 5.0223, + "step": 54785 + }, + { + "epoch": 1.1147054036458333, + "grad_norm": 23.40019989013672, + "learning_rate": 8.207538005482698e-06, + "loss": 5.111, + "step": 54790 + }, + { + "epoch": 1.11480712890625, + "grad_norm": 18.976444244384766, + "learning_rate": 8.207231395997735e-06, + "loss": 4.949, + "step": 54795 + }, + { + "epoch": 1.1149088541666667, + "grad_norm": 20.588781356811523, + "learning_rate": 8.206924766019582e-06, + "loss": 5.0109, + "step": 54800 + }, + { + "epoch": 1.1150105794270833, + "grad_norm": 16.704423904418945, + "learning_rate": 8.206618115550198e-06, + "loss": 4.7716, + "step": 54805 + }, + { + "epoch": 1.1151123046875, + "grad_norm": 19.623437881469727, + "learning_rate": 8.20631144459154e-06, + "loss": 4.8956, + "step": 54810 + }, + { + "epoch": 1.1152140299479167, + "grad_norm": 19.19603157043457, + "learning_rate": 8.20600475314557e-06, + "loss": 4.8252, + "step": 54815 + }, + { + "epoch": 1.1153157552083333, + "grad_norm": 23.848384857177734, + "learning_rate": 8.20569804121425e-06, + "loss": 5.2045, + "step": 54820 + }, + { + "epoch": 1.11541748046875, + "grad_norm": 17.277599334716797, + "learning_rate": 8.205391308799535e-06, + "loss": 5.2719, + "step": 54825 + }, + { + "epoch": 1.1155192057291667, + "grad_norm": 16.952695846557617, + "learning_rate": 8.205084555903388e-06, + "loss": 5.0217, + "step": 54830 + }, + { + "epoch": 1.1156209309895833, + "grad_norm": 14.903968811035156, + "learning_rate": 8.204777782527768e-06, + "loss": 4.7943, + "step": 54835 + }, + { + "epoch": 1.11572265625, + "grad_norm": 18.222082138061523, + "learning_rate": 8.204470988674636e-06, + "loss": 5.0295, + "step": 54840 + }, + { + "epoch": 1.1158243815104167, + "grad_norm": 15.040075302124023, + "learning_rate": 8.20416417434595e-06, + "loss": 5.012, + "step": 54845 + }, + { + "epoch": 1.1159261067708333, + "grad_norm": 21.26192855834961, + "learning_rate": 8.203857339543672e-06, + "loss": 5.1178, + "step": 54850 + }, + { + "epoch": 1.11602783203125, + "grad_norm": 19.554527282714844, + "learning_rate": 8.203550484269765e-06, + "loss": 4.9354, + "step": 54855 + }, + { + "epoch": 1.1161295572916667, + "grad_norm": 18.631494522094727, + "learning_rate": 8.203243608526185e-06, + "loss": 4.8264, + "step": 54860 + }, + { + "epoch": 1.1162312825520833, + "grad_norm": 20.936599731445312, + "learning_rate": 8.202936712314896e-06, + "loss": 4.9256, + "step": 54865 + }, + { + "epoch": 1.1163330078125, + "grad_norm": 18.992319107055664, + "learning_rate": 8.202629795637857e-06, + "loss": 4.8601, + "step": 54870 + }, + { + "epoch": 1.1164347330729167, + "grad_norm": 15.542969703674316, + "learning_rate": 8.202322858497031e-06, + "loss": 4.8885, + "step": 54875 + }, + { + "epoch": 1.1165364583333333, + "grad_norm": 18.082162857055664, + "learning_rate": 8.20201590089438e-06, + "loss": 5.0776, + "step": 54880 + }, + { + "epoch": 1.11663818359375, + "grad_norm": 18.410831451416016, + "learning_rate": 8.201708922831862e-06, + "loss": 5.4358, + "step": 54885 + }, + { + "epoch": 1.1167399088541667, + "grad_norm": 23.933210372924805, + "learning_rate": 8.201401924311441e-06, + "loss": 4.9563, + "step": 54890 + }, + { + "epoch": 1.1168416341145833, + "grad_norm": 18.023767471313477, + "learning_rate": 8.20109490533508e-06, + "loss": 4.7258, + "step": 54895 + }, + { + "epoch": 1.116943359375, + "grad_norm": 15.169840812683105, + "learning_rate": 8.200787865904735e-06, + "loss": 4.7473, + "step": 54900 + }, + { + "epoch": 1.1170450846354167, + "grad_norm": 18.718704223632812, + "learning_rate": 8.200480806022372e-06, + "loss": 4.8769, + "step": 54905 + }, + { + "epoch": 1.1171468098958333, + "grad_norm": 12.582695007324219, + "learning_rate": 8.200173725689954e-06, + "loss": 4.8849, + "step": 54910 + }, + { + "epoch": 1.11724853515625, + "grad_norm": 13.451717376708984, + "learning_rate": 8.199866624909442e-06, + "loss": 5.0347, + "step": 54915 + }, + { + "epoch": 1.1173502604166667, + "grad_norm": 15.585089683532715, + "learning_rate": 8.199559503682799e-06, + "loss": 4.9863, + "step": 54920 + }, + { + "epoch": 1.1174519856770833, + "grad_norm": 13.592560768127441, + "learning_rate": 8.199252362011985e-06, + "loss": 4.8231, + "step": 54925 + }, + { + "epoch": 1.1175537109375, + "grad_norm": 17.120115280151367, + "learning_rate": 8.198945199898964e-06, + "loss": 5.0584, + "step": 54930 + }, + { + "epoch": 1.1176554361979167, + "grad_norm": 19.682809829711914, + "learning_rate": 8.198638017345699e-06, + "loss": 4.9988, + "step": 54935 + }, + { + "epoch": 1.1177571614583333, + "grad_norm": 19.952070236206055, + "learning_rate": 8.198330814354151e-06, + "loss": 5.1471, + "step": 54940 + }, + { + "epoch": 1.11785888671875, + "grad_norm": 25.808130264282227, + "learning_rate": 8.198023590926288e-06, + "loss": 4.9026, + "step": 54945 + }, + { + "epoch": 1.1179606119791667, + "grad_norm": 17.483015060424805, + "learning_rate": 8.197716347064066e-06, + "loss": 4.8907, + "step": 54950 + }, + { + "epoch": 1.1180623372395833, + "grad_norm": 31.86693572998047, + "learning_rate": 8.197409082769453e-06, + "loss": 4.9437, + "step": 54955 + }, + { + "epoch": 1.1181640625, + "grad_norm": 23.098876953125, + "learning_rate": 8.197101798044411e-06, + "loss": 4.9269, + "step": 54960 + }, + { + "epoch": 1.1182657877604167, + "grad_norm": 19.32411003112793, + "learning_rate": 8.196794492890902e-06, + "loss": 5.3368, + "step": 54965 + }, + { + "epoch": 1.1183675130208333, + "grad_norm": 15.08047866821289, + "learning_rate": 8.196487167310894e-06, + "loss": 5.0655, + "step": 54970 + }, + { + "epoch": 1.11846923828125, + "grad_norm": 17.80780601501465, + "learning_rate": 8.196179821306345e-06, + "loss": 4.748, + "step": 54975 + }, + { + "epoch": 1.1185709635416667, + "grad_norm": 18.77204704284668, + "learning_rate": 8.195872454879221e-06, + "loss": 4.9113, + "step": 54980 + }, + { + "epoch": 1.1186726888020833, + "grad_norm": 18.816204071044922, + "learning_rate": 8.195565068031488e-06, + "loss": 4.8767, + "step": 54985 + }, + { + "epoch": 1.1187744140625, + "grad_norm": 19.57417106628418, + "learning_rate": 8.19525766076511e-06, + "loss": 4.9564, + "step": 54990 + }, + { + "epoch": 1.1188761393229167, + "grad_norm": 15.791802406311035, + "learning_rate": 8.19495023308205e-06, + "loss": 5.024, + "step": 54995 + }, + { + "epoch": 1.1189778645833333, + "grad_norm": 17.20021629333496, + "learning_rate": 8.194642784984268e-06, + "loss": 4.8688, + "step": 55000 + }, + { + "epoch": 1.11907958984375, + "grad_norm": 17.46060562133789, + "learning_rate": 8.194335316473736e-06, + "loss": 4.8765, + "step": 55005 + }, + { + "epoch": 1.1191813151041667, + "grad_norm": 20.72218894958496, + "learning_rate": 8.194027827552415e-06, + "loss": 5.0446, + "step": 55010 + }, + { + "epoch": 1.1192830403645833, + "grad_norm": 19.549232482910156, + "learning_rate": 8.19372031822227e-06, + "loss": 4.8706, + "step": 55015 + }, + { + "epoch": 1.119384765625, + "grad_norm": 19.081809997558594, + "learning_rate": 8.193412788485264e-06, + "loss": 4.9057, + "step": 55020 + }, + { + "epoch": 1.1194864908854167, + "grad_norm": 14.480727195739746, + "learning_rate": 8.193105238343367e-06, + "loss": 4.7239, + "step": 55025 + }, + { + "epoch": 1.1195882161458333, + "grad_norm": 16.69268798828125, + "learning_rate": 8.192797667798538e-06, + "loss": 4.8596, + "step": 55030 + }, + { + "epoch": 1.11968994140625, + "grad_norm": 17.97955894470215, + "learning_rate": 8.192490076852749e-06, + "loss": 5.1293, + "step": 55035 + }, + { + "epoch": 1.1197916666666667, + "grad_norm": 14.983587265014648, + "learning_rate": 8.19218246550796e-06, + "loss": 4.832, + "step": 55040 + }, + { + "epoch": 1.1198933919270833, + "grad_norm": 13.328211784362793, + "learning_rate": 8.191874833766139e-06, + "loss": 4.9201, + "step": 55045 + }, + { + "epoch": 1.1199951171875, + "grad_norm": 20.03533172607422, + "learning_rate": 8.19156718162925e-06, + "loss": 4.9459, + "step": 55050 + }, + { + "epoch": 1.1200968424479167, + "grad_norm": 16.176376342773438, + "learning_rate": 8.191259509099261e-06, + "loss": 4.5995, + "step": 55055 + }, + { + "epoch": 1.1201985677083333, + "grad_norm": 16.91315460205078, + "learning_rate": 8.190951816178137e-06, + "loss": 4.8542, + "step": 55060 + }, + { + "epoch": 1.12030029296875, + "grad_norm": 18.513595581054688, + "learning_rate": 8.190644102867841e-06, + "loss": 5.0674, + "step": 55065 + }, + { + "epoch": 1.1204020182291667, + "grad_norm": 14.662572860717773, + "learning_rate": 8.190336369170344e-06, + "loss": 5.2423, + "step": 55070 + }, + { + "epoch": 1.1205037434895833, + "grad_norm": 20.03913688659668, + "learning_rate": 8.19002861508761e-06, + "loss": 4.8568, + "step": 55075 + }, + { + "epoch": 1.12060546875, + "grad_norm": 14.698458671569824, + "learning_rate": 8.189720840621606e-06, + "loss": 4.9823, + "step": 55080 + }, + { + "epoch": 1.1207071940104167, + "grad_norm": 17.362192153930664, + "learning_rate": 8.1894130457743e-06, + "loss": 4.8511, + "step": 55085 + }, + { + "epoch": 1.1208089192708333, + "grad_norm": 20.22083854675293, + "learning_rate": 8.189105230547657e-06, + "loss": 4.9992, + "step": 55090 + }, + { + "epoch": 1.12091064453125, + "grad_norm": 24.560819625854492, + "learning_rate": 8.188797394943642e-06, + "loss": 4.7841, + "step": 55095 + }, + { + "epoch": 1.1210123697916667, + "grad_norm": 20.473899841308594, + "learning_rate": 8.188489538964225e-06, + "loss": 5.157, + "step": 55100 + }, + { + "epoch": 1.1211140950520833, + "grad_norm": 15.997573852539062, + "learning_rate": 8.18818166261137e-06, + "loss": 4.9209, + "step": 55105 + }, + { + "epoch": 1.1212158203125, + "grad_norm": 17.665184020996094, + "learning_rate": 8.187873765887048e-06, + "loss": 4.9952, + "step": 55110 + }, + { + "epoch": 1.1213175455729167, + "grad_norm": 16.966758728027344, + "learning_rate": 8.187565848793226e-06, + "loss": 4.8679, + "step": 55115 + }, + { + "epoch": 1.1214192708333333, + "grad_norm": 17.935073852539062, + "learning_rate": 8.187257911331867e-06, + "loss": 5.0383, + "step": 55120 + }, + { + "epoch": 1.12152099609375, + "grad_norm": 21.728437423706055, + "learning_rate": 8.186949953504943e-06, + "loss": 4.8373, + "step": 55125 + }, + { + "epoch": 1.1216227213541667, + "grad_norm": 20.292997360229492, + "learning_rate": 8.18664197531442e-06, + "loss": 4.879, + "step": 55130 + }, + { + "epoch": 1.1217244466145833, + "grad_norm": 16.029117584228516, + "learning_rate": 8.186333976762269e-06, + "loss": 4.8019, + "step": 55135 + }, + { + "epoch": 1.121826171875, + "grad_norm": 16.05030059814453, + "learning_rate": 8.186025957850452e-06, + "loss": 5.051, + "step": 55140 + }, + { + "epoch": 1.1219278971354167, + "grad_norm": 19.835926055908203, + "learning_rate": 8.185717918580943e-06, + "loss": 5.0758, + "step": 55145 + }, + { + "epoch": 1.1220296223958333, + "grad_norm": 20.39017105102539, + "learning_rate": 8.185409858955706e-06, + "loss": 4.9366, + "step": 55150 + }, + { + "epoch": 1.12213134765625, + "grad_norm": 11.876090049743652, + "learning_rate": 8.185101778976711e-06, + "loss": 4.8286, + "step": 55155 + }, + { + "epoch": 1.1222330729166667, + "grad_norm": 20.164827346801758, + "learning_rate": 8.184793678645929e-06, + "loss": 4.8933, + "step": 55160 + }, + { + "epoch": 1.1223347981770833, + "grad_norm": 16.812335968017578, + "learning_rate": 8.184485557965326e-06, + "loss": 4.6906, + "step": 55165 + }, + { + "epoch": 1.1224365234375, + "grad_norm": 16.20962142944336, + "learning_rate": 8.184177416936868e-06, + "loss": 5.0959, + "step": 55170 + }, + { + "epoch": 1.1225382486979167, + "grad_norm": 23.20538902282715, + "learning_rate": 8.18386925556253e-06, + "loss": 5.1196, + "step": 55175 + }, + { + "epoch": 1.1226399739583333, + "grad_norm": 18.438617706298828, + "learning_rate": 8.183561073844277e-06, + "loss": 4.8632, + "step": 55180 + }, + { + "epoch": 1.12274169921875, + "grad_norm": 19.1766414642334, + "learning_rate": 8.18325287178408e-06, + "loss": 4.969, + "step": 55185 + }, + { + "epoch": 1.1228434244791667, + "grad_norm": 18.888246536254883, + "learning_rate": 8.182944649383909e-06, + "loss": 5.0047, + "step": 55190 + }, + { + "epoch": 1.1229451497395833, + "grad_norm": 18.618587493896484, + "learning_rate": 8.18263640664573e-06, + "loss": 5.1744, + "step": 55195 + }, + { + "epoch": 1.123046875, + "grad_norm": 17.621580123901367, + "learning_rate": 8.182328143571515e-06, + "loss": 4.9951, + "step": 55200 + }, + { + "epoch": 1.1231486002604167, + "grad_norm": 15.243193626403809, + "learning_rate": 8.182019860163233e-06, + "loss": 4.9465, + "step": 55205 + }, + { + "epoch": 1.1232503255208333, + "grad_norm": 20.241533279418945, + "learning_rate": 8.181711556422855e-06, + "loss": 5.156, + "step": 55210 + }, + { + "epoch": 1.12335205078125, + "grad_norm": 18.326814651489258, + "learning_rate": 8.18140323235235e-06, + "loss": 4.8602, + "step": 55215 + }, + { + "epoch": 1.1234537760416667, + "grad_norm": 18.858205795288086, + "learning_rate": 8.181094887953687e-06, + "loss": 4.9002, + "step": 55220 + }, + { + "epoch": 1.1235555013020833, + "grad_norm": 17.87504768371582, + "learning_rate": 8.180786523228838e-06, + "loss": 5.4617, + "step": 55225 + }, + { + "epoch": 1.1236572265625, + "grad_norm": 12.746597290039062, + "learning_rate": 8.180478138179775e-06, + "loss": 5.0406, + "step": 55230 + }, + { + "epoch": 1.1237589518229167, + "grad_norm": 16.735095977783203, + "learning_rate": 8.180169732808466e-06, + "loss": 4.7892, + "step": 55235 + }, + { + "epoch": 1.1238606770833333, + "grad_norm": 17.26240348815918, + "learning_rate": 8.179861307116878e-06, + "loss": 4.9219, + "step": 55240 + }, + { + "epoch": 1.12396240234375, + "grad_norm": 21.112060546875, + "learning_rate": 8.17955286110699e-06, + "loss": 5.0591, + "step": 55245 + }, + { + "epoch": 1.1240641276041667, + "grad_norm": 15.7099027633667, + "learning_rate": 8.179244394780767e-06, + "loss": 4.8443, + "step": 55250 + }, + { + "epoch": 1.1241658528645833, + "grad_norm": 18.614395141601562, + "learning_rate": 8.178935908140182e-06, + "loss": 4.834, + "step": 55255 + }, + { + "epoch": 1.124267578125, + "grad_norm": 20.908071517944336, + "learning_rate": 8.178627401187205e-06, + "loss": 4.8445, + "step": 55260 + }, + { + "epoch": 1.1243693033854167, + "grad_norm": 17.762052536010742, + "learning_rate": 8.178318873923807e-06, + "loss": 4.7698, + "step": 55265 + }, + { + "epoch": 1.1244710286458333, + "grad_norm": 18.09467315673828, + "learning_rate": 8.178010326351962e-06, + "loss": 4.8485, + "step": 55270 + }, + { + "epoch": 1.12457275390625, + "grad_norm": 17.463666915893555, + "learning_rate": 8.177701758473639e-06, + "loss": 5.1003, + "step": 55275 + }, + { + "epoch": 1.1246744791666667, + "grad_norm": 17.826623916625977, + "learning_rate": 8.17739317029081e-06, + "loss": 5.0225, + "step": 55280 + }, + { + "epoch": 1.1247762044270833, + "grad_norm": 25.84820556640625, + "learning_rate": 8.177084561805449e-06, + "loss": 5.0312, + "step": 55285 + }, + { + "epoch": 1.1248779296875, + "grad_norm": 19.52773094177246, + "learning_rate": 8.176775933019525e-06, + "loss": 4.6731, + "step": 55290 + }, + { + "epoch": 1.1249796549479167, + "grad_norm": 22.392614364624023, + "learning_rate": 8.176467283935012e-06, + "loss": 4.7934, + "step": 55295 + }, + { + "epoch": 1.1250813802083333, + "grad_norm": 19.832773208618164, + "learning_rate": 8.176158614553879e-06, + "loss": 4.9643, + "step": 55300 + }, + { + "epoch": 1.12518310546875, + "grad_norm": 15.632636070251465, + "learning_rate": 8.175849924878104e-06, + "loss": 4.6776, + "step": 55305 + }, + { + "epoch": 1.1252848307291667, + "grad_norm": 17.17580795288086, + "learning_rate": 8.175541214909655e-06, + "loss": 5.005, + "step": 55310 + }, + { + "epoch": 1.1253865559895833, + "grad_norm": 15.123658180236816, + "learning_rate": 8.175232484650504e-06, + "loss": 4.7752, + "step": 55315 + }, + { + "epoch": 1.12548828125, + "grad_norm": 17.311635971069336, + "learning_rate": 8.174923734102628e-06, + "loss": 5.1113, + "step": 55320 + }, + { + "epoch": 1.1255900065104167, + "grad_norm": 17.279747009277344, + "learning_rate": 8.174614963267995e-06, + "loss": 4.8695, + "step": 55325 + }, + { + "epoch": 1.1256917317708333, + "grad_norm": 14.172916412353516, + "learning_rate": 8.174306172148583e-06, + "loss": 4.8382, + "step": 55330 + }, + { + "epoch": 1.12579345703125, + "grad_norm": 14.792123794555664, + "learning_rate": 8.17399736074636e-06, + "loss": 4.7967, + "step": 55335 + }, + { + "epoch": 1.1258951822916667, + "grad_norm": 19.540971755981445, + "learning_rate": 8.173688529063301e-06, + "loss": 4.979, + "step": 55340 + }, + { + "epoch": 1.1259969075520833, + "grad_norm": 22.52972412109375, + "learning_rate": 8.17337967710138e-06, + "loss": 5.0169, + "step": 55345 + }, + { + "epoch": 1.1260986328125, + "grad_norm": 21.25628089904785, + "learning_rate": 8.173070804862571e-06, + "loss": 4.7435, + "step": 55350 + }, + { + "epoch": 1.1262003580729167, + "grad_norm": 19.949752807617188, + "learning_rate": 8.172761912348848e-06, + "loss": 5.0194, + "step": 55355 + }, + { + "epoch": 1.1263020833333333, + "grad_norm": 16.000831604003906, + "learning_rate": 8.172452999562182e-06, + "loss": 4.8588, + "step": 55360 + }, + { + "epoch": 1.12640380859375, + "grad_norm": 24.24630355834961, + "learning_rate": 8.172144066504548e-06, + "loss": 5.066, + "step": 55365 + }, + { + "epoch": 1.1265055338541667, + "grad_norm": 17.739221572875977, + "learning_rate": 8.171835113177921e-06, + "loss": 4.95, + "step": 55370 + }, + { + "epoch": 1.1266072591145833, + "grad_norm": 16.31348991394043, + "learning_rate": 8.171526139584275e-06, + "loss": 4.8108, + "step": 55375 + }, + { + "epoch": 1.126708984375, + "grad_norm": 13.441838264465332, + "learning_rate": 8.171217145725583e-06, + "loss": 4.7742, + "step": 55380 + }, + { + "epoch": 1.1268107096354167, + "grad_norm": 16.609899520874023, + "learning_rate": 8.170908131603821e-06, + "loss": 4.8527, + "step": 55385 + }, + { + "epoch": 1.1269124348958333, + "grad_norm": 19.307498931884766, + "learning_rate": 8.170599097220961e-06, + "loss": 5.229, + "step": 55390 + }, + { + "epoch": 1.12701416015625, + "grad_norm": 17.054851531982422, + "learning_rate": 8.17029004257898e-06, + "loss": 4.9512, + "step": 55395 + }, + { + "epoch": 1.1271158854166667, + "grad_norm": 18.796159744262695, + "learning_rate": 8.169980967679851e-06, + "loss": 4.7153, + "step": 55400 + }, + { + "epoch": 1.1272176106770833, + "grad_norm": 13.52328872680664, + "learning_rate": 8.169671872525552e-06, + "loss": 4.6859, + "step": 55405 + }, + { + "epoch": 1.1273193359375, + "grad_norm": 14.470532417297363, + "learning_rate": 8.169362757118055e-06, + "loss": 4.9975, + "step": 55410 + }, + { + "epoch": 1.1274210611979167, + "grad_norm": 24.314105987548828, + "learning_rate": 8.169053621459335e-06, + "loss": 4.7756, + "step": 55415 + }, + { + "epoch": 1.1275227864583333, + "grad_norm": 14.094664573669434, + "learning_rate": 8.16874446555137e-06, + "loss": 4.8389, + "step": 55420 + }, + { + "epoch": 1.12762451171875, + "grad_norm": 20.29549217224121, + "learning_rate": 8.168435289396134e-06, + "loss": 5.0952, + "step": 55425 + }, + { + "epoch": 1.1277262369791667, + "grad_norm": 26.84075164794922, + "learning_rate": 8.1681260929956e-06, + "loss": 4.9022, + "step": 55430 + }, + { + "epoch": 1.1278279622395833, + "grad_norm": 15.889816284179688, + "learning_rate": 8.167816876351748e-06, + "loss": 5.093, + "step": 55435 + }, + { + "epoch": 1.1279296875, + "grad_norm": 13.945850372314453, + "learning_rate": 8.167507639466551e-06, + "loss": 5.0775, + "step": 55440 + }, + { + "epoch": 1.1280314127604167, + "grad_norm": 15.13913345336914, + "learning_rate": 8.167198382341987e-06, + "loss": 4.9071, + "step": 55445 + }, + { + "epoch": 1.1281331380208333, + "grad_norm": 20.124723434448242, + "learning_rate": 8.16688910498003e-06, + "loss": 4.8656, + "step": 55450 + }, + { + "epoch": 1.12823486328125, + "grad_norm": 14.68579387664795, + "learning_rate": 8.166579807382656e-06, + "loss": 4.813, + "step": 55455 + }, + { + "epoch": 1.1283365885416667, + "grad_norm": 17.346920013427734, + "learning_rate": 8.166270489551843e-06, + "loss": 4.9123, + "step": 55460 + }, + { + "epoch": 1.1284383138020833, + "grad_norm": 15.793000221252441, + "learning_rate": 8.165961151489567e-06, + "loss": 4.9171, + "step": 55465 + }, + { + "epoch": 1.1285400390625, + "grad_norm": 15.92540168762207, + "learning_rate": 8.165651793197803e-06, + "loss": 4.9516, + "step": 55470 + }, + { + "epoch": 1.1286417643229167, + "grad_norm": 14.597973823547363, + "learning_rate": 8.16534241467853e-06, + "loss": 4.7602, + "step": 55475 + }, + { + "epoch": 1.1287434895833333, + "grad_norm": 16.751140594482422, + "learning_rate": 8.165033015933724e-06, + "loss": 4.8914, + "step": 55480 + }, + { + "epoch": 1.12884521484375, + "grad_norm": 16.18271255493164, + "learning_rate": 8.16472359696536e-06, + "loss": 5.1191, + "step": 55485 + }, + { + "epoch": 1.1289469401041667, + "grad_norm": 17.999746322631836, + "learning_rate": 8.164414157775419e-06, + "loss": 5.1375, + "step": 55490 + }, + { + "epoch": 1.1290486653645833, + "grad_norm": 16.95197868347168, + "learning_rate": 8.164104698365874e-06, + "loss": 5.1008, + "step": 55495 + }, + { + "epoch": 1.129150390625, + "grad_norm": 25.836448669433594, + "learning_rate": 8.163795218738706e-06, + "loss": 4.9349, + "step": 55500 + }, + { + "epoch": 1.1292521158854167, + "grad_norm": 17.424545288085938, + "learning_rate": 8.16348571889589e-06, + "loss": 4.9716, + "step": 55505 + }, + { + "epoch": 1.1293538411458333, + "grad_norm": 14.750826835632324, + "learning_rate": 8.163176198839405e-06, + "loss": 5.0546, + "step": 55510 + }, + { + "epoch": 1.12945556640625, + "grad_norm": 20.180490493774414, + "learning_rate": 8.16286665857123e-06, + "loss": 4.6278, + "step": 55515 + }, + { + "epoch": 1.1295572916666667, + "grad_norm": 15.155683517456055, + "learning_rate": 8.162557098093338e-06, + "loss": 5.1308, + "step": 55520 + }, + { + "epoch": 1.1296590169270833, + "grad_norm": 19.777894973754883, + "learning_rate": 8.16224751740771e-06, + "loss": 4.9585, + "step": 55525 + }, + { + "epoch": 1.1297607421875, + "grad_norm": 19.593002319335938, + "learning_rate": 8.161937916516327e-06, + "loss": 5.1684, + "step": 55530 + }, + { + "epoch": 1.1298624674479167, + "grad_norm": 19.82541847229004, + "learning_rate": 8.161628295421161e-06, + "loss": 4.8322, + "step": 55535 + }, + { + "epoch": 1.1299641927083333, + "grad_norm": 19.377737045288086, + "learning_rate": 8.161318654124197e-06, + "loss": 4.7332, + "step": 55540 + }, + { + "epoch": 1.13006591796875, + "grad_norm": 21.608909606933594, + "learning_rate": 8.161008992627408e-06, + "loss": 5.3074, + "step": 55545 + }, + { + "epoch": 1.1301676432291667, + "grad_norm": 14.77971363067627, + "learning_rate": 8.160699310932776e-06, + "loss": 5.1716, + "step": 55550 + }, + { + "epoch": 1.1302693684895833, + "grad_norm": 16.20998191833496, + "learning_rate": 8.16038960904228e-06, + "loss": 4.936, + "step": 55555 + }, + { + "epoch": 1.13037109375, + "grad_norm": 14.872657775878906, + "learning_rate": 8.160079886957895e-06, + "loss": 4.9961, + "step": 55560 + }, + { + "epoch": 1.1304728190104167, + "grad_norm": 15.946368217468262, + "learning_rate": 8.159770144681605e-06, + "loss": 5.0395, + "step": 55565 + }, + { + "epoch": 1.1305745442708333, + "grad_norm": 18.136747360229492, + "learning_rate": 8.159460382215385e-06, + "loss": 4.8228, + "step": 55570 + }, + { + "epoch": 1.13067626953125, + "grad_norm": 17.77008056640625, + "learning_rate": 8.15915059956122e-06, + "loss": 5.0428, + "step": 55575 + }, + { + "epoch": 1.1307779947916667, + "grad_norm": 17.041231155395508, + "learning_rate": 8.15884079672108e-06, + "loss": 4.8628, + "step": 55580 + }, + { + "epoch": 1.1308797200520833, + "grad_norm": 15.741969108581543, + "learning_rate": 8.158530973696953e-06, + "loss": 5.0674, + "step": 55585 + }, + { + "epoch": 1.1309814453125, + "grad_norm": 15.47657299041748, + "learning_rate": 8.158221130490816e-06, + "loss": 4.941, + "step": 55590 + }, + { + "epoch": 1.1310831705729167, + "grad_norm": 18.965469360351562, + "learning_rate": 8.157911267104647e-06, + "loss": 4.7739, + "step": 55595 + }, + { + "epoch": 1.1311848958333333, + "grad_norm": 15.656847953796387, + "learning_rate": 8.157601383540429e-06, + "loss": 4.9783, + "step": 55600 + }, + { + "epoch": 1.13128662109375, + "grad_norm": 18.507347106933594, + "learning_rate": 8.15729147980014e-06, + "loss": 4.9275, + "step": 55605 + }, + { + "epoch": 1.1313883463541667, + "grad_norm": 13.758672714233398, + "learning_rate": 8.15698155588576e-06, + "loss": 5.152, + "step": 55610 + }, + { + "epoch": 1.1314900716145833, + "grad_norm": 23.76906967163086, + "learning_rate": 8.15667161179927e-06, + "loss": 4.6474, + "step": 55615 + }, + { + "epoch": 1.131591796875, + "grad_norm": 18.459360122680664, + "learning_rate": 8.156361647542652e-06, + "loss": 5.1146, + "step": 55620 + }, + { + "epoch": 1.1316935221354167, + "grad_norm": 18.669130325317383, + "learning_rate": 8.156051663117884e-06, + "loss": 5.0528, + "step": 55625 + }, + { + "epoch": 1.1317952473958333, + "grad_norm": 17.438692092895508, + "learning_rate": 8.155741658526948e-06, + "loss": 5.2671, + "step": 55630 + }, + { + "epoch": 1.13189697265625, + "grad_norm": 16.598079681396484, + "learning_rate": 8.155431633771823e-06, + "loss": 5.1223, + "step": 55635 + }, + { + "epoch": 1.1319986979166667, + "grad_norm": 29.314071655273438, + "learning_rate": 8.155121588854493e-06, + "loss": 4.7795, + "step": 55640 + }, + { + "epoch": 1.1321004231770833, + "grad_norm": 17.898412704467773, + "learning_rate": 8.154811523776938e-06, + "loss": 4.9953, + "step": 55645 + }, + { + "epoch": 1.1322021484375, + "grad_norm": 15.392599105834961, + "learning_rate": 8.154501438541138e-06, + "loss": 5.0283, + "step": 55650 + }, + { + "epoch": 1.1323038736979167, + "grad_norm": 21.876710891723633, + "learning_rate": 8.154191333149076e-06, + "loss": 4.7456, + "step": 55655 + }, + { + "epoch": 1.1324055989583333, + "grad_norm": 20.479713439941406, + "learning_rate": 8.153881207602732e-06, + "loss": 4.7989, + "step": 55660 + }, + { + "epoch": 1.13250732421875, + "grad_norm": 19.424232482910156, + "learning_rate": 8.153571061904088e-06, + "loss": 4.6762, + "step": 55665 + }, + { + "epoch": 1.1326090494791667, + "grad_norm": 15.633679389953613, + "learning_rate": 8.153260896055127e-06, + "loss": 4.882, + "step": 55670 + }, + { + "epoch": 1.1327107747395833, + "grad_norm": 16.562536239624023, + "learning_rate": 8.15295071005783e-06, + "loss": 5.0116, + "step": 55675 + }, + { + "epoch": 1.1328125, + "grad_norm": 17.03070640563965, + "learning_rate": 8.152640503914178e-06, + "loss": 4.9468, + "step": 55680 + }, + { + "epoch": 1.1329142252604167, + "grad_norm": 17.52288055419922, + "learning_rate": 8.152330277626153e-06, + "loss": 5.1868, + "step": 55685 + }, + { + "epoch": 1.1330159505208333, + "grad_norm": 18.506107330322266, + "learning_rate": 8.15202003119574e-06, + "loss": 4.9899, + "step": 55690 + }, + { + "epoch": 1.13311767578125, + "grad_norm": 21.855152130126953, + "learning_rate": 8.151709764624919e-06, + "loss": 4.9854, + "step": 55695 + }, + { + "epoch": 1.1332194010416667, + "grad_norm": 20.471792221069336, + "learning_rate": 8.15139947791567e-06, + "loss": 5.0546, + "step": 55700 + }, + { + "epoch": 1.1333211263020833, + "grad_norm": 14.584654808044434, + "learning_rate": 8.151089171069983e-06, + "loss": 4.8892, + "step": 55705 + }, + { + "epoch": 1.1334228515625, + "grad_norm": 15.959391593933105, + "learning_rate": 8.150778844089838e-06, + "loss": 4.7794, + "step": 55710 + }, + { + "epoch": 1.1335245768229167, + "grad_norm": 18.60394859313965, + "learning_rate": 8.150468496977212e-06, + "loss": 4.8215, + "step": 55715 + }, + { + "epoch": 1.1336263020833333, + "grad_norm": 21.60489273071289, + "learning_rate": 8.150158129734094e-06, + "loss": 5.011, + "step": 55720 + }, + { + "epoch": 1.13372802734375, + "grad_norm": 19.822233200073242, + "learning_rate": 8.149847742362465e-06, + "loss": 5.18, + "step": 55725 + }, + { + "epoch": 1.1338297526041667, + "grad_norm": 23.909008026123047, + "learning_rate": 8.14953733486431e-06, + "loss": 4.9051, + "step": 55730 + }, + { + "epoch": 1.1339314778645833, + "grad_norm": 15.794668197631836, + "learning_rate": 8.14922690724161e-06, + "loss": 4.8239, + "step": 55735 + }, + { + "epoch": 1.134033203125, + "grad_norm": 23.795303344726562, + "learning_rate": 8.148916459496349e-06, + "loss": 5.2206, + "step": 55740 + }, + { + "epoch": 1.1341349283854167, + "grad_norm": 14.192825317382812, + "learning_rate": 8.148605991630513e-06, + "loss": 4.9359, + "step": 55745 + }, + { + "epoch": 1.1342366536458333, + "grad_norm": 15.720088958740234, + "learning_rate": 8.148295503646086e-06, + "loss": 4.7463, + "step": 55750 + }, + { + "epoch": 1.13433837890625, + "grad_norm": 16.33659553527832, + "learning_rate": 8.147984995545047e-06, + "loss": 5.0178, + "step": 55755 + }, + { + "epoch": 1.1344401041666667, + "grad_norm": 13.1063871383667, + "learning_rate": 8.147674467329385e-06, + "loss": 4.6171, + "step": 55760 + }, + { + "epoch": 1.1345418294270833, + "grad_norm": 22.618202209472656, + "learning_rate": 8.147363919001081e-06, + "loss": 5.1694, + "step": 55765 + }, + { + "epoch": 1.1346435546875, + "grad_norm": 21.26824188232422, + "learning_rate": 8.147053350562122e-06, + "loss": 4.8871, + "step": 55770 + }, + { + "epoch": 1.1347452799479167, + "grad_norm": 16.80057716369629, + "learning_rate": 8.14674276201449e-06, + "loss": 5.059, + "step": 55775 + }, + { + "epoch": 1.1348470052083333, + "grad_norm": 17.742839813232422, + "learning_rate": 8.146432153360172e-06, + "loss": 4.9367, + "step": 55780 + }, + { + "epoch": 1.13494873046875, + "grad_norm": 14.966949462890625, + "learning_rate": 8.146121524601149e-06, + "loss": 4.9884, + "step": 55785 + }, + { + "epoch": 1.1350504557291667, + "grad_norm": 15.757560729980469, + "learning_rate": 8.145810875739409e-06, + "loss": 4.6015, + "step": 55790 + }, + { + "epoch": 1.1351521809895833, + "grad_norm": 22.00935935974121, + "learning_rate": 8.145500206776938e-06, + "loss": 5.1557, + "step": 55795 + }, + { + "epoch": 1.13525390625, + "grad_norm": 17.62444305419922, + "learning_rate": 8.145189517715719e-06, + "loss": 4.7999, + "step": 55800 + }, + { + "epoch": 1.1353556315104167, + "grad_norm": 27.29587745666504, + "learning_rate": 8.144878808557735e-06, + "loss": 4.9039, + "step": 55805 + }, + { + "epoch": 1.1354573567708333, + "grad_norm": 14.844914436340332, + "learning_rate": 8.144568079304975e-06, + "loss": 5.2682, + "step": 55810 + }, + { + "epoch": 1.13555908203125, + "grad_norm": 21.86805534362793, + "learning_rate": 8.144257329959423e-06, + "loss": 4.6756, + "step": 55815 + }, + { + "epoch": 1.1356608072916667, + "grad_norm": 19.508058547973633, + "learning_rate": 8.143946560523064e-06, + "loss": 4.8666, + "step": 55820 + }, + { + "epoch": 1.1357625325520833, + "grad_norm": 17.538854598999023, + "learning_rate": 8.143635770997886e-06, + "loss": 4.7664, + "step": 55825 + }, + { + "epoch": 1.1358642578125, + "grad_norm": 15.162161827087402, + "learning_rate": 8.143324961385872e-06, + "loss": 4.7722, + "step": 55830 + }, + { + "epoch": 1.1359659830729167, + "grad_norm": 20.486928939819336, + "learning_rate": 8.143014131689012e-06, + "loss": 5.0198, + "step": 55835 + }, + { + "epoch": 1.1360677083333333, + "grad_norm": 19.586427688598633, + "learning_rate": 8.142703281909288e-06, + "loss": 4.8114, + "step": 55840 + }, + { + "epoch": 1.13616943359375, + "grad_norm": 17.08920669555664, + "learning_rate": 8.142392412048686e-06, + "loss": 4.9253, + "step": 55845 + }, + { + "epoch": 1.1362711588541667, + "grad_norm": 20.487815856933594, + "learning_rate": 8.142081522109195e-06, + "loss": 4.7408, + "step": 55850 + }, + { + "epoch": 1.1363728841145833, + "grad_norm": 20.180503845214844, + "learning_rate": 8.141770612092801e-06, + "loss": 4.7665, + "step": 55855 + }, + { + "epoch": 1.136474609375, + "grad_norm": 16.812816619873047, + "learning_rate": 8.14145968200149e-06, + "loss": 4.9699, + "step": 55860 + }, + { + "epoch": 1.1365763346354167, + "grad_norm": 14.308375358581543, + "learning_rate": 8.14114873183725e-06, + "loss": 5.0422, + "step": 55865 + }, + { + "epoch": 1.1366780598958333, + "grad_norm": 18.657907485961914, + "learning_rate": 8.140837761602064e-06, + "loss": 4.9405, + "step": 55870 + }, + { + "epoch": 1.13677978515625, + "grad_norm": 19.029207229614258, + "learning_rate": 8.140526771297924e-06, + "loss": 5.1787, + "step": 55875 + }, + { + "epoch": 1.1368815104166667, + "grad_norm": 16.440717697143555, + "learning_rate": 8.140215760926813e-06, + "loss": 5.0979, + "step": 55880 + }, + { + "epoch": 1.1369832356770833, + "grad_norm": 44.18733596801758, + "learning_rate": 8.139904730490722e-06, + "loss": 5.2584, + "step": 55885 + }, + { + "epoch": 1.1370849609375, + "grad_norm": 20.266437530517578, + "learning_rate": 8.139593679991635e-06, + "loss": 5.0554, + "step": 55890 + }, + { + "epoch": 1.1371866861979167, + "grad_norm": 22.95728302001953, + "learning_rate": 8.139282609431543e-06, + "loss": 5.1394, + "step": 55895 + }, + { + "epoch": 1.1372884114583333, + "grad_norm": 18.10162925720215, + "learning_rate": 8.138971518812429e-06, + "loss": 4.6994, + "step": 55900 + }, + { + "epoch": 1.13739013671875, + "grad_norm": 24.87864875793457, + "learning_rate": 8.138660408136286e-06, + "loss": 5.0131, + "step": 55905 + }, + { + "epoch": 1.1374918619791667, + "grad_norm": 15.864633560180664, + "learning_rate": 8.138349277405098e-06, + "loss": 5.1135, + "step": 55910 + }, + { + "epoch": 1.1375935872395833, + "grad_norm": 18.59693717956543, + "learning_rate": 8.138038126620856e-06, + "loss": 4.7841, + "step": 55915 + }, + { + "epoch": 1.1376953125, + "grad_norm": 20.399696350097656, + "learning_rate": 8.137726955785543e-06, + "loss": 4.9471, + "step": 55920 + }, + { + "epoch": 1.1377970377604167, + "grad_norm": 17.052719116210938, + "learning_rate": 8.137415764901153e-06, + "loss": 5.0857, + "step": 55925 + }, + { + "epoch": 1.1378987630208333, + "grad_norm": 13.587652206420898, + "learning_rate": 8.137104553969673e-06, + "loss": 4.9515, + "step": 55930 + }, + { + "epoch": 1.13800048828125, + "grad_norm": 21.16567039489746, + "learning_rate": 8.13679332299309e-06, + "loss": 4.8257, + "step": 55935 + }, + { + "epoch": 1.1381022135416667, + "grad_norm": 17.331104278564453, + "learning_rate": 8.136482071973392e-06, + "loss": 4.8663, + "step": 55940 + }, + { + "epoch": 1.1382039388020833, + "grad_norm": 14.512194633483887, + "learning_rate": 8.13617080091257e-06, + "loss": 5.0622, + "step": 55945 + }, + { + "epoch": 1.1383056640625, + "grad_norm": 17.77875518798828, + "learning_rate": 8.135859509812613e-06, + "loss": 4.6347, + "step": 55950 + }, + { + "epoch": 1.1384073893229167, + "grad_norm": 17.71902084350586, + "learning_rate": 8.13554819867551e-06, + "loss": 4.8045, + "step": 55955 + }, + { + "epoch": 1.1385091145833333, + "grad_norm": 16.612449645996094, + "learning_rate": 8.135236867503246e-06, + "loss": 4.9299, + "step": 55960 + }, + { + "epoch": 1.13861083984375, + "grad_norm": 12.808982849121094, + "learning_rate": 8.134925516297817e-06, + "loss": 5.0353, + "step": 55965 + }, + { + "epoch": 1.1387125651041667, + "grad_norm": 17.05945587158203, + "learning_rate": 8.134614145061207e-06, + "loss": 4.6896, + "step": 55970 + }, + { + "epoch": 1.1388142903645833, + "grad_norm": 22.232969284057617, + "learning_rate": 8.13430275379541e-06, + "loss": 4.7781, + "step": 55975 + }, + { + "epoch": 1.138916015625, + "grad_norm": 19.692258834838867, + "learning_rate": 8.133991342502411e-06, + "loss": 4.9874, + "step": 55980 + }, + { + "epoch": 1.1390177408854167, + "grad_norm": 15.265161514282227, + "learning_rate": 8.133679911184204e-06, + "loss": 5.0019, + "step": 55985 + }, + { + "epoch": 1.1391194661458333, + "grad_norm": 19.554141998291016, + "learning_rate": 8.133368459842776e-06, + "loss": 4.9968, + "step": 55990 + }, + { + "epoch": 1.13922119140625, + "grad_norm": 16.49011993408203, + "learning_rate": 8.133056988480119e-06, + "loss": 4.8296, + "step": 55995 + }, + { + "epoch": 1.1393229166666667, + "grad_norm": 15.42139720916748, + "learning_rate": 8.132745497098223e-06, + "loss": 5.066, + "step": 56000 + }, + { + "epoch": 1.1394246419270833, + "grad_norm": 19.511491775512695, + "learning_rate": 8.132433985699075e-06, + "loss": 4.9232, + "step": 56005 + }, + { + "epoch": 1.1395263671875, + "grad_norm": 14.375988006591797, + "learning_rate": 8.13212245428467e-06, + "loss": 4.9143, + "step": 56010 + }, + { + "epoch": 1.1396280924479167, + "grad_norm": 20.189979553222656, + "learning_rate": 8.131810902856997e-06, + "loss": 4.8971, + "step": 56015 + }, + { + "epoch": 1.1397298177083333, + "grad_norm": 22.66452407836914, + "learning_rate": 8.131499331418047e-06, + "loss": 4.9122, + "step": 56020 + }, + { + "epoch": 1.13983154296875, + "grad_norm": 15.435922622680664, + "learning_rate": 8.131187739969808e-06, + "loss": 5.0414, + "step": 56025 + }, + { + "epoch": 1.1399332682291667, + "grad_norm": 13.982890129089355, + "learning_rate": 8.130876128514276e-06, + "loss": 4.9457, + "step": 56030 + }, + { + "epoch": 1.1400349934895833, + "grad_norm": 21.663461685180664, + "learning_rate": 8.130564497053438e-06, + "loss": 4.7222, + "step": 56035 + }, + { + "epoch": 1.14013671875, + "grad_norm": 19.02402687072754, + "learning_rate": 8.130252845589286e-06, + "loss": 4.9096, + "step": 56040 + }, + { + "epoch": 1.1402384440104167, + "grad_norm": 18.650840759277344, + "learning_rate": 8.129941174123815e-06, + "loss": 5.0357, + "step": 56045 + }, + { + "epoch": 1.1403401692708333, + "grad_norm": 20.07752227783203, + "learning_rate": 8.12962948265901e-06, + "loss": 4.9969, + "step": 56050 + }, + { + "epoch": 1.14044189453125, + "grad_norm": 20.244136810302734, + "learning_rate": 8.129317771196868e-06, + "loss": 5.2081, + "step": 56055 + }, + { + "epoch": 1.1405436197916667, + "grad_norm": 22.19261932373047, + "learning_rate": 8.129006039739378e-06, + "loss": 4.9855, + "step": 56060 + }, + { + "epoch": 1.1406453450520833, + "grad_norm": 14.195953369140625, + "learning_rate": 8.128694288288532e-06, + "loss": 4.8083, + "step": 56065 + }, + { + "epoch": 1.1407470703125, + "grad_norm": 17.155475616455078, + "learning_rate": 8.128382516846325e-06, + "loss": 4.6911, + "step": 56070 + }, + { + "epoch": 1.1408487955729167, + "grad_norm": 16.631561279296875, + "learning_rate": 8.128070725414744e-06, + "loss": 4.5804, + "step": 56075 + }, + { + "epoch": 1.1409505208333333, + "grad_norm": 15.607728958129883, + "learning_rate": 8.127758913995786e-06, + "loss": 4.9438, + "step": 56080 + }, + { + "epoch": 1.14105224609375, + "grad_norm": 18.183414459228516, + "learning_rate": 8.127447082591441e-06, + "loss": 4.9602, + "step": 56085 + }, + { + "epoch": 1.1411539713541667, + "grad_norm": 12.356343269348145, + "learning_rate": 8.1271352312037e-06, + "loss": 4.8204, + "step": 56090 + }, + { + "epoch": 1.1412556966145833, + "grad_norm": 16.546337127685547, + "learning_rate": 8.12682335983456e-06, + "loss": 5.3016, + "step": 56095 + }, + { + "epoch": 1.141357421875, + "grad_norm": 19.336336135864258, + "learning_rate": 8.126511468486011e-06, + "loss": 5.1132, + "step": 56100 + }, + { + "epoch": 1.1414591471354167, + "grad_norm": 15.810958862304688, + "learning_rate": 8.126199557160045e-06, + "loss": 4.9753, + "step": 56105 + }, + { + "epoch": 1.1415608723958333, + "grad_norm": 17.353866577148438, + "learning_rate": 8.12588762585866e-06, + "loss": 4.8414, + "step": 56110 + }, + { + "epoch": 1.14166259765625, + "grad_norm": 14.545721054077148, + "learning_rate": 8.125575674583841e-06, + "loss": 5.0867, + "step": 56115 + }, + { + "epoch": 1.1417643229166667, + "grad_norm": 18.3267765045166, + "learning_rate": 8.125263703337588e-06, + "loss": 5.0097, + "step": 56120 + }, + { + "epoch": 1.1418660481770833, + "grad_norm": 17.530202865600586, + "learning_rate": 8.124951712121892e-06, + "loss": 5.0719, + "step": 56125 + }, + { + "epoch": 1.1419677734375, + "grad_norm": 16.75968360900879, + "learning_rate": 8.124639700938743e-06, + "loss": 4.9435, + "step": 56130 + }, + { + "epoch": 1.1420694986979167, + "grad_norm": 19.865339279174805, + "learning_rate": 8.124327669790143e-06, + "loss": 5.1554, + "step": 56135 + }, + { + "epoch": 1.1421712239583333, + "grad_norm": 16.770544052124023, + "learning_rate": 8.124015618678079e-06, + "loss": 5.0439, + "step": 56140 + }, + { + "epoch": 1.14227294921875, + "grad_norm": 20.382049560546875, + "learning_rate": 8.123703547604545e-06, + "loss": 4.944, + "step": 56145 + }, + { + "epoch": 1.1423746744791667, + "grad_norm": 10.997532844543457, + "learning_rate": 8.12339145657154e-06, + "loss": 4.9297, + "step": 56150 + }, + { + "epoch": 1.1424763997395833, + "grad_norm": 21.29022789001465, + "learning_rate": 8.123079345581053e-06, + "loss": 4.9986, + "step": 56155 + }, + { + "epoch": 1.142578125, + "grad_norm": 14.723771095275879, + "learning_rate": 8.122767214635082e-06, + "loss": 5.0348, + "step": 56160 + }, + { + "epoch": 1.1426798502604167, + "grad_norm": 16.134445190429688, + "learning_rate": 8.122455063735617e-06, + "loss": 4.9579, + "step": 56165 + }, + { + "epoch": 1.1427815755208333, + "grad_norm": 14.448782920837402, + "learning_rate": 8.122142892884658e-06, + "loss": 4.7367, + "step": 56170 + }, + { + "epoch": 1.14288330078125, + "grad_norm": 13.633452415466309, + "learning_rate": 8.121830702084196e-06, + "loss": 5.1986, + "step": 56175 + }, + { + "epoch": 1.1429850260416667, + "grad_norm": 13.620585441589355, + "learning_rate": 8.121518491336226e-06, + "loss": 5.0504, + "step": 56180 + }, + { + "epoch": 1.1430867513020833, + "grad_norm": 15.585406303405762, + "learning_rate": 8.121206260642747e-06, + "loss": 5.0227, + "step": 56185 + }, + { + "epoch": 1.1431884765625, + "grad_norm": 20.564937591552734, + "learning_rate": 8.120894010005749e-06, + "loss": 4.9263, + "step": 56190 + }, + { + "epoch": 1.1432902018229167, + "grad_norm": 16.60288429260254, + "learning_rate": 8.120581739427228e-06, + "loss": 4.9791, + "step": 56195 + }, + { + "epoch": 1.1433919270833333, + "grad_norm": 23.022798538208008, + "learning_rate": 8.12026944890918e-06, + "loss": 4.8249, + "step": 56200 + }, + { + "epoch": 1.14349365234375, + "grad_norm": 18.138669967651367, + "learning_rate": 8.119957138453603e-06, + "loss": 4.8178, + "step": 56205 + }, + { + "epoch": 1.1435953776041667, + "grad_norm": 22.5386962890625, + "learning_rate": 8.119644808062487e-06, + "loss": 5.0272, + "step": 56210 + }, + { + "epoch": 1.1436971028645833, + "grad_norm": 20.07242774963379, + "learning_rate": 8.119332457737834e-06, + "loss": 4.74, + "step": 56215 + }, + { + "epoch": 1.143798828125, + "grad_norm": 12.12027645111084, + "learning_rate": 8.119020087481637e-06, + "loss": 4.7239, + "step": 56220 + }, + { + "epoch": 1.1439005533854167, + "grad_norm": 19.233003616333008, + "learning_rate": 8.118707697295889e-06, + "loss": 5.0191, + "step": 56225 + }, + { + "epoch": 1.1440022786458333, + "grad_norm": 26.37506866455078, + "learning_rate": 8.11839528718259e-06, + "loss": 4.9748, + "step": 56230 + }, + { + "epoch": 1.14410400390625, + "grad_norm": 17.452632904052734, + "learning_rate": 8.118082857143736e-06, + "loss": 4.8611, + "step": 56235 + }, + { + "epoch": 1.1442057291666667, + "grad_norm": 18.238117218017578, + "learning_rate": 8.117770407181323e-06, + "loss": 5.1762, + "step": 56240 + }, + { + "epoch": 1.1443074544270833, + "grad_norm": 18.72785758972168, + "learning_rate": 8.117457937297344e-06, + "loss": 5.2484, + "step": 56245 + }, + { + "epoch": 1.1444091796875, + "grad_norm": 13.935351371765137, + "learning_rate": 8.1171454474938e-06, + "loss": 4.6348, + "step": 56250 + }, + { + "epoch": 1.1445109049479167, + "grad_norm": 18.02918815612793, + "learning_rate": 8.116832937772686e-06, + "loss": 4.5829, + "step": 56255 + }, + { + "epoch": 1.1446126302083333, + "grad_norm": 19.77610969543457, + "learning_rate": 8.116520408136e-06, + "loss": 5.0698, + "step": 56260 + }, + { + "epoch": 1.14471435546875, + "grad_norm": 19.5131778717041, + "learning_rate": 8.116207858585736e-06, + "loss": 4.7204, + "step": 56265 + }, + { + "epoch": 1.1448160807291667, + "grad_norm": 17.785985946655273, + "learning_rate": 8.115895289123894e-06, + "loss": 4.8795, + "step": 56270 + }, + { + "epoch": 1.1449178059895833, + "grad_norm": 17.528587341308594, + "learning_rate": 8.11558269975247e-06, + "loss": 4.7689, + "step": 56275 + }, + { + "epoch": 1.14501953125, + "grad_norm": 21.073572158813477, + "learning_rate": 8.115270090473462e-06, + "loss": 4.9518, + "step": 56280 + }, + { + "epoch": 1.1451212565104167, + "grad_norm": 15.599408149719238, + "learning_rate": 8.114957461288866e-06, + "loss": 4.9691, + "step": 56285 + }, + { + "epoch": 1.1452229817708333, + "grad_norm": 17.019359588623047, + "learning_rate": 8.114644812200681e-06, + "loss": 4.8488, + "step": 56290 + }, + { + "epoch": 1.14532470703125, + "grad_norm": 18.1334228515625, + "learning_rate": 8.114332143210905e-06, + "loss": 4.8596, + "step": 56295 + }, + { + "epoch": 1.1454264322916667, + "grad_norm": 16.184659957885742, + "learning_rate": 8.114019454321535e-06, + "loss": 4.9242, + "step": 56300 + }, + { + "epoch": 1.1455281575520833, + "grad_norm": 18.996788024902344, + "learning_rate": 8.11370674553457e-06, + "loss": 4.8925, + "step": 56305 + }, + { + "epoch": 1.1456298828125, + "grad_norm": 22.275306701660156, + "learning_rate": 8.113394016852005e-06, + "loss": 4.8623, + "step": 56310 + }, + { + "epoch": 1.1457316080729167, + "grad_norm": 22.434804916381836, + "learning_rate": 8.113081268275843e-06, + "loss": 5.0021, + "step": 56315 + }, + { + "epoch": 1.1458333333333333, + "grad_norm": 14.810842514038086, + "learning_rate": 8.11276849980808e-06, + "loss": 4.9516, + "step": 56320 + }, + { + "epoch": 1.14593505859375, + "grad_norm": 18.775978088378906, + "learning_rate": 8.112455711450714e-06, + "loss": 5.1366, + "step": 56325 + }, + { + "epoch": 1.1460367838541667, + "grad_norm": 16.75908088684082, + "learning_rate": 8.112142903205743e-06, + "loss": 5.0935, + "step": 56330 + }, + { + "epoch": 1.1461385091145833, + "grad_norm": 14.540868759155273, + "learning_rate": 8.111830075075166e-06, + "loss": 4.8712, + "step": 56335 + }, + { + "epoch": 1.146240234375, + "grad_norm": 24.630645751953125, + "learning_rate": 8.111517227060983e-06, + "loss": 5.0763, + "step": 56340 + }, + { + "epoch": 1.1463419596354167, + "grad_norm": 15.096487045288086, + "learning_rate": 8.111204359165194e-06, + "loss": 5.2092, + "step": 56345 + }, + { + "epoch": 1.1464436848958333, + "grad_norm": 16.78805160522461, + "learning_rate": 8.110891471389798e-06, + "loss": 4.8031, + "step": 56350 + }, + { + "epoch": 1.14654541015625, + "grad_norm": 21.356760025024414, + "learning_rate": 8.11057856373679e-06, + "loss": 4.7006, + "step": 56355 + }, + { + "epoch": 1.1466471354166667, + "grad_norm": 19.057283401489258, + "learning_rate": 8.110265636208173e-06, + "loss": 4.976, + "step": 56360 + }, + { + "epoch": 1.1467488606770833, + "grad_norm": 18.506284713745117, + "learning_rate": 8.109952688805947e-06, + "loss": 5.1049, + "step": 56365 + }, + { + "epoch": 1.1468505859375, + "grad_norm": 19.41403579711914, + "learning_rate": 8.10963972153211e-06, + "loss": 5.0836, + "step": 56370 + }, + { + "epoch": 1.1469523111979167, + "grad_norm": 16.495868682861328, + "learning_rate": 8.109326734388663e-06, + "loss": 4.8575, + "step": 56375 + }, + { + "epoch": 1.1470540364583333, + "grad_norm": 13.561206817626953, + "learning_rate": 8.109013727377604e-06, + "loss": 4.8638, + "step": 56380 + }, + { + "epoch": 1.14715576171875, + "grad_norm": 18.466014862060547, + "learning_rate": 8.108700700500934e-06, + "loss": 4.9111, + "step": 56385 + }, + { + "epoch": 1.1472574869791667, + "grad_norm": 16.20345687866211, + "learning_rate": 8.108387653760655e-06, + "loss": 4.8186, + "step": 56390 + }, + { + "epoch": 1.1473592122395833, + "grad_norm": 22.076574325561523, + "learning_rate": 8.108074587158765e-06, + "loss": 4.922, + "step": 56395 + }, + { + "epoch": 1.1474609375, + "grad_norm": 17.39841079711914, + "learning_rate": 8.107761500697264e-06, + "loss": 4.7517, + "step": 56400 + }, + { + "epoch": 1.1475626627604167, + "grad_norm": 20.61244773864746, + "learning_rate": 8.107448394378155e-06, + "loss": 4.8022, + "step": 56405 + }, + { + "epoch": 1.1476643880208333, + "grad_norm": 19.072120666503906, + "learning_rate": 8.107135268203436e-06, + "loss": 4.8831, + "step": 56410 + }, + { + "epoch": 1.14776611328125, + "grad_norm": 23.876684188842773, + "learning_rate": 8.106822122175111e-06, + "loss": 4.8166, + "step": 56415 + }, + { + "epoch": 1.1478678385416667, + "grad_norm": 18.250598907470703, + "learning_rate": 8.106508956295178e-06, + "loss": 4.9569, + "step": 56420 + }, + { + "epoch": 1.1479695638020833, + "grad_norm": 17.643566131591797, + "learning_rate": 8.106195770565637e-06, + "loss": 4.974, + "step": 56425 + }, + { + "epoch": 1.1480712890625, + "grad_norm": 17.181909561157227, + "learning_rate": 8.105882564988491e-06, + "loss": 4.9983, + "step": 56430 + }, + { + "epoch": 1.1481730143229167, + "grad_norm": 17.264562606811523, + "learning_rate": 8.105569339565745e-06, + "loss": 4.8569, + "step": 56435 + }, + { + "epoch": 1.1482747395833333, + "grad_norm": 22.527795791625977, + "learning_rate": 8.105256094299393e-06, + "loss": 5.0719, + "step": 56440 + }, + { + "epoch": 1.14837646484375, + "grad_norm": 28.38457679748535, + "learning_rate": 8.104942829191443e-06, + "loss": 4.7864, + "step": 56445 + }, + { + "epoch": 1.1484781901041667, + "grad_norm": 18.4913272857666, + "learning_rate": 8.104629544243891e-06, + "loss": 4.7298, + "step": 56450 + }, + { + "epoch": 1.1485799153645833, + "grad_norm": 15.082559585571289, + "learning_rate": 8.104316239458745e-06, + "loss": 4.5162, + "step": 56455 + }, + { + "epoch": 1.148681640625, + "grad_norm": 17.374114990234375, + "learning_rate": 8.104002914838004e-06, + "loss": 4.9492, + "step": 56460 + }, + { + "epoch": 1.1487833658854167, + "grad_norm": 15.180771827697754, + "learning_rate": 8.103689570383665e-06, + "loss": 4.7103, + "step": 56465 + }, + { + "epoch": 1.1488850911458333, + "grad_norm": 17.839927673339844, + "learning_rate": 8.10337620609774e-06, + "loss": 4.8212, + "step": 56470 + }, + { + "epoch": 1.14898681640625, + "grad_norm": 16.077667236328125, + "learning_rate": 8.103062821982222e-06, + "loss": 4.8814, + "step": 56475 + }, + { + "epoch": 1.1490885416666667, + "grad_norm": 20.05994415283203, + "learning_rate": 8.10274941803912e-06, + "loss": 5.1365, + "step": 56480 + }, + { + "epoch": 1.1491902669270833, + "grad_norm": 16.17608642578125, + "learning_rate": 8.102435994270434e-06, + "loss": 4.9507, + "step": 56485 + }, + { + "epoch": 1.1492919921875, + "grad_norm": 24.22762107849121, + "learning_rate": 8.102122550678167e-06, + "loss": 5.371, + "step": 56490 + }, + { + "epoch": 1.1493937174479167, + "grad_norm": 19.07741928100586, + "learning_rate": 8.101809087264321e-06, + "loss": 4.7521, + "step": 56495 + }, + { + "epoch": 1.1494954427083333, + "grad_norm": 19.936283111572266, + "learning_rate": 8.1014956040309e-06, + "loss": 4.8441, + "step": 56500 + }, + { + "epoch": 1.14959716796875, + "grad_norm": 14.416830062866211, + "learning_rate": 8.101182100979907e-06, + "loss": 4.9182, + "step": 56505 + }, + { + "epoch": 1.1496988932291667, + "grad_norm": 16.528356552124023, + "learning_rate": 8.100868578113344e-06, + "loss": 4.9567, + "step": 56510 + }, + { + "epoch": 1.1498006184895833, + "grad_norm": 19.545053482055664, + "learning_rate": 8.100555035433215e-06, + "loss": 4.9371, + "step": 56515 + }, + { + "epoch": 1.14990234375, + "grad_norm": 18.043737411499023, + "learning_rate": 8.100241472941525e-06, + "loss": 5.061, + "step": 56520 + }, + { + "epoch": 1.1500040690104167, + "grad_norm": 15.577412605285645, + "learning_rate": 8.099927890640276e-06, + "loss": 4.8541, + "step": 56525 + }, + { + "epoch": 1.1501057942708333, + "grad_norm": 21.531904220581055, + "learning_rate": 8.09961428853147e-06, + "loss": 5.0215, + "step": 56530 + }, + { + "epoch": 1.15020751953125, + "grad_norm": 17.37523078918457, + "learning_rate": 8.099300666617114e-06, + "loss": 5.06, + "step": 56535 + }, + { + "epoch": 1.1503092447916667, + "grad_norm": 15.431808471679688, + "learning_rate": 8.09898702489921e-06, + "loss": 4.9295, + "step": 56540 + }, + { + "epoch": 1.1504109700520833, + "grad_norm": 20.63079833984375, + "learning_rate": 8.098673363379764e-06, + "loss": 5.1149, + "step": 56545 + }, + { + "epoch": 1.1505126953125, + "grad_norm": 18.485658645629883, + "learning_rate": 8.098359682060777e-06, + "loss": 5.0448, + "step": 56550 + }, + { + "epoch": 1.1506144205729167, + "grad_norm": 17.508562088012695, + "learning_rate": 8.098045980944256e-06, + "loss": 4.773, + "step": 56555 + }, + { + "epoch": 1.1507161458333333, + "grad_norm": 22.364879608154297, + "learning_rate": 8.097732260032204e-06, + "loss": 4.9854, + "step": 56560 + }, + { + "epoch": 1.15081787109375, + "grad_norm": 18.335418701171875, + "learning_rate": 8.097418519326627e-06, + "loss": 4.9492, + "step": 56565 + }, + { + "epoch": 1.1509195963541667, + "grad_norm": 15.514617919921875, + "learning_rate": 8.097104758829528e-06, + "loss": 4.8887, + "step": 56570 + }, + { + "epoch": 1.1510213216145833, + "grad_norm": 20.10460090637207, + "learning_rate": 8.096790978542914e-06, + "loss": 5.154, + "step": 56575 + }, + { + "epoch": 1.151123046875, + "grad_norm": 18.980287551879883, + "learning_rate": 8.096477178468788e-06, + "loss": 4.6998, + "step": 56580 + }, + { + "epoch": 1.1512247721354167, + "grad_norm": 19.993820190429688, + "learning_rate": 8.096163358609156e-06, + "loss": 5.071, + "step": 56585 + }, + { + "epoch": 1.1513264973958333, + "grad_norm": 19.04774284362793, + "learning_rate": 8.095849518966023e-06, + "loss": 5.4523, + "step": 56590 + }, + { + "epoch": 1.15142822265625, + "grad_norm": 17.899303436279297, + "learning_rate": 8.095535659541394e-06, + "loss": 4.947, + "step": 56595 + }, + { + "epoch": 1.1515299479166667, + "grad_norm": 14.620561599731445, + "learning_rate": 8.095221780337274e-06, + "loss": 5.0107, + "step": 56600 + }, + { + "epoch": 1.1516316731770833, + "grad_norm": 21.678829193115234, + "learning_rate": 8.094907881355671e-06, + "loss": 4.8927, + "step": 56605 + }, + { + "epoch": 1.1517333984375, + "grad_norm": 14.031059265136719, + "learning_rate": 8.09459396259859e-06, + "loss": 5.0396, + "step": 56610 + }, + { + "epoch": 1.1518351236979167, + "grad_norm": 15.68561840057373, + "learning_rate": 8.094280024068033e-06, + "loss": 4.985, + "step": 56615 + }, + { + "epoch": 1.1519368489583333, + "grad_norm": 16.923091888427734, + "learning_rate": 8.09396606576601e-06, + "loss": 4.8646, + "step": 56620 + }, + { + "epoch": 1.15203857421875, + "grad_norm": 22.91004180908203, + "learning_rate": 8.093652087694528e-06, + "loss": 5.015, + "step": 56625 + }, + { + "epoch": 1.1521402994791667, + "grad_norm": 15.175999641418457, + "learning_rate": 8.09333808985559e-06, + "loss": 4.8, + "step": 56630 + }, + { + "epoch": 1.1522420247395833, + "grad_norm": 15.511096954345703, + "learning_rate": 8.093024072251202e-06, + "loss": 4.8782, + "step": 56635 + }, + { + "epoch": 1.15234375, + "grad_norm": 23.482799530029297, + "learning_rate": 8.092710034883373e-06, + "loss": 5.0347, + "step": 56640 + }, + { + "epoch": 1.1524454752604167, + "grad_norm": 17.828500747680664, + "learning_rate": 8.092395977754109e-06, + "loss": 4.8808, + "step": 56645 + }, + { + "epoch": 1.1525472005208333, + "grad_norm": 29.005395889282227, + "learning_rate": 8.092081900865416e-06, + "loss": 4.8195, + "step": 56650 + }, + { + "epoch": 1.15264892578125, + "grad_norm": 22.75422477722168, + "learning_rate": 8.091767804219302e-06, + "loss": 5.0715, + "step": 56655 + }, + { + "epoch": 1.1527506510416667, + "grad_norm": 19.515003204345703, + "learning_rate": 8.091453687817772e-06, + "loss": 5.2507, + "step": 56660 + }, + { + "epoch": 1.1528523763020833, + "grad_norm": 12.954317092895508, + "learning_rate": 8.091139551662834e-06, + "loss": 4.9023, + "step": 56665 + }, + { + "epoch": 1.1529541015625, + "grad_norm": 22.131059646606445, + "learning_rate": 8.090825395756496e-06, + "loss": 5.0059, + "step": 56670 + }, + { + "epoch": 1.1530558268229167, + "grad_norm": 21.39564323425293, + "learning_rate": 8.090511220100766e-06, + "loss": 4.9277, + "step": 56675 + }, + { + "epoch": 1.1531575520833333, + "grad_norm": 16.7652645111084, + "learning_rate": 8.09019702469765e-06, + "loss": 5.1231, + "step": 56680 + }, + { + "epoch": 1.15325927734375, + "grad_norm": 27.72130584716797, + "learning_rate": 8.089882809549155e-06, + "loss": 5.2973, + "step": 56685 + }, + { + "epoch": 1.1533610026041667, + "grad_norm": 17.385578155517578, + "learning_rate": 8.089568574657291e-06, + "loss": 5.0848, + "step": 56690 + }, + { + "epoch": 1.1534627278645833, + "grad_norm": 17.65016746520996, + "learning_rate": 8.089254320024063e-06, + "loss": 4.8006, + "step": 56695 + }, + { + "epoch": 1.153564453125, + "grad_norm": 16.513933181762695, + "learning_rate": 8.088940045651482e-06, + "loss": 5.1044, + "step": 56700 + }, + { + "epoch": 1.1536661783854167, + "grad_norm": 21.078025817871094, + "learning_rate": 8.088625751541554e-06, + "loss": 5.0148, + "step": 56705 + }, + { + "epoch": 1.1537679036458333, + "grad_norm": 21.431615829467773, + "learning_rate": 8.088311437696288e-06, + "loss": 5.2032, + "step": 56710 + }, + { + "epoch": 1.15386962890625, + "grad_norm": 16.053524017333984, + "learning_rate": 8.087997104117693e-06, + "loss": 5.141, + "step": 56715 + }, + { + "epoch": 1.1539713541666667, + "grad_norm": 16.619115829467773, + "learning_rate": 8.087682750807776e-06, + "loss": 4.9771, + "step": 56720 + }, + { + "epoch": 1.1540730794270833, + "grad_norm": 19.489192962646484, + "learning_rate": 8.087368377768545e-06, + "loss": 4.7807, + "step": 56725 + }, + { + "epoch": 1.1541748046875, + "grad_norm": 13.049065589904785, + "learning_rate": 8.087053985002013e-06, + "loss": 5.0178, + "step": 56730 + }, + { + "epoch": 1.1542765299479167, + "grad_norm": 18.248050689697266, + "learning_rate": 8.086739572510184e-06, + "loss": 4.9997, + "step": 56735 + }, + { + "epoch": 1.1543782552083333, + "grad_norm": 15.299439430236816, + "learning_rate": 8.08642514029507e-06, + "loss": 4.9887, + "step": 56740 + }, + { + "epoch": 1.15447998046875, + "grad_norm": 28.433332443237305, + "learning_rate": 8.086110688358678e-06, + "loss": 5.0037, + "step": 56745 + }, + { + "epoch": 1.1545817057291667, + "grad_norm": 14.832365036010742, + "learning_rate": 8.085796216703019e-06, + "loss": 4.8434, + "step": 56750 + }, + { + "epoch": 1.1546834309895833, + "grad_norm": 15.084193229675293, + "learning_rate": 8.085481725330102e-06, + "loss": 4.8146, + "step": 56755 + }, + { + "epoch": 1.15478515625, + "grad_norm": 22.170015335083008, + "learning_rate": 8.085167214241935e-06, + "loss": 5.0601, + "step": 56760 + }, + { + "epoch": 1.1548868815104167, + "grad_norm": 16.738658905029297, + "learning_rate": 8.084852683440532e-06, + "loss": 4.7199, + "step": 56765 + }, + { + "epoch": 1.1549886067708333, + "grad_norm": 23.000253677368164, + "learning_rate": 8.084538132927897e-06, + "loss": 5.0543, + "step": 56770 + }, + { + "epoch": 1.15509033203125, + "grad_norm": 22.052099227905273, + "learning_rate": 8.084223562706041e-06, + "loss": 5.229, + "step": 56775 + }, + { + "epoch": 1.1551920572916667, + "grad_norm": 19.8981990814209, + "learning_rate": 8.08390897277698e-06, + "loss": 4.8633, + "step": 56780 + }, + { + "epoch": 1.1552937825520833, + "grad_norm": 15.77259349822998, + "learning_rate": 8.083594363142717e-06, + "loss": 4.9183, + "step": 56785 + }, + { + "epoch": 1.1553955078125, + "grad_norm": 14.7151460647583, + "learning_rate": 8.083279733805264e-06, + "loss": 4.8308, + "step": 56790 + }, + { + "epoch": 1.1554972330729167, + "grad_norm": 17.75267219543457, + "learning_rate": 8.082965084766634e-06, + "loss": 4.9064, + "step": 56795 + }, + { + "epoch": 1.1555989583333333, + "grad_norm": 18.838720321655273, + "learning_rate": 8.082650416028834e-06, + "loss": 4.8355, + "step": 56800 + }, + { + "epoch": 1.15570068359375, + "grad_norm": 18.482587814331055, + "learning_rate": 8.082335727593878e-06, + "loss": 4.7444, + "step": 56805 + }, + { + "epoch": 1.1558024088541667, + "grad_norm": 16.947519302368164, + "learning_rate": 8.082021019463774e-06, + "loss": 4.9361, + "step": 56810 + }, + { + "epoch": 1.1559041341145833, + "grad_norm": 17.634370803833008, + "learning_rate": 8.081706291640536e-06, + "loss": 5.065, + "step": 56815 + }, + { + "epoch": 1.156005859375, + "grad_norm": 16.44050407409668, + "learning_rate": 8.081391544126172e-06, + "loss": 4.874, + "step": 56820 + }, + { + "epoch": 1.1561075846354167, + "grad_norm": 16.863840103149414, + "learning_rate": 8.081076776922693e-06, + "loss": 4.9599, + "step": 56825 + }, + { + "epoch": 1.1562093098958333, + "grad_norm": 16.877992630004883, + "learning_rate": 8.080761990032113e-06, + "loss": 4.7893, + "step": 56830 + }, + { + "epoch": 1.15631103515625, + "grad_norm": 16.589075088500977, + "learning_rate": 8.080447183456442e-06, + "loss": 5.0003, + "step": 56835 + }, + { + "epoch": 1.1564127604166667, + "grad_norm": 26.585309982299805, + "learning_rate": 8.080132357197688e-06, + "loss": 4.9056, + "step": 56840 + }, + { + "epoch": 1.1565144856770833, + "grad_norm": 21.217388153076172, + "learning_rate": 8.07981751125787e-06, + "loss": 5.0004, + "step": 56845 + }, + { + "epoch": 1.1566162109375, + "grad_norm": 18.96023178100586, + "learning_rate": 8.079502645638995e-06, + "loss": 5.3406, + "step": 56850 + }, + { + "epoch": 1.1567179361979167, + "grad_norm": 21.336013793945312, + "learning_rate": 8.079187760343074e-06, + "loss": 4.9704, + "step": 56855 + }, + { + "epoch": 1.1568196614583333, + "grad_norm": 19.03418731689453, + "learning_rate": 8.078872855372122e-06, + "loss": 5.005, + "step": 56860 + }, + { + "epoch": 1.15692138671875, + "grad_norm": 23.168081283569336, + "learning_rate": 8.078557930728147e-06, + "loss": 4.9278, + "step": 56865 + }, + { + "epoch": 1.1570231119791667, + "grad_norm": 21.251874923706055, + "learning_rate": 8.078242986413169e-06, + "loss": 4.6642, + "step": 56870 + }, + { + "epoch": 1.1571248372395833, + "grad_norm": 16.568819046020508, + "learning_rate": 8.077928022429193e-06, + "loss": 4.8837, + "step": 56875 + }, + { + "epoch": 1.1572265625, + "grad_norm": 13.315433502197266, + "learning_rate": 8.077613038778233e-06, + "loss": 5.2095, + "step": 56880 + }, + { + "epoch": 1.1573282877604167, + "grad_norm": 18.04656219482422, + "learning_rate": 8.077298035462304e-06, + "loss": 4.7942, + "step": 56885 + }, + { + "epoch": 1.1574300130208333, + "grad_norm": 14.06417179107666, + "learning_rate": 8.076983012483417e-06, + "loss": 5.0656, + "step": 56890 + }, + { + "epoch": 1.15753173828125, + "grad_norm": 16.091707229614258, + "learning_rate": 8.076667969843585e-06, + "loss": 4.894, + "step": 56895 + }, + { + "epoch": 1.1576334635416667, + "grad_norm": 16.14795684814453, + "learning_rate": 8.076352907544823e-06, + "loss": 5.0273, + "step": 56900 + }, + { + "epoch": 1.1577351888020833, + "grad_norm": 16.67700958251953, + "learning_rate": 8.07603782558914e-06, + "loss": 4.783, + "step": 56905 + }, + { + "epoch": 1.1578369140625, + "grad_norm": 19.14459991455078, + "learning_rate": 8.075722723978553e-06, + "loss": 4.9997, + "step": 56910 + }, + { + "epoch": 1.1579386393229167, + "grad_norm": 18.943796157836914, + "learning_rate": 8.075407602715075e-06, + "loss": 4.8787, + "step": 56915 + }, + { + "epoch": 1.1580403645833333, + "grad_norm": 20.325437545776367, + "learning_rate": 8.075092461800718e-06, + "loss": 4.9554, + "step": 56920 + }, + { + "epoch": 1.15814208984375, + "grad_norm": 19.3474063873291, + "learning_rate": 8.074777301237498e-06, + "loss": 4.997, + "step": 56925 + }, + { + "epoch": 1.1582438151041667, + "grad_norm": 18.08451271057129, + "learning_rate": 8.074462121027425e-06, + "loss": 5.1653, + "step": 56930 + }, + { + "epoch": 1.1583455403645833, + "grad_norm": 16.723302841186523, + "learning_rate": 8.074146921172515e-06, + "loss": 4.8903, + "step": 56935 + }, + { + "epoch": 1.158447265625, + "grad_norm": 19.289762496948242, + "learning_rate": 8.07383170167478e-06, + "loss": 4.8216, + "step": 56940 + }, + { + "epoch": 1.1585489908854167, + "grad_norm": 18.236148834228516, + "learning_rate": 8.073516462536241e-06, + "loss": 4.9706, + "step": 56945 + }, + { + "epoch": 1.1586507161458333, + "grad_norm": 18.189414978027344, + "learning_rate": 8.073201203758905e-06, + "loss": 4.9298, + "step": 56950 + }, + { + "epoch": 1.15875244140625, + "grad_norm": 13.051789283752441, + "learning_rate": 8.072885925344788e-06, + "loss": 5.1549, + "step": 56955 + }, + { + "epoch": 1.1588541666666667, + "grad_norm": 16.280826568603516, + "learning_rate": 8.072570627295907e-06, + "loss": 5.0323, + "step": 56960 + }, + { + "epoch": 1.1589558919270833, + "grad_norm": 16.33539390563965, + "learning_rate": 8.072255309614274e-06, + "loss": 4.9411, + "step": 56965 + }, + { + "epoch": 1.1590576171875, + "grad_norm": 16.452505111694336, + "learning_rate": 8.071939972301905e-06, + "loss": 4.7413, + "step": 56970 + }, + { + "epoch": 1.1591593424479167, + "grad_norm": 22.52553939819336, + "learning_rate": 8.071624615360817e-06, + "loss": 5.0055, + "step": 56975 + }, + { + "epoch": 1.1592610677083333, + "grad_norm": 21.61675262451172, + "learning_rate": 8.07130923879302e-06, + "loss": 4.8876, + "step": 56980 + }, + { + "epoch": 1.15936279296875, + "grad_norm": 17.13001823425293, + "learning_rate": 8.070993842600533e-06, + "loss": 4.8419, + "step": 56985 + }, + { + "epoch": 1.1594645182291667, + "grad_norm": 14.967058181762695, + "learning_rate": 8.070678426785369e-06, + "loss": 5.0599, + "step": 56990 + }, + { + "epoch": 1.1595662434895833, + "grad_norm": 17.680419921875, + "learning_rate": 8.070362991349546e-06, + "loss": 5.0595, + "step": 56995 + }, + { + "epoch": 1.15966796875, + "grad_norm": 15.288651466369629, + "learning_rate": 8.070047536295078e-06, + "loss": 5.2388, + "step": 57000 + }, + { + "epoch": 1.1597696940104167, + "grad_norm": 20.28746795654297, + "learning_rate": 8.06973206162398e-06, + "loss": 5.0432, + "step": 57005 + }, + { + "epoch": 1.1598714192708333, + "grad_norm": 19.056198120117188, + "learning_rate": 8.069416567338268e-06, + "loss": 4.9977, + "step": 57010 + }, + { + "epoch": 1.15997314453125, + "grad_norm": 13.695535659790039, + "learning_rate": 8.06910105343996e-06, + "loss": 5.087, + "step": 57015 + }, + { + "epoch": 1.1600748697916667, + "grad_norm": 15.578597068786621, + "learning_rate": 8.06878551993107e-06, + "loss": 5.0011, + "step": 57020 + }, + { + "epoch": 1.1601765950520833, + "grad_norm": 17.02947425842285, + "learning_rate": 8.068469966813615e-06, + "loss": 5.1896, + "step": 57025 + }, + { + "epoch": 1.1602783203125, + "grad_norm": 17.77238655090332, + "learning_rate": 8.068154394089611e-06, + "loss": 5.367, + "step": 57030 + }, + { + "epoch": 1.1603800455729167, + "grad_norm": 20.666074752807617, + "learning_rate": 8.067838801761072e-06, + "loss": 4.9262, + "step": 57035 + }, + { + "epoch": 1.1604817708333333, + "grad_norm": 17.759315490722656, + "learning_rate": 8.06752318983002e-06, + "loss": 4.8799, + "step": 57040 + }, + { + "epoch": 1.16058349609375, + "grad_norm": 19.98250961303711, + "learning_rate": 8.067207558298467e-06, + "loss": 5.4867, + "step": 57045 + }, + { + "epoch": 1.1606852213541667, + "grad_norm": 14.857451438903809, + "learning_rate": 8.066891907168432e-06, + "loss": 4.7378, + "step": 57050 + }, + { + "epoch": 1.1607869466145833, + "grad_norm": 19.175827026367188, + "learning_rate": 8.066576236441932e-06, + "loss": 5.2174, + "step": 57055 + }, + { + "epoch": 1.160888671875, + "grad_norm": 18.042194366455078, + "learning_rate": 8.066260546120983e-06, + "loss": 4.7941, + "step": 57060 + }, + { + "epoch": 1.1609903971354167, + "grad_norm": 14.608259201049805, + "learning_rate": 8.065944836207602e-06, + "loss": 4.8248, + "step": 57065 + }, + { + "epoch": 1.1610921223958333, + "grad_norm": 17.540359497070312, + "learning_rate": 8.065629106703807e-06, + "loss": 4.7155, + "step": 57070 + }, + { + "epoch": 1.16119384765625, + "grad_norm": 16.18647003173828, + "learning_rate": 8.065313357611614e-06, + "loss": 5.0079, + "step": 57075 + }, + { + "epoch": 1.1612955729166667, + "grad_norm": 16.55143165588379, + "learning_rate": 8.064997588933043e-06, + "loss": 4.903, + "step": 57080 + }, + { + "epoch": 1.1613972981770833, + "grad_norm": 18.683929443359375, + "learning_rate": 8.064681800670112e-06, + "loss": 5.131, + "step": 57085 + }, + { + "epoch": 1.1614990234375, + "grad_norm": 16.038509368896484, + "learning_rate": 8.064365992824834e-06, + "loss": 5.1019, + "step": 57090 + }, + { + "epoch": 1.1616007486979167, + "grad_norm": 21.363271713256836, + "learning_rate": 8.06405016539923e-06, + "loss": 4.875, + "step": 57095 + }, + { + "epoch": 1.1617024739583333, + "grad_norm": 16.249624252319336, + "learning_rate": 8.06373431839532e-06, + "loss": 4.7711, + "step": 57100 + }, + { + "epoch": 1.16180419921875, + "grad_norm": 18.273868560791016, + "learning_rate": 8.06341845181512e-06, + "loss": 5.1169, + "step": 57105 + }, + { + "epoch": 1.1619059244791667, + "grad_norm": 20.859582901000977, + "learning_rate": 8.063102565660648e-06, + "loss": 5.0104, + "step": 57110 + }, + { + "epoch": 1.1620076497395833, + "grad_norm": 28.558382034301758, + "learning_rate": 8.062786659933923e-06, + "loss": 4.9584, + "step": 57115 + }, + { + "epoch": 1.162109375, + "grad_norm": 16.236656188964844, + "learning_rate": 8.062470734636965e-06, + "loss": 5.1416, + "step": 57120 + }, + { + "epoch": 1.1622111002604167, + "grad_norm": 21.959325790405273, + "learning_rate": 8.062154789771788e-06, + "loss": 5.2107, + "step": 57125 + }, + { + "epoch": 1.1623128255208333, + "grad_norm": 16.172931671142578, + "learning_rate": 8.061838825340417e-06, + "loss": 5.0863, + "step": 57130 + }, + { + "epoch": 1.16241455078125, + "grad_norm": 14.694379806518555, + "learning_rate": 8.061522841344866e-06, + "loss": 5.0757, + "step": 57135 + }, + { + "epoch": 1.1625162760416667, + "grad_norm": 20.898025512695312, + "learning_rate": 8.061206837787157e-06, + "loss": 4.941, + "step": 57140 + }, + { + "epoch": 1.1626180013020833, + "grad_norm": 17.42929458618164, + "learning_rate": 8.060890814669308e-06, + "loss": 4.7358, + "step": 57145 + }, + { + "epoch": 1.1627197265625, + "grad_norm": 18.99582290649414, + "learning_rate": 8.060574771993337e-06, + "loss": 5.1134, + "step": 57150 + }, + { + "epoch": 1.1628214518229167, + "grad_norm": 17.470731735229492, + "learning_rate": 8.060258709761266e-06, + "loss": 5.0775, + "step": 57155 + }, + { + "epoch": 1.1629231770833333, + "grad_norm": 18.97799301147461, + "learning_rate": 8.059942627975113e-06, + "loss": 5.199, + "step": 57160 + }, + { + "epoch": 1.16302490234375, + "grad_norm": 16.750017166137695, + "learning_rate": 8.059626526636898e-06, + "loss": 4.8299, + "step": 57165 + }, + { + "epoch": 1.1631266276041667, + "grad_norm": 17.946834564208984, + "learning_rate": 8.059310405748641e-06, + "loss": 5.0035, + "step": 57170 + }, + { + "epoch": 1.1632283528645833, + "grad_norm": 16.96571922302246, + "learning_rate": 8.058994265312361e-06, + "loss": 5.0093, + "step": 57175 + }, + { + "epoch": 1.163330078125, + "grad_norm": 20.650999069213867, + "learning_rate": 8.05867810533008e-06, + "loss": 4.8061, + "step": 57180 + }, + { + "epoch": 1.1634318033854167, + "grad_norm": 19.018674850463867, + "learning_rate": 8.058361925803815e-06, + "loss": 4.9992, + "step": 57185 + }, + { + "epoch": 1.1635335286458333, + "grad_norm": 17.238954544067383, + "learning_rate": 8.05804572673559e-06, + "loss": 5.1553, + "step": 57190 + }, + { + "epoch": 1.16363525390625, + "grad_norm": 20.81892204284668, + "learning_rate": 8.057729508127422e-06, + "loss": 4.86, + "step": 57195 + }, + { + "epoch": 1.1637369791666667, + "grad_norm": 16.44945526123047, + "learning_rate": 8.057413269981334e-06, + "loss": 4.9675, + "step": 57200 + }, + { + "epoch": 1.1638387044270833, + "grad_norm": 15.878952980041504, + "learning_rate": 8.057097012299345e-06, + "loss": 4.8695, + "step": 57205 + }, + { + "epoch": 1.1639404296875, + "grad_norm": 15.805420875549316, + "learning_rate": 8.056780735083478e-06, + "loss": 4.9945, + "step": 57210 + }, + { + "epoch": 1.1640421549479167, + "grad_norm": 19.853534698486328, + "learning_rate": 8.056464438335752e-06, + "loss": 4.8317, + "step": 57215 + }, + { + "epoch": 1.1641438802083333, + "grad_norm": 17.290761947631836, + "learning_rate": 8.056148122058187e-06, + "loss": 4.8701, + "step": 57220 + }, + { + "epoch": 1.16424560546875, + "grad_norm": 14.945313453674316, + "learning_rate": 8.055831786252807e-06, + "loss": 5.0793, + "step": 57225 + }, + { + "epoch": 1.1643473307291667, + "grad_norm": 15.624310493469238, + "learning_rate": 8.05551543092163e-06, + "loss": 5.0733, + "step": 57230 + }, + { + "epoch": 1.1644490559895833, + "grad_norm": 14.996241569519043, + "learning_rate": 8.055199056066682e-06, + "loss": 5.1141, + "step": 57235 + }, + { + "epoch": 1.16455078125, + "grad_norm": 15.664009094238281, + "learning_rate": 8.05488266168998e-06, + "loss": 4.8038, + "step": 57240 + }, + { + "epoch": 1.1646525065104167, + "grad_norm": 19.812271118164062, + "learning_rate": 8.054566247793549e-06, + "loss": 4.8684, + "step": 57245 + }, + { + "epoch": 1.1647542317708333, + "grad_norm": 15.229586601257324, + "learning_rate": 8.05424981437941e-06, + "loss": 4.9631, + "step": 57250 + }, + { + "epoch": 1.16485595703125, + "grad_norm": 15.578749656677246, + "learning_rate": 8.053933361449582e-06, + "loss": 4.8702, + "step": 57255 + }, + { + "epoch": 1.1649576822916667, + "grad_norm": 22.53860092163086, + "learning_rate": 8.05361688900609e-06, + "loss": 4.7441, + "step": 57260 + }, + { + "epoch": 1.1650594075520833, + "grad_norm": 14.7344970703125, + "learning_rate": 8.053300397050956e-06, + "loss": 4.7709, + "step": 57265 + }, + { + "epoch": 1.1651611328125, + "grad_norm": 21.29250717163086, + "learning_rate": 8.052983885586202e-06, + "loss": 5.0619, + "step": 57270 + }, + { + "epoch": 1.1652628580729167, + "grad_norm": 19.429105758666992, + "learning_rate": 8.05266735461385e-06, + "loss": 4.8274, + "step": 57275 + }, + { + "epoch": 1.1653645833333333, + "grad_norm": 20.60519027709961, + "learning_rate": 8.05235080413592e-06, + "loss": 4.7118, + "step": 57280 + }, + { + "epoch": 1.16546630859375, + "grad_norm": 15.86143970489502, + "learning_rate": 8.05203423415444e-06, + "loss": 4.9811, + "step": 57285 + }, + { + "epoch": 1.1655680338541667, + "grad_norm": 18.63205337524414, + "learning_rate": 8.05171764467143e-06, + "loss": 4.9906, + "step": 57290 + }, + { + "epoch": 1.1656697591145833, + "grad_norm": 13.655732154846191, + "learning_rate": 8.051401035688913e-06, + "loss": 4.7892, + "step": 57295 + }, + { + "epoch": 1.165771484375, + "grad_norm": 20.56405258178711, + "learning_rate": 8.051084407208912e-06, + "loss": 4.8698, + "step": 57300 + }, + { + "epoch": 1.1658732096354167, + "grad_norm": 17.07036781311035, + "learning_rate": 8.05076775923345e-06, + "loss": 5.0608, + "step": 57305 + }, + { + "epoch": 1.1659749348958333, + "grad_norm": 16.578847885131836, + "learning_rate": 8.050451091764549e-06, + "loss": 4.9924, + "step": 57310 + }, + { + "epoch": 1.16607666015625, + "grad_norm": 16.105703353881836, + "learning_rate": 8.050134404804236e-06, + "loss": 5.0431, + "step": 57315 + }, + { + "epoch": 1.1661783854166667, + "grad_norm": 18.43210220336914, + "learning_rate": 8.04981769835453e-06, + "loss": 4.9105, + "step": 57320 + }, + { + "epoch": 1.1662801106770833, + "grad_norm": 20.495468139648438, + "learning_rate": 8.049500972417459e-06, + "loss": 4.8534, + "step": 57325 + }, + { + "epoch": 1.1663818359375, + "grad_norm": 17.824542999267578, + "learning_rate": 8.049184226995044e-06, + "loss": 5.0341, + "step": 57330 + }, + { + "epoch": 1.1664835611979167, + "grad_norm": 20.588579177856445, + "learning_rate": 8.048867462089309e-06, + "loss": 4.7425, + "step": 57335 + }, + { + "epoch": 1.1665852864583333, + "grad_norm": 18.366600036621094, + "learning_rate": 8.048550677702282e-06, + "loss": 4.8914, + "step": 57340 + }, + { + "epoch": 1.16668701171875, + "grad_norm": 17.876720428466797, + "learning_rate": 8.04823387383598e-06, + "loss": 4.9823, + "step": 57345 + }, + { + "epoch": 1.1667887369791667, + "grad_norm": 16.061092376708984, + "learning_rate": 8.047917050492433e-06, + "loss": 5.3573, + "step": 57350 + }, + { + "epoch": 1.1668904622395833, + "grad_norm": 14.701366424560547, + "learning_rate": 8.047600207673663e-06, + "loss": 4.9273, + "step": 57355 + }, + { + "epoch": 1.1669921875, + "grad_norm": 18.329607009887695, + "learning_rate": 8.047283345381696e-06, + "loss": 4.9316, + "step": 57360 + }, + { + "epoch": 1.1670939127604167, + "grad_norm": 18.018213272094727, + "learning_rate": 8.046966463618555e-06, + "loss": 4.8011, + "step": 57365 + }, + { + "epoch": 1.1671956380208333, + "grad_norm": 23.456954956054688, + "learning_rate": 8.046649562386266e-06, + "loss": 5.0462, + "step": 57370 + }, + { + "epoch": 1.16729736328125, + "grad_norm": 17.040719985961914, + "learning_rate": 8.046332641686854e-06, + "loss": 4.8219, + "step": 57375 + }, + { + "epoch": 1.1673990885416667, + "grad_norm": 16.343536376953125, + "learning_rate": 8.046015701522343e-06, + "loss": 5.1221, + "step": 57380 + }, + { + "epoch": 1.1675008138020833, + "grad_norm": 24.439393997192383, + "learning_rate": 8.045698741894758e-06, + "loss": 5.0583, + "step": 57385 + }, + { + "epoch": 1.1676025390625, + "grad_norm": 12.2423095703125, + "learning_rate": 8.045381762806126e-06, + "loss": 4.7696, + "step": 57390 + }, + { + "epoch": 1.1677042643229167, + "grad_norm": 21.53199577331543, + "learning_rate": 8.04506476425847e-06, + "loss": 4.8452, + "step": 57395 + }, + { + "epoch": 1.1678059895833333, + "grad_norm": 18.968639373779297, + "learning_rate": 8.044747746253819e-06, + "loss": 4.8856, + "step": 57400 + }, + { + "epoch": 1.16790771484375, + "grad_norm": 17.137245178222656, + "learning_rate": 8.044430708794193e-06, + "loss": 4.6399, + "step": 57405 + }, + { + "epoch": 1.1680094401041667, + "grad_norm": 20.02597999572754, + "learning_rate": 8.044113651881624e-06, + "loss": 4.7855, + "step": 57410 + }, + { + "epoch": 1.1681111653645833, + "grad_norm": 14.32944107055664, + "learning_rate": 8.043796575518133e-06, + "loss": 4.9267, + "step": 57415 + }, + { + "epoch": 1.168212890625, + "grad_norm": 19.07976531982422, + "learning_rate": 8.043479479705749e-06, + "loss": 5.1139, + "step": 57420 + }, + { + "epoch": 1.1683146158854167, + "grad_norm": 13.549330711364746, + "learning_rate": 8.043162364446497e-06, + "loss": 4.6424, + "step": 57425 + }, + { + "epoch": 1.1684163411458333, + "grad_norm": 26.808868408203125, + "learning_rate": 8.042845229742405e-06, + "loss": 4.9652, + "step": 57430 + }, + { + "epoch": 1.16851806640625, + "grad_norm": 15.236144065856934, + "learning_rate": 8.042528075595496e-06, + "loss": 5.1378, + "step": 57435 + }, + { + "epoch": 1.1686197916666667, + "grad_norm": 19.63205337524414, + "learning_rate": 8.0422109020078e-06, + "loss": 4.9585, + "step": 57440 + }, + { + "epoch": 1.1687215169270833, + "grad_norm": 16.18246841430664, + "learning_rate": 8.041893708981342e-06, + "loss": 4.8433, + "step": 57445 + }, + { + "epoch": 1.1688232421875, + "grad_norm": 16.1866397857666, + "learning_rate": 8.041576496518146e-06, + "loss": 5.0913, + "step": 57450 + }, + { + "epoch": 1.1689249674479167, + "grad_norm": 18.875804901123047, + "learning_rate": 8.041259264620245e-06, + "loss": 4.7698, + "step": 57455 + }, + { + "epoch": 1.1690266927083333, + "grad_norm": 14.518885612487793, + "learning_rate": 8.040942013289662e-06, + "loss": 4.6859, + "step": 57460 + }, + { + "epoch": 1.16912841796875, + "grad_norm": 13.565038681030273, + "learning_rate": 8.040624742528424e-06, + "loss": 5.1215, + "step": 57465 + }, + { + "epoch": 1.1692301432291667, + "grad_norm": 16.51409912109375, + "learning_rate": 8.040307452338561e-06, + "loss": 4.9096, + "step": 57470 + }, + { + "epoch": 1.1693318684895833, + "grad_norm": 18.13043785095215, + "learning_rate": 8.039990142722096e-06, + "loss": 5.079, + "step": 57475 + }, + { + "epoch": 1.16943359375, + "grad_norm": 15.140172958374023, + "learning_rate": 8.039672813681061e-06, + "loss": 4.9366, + "step": 57480 + }, + { + "epoch": 1.1695353190104167, + "grad_norm": 20.964237213134766, + "learning_rate": 8.03935546521748e-06, + "loss": 5.024, + "step": 57485 + }, + { + "epoch": 1.1696370442708333, + "grad_norm": 14.068255424499512, + "learning_rate": 8.039038097333382e-06, + "loss": 5.1912, + "step": 57490 + }, + { + "epoch": 1.16973876953125, + "grad_norm": 20.848207473754883, + "learning_rate": 8.038720710030797e-06, + "loss": 4.9217, + "step": 57495 + }, + { + "epoch": 1.1698404947916667, + "grad_norm": 15.382038116455078, + "learning_rate": 8.03840330331175e-06, + "loss": 4.6736, + "step": 57500 + }, + { + "epoch": 1.1699422200520833, + "grad_norm": 19.518207550048828, + "learning_rate": 8.038085877178273e-06, + "loss": 4.9871, + "step": 57505 + }, + { + "epoch": 1.1700439453125, + "grad_norm": 17.111848831176758, + "learning_rate": 8.03776843163239e-06, + "loss": 5.1003, + "step": 57510 + }, + { + "epoch": 1.1701456705729167, + "grad_norm": 18.54449462890625, + "learning_rate": 8.03745096667613e-06, + "loss": 4.8886, + "step": 57515 + }, + { + "epoch": 1.1702473958333333, + "grad_norm": 20.171476364135742, + "learning_rate": 8.037133482311522e-06, + "loss": 5.1617, + "step": 57520 + }, + { + "epoch": 1.17034912109375, + "grad_norm": 15.96645736694336, + "learning_rate": 8.036815978540598e-06, + "loss": 5.0402, + "step": 57525 + }, + { + "epoch": 1.1704508463541667, + "grad_norm": 17.245468139648438, + "learning_rate": 8.036498455365383e-06, + "loss": 4.9973, + "step": 57530 + }, + { + "epoch": 1.1705525716145833, + "grad_norm": 18.842863082885742, + "learning_rate": 8.036180912787905e-06, + "loss": 4.8416, + "step": 57535 + }, + { + "epoch": 1.170654296875, + "grad_norm": 20.625043869018555, + "learning_rate": 8.035863350810195e-06, + "loss": 4.8652, + "step": 57540 + }, + { + "epoch": 1.1707560221354167, + "grad_norm": 20.66164779663086, + "learning_rate": 8.035545769434285e-06, + "loss": 4.7798, + "step": 57545 + }, + { + "epoch": 1.1708577473958333, + "grad_norm": 17.127296447753906, + "learning_rate": 8.035228168662199e-06, + "loss": 4.7995, + "step": 57550 + }, + { + "epoch": 1.17095947265625, + "grad_norm": 18.031721115112305, + "learning_rate": 8.034910548495967e-06, + "loss": 4.9567, + "step": 57555 + }, + { + "epoch": 1.1710611979166667, + "grad_norm": 17.102922439575195, + "learning_rate": 8.034592908937623e-06, + "loss": 4.8124, + "step": 57560 + }, + { + "epoch": 1.1711629231770833, + "grad_norm": 12.37092399597168, + "learning_rate": 8.034275249989191e-06, + "loss": 5.2459, + "step": 57565 + }, + { + "epoch": 1.1712646484375, + "grad_norm": 20.76835060119629, + "learning_rate": 8.033957571652706e-06, + "loss": 5.177, + "step": 57570 + }, + { + "epoch": 1.1713663736979167, + "grad_norm": 16.21285057067871, + "learning_rate": 8.033639873930193e-06, + "loss": 5.0736, + "step": 57575 + }, + { + "epoch": 1.1714680989583333, + "grad_norm": 15.574462890625, + "learning_rate": 8.033322156823685e-06, + "loss": 4.7188, + "step": 57580 + }, + { + "epoch": 1.17156982421875, + "grad_norm": 17.206279754638672, + "learning_rate": 8.033004420335211e-06, + "loss": 4.9809, + "step": 57585 + }, + { + "epoch": 1.1716715494791667, + "grad_norm": 18.126859664916992, + "learning_rate": 8.032686664466802e-06, + "loss": 4.9792, + "step": 57590 + }, + { + "epoch": 1.1717732747395833, + "grad_norm": 23.159116744995117, + "learning_rate": 8.032368889220487e-06, + "loss": 4.7337, + "step": 57595 + }, + { + "epoch": 1.171875, + "grad_norm": 17.8488826751709, + "learning_rate": 8.0320510945983e-06, + "loss": 4.7697, + "step": 57600 + }, + { + "epoch": 1.1719767252604167, + "grad_norm": 19.89524269104004, + "learning_rate": 8.031733280602266e-06, + "loss": 4.9203, + "step": 57605 + }, + { + "epoch": 1.1720784505208333, + "grad_norm": 16.501239776611328, + "learning_rate": 8.031415447234421e-06, + "loss": 4.9508, + "step": 57610 + }, + { + "epoch": 1.17218017578125, + "grad_norm": 14.980035781860352, + "learning_rate": 8.031097594496791e-06, + "loss": 5.001, + "step": 57615 + }, + { + "epoch": 1.1722819010416667, + "grad_norm": 20.361677169799805, + "learning_rate": 8.030779722391412e-06, + "loss": 4.7405, + "step": 57620 + }, + { + "epoch": 1.1723836263020833, + "grad_norm": 22.373355865478516, + "learning_rate": 8.030461830920311e-06, + "loss": 4.8654, + "step": 57625 + }, + { + "epoch": 1.1724853515625, + "grad_norm": 17.12364387512207, + "learning_rate": 8.030143920085521e-06, + "loss": 4.935, + "step": 57630 + }, + { + "epoch": 1.1725870768229167, + "grad_norm": 25.682859420776367, + "learning_rate": 8.029825989889074e-06, + "loss": 4.9375, + "step": 57635 + }, + { + "epoch": 1.1726888020833333, + "grad_norm": 14.293022155761719, + "learning_rate": 8.029508040333e-06, + "loss": 4.934, + "step": 57640 + }, + { + "epoch": 1.17279052734375, + "grad_norm": 16.249311447143555, + "learning_rate": 8.029190071419333e-06, + "loss": 5.1041, + "step": 57645 + }, + { + "epoch": 1.1728922526041667, + "grad_norm": 14.22036075592041, + "learning_rate": 8.028872083150103e-06, + "loss": 5.0145, + "step": 57650 + }, + { + "epoch": 1.1729939778645833, + "grad_norm": 17.673803329467773, + "learning_rate": 8.02855407552734e-06, + "loss": 5.3315, + "step": 57655 + }, + { + "epoch": 1.173095703125, + "grad_norm": 13.095314025878906, + "learning_rate": 8.02823604855308e-06, + "loss": 4.6063, + "step": 57660 + }, + { + "epoch": 1.1731974283854167, + "grad_norm": 14.978985786437988, + "learning_rate": 8.02791800222935e-06, + "loss": 5.3847, + "step": 57665 + }, + { + "epoch": 1.1732991536458333, + "grad_norm": 15.876484870910645, + "learning_rate": 8.027599936558189e-06, + "loss": 4.814, + "step": 57670 + }, + { + "epoch": 1.17340087890625, + "grad_norm": 18.41143226623535, + "learning_rate": 8.027281851541622e-06, + "loss": 4.8868, + "step": 57675 + }, + { + "epoch": 1.1735026041666667, + "grad_norm": 18.132320404052734, + "learning_rate": 8.026963747181686e-06, + "loss": 4.9804, + "step": 57680 + }, + { + "epoch": 1.1736043294270833, + "grad_norm": 15.332298278808594, + "learning_rate": 8.026645623480414e-06, + "loss": 4.8478, + "step": 57685 + }, + { + "epoch": 1.1737060546875, + "grad_norm": 14.270827293395996, + "learning_rate": 8.026327480439836e-06, + "loss": 4.9521, + "step": 57690 + }, + { + "epoch": 1.1738077799479167, + "grad_norm": 14.818568229675293, + "learning_rate": 8.026009318061985e-06, + "loss": 4.9813, + "step": 57695 + }, + { + "epoch": 1.1739095052083333, + "grad_norm": 16.808399200439453, + "learning_rate": 8.025691136348896e-06, + "loss": 4.7398, + "step": 57700 + }, + { + "epoch": 1.17401123046875, + "grad_norm": 16.204566955566406, + "learning_rate": 8.025372935302601e-06, + "loss": 4.7837, + "step": 57705 + }, + { + "epoch": 1.1741129557291667, + "grad_norm": 19.553586959838867, + "learning_rate": 8.025054714925133e-06, + "loss": 5.0351, + "step": 57710 + }, + { + "epoch": 1.1742146809895833, + "grad_norm": 19.498920440673828, + "learning_rate": 8.024736475218528e-06, + "loss": 4.8974, + "step": 57715 + }, + { + "epoch": 1.17431640625, + "grad_norm": 23.036113739013672, + "learning_rate": 8.024418216184816e-06, + "loss": 5.1051, + "step": 57720 + }, + { + "epoch": 1.1744181315104167, + "grad_norm": 16.29098129272461, + "learning_rate": 8.02409993782603e-06, + "loss": 4.8666, + "step": 57725 + }, + { + "epoch": 1.1745198567708333, + "grad_norm": 18.05074691772461, + "learning_rate": 8.023781640144207e-06, + "loss": 4.9134, + "step": 57730 + }, + { + "epoch": 1.17462158203125, + "grad_norm": 16.53269386291504, + "learning_rate": 8.023463323141378e-06, + "loss": 4.8186, + "step": 57735 + }, + { + "epoch": 1.1747233072916667, + "grad_norm": 15.758800506591797, + "learning_rate": 8.02314498681958e-06, + "loss": 4.9284, + "step": 57740 + }, + { + "epoch": 1.1748250325520833, + "grad_norm": 14.959686279296875, + "learning_rate": 8.022826631180843e-06, + "loss": 4.8305, + "step": 57745 + }, + { + "epoch": 1.1749267578125, + "grad_norm": 14.537308692932129, + "learning_rate": 8.022508256227204e-06, + "loss": 4.9589, + "step": 57750 + }, + { + "epoch": 1.1750284830729167, + "grad_norm": 23.211467742919922, + "learning_rate": 8.022189861960698e-06, + "loss": 4.7344, + "step": 57755 + }, + { + "epoch": 1.1751302083333333, + "grad_norm": 15.934319496154785, + "learning_rate": 8.021871448383356e-06, + "loss": 5.0296, + "step": 57760 + }, + { + "epoch": 1.17523193359375, + "grad_norm": 15.310772895812988, + "learning_rate": 8.021553015497216e-06, + "loss": 5.1455, + "step": 57765 + }, + { + "epoch": 1.1753336588541667, + "grad_norm": 14.83488941192627, + "learning_rate": 8.021234563304311e-06, + "loss": 4.9282, + "step": 57770 + }, + { + "epoch": 1.1754353841145833, + "grad_norm": 20.38404655456543, + "learning_rate": 8.020916091806678e-06, + "loss": 4.9636, + "step": 57775 + }, + { + "epoch": 1.175537109375, + "grad_norm": 20.142114639282227, + "learning_rate": 8.020597601006349e-06, + "loss": 4.9341, + "step": 57780 + }, + { + "epoch": 1.1756388346354167, + "grad_norm": 15.707342147827148, + "learning_rate": 8.020279090905359e-06, + "loss": 4.949, + "step": 57785 + }, + { + "epoch": 1.1757405598958333, + "grad_norm": 19.344602584838867, + "learning_rate": 8.019960561505747e-06, + "loss": 4.9028, + "step": 57790 + }, + { + "epoch": 1.17584228515625, + "grad_norm": 15.785324096679688, + "learning_rate": 8.019642012809542e-06, + "loss": 4.9231, + "step": 57795 + }, + { + "epoch": 1.1759440104166667, + "grad_norm": 19.00065803527832, + "learning_rate": 8.019323444818786e-06, + "loss": 4.8366, + "step": 57800 + }, + { + "epoch": 1.1760457356770833, + "grad_norm": 13.14077377319336, + "learning_rate": 8.01900485753551e-06, + "loss": 5.3147, + "step": 57805 + }, + { + "epoch": 1.1761474609375, + "grad_norm": 17.29574966430664, + "learning_rate": 8.01868625096175e-06, + "loss": 4.7018, + "step": 57810 + }, + { + "epoch": 1.1762491861979167, + "grad_norm": 21.919801712036133, + "learning_rate": 8.018367625099548e-06, + "loss": 4.9887, + "step": 57815 + }, + { + "epoch": 1.1763509114583333, + "grad_norm": 16.27346420288086, + "learning_rate": 8.018048979950933e-06, + "loss": 4.8494, + "step": 57820 + }, + { + "epoch": 1.17645263671875, + "grad_norm": 16.772594451904297, + "learning_rate": 8.017730315517941e-06, + "loss": 4.7281, + "step": 57825 + }, + { + "epoch": 1.1765543619791667, + "grad_norm": 18.1431884765625, + "learning_rate": 8.017411631802611e-06, + "loss": 4.8768, + "step": 57830 + }, + { + "epoch": 1.1766560872395833, + "grad_norm": 15.198410034179688, + "learning_rate": 8.017092928806978e-06, + "loss": 4.799, + "step": 57835 + }, + { + "epoch": 1.1767578125, + "grad_norm": 19.762178421020508, + "learning_rate": 8.016774206533082e-06, + "loss": 4.9253, + "step": 57840 + }, + { + "epoch": 1.1768595377604167, + "grad_norm": 15.48051643371582, + "learning_rate": 8.016455464982953e-06, + "loss": 4.8659, + "step": 57845 + }, + { + "epoch": 1.1769612630208333, + "grad_norm": 16.14997100830078, + "learning_rate": 8.016136704158633e-06, + "loss": 4.8908, + "step": 57850 + }, + { + "epoch": 1.17706298828125, + "grad_norm": 20.498336791992188, + "learning_rate": 8.015817924062158e-06, + "loss": 4.9025, + "step": 57855 + }, + { + "epoch": 1.1771647135416667, + "grad_norm": 22.256837844848633, + "learning_rate": 8.015499124695561e-06, + "loss": 5.1312, + "step": 57860 + }, + { + "epoch": 1.1772664388020833, + "grad_norm": 22.478321075439453, + "learning_rate": 8.015180306060884e-06, + "loss": 4.8198, + "step": 57865 + }, + { + "epoch": 1.1773681640625, + "grad_norm": 19.169721603393555, + "learning_rate": 8.014861468160162e-06, + "loss": 5.245, + "step": 57870 + }, + { + "epoch": 1.1774698893229167, + "grad_norm": 18.708396911621094, + "learning_rate": 8.01454261099543e-06, + "loss": 4.8747, + "step": 57875 + }, + { + "epoch": 1.1775716145833333, + "grad_norm": 15.999444007873535, + "learning_rate": 8.014223734568729e-06, + "loss": 4.7395, + "step": 57880 + }, + { + "epoch": 1.17767333984375, + "grad_norm": 18.48668098449707, + "learning_rate": 8.013904838882095e-06, + "loss": 4.9889, + "step": 57885 + }, + { + "epoch": 1.1777750651041667, + "grad_norm": 15.019014358520508, + "learning_rate": 8.013585923937565e-06, + "loss": 5.0692, + "step": 57890 + }, + { + "epoch": 1.1778767903645833, + "grad_norm": 15.654541015625, + "learning_rate": 8.01326698973718e-06, + "loss": 5.1092, + "step": 57895 + }, + { + "epoch": 1.177978515625, + "grad_norm": 23.787242889404297, + "learning_rate": 8.012948036282975e-06, + "loss": 5.246, + "step": 57900 + }, + { + "epoch": 1.1780802408854167, + "grad_norm": 24.360300064086914, + "learning_rate": 8.012629063576986e-06, + "loss": 4.845, + "step": 57905 + }, + { + "epoch": 1.1781819661458333, + "grad_norm": 17.506851196289062, + "learning_rate": 8.012310071621257e-06, + "loss": 5.2658, + "step": 57910 + }, + { + "epoch": 1.17828369140625, + "grad_norm": 19.319541931152344, + "learning_rate": 8.01199106041782e-06, + "loss": 4.7831, + "step": 57915 + }, + { + "epoch": 1.1783854166666667, + "grad_norm": 21.011518478393555, + "learning_rate": 8.011672029968719e-06, + "loss": 4.9978, + "step": 57920 + }, + { + "epoch": 1.1784871419270833, + "grad_norm": 12.999550819396973, + "learning_rate": 8.011352980275986e-06, + "loss": 4.7637, + "step": 57925 + }, + { + "epoch": 1.1785888671875, + "grad_norm": 16.150657653808594, + "learning_rate": 8.011033911341666e-06, + "loss": 4.8408, + "step": 57930 + }, + { + "epoch": 1.1786905924479167, + "grad_norm": 19.2450008392334, + "learning_rate": 8.010714823167795e-06, + "loss": 5.0191, + "step": 57935 + }, + { + "epoch": 1.1787923177083333, + "grad_norm": 19.269384384155273, + "learning_rate": 8.010395715756412e-06, + "loss": 4.8485, + "step": 57940 + }, + { + "epoch": 1.17889404296875, + "grad_norm": 19.31490135192871, + "learning_rate": 8.010076589109556e-06, + "loss": 4.7384, + "step": 57945 + }, + { + "epoch": 1.1789957682291667, + "grad_norm": 15.079292297363281, + "learning_rate": 8.009757443229265e-06, + "loss": 4.8808, + "step": 57950 + }, + { + "epoch": 1.1790974934895833, + "grad_norm": 17.514116287231445, + "learning_rate": 8.00943827811758e-06, + "loss": 4.6315, + "step": 57955 + }, + { + "epoch": 1.17919921875, + "grad_norm": 16.406047821044922, + "learning_rate": 8.00911909377654e-06, + "loss": 5.0114, + "step": 57960 + }, + { + "epoch": 1.1793009440104167, + "grad_norm": 16.593660354614258, + "learning_rate": 8.008799890208183e-06, + "loss": 4.901, + "step": 57965 + }, + { + "epoch": 1.1794026692708333, + "grad_norm": 14.806418418884277, + "learning_rate": 8.00848066741455e-06, + "loss": 5.0199, + "step": 57970 + }, + { + "epoch": 1.17950439453125, + "grad_norm": 17.21122932434082, + "learning_rate": 8.008161425397684e-06, + "loss": 4.7728, + "step": 57975 + }, + { + "epoch": 1.1796061197916667, + "grad_norm": 20.682477951049805, + "learning_rate": 8.007842164159618e-06, + "loss": 4.7061, + "step": 57980 + }, + { + "epoch": 1.1797078450520833, + "grad_norm": 17.7463436126709, + "learning_rate": 8.007522883702397e-06, + "loss": 4.9535, + "step": 57985 + }, + { + "epoch": 1.1798095703125, + "grad_norm": 17.709218978881836, + "learning_rate": 8.007203584028058e-06, + "loss": 5.0142, + "step": 57990 + }, + { + "epoch": 1.1799112955729167, + "grad_norm": 17.129716873168945, + "learning_rate": 8.006884265138644e-06, + "loss": 4.8684, + "step": 57995 + }, + { + "epoch": 1.1800130208333333, + "grad_norm": 19.85175895690918, + "learning_rate": 8.006564927036194e-06, + "loss": 5.3327, + "step": 58000 + }, + { + "epoch": 1.18011474609375, + "grad_norm": 13.199762344360352, + "learning_rate": 8.006245569722747e-06, + "loss": 4.9822, + "step": 58005 + }, + { + "epoch": 1.1802164713541667, + "grad_norm": 16.781600952148438, + "learning_rate": 8.005926193200346e-06, + "loss": 5.1513, + "step": 58010 + }, + { + "epoch": 1.1803181966145833, + "grad_norm": 15.882912635803223, + "learning_rate": 8.00560679747103e-06, + "loss": 5.0892, + "step": 58015 + }, + { + "epoch": 1.180419921875, + "grad_norm": 21.586137771606445, + "learning_rate": 8.005287382536844e-06, + "loss": 4.7825, + "step": 58020 + }, + { + "epoch": 1.1805216471354167, + "grad_norm": 23.290613174438477, + "learning_rate": 8.004967948399821e-06, + "loss": 5.1082, + "step": 58025 + }, + { + "epoch": 1.1806233723958333, + "grad_norm": 15.51068115234375, + "learning_rate": 8.00464849506201e-06, + "loss": 5.1528, + "step": 58030 + }, + { + "epoch": 1.18072509765625, + "grad_norm": 25.710098266601562, + "learning_rate": 8.004329022525447e-06, + "loss": 4.9074, + "step": 58035 + }, + { + "epoch": 1.1808268229166667, + "grad_norm": 14.25202751159668, + "learning_rate": 8.004009530792178e-06, + "loss": 5.0476, + "step": 58040 + }, + { + "epoch": 1.1809285481770833, + "grad_norm": 13.928496360778809, + "learning_rate": 8.00369001986424e-06, + "loss": 5.0121, + "step": 58045 + }, + { + "epoch": 1.1810302734375, + "grad_norm": 20.22421646118164, + "learning_rate": 8.003370489743676e-06, + "loss": 4.6356, + "step": 58050 + }, + { + "epoch": 1.1811319986979167, + "grad_norm": 12.134170532226562, + "learning_rate": 8.003050940432527e-06, + "loss": 4.8821, + "step": 58055 + }, + { + "epoch": 1.1812337239583333, + "grad_norm": 19.396560668945312, + "learning_rate": 8.002731371932838e-06, + "loss": 4.8149, + "step": 58060 + }, + { + "epoch": 1.18133544921875, + "grad_norm": 12.79233455657959, + "learning_rate": 8.002411784246649e-06, + "loss": 4.7531, + "step": 58065 + }, + { + "epoch": 1.1814371744791667, + "grad_norm": 20.189537048339844, + "learning_rate": 8.002092177376e-06, + "loss": 5.0105, + "step": 58070 + }, + { + "epoch": 1.1815388997395833, + "grad_norm": 18.84254264831543, + "learning_rate": 8.001772551322934e-06, + "loss": 4.727, + "step": 58075 + }, + { + "epoch": 1.181640625, + "grad_norm": 16.569337844848633, + "learning_rate": 8.001452906089495e-06, + "loss": 5.0354, + "step": 58080 + }, + { + "epoch": 1.1817423502604167, + "grad_norm": 18.174299240112305, + "learning_rate": 8.001133241677726e-06, + "loss": 4.9576, + "step": 58085 + }, + { + "epoch": 1.1818440755208333, + "grad_norm": 15.492441177368164, + "learning_rate": 8.000813558089669e-06, + "loss": 4.8912, + "step": 58090 + }, + { + "epoch": 1.18194580078125, + "grad_norm": 15.574504852294922, + "learning_rate": 8.000493855327365e-06, + "loss": 4.8625, + "step": 58095 + }, + { + "epoch": 1.1820475260416667, + "grad_norm": 18.288219451904297, + "learning_rate": 8.000174133392857e-06, + "loss": 4.8301, + "step": 58100 + }, + { + "epoch": 1.1821492513020833, + "grad_norm": 20.74794578552246, + "learning_rate": 7.999854392288189e-06, + "loss": 4.8663, + "step": 58105 + }, + { + "epoch": 1.1822509765625, + "grad_norm": 11.940775871276855, + "learning_rate": 7.999534632015404e-06, + "loss": 4.7955, + "step": 58110 + }, + { + "epoch": 1.1823527018229167, + "grad_norm": 15.549650192260742, + "learning_rate": 7.999214852576545e-06, + "loss": 4.7453, + "step": 58115 + }, + { + "epoch": 1.1824544270833333, + "grad_norm": 16.357421875, + "learning_rate": 7.998895053973655e-06, + "loss": 4.9935, + "step": 58120 + }, + { + "epoch": 1.18255615234375, + "grad_norm": 16.71915626525879, + "learning_rate": 7.998575236208776e-06, + "loss": 4.8859, + "step": 58125 + }, + { + "epoch": 1.1826578776041667, + "grad_norm": 18.94481086730957, + "learning_rate": 7.998255399283955e-06, + "loss": 5.0251, + "step": 58130 + }, + { + "epoch": 1.1827596028645833, + "grad_norm": 15.922419548034668, + "learning_rate": 7.997935543201232e-06, + "loss": 5.0472, + "step": 58135 + }, + { + "epoch": 1.182861328125, + "grad_norm": 26.19142723083496, + "learning_rate": 7.997615667962654e-06, + "loss": 4.8013, + "step": 58140 + }, + { + "epoch": 1.1829630533854167, + "grad_norm": 19.3118839263916, + "learning_rate": 7.997295773570264e-06, + "loss": 5.081, + "step": 58145 + }, + { + "epoch": 1.1830647786458333, + "grad_norm": 20.37442970275879, + "learning_rate": 7.996975860026105e-06, + "loss": 5.0808, + "step": 58150 + }, + { + "epoch": 1.18316650390625, + "grad_norm": 20.63500213623047, + "learning_rate": 7.996655927332222e-06, + "loss": 5.2336, + "step": 58155 + }, + { + "epoch": 1.1832682291666667, + "grad_norm": 20.473791122436523, + "learning_rate": 7.996335975490658e-06, + "loss": 4.8208, + "step": 58160 + }, + { + "epoch": 1.1833699544270833, + "grad_norm": 22.230308532714844, + "learning_rate": 7.996016004503459e-06, + "loss": 4.8842, + "step": 58165 + }, + { + "epoch": 1.1834716796875, + "grad_norm": 14.505268096923828, + "learning_rate": 7.995696014372668e-06, + "loss": 4.9611, + "step": 58170 + }, + { + "epoch": 1.1835734049479167, + "grad_norm": 15.610993385314941, + "learning_rate": 7.99537600510033e-06, + "loss": 5.0701, + "step": 58175 + }, + { + "epoch": 1.1836751302083333, + "grad_norm": 18.98390769958496, + "learning_rate": 7.995055976688492e-06, + "loss": 5.2463, + "step": 58180 + }, + { + "epoch": 1.18377685546875, + "grad_norm": 15.148300170898438, + "learning_rate": 7.994735929139194e-06, + "loss": 5.0881, + "step": 58185 + }, + { + "epoch": 1.1838785807291667, + "grad_norm": 17.91613006591797, + "learning_rate": 7.994415862454488e-06, + "loss": 4.9884, + "step": 58190 + }, + { + "epoch": 1.1839803059895833, + "grad_norm": 18.467453002929688, + "learning_rate": 7.994095776636413e-06, + "loss": 5.0101, + "step": 58195 + }, + { + "epoch": 1.18408203125, + "grad_norm": 15.671603202819824, + "learning_rate": 7.993775671687017e-06, + "loss": 5.1656, + "step": 58200 + }, + { + "epoch": 1.1841837565104167, + "grad_norm": 14.636541366577148, + "learning_rate": 7.993455547608345e-06, + "loss": 4.8618, + "step": 58205 + }, + { + "epoch": 1.1842854817708333, + "grad_norm": 16.115325927734375, + "learning_rate": 7.99313540440244e-06, + "loss": 4.9916, + "step": 58210 + }, + { + "epoch": 1.18438720703125, + "grad_norm": 16.674287796020508, + "learning_rate": 7.992815242071354e-06, + "loss": 4.9102, + "step": 58215 + }, + { + "epoch": 1.1844889322916667, + "grad_norm": 14.123038291931152, + "learning_rate": 7.992495060617127e-06, + "loss": 4.8712, + "step": 58220 + }, + { + "epoch": 1.1845906575520833, + "grad_norm": 16.022918701171875, + "learning_rate": 7.992174860041805e-06, + "loss": 5.0355, + "step": 58225 + }, + { + "epoch": 1.1846923828125, + "grad_norm": 18.04780387878418, + "learning_rate": 7.991854640347436e-06, + "loss": 4.9608, + "step": 58230 + }, + { + "epoch": 1.1847941080729167, + "grad_norm": 17.997594833374023, + "learning_rate": 7.991534401536066e-06, + "loss": 5.0059, + "step": 58235 + }, + { + "epoch": 1.1848958333333333, + "grad_norm": 16.500459671020508, + "learning_rate": 7.991214143609741e-06, + "loss": 4.9275, + "step": 58240 + }, + { + "epoch": 1.18499755859375, + "grad_norm": 15.205491065979004, + "learning_rate": 7.990893866570507e-06, + "loss": 4.6924, + "step": 58245 + }, + { + "epoch": 1.1850992838541667, + "grad_norm": 16.032989501953125, + "learning_rate": 7.99057357042041e-06, + "loss": 4.6345, + "step": 58250 + }, + { + "epoch": 1.1852010091145833, + "grad_norm": 17.992290496826172, + "learning_rate": 7.9902532551615e-06, + "loss": 4.878, + "step": 58255 + }, + { + "epoch": 1.185302734375, + "grad_norm": 18.95846939086914, + "learning_rate": 7.989932920795817e-06, + "loss": 4.8455, + "step": 58260 + }, + { + "epoch": 1.1854044596354167, + "grad_norm": 18.21095848083496, + "learning_rate": 7.989612567325414e-06, + "loss": 4.9564, + "step": 58265 + }, + { + "epoch": 1.1855061848958333, + "grad_norm": 17.850976943969727, + "learning_rate": 7.989292194752336e-06, + "loss": 4.698, + "step": 58270 + }, + { + "epoch": 1.18560791015625, + "grad_norm": 14.693441390991211, + "learning_rate": 7.988971803078629e-06, + "loss": 4.9869, + "step": 58275 + }, + { + "epoch": 1.1857096354166667, + "grad_norm": 16.870506286621094, + "learning_rate": 7.988651392306341e-06, + "loss": 4.7904, + "step": 58280 + }, + { + "epoch": 1.1858113606770833, + "grad_norm": 18.67391014099121, + "learning_rate": 7.988330962437518e-06, + "loss": 5.0823, + "step": 58285 + }, + { + "epoch": 1.1859130859375, + "grad_norm": 20.414562225341797, + "learning_rate": 7.988010513474211e-06, + "loss": 4.9832, + "step": 58290 + }, + { + "epoch": 1.1860148111979167, + "grad_norm": 19.396324157714844, + "learning_rate": 7.987690045418465e-06, + "loss": 4.912, + "step": 58295 + }, + { + "epoch": 1.1861165364583333, + "grad_norm": 14.933368682861328, + "learning_rate": 7.987369558272327e-06, + "loss": 4.582, + "step": 58300 + }, + { + "epoch": 1.18621826171875, + "grad_norm": 19.080764770507812, + "learning_rate": 7.987049052037846e-06, + "loss": 4.9093, + "step": 58305 + }, + { + "epoch": 1.1863199869791667, + "grad_norm": 17.030071258544922, + "learning_rate": 7.986728526717071e-06, + "loss": 4.7464, + "step": 58310 + }, + { + "epoch": 1.1864217122395833, + "grad_norm": 22.49353790283203, + "learning_rate": 7.986407982312048e-06, + "loss": 4.6598, + "step": 58315 + }, + { + "epoch": 1.1865234375, + "grad_norm": 16.752084732055664, + "learning_rate": 7.986087418824827e-06, + "loss": 4.8193, + "step": 58320 + }, + { + "epoch": 1.1866251627604167, + "grad_norm": 11.99448299407959, + "learning_rate": 7.985766836257454e-06, + "loss": 4.945, + "step": 58325 + }, + { + "epoch": 1.1867268880208333, + "grad_norm": 17.80876350402832, + "learning_rate": 7.985446234611978e-06, + "loss": 4.9399, + "step": 58330 + }, + { + "epoch": 1.18682861328125, + "grad_norm": 21.48919105529785, + "learning_rate": 7.98512561389045e-06, + "loss": 4.9166, + "step": 58335 + }, + { + "epoch": 1.1869303385416667, + "grad_norm": 19.383947372436523, + "learning_rate": 7.984804974094916e-06, + "loss": 4.9605, + "step": 58340 + }, + { + "epoch": 1.1870320638020833, + "grad_norm": 18.039506912231445, + "learning_rate": 7.984484315227425e-06, + "loss": 4.988, + "step": 58345 + }, + { + "epoch": 1.1871337890625, + "grad_norm": 18.93854331970215, + "learning_rate": 7.984163637290028e-06, + "loss": 4.7135, + "step": 58350 + }, + { + "epoch": 1.1872355143229167, + "grad_norm": 18.540346145629883, + "learning_rate": 7.983842940284773e-06, + "loss": 5.1193, + "step": 58355 + }, + { + "epoch": 1.1873372395833333, + "grad_norm": 17.451431274414062, + "learning_rate": 7.983522224213708e-06, + "loss": 5.2331, + "step": 58360 + }, + { + "epoch": 1.18743896484375, + "grad_norm": 17.066566467285156, + "learning_rate": 7.983201489078881e-06, + "loss": 4.8964, + "step": 58365 + }, + { + "epoch": 1.1875406901041667, + "grad_norm": 20.136266708374023, + "learning_rate": 7.982880734882347e-06, + "loss": 4.7592, + "step": 58370 + }, + { + "epoch": 1.1876424153645833, + "grad_norm": 16.579452514648438, + "learning_rate": 7.982559961626153e-06, + "loss": 4.909, + "step": 58375 + }, + { + "epoch": 1.187744140625, + "grad_norm": 20.03336524963379, + "learning_rate": 7.982239169312343e-06, + "loss": 4.9214, + "step": 58380 + }, + { + "epoch": 1.1878458658854167, + "grad_norm": 18.570281982421875, + "learning_rate": 7.981918357942976e-06, + "loss": 4.9072, + "step": 58385 + }, + { + "epoch": 1.1879475911458333, + "grad_norm": 18.62318992614746, + "learning_rate": 7.981597527520093e-06, + "loss": 5.051, + "step": 58390 + }, + { + "epoch": 1.18804931640625, + "grad_norm": 19.656248092651367, + "learning_rate": 7.981276678045751e-06, + "loss": 4.8443, + "step": 58395 + }, + { + "epoch": 1.1881510416666667, + "grad_norm": 20.42673683166504, + "learning_rate": 7.980955809521995e-06, + "loss": 5.0469, + "step": 58400 + }, + { + "epoch": 1.1882527669270833, + "grad_norm": 14.096620559692383, + "learning_rate": 7.980634921950881e-06, + "loss": 5.0713, + "step": 58405 + }, + { + "epoch": 1.1883544921875, + "grad_norm": 14.637540817260742, + "learning_rate": 7.980314015334455e-06, + "loss": 5.2144, + "step": 58410 + }, + { + "epoch": 1.1884562174479167, + "grad_norm": 16.51445770263672, + "learning_rate": 7.979993089674767e-06, + "loss": 5.1411, + "step": 58415 + }, + { + "epoch": 1.1885579427083333, + "grad_norm": 18.35549545288086, + "learning_rate": 7.979672144973869e-06, + "loss": 4.9244, + "step": 58420 + }, + { + "epoch": 1.18865966796875, + "grad_norm": 18.854530334472656, + "learning_rate": 7.979351181233815e-06, + "loss": 5.1538, + "step": 58425 + }, + { + "epoch": 1.1887613932291667, + "grad_norm": 19.696693420410156, + "learning_rate": 7.97903019845665e-06, + "loss": 4.8432, + "step": 58430 + }, + { + "epoch": 1.1888631184895833, + "grad_norm": 16.417232513427734, + "learning_rate": 7.978709196644428e-06, + "loss": 4.7412, + "step": 58435 + }, + { + "epoch": 1.18896484375, + "grad_norm": 17.217361450195312, + "learning_rate": 7.9783881757992e-06, + "loss": 4.9209, + "step": 58440 + }, + { + "epoch": 1.1890665690104167, + "grad_norm": 15.760072708129883, + "learning_rate": 7.978067135923017e-06, + "loss": 4.9059, + "step": 58445 + }, + { + "epoch": 1.1891682942708333, + "grad_norm": 16.263944625854492, + "learning_rate": 7.97774607701793e-06, + "loss": 4.809, + "step": 58450 + }, + { + "epoch": 1.18927001953125, + "grad_norm": 13.49644947052002, + "learning_rate": 7.977424999085991e-06, + "loss": 4.9091, + "step": 58455 + }, + { + "epoch": 1.1893717447916667, + "grad_norm": 21.68926429748535, + "learning_rate": 7.977103902129252e-06, + "loss": 4.9819, + "step": 58460 + }, + { + "epoch": 1.1894734700520833, + "grad_norm": 21.847694396972656, + "learning_rate": 7.976782786149764e-06, + "loss": 4.8269, + "step": 58465 + }, + { + "epoch": 1.1895751953125, + "grad_norm": 16.627254486083984, + "learning_rate": 7.976461651149579e-06, + "loss": 5.0899, + "step": 58470 + }, + { + "epoch": 1.1896769205729167, + "grad_norm": 16.54035758972168, + "learning_rate": 7.97614049713075e-06, + "loss": 4.9453, + "step": 58475 + }, + { + "epoch": 1.1897786458333333, + "grad_norm": 16.40277099609375, + "learning_rate": 7.975819324095327e-06, + "loss": 4.6241, + "step": 58480 + }, + { + "epoch": 1.18988037109375, + "grad_norm": 20.43531036376953, + "learning_rate": 7.97549813204536e-06, + "loss": 5.0319, + "step": 58485 + }, + { + "epoch": 1.1899820963541667, + "grad_norm": 20.609294891357422, + "learning_rate": 7.975176920982908e-06, + "loss": 4.8901, + "step": 58490 + }, + { + "epoch": 1.1900838216145833, + "grad_norm": 15.379325866699219, + "learning_rate": 7.974855690910019e-06, + "loss": 4.8137, + "step": 58495 + }, + { + "epoch": 1.190185546875, + "grad_norm": 15.760255813598633, + "learning_rate": 7.974534441828748e-06, + "loss": 5.005, + "step": 58500 + }, + { + "epoch": 1.1902872721354167, + "grad_norm": 19.003284454345703, + "learning_rate": 7.974213173741146e-06, + "loss": 4.9771, + "step": 58505 + }, + { + "epoch": 1.1903889973958333, + "grad_norm": 22.24993896484375, + "learning_rate": 7.973891886649263e-06, + "loss": 5.1521, + "step": 58510 + }, + { + "epoch": 1.19049072265625, + "grad_norm": 16.22916603088379, + "learning_rate": 7.973570580555156e-06, + "loss": 4.7884, + "step": 58515 + }, + { + "epoch": 1.1905924479166667, + "grad_norm": 19.655622482299805, + "learning_rate": 7.973249255460879e-06, + "loss": 5.53, + "step": 58520 + }, + { + "epoch": 1.1906941731770833, + "grad_norm": 18.759918212890625, + "learning_rate": 7.972927911368483e-06, + "loss": 4.728, + "step": 58525 + }, + { + "epoch": 1.1907958984375, + "grad_norm": 15.147451400756836, + "learning_rate": 7.972606548280019e-06, + "loss": 4.7727, + "step": 58530 + }, + { + "epoch": 1.1908976236979167, + "grad_norm": 18.73996925354004, + "learning_rate": 7.972285166197545e-06, + "loss": 4.8706, + "step": 58535 + }, + { + "epoch": 1.1909993489583333, + "grad_norm": 16.91047477722168, + "learning_rate": 7.97196376512311e-06, + "loss": 5.1058, + "step": 58540 + }, + { + "epoch": 1.19110107421875, + "grad_norm": 17.949338912963867, + "learning_rate": 7.971642345058773e-06, + "loss": 5.1727, + "step": 58545 + }, + { + "epoch": 1.1912027994791667, + "grad_norm": 13.464547157287598, + "learning_rate": 7.97132090600658e-06, + "loss": 4.934, + "step": 58550 + }, + { + "epoch": 1.1913045247395833, + "grad_norm": 16.004053115844727, + "learning_rate": 7.970999447968593e-06, + "loss": 4.9893, + "step": 58555 + }, + { + "epoch": 1.19140625, + "grad_norm": 17.412994384765625, + "learning_rate": 7.970677970946863e-06, + "loss": 5.1163, + "step": 58560 + }, + { + "epoch": 1.1915079752604167, + "grad_norm": 16.35930061340332, + "learning_rate": 7.970356474943444e-06, + "loss": 4.7702, + "step": 58565 + }, + { + "epoch": 1.1916097005208333, + "grad_norm": 19.22073745727539, + "learning_rate": 7.970034959960389e-06, + "loss": 5.1063, + "step": 58570 + }, + { + "epoch": 1.19171142578125, + "grad_norm": 16.263940811157227, + "learning_rate": 7.969713425999752e-06, + "loss": 4.9041, + "step": 58575 + }, + { + "epoch": 1.1918131510416667, + "grad_norm": 24.907896041870117, + "learning_rate": 7.969391873063592e-06, + "loss": 4.8436, + "step": 58580 + }, + { + "epoch": 1.1919148763020833, + "grad_norm": 21.64178466796875, + "learning_rate": 7.969070301153958e-06, + "loss": 4.6872, + "step": 58585 + }, + { + "epoch": 1.1920166015625, + "grad_norm": 15.055084228515625, + "learning_rate": 7.968748710272908e-06, + "loss": 5.1087, + "step": 58590 + }, + { + "epoch": 1.1921183268229167, + "grad_norm": 15.213611602783203, + "learning_rate": 7.968427100422499e-06, + "loss": 5.159, + "step": 58595 + }, + { + "epoch": 1.1922200520833333, + "grad_norm": 15.038260459899902, + "learning_rate": 7.968105471604778e-06, + "loss": 4.8583, + "step": 58600 + }, + { + "epoch": 1.19232177734375, + "grad_norm": 14.987110137939453, + "learning_rate": 7.967783823821809e-06, + "loss": 4.7288, + "step": 58605 + }, + { + "epoch": 1.1924235026041667, + "grad_norm": 16.569866180419922, + "learning_rate": 7.967462157075642e-06, + "loss": 4.9555, + "step": 58610 + }, + { + "epoch": 1.1925252278645833, + "grad_norm": 19.915037155151367, + "learning_rate": 7.967140471368333e-06, + "loss": 4.9263, + "step": 58615 + }, + { + "epoch": 1.192626953125, + "grad_norm": 18.001537322998047, + "learning_rate": 7.96681876670194e-06, + "loss": 4.895, + "step": 58620 + }, + { + "epoch": 1.1927286783854167, + "grad_norm": 25.93442153930664, + "learning_rate": 7.966497043078516e-06, + "loss": 4.79, + "step": 58625 + }, + { + "epoch": 1.1928304036458333, + "grad_norm": 17.53459358215332, + "learning_rate": 7.966175300500119e-06, + "loss": 4.9262, + "step": 58630 + }, + { + "epoch": 1.19293212890625, + "grad_norm": 13.724899291992188, + "learning_rate": 7.965853538968802e-06, + "loss": 4.7975, + "step": 58635 + }, + { + "epoch": 1.1930338541666667, + "grad_norm": 14.19210147857666, + "learning_rate": 7.965531758486622e-06, + "loss": 4.9438, + "step": 58640 + }, + { + "epoch": 1.1931355794270833, + "grad_norm": 15.823081016540527, + "learning_rate": 7.965209959055637e-06, + "loss": 4.9052, + "step": 58645 + }, + { + "epoch": 1.1932373046875, + "grad_norm": 18.019567489624023, + "learning_rate": 7.9648881406779e-06, + "loss": 5.3805, + "step": 58650 + }, + { + "epoch": 1.1933390299479167, + "grad_norm": 17.422765731811523, + "learning_rate": 7.96456630335547e-06, + "loss": 4.9169, + "step": 58655 + }, + { + "epoch": 1.1934407552083333, + "grad_norm": 19.93830680847168, + "learning_rate": 7.964244447090401e-06, + "loss": 5.1896, + "step": 58660 + }, + { + "epoch": 1.19354248046875, + "grad_norm": 15.367193222045898, + "learning_rate": 7.963922571884752e-06, + "loss": 4.7255, + "step": 58665 + }, + { + "epoch": 1.1936442057291667, + "grad_norm": 13.32784652709961, + "learning_rate": 7.96360067774058e-06, + "loss": 4.6876, + "step": 58670 + }, + { + "epoch": 1.1937459309895833, + "grad_norm": 19.163267135620117, + "learning_rate": 7.963278764659939e-06, + "loss": 4.5721, + "step": 58675 + }, + { + "epoch": 1.19384765625, + "grad_norm": 22.357816696166992, + "learning_rate": 7.962956832644888e-06, + "loss": 4.8381, + "step": 58680 + }, + { + "epoch": 1.1939493815104167, + "grad_norm": 19.4720516204834, + "learning_rate": 7.962634881697481e-06, + "loss": 4.927, + "step": 58685 + }, + { + "epoch": 1.1940511067708333, + "grad_norm": 21.53831672668457, + "learning_rate": 7.962312911819781e-06, + "loss": 4.9365, + "step": 58690 + }, + { + "epoch": 1.19415283203125, + "grad_norm": 19.30400276184082, + "learning_rate": 7.96199092301384e-06, + "loss": 4.802, + "step": 58695 + }, + { + "epoch": 1.1942545572916667, + "grad_norm": 18.241975784301758, + "learning_rate": 7.961668915281717e-06, + "loss": 4.9171, + "step": 58700 + }, + { + "epoch": 1.1943562825520833, + "grad_norm": 16.663022994995117, + "learning_rate": 7.96134688862547e-06, + "loss": 4.751, + "step": 58705 + }, + { + "epoch": 1.1944580078125, + "grad_norm": 18.094484329223633, + "learning_rate": 7.961024843047159e-06, + "loss": 4.9579, + "step": 58710 + }, + { + "epoch": 1.1945597330729167, + "grad_norm": 15.23846435546875, + "learning_rate": 7.960702778548835e-06, + "loss": 4.9695, + "step": 58715 + }, + { + "epoch": 1.1946614583333333, + "grad_norm": 16.02237892150879, + "learning_rate": 7.960380695132563e-06, + "loss": 5.0269, + "step": 58720 + }, + { + "epoch": 1.19476318359375, + "grad_norm": 15.496077537536621, + "learning_rate": 7.960058592800398e-06, + "loss": 5.0325, + "step": 58725 + }, + { + "epoch": 1.1948649088541667, + "grad_norm": 17.667585372924805, + "learning_rate": 7.959736471554395e-06, + "loss": 4.9838, + "step": 58730 + }, + { + "epoch": 1.1949666341145833, + "grad_norm": 20.95571517944336, + "learning_rate": 7.95941433139662e-06, + "loss": 5.0188, + "step": 58735 + }, + { + "epoch": 1.195068359375, + "grad_norm": 15.146299362182617, + "learning_rate": 7.959092172329124e-06, + "loss": 4.7965, + "step": 58740 + }, + { + "epoch": 1.1951700846354167, + "grad_norm": 26.29986572265625, + "learning_rate": 7.95876999435397e-06, + "loss": 4.9837, + "step": 58745 + }, + { + "epoch": 1.1952718098958333, + "grad_norm": 16.03529930114746, + "learning_rate": 7.958447797473214e-06, + "loss": 4.9196, + "step": 58750 + }, + { + "epoch": 1.19537353515625, + "grad_norm": 15.175127029418945, + "learning_rate": 7.958125581688915e-06, + "loss": 4.8847, + "step": 58755 + }, + { + "epoch": 1.1954752604166667, + "grad_norm": 17.291664123535156, + "learning_rate": 7.957803347003134e-06, + "loss": 4.779, + "step": 58760 + }, + { + "epoch": 1.1955769856770833, + "grad_norm": 27.522581100463867, + "learning_rate": 7.957481093417928e-06, + "loss": 4.7949, + "step": 58765 + }, + { + "epoch": 1.1956787109375, + "grad_norm": 23.66109275817871, + "learning_rate": 7.957158820935356e-06, + "loss": 5.1252, + "step": 58770 + }, + { + "epoch": 1.1957804361979167, + "grad_norm": 14.842301368713379, + "learning_rate": 7.956836529557479e-06, + "loss": 5.0772, + "step": 58775 + }, + { + "epoch": 1.1958821614583333, + "grad_norm": 16.81344985961914, + "learning_rate": 7.956514219286355e-06, + "loss": 4.8072, + "step": 58780 + }, + { + "epoch": 1.19598388671875, + "grad_norm": 13.640575408935547, + "learning_rate": 7.956191890124043e-06, + "loss": 5.056, + "step": 58785 + }, + { + "epoch": 1.1960856119791667, + "grad_norm": 19.105710983276367, + "learning_rate": 7.955869542072604e-06, + "loss": 4.9269, + "step": 58790 + }, + { + "epoch": 1.1961873372395833, + "grad_norm": 13.613615989685059, + "learning_rate": 7.955547175134096e-06, + "loss": 5.0717, + "step": 58795 + }, + { + "epoch": 1.1962890625, + "grad_norm": 15.315388679504395, + "learning_rate": 7.955224789310581e-06, + "loss": 4.8904, + "step": 58800 + }, + { + "epoch": 1.1963907877604167, + "grad_norm": 14.984381675720215, + "learning_rate": 7.954902384604117e-06, + "loss": 4.9388, + "step": 58805 + }, + { + "epoch": 1.1964925130208333, + "grad_norm": 15.410088539123535, + "learning_rate": 7.954579961016764e-06, + "loss": 5.0569, + "step": 58810 + }, + { + "epoch": 1.19659423828125, + "grad_norm": 15.096136093139648, + "learning_rate": 7.954257518550585e-06, + "loss": 4.8412, + "step": 58815 + }, + { + "epoch": 1.1966959635416667, + "grad_norm": 18.827728271484375, + "learning_rate": 7.953935057207636e-06, + "loss": 5.2736, + "step": 58820 + }, + { + "epoch": 1.1967976888020833, + "grad_norm": 16.399866104125977, + "learning_rate": 7.953612576989983e-06, + "loss": 4.8318, + "step": 58825 + }, + { + "epoch": 1.1968994140625, + "grad_norm": 22.8079776763916, + "learning_rate": 7.953290077899679e-06, + "loss": 4.9595, + "step": 58830 + }, + { + "epoch": 1.1970011393229167, + "grad_norm": 22.911632537841797, + "learning_rate": 7.952967559938792e-06, + "loss": 5.102, + "step": 58835 + }, + { + "epoch": 1.1971028645833333, + "grad_norm": 18.002140045166016, + "learning_rate": 7.952645023109378e-06, + "loss": 4.8287, + "step": 58840 + }, + { + "epoch": 1.19720458984375, + "grad_norm": 14.69532299041748, + "learning_rate": 7.952322467413499e-06, + "loss": 4.8183, + "step": 58845 + }, + { + "epoch": 1.1973063151041667, + "grad_norm": 18.20501136779785, + "learning_rate": 7.951999892853218e-06, + "loss": 5.048, + "step": 58850 + }, + { + "epoch": 1.1974080403645833, + "grad_norm": 15.432168006896973, + "learning_rate": 7.951677299430596e-06, + "loss": 5.0438, + "step": 58855 + }, + { + "epoch": 1.197509765625, + "grad_norm": 15.92208480834961, + "learning_rate": 7.95135468714769e-06, + "loss": 5.0036, + "step": 58860 + }, + { + "epoch": 1.1976114908854167, + "grad_norm": 13.42992877960205, + "learning_rate": 7.951032056006567e-06, + "loss": 4.89, + "step": 58865 + }, + { + "epoch": 1.1977132161458333, + "grad_norm": 20.772533416748047, + "learning_rate": 7.950709406009285e-06, + "loss": 4.8306, + "step": 58870 + }, + { + "epoch": 1.19781494140625, + "grad_norm": 15.551609992980957, + "learning_rate": 7.950386737157906e-06, + "loss": 5.0948, + "step": 58875 + }, + { + "epoch": 1.1979166666666667, + "grad_norm": 17.915042877197266, + "learning_rate": 7.950064049454493e-06, + "loss": 5.0625, + "step": 58880 + }, + { + "epoch": 1.1980183919270833, + "grad_norm": 15.343132019042969, + "learning_rate": 7.949741342901107e-06, + "loss": 4.8779, + "step": 58885 + }, + { + "epoch": 1.1981201171875, + "grad_norm": 21.866954803466797, + "learning_rate": 7.949418617499812e-06, + "loss": 4.9206, + "step": 58890 + }, + { + "epoch": 1.1982218424479167, + "grad_norm": 15.190711975097656, + "learning_rate": 7.949095873252665e-06, + "loss": 5.0067, + "step": 58895 + }, + { + "epoch": 1.1983235677083333, + "grad_norm": 43.838680267333984, + "learning_rate": 7.948773110161733e-06, + "loss": 5.1124, + "step": 58900 + }, + { + "epoch": 1.19842529296875, + "grad_norm": 20.825368881225586, + "learning_rate": 7.948450328229077e-06, + "loss": 4.9143, + "step": 58905 + }, + { + "epoch": 1.1985270182291667, + "grad_norm": 18.733896255493164, + "learning_rate": 7.948127527456759e-06, + "loss": 4.8657, + "step": 58910 + }, + { + "epoch": 1.1986287434895833, + "grad_norm": 14.804136276245117, + "learning_rate": 7.947804707846843e-06, + "loss": 4.8466, + "step": 58915 + }, + { + "epoch": 1.19873046875, + "grad_norm": 14.564234733581543, + "learning_rate": 7.94748186940139e-06, + "loss": 4.8806, + "step": 58920 + }, + { + "epoch": 1.1988321940104167, + "grad_norm": 26.830320358276367, + "learning_rate": 7.947159012122463e-06, + "loss": 5.2156, + "step": 58925 + }, + { + "epoch": 1.1989339192708333, + "grad_norm": 17.775487899780273, + "learning_rate": 7.946836136012126e-06, + "loss": 5.1355, + "step": 58930 + }, + { + "epoch": 1.19903564453125, + "grad_norm": 14.910673141479492, + "learning_rate": 7.946513241072443e-06, + "loss": 4.9071, + "step": 58935 + }, + { + "epoch": 1.1991373697916667, + "grad_norm": 16.188711166381836, + "learning_rate": 7.946190327305474e-06, + "loss": 4.7119, + "step": 58940 + }, + { + "epoch": 1.1992390950520833, + "grad_norm": 14.457450866699219, + "learning_rate": 7.945867394713285e-06, + "loss": 4.8671, + "step": 58945 + }, + { + "epoch": 1.1993408203125, + "grad_norm": 17.34372329711914, + "learning_rate": 7.945544443297936e-06, + "loss": 5.06, + "step": 58950 + }, + { + "epoch": 1.1994425455729167, + "grad_norm": 16.961746215820312, + "learning_rate": 7.945221473061495e-06, + "loss": 4.9819, + "step": 58955 + }, + { + "epoch": 1.1995442708333333, + "grad_norm": 24.917579650878906, + "learning_rate": 7.944898484006022e-06, + "loss": 5.3648, + "step": 58960 + }, + { + "epoch": 1.19964599609375, + "grad_norm": 18.359970092773438, + "learning_rate": 7.944575476133586e-06, + "loss": 5.0317, + "step": 58965 + }, + { + "epoch": 1.1997477213541667, + "grad_norm": 17.714296340942383, + "learning_rate": 7.944252449446246e-06, + "loss": 4.8713, + "step": 58970 + }, + { + "epoch": 1.1998494466145833, + "grad_norm": 11.567482948303223, + "learning_rate": 7.943929403946067e-06, + "loss": 4.7524, + "step": 58975 + }, + { + "epoch": 1.199951171875, + "grad_norm": 27.036100387573242, + "learning_rate": 7.943606339635112e-06, + "loss": 5.1933, + "step": 58980 + }, + { + "epoch": 1.2000528971354167, + "grad_norm": 17.8358097076416, + "learning_rate": 7.94328325651545e-06, + "loss": 4.7181, + "step": 58985 + }, + { + "epoch": 1.2001546223958333, + "grad_norm": 20.405834197998047, + "learning_rate": 7.942960154589141e-06, + "loss": 5.0738, + "step": 58990 + }, + { + "epoch": 1.20025634765625, + "grad_norm": 14.944464683532715, + "learning_rate": 7.94263703385825e-06, + "loss": 4.957, + "step": 58995 + }, + { + "epoch": 1.2003580729166667, + "grad_norm": 17.639694213867188, + "learning_rate": 7.942313894324844e-06, + "loss": 4.971, + "step": 59000 + }, + { + "epoch": 1.2004597981770833, + "grad_norm": 22.32814598083496, + "learning_rate": 7.941990735990986e-06, + "loss": 5.2261, + "step": 59005 + }, + { + "epoch": 1.2005615234375, + "grad_norm": 19.93135643005371, + "learning_rate": 7.94166755885874e-06, + "loss": 4.9006, + "step": 59010 + }, + { + "epoch": 1.2006632486979167, + "grad_norm": 15.253475189208984, + "learning_rate": 7.941344362930172e-06, + "loss": 5.0016, + "step": 59015 + }, + { + "epoch": 1.2007649739583333, + "grad_norm": 14.4833984375, + "learning_rate": 7.941021148207349e-06, + "loss": 4.7826, + "step": 59020 + }, + { + "epoch": 1.20086669921875, + "grad_norm": 19.63616180419922, + "learning_rate": 7.940697914692332e-06, + "loss": 4.6179, + "step": 59025 + }, + { + "epoch": 1.2009684244791667, + "grad_norm": 14.667572975158691, + "learning_rate": 7.940374662387191e-06, + "loss": 4.7546, + "step": 59030 + }, + { + "epoch": 1.2010701497395833, + "grad_norm": 18.75111198425293, + "learning_rate": 7.940051391293988e-06, + "loss": 4.8128, + "step": 59035 + }, + { + "epoch": 1.201171875, + "grad_norm": 18.486268997192383, + "learning_rate": 7.93972810141479e-06, + "loss": 4.7452, + "step": 59040 + }, + { + "epoch": 1.2012736002604167, + "grad_norm": 15.926276206970215, + "learning_rate": 7.939404792751664e-06, + "loss": 5.049, + "step": 59045 + }, + { + "epoch": 1.2013753255208333, + "grad_norm": 14.318572044372559, + "learning_rate": 7.939081465306673e-06, + "loss": 5.0124, + "step": 59050 + }, + { + "epoch": 1.20147705078125, + "grad_norm": 18.771066665649414, + "learning_rate": 7.938758119081885e-06, + "loss": 5.0141, + "step": 59055 + }, + { + "epoch": 1.2015787760416667, + "grad_norm": 20.254066467285156, + "learning_rate": 7.938434754079367e-06, + "loss": 4.9923, + "step": 59060 + }, + { + "epoch": 1.2016805013020833, + "grad_norm": 18.589916229248047, + "learning_rate": 7.938111370301181e-06, + "loss": 4.6872, + "step": 59065 + }, + { + "epoch": 1.2017822265625, + "grad_norm": 16.756196975708008, + "learning_rate": 7.937787967749398e-06, + "loss": 4.9188, + "step": 59070 + }, + { + "epoch": 1.2018839518229167, + "grad_norm": 20.02516746520996, + "learning_rate": 7.937464546426082e-06, + "loss": 4.9077, + "step": 59075 + }, + { + "epoch": 1.2019856770833333, + "grad_norm": 18.036577224731445, + "learning_rate": 7.937141106333299e-06, + "loss": 5.0772, + "step": 59080 + }, + { + "epoch": 1.20208740234375, + "grad_norm": 14.980390548706055, + "learning_rate": 7.936817647473116e-06, + "loss": 4.8223, + "step": 59085 + }, + { + "epoch": 1.2021891276041667, + "grad_norm": 14.067768096923828, + "learning_rate": 7.936494169847602e-06, + "loss": 4.8315, + "step": 59090 + }, + { + "epoch": 1.2022908528645833, + "grad_norm": 13.58788776397705, + "learning_rate": 7.936170673458824e-06, + "loss": 4.8863, + "step": 59095 + }, + { + "epoch": 1.202392578125, + "grad_norm": 19.53571128845215, + "learning_rate": 7.935847158308845e-06, + "loss": 5.0083, + "step": 59100 + }, + { + "epoch": 1.2024943033854167, + "grad_norm": 17.384607315063477, + "learning_rate": 7.935523624399734e-06, + "loss": 4.9081, + "step": 59105 + }, + { + "epoch": 1.2025960286458333, + "grad_norm": 17.93145751953125, + "learning_rate": 7.935200071733562e-06, + "loss": 5.0137, + "step": 59110 + }, + { + "epoch": 1.20269775390625, + "grad_norm": 20.57648277282715, + "learning_rate": 7.934876500312389e-06, + "loss": 4.9146, + "step": 59115 + }, + { + "epoch": 1.2027994791666667, + "grad_norm": 18.876596450805664, + "learning_rate": 7.934552910138292e-06, + "loss": 4.8188, + "step": 59120 + }, + { + "epoch": 1.2029012044270833, + "grad_norm": 18.039146423339844, + "learning_rate": 7.934229301213329e-06, + "loss": 5.2329, + "step": 59125 + }, + { + "epoch": 1.2030029296875, + "grad_norm": 14.499164581298828, + "learning_rate": 7.933905673539575e-06, + "loss": 4.7565, + "step": 59130 + }, + { + "epoch": 1.2031046549479167, + "grad_norm": 18.321578979492188, + "learning_rate": 7.933582027119095e-06, + "loss": 4.6219, + "step": 59135 + }, + { + "epoch": 1.2032063802083333, + "grad_norm": 21.15450096130371, + "learning_rate": 7.933258361953956e-06, + "loss": 4.8412, + "step": 59140 + }, + { + "epoch": 1.20330810546875, + "grad_norm": 19.000499725341797, + "learning_rate": 7.932934678046227e-06, + "loss": 4.8692, + "step": 59145 + }, + { + "epoch": 1.2034098307291667, + "grad_norm": 18.748626708984375, + "learning_rate": 7.932610975397976e-06, + "loss": 5.1109, + "step": 59150 + }, + { + "epoch": 1.2035115559895833, + "grad_norm": 18.308115005493164, + "learning_rate": 7.932287254011272e-06, + "loss": 4.7621, + "step": 59155 + }, + { + "epoch": 1.20361328125, + "grad_norm": 14.694340705871582, + "learning_rate": 7.931963513888186e-06, + "loss": 4.9432, + "step": 59160 + }, + { + "epoch": 1.2037150065104167, + "grad_norm": 19.669544219970703, + "learning_rate": 7.931639755030782e-06, + "loss": 4.9206, + "step": 59165 + }, + { + "epoch": 1.2038167317708333, + "grad_norm": 15.717430114746094, + "learning_rate": 7.93131597744113e-06, + "loss": 4.9829, + "step": 59170 + }, + { + "epoch": 1.20391845703125, + "grad_norm": 19.41700553894043, + "learning_rate": 7.9309921811213e-06, + "loss": 5.3019, + "step": 59175 + }, + { + "epoch": 1.2040201822916667, + "grad_norm": 17.56094741821289, + "learning_rate": 7.93066836607336e-06, + "loss": 5.0735, + "step": 59180 + }, + { + "epoch": 1.2041219075520833, + "grad_norm": 20.179195404052734, + "learning_rate": 7.930344532299378e-06, + "loss": 4.7823, + "step": 59185 + }, + { + "epoch": 1.2042236328125, + "grad_norm": 22.273496627807617, + "learning_rate": 7.930020679801428e-06, + "loss": 4.8828, + "step": 59190 + }, + { + "epoch": 1.2043253580729167, + "grad_norm": 17.858882904052734, + "learning_rate": 7.929696808581574e-06, + "loss": 4.9708, + "step": 59195 + }, + { + "epoch": 1.2044270833333333, + "grad_norm": 16.331077575683594, + "learning_rate": 7.92937291864189e-06, + "loss": 4.843, + "step": 59200 + }, + { + "epoch": 1.20452880859375, + "grad_norm": 17.30413818359375, + "learning_rate": 7.92904900998444e-06, + "loss": 5.0074, + "step": 59205 + }, + { + "epoch": 1.2046305338541667, + "grad_norm": 16.070552825927734, + "learning_rate": 7.928725082611295e-06, + "loss": 4.9792, + "step": 59210 + }, + { + "epoch": 1.2047322591145833, + "grad_norm": 22.82529067993164, + "learning_rate": 7.928401136524531e-06, + "loss": 4.8297, + "step": 59215 + }, + { + "epoch": 1.204833984375, + "grad_norm": 16.135480880737305, + "learning_rate": 7.92807717172621e-06, + "loss": 4.7545, + "step": 59220 + }, + { + "epoch": 1.2049357096354167, + "grad_norm": 20.227567672729492, + "learning_rate": 7.927753188218407e-06, + "loss": 5.2075, + "step": 59225 + }, + { + "epoch": 1.2050374348958333, + "grad_norm": 19.107433319091797, + "learning_rate": 7.92742918600319e-06, + "loss": 5.0657, + "step": 59230 + }, + { + "epoch": 1.20513916015625, + "grad_norm": 16.219820022583008, + "learning_rate": 7.92710516508263e-06, + "loss": 5.043, + "step": 59235 + }, + { + "epoch": 1.2052408854166667, + "grad_norm": 18.84449577331543, + "learning_rate": 7.926781125458796e-06, + "loss": 4.794, + "step": 59240 + }, + { + "epoch": 1.2053426106770833, + "grad_norm": 12.952081680297852, + "learning_rate": 7.92645706713376e-06, + "loss": 4.6701, + "step": 59245 + }, + { + "epoch": 1.2054443359375, + "grad_norm": 17.303544998168945, + "learning_rate": 7.926132990109593e-06, + "loss": 5.0085, + "step": 59250 + }, + { + "epoch": 1.2055460611979167, + "grad_norm": 20.065776824951172, + "learning_rate": 7.925808894388365e-06, + "loss": 5.0176, + "step": 59255 + }, + { + "epoch": 1.2056477864583333, + "grad_norm": 18.369976043701172, + "learning_rate": 7.925484779972148e-06, + "loss": 5.0759, + "step": 59260 + }, + { + "epoch": 1.20574951171875, + "grad_norm": 16.536222457885742, + "learning_rate": 7.925160646863011e-06, + "loss": 4.8415, + "step": 59265 + }, + { + "epoch": 1.2058512369791667, + "grad_norm": 16.10375213623047, + "learning_rate": 7.924836495063027e-06, + "loss": 4.552, + "step": 59270 + }, + { + "epoch": 1.2059529622395833, + "grad_norm": 20.962772369384766, + "learning_rate": 7.924512324574263e-06, + "loss": 4.9566, + "step": 59275 + }, + { + "epoch": 1.2060546875, + "grad_norm": 21.30280876159668, + "learning_rate": 7.924188135398798e-06, + "loss": 5.0289, + "step": 59280 + }, + { + "epoch": 1.2061564127604167, + "grad_norm": 19.345067977905273, + "learning_rate": 7.923863927538695e-06, + "loss": 5.1043, + "step": 59285 + }, + { + "epoch": 1.2062581380208333, + "grad_norm": 19.34067726135254, + "learning_rate": 7.923539700996032e-06, + "loss": 5.1275, + "step": 59290 + }, + { + "epoch": 1.20635986328125, + "grad_norm": 18.40644645690918, + "learning_rate": 7.923215455772879e-06, + "loss": 5.2179, + "step": 59295 + }, + { + "epoch": 1.2064615885416667, + "grad_norm": 18.969011306762695, + "learning_rate": 7.922891191871305e-06, + "loss": 4.8842, + "step": 59300 + }, + { + "epoch": 1.2065633138020833, + "grad_norm": 17.011018753051758, + "learning_rate": 7.922566909293385e-06, + "loss": 4.9, + "step": 59305 + }, + { + "epoch": 1.2066650390625, + "grad_norm": 22.683746337890625, + "learning_rate": 7.92224260804119e-06, + "loss": 4.8951, + "step": 59310 + }, + { + "epoch": 1.2067667643229167, + "grad_norm": 16.52255630493164, + "learning_rate": 7.921918288116794e-06, + "loss": 5.024, + "step": 59315 + }, + { + "epoch": 1.2068684895833333, + "grad_norm": 14.523539543151855, + "learning_rate": 7.921593949522267e-06, + "loss": 4.9327, + "step": 59320 + }, + { + "epoch": 1.20697021484375, + "grad_norm": 16.645416259765625, + "learning_rate": 7.921269592259682e-06, + "loss": 5.2283, + "step": 59325 + }, + { + "epoch": 1.2070719401041667, + "grad_norm": 17.50410270690918, + "learning_rate": 7.92094521633111e-06, + "loss": 4.688, + "step": 59330 + }, + { + "epoch": 1.2071736653645833, + "grad_norm": 23.327299118041992, + "learning_rate": 7.920620821738627e-06, + "loss": 5.052, + "step": 59335 + }, + { + "epoch": 1.207275390625, + "grad_norm": 16.854555130004883, + "learning_rate": 7.920296408484304e-06, + "loss": 5.0872, + "step": 59340 + }, + { + "epoch": 1.2073771158854167, + "grad_norm": 17.698408126831055, + "learning_rate": 7.919971976570214e-06, + "loss": 4.8663, + "step": 59345 + }, + { + "epoch": 1.2074788411458333, + "grad_norm": 12.98280143737793, + "learning_rate": 7.919647525998429e-06, + "loss": 5.2996, + "step": 59350 + }, + { + "epoch": 1.20758056640625, + "grad_norm": 18.107999801635742, + "learning_rate": 7.919323056771025e-06, + "loss": 4.8187, + "step": 59355 + }, + { + "epoch": 1.2076822916666667, + "grad_norm": 18.4349308013916, + "learning_rate": 7.918998568890071e-06, + "loss": 4.9245, + "step": 59360 + }, + { + "epoch": 1.2077840169270833, + "grad_norm": 15.079440116882324, + "learning_rate": 7.918674062357644e-06, + "loss": 4.961, + "step": 59365 + }, + { + "epoch": 1.2078857421875, + "grad_norm": 15.284764289855957, + "learning_rate": 7.918349537175816e-06, + "loss": 4.8515, + "step": 59370 + }, + { + "epoch": 1.2079874674479167, + "grad_norm": 16.804485321044922, + "learning_rate": 7.91802499334666e-06, + "loss": 4.9701, + "step": 59375 + }, + { + "epoch": 1.2080891927083333, + "grad_norm": 22.79857063293457, + "learning_rate": 7.917700430872252e-06, + "loss": 4.737, + "step": 59380 + }, + { + "epoch": 1.20819091796875, + "grad_norm": 17.412879943847656, + "learning_rate": 7.917375849754665e-06, + "loss": 4.8922, + "step": 59385 + }, + { + "epoch": 1.2082926432291667, + "grad_norm": 17.5612850189209, + "learning_rate": 7.91705124999597e-06, + "loss": 4.9652, + "step": 59390 + }, + { + "epoch": 1.2083943684895833, + "grad_norm": 16.443143844604492, + "learning_rate": 7.916726631598246e-06, + "loss": 5.1805, + "step": 59395 + }, + { + "epoch": 1.20849609375, + "grad_norm": 14.216609001159668, + "learning_rate": 7.916401994563563e-06, + "loss": 4.7821, + "step": 59400 + }, + { + "epoch": 1.2085978190104167, + "grad_norm": 21.633928298950195, + "learning_rate": 7.916077338893998e-06, + "loss": 4.8336, + "step": 59405 + }, + { + "epoch": 1.2086995442708333, + "grad_norm": 14.012299537658691, + "learning_rate": 7.915752664591625e-06, + "loss": 4.8989, + "step": 59410 + }, + { + "epoch": 1.20880126953125, + "grad_norm": 14.195463180541992, + "learning_rate": 7.915427971658517e-06, + "loss": 4.9076, + "step": 59415 + }, + { + "epoch": 1.2089029947916667, + "grad_norm": 16.14631462097168, + "learning_rate": 7.91510326009675e-06, + "loss": 4.7903, + "step": 59420 + }, + { + "epoch": 1.2090047200520833, + "grad_norm": 45.559391021728516, + "learning_rate": 7.914778529908398e-06, + "loss": 4.7327, + "step": 59425 + }, + { + "epoch": 1.2091064453125, + "grad_norm": 19.15484046936035, + "learning_rate": 7.914453781095539e-06, + "loss": 4.8418, + "step": 59430 + }, + { + "epoch": 1.2092081705729167, + "grad_norm": 15.416104316711426, + "learning_rate": 7.914129013660243e-06, + "loss": 5.2031, + "step": 59435 + }, + { + "epoch": 1.2093098958333333, + "grad_norm": 19.43401336669922, + "learning_rate": 7.913804227604586e-06, + "loss": 5.0453, + "step": 59440 + }, + { + "epoch": 1.20941162109375, + "grad_norm": 18.81270408630371, + "learning_rate": 7.913479422930648e-06, + "loss": 5.1388, + "step": 59445 + }, + { + "epoch": 1.2095133463541667, + "grad_norm": 15.41551399230957, + "learning_rate": 7.913154599640502e-06, + "loss": 4.8545, + "step": 59450 + }, + { + "epoch": 1.2096150716145833, + "grad_norm": 17.498376846313477, + "learning_rate": 7.91282975773622e-06, + "loss": 4.7442, + "step": 59455 + }, + { + "epoch": 1.209716796875, + "grad_norm": 17.483301162719727, + "learning_rate": 7.912504897219883e-06, + "loss": 4.921, + "step": 59460 + }, + { + "epoch": 1.2098185221354167, + "grad_norm": 18.263708114624023, + "learning_rate": 7.912180018093563e-06, + "loss": 4.7693, + "step": 59465 + }, + { + "epoch": 1.2099202473958333, + "grad_norm": 15.366487503051758, + "learning_rate": 7.911855120359338e-06, + "loss": 4.72, + "step": 59470 + }, + { + "epoch": 1.21002197265625, + "grad_norm": 15.248455047607422, + "learning_rate": 7.911530204019282e-06, + "loss": 4.8512, + "step": 59475 + }, + { + "epoch": 1.2101236979166667, + "grad_norm": 18.60976791381836, + "learning_rate": 7.911205269075472e-06, + "loss": 4.8579, + "step": 59480 + }, + { + "epoch": 1.2102254231770833, + "grad_norm": 17.745595932006836, + "learning_rate": 7.910880315529986e-06, + "loss": 4.934, + "step": 59485 + }, + { + "epoch": 1.2103271484375, + "grad_norm": 15.557550430297852, + "learning_rate": 7.910555343384897e-06, + "loss": 4.9299, + "step": 59490 + }, + { + "epoch": 1.2104288736979167, + "grad_norm": 17.94373321533203, + "learning_rate": 7.910230352642286e-06, + "loss": 4.6162, + "step": 59495 + }, + { + "epoch": 1.2105305989583333, + "grad_norm": 24.595497131347656, + "learning_rate": 7.909905343304224e-06, + "loss": 4.9823, + "step": 59500 + }, + { + "epoch": 1.21063232421875, + "grad_norm": 15.467735290527344, + "learning_rate": 7.90958031537279e-06, + "loss": 4.7319, + "step": 59505 + }, + { + "epoch": 1.2107340494791667, + "grad_norm": 18.427108764648438, + "learning_rate": 7.909255268850064e-06, + "loss": 5.263, + "step": 59510 + }, + { + "epoch": 1.2108357747395833, + "grad_norm": 19.844884872436523, + "learning_rate": 7.908930203738119e-06, + "loss": 5.0752, + "step": 59515 + }, + { + "epoch": 1.2109375, + "grad_norm": 20.55388069152832, + "learning_rate": 7.908605120039035e-06, + "loss": 5.0077, + "step": 59520 + }, + { + "epoch": 1.2110392252604167, + "grad_norm": 17.37554359436035, + "learning_rate": 7.908280017754885e-06, + "loss": 5.0046, + "step": 59525 + }, + { + "epoch": 1.2111409505208333, + "grad_norm": 15.331778526306152, + "learning_rate": 7.90795489688775e-06, + "loss": 4.9242, + "step": 59530 + }, + { + "epoch": 1.21124267578125, + "grad_norm": 13.457752227783203, + "learning_rate": 7.907629757439707e-06, + "loss": 5.0349, + "step": 59535 + }, + { + "epoch": 1.2113444010416667, + "grad_norm": 20.643802642822266, + "learning_rate": 7.907304599412835e-06, + "loss": 4.7536, + "step": 59540 + }, + { + "epoch": 1.2114461263020833, + "grad_norm": 16.217912673950195, + "learning_rate": 7.906979422809206e-06, + "loss": 4.8614, + "step": 59545 + }, + { + "epoch": 1.2115478515625, + "grad_norm": 15.720159530639648, + "learning_rate": 7.906654227630903e-06, + "loss": 4.8611, + "step": 59550 + }, + { + "epoch": 1.2116495768229167, + "grad_norm": 17.797842025756836, + "learning_rate": 7.90632901388e-06, + "loss": 5.0448, + "step": 59555 + }, + { + "epoch": 1.2117513020833333, + "grad_norm": 17.67546844482422, + "learning_rate": 7.90600378155858e-06, + "loss": 4.8324, + "step": 59560 + }, + { + "epoch": 1.21185302734375, + "grad_norm": 17.582679748535156, + "learning_rate": 7.905678530668717e-06, + "loss": 5.204, + "step": 59565 + }, + { + "epoch": 1.2119547526041667, + "grad_norm": 17.030338287353516, + "learning_rate": 7.905353261212491e-06, + "loss": 5.1832, + "step": 59570 + }, + { + "epoch": 1.2120564778645833, + "grad_norm": 18.000886917114258, + "learning_rate": 7.90502797319198e-06, + "loss": 4.8112, + "step": 59575 + }, + { + "epoch": 1.212158203125, + "grad_norm": 22.1180362701416, + "learning_rate": 7.90470266660926e-06, + "loss": 4.9098, + "step": 59580 + }, + { + "epoch": 1.2122599283854167, + "grad_norm": 19.7182559967041, + "learning_rate": 7.904377341466415e-06, + "loss": 5.1692, + "step": 59585 + }, + { + "epoch": 1.2123616536458333, + "grad_norm": 25.12578773498535, + "learning_rate": 7.90405199776552e-06, + "loss": 5.0117, + "step": 59590 + }, + { + "epoch": 1.21246337890625, + "grad_norm": 16.68218421936035, + "learning_rate": 7.903726635508654e-06, + "loss": 4.8111, + "step": 59595 + }, + { + "epoch": 1.2125651041666667, + "grad_norm": 17.576398849487305, + "learning_rate": 7.903401254697899e-06, + "loss": 4.8946, + "step": 59600 + }, + { + "epoch": 1.2126668294270833, + "grad_norm": 17.64992904663086, + "learning_rate": 7.903075855335328e-06, + "loss": 4.8167, + "step": 59605 + }, + { + "epoch": 1.2127685546875, + "grad_norm": 12.619423866271973, + "learning_rate": 7.902750437423027e-06, + "loss": 5.0347, + "step": 59610 + }, + { + "epoch": 1.2128702799479167, + "grad_norm": 22.041086196899414, + "learning_rate": 7.90242500096307e-06, + "loss": 5.3382, + "step": 59615 + }, + { + "epoch": 1.2129720052083333, + "grad_norm": 20.503585815429688, + "learning_rate": 7.902099545957538e-06, + "loss": 5.02, + "step": 59620 + }, + { + "epoch": 1.21307373046875, + "grad_norm": 16.753849029541016, + "learning_rate": 7.901774072408512e-06, + "loss": 4.9514, + "step": 59625 + }, + { + "epoch": 1.2131754557291667, + "grad_norm": 17.59686279296875, + "learning_rate": 7.901448580318073e-06, + "loss": 5.0088, + "step": 59630 + }, + { + "epoch": 1.2132771809895833, + "grad_norm": 21.881216049194336, + "learning_rate": 7.901123069688295e-06, + "loss": 4.6921, + "step": 59635 + }, + { + "epoch": 1.21337890625, + "grad_norm": 15.020279884338379, + "learning_rate": 7.900797540521264e-06, + "loss": 4.9601, + "step": 59640 + }, + { + "epoch": 1.2134806315104167, + "grad_norm": 21.113725662231445, + "learning_rate": 7.900471992819058e-06, + "loss": 5.0562, + "step": 59645 + }, + { + "epoch": 1.2135823567708333, + "grad_norm": 15.74809741973877, + "learning_rate": 7.900146426583754e-06, + "loss": 5.1629, + "step": 59650 + }, + { + "epoch": 1.21368408203125, + "grad_norm": 20.479938507080078, + "learning_rate": 7.899820841817438e-06, + "loss": 5.0229, + "step": 59655 + }, + { + "epoch": 1.2137858072916667, + "grad_norm": 26.059612274169922, + "learning_rate": 7.899495238522185e-06, + "loss": 4.8982, + "step": 59660 + }, + { + "epoch": 1.2138875325520833, + "grad_norm": 15.065985679626465, + "learning_rate": 7.89916961670008e-06, + "loss": 4.9434, + "step": 59665 + }, + { + "epoch": 1.2139892578125, + "grad_norm": 16.55508804321289, + "learning_rate": 7.8988439763532e-06, + "loss": 4.8346, + "step": 59670 + }, + { + "epoch": 1.2140909830729167, + "grad_norm": 15.830881118774414, + "learning_rate": 7.898518317483627e-06, + "loss": 5.1526, + "step": 59675 + }, + { + "epoch": 1.2141927083333333, + "grad_norm": 13.416595458984375, + "learning_rate": 7.898192640093443e-06, + "loss": 4.922, + "step": 59680 + }, + { + "epoch": 1.21429443359375, + "grad_norm": 17.065200805664062, + "learning_rate": 7.897866944184727e-06, + "loss": 5.0789, + "step": 59685 + }, + { + "epoch": 1.2143961588541667, + "grad_norm": 17.03451919555664, + "learning_rate": 7.897541229759563e-06, + "loss": 4.9133, + "step": 59690 + }, + { + "epoch": 1.2144978841145833, + "grad_norm": 16.751510620117188, + "learning_rate": 7.897215496820027e-06, + "loss": 4.9826, + "step": 59695 + }, + { + "epoch": 1.214599609375, + "grad_norm": 17.248191833496094, + "learning_rate": 7.896889745368206e-06, + "loss": 4.9743, + "step": 59700 + }, + { + "epoch": 1.2147013346354167, + "grad_norm": 18.85099983215332, + "learning_rate": 7.89656397540618e-06, + "loss": 4.8833, + "step": 59705 + }, + { + "epoch": 1.2148030598958333, + "grad_norm": 13.871967315673828, + "learning_rate": 7.896238186936027e-06, + "loss": 4.809, + "step": 59710 + }, + { + "epoch": 1.21490478515625, + "grad_norm": 18.716567993164062, + "learning_rate": 7.895912379959833e-06, + "loss": 4.8915, + "step": 59715 + }, + { + "epoch": 1.2150065104166667, + "grad_norm": 20.316741943359375, + "learning_rate": 7.895586554479678e-06, + "loss": 4.9473, + "step": 59720 + }, + { + "epoch": 1.2151082356770833, + "grad_norm": 17.26836395263672, + "learning_rate": 7.895260710497644e-06, + "loss": 4.8948, + "step": 59725 + }, + { + "epoch": 1.2152099609375, + "grad_norm": 21.723886489868164, + "learning_rate": 7.894934848015815e-06, + "loss": 5.0967, + "step": 59730 + }, + { + "epoch": 1.2153116861979167, + "grad_norm": 18.735151290893555, + "learning_rate": 7.89460896703627e-06, + "loss": 5.0631, + "step": 59735 + }, + { + "epoch": 1.2154134114583333, + "grad_norm": 15.730350494384766, + "learning_rate": 7.894283067561091e-06, + "loss": 4.9374, + "step": 59740 + }, + { + "epoch": 1.21551513671875, + "grad_norm": 19.7238826751709, + "learning_rate": 7.893957149592364e-06, + "loss": 4.8392, + "step": 59745 + }, + { + "epoch": 1.2156168619791667, + "grad_norm": 16.354110717773438, + "learning_rate": 7.893631213132167e-06, + "loss": 4.7771, + "step": 59750 + }, + { + "epoch": 1.2157185872395833, + "grad_norm": 23.13250160217285, + "learning_rate": 7.893305258182588e-06, + "loss": 5.0492, + "step": 59755 + }, + { + "epoch": 1.2158203125, + "grad_norm": 22.108383178710938, + "learning_rate": 7.892979284745705e-06, + "loss": 4.8536, + "step": 59760 + }, + { + "epoch": 1.2159220377604167, + "grad_norm": 20.683116912841797, + "learning_rate": 7.892653292823603e-06, + "loss": 4.7404, + "step": 59765 + }, + { + "epoch": 1.2160237630208333, + "grad_norm": 15.574682235717773, + "learning_rate": 7.892327282418365e-06, + "loss": 4.8846, + "step": 59770 + }, + { + "epoch": 1.21612548828125, + "grad_norm": 24.22710609436035, + "learning_rate": 7.892001253532073e-06, + "loss": 5.0811, + "step": 59775 + }, + { + "epoch": 1.2162272135416667, + "grad_norm": 17.750568389892578, + "learning_rate": 7.891675206166814e-06, + "loss": 5.1591, + "step": 59780 + }, + { + "epoch": 1.2163289388020833, + "grad_norm": 19.675251007080078, + "learning_rate": 7.891349140324666e-06, + "loss": 4.8422, + "step": 59785 + }, + { + "epoch": 1.2164306640625, + "grad_norm": 19.103450775146484, + "learning_rate": 7.891023056007713e-06, + "loss": 5.0692, + "step": 59790 + }, + { + "epoch": 1.2165323893229167, + "grad_norm": 22.59267234802246, + "learning_rate": 7.890696953218044e-06, + "loss": 5.2742, + "step": 59795 + }, + { + "epoch": 1.2166341145833333, + "grad_norm": 19.2728214263916, + "learning_rate": 7.890370831957737e-06, + "loss": 4.8552, + "step": 59800 + }, + { + "epoch": 1.21673583984375, + "grad_norm": 21.557802200317383, + "learning_rate": 7.890044692228878e-06, + "loss": 5.001, + "step": 59805 + }, + { + "epoch": 1.2168375651041667, + "grad_norm": 16.017620086669922, + "learning_rate": 7.889718534033552e-06, + "loss": 4.6758, + "step": 59810 + }, + { + "epoch": 1.2169392903645833, + "grad_norm": 16.257932662963867, + "learning_rate": 7.889392357373842e-06, + "loss": 4.6927, + "step": 59815 + }, + { + "epoch": 1.217041015625, + "grad_norm": 17.262346267700195, + "learning_rate": 7.88906616225183e-06, + "loss": 4.945, + "step": 59820 + }, + { + "epoch": 1.2171427408854167, + "grad_norm": 16.734777450561523, + "learning_rate": 7.888739948669604e-06, + "loss": 4.8226, + "step": 59825 + }, + { + "epoch": 1.2172444661458333, + "grad_norm": 16.763540267944336, + "learning_rate": 7.888413716629246e-06, + "loss": 4.9192, + "step": 59830 + }, + { + "epoch": 1.21734619140625, + "grad_norm": 17.14814567565918, + "learning_rate": 7.888087466132842e-06, + "loss": 4.9385, + "step": 59835 + }, + { + "epoch": 1.2174479166666667, + "grad_norm": 19.03131103515625, + "learning_rate": 7.887761197182474e-06, + "loss": 4.9552, + "step": 59840 + }, + { + "epoch": 1.2175496419270833, + "grad_norm": 17.026535034179688, + "learning_rate": 7.887434909780231e-06, + "loss": 5.1817, + "step": 59845 + }, + { + "epoch": 1.2176513671875, + "grad_norm": 21.830785751342773, + "learning_rate": 7.887108603928196e-06, + "loss": 4.8353, + "step": 59850 + }, + { + "epoch": 1.2177530924479167, + "grad_norm": 20.65592384338379, + "learning_rate": 7.88678227962845e-06, + "loss": 4.9215, + "step": 59855 + }, + { + "epoch": 1.2178548177083333, + "grad_norm": 14.464929580688477, + "learning_rate": 7.886455936883087e-06, + "loss": 4.8834, + "step": 59860 + }, + { + "epoch": 1.21795654296875, + "grad_norm": 12.75761604309082, + "learning_rate": 7.886129575694182e-06, + "loss": 4.8426, + "step": 59865 + }, + { + "epoch": 1.2180582682291667, + "grad_norm": 18.858257293701172, + "learning_rate": 7.885803196063827e-06, + "loss": 5.2661, + "step": 59870 + }, + { + "epoch": 1.2181599934895833, + "grad_norm": 17.410091400146484, + "learning_rate": 7.885476797994106e-06, + "loss": 4.7665, + "step": 59875 + }, + { + "epoch": 1.21826171875, + "grad_norm": 16.496091842651367, + "learning_rate": 7.885150381487104e-06, + "loss": 4.948, + "step": 59880 + }, + { + "epoch": 1.2183634440104167, + "grad_norm": 21.039915084838867, + "learning_rate": 7.884823946544906e-06, + "loss": 5.0451, + "step": 59885 + }, + { + "epoch": 1.2184651692708333, + "grad_norm": 16.4070987701416, + "learning_rate": 7.8844974931696e-06, + "loss": 5.1448, + "step": 59890 + }, + { + "epoch": 1.21856689453125, + "grad_norm": 15.334725379943848, + "learning_rate": 7.88417102136327e-06, + "loss": 4.9456, + "step": 59895 + }, + { + "epoch": 1.2186686197916667, + "grad_norm": 15.961052894592285, + "learning_rate": 7.883844531128002e-06, + "loss": 4.6886, + "step": 59900 + }, + { + "epoch": 1.2187703450520833, + "grad_norm": 13.907537460327148, + "learning_rate": 7.883518022465883e-06, + "loss": 5.1649, + "step": 59905 + }, + { + "epoch": 1.2188720703125, + "grad_norm": 18.360721588134766, + "learning_rate": 7.883191495379001e-06, + "loss": 4.8881, + "step": 59910 + }, + { + "epoch": 1.2189737955729167, + "grad_norm": 13.561196327209473, + "learning_rate": 7.88286494986944e-06, + "loss": 4.8099, + "step": 59915 + }, + { + "epoch": 1.2190755208333333, + "grad_norm": 15.914854049682617, + "learning_rate": 7.882538385939285e-06, + "loss": 4.8266, + "step": 59920 + }, + { + "epoch": 1.21917724609375, + "grad_norm": 17.402965545654297, + "learning_rate": 7.882211803590627e-06, + "loss": 4.9189, + "step": 59925 + }, + { + "epoch": 1.2192789713541667, + "grad_norm": 18.91320037841797, + "learning_rate": 7.881885202825552e-06, + "loss": 4.9751, + "step": 59930 + }, + { + "epoch": 1.2193806966145833, + "grad_norm": 16.431440353393555, + "learning_rate": 7.881558583646142e-06, + "loss": 4.9874, + "step": 59935 + }, + { + "epoch": 1.219482421875, + "grad_norm": 18.22421646118164, + "learning_rate": 7.881231946054488e-06, + "loss": 4.736, + "step": 59940 + }, + { + "epoch": 1.2195841471354167, + "grad_norm": 21.514820098876953, + "learning_rate": 7.880905290052677e-06, + "loss": 4.7639, + "step": 59945 + }, + { + "epoch": 1.2196858723958333, + "grad_norm": 20.715805053710938, + "learning_rate": 7.880578615642796e-06, + "loss": 4.9955, + "step": 59950 + }, + { + "epoch": 1.21978759765625, + "grad_norm": 17.875131607055664, + "learning_rate": 7.880251922826933e-06, + "loss": 5.0047, + "step": 59955 + }, + { + "epoch": 1.2198893229166667, + "grad_norm": 20.35540008544922, + "learning_rate": 7.879925211607174e-06, + "loss": 5.029, + "step": 59960 + }, + { + "epoch": 1.2199910481770833, + "grad_norm": 16.431604385375977, + "learning_rate": 7.879598481985606e-06, + "loss": 5.1124, + "step": 59965 + }, + { + "epoch": 1.2200927734375, + "grad_norm": 16.655963897705078, + "learning_rate": 7.879271733964318e-06, + "loss": 5.1402, + "step": 59970 + }, + { + "epoch": 1.2201944986979167, + "grad_norm": 13.121526718139648, + "learning_rate": 7.878944967545397e-06, + "loss": 4.9142, + "step": 59975 + }, + { + "epoch": 1.2202962239583333, + "grad_norm": 15.498088836669922, + "learning_rate": 7.878618182730933e-06, + "loss": 4.8278, + "step": 59980 + }, + { + "epoch": 1.22039794921875, + "grad_norm": 25.484710693359375, + "learning_rate": 7.878291379523011e-06, + "loss": 4.9482, + "step": 59985 + }, + { + "epoch": 1.2204996744791667, + "grad_norm": 22.675743103027344, + "learning_rate": 7.877964557923722e-06, + "loss": 4.8929, + "step": 59990 + }, + { + "epoch": 1.2206013997395833, + "grad_norm": 17.618915557861328, + "learning_rate": 7.877637717935153e-06, + "loss": 4.8604, + "step": 59995 + }, + { + "epoch": 1.220703125, + "grad_norm": 17.718473434448242, + "learning_rate": 7.877310859559395e-06, + "loss": 4.7694, + "step": 60000 + }, + { + "epoch": 1.2208048502604167, + "grad_norm": 15.587076187133789, + "learning_rate": 7.87698398279853e-06, + "loss": 4.8534, + "step": 60005 + }, + { + "epoch": 1.2209065755208333, + "grad_norm": 16.668075561523438, + "learning_rate": 7.876657087654652e-06, + "loss": 5.1826, + "step": 60010 + }, + { + "epoch": 1.22100830078125, + "grad_norm": 18.46328353881836, + "learning_rate": 7.87633017412985e-06, + "loss": 4.8624, + "step": 60015 + }, + { + "epoch": 1.2211100260416667, + "grad_norm": 17.518823623657227, + "learning_rate": 7.876003242226209e-06, + "loss": 4.7262, + "step": 60020 + }, + { + "epoch": 1.2212117513020833, + "grad_norm": 22.960506439208984, + "learning_rate": 7.87567629194582e-06, + "loss": 4.8964, + "step": 60025 + }, + { + "epoch": 1.2213134765625, + "grad_norm": 20.16040802001953, + "learning_rate": 7.875349323290775e-06, + "loss": 5.0061, + "step": 60030 + }, + { + "epoch": 1.2214152018229167, + "grad_norm": 16.944520950317383, + "learning_rate": 7.875022336263158e-06, + "loss": 4.8289, + "step": 60035 + }, + { + "epoch": 1.2215169270833333, + "grad_norm": 14.360456466674805, + "learning_rate": 7.874695330865061e-06, + "loss": 4.9599, + "step": 60040 + }, + { + "epoch": 1.22161865234375, + "grad_norm": 14.48867416381836, + "learning_rate": 7.874368307098574e-06, + "loss": 4.7934, + "step": 60045 + }, + { + "epoch": 1.2217203776041667, + "grad_norm": 16.19363021850586, + "learning_rate": 7.874041264965785e-06, + "loss": 4.9448, + "step": 60050 + }, + { + "epoch": 1.2218221028645833, + "grad_norm": 23.604068756103516, + "learning_rate": 7.873714204468787e-06, + "loss": 5.1142, + "step": 60055 + }, + { + "epoch": 1.221923828125, + "grad_norm": 21.076953887939453, + "learning_rate": 7.873387125609666e-06, + "loss": 4.9265, + "step": 60060 + }, + { + "epoch": 1.2220255533854167, + "grad_norm": 13.600797653198242, + "learning_rate": 7.873060028390516e-06, + "loss": 4.8279, + "step": 60065 + }, + { + "epoch": 1.2221272786458333, + "grad_norm": 19.712129592895508, + "learning_rate": 7.872732912813422e-06, + "loss": 5.349, + "step": 60070 + }, + { + "epoch": 1.22222900390625, + "grad_norm": 17.494279861450195, + "learning_rate": 7.872405778880476e-06, + "loss": 5.0477, + "step": 60075 + }, + { + "epoch": 1.2223307291666667, + "grad_norm": 19.518970489501953, + "learning_rate": 7.87207862659377e-06, + "loss": 4.8744, + "step": 60080 + }, + { + "epoch": 1.2224324544270833, + "grad_norm": 16.64456558227539, + "learning_rate": 7.871751455955393e-06, + "loss": 4.7161, + "step": 60085 + }, + { + "epoch": 1.2225341796875, + "grad_norm": 19.601259231567383, + "learning_rate": 7.871424266967436e-06, + "loss": 4.8525, + "step": 60090 + }, + { + "epoch": 1.2226359049479167, + "grad_norm": 22.656496047973633, + "learning_rate": 7.871097059631989e-06, + "loss": 4.9491, + "step": 60095 + }, + { + "epoch": 1.2227376302083333, + "grad_norm": 19.113365173339844, + "learning_rate": 7.870769833951143e-06, + "loss": 5.2568, + "step": 60100 + }, + { + "epoch": 1.22283935546875, + "grad_norm": 16.155757904052734, + "learning_rate": 7.87044258992699e-06, + "loss": 5.0549, + "step": 60105 + }, + { + "epoch": 1.2229410807291667, + "grad_norm": 23.112401962280273, + "learning_rate": 7.870115327561619e-06, + "loss": 4.9751, + "step": 60110 + }, + { + "epoch": 1.2230428059895833, + "grad_norm": 17.079580307006836, + "learning_rate": 7.869788046857123e-06, + "loss": 5.1872, + "step": 60115 + }, + { + "epoch": 1.22314453125, + "grad_norm": 14.15616226196289, + "learning_rate": 7.869460747815591e-06, + "loss": 5.448, + "step": 60120 + }, + { + "epoch": 1.2232462565104167, + "grad_norm": 15.844379425048828, + "learning_rate": 7.869133430439117e-06, + "loss": 4.8759, + "step": 60125 + }, + { + "epoch": 1.2233479817708333, + "grad_norm": 21.117816925048828, + "learning_rate": 7.868806094729791e-06, + "loss": 4.9476, + "step": 60130 + }, + { + "epoch": 1.22344970703125, + "grad_norm": 17.47467803955078, + "learning_rate": 7.868478740689706e-06, + "loss": 4.8591, + "step": 60135 + }, + { + "epoch": 1.2235514322916667, + "grad_norm": 13.68593978881836, + "learning_rate": 7.86815136832095e-06, + "loss": 5.0201, + "step": 60140 + }, + { + "epoch": 1.2236531575520833, + "grad_norm": 20.538585662841797, + "learning_rate": 7.86782397762562e-06, + "loss": 4.9684, + "step": 60145 + }, + { + "epoch": 1.2237548828125, + "grad_norm": 14.631121635437012, + "learning_rate": 7.867496568605804e-06, + "loss": 5.033, + "step": 60150 + }, + { + "epoch": 1.2238566080729167, + "grad_norm": 12.307855606079102, + "learning_rate": 7.867169141263593e-06, + "loss": 4.7896, + "step": 60155 + }, + { + "epoch": 1.2239583333333333, + "grad_norm": 17.61899185180664, + "learning_rate": 7.866841695601084e-06, + "loss": 4.8357, + "step": 60160 + }, + { + "epoch": 1.22406005859375, + "grad_norm": 18.15283203125, + "learning_rate": 7.866514231620365e-06, + "loss": 4.8431, + "step": 60165 + }, + { + "epoch": 1.2241617838541667, + "grad_norm": 22.412324905395508, + "learning_rate": 7.86618674932353e-06, + "loss": 5.2983, + "step": 60170 + }, + { + "epoch": 1.2242635091145833, + "grad_norm": 12.823738098144531, + "learning_rate": 7.865859248712673e-06, + "loss": 4.982, + "step": 60175 + }, + { + "epoch": 1.224365234375, + "grad_norm": 14.412196159362793, + "learning_rate": 7.865531729789883e-06, + "loss": 5.2801, + "step": 60180 + }, + { + "epoch": 1.2244669596354167, + "grad_norm": 23.072839736938477, + "learning_rate": 7.865204192557258e-06, + "loss": 5.0153, + "step": 60185 + }, + { + "epoch": 1.2245686848958333, + "grad_norm": 17.073383331298828, + "learning_rate": 7.864876637016885e-06, + "loss": 4.9068, + "step": 60190 + }, + { + "epoch": 1.22467041015625, + "grad_norm": 16.10546112060547, + "learning_rate": 7.864549063170861e-06, + "loss": 5.0465, + "step": 60195 + }, + { + "epoch": 1.2247721354166667, + "grad_norm": 16.551416397094727, + "learning_rate": 7.864221471021279e-06, + "loss": 4.9566, + "step": 60200 + }, + { + "epoch": 1.2248738606770833, + "grad_norm": 20.751903533935547, + "learning_rate": 7.863893860570228e-06, + "loss": 4.8562, + "step": 60205 + }, + { + "epoch": 1.2249755859375, + "grad_norm": 14.575098037719727, + "learning_rate": 7.863566231819806e-06, + "loss": 4.8699, + "step": 60210 + }, + { + "epoch": 1.2250773111979167, + "grad_norm": 21.011262893676758, + "learning_rate": 7.863238584772104e-06, + "loss": 5.1251, + "step": 60215 + }, + { + "epoch": 1.2251790364583333, + "grad_norm": 19.526247024536133, + "learning_rate": 7.862910919429219e-06, + "loss": 4.9948, + "step": 60220 + }, + { + "epoch": 1.22528076171875, + "grad_norm": 14.640229225158691, + "learning_rate": 7.862583235793239e-06, + "loss": 5.02, + "step": 60225 + }, + { + "epoch": 1.2253824869791667, + "grad_norm": 20.259445190429688, + "learning_rate": 7.862255533866262e-06, + "loss": 5.0409, + "step": 60230 + }, + { + "epoch": 1.2254842122395833, + "grad_norm": 18.108036041259766, + "learning_rate": 7.86192781365038e-06, + "loss": 5.0405, + "step": 60235 + }, + { + "epoch": 1.2255859375, + "grad_norm": 15.3093843460083, + "learning_rate": 7.86160007514769e-06, + "loss": 4.9562, + "step": 60240 + }, + { + "epoch": 1.2256876627604167, + "grad_norm": 17.23712921142578, + "learning_rate": 7.861272318360281e-06, + "loss": 5.0092, + "step": 60245 + }, + { + "epoch": 1.2257893880208333, + "grad_norm": 11.571666717529297, + "learning_rate": 7.860944543290253e-06, + "loss": 4.7954, + "step": 60250 + }, + { + "epoch": 1.22589111328125, + "grad_norm": 20.467254638671875, + "learning_rate": 7.860616749939694e-06, + "loss": 5.1053, + "step": 60255 + }, + { + "epoch": 1.2259928385416667, + "grad_norm": 16.67926025390625, + "learning_rate": 7.860288938310705e-06, + "loss": 4.9236, + "step": 60260 + }, + { + "epoch": 1.2260945638020833, + "grad_norm": 17.92586326599121, + "learning_rate": 7.859961108405376e-06, + "loss": 4.8997, + "step": 60265 + }, + { + "epoch": 1.2261962890625, + "grad_norm": 23.948339462280273, + "learning_rate": 7.859633260225805e-06, + "loss": 4.8287, + "step": 60270 + }, + { + "epoch": 1.2262980143229167, + "grad_norm": 16.285289764404297, + "learning_rate": 7.859305393774085e-06, + "loss": 5.2748, + "step": 60275 + }, + { + "epoch": 1.2263997395833333, + "grad_norm": 13.532407760620117, + "learning_rate": 7.85897750905231e-06, + "loss": 5.0, + "step": 60280 + }, + { + "epoch": 1.22650146484375, + "grad_norm": 22.54188346862793, + "learning_rate": 7.858649606062577e-06, + "loss": 5.009, + "step": 60285 + }, + { + "epoch": 1.2266031901041667, + "grad_norm": 12.566372871398926, + "learning_rate": 7.858321684806981e-06, + "loss": 4.9024, + "step": 60290 + }, + { + "epoch": 1.2267049153645833, + "grad_norm": 18.681652069091797, + "learning_rate": 7.857993745287615e-06, + "loss": 4.9797, + "step": 60295 + }, + { + "epoch": 1.226806640625, + "grad_norm": 15.687307357788086, + "learning_rate": 7.857665787506579e-06, + "loss": 4.9937, + "step": 60300 + }, + { + "epoch": 1.2269083658854167, + "grad_norm": 17.093122482299805, + "learning_rate": 7.857337811465964e-06, + "loss": 4.8966, + "step": 60305 + }, + { + "epoch": 1.2270100911458333, + "grad_norm": 16.692834854125977, + "learning_rate": 7.857009817167867e-06, + "loss": 4.929, + "step": 60310 + }, + { + "epoch": 1.22711181640625, + "grad_norm": 16.513216018676758, + "learning_rate": 7.856681804614384e-06, + "loss": 4.9127, + "step": 60315 + }, + { + "epoch": 1.2272135416666667, + "grad_norm": 18.986825942993164, + "learning_rate": 7.856353773807613e-06, + "loss": 4.9162, + "step": 60320 + }, + { + "epoch": 1.2273152669270833, + "grad_norm": 15.716141700744629, + "learning_rate": 7.856025724749646e-06, + "loss": 4.9305, + "step": 60325 + }, + { + "epoch": 1.2274169921875, + "grad_norm": 19.231319427490234, + "learning_rate": 7.855697657442583e-06, + "loss": 4.9108, + "step": 60330 + }, + { + "epoch": 1.2275187174479167, + "grad_norm": 16.948017120361328, + "learning_rate": 7.855369571888517e-06, + "loss": 4.6081, + "step": 60335 + }, + { + "epoch": 1.2276204427083333, + "grad_norm": 26.040637969970703, + "learning_rate": 7.855041468089547e-06, + "loss": 5.0094, + "step": 60340 + }, + { + "epoch": 1.22772216796875, + "grad_norm": 20.640419006347656, + "learning_rate": 7.854713346047767e-06, + "loss": 5.0602, + "step": 60345 + }, + { + "epoch": 1.2278238932291667, + "grad_norm": 17.27028465270996, + "learning_rate": 7.854385205765277e-06, + "loss": 5.4893, + "step": 60350 + }, + { + "epoch": 1.2279256184895833, + "grad_norm": 20.614044189453125, + "learning_rate": 7.85405704724417e-06, + "loss": 4.6361, + "step": 60355 + }, + { + "epoch": 1.22802734375, + "grad_norm": 19.05046272277832, + "learning_rate": 7.853728870486545e-06, + "loss": 5.1635, + "step": 60360 + }, + { + "epoch": 1.2281290690104167, + "grad_norm": 16.960018157958984, + "learning_rate": 7.853400675494498e-06, + "loss": 4.9554, + "step": 60365 + }, + { + "epoch": 1.2282307942708333, + "grad_norm": 25.84810447692871, + "learning_rate": 7.853072462270128e-06, + "loss": 4.9949, + "step": 60370 + }, + { + "epoch": 1.22833251953125, + "grad_norm": 20.03343963623047, + "learning_rate": 7.852744230815528e-06, + "loss": 4.9251, + "step": 60375 + }, + { + "epoch": 1.2284342447916667, + "grad_norm": 17.534671783447266, + "learning_rate": 7.8524159811328e-06, + "loss": 4.7771, + "step": 60380 + }, + { + "epoch": 1.2285359700520833, + "grad_norm": 22.462329864501953, + "learning_rate": 7.852087713224039e-06, + "loss": 5.0733, + "step": 60385 + }, + { + "epoch": 1.2286376953125, + "grad_norm": 16.211498260498047, + "learning_rate": 7.851759427091343e-06, + "loss": 5.0134, + "step": 60390 + }, + { + "epoch": 1.2287394205729167, + "grad_norm": 16.636655807495117, + "learning_rate": 7.85143112273681e-06, + "loss": 4.9028, + "step": 60395 + }, + { + "epoch": 1.2288411458333333, + "grad_norm": 23.655893325805664, + "learning_rate": 7.851102800162536e-06, + "loss": 5.2149, + "step": 60400 + }, + { + "epoch": 1.22894287109375, + "grad_norm": 21.528884887695312, + "learning_rate": 7.850774459370621e-06, + "loss": 5.0943, + "step": 60405 + }, + { + "epoch": 1.2290445963541667, + "grad_norm": 16.29191017150879, + "learning_rate": 7.85044610036316e-06, + "loss": 4.784, + "step": 60410 + }, + { + "epoch": 1.2291463216145833, + "grad_norm": 16.529630661010742, + "learning_rate": 7.850117723142257e-06, + "loss": 4.9622, + "step": 60415 + }, + { + "epoch": 1.229248046875, + "grad_norm": 27.394060134887695, + "learning_rate": 7.849789327710004e-06, + "loss": 4.8147, + "step": 60420 + }, + { + "epoch": 1.2293497721354167, + "grad_norm": 18.34729766845703, + "learning_rate": 7.849460914068502e-06, + "loss": 4.6529, + "step": 60425 + }, + { + "epoch": 1.2294514973958333, + "grad_norm": 18.583463668823242, + "learning_rate": 7.84913248221985e-06, + "loss": 4.8398, + "step": 60430 + }, + { + "epoch": 1.22955322265625, + "grad_norm": 18.688810348510742, + "learning_rate": 7.848804032166145e-06, + "loss": 4.8744, + "step": 60435 + }, + { + "epoch": 1.2296549479166667, + "grad_norm": 19.63589859008789, + "learning_rate": 7.848475563909487e-06, + "loss": 4.9984, + "step": 60440 + }, + { + "epoch": 1.2297566731770833, + "grad_norm": 21.449087142944336, + "learning_rate": 7.848147077451975e-06, + "loss": 4.9536, + "step": 60445 + }, + { + "epoch": 1.2298583984375, + "grad_norm": 20.14872169494629, + "learning_rate": 7.847818572795708e-06, + "loss": 5.2201, + "step": 60450 + }, + { + "epoch": 1.2299601236979167, + "grad_norm": 13.681060791015625, + "learning_rate": 7.847490049942782e-06, + "loss": 5.1695, + "step": 60455 + }, + { + "epoch": 1.2300618489583333, + "grad_norm": 12.921591758728027, + "learning_rate": 7.8471615088953e-06, + "loss": 4.9046, + "step": 60460 + }, + { + "epoch": 1.23016357421875, + "grad_norm": 18.018430709838867, + "learning_rate": 7.846832949655358e-06, + "loss": 4.8732, + "step": 60465 + }, + { + "epoch": 1.2302652994791667, + "grad_norm": 15.964990615844727, + "learning_rate": 7.84650437222506e-06, + "loss": 4.9301, + "step": 60470 + }, + { + "epoch": 1.2303670247395833, + "grad_norm": 17.16713523864746, + "learning_rate": 7.846175776606502e-06, + "loss": 4.8621, + "step": 60475 + }, + { + "epoch": 1.23046875, + "grad_norm": 19.268465042114258, + "learning_rate": 7.845847162801783e-06, + "loss": 4.8653, + "step": 60480 + }, + { + "epoch": 1.2305704752604167, + "grad_norm": 16.130701065063477, + "learning_rate": 7.845518530813006e-06, + "loss": 4.7524, + "step": 60485 + }, + { + "epoch": 1.2306722005208333, + "grad_norm": 23.290437698364258, + "learning_rate": 7.845189880642265e-06, + "loss": 5.0567, + "step": 60490 + }, + { + "epoch": 1.23077392578125, + "grad_norm": 19.39605712890625, + "learning_rate": 7.844861212291668e-06, + "loss": 5.1378, + "step": 60495 + }, + { + "epoch": 1.2308756510416667, + "grad_norm": 21.0126953125, + "learning_rate": 7.844532525763308e-06, + "loss": 4.9831, + "step": 60500 + }, + { + "epoch": 1.2309773763020833, + "grad_norm": 18.042665481567383, + "learning_rate": 7.84420382105929e-06, + "loss": 5.2603, + "step": 60505 + }, + { + "epoch": 1.2310791015625, + "grad_norm": 20.378543853759766, + "learning_rate": 7.843875098181711e-06, + "loss": 5.0104, + "step": 60510 + }, + { + "epoch": 1.2311808268229167, + "grad_norm": 18.605575561523438, + "learning_rate": 7.843546357132675e-06, + "loss": 4.7647, + "step": 60515 + }, + { + "epoch": 1.2312825520833333, + "grad_norm": 15.598134994506836, + "learning_rate": 7.843217597914279e-06, + "loss": 4.878, + "step": 60520 + }, + { + "epoch": 1.23138427734375, + "grad_norm": 14.711308479309082, + "learning_rate": 7.842888820528625e-06, + "loss": 5.1699, + "step": 60525 + }, + { + "epoch": 1.2314860026041667, + "grad_norm": 17.372488021850586, + "learning_rate": 7.842560024977815e-06, + "loss": 5.0198, + "step": 60530 + }, + { + "epoch": 1.2315877278645833, + "grad_norm": 15.749918937683105, + "learning_rate": 7.842231211263946e-06, + "loss": 4.8888, + "step": 60535 + }, + { + "epoch": 1.231689453125, + "grad_norm": 14.304365158081055, + "learning_rate": 7.841902379389124e-06, + "loss": 5.1393, + "step": 60540 + }, + { + "epoch": 1.2317911783854167, + "grad_norm": 19.209379196166992, + "learning_rate": 7.841573529355449e-06, + "loss": 5.0196, + "step": 60545 + }, + { + "epoch": 1.2318929036458333, + "grad_norm": 22.34406280517578, + "learning_rate": 7.841244661165019e-06, + "loss": 5.1075, + "step": 60550 + }, + { + "epoch": 1.23199462890625, + "grad_norm": 18.20597267150879, + "learning_rate": 7.840915774819938e-06, + "loss": 4.9097, + "step": 60555 + }, + { + "epoch": 1.2320963541666667, + "grad_norm": 14.120816230773926, + "learning_rate": 7.840586870322309e-06, + "loss": 5.132, + "step": 60560 + }, + { + "epoch": 1.2321980794270833, + "grad_norm": 20.690855026245117, + "learning_rate": 7.84025794767423e-06, + "loss": 4.85, + "step": 60565 + }, + { + "epoch": 1.2322998046875, + "grad_norm": 19.83964729309082, + "learning_rate": 7.839929006877804e-06, + "loss": 5.033, + "step": 60570 + }, + { + "epoch": 1.2324015299479167, + "grad_norm": 18.456850051879883, + "learning_rate": 7.839600047935133e-06, + "loss": 4.8569, + "step": 60575 + }, + { + "epoch": 1.2325032552083333, + "grad_norm": 16.40476417541504, + "learning_rate": 7.83927107084832e-06, + "loss": 4.8754, + "step": 60580 + }, + { + "epoch": 1.23260498046875, + "grad_norm": 18.07317352294922, + "learning_rate": 7.838942075619466e-06, + "loss": 4.8275, + "step": 60585 + }, + { + "epoch": 1.2327067057291667, + "grad_norm": 16.786771774291992, + "learning_rate": 7.838613062250673e-06, + "loss": 5.0581, + "step": 60590 + }, + { + "epoch": 1.2328084309895833, + "grad_norm": 15.165318489074707, + "learning_rate": 7.838284030744044e-06, + "loss": 5.0587, + "step": 60595 + }, + { + "epoch": 1.23291015625, + "grad_norm": 14.93738842010498, + "learning_rate": 7.83795498110168e-06, + "loss": 4.6588, + "step": 60600 + }, + { + "epoch": 1.2330118815104167, + "grad_norm": 15.676443099975586, + "learning_rate": 7.837625913325685e-06, + "loss": 4.8583, + "step": 60605 + }, + { + "epoch": 1.2331136067708333, + "grad_norm": 21.910144805908203, + "learning_rate": 7.837296827418162e-06, + "loss": 5.021, + "step": 60610 + }, + { + "epoch": 1.23321533203125, + "grad_norm": 14.9222412109375, + "learning_rate": 7.836967723381214e-06, + "loss": 4.9265, + "step": 60615 + }, + { + "epoch": 1.2333170572916667, + "grad_norm": 20.50992202758789, + "learning_rate": 7.83663860121694e-06, + "loss": 5.0923, + "step": 60620 + }, + { + "epoch": 1.2334187825520833, + "grad_norm": 16.30855369567871, + "learning_rate": 7.836309460927447e-06, + "loss": 4.8602, + "step": 60625 + }, + { + "epoch": 1.2335205078125, + "grad_norm": 14.54433822631836, + "learning_rate": 7.835980302514837e-06, + "loss": 4.9248, + "step": 60630 + }, + { + "epoch": 1.2336222330729167, + "grad_norm": 27.629053115844727, + "learning_rate": 7.835651125981214e-06, + "loss": 4.928, + "step": 60635 + }, + { + "epoch": 1.2337239583333333, + "grad_norm": 15.182511329650879, + "learning_rate": 7.835321931328679e-06, + "loss": 4.8076, + "step": 60640 + }, + { + "epoch": 1.23382568359375, + "grad_norm": 22.012523651123047, + "learning_rate": 7.834992718559337e-06, + "loss": 4.7969, + "step": 60645 + }, + { + "epoch": 1.2339274088541667, + "grad_norm": 17.43231964111328, + "learning_rate": 7.834663487675294e-06, + "loss": 4.921, + "step": 60650 + }, + { + "epoch": 1.2340291341145833, + "grad_norm": 16.26782989501953, + "learning_rate": 7.834334238678647e-06, + "loss": 5.2367, + "step": 60655 + }, + { + "epoch": 1.234130859375, + "grad_norm": 15.504984855651855, + "learning_rate": 7.834004971571508e-06, + "loss": 4.926, + "step": 60660 + }, + { + "epoch": 1.2342325846354167, + "grad_norm": 24.958885192871094, + "learning_rate": 7.833675686355975e-06, + "loss": 5.2259, + "step": 60665 + }, + { + "epoch": 1.2343343098958333, + "grad_norm": 21.276987075805664, + "learning_rate": 7.833346383034153e-06, + "loss": 5.297, + "step": 60670 + }, + { + "epoch": 1.23443603515625, + "grad_norm": 17.63115882873535, + "learning_rate": 7.833017061608148e-06, + "loss": 4.9194, + "step": 60675 + }, + { + "epoch": 1.2345377604166667, + "grad_norm": 17.447614669799805, + "learning_rate": 7.832687722080063e-06, + "loss": 5.1627, + "step": 60680 + }, + { + "epoch": 1.2346394856770833, + "grad_norm": 19.96437644958496, + "learning_rate": 7.832358364452004e-06, + "loss": 5.3812, + "step": 60685 + }, + { + "epoch": 1.2347412109375, + "grad_norm": 19.549680709838867, + "learning_rate": 7.832028988726073e-06, + "loss": 4.795, + "step": 60690 + }, + { + "epoch": 1.2348429361979167, + "grad_norm": 22.65894317626953, + "learning_rate": 7.831699594904375e-06, + "loss": 5.0009, + "step": 60695 + }, + { + "epoch": 1.2349446614583333, + "grad_norm": 13.803560256958008, + "learning_rate": 7.831370182989017e-06, + "loss": 4.9321, + "step": 60700 + }, + { + "epoch": 1.23504638671875, + "grad_norm": 14.151161193847656, + "learning_rate": 7.831040752982101e-06, + "loss": 4.9865, + "step": 60705 + }, + { + "epoch": 1.2351481119791667, + "grad_norm": 19.05336570739746, + "learning_rate": 7.830711304885734e-06, + "loss": 4.872, + "step": 60710 + }, + { + "epoch": 1.2352498372395833, + "grad_norm": 14.601045608520508, + "learning_rate": 7.830381838702021e-06, + "loss": 4.9793, + "step": 60715 + }, + { + "epoch": 1.2353515625, + "grad_norm": 18.071876525878906, + "learning_rate": 7.830052354433065e-06, + "loss": 5.1408, + "step": 60720 + }, + { + "epoch": 1.2354532877604167, + "grad_norm": 19.1141414642334, + "learning_rate": 7.829722852080975e-06, + "loss": 4.8646, + "step": 60725 + }, + { + "epoch": 1.2355550130208333, + "grad_norm": 15.263745307922363, + "learning_rate": 7.829393331647852e-06, + "loss": 4.877, + "step": 60730 + }, + { + "epoch": 1.23565673828125, + "grad_norm": 13.872458457946777, + "learning_rate": 7.829063793135805e-06, + "loss": 4.7478, + "step": 60735 + }, + { + "epoch": 1.2357584635416667, + "grad_norm": 21.127037048339844, + "learning_rate": 7.828734236546937e-06, + "loss": 4.8544, + "step": 60740 + }, + { + "epoch": 1.2358601888020833, + "grad_norm": 17.126934051513672, + "learning_rate": 7.828404661883358e-06, + "loss": 4.952, + "step": 60745 + }, + { + "epoch": 1.2359619140625, + "grad_norm": 21.580997467041016, + "learning_rate": 7.82807506914717e-06, + "loss": 4.8145, + "step": 60750 + }, + { + "epoch": 1.2360636393229167, + "grad_norm": 24.554790496826172, + "learning_rate": 7.82774545834048e-06, + "loss": 5.4071, + "step": 60755 + }, + { + "epoch": 1.2361653645833333, + "grad_norm": 13.572484970092773, + "learning_rate": 7.827415829465392e-06, + "loss": 4.7853, + "step": 60760 + }, + { + "epoch": 1.23626708984375, + "grad_norm": 15.206899642944336, + "learning_rate": 7.827086182524016e-06, + "loss": 4.9759, + "step": 60765 + }, + { + "epoch": 1.2363688151041667, + "grad_norm": 17.811067581176758, + "learning_rate": 7.826756517518456e-06, + "loss": 4.838, + "step": 60770 + }, + { + "epoch": 1.2364705403645833, + "grad_norm": 14.463020324707031, + "learning_rate": 7.826426834450821e-06, + "loss": 5.1142, + "step": 60775 + }, + { + "epoch": 1.236572265625, + "grad_norm": 19.505367279052734, + "learning_rate": 7.826097133323214e-06, + "loss": 5.2031, + "step": 60780 + }, + { + "epoch": 1.2366739908854167, + "grad_norm": 20.989105224609375, + "learning_rate": 7.825767414137744e-06, + "loss": 4.9286, + "step": 60785 + }, + { + "epoch": 1.2367757161458333, + "grad_norm": 17.10540199279785, + "learning_rate": 7.825437676896518e-06, + "loss": 4.8491, + "step": 60790 + }, + { + "epoch": 1.23687744140625, + "grad_norm": 18.376842498779297, + "learning_rate": 7.825107921601642e-06, + "loss": 4.9446, + "step": 60795 + }, + { + "epoch": 1.2369791666666667, + "grad_norm": 20.29593849182129, + "learning_rate": 7.824778148255223e-06, + "loss": 4.706, + "step": 60800 + }, + { + "epoch": 1.2370808919270833, + "grad_norm": 18.245569229125977, + "learning_rate": 7.824448356859367e-06, + "loss": 5.0292, + "step": 60805 + }, + { + "epoch": 1.2371826171875, + "grad_norm": 19.95473861694336, + "learning_rate": 7.824118547416184e-06, + "loss": 4.8345, + "step": 60810 + }, + { + "epoch": 1.2372843424479167, + "grad_norm": 14.405517578125, + "learning_rate": 7.82378871992778e-06, + "loss": 4.9772, + "step": 60815 + }, + { + "epoch": 1.2373860677083333, + "grad_norm": 14.541032791137695, + "learning_rate": 7.823458874396261e-06, + "loss": 4.6374, + "step": 60820 + }, + { + "epoch": 1.23748779296875, + "grad_norm": 19.428804397583008, + "learning_rate": 7.823129010823738e-06, + "loss": 4.9618, + "step": 60825 + }, + { + "epoch": 1.2375895182291667, + "grad_norm": 18.086429595947266, + "learning_rate": 7.822799129212316e-06, + "loss": 4.9613, + "step": 60830 + }, + { + "epoch": 1.2376912434895833, + "grad_norm": 19.310279846191406, + "learning_rate": 7.822469229564103e-06, + "loss": 5.1507, + "step": 60835 + }, + { + "epoch": 1.23779296875, + "grad_norm": 16.16983985900879, + "learning_rate": 7.82213931188121e-06, + "loss": 4.6937, + "step": 60840 + }, + { + "epoch": 1.2378946940104167, + "grad_norm": 20.226835250854492, + "learning_rate": 7.82180937616574e-06, + "loss": 5.1791, + "step": 60845 + }, + { + "epoch": 1.2379964192708333, + "grad_norm": 19.591650009155273, + "learning_rate": 7.821479422419804e-06, + "loss": 5.1741, + "step": 60850 + }, + { + "epoch": 1.23809814453125, + "grad_norm": 18.814149856567383, + "learning_rate": 7.82114945064551e-06, + "loss": 4.8675, + "step": 60855 + }, + { + "epoch": 1.2381998697916667, + "grad_norm": 14.763254165649414, + "learning_rate": 7.820819460844968e-06, + "loss": 4.892, + "step": 60860 + }, + { + "epoch": 1.2383015950520833, + "grad_norm": 22.224035263061523, + "learning_rate": 7.820489453020285e-06, + "loss": 4.964, + "step": 60865 + }, + { + "epoch": 1.2384033203125, + "grad_norm": 15.47658634185791, + "learning_rate": 7.82015942717357e-06, + "loss": 4.708, + "step": 60870 + }, + { + "epoch": 1.2385050455729167, + "grad_norm": 16.419492721557617, + "learning_rate": 7.819829383306929e-06, + "loss": 4.9839, + "step": 60875 + }, + { + "epoch": 1.2386067708333333, + "grad_norm": 20.825490951538086, + "learning_rate": 7.819499321422477e-06, + "loss": 5.1412, + "step": 60880 + }, + { + "epoch": 1.23870849609375, + "grad_norm": 17.27989387512207, + "learning_rate": 7.819169241522317e-06, + "loss": 4.8493, + "step": 60885 + }, + { + "epoch": 1.2388102213541667, + "grad_norm": 16.070716857910156, + "learning_rate": 7.818839143608562e-06, + "loss": 4.9086, + "step": 60890 + }, + { + "epoch": 1.2389119466145833, + "grad_norm": 18.006723403930664, + "learning_rate": 7.818509027683318e-06, + "loss": 4.8899, + "step": 60895 + }, + { + "epoch": 1.239013671875, + "grad_norm": 20.04848861694336, + "learning_rate": 7.818178893748697e-06, + "loss": 5.0368, + "step": 60900 + }, + { + "epoch": 1.2391153971354167, + "grad_norm": 19.46053695678711, + "learning_rate": 7.817848741806806e-06, + "loss": 4.7951, + "step": 60905 + }, + { + "epoch": 1.2392171223958333, + "grad_norm": 17.547399520874023, + "learning_rate": 7.817518571859758e-06, + "loss": 4.9739, + "step": 60910 + }, + { + "epoch": 1.23931884765625, + "grad_norm": 15.650362014770508, + "learning_rate": 7.81718838390966e-06, + "loss": 5.1641, + "step": 60915 + }, + { + "epoch": 1.2394205729166667, + "grad_norm": 16.93083381652832, + "learning_rate": 7.816858177958622e-06, + "loss": 4.9272, + "step": 60920 + }, + { + "epoch": 1.2395222981770833, + "grad_norm": 18.481958389282227, + "learning_rate": 7.816527954008756e-06, + "loss": 5.155, + "step": 60925 + }, + { + "epoch": 1.2396240234375, + "grad_norm": 18.292285919189453, + "learning_rate": 7.816197712062167e-06, + "loss": 5.0328, + "step": 60930 + }, + { + "epoch": 1.2397257486979167, + "grad_norm": 18.920005798339844, + "learning_rate": 7.815867452120973e-06, + "loss": 4.8902, + "step": 60935 + }, + { + "epoch": 1.2398274739583333, + "grad_norm": 18.522924423217773, + "learning_rate": 7.815537174187278e-06, + "loss": 5.0234, + "step": 60940 + }, + { + "epoch": 1.23992919921875, + "grad_norm": 15.589768409729004, + "learning_rate": 7.815206878263193e-06, + "loss": 4.9468, + "step": 60945 + }, + { + "epoch": 1.2400309244791667, + "grad_norm": 16.20911979675293, + "learning_rate": 7.814876564350832e-06, + "loss": 5.0761, + "step": 60950 + }, + { + "epoch": 1.2401326497395833, + "grad_norm": 16.443330764770508, + "learning_rate": 7.8145462324523e-06, + "loss": 5.1522, + "step": 60955 + }, + { + "epoch": 1.240234375, + "grad_norm": 17.393478393554688, + "learning_rate": 7.814215882569716e-06, + "loss": 4.8726, + "step": 60960 + }, + { + "epoch": 1.2403361002604167, + "grad_norm": 21.25337791442871, + "learning_rate": 7.813885514705182e-06, + "loss": 4.7966, + "step": 60965 + }, + { + "epoch": 1.2404378255208333, + "grad_norm": 18.114967346191406, + "learning_rate": 7.813555128860814e-06, + "loss": 4.759, + "step": 60970 + }, + { + "epoch": 1.24053955078125, + "grad_norm": 12.538198471069336, + "learning_rate": 7.813224725038721e-06, + "loss": 4.9975, + "step": 60975 + }, + { + "epoch": 1.2406412760416667, + "grad_norm": 19.156221389770508, + "learning_rate": 7.812894303241017e-06, + "loss": 5.0338, + "step": 60980 + }, + { + "epoch": 1.2407430013020833, + "grad_norm": 13.257064819335938, + "learning_rate": 7.812563863469809e-06, + "loss": 5.0013, + "step": 60985 + }, + { + "epoch": 1.2408447265625, + "grad_norm": 14.16629695892334, + "learning_rate": 7.812233405727213e-06, + "loss": 5.0971, + "step": 60990 + }, + { + "epoch": 1.2409464518229167, + "grad_norm": 24.750972747802734, + "learning_rate": 7.811902930015336e-06, + "loss": 4.6293, + "step": 60995 + }, + { + "epoch": 1.2410481770833333, + "grad_norm": 20.261621475219727, + "learning_rate": 7.811572436336294e-06, + "loss": 4.9276, + "step": 61000 + }, + { + "epoch": 1.24114990234375, + "grad_norm": 18.676864624023438, + "learning_rate": 7.811241924692195e-06, + "loss": 5.2091, + "step": 61005 + }, + { + "epoch": 1.2412516276041667, + "grad_norm": 19.52159309387207, + "learning_rate": 7.810911395085152e-06, + "loss": 5.004, + "step": 61010 + }, + { + "epoch": 1.2413533528645833, + "grad_norm": 18.181840896606445, + "learning_rate": 7.810580847517279e-06, + "loss": 4.8811, + "step": 61015 + }, + { + "epoch": 1.241455078125, + "grad_norm": 21.535808563232422, + "learning_rate": 7.810250281990685e-06, + "loss": 4.7862, + "step": 61020 + }, + { + "epoch": 1.2415568033854167, + "grad_norm": 15.472476959228516, + "learning_rate": 7.809919698507487e-06, + "loss": 4.5285, + "step": 61025 + }, + { + "epoch": 1.2416585286458333, + "grad_norm": 13.75605297088623, + "learning_rate": 7.809589097069792e-06, + "loss": 5.2299, + "step": 61030 + }, + { + "epoch": 1.24176025390625, + "grad_norm": 13.307574272155762, + "learning_rate": 7.809258477679714e-06, + "loss": 4.8082, + "step": 61035 + }, + { + "epoch": 1.2418619791666667, + "grad_norm": 19.2449951171875, + "learning_rate": 7.808927840339367e-06, + "loss": 4.9421, + "step": 61040 + }, + { + "epoch": 1.2419637044270833, + "grad_norm": 26.463407516479492, + "learning_rate": 7.808597185050863e-06, + "loss": 4.8899, + "step": 61045 + }, + { + "epoch": 1.2420654296875, + "grad_norm": 16.32897186279297, + "learning_rate": 7.808266511816314e-06, + "loss": 4.858, + "step": 61050 + }, + { + "epoch": 1.2421671549479167, + "grad_norm": 20.1162052154541, + "learning_rate": 7.807935820637833e-06, + "loss": 4.9903, + "step": 61055 + }, + { + "epoch": 1.2422688802083333, + "grad_norm": 17.43939208984375, + "learning_rate": 7.807605111517534e-06, + "loss": 4.8403, + "step": 61060 + }, + { + "epoch": 1.24237060546875, + "grad_norm": 16.79310417175293, + "learning_rate": 7.80727438445753e-06, + "loss": 4.9499, + "step": 61065 + }, + { + "epoch": 1.2424723307291667, + "grad_norm": 20.533899307250977, + "learning_rate": 7.806943639459931e-06, + "loss": 5.1465, + "step": 61070 + }, + { + "epoch": 1.2425740559895833, + "grad_norm": 13.615560531616211, + "learning_rate": 7.806612876526858e-06, + "loss": 4.7988, + "step": 61075 + }, + { + "epoch": 1.24267578125, + "grad_norm": 13.934579849243164, + "learning_rate": 7.806282095660417e-06, + "loss": 5.1117, + "step": 61080 + }, + { + "epoch": 1.2427775065104167, + "grad_norm": 15.305490493774414, + "learning_rate": 7.805951296862724e-06, + "loss": 5.1058, + "step": 61085 + }, + { + "epoch": 1.2428792317708333, + "grad_norm": 16.30496597290039, + "learning_rate": 7.805620480135893e-06, + "loss": 4.7941, + "step": 61090 + }, + { + "epoch": 1.24298095703125, + "grad_norm": 18.86327362060547, + "learning_rate": 7.80528964548204e-06, + "loss": 4.8864, + "step": 61095 + }, + { + "epoch": 1.2430826822916667, + "grad_norm": 19.478649139404297, + "learning_rate": 7.804958792903274e-06, + "loss": 4.8379, + "step": 61100 + }, + { + "epoch": 1.2431844075520833, + "grad_norm": 14.537601470947266, + "learning_rate": 7.804627922401712e-06, + "loss": 4.8021, + "step": 61105 + }, + { + "epoch": 1.2432861328125, + "grad_norm": 14.697569847106934, + "learning_rate": 7.80429703397947e-06, + "loss": 4.8378, + "step": 61110 + }, + { + "epoch": 1.2433878580729167, + "grad_norm": 18.86237144470215, + "learning_rate": 7.803966127638658e-06, + "loss": 4.9041, + "step": 61115 + }, + { + "epoch": 1.2434895833333333, + "grad_norm": 19.33370590209961, + "learning_rate": 7.803635203381393e-06, + "loss": 4.7231, + "step": 61120 + }, + { + "epoch": 1.24359130859375, + "grad_norm": 17.718582153320312, + "learning_rate": 7.803304261209788e-06, + "loss": 5.1519, + "step": 61125 + }, + { + "epoch": 1.2436930338541667, + "grad_norm": 19.972442626953125, + "learning_rate": 7.802973301125962e-06, + "loss": 4.8743, + "step": 61130 + }, + { + "epoch": 1.2437947591145833, + "grad_norm": 17.045135498046875, + "learning_rate": 7.802642323132024e-06, + "loss": 5.0344, + "step": 61135 + }, + { + "epoch": 1.243896484375, + "grad_norm": 16.249744415283203, + "learning_rate": 7.802311327230091e-06, + "loss": 4.81, + "step": 61140 + }, + { + "epoch": 1.2439982096354167, + "grad_norm": 14.508828163146973, + "learning_rate": 7.80198031342228e-06, + "loss": 4.8154, + "step": 61145 + }, + { + "epoch": 1.2440999348958333, + "grad_norm": 15.772865295410156, + "learning_rate": 7.8016492817107e-06, + "loss": 5.093, + "step": 61150 + }, + { + "epoch": 1.24420166015625, + "grad_norm": 19.68963623046875, + "learning_rate": 7.801318232097475e-06, + "loss": 5.0302, + "step": 61155 + }, + { + "epoch": 1.2443033854166667, + "grad_norm": 15.60908031463623, + "learning_rate": 7.800987164584715e-06, + "loss": 4.8721, + "step": 61160 + }, + { + "epoch": 1.2444051106770833, + "grad_norm": 19.441478729248047, + "learning_rate": 7.800656079174534e-06, + "loss": 4.9361, + "step": 61165 + }, + { + "epoch": 1.2445068359375, + "grad_norm": 15.876513481140137, + "learning_rate": 7.800324975869052e-06, + "loss": 4.6295, + "step": 61170 + }, + { + "epoch": 1.2446085611979167, + "grad_norm": 20.929210662841797, + "learning_rate": 7.79999385467038e-06, + "loss": 4.8632, + "step": 61175 + }, + { + "epoch": 1.2447102864583333, + "grad_norm": 15.214783668518066, + "learning_rate": 7.799662715580639e-06, + "loss": 4.845, + "step": 61180 + }, + { + "epoch": 1.24481201171875, + "grad_norm": 18.52183723449707, + "learning_rate": 7.79933155860194e-06, + "loss": 4.9097, + "step": 61185 + }, + { + "epoch": 1.2449137369791667, + "grad_norm": 20.279272079467773, + "learning_rate": 7.7990003837364e-06, + "loss": 4.9056, + "step": 61190 + }, + { + "epoch": 1.2450154622395833, + "grad_norm": 14.815866470336914, + "learning_rate": 7.798669190986138e-06, + "loss": 4.8251, + "step": 61195 + }, + { + "epoch": 1.2451171875, + "grad_norm": 18.46608543395996, + "learning_rate": 7.798337980353267e-06, + "loss": 4.7299, + "step": 61200 + }, + { + "epoch": 1.2452189127604167, + "grad_norm": 15.818761825561523, + "learning_rate": 7.798006751839907e-06, + "loss": 4.7154, + "step": 61205 + }, + { + "epoch": 1.2453206380208333, + "grad_norm": 17.322263717651367, + "learning_rate": 7.797675505448171e-06, + "loss": 4.8005, + "step": 61210 + }, + { + "epoch": 1.24542236328125, + "grad_norm": 13.55514144897461, + "learning_rate": 7.797344241180176e-06, + "loss": 4.9158, + "step": 61215 + }, + { + "epoch": 1.2455240885416667, + "grad_norm": 16.944570541381836, + "learning_rate": 7.79701295903804e-06, + "loss": 5.1259, + "step": 61220 + }, + { + "epoch": 1.2456258138020833, + "grad_norm": 16.954517364501953, + "learning_rate": 7.796681659023877e-06, + "loss": 4.7249, + "step": 61225 + }, + { + "epoch": 1.2457275390625, + "grad_norm": 18.690837860107422, + "learning_rate": 7.79635034113981e-06, + "loss": 4.9393, + "step": 61230 + }, + { + "epoch": 1.2458292643229167, + "grad_norm": 17.619977951049805, + "learning_rate": 7.796019005387948e-06, + "loss": 4.861, + "step": 61235 + }, + { + "epoch": 1.2459309895833333, + "grad_norm": 28.991369247436523, + "learning_rate": 7.795687651770415e-06, + "loss": 4.7701, + "step": 61240 + }, + { + "epoch": 1.24603271484375, + "grad_norm": 25.31216812133789, + "learning_rate": 7.795356280289322e-06, + "loss": 5.3297, + "step": 61245 + }, + { + "epoch": 1.2461344401041667, + "grad_norm": 18.213302612304688, + "learning_rate": 7.795024890946792e-06, + "loss": 4.9661, + "step": 61250 + }, + { + "epoch": 1.2462361653645833, + "grad_norm": 26.037809371948242, + "learning_rate": 7.79469348374494e-06, + "loss": 4.8333, + "step": 61255 + }, + { + "epoch": 1.246337890625, + "grad_norm": 22.145524978637695, + "learning_rate": 7.794362058685884e-06, + "loss": 4.7728, + "step": 61260 + }, + { + "epoch": 1.2464396158854167, + "grad_norm": 20.43905258178711, + "learning_rate": 7.79403061577174e-06, + "loss": 5.0881, + "step": 61265 + }, + { + "epoch": 1.2465413411458333, + "grad_norm": 22.983163833618164, + "learning_rate": 7.793699155004629e-06, + "loss": 4.9762, + "step": 61270 + }, + { + "epoch": 1.24664306640625, + "grad_norm": 22.18060874938965, + "learning_rate": 7.793367676386668e-06, + "loss": 5.0187, + "step": 61275 + }, + { + "epoch": 1.2467447916666667, + "grad_norm": 19.168134689331055, + "learning_rate": 7.793036179919972e-06, + "loss": 5.124, + "step": 61280 + }, + { + "epoch": 1.2468465169270833, + "grad_norm": 14.08495807647705, + "learning_rate": 7.792704665606662e-06, + "loss": 5.0106, + "step": 61285 + }, + { + "epoch": 1.2469482421875, + "grad_norm": 15.3582181930542, + "learning_rate": 7.792373133448856e-06, + "loss": 5.007, + "step": 61290 + }, + { + "epoch": 1.2470499674479167, + "grad_norm": 17.43183135986328, + "learning_rate": 7.792041583448672e-06, + "loss": 4.8565, + "step": 61295 + }, + { + "epoch": 1.2471516927083333, + "grad_norm": 16.46634292602539, + "learning_rate": 7.79171001560823e-06, + "loss": 4.7982, + "step": 61300 + }, + { + "epoch": 1.24725341796875, + "grad_norm": 15.452914237976074, + "learning_rate": 7.791378429929646e-06, + "loss": 4.976, + "step": 61305 + }, + { + "epoch": 1.2473551432291667, + "grad_norm": 18.02031898498535, + "learning_rate": 7.79104682641504e-06, + "loss": 4.8874, + "step": 61310 + }, + { + "epoch": 1.2474568684895833, + "grad_norm": 21.730329513549805, + "learning_rate": 7.79071520506653e-06, + "loss": 4.6125, + "step": 61315 + }, + { + "epoch": 1.24755859375, + "grad_norm": 19.387706756591797, + "learning_rate": 7.790383565886237e-06, + "loss": 5.3008, + "step": 61320 + }, + { + "epoch": 1.2476603190104167, + "grad_norm": 14.034443855285645, + "learning_rate": 7.790051908876279e-06, + "loss": 4.7002, + "step": 61325 + }, + { + "epoch": 1.2477620442708333, + "grad_norm": 17.904510498046875, + "learning_rate": 7.789720234038773e-06, + "loss": 4.945, + "step": 61330 + }, + { + "epoch": 1.24786376953125, + "grad_norm": 16.749547958374023, + "learning_rate": 7.789388541375842e-06, + "loss": 4.7579, + "step": 61335 + }, + { + "epoch": 1.2479654947916667, + "grad_norm": 18.78040313720703, + "learning_rate": 7.789056830889605e-06, + "loss": 5.2166, + "step": 61340 + }, + { + "epoch": 1.2480672200520833, + "grad_norm": 16.677522659301758, + "learning_rate": 7.788725102582177e-06, + "loss": 4.8786, + "step": 61345 + }, + { + "epoch": 1.2481689453125, + "grad_norm": 16.297958374023438, + "learning_rate": 7.788393356455683e-06, + "loss": 4.7946, + "step": 61350 + }, + { + "epoch": 1.2482706705729167, + "grad_norm": 19.322404861450195, + "learning_rate": 7.78806159251224e-06, + "loss": 4.9731, + "step": 61355 + }, + { + "epoch": 1.2483723958333333, + "grad_norm": 26.494943618774414, + "learning_rate": 7.787729810753968e-06, + "loss": 4.7956, + "step": 61360 + }, + { + "epoch": 1.24847412109375, + "grad_norm": 20.8502254486084, + "learning_rate": 7.787398011182989e-06, + "loss": 4.9385, + "step": 61365 + }, + { + "epoch": 1.2485758463541667, + "grad_norm": 26.97126007080078, + "learning_rate": 7.78706619380142e-06, + "loss": 4.925, + "step": 61370 + }, + { + "epoch": 1.2486775716145833, + "grad_norm": 19.513633728027344, + "learning_rate": 7.786734358611382e-06, + "loss": 4.8792, + "step": 61375 + }, + { + "epoch": 1.248779296875, + "grad_norm": 17.502033233642578, + "learning_rate": 7.786402505614999e-06, + "loss": 4.8108, + "step": 61380 + }, + { + "epoch": 1.2488810221354167, + "grad_norm": 15.383148193359375, + "learning_rate": 7.786070634814386e-06, + "loss": 4.9599, + "step": 61385 + }, + { + "epoch": 1.2489827473958333, + "grad_norm": 15.049294471740723, + "learning_rate": 7.785738746211665e-06, + "loss": 4.8531, + "step": 61390 + }, + { + "epoch": 1.24908447265625, + "grad_norm": 11.66539192199707, + "learning_rate": 7.78540683980896e-06, + "loss": 4.9345, + "step": 61395 + }, + { + "epoch": 1.2491861979166667, + "grad_norm": 18.771259307861328, + "learning_rate": 7.78507491560839e-06, + "loss": 5.194, + "step": 61400 + }, + { + "epoch": 1.2492879231770833, + "grad_norm": 17.826080322265625, + "learning_rate": 7.784742973612075e-06, + "loss": 4.9374, + "step": 61405 + }, + { + "epoch": 1.2493896484375, + "grad_norm": 18.8244686126709, + "learning_rate": 7.784411013822136e-06, + "loss": 4.9957, + "step": 61410 + }, + { + "epoch": 1.2494913736979167, + "grad_norm": 16.638748168945312, + "learning_rate": 7.784079036240694e-06, + "loss": 5.0222, + "step": 61415 + }, + { + "epoch": 1.2495930989583333, + "grad_norm": 20.147384643554688, + "learning_rate": 7.783747040869872e-06, + "loss": 4.8104, + "step": 61420 + }, + { + "epoch": 1.24969482421875, + "grad_norm": 18.92158317565918, + "learning_rate": 7.78341502771179e-06, + "loss": 4.9098, + "step": 61425 + }, + { + "epoch": 1.2497965494791667, + "grad_norm": 18.544551849365234, + "learning_rate": 7.783082996768568e-06, + "loss": 4.9985, + "step": 61430 + }, + { + "epoch": 1.2498982747395833, + "grad_norm": 14.475509643554688, + "learning_rate": 7.782750948042329e-06, + "loss": 4.8742, + "step": 61435 + }, + { + "epoch": 1.25, + "grad_norm": 15.037766456604004, + "learning_rate": 7.782418881535196e-06, + "loss": 5.1481, + "step": 61440 + }, + { + "epoch": 1.25, + "eval_loss": 4.992877960205078, + "eval_runtime": 107.508, + "eval_samples_per_second": 18.668, + "eval_steps_per_second": 9.339, + "step": 61440 + }, + { + "epoch": 1.2501017252604167, + "grad_norm": 16.712743759155273, + "learning_rate": 7.782086797249288e-06, + "loss": 5.0622, + "step": 61445 + }, + { + "epoch": 1.2502034505208333, + "grad_norm": 17.359647750854492, + "learning_rate": 7.781754695186731e-06, + "loss": 4.8608, + "step": 61450 + }, + { + "epoch": 1.25030517578125, + "grad_norm": 21.170515060424805, + "learning_rate": 7.781422575349645e-06, + "loss": 4.9305, + "step": 61455 + }, + { + "epoch": 1.2504069010416667, + "grad_norm": 14.89231014251709, + "learning_rate": 7.781090437740149e-06, + "loss": 5.0373, + "step": 61460 + }, + { + "epoch": 1.2505086263020833, + "grad_norm": 18.34011459350586, + "learning_rate": 7.780758282360368e-06, + "loss": 4.8973, + "step": 61465 + }, + { + "epoch": 1.2506103515625, + "grad_norm": 15.552326202392578, + "learning_rate": 7.780426109212424e-06, + "loss": 4.9091, + "step": 61470 + }, + { + "epoch": 1.2507120768229167, + "grad_norm": 17.31421661376953, + "learning_rate": 7.780093918298443e-06, + "loss": 5.1995, + "step": 61475 + }, + { + "epoch": 1.2508138020833333, + "grad_norm": 23.373769760131836, + "learning_rate": 7.779761709620542e-06, + "loss": 5.008, + "step": 61480 + }, + { + "epoch": 1.25091552734375, + "grad_norm": 15.737563133239746, + "learning_rate": 7.779429483180847e-06, + "loss": 5.0299, + "step": 61485 + }, + { + "epoch": 1.2510172526041667, + "grad_norm": 13.310885429382324, + "learning_rate": 7.77909723898148e-06, + "loss": 5.021, + "step": 61490 + }, + { + "epoch": 1.2511189778645833, + "grad_norm": 18.675451278686523, + "learning_rate": 7.778764977024563e-06, + "loss": 4.9787, + "step": 61495 + }, + { + "epoch": 1.251220703125, + "grad_norm": 21.9801025390625, + "learning_rate": 7.77843269731222e-06, + "loss": 5.0239, + "step": 61500 + }, + { + "epoch": 1.2513224283854167, + "grad_norm": 17.588329315185547, + "learning_rate": 7.778100399846573e-06, + "loss": 5.0125, + "step": 61505 + }, + { + "epoch": 1.2514241536458333, + "grad_norm": 28.945476531982422, + "learning_rate": 7.777768084629748e-06, + "loss": 5.6054, + "step": 61510 + }, + { + "epoch": 1.25152587890625, + "grad_norm": 20.277982711791992, + "learning_rate": 7.777435751663866e-06, + "loss": 4.8885, + "step": 61515 + }, + { + "epoch": 1.2516276041666667, + "grad_norm": 13.96451473236084, + "learning_rate": 7.777103400951052e-06, + "loss": 4.7145, + "step": 61520 + }, + { + "epoch": 1.2517293294270833, + "grad_norm": 15.006179809570312, + "learning_rate": 7.77677103249343e-06, + "loss": 4.9507, + "step": 61525 + }, + { + "epoch": 1.2518310546875, + "grad_norm": 19.54368019104004, + "learning_rate": 7.77643864629312e-06, + "loss": 5.0738, + "step": 61530 + }, + { + "epoch": 1.2519327799479167, + "grad_norm": 19.669553756713867, + "learning_rate": 7.776106242352248e-06, + "loss": 5.3118, + "step": 61535 + }, + { + "epoch": 1.2520345052083333, + "grad_norm": 23.64791488647461, + "learning_rate": 7.77577382067294e-06, + "loss": 4.9946, + "step": 61540 + }, + { + "epoch": 1.25213623046875, + "grad_norm": 18.64529800415039, + "learning_rate": 7.77544138125732e-06, + "loss": 4.9372, + "step": 61545 + }, + { + "epoch": 1.2522379557291667, + "grad_norm": 19.29250717163086, + "learning_rate": 7.775108924107507e-06, + "loss": 4.8416, + "step": 61550 + }, + { + "epoch": 1.2523396809895833, + "grad_norm": 20.25327491760254, + "learning_rate": 7.774776449225631e-06, + "loss": 5.0198, + "step": 61555 + }, + { + "epoch": 1.25244140625, + "grad_norm": 20.026138305664062, + "learning_rate": 7.774443956613815e-06, + "loss": 4.9364, + "step": 61560 + }, + { + "epoch": 1.2525431315104167, + "grad_norm": 15.424864768981934, + "learning_rate": 7.774111446274181e-06, + "loss": 5.0347, + "step": 61565 + }, + { + "epoch": 1.2526448567708333, + "grad_norm": 16.780794143676758, + "learning_rate": 7.773778918208856e-06, + "loss": 5.1369, + "step": 61570 + }, + { + "epoch": 1.25274658203125, + "grad_norm": 16.939794540405273, + "learning_rate": 7.773446372419965e-06, + "loss": 4.9738, + "step": 61575 + }, + { + "epoch": 1.2528483072916667, + "grad_norm": 16.326717376708984, + "learning_rate": 7.773113808909632e-06, + "loss": 4.9915, + "step": 61580 + }, + { + "epoch": 1.2529500325520833, + "grad_norm": 16.54102897644043, + "learning_rate": 7.772781227679982e-06, + "loss": 4.7281, + "step": 61585 + }, + { + "epoch": 1.2530517578125, + "grad_norm": 17.813047409057617, + "learning_rate": 7.77244862873314e-06, + "loss": 4.8713, + "step": 61590 + }, + { + "epoch": 1.2531534830729167, + "grad_norm": 17.243816375732422, + "learning_rate": 7.772116012071231e-06, + "loss": 5.0341, + "step": 61595 + }, + { + "epoch": 1.2532552083333333, + "grad_norm": 17.36880111694336, + "learning_rate": 7.77178337769638e-06, + "loss": 5.2259, + "step": 61600 + }, + { + "epoch": 1.25335693359375, + "grad_norm": 17.185874938964844, + "learning_rate": 7.771450725610715e-06, + "loss": 4.5706, + "step": 61605 + }, + { + "epoch": 1.2534586588541667, + "grad_norm": 27.230749130249023, + "learning_rate": 7.771118055816358e-06, + "loss": 5.0143, + "step": 61610 + }, + { + "epoch": 1.2535603841145833, + "grad_norm": 16.899473190307617, + "learning_rate": 7.770785368315437e-06, + "loss": 4.7964, + "step": 61615 + }, + { + "epoch": 1.253662109375, + "grad_norm": 22.069612503051758, + "learning_rate": 7.770452663110078e-06, + "loss": 5.0159, + "step": 61620 + }, + { + "epoch": 1.2537638346354167, + "grad_norm": 18.378421783447266, + "learning_rate": 7.770119940202405e-06, + "loss": 4.874, + "step": 61625 + }, + { + "epoch": 1.2538655598958333, + "grad_norm": 16.41977882385254, + "learning_rate": 7.769787199594545e-06, + "loss": 4.9087, + "step": 61630 + }, + { + "epoch": 1.25396728515625, + "grad_norm": 17.382558822631836, + "learning_rate": 7.769454441288622e-06, + "loss": 4.8343, + "step": 61635 + }, + { + "epoch": 1.2540690104166667, + "grad_norm": 19.449419021606445, + "learning_rate": 7.769121665286767e-06, + "loss": 4.8122, + "step": 61640 + }, + { + "epoch": 1.2541707356770833, + "grad_norm": 18.961488723754883, + "learning_rate": 7.768788871591102e-06, + "loss": 5.1933, + "step": 61645 + }, + { + "epoch": 1.2542724609375, + "grad_norm": 22.406644821166992, + "learning_rate": 7.768456060203758e-06, + "loss": 5.0528, + "step": 61650 + }, + { + "epoch": 1.2543741861979167, + "grad_norm": 17.532934188842773, + "learning_rate": 7.768123231126854e-06, + "loss": 5.2171, + "step": 61655 + }, + { + "epoch": 1.2544759114583333, + "grad_norm": 20.755050659179688, + "learning_rate": 7.767790384362524e-06, + "loss": 5.0457, + "step": 61660 + }, + { + "epoch": 1.25457763671875, + "grad_norm": 19.879039764404297, + "learning_rate": 7.767457519912893e-06, + "loss": 4.9165, + "step": 61665 + }, + { + "epoch": 1.2546793619791667, + "grad_norm": 18.839744567871094, + "learning_rate": 7.767124637780086e-06, + "loss": 4.8781, + "step": 61670 + }, + { + "epoch": 1.2547810872395833, + "grad_norm": 19.97855567932129, + "learning_rate": 7.76679173796623e-06, + "loss": 4.9737, + "step": 61675 + }, + { + "epoch": 1.2548828125, + "grad_norm": 18.041391372680664, + "learning_rate": 7.766458820473453e-06, + "loss": 4.892, + "step": 61680 + }, + { + "epoch": 1.2549845377604167, + "grad_norm": 23.134355545043945, + "learning_rate": 7.766125885303883e-06, + "loss": 5.2684, + "step": 61685 + }, + { + "epoch": 1.2550862630208333, + "grad_norm": 27.245628356933594, + "learning_rate": 7.765792932459646e-06, + "loss": 4.949, + "step": 61690 + }, + { + "epoch": 1.25518798828125, + "grad_norm": 17.17523765563965, + "learning_rate": 7.765459961942872e-06, + "loss": 4.7741, + "step": 61695 + }, + { + "epoch": 1.2552897135416667, + "grad_norm": 17.850095748901367, + "learning_rate": 7.765126973755685e-06, + "loss": 4.8358, + "step": 61700 + }, + { + "epoch": 1.2553914388020833, + "grad_norm": 16.01470184326172, + "learning_rate": 7.764793967900215e-06, + "loss": 4.7286, + "step": 61705 + }, + { + "epoch": 1.2554931640625, + "grad_norm": 20.184707641601562, + "learning_rate": 7.76446094437859e-06, + "loss": 4.7597, + "step": 61710 + }, + { + "epoch": 1.2555948893229167, + "grad_norm": 16.348905563354492, + "learning_rate": 7.764127903192937e-06, + "loss": 4.7248, + "step": 61715 + }, + { + "epoch": 1.2556966145833333, + "grad_norm": 19.0174617767334, + "learning_rate": 7.763794844345383e-06, + "loss": 5.0824, + "step": 61720 + }, + { + "epoch": 1.25579833984375, + "grad_norm": 14.587300300598145, + "learning_rate": 7.763461767838058e-06, + "loss": 4.7344, + "step": 61725 + }, + { + "epoch": 1.2559000651041667, + "grad_norm": 24.610797882080078, + "learning_rate": 7.763128673673088e-06, + "loss": 5.081, + "step": 61730 + }, + { + "epoch": 1.2560017903645833, + "grad_norm": 16.42473030090332, + "learning_rate": 7.762795561852605e-06, + "loss": 4.6313, + "step": 61735 + }, + { + "epoch": 1.256103515625, + "grad_norm": 15.600606918334961, + "learning_rate": 7.762462432378735e-06, + "loss": 5.0137, + "step": 61740 + }, + { + "epoch": 1.2562052408854167, + "grad_norm": 16.51143455505371, + "learning_rate": 7.762129285253608e-06, + "loss": 4.9764, + "step": 61745 + }, + { + "epoch": 1.2563069661458333, + "grad_norm": 14.164628982543945, + "learning_rate": 7.76179612047935e-06, + "loss": 4.8418, + "step": 61750 + }, + { + "epoch": 1.25640869140625, + "grad_norm": 17.842226028442383, + "learning_rate": 7.76146293805809e-06, + "loss": 4.7033, + "step": 61755 + }, + { + "epoch": 1.2565104166666667, + "grad_norm": 21.072120666503906, + "learning_rate": 7.761129737991962e-06, + "loss": 5.1895, + "step": 61760 + }, + { + "epoch": 1.2566121419270833, + "grad_norm": 16.434600830078125, + "learning_rate": 7.760796520283089e-06, + "loss": 4.8827, + "step": 61765 + }, + { + "epoch": 1.2567138671875, + "grad_norm": 17.031658172607422, + "learning_rate": 7.760463284933603e-06, + "loss": 5.1559, + "step": 61770 + }, + { + "epoch": 1.2568155924479167, + "grad_norm": 18.403844833374023, + "learning_rate": 7.760130031945633e-06, + "loss": 4.9423, + "step": 61775 + }, + { + "epoch": 1.2569173177083333, + "grad_norm": 13.88180923461914, + "learning_rate": 7.75979676132131e-06, + "loss": 5.03, + "step": 61780 + }, + { + "epoch": 1.25701904296875, + "grad_norm": 15.191028594970703, + "learning_rate": 7.75946347306276e-06, + "loss": 5.0051, + "step": 61785 + }, + { + "epoch": 1.2571207682291667, + "grad_norm": 15.080812454223633, + "learning_rate": 7.759130167172114e-06, + "loss": 4.7232, + "step": 61790 + }, + { + "epoch": 1.2572224934895833, + "grad_norm": 14.484621047973633, + "learning_rate": 7.758796843651502e-06, + "loss": 4.8482, + "step": 61795 + }, + { + "epoch": 1.25732421875, + "grad_norm": 16.286258697509766, + "learning_rate": 7.758463502503056e-06, + "loss": 4.98, + "step": 61800 + }, + { + "epoch": 1.2574259440104167, + "grad_norm": 20.939638137817383, + "learning_rate": 7.758130143728902e-06, + "loss": 4.8281, + "step": 61805 + }, + { + "epoch": 1.2575276692708333, + "grad_norm": 15.28708267211914, + "learning_rate": 7.757796767331173e-06, + "loss": 4.7232, + "step": 61810 + }, + { + "epoch": 1.25762939453125, + "grad_norm": 19.033863067626953, + "learning_rate": 7.757463373311998e-06, + "loss": 4.9581, + "step": 61815 + }, + { + "epoch": 1.2577311197916667, + "grad_norm": 14.125205039978027, + "learning_rate": 7.757129961673507e-06, + "loss": 4.7463, + "step": 61820 + }, + { + "epoch": 1.2578328450520833, + "grad_norm": 15.380729675292969, + "learning_rate": 7.75679653241783e-06, + "loss": 4.9462, + "step": 61825 + }, + { + "epoch": 1.2579345703125, + "grad_norm": 20.92230224609375, + "learning_rate": 7.756463085547099e-06, + "loss": 4.796, + "step": 61830 + }, + { + "epoch": 1.2580362955729167, + "grad_norm": 14.58529281616211, + "learning_rate": 7.756129621063444e-06, + "loss": 4.7394, + "step": 61835 + }, + { + "epoch": 1.2581380208333333, + "grad_norm": 23.57917022705078, + "learning_rate": 7.755796138968996e-06, + "loss": 4.9615, + "step": 61840 + }, + { + "epoch": 1.25823974609375, + "grad_norm": 14.012657165527344, + "learning_rate": 7.755462639265886e-06, + "loss": 5.3275, + "step": 61845 + }, + { + "epoch": 1.2583414713541667, + "grad_norm": 17.378482818603516, + "learning_rate": 7.755129121956241e-06, + "loss": 4.7006, + "step": 61850 + }, + { + "epoch": 1.2584431966145833, + "grad_norm": 20.259769439697266, + "learning_rate": 7.754795587042198e-06, + "loss": 5.0333, + "step": 61855 + }, + { + "epoch": 1.258544921875, + "grad_norm": 14.41797161102295, + "learning_rate": 7.754462034525887e-06, + "loss": 4.7856, + "step": 61860 + }, + { + "epoch": 1.2586466471354167, + "grad_norm": 16.832063674926758, + "learning_rate": 7.754128464409436e-06, + "loss": 5.0858, + "step": 61865 + }, + { + "epoch": 1.2587483723958333, + "grad_norm": 17.11713981628418, + "learning_rate": 7.75379487669498e-06, + "loss": 5.3074, + "step": 61870 + }, + { + "epoch": 1.25885009765625, + "grad_norm": 17.505260467529297, + "learning_rate": 7.753461271384648e-06, + "loss": 4.8039, + "step": 61875 + }, + { + "epoch": 1.2589518229166667, + "grad_norm": 16.50591278076172, + "learning_rate": 7.753127648480572e-06, + "loss": 4.7416, + "step": 61880 + }, + { + "epoch": 1.2590535481770833, + "grad_norm": 17.322290420532227, + "learning_rate": 7.752794007984885e-06, + "loss": 5.0693, + "step": 61885 + }, + { + "epoch": 1.2591552734375, + "grad_norm": 24.75278663635254, + "learning_rate": 7.752460349899718e-06, + "loss": 4.8554, + "step": 61890 + }, + { + "epoch": 1.2592569986979167, + "grad_norm": 17.521743774414062, + "learning_rate": 7.752126674227202e-06, + "loss": 5.1111, + "step": 61895 + }, + { + "epoch": 1.2593587239583333, + "grad_norm": 13.3996000289917, + "learning_rate": 7.751792980969472e-06, + "loss": 5.0665, + "step": 61900 + }, + { + "epoch": 1.25946044921875, + "grad_norm": 19.551910400390625, + "learning_rate": 7.751459270128658e-06, + "loss": 4.9791, + "step": 61905 + }, + { + "epoch": 1.2595621744791667, + "grad_norm": 16.516511917114258, + "learning_rate": 7.751125541706892e-06, + "loss": 4.91, + "step": 61910 + }, + { + "epoch": 1.2596638997395833, + "grad_norm": 15.14561653137207, + "learning_rate": 7.750791795706307e-06, + "loss": 5.0053, + "step": 61915 + }, + { + "epoch": 1.259765625, + "grad_norm": 16.02497100830078, + "learning_rate": 7.750458032129037e-06, + "loss": 4.6582, + "step": 61920 + }, + { + "epoch": 1.2598673502604167, + "grad_norm": 17.957971572875977, + "learning_rate": 7.750124250977212e-06, + "loss": 5.0716, + "step": 61925 + }, + { + "epoch": 1.2599690755208333, + "grad_norm": 16.81985092163086, + "learning_rate": 7.749790452252967e-06, + "loss": 4.9071, + "step": 61930 + }, + { + "epoch": 1.26007080078125, + "grad_norm": 17.567161560058594, + "learning_rate": 7.749456635958433e-06, + "loss": 4.958, + "step": 61935 + }, + { + "epoch": 1.2601725260416667, + "grad_norm": 17.681987762451172, + "learning_rate": 7.749122802095744e-06, + "loss": 4.9205, + "step": 61940 + }, + { + "epoch": 1.2602742513020833, + "grad_norm": 18.995769500732422, + "learning_rate": 7.748788950667035e-06, + "loss": 4.991, + "step": 61945 + }, + { + "epoch": 1.2603759765625, + "grad_norm": 16.247474670410156, + "learning_rate": 7.748455081674433e-06, + "loss": 4.9378, + "step": 61950 + }, + { + "epoch": 1.2604777018229167, + "grad_norm": 14.072039604187012, + "learning_rate": 7.748121195120079e-06, + "loss": 4.8589, + "step": 61955 + }, + { + "epoch": 1.2605794270833333, + "grad_norm": 16.79129981994629, + "learning_rate": 7.7477872910061e-06, + "loss": 5.0623, + "step": 61960 + }, + { + "epoch": 1.26068115234375, + "grad_norm": 16.69533920288086, + "learning_rate": 7.747453369334637e-06, + "loss": 4.7187, + "step": 61965 + }, + { + "epoch": 1.2607828776041667, + "grad_norm": 21.694822311401367, + "learning_rate": 7.747119430107816e-06, + "loss": 4.8774, + "step": 61970 + }, + { + "epoch": 1.2608846028645833, + "grad_norm": 13.965642929077148, + "learning_rate": 7.746785473327773e-06, + "loss": 4.7867, + "step": 61975 + }, + { + "epoch": 1.260986328125, + "grad_norm": 22.031829833984375, + "learning_rate": 7.746451498996646e-06, + "loss": 4.9191, + "step": 61980 + }, + { + "epoch": 1.2610880533854167, + "grad_norm": 18.701152801513672, + "learning_rate": 7.746117507116562e-06, + "loss": 5.3396, + "step": 61985 + }, + { + "epoch": 1.2611897786458333, + "grad_norm": 13.815664291381836, + "learning_rate": 7.745783497689661e-06, + "loss": 4.9855, + "step": 61990 + }, + { + "epoch": 1.26129150390625, + "grad_norm": 14.61198902130127, + "learning_rate": 7.745449470718075e-06, + "loss": 4.8944, + "step": 61995 + }, + { + "epoch": 1.2613932291666667, + "grad_norm": 19.356548309326172, + "learning_rate": 7.745115426203939e-06, + "loss": 4.7834, + "step": 62000 + }, + { + "epoch": 1.2614949544270833, + "grad_norm": 17.16282081604004, + "learning_rate": 7.744781364149385e-06, + "loss": 5.0885, + "step": 62005 + }, + { + "epoch": 1.2615966796875, + "grad_norm": 14.060749053955078, + "learning_rate": 7.744447284556551e-06, + "loss": 4.9609, + "step": 62010 + }, + { + "epoch": 1.2616984049479167, + "grad_norm": 19.687618255615234, + "learning_rate": 7.74411318742757e-06, + "loss": 4.8165, + "step": 62015 + }, + { + "epoch": 1.2618001302083333, + "grad_norm": 20.26531410217285, + "learning_rate": 7.743779072764577e-06, + "loss": 4.9731, + "step": 62020 + }, + { + "epoch": 1.26190185546875, + "grad_norm": 17.183780670166016, + "learning_rate": 7.743444940569706e-06, + "loss": 5.0273, + "step": 62025 + }, + { + "epoch": 1.2620035807291667, + "grad_norm": 15.669096946716309, + "learning_rate": 7.743110790845092e-06, + "loss": 5.0395, + "step": 62030 + }, + { + "epoch": 1.2621053059895833, + "grad_norm": 14.825716018676758, + "learning_rate": 7.742776623592872e-06, + "loss": 4.8664, + "step": 62035 + }, + { + "epoch": 1.26220703125, + "grad_norm": 19.978002548217773, + "learning_rate": 7.742442438815178e-06, + "loss": 4.8862, + "step": 62040 + }, + { + "epoch": 1.2623087565104167, + "grad_norm": 15.873942375183105, + "learning_rate": 7.74210823651415e-06, + "loss": 4.8353, + "step": 62045 + }, + { + "epoch": 1.2624104817708333, + "grad_norm": 17.067852020263672, + "learning_rate": 7.74177401669192e-06, + "loss": 4.9986, + "step": 62050 + }, + { + "epoch": 1.26251220703125, + "grad_norm": 16.14996910095215, + "learning_rate": 7.741439779350624e-06, + "loss": 4.9107, + "step": 62055 + }, + { + "epoch": 1.2626139322916667, + "grad_norm": 14.066813468933105, + "learning_rate": 7.741105524492398e-06, + "loss": 5.1088, + "step": 62060 + }, + { + "epoch": 1.2627156575520833, + "grad_norm": 15.124977111816406, + "learning_rate": 7.740771252119378e-06, + "loss": 5.0447, + "step": 62065 + }, + { + "epoch": 1.2628173828125, + "grad_norm": 18.710559844970703, + "learning_rate": 7.7404369622337e-06, + "loss": 5.0746, + "step": 62070 + }, + { + "epoch": 1.2629191080729167, + "grad_norm": 21.8489990234375, + "learning_rate": 7.7401026548375e-06, + "loss": 4.9685, + "step": 62075 + }, + { + "epoch": 1.2630208333333333, + "grad_norm": 22.977310180664062, + "learning_rate": 7.739768329932913e-06, + "loss": 4.8555, + "step": 62080 + }, + { + "epoch": 1.26312255859375, + "grad_norm": 21.061574935913086, + "learning_rate": 7.739433987522079e-06, + "loss": 4.8434, + "step": 62085 + }, + { + "epoch": 1.2632242838541667, + "grad_norm": 13.371969223022461, + "learning_rate": 7.739099627607127e-06, + "loss": 4.9508, + "step": 62090 + }, + { + "epoch": 1.2633260091145833, + "grad_norm": 18.386932373046875, + "learning_rate": 7.7387652501902e-06, + "loss": 5.0076, + "step": 62095 + }, + { + "epoch": 1.263427734375, + "grad_norm": 15.366786003112793, + "learning_rate": 7.738430855273433e-06, + "loss": 4.8588, + "step": 62100 + }, + { + "epoch": 1.2635294596354167, + "grad_norm": 14.41849136352539, + "learning_rate": 7.738096442858961e-06, + "loss": 4.8202, + "step": 62105 + }, + { + "epoch": 1.2636311848958333, + "grad_norm": 17.554367065429688, + "learning_rate": 7.737762012948923e-06, + "loss": 4.7695, + "step": 62110 + }, + { + "epoch": 1.26373291015625, + "grad_norm": 17.945093154907227, + "learning_rate": 7.737427565545454e-06, + "loss": 5.2845, + "step": 62115 + }, + { + "epoch": 1.2638346354166667, + "grad_norm": 18.43836784362793, + "learning_rate": 7.737093100650692e-06, + "loss": 4.6974, + "step": 62120 + }, + { + "epoch": 1.2639363606770833, + "grad_norm": 18.207141876220703, + "learning_rate": 7.736758618266774e-06, + "loss": 4.9997, + "step": 62125 + }, + { + "epoch": 1.2640380859375, + "grad_norm": 15.53303050994873, + "learning_rate": 7.736424118395836e-06, + "loss": 5.0963, + "step": 62130 + }, + { + "epoch": 1.2641398111979167, + "grad_norm": 14.357666969299316, + "learning_rate": 7.736089601040019e-06, + "loss": 4.8867, + "step": 62135 + }, + { + "epoch": 1.2642415364583333, + "grad_norm": 13.29043960571289, + "learning_rate": 7.735755066201457e-06, + "loss": 4.8546, + "step": 62140 + }, + { + "epoch": 1.26434326171875, + "grad_norm": 16.67203712463379, + "learning_rate": 7.735420513882287e-06, + "loss": 5.0253, + "step": 62145 + }, + { + "epoch": 1.2644449869791667, + "grad_norm": 21.855525970458984, + "learning_rate": 7.73508594408465e-06, + "loss": 4.8676, + "step": 62150 + }, + { + "epoch": 1.2645467122395833, + "grad_norm": 18.54911994934082, + "learning_rate": 7.734751356810683e-06, + "loss": 4.9305, + "step": 62155 + }, + { + "epoch": 1.2646484375, + "grad_norm": 18.084678649902344, + "learning_rate": 7.734416752062521e-06, + "loss": 4.8107, + "step": 62160 + }, + { + "epoch": 1.2647501627604167, + "grad_norm": 22.81219482421875, + "learning_rate": 7.734082129842304e-06, + "loss": 4.9436, + "step": 62165 + }, + { + "epoch": 1.2648518880208333, + "grad_norm": 16.546579360961914, + "learning_rate": 7.73374749015217e-06, + "loss": 4.7049, + "step": 62170 + }, + { + "epoch": 1.26495361328125, + "grad_norm": 20.696102142333984, + "learning_rate": 7.73341283299426e-06, + "loss": 5.0889, + "step": 62175 + }, + { + "epoch": 1.2650553385416667, + "grad_norm": 13.679630279541016, + "learning_rate": 7.733078158370708e-06, + "loss": 5.2367, + "step": 62180 + }, + { + "epoch": 1.2651570638020833, + "grad_norm": 19.385000228881836, + "learning_rate": 7.732743466283654e-06, + "loss": 5.1994, + "step": 62185 + }, + { + "epoch": 1.2652587890625, + "grad_norm": 19.720932006835938, + "learning_rate": 7.732408756735236e-06, + "loss": 5.0186, + "step": 62190 + }, + { + "epoch": 1.2653605143229167, + "grad_norm": 21.461029052734375, + "learning_rate": 7.732074029727593e-06, + "loss": 4.9407, + "step": 62195 + }, + { + "epoch": 1.2654622395833333, + "grad_norm": 14.850868225097656, + "learning_rate": 7.731739285262866e-06, + "loss": 4.9076, + "step": 62200 + }, + { + "epoch": 1.26556396484375, + "grad_norm": 20.88100242614746, + "learning_rate": 7.73140452334319e-06, + "loss": 5.1807, + "step": 62205 + }, + { + "epoch": 1.2656656901041667, + "grad_norm": 19.235380172729492, + "learning_rate": 7.731069743970708e-06, + "loss": 4.8235, + "step": 62210 + }, + { + "epoch": 1.2657674153645833, + "grad_norm": 18.894784927368164, + "learning_rate": 7.730734947147558e-06, + "loss": 4.7867, + "step": 62215 + }, + { + "epoch": 1.265869140625, + "grad_norm": 18.3923282623291, + "learning_rate": 7.730400132875878e-06, + "loss": 4.9576, + "step": 62220 + }, + { + "epoch": 1.2659708658854167, + "grad_norm": 20.27290153503418, + "learning_rate": 7.730065301157806e-06, + "loss": 4.665, + "step": 62225 + }, + { + "epoch": 1.2660725911458333, + "grad_norm": 16.354724884033203, + "learning_rate": 7.729730451995485e-06, + "loss": 4.9108, + "step": 62230 + }, + { + "epoch": 1.26617431640625, + "grad_norm": 17.503318786621094, + "learning_rate": 7.729395585391054e-06, + "loss": 4.5878, + "step": 62235 + }, + { + "epoch": 1.2662760416666667, + "grad_norm": 19.567852020263672, + "learning_rate": 7.729060701346649e-06, + "loss": 5.1015, + "step": 62240 + }, + { + "epoch": 1.2663777669270833, + "grad_norm": 15.46180248260498, + "learning_rate": 7.728725799864414e-06, + "loss": 4.6293, + "step": 62245 + }, + { + "epoch": 1.2664794921875, + "grad_norm": 21.160816192626953, + "learning_rate": 7.728390880946488e-06, + "loss": 5.0875, + "step": 62250 + }, + { + "epoch": 1.2665812174479167, + "grad_norm": 23.113025665283203, + "learning_rate": 7.728055944595007e-06, + "loss": 4.7487, + "step": 62255 + }, + { + "epoch": 1.2666829427083333, + "grad_norm": 19.283279418945312, + "learning_rate": 7.727720990812118e-06, + "loss": 5.0111, + "step": 62260 + }, + { + "epoch": 1.26678466796875, + "grad_norm": 18.69537925720215, + "learning_rate": 7.727386019599956e-06, + "loss": 4.9608, + "step": 62265 + }, + { + "epoch": 1.2668863932291667, + "grad_norm": 16.80699920654297, + "learning_rate": 7.727051030960666e-06, + "loss": 5.1053, + "step": 62270 + }, + { + "epoch": 1.2669881184895833, + "grad_norm": 16.034563064575195, + "learning_rate": 7.72671602489638e-06, + "loss": 5.03, + "step": 62275 + }, + { + "epoch": 1.26708984375, + "grad_norm": 16.74744987487793, + "learning_rate": 7.72638100140925e-06, + "loss": 4.9084, + "step": 62280 + }, + { + "epoch": 1.2671915690104167, + "grad_norm": 16.239261627197266, + "learning_rate": 7.72604596050141e-06, + "loss": 5.1101, + "step": 62285 + }, + { + "epoch": 1.2672932942708333, + "grad_norm": 24.290359497070312, + "learning_rate": 7.725710902174999e-06, + "loss": 4.9064, + "step": 62290 + }, + { + "epoch": 1.26739501953125, + "grad_norm": 16.37493133544922, + "learning_rate": 7.725375826432163e-06, + "loss": 5.0731, + "step": 62295 + }, + { + "epoch": 1.2674967447916667, + "grad_norm": 16.025632858276367, + "learning_rate": 7.725040733275038e-06, + "loss": 4.9757, + "step": 62300 + }, + { + "epoch": 1.2675984700520833, + "grad_norm": 19.48078727722168, + "learning_rate": 7.724705622705768e-06, + "loss": 4.9764, + "step": 62305 + }, + { + "epoch": 1.2677001953125, + "grad_norm": 18.16435432434082, + "learning_rate": 7.724370494726496e-06, + "loss": 4.687, + "step": 62310 + }, + { + "epoch": 1.2678019205729167, + "grad_norm": 19.284818649291992, + "learning_rate": 7.72403534933936e-06, + "loss": 4.6442, + "step": 62315 + }, + { + "epoch": 1.2679036458333333, + "grad_norm": 14.98302936553955, + "learning_rate": 7.723700186546503e-06, + "loss": 4.9842, + "step": 62320 + }, + { + "epoch": 1.26800537109375, + "grad_norm": 19.079544067382812, + "learning_rate": 7.723365006350066e-06, + "loss": 4.657, + "step": 62325 + }, + { + "epoch": 1.2681070963541667, + "grad_norm": 21.58732795715332, + "learning_rate": 7.723029808752192e-06, + "loss": 4.9963, + "step": 62330 + }, + { + "epoch": 1.2682088216145833, + "grad_norm": 17.939350128173828, + "learning_rate": 7.722694593755022e-06, + "loss": 5.0479, + "step": 62335 + }, + { + "epoch": 1.268310546875, + "grad_norm": 20.350828170776367, + "learning_rate": 7.722359361360698e-06, + "loss": 5.0369, + "step": 62340 + }, + { + "epoch": 1.2684122721354167, + "grad_norm": 16.84966468811035, + "learning_rate": 7.722024111571362e-06, + "loss": 4.7893, + "step": 62345 + }, + { + "epoch": 1.2685139973958333, + "grad_norm": 17.748497009277344, + "learning_rate": 7.721688844389154e-06, + "loss": 5.063, + "step": 62350 + }, + { + "epoch": 1.26861572265625, + "grad_norm": 20.634185791015625, + "learning_rate": 7.721353559816221e-06, + "loss": 4.9793, + "step": 62355 + }, + { + "epoch": 1.2687174479166667, + "grad_norm": 18.390710830688477, + "learning_rate": 7.721018257854702e-06, + "loss": 4.7974, + "step": 62360 + }, + { + "epoch": 1.2688191731770833, + "grad_norm": 17.601667404174805, + "learning_rate": 7.72068293850674e-06, + "loss": 5.0401, + "step": 62365 + }, + { + "epoch": 1.2689208984375, + "grad_norm": 16.022903442382812, + "learning_rate": 7.720347601774476e-06, + "loss": 5.111, + "step": 62370 + }, + { + "epoch": 1.2690226236979167, + "grad_norm": 20.695470809936523, + "learning_rate": 7.720012247660055e-06, + "loss": 5.034, + "step": 62375 + }, + { + "epoch": 1.2691243489583333, + "grad_norm": 16.099058151245117, + "learning_rate": 7.71967687616562e-06, + "loss": 4.9776, + "step": 62380 + }, + { + "epoch": 1.26922607421875, + "grad_norm": 15.068683624267578, + "learning_rate": 7.719341487293314e-06, + "loss": 4.9288, + "step": 62385 + }, + { + "epoch": 1.2693277994791667, + "grad_norm": 14.922411918640137, + "learning_rate": 7.719006081045278e-06, + "loss": 4.7352, + "step": 62390 + }, + { + "epoch": 1.2694295247395833, + "grad_norm": 16.61612892150879, + "learning_rate": 7.718670657423655e-06, + "loss": 5.0454, + "step": 62395 + }, + { + "epoch": 1.26953125, + "grad_norm": 20.45032501220703, + "learning_rate": 7.718335216430592e-06, + "loss": 5.0922, + "step": 62400 + }, + { + "epoch": 1.2696329752604167, + "grad_norm": 18.531686782836914, + "learning_rate": 7.717999758068228e-06, + "loss": 4.7616, + "step": 62405 + }, + { + "epoch": 1.2697347005208333, + "grad_norm": 16.862258911132812, + "learning_rate": 7.717664282338711e-06, + "loss": 5.0176, + "step": 62410 + }, + { + "epoch": 1.26983642578125, + "grad_norm": 13.968123435974121, + "learning_rate": 7.717328789244178e-06, + "loss": 5.1578, + "step": 62415 + }, + { + "epoch": 1.2699381510416667, + "grad_norm": 15.94645881652832, + "learning_rate": 7.71699327878678e-06, + "loss": 5.0577, + "step": 62420 + }, + { + "epoch": 1.2700398763020833, + "grad_norm": 20.53711700439453, + "learning_rate": 7.716657750968656e-06, + "loss": 5.0222, + "step": 62425 + }, + { + "epoch": 1.2701416015625, + "grad_norm": 18.21357536315918, + "learning_rate": 7.716322205791951e-06, + "loss": 4.7699, + "step": 62430 + }, + { + "epoch": 1.2702433268229167, + "grad_norm": 18.586170196533203, + "learning_rate": 7.715986643258811e-06, + "loss": 5.186, + "step": 62435 + }, + { + "epoch": 1.2703450520833333, + "grad_norm": 18.48337173461914, + "learning_rate": 7.715651063371377e-06, + "loss": 4.7427, + "step": 62440 + }, + { + "epoch": 1.27044677734375, + "grad_norm": 12.799881935119629, + "learning_rate": 7.715315466131794e-06, + "loss": 4.956, + "step": 62445 + }, + { + "epoch": 1.2705485026041667, + "grad_norm": 15.829934120178223, + "learning_rate": 7.714979851542207e-06, + "loss": 4.8119, + "step": 62450 + }, + { + "epoch": 1.2706502278645833, + "grad_norm": 15.77665901184082, + "learning_rate": 7.714644219604763e-06, + "loss": 4.9201, + "step": 62455 + }, + { + "epoch": 1.270751953125, + "grad_norm": 16.2459659576416, + "learning_rate": 7.714308570321602e-06, + "loss": 4.7602, + "step": 62460 + }, + { + "epoch": 1.2708536783854167, + "grad_norm": 16.215242385864258, + "learning_rate": 7.713972903694873e-06, + "loss": 5.0447, + "step": 62465 + }, + { + "epoch": 1.2709554036458333, + "grad_norm": 14.959039688110352, + "learning_rate": 7.713637219726718e-06, + "loss": 4.7574, + "step": 62470 + }, + { + "epoch": 1.27105712890625, + "grad_norm": 14.244551658630371, + "learning_rate": 7.713301518419282e-06, + "loss": 4.9955, + "step": 62475 + }, + { + "epoch": 1.2711588541666667, + "grad_norm": 17.302820205688477, + "learning_rate": 7.71296579977471e-06, + "loss": 5.0162, + "step": 62480 + }, + { + "epoch": 1.2712605794270833, + "grad_norm": 13.119850158691406, + "learning_rate": 7.712630063795147e-06, + "loss": 4.9835, + "step": 62485 + }, + { + "epoch": 1.2713623046875, + "grad_norm": 22.873981475830078, + "learning_rate": 7.712294310482742e-06, + "loss": 5.0827, + "step": 62490 + }, + { + "epoch": 1.2714640299479167, + "grad_norm": 14.118870735168457, + "learning_rate": 7.711958539839635e-06, + "loss": 4.7848, + "step": 62495 + }, + { + "epoch": 1.2715657552083333, + "grad_norm": 22.15793800354004, + "learning_rate": 7.711622751867975e-06, + "loss": 5.0609, + "step": 62500 + }, + { + "epoch": 1.27166748046875, + "grad_norm": 11.582454681396484, + "learning_rate": 7.711286946569907e-06, + "loss": 4.93, + "step": 62505 + }, + { + "epoch": 1.2717692057291667, + "grad_norm": 12.433496475219727, + "learning_rate": 7.710951123947575e-06, + "loss": 5.0073, + "step": 62510 + }, + { + "epoch": 1.2718709309895833, + "grad_norm": 18.070302963256836, + "learning_rate": 7.710615284003127e-06, + "loss": 5.2781, + "step": 62515 + }, + { + "epoch": 1.27197265625, + "grad_norm": 17.596237182617188, + "learning_rate": 7.710279426738706e-06, + "loss": 4.914, + "step": 62520 + }, + { + "epoch": 1.2720743815104167, + "grad_norm": 21.74873924255371, + "learning_rate": 7.70994355215646e-06, + "loss": 5.4083, + "step": 62525 + }, + { + "epoch": 1.2721761067708333, + "grad_norm": 17.713998794555664, + "learning_rate": 7.709607660258536e-06, + "loss": 4.8324, + "step": 62530 + }, + { + "epoch": 1.27227783203125, + "grad_norm": 16.716136932373047, + "learning_rate": 7.709271751047079e-06, + "loss": 4.8444, + "step": 62535 + }, + { + "epoch": 1.2723795572916667, + "grad_norm": 17.648706436157227, + "learning_rate": 7.708935824524235e-06, + "loss": 4.7477, + "step": 62540 + }, + { + "epoch": 1.2724812825520833, + "grad_norm": 15.974311828613281, + "learning_rate": 7.708599880692151e-06, + "loss": 4.8989, + "step": 62545 + }, + { + "epoch": 1.2725830078125, + "grad_norm": 20.39532470703125, + "learning_rate": 7.708263919552973e-06, + "loss": 5.2463, + "step": 62550 + }, + { + "epoch": 1.2726847330729167, + "grad_norm": 17.682846069335938, + "learning_rate": 7.707927941108851e-06, + "loss": 4.8621, + "step": 62555 + }, + { + "epoch": 1.2727864583333333, + "grad_norm": 22.214933395385742, + "learning_rate": 7.707591945361928e-06, + "loss": 4.9291, + "step": 62560 + }, + { + "epoch": 1.27288818359375, + "grad_norm": 15.528410911560059, + "learning_rate": 7.70725593231435e-06, + "loss": 5.189, + "step": 62565 + }, + { + "epoch": 1.2729899088541667, + "grad_norm": 15.380434036254883, + "learning_rate": 7.706919901968268e-06, + "loss": 4.6822, + "step": 62570 + }, + { + "epoch": 1.2730916341145833, + "grad_norm": 16.481489181518555, + "learning_rate": 7.706583854325827e-06, + "loss": 4.9186, + "step": 62575 + }, + { + "epoch": 1.273193359375, + "grad_norm": 20.681177139282227, + "learning_rate": 7.706247789389173e-06, + "loss": 5.0148, + "step": 62580 + }, + { + "epoch": 1.2732950846354167, + "grad_norm": 23.74420928955078, + "learning_rate": 7.705911707160456e-06, + "loss": 5.313, + "step": 62585 + }, + { + "epoch": 1.2733968098958333, + "grad_norm": 12.781412124633789, + "learning_rate": 7.70557560764182e-06, + "loss": 5.009, + "step": 62590 + }, + { + "epoch": 1.27349853515625, + "grad_norm": 26.682594299316406, + "learning_rate": 7.705239490835417e-06, + "loss": 4.9106, + "step": 62595 + }, + { + "epoch": 1.2736002604166667, + "grad_norm": 20.296232223510742, + "learning_rate": 7.704903356743392e-06, + "loss": 4.8428, + "step": 62600 + }, + { + "epoch": 1.2737019856770833, + "grad_norm": 16.11696434020996, + "learning_rate": 7.704567205367894e-06, + "loss": 5.0927, + "step": 62605 + }, + { + "epoch": 1.2738037109375, + "grad_norm": 18.326675415039062, + "learning_rate": 7.704231036711067e-06, + "loss": 4.6537, + "step": 62610 + }, + { + "epoch": 1.2739054361979167, + "grad_norm": 17.30082130432129, + "learning_rate": 7.703894850775064e-06, + "loss": 4.9296, + "step": 62615 + }, + { + "epoch": 1.2740071614583333, + "grad_norm": 19.14491081237793, + "learning_rate": 7.703558647562033e-06, + "loss": 5.0479, + "step": 62620 + }, + { + "epoch": 1.27410888671875, + "grad_norm": 14.977700233459473, + "learning_rate": 7.703222427074118e-06, + "loss": 4.8557, + "step": 62625 + }, + { + "epoch": 1.2742106119791667, + "grad_norm": 17.987001419067383, + "learning_rate": 7.702886189313472e-06, + "loss": 4.9182, + "step": 62630 + }, + { + "epoch": 1.2743123372395833, + "grad_norm": 16.957670211791992, + "learning_rate": 7.70254993428224e-06, + "loss": 4.8271, + "step": 62635 + }, + { + "epoch": 1.2744140625, + "grad_norm": 27.30196762084961, + "learning_rate": 7.702213661982572e-06, + "loss": 4.8767, + "step": 62640 + }, + { + "epoch": 1.2745157877604167, + "grad_norm": 15.582996368408203, + "learning_rate": 7.701877372416614e-06, + "loss": 4.887, + "step": 62645 + }, + { + "epoch": 1.2746175130208333, + "grad_norm": 20.126140594482422, + "learning_rate": 7.701541065586521e-06, + "loss": 4.836, + "step": 62650 + }, + { + "epoch": 1.27471923828125, + "grad_norm": 12.091750144958496, + "learning_rate": 7.701204741494437e-06, + "loss": 4.7191, + "step": 62655 + }, + { + "epoch": 1.2748209635416667, + "grad_norm": 19.462759017944336, + "learning_rate": 7.700868400142512e-06, + "loss": 5.0664, + "step": 62660 + }, + { + "epoch": 1.2749226888020833, + "grad_norm": 18.173015594482422, + "learning_rate": 7.700532041532895e-06, + "loss": 5.0528, + "step": 62665 + }, + { + "epoch": 1.2750244140625, + "grad_norm": 13.260177612304688, + "learning_rate": 7.700195665667736e-06, + "loss": 4.8779, + "step": 62670 + }, + { + "epoch": 1.2751261393229167, + "grad_norm": 21.585376739501953, + "learning_rate": 7.699859272549182e-06, + "loss": 4.8472, + "step": 62675 + }, + { + "epoch": 1.2752278645833333, + "grad_norm": 20.232349395751953, + "learning_rate": 7.699522862179387e-06, + "loss": 4.9741, + "step": 62680 + }, + { + "epoch": 1.27532958984375, + "grad_norm": 11.512125015258789, + "learning_rate": 7.699186434560497e-06, + "loss": 4.7568, + "step": 62685 + }, + { + "epoch": 1.2754313151041667, + "grad_norm": 19.22349739074707, + "learning_rate": 7.698849989694662e-06, + "loss": 4.8887, + "step": 62690 + }, + { + "epoch": 1.2755330403645833, + "grad_norm": 16.117164611816406, + "learning_rate": 7.698513527584032e-06, + "loss": 4.6575, + "step": 62695 + }, + { + "epoch": 1.275634765625, + "grad_norm": 17.686052322387695, + "learning_rate": 7.698177048230756e-06, + "loss": 5.1156, + "step": 62700 + }, + { + "epoch": 1.2757364908854167, + "grad_norm": 16.520687103271484, + "learning_rate": 7.697840551636989e-06, + "loss": 4.9737, + "step": 62705 + }, + { + "epoch": 1.2758382161458333, + "grad_norm": 18.760225296020508, + "learning_rate": 7.697504037804874e-06, + "loss": 4.7797, + "step": 62710 + }, + { + "epoch": 1.27593994140625, + "grad_norm": 19.527406692504883, + "learning_rate": 7.697167506736567e-06, + "loss": 4.8712, + "step": 62715 + }, + { + "epoch": 1.2760416666666667, + "grad_norm": 15.454957008361816, + "learning_rate": 7.696830958434213e-06, + "loss": 5.028, + "step": 62720 + }, + { + "epoch": 1.2761433919270833, + "grad_norm": 19.139616012573242, + "learning_rate": 7.69649439289997e-06, + "loss": 4.8657, + "step": 62725 + }, + { + "epoch": 1.2762451171875, + "grad_norm": 17.56804656982422, + "learning_rate": 7.696157810135979e-06, + "loss": 4.6662, + "step": 62730 + }, + { + "epoch": 1.2763468424479167, + "grad_norm": 16.47400665283203, + "learning_rate": 7.695821210144398e-06, + "loss": 4.6876, + "step": 62735 + }, + { + "epoch": 1.2764485677083333, + "grad_norm": 17.14927864074707, + "learning_rate": 7.695484592927373e-06, + "loss": 5.2491, + "step": 62740 + }, + { + "epoch": 1.27655029296875, + "grad_norm": 17.111095428466797, + "learning_rate": 7.695147958487059e-06, + "loss": 4.9026, + "step": 62745 + }, + { + "epoch": 1.2766520182291667, + "grad_norm": 18.79044532775879, + "learning_rate": 7.694811306825604e-06, + "loss": 5.0154, + "step": 62750 + }, + { + "epoch": 1.2767537434895833, + "grad_norm": 25.346803665161133, + "learning_rate": 7.69447463794516e-06, + "loss": 4.6919, + "step": 62755 + }, + { + "epoch": 1.27685546875, + "grad_norm": 22.567184448242188, + "learning_rate": 7.69413795184788e-06, + "loss": 4.9802, + "step": 62760 + }, + { + "epoch": 1.2769571940104167, + "grad_norm": 19.00103187561035, + "learning_rate": 7.693801248535913e-06, + "loss": 4.9539, + "step": 62765 + }, + { + "epoch": 1.2770589192708333, + "grad_norm": 15.163225173950195, + "learning_rate": 7.69346452801141e-06, + "loss": 5.2123, + "step": 62770 + }, + { + "epoch": 1.27716064453125, + "grad_norm": 18.35973358154297, + "learning_rate": 7.693127790276525e-06, + "loss": 5.1314, + "step": 62775 + }, + { + "epoch": 1.2772623697916667, + "grad_norm": 14.587059020996094, + "learning_rate": 7.692791035333407e-06, + "loss": 4.925, + "step": 62780 + }, + { + "epoch": 1.2773640950520833, + "grad_norm": 16.623092651367188, + "learning_rate": 7.692454263184211e-06, + "loss": 4.7555, + "step": 62785 + }, + { + "epoch": 1.2774658203125, + "grad_norm": 17.75309944152832, + "learning_rate": 7.692117473831085e-06, + "loss": 4.752, + "step": 62790 + }, + { + "epoch": 1.2775675455729167, + "grad_norm": 18.10736083984375, + "learning_rate": 7.691780667276184e-06, + "loss": 5.0101, + "step": 62795 + }, + { + "epoch": 1.2776692708333333, + "grad_norm": 23.47715187072754, + "learning_rate": 7.69144384352166e-06, + "loss": 5.0189, + "step": 62800 + }, + { + "epoch": 1.27777099609375, + "grad_norm": 15.667634963989258, + "learning_rate": 7.691107002569663e-06, + "loss": 5.0456, + "step": 62805 + }, + { + "epoch": 1.2778727213541667, + "grad_norm": 16.281070709228516, + "learning_rate": 7.690770144422347e-06, + "loss": 4.9757, + "step": 62810 + }, + { + "epoch": 1.2779744466145833, + "grad_norm": 16.679567337036133, + "learning_rate": 7.690433269081864e-06, + "loss": 5.0746, + "step": 62815 + }, + { + "epoch": 1.278076171875, + "grad_norm": 20.84222984313965, + "learning_rate": 7.690096376550367e-06, + "loss": 4.9375, + "step": 62820 + }, + { + "epoch": 1.2781778971354167, + "grad_norm": 21.191198348999023, + "learning_rate": 7.689759466830006e-06, + "loss": 4.9305, + "step": 62825 + }, + { + "epoch": 1.2782796223958333, + "grad_norm": 19.482177734375, + "learning_rate": 7.689422539922937e-06, + "loss": 5.1741, + "step": 62830 + }, + { + "epoch": 1.27838134765625, + "grad_norm": 18.581260681152344, + "learning_rate": 7.689085595831313e-06, + "loss": 4.9409, + "step": 62835 + }, + { + "epoch": 1.2784830729166667, + "grad_norm": 19.83069610595703, + "learning_rate": 7.688748634557283e-06, + "loss": 5.0079, + "step": 62840 + }, + { + "epoch": 1.2785847981770833, + "grad_norm": 20.422855377197266, + "learning_rate": 7.688411656103006e-06, + "loss": 4.9923, + "step": 62845 + }, + { + "epoch": 1.2786865234375, + "grad_norm": 18.72713279724121, + "learning_rate": 7.68807466047063e-06, + "loss": 4.7866, + "step": 62850 + }, + { + "epoch": 1.2787882486979167, + "grad_norm": 16.152847290039062, + "learning_rate": 7.68773764766231e-06, + "loss": 4.8525, + "step": 62855 + }, + { + "epoch": 1.2788899739583333, + "grad_norm": 18.958467483520508, + "learning_rate": 7.6874006176802e-06, + "loss": 5.0019, + "step": 62860 + }, + { + "epoch": 1.27899169921875, + "grad_norm": 17.871978759765625, + "learning_rate": 7.687063570526453e-06, + "loss": 5.1948, + "step": 62865 + }, + { + "epoch": 1.2790934244791667, + "grad_norm": 17.696340560913086, + "learning_rate": 7.686726506203223e-06, + "loss": 5.0926, + "step": 62870 + }, + { + "epoch": 1.2791951497395833, + "grad_norm": 16.575464248657227, + "learning_rate": 7.686389424712663e-06, + "loss": 4.9719, + "step": 62875 + }, + { + "epoch": 1.279296875, + "grad_norm": 14.900771141052246, + "learning_rate": 7.686052326056928e-06, + "loss": 5.0168, + "step": 62880 + }, + { + "epoch": 1.2793986002604167, + "grad_norm": 14.971451759338379, + "learning_rate": 7.685715210238171e-06, + "loss": 4.7112, + "step": 62885 + }, + { + "epoch": 1.2795003255208333, + "grad_norm": 23.24203872680664, + "learning_rate": 7.685378077258547e-06, + "loss": 4.9727, + "step": 62890 + }, + { + "epoch": 1.27960205078125, + "grad_norm": 17.526994705200195, + "learning_rate": 7.685040927120208e-06, + "loss": 4.6792, + "step": 62895 + }, + { + "epoch": 1.2797037760416667, + "grad_norm": 20.353408813476562, + "learning_rate": 7.684703759825311e-06, + "loss": 4.9145, + "step": 62900 + }, + { + "epoch": 1.2798055013020833, + "grad_norm": 18.41188621520996, + "learning_rate": 7.68436657537601e-06, + "loss": 4.9465, + "step": 62905 + }, + { + "epoch": 1.2799072265625, + "grad_norm": 16.753929138183594, + "learning_rate": 7.684029373774458e-06, + "loss": 4.8886, + "step": 62910 + }, + { + "epoch": 1.2800089518229167, + "grad_norm": 19.898723602294922, + "learning_rate": 7.68369215502281e-06, + "loss": 5.0202, + "step": 62915 + }, + { + "epoch": 1.2801106770833333, + "grad_norm": 19.170063018798828, + "learning_rate": 7.683354919123221e-06, + "loss": 5.1774, + "step": 62920 + }, + { + "epoch": 1.28021240234375, + "grad_norm": 18.014785766601562, + "learning_rate": 7.683017666077845e-06, + "loss": 4.8982, + "step": 62925 + }, + { + "epoch": 1.2803141276041667, + "grad_norm": 20.912139892578125, + "learning_rate": 7.68268039588884e-06, + "loss": 4.9442, + "step": 62930 + }, + { + "epoch": 1.2804158528645833, + "grad_norm": 25.958757400512695, + "learning_rate": 7.682343108558357e-06, + "loss": 4.9354, + "step": 62935 + }, + { + "epoch": 1.280517578125, + "grad_norm": 14.841263771057129, + "learning_rate": 7.682005804088555e-06, + "loss": 4.7038, + "step": 62940 + }, + { + "epoch": 1.2806193033854167, + "grad_norm": 19.020795822143555, + "learning_rate": 7.681668482481585e-06, + "loss": 4.9772, + "step": 62945 + }, + { + "epoch": 1.2807210286458333, + "grad_norm": 19.271677017211914, + "learning_rate": 7.681331143739606e-06, + "loss": 5.1418, + "step": 62950 + }, + { + "epoch": 1.28082275390625, + "grad_norm": 17.85433578491211, + "learning_rate": 7.680993787864773e-06, + "loss": 4.99, + "step": 62955 + }, + { + "epoch": 1.2809244791666667, + "grad_norm": 16.340852737426758, + "learning_rate": 7.680656414859238e-06, + "loss": 4.8811, + "step": 62960 + }, + { + "epoch": 1.2810262044270833, + "grad_norm": 15.038981437683105, + "learning_rate": 7.680319024725163e-06, + "loss": 4.8197, + "step": 62965 + }, + { + "epoch": 1.2811279296875, + "grad_norm": 37.769935607910156, + "learning_rate": 7.679981617464698e-06, + "loss": 4.9042, + "step": 62970 + }, + { + "epoch": 1.2812296549479167, + "grad_norm": 16.39055061340332, + "learning_rate": 7.67964419308e-06, + "loss": 5.0066, + "step": 62975 + }, + { + "epoch": 1.2813313802083333, + "grad_norm": 18.154870986938477, + "learning_rate": 7.67930675157323e-06, + "loss": 4.8307, + "step": 62980 + }, + { + "epoch": 1.28143310546875, + "grad_norm": 18.48917579650879, + "learning_rate": 7.678969292946537e-06, + "loss": 5.0952, + "step": 62985 + }, + { + "epoch": 1.2815348307291667, + "grad_norm": 22.427392959594727, + "learning_rate": 7.678631817202083e-06, + "loss": 5.0573, + "step": 62990 + }, + { + "epoch": 1.2816365559895833, + "grad_norm": 15.68436336517334, + "learning_rate": 7.67829432434202e-06, + "loss": 5.0734, + "step": 62995 + }, + { + "epoch": 1.28173828125, + "grad_norm": 15.959607124328613, + "learning_rate": 7.677956814368507e-06, + "loss": 4.9821, + "step": 63000 + }, + { + "epoch": 1.2818400065104167, + "grad_norm": 16.810163497924805, + "learning_rate": 7.6776192872837e-06, + "loss": 4.9744, + "step": 63005 + }, + { + "epoch": 1.2819417317708333, + "grad_norm": 13.727181434631348, + "learning_rate": 7.677281743089756e-06, + "loss": 5.2938, + "step": 63010 + }, + { + "epoch": 1.28204345703125, + "grad_norm": 18.191232681274414, + "learning_rate": 7.676944181788832e-06, + "loss": 4.9624, + "step": 63015 + }, + { + "epoch": 1.2821451822916667, + "grad_norm": 15.648002624511719, + "learning_rate": 7.676606603383083e-06, + "loss": 5.0047, + "step": 63020 + }, + { + "epoch": 1.2822469075520833, + "grad_norm": 17.082931518554688, + "learning_rate": 7.67626900787467e-06, + "loss": 5.0755, + "step": 63025 + }, + { + "epoch": 1.2823486328125, + "grad_norm": 19.378768920898438, + "learning_rate": 7.675931395265745e-06, + "loss": 4.9195, + "step": 63030 + }, + { + "epoch": 1.2824503580729167, + "grad_norm": 12.861471176147461, + "learning_rate": 7.67559376555847e-06, + "loss": 5.0477, + "step": 63035 + }, + { + "epoch": 1.2825520833333333, + "grad_norm": 20.194774627685547, + "learning_rate": 7.675256118754999e-06, + "loss": 4.9355, + "step": 63040 + }, + { + "epoch": 1.28265380859375, + "grad_norm": 16.989885330200195, + "learning_rate": 7.674918454857491e-06, + "loss": 4.8226, + "step": 63045 + }, + { + "epoch": 1.2827555338541667, + "grad_norm": 14.681517601013184, + "learning_rate": 7.674580773868103e-06, + "loss": 4.9378, + "step": 63050 + }, + { + "epoch": 1.2828572591145833, + "grad_norm": 18.790882110595703, + "learning_rate": 7.674243075788992e-06, + "loss": 4.8971, + "step": 63055 + }, + { + "epoch": 1.282958984375, + "grad_norm": 26.319866180419922, + "learning_rate": 7.673905360622317e-06, + "loss": 5.1115, + "step": 63060 + }, + { + "epoch": 1.2830607096354167, + "grad_norm": 18.826555252075195, + "learning_rate": 7.673567628370237e-06, + "loss": 4.7443, + "step": 63065 + }, + { + "epoch": 1.2831624348958333, + "grad_norm": 15.16531753540039, + "learning_rate": 7.67322987903491e-06, + "loss": 4.8346, + "step": 63070 + }, + { + "epoch": 1.28326416015625, + "grad_norm": 23.57696533203125, + "learning_rate": 7.672892112618489e-06, + "loss": 5.3313, + "step": 63075 + }, + { + "epoch": 1.2833658854166667, + "grad_norm": 18.918569564819336, + "learning_rate": 7.672554329123136e-06, + "loss": 5.0475, + "step": 63080 + }, + { + "epoch": 1.2834676106770833, + "grad_norm": 16.602075576782227, + "learning_rate": 7.672216528551012e-06, + "loss": 4.8668, + "step": 63085 + }, + { + "epoch": 1.2835693359375, + "grad_norm": 21.819408416748047, + "learning_rate": 7.67187871090427e-06, + "loss": 4.7361, + "step": 63090 + }, + { + "epoch": 1.2836710611979167, + "grad_norm": 16.895465850830078, + "learning_rate": 7.671540876185072e-06, + "loss": 5.0907, + "step": 63095 + }, + { + "epoch": 1.2837727864583333, + "grad_norm": 17.615447998046875, + "learning_rate": 7.671203024395577e-06, + "loss": 4.8515, + "step": 63100 + }, + { + "epoch": 1.28387451171875, + "grad_norm": 16.25126075744629, + "learning_rate": 7.670865155537942e-06, + "loss": 5.1376, + "step": 63105 + }, + { + "epoch": 1.2839762369791667, + "grad_norm": 18.40363883972168, + "learning_rate": 7.670527269614327e-06, + "loss": 4.9937, + "step": 63110 + }, + { + "epoch": 1.2840779622395833, + "grad_norm": 15.347865104675293, + "learning_rate": 7.67018936662689e-06, + "loss": 4.8899, + "step": 63115 + }, + { + "epoch": 1.2841796875, + "grad_norm": 18.32297706604004, + "learning_rate": 7.66985144657779e-06, + "loss": 5.2051, + "step": 63120 + }, + { + "epoch": 1.2842814127604167, + "grad_norm": 19.74745750427246, + "learning_rate": 7.669513509469188e-06, + "loss": 5.1687, + "step": 63125 + }, + { + "epoch": 1.2843831380208333, + "grad_norm": 14.171048164367676, + "learning_rate": 7.669175555303241e-06, + "loss": 4.8155, + "step": 63130 + }, + { + "epoch": 1.28448486328125, + "grad_norm": 19.369211196899414, + "learning_rate": 7.668837584082109e-06, + "loss": 5.0503, + "step": 63135 + }, + { + "epoch": 1.2845865885416667, + "grad_norm": 16.289663314819336, + "learning_rate": 7.668499595807954e-06, + "loss": 4.9131, + "step": 63140 + }, + { + "epoch": 1.2846883138020833, + "grad_norm": 23.89605712890625, + "learning_rate": 7.668161590482933e-06, + "loss": 5.0112, + "step": 63145 + }, + { + "epoch": 1.2847900390625, + "grad_norm": 14.908425331115723, + "learning_rate": 7.667823568109205e-06, + "loss": 4.6499, + "step": 63150 + }, + { + "epoch": 1.2848917643229167, + "grad_norm": 19.1927490234375, + "learning_rate": 7.667485528688932e-06, + "loss": 4.9979, + "step": 63155 + }, + { + "epoch": 1.2849934895833333, + "grad_norm": 18.411399841308594, + "learning_rate": 7.667147472224273e-06, + "loss": 4.9555, + "step": 63160 + }, + { + "epoch": 1.28509521484375, + "grad_norm": 17.24689292907715, + "learning_rate": 7.666809398717389e-06, + "loss": 4.927, + "step": 63165 + }, + { + "epoch": 1.2851969401041667, + "grad_norm": 13.863259315490723, + "learning_rate": 7.666471308170438e-06, + "loss": 5.0032, + "step": 63170 + }, + { + "epoch": 1.2852986653645833, + "grad_norm": 14.84496784210205, + "learning_rate": 7.666133200585584e-06, + "loss": 5.0692, + "step": 63175 + }, + { + "epoch": 1.285400390625, + "grad_norm": 26.2303524017334, + "learning_rate": 7.665795075964983e-06, + "loss": 4.9858, + "step": 63180 + }, + { + "epoch": 1.2855021158854167, + "grad_norm": 17.59271240234375, + "learning_rate": 7.665456934310801e-06, + "loss": 4.9356, + "step": 63185 + }, + { + "epoch": 1.2856038411458333, + "grad_norm": 18.72928237915039, + "learning_rate": 7.665118775625193e-06, + "loss": 4.9629, + "step": 63190 + }, + { + "epoch": 1.28570556640625, + "grad_norm": 18.778148651123047, + "learning_rate": 7.664780599910323e-06, + "loss": 5.4067, + "step": 63195 + }, + { + "epoch": 1.2858072916666667, + "grad_norm": 21.840452194213867, + "learning_rate": 7.66444240716835e-06, + "loss": 5.0959, + "step": 63200 + }, + { + "epoch": 1.2859090169270833, + "grad_norm": 21.706501007080078, + "learning_rate": 7.664104197401436e-06, + "loss": 5.0538, + "step": 63205 + }, + { + "epoch": 1.2860107421875, + "grad_norm": 19.023786544799805, + "learning_rate": 7.663765970611741e-06, + "loss": 4.8082, + "step": 63210 + }, + { + "epoch": 1.2861124674479167, + "grad_norm": 21.29167366027832, + "learning_rate": 7.663427726801429e-06, + "loss": 4.8562, + "step": 63215 + }, + { + "epoch": 1.2862141927083333, + "grad_norm": 16.887340545654297, + "learning_rate": 7.663089465972658e-06, + "loss": 5.0808, + "step": 63220 + }, + { + "epoch": 1.28631591796875, + "grad_norm": 18.58875274658203, + "learning_rate": 7.662751188127591e-06, + "loss": 5.2589, + "step": 63225 + }, + { + "epoch": 1.2864176432291667, + "grad_norm": 15.740553855895996, + "learning_rate": 7.662412893268389e-06, + "loss": 4.7424, + "step": 63230 + }, + { + "epoch": 1.2865193684895833, + "grad_norm": 19.898876190185547, + "learning_rate": 7.662074581397214e-06, + "loss": 5.1957, + "step": 63235 + }, + { + "epoch": 1.28662109375, + "grad_norm": 13.011017799377441, + "learning_rate": 7.661736252516227e-06, + "loss": 4.9386, + "step": 63240 + }, + { + "epoch": 1.2867228190104167, + "grad_norm": 20.532379150390625, + "learning_rate": 7.661397906627591e-06, + "loss": 4.9662, + "step": 63245 + }, + { + "epoch": 1.2868245442708333, + "grad_norm": 18.056215286254883, + "learning_rate": 7.661059543733468e-06, + "loss": 4.8385, + "step": 63250 + }, + { + "epoch": 1.28692626953125, + "grad_norm": 16.0009765625, + "learning_rate": 7.660721163836018e-06, + "loss": 4.8376, + "step": 63255 + }, + { + "epoch": 1.2870279947916667, + "grad_norm": 19.389728546142578, + "learning_rate": 7.660382766937404e-06, + "loss": 5.0312, + "step": 63260 + }, + { + "epoch": 1.2871297200520833, + "grad_norm": 17.162796020507812, + "learning_rate": 7.66004435303979e-06, + "loss": 4.784, + "step": 63265 + }, + { + "epoch": 1.2872314453125, + "grad_norm": 18.962810516357422, + "learning_rate": 7.659705922145335e-06, + "loss": 4.8683, + "step": 63270 + }, + { + "epoch": 1.2873331705729167, + "grad_norm": 19.893831253051758, + "learning_rate": 7.659367474256205e-06, + "loss": 4.8263, + "step": 63275 + }, + { + "epoch": 1.2874348958333333, + "grad_norm": 19.55193519592285, + "learning_rate": 7.659029009374561e-06, + "loss": 5.0565, + "step": 63280 + }, + { + "epoch": 1.28753662109375, + "grad_norm": 19.505403518676758, + "learning_rate": 7.658690527502566e-06, + "loss": 4.8363, + "step": 63285 + }, + { + "epoch": 1.2876383463541667, + "grad_norm": 17.042387008666992, + "learning_rate": 7.65835202864238e-06, + "loss": 4.9133, + "step": 63290 + }, + { + "epoch": 1.2877400716145833, + "grad_norm": 17.48938751220703, + "learning_rate": 7.658013512796171e-06, + "loss": 5.0605, + "step": 63295 + }, + { + "epoch": 1.287841796875, + "grad_norm": 20.71303367614746, + "learning_rate": 7.657674979966098e-06, + "loss": 5.0271, + "step": 63300 + }, + { + "epoch": 1.2879435221354167, + "grad_norm": 14.868595123291016, + "learning_rate": 7.657336430154327e-06, + "loss": 4.8313, + "step": 63305 + }, + { + "epoch": 1.2880452473958333, + "grad_norm": 18.221065521240234, + "learning_rate": 7.656997863363019e-06, + "loss": 4.8167, + "step": 63310 + }, + { + "epoch": 1.28814697265625, + "grad_norm": 20.899229049682617, + "learning_rate": 7.656659279594338e-06, + "loss": 4.7588, + "step": 63315 + }, + { + "epoch": 1.2882486979166667, + "grad_norm": 16.23971939086914, + "learning_rate": 7.656320678850449e-06, + "loss": 5.0233, + "step": 63320 + }, + { + "epoch": 1.2883504231770833, + "grad_norm": 19.152069091796875, + "learning_rate": 7.655982061133512e-06, + "loss": 5.0421, + "step": 63325 + }, + { + "epoch": 1.2884521484375, + "grad_norm": 18.316423416137695, + "learning_rate": 7.655643426445694e-06, + "loss": 5.0712, + "step": 63330 + }, + { + "epoch": 1.2885538736979167, + "grad_norm": 18.374813079833984, + "learning_rate": 7.655304774789157e-06, + "loss": 5.1479, + "step": 63335 + }, + { + "epoch": 1.2886555989583333, + "grad_norm": 20.275678634643555, + "learning_rate": 7.654966106166065e-06, + "loss": 4.8617, + "step": 63340 + }, + { + "epoch": 1.28875732421875, + "grad_norm": 18.21112823486328, + "learning_rate": 7.654627420578584e-06, + "loss": 5.1743, + "step": 63345 + }, + { + "epoch": 1.2888590494791667, + "grad_norm": 18.307083129882812, + "learning_rate": 7.654288718028875e-06, + "loss": 4.9563, + "step": 63350 + }, + { + "epoch": 1.2889607747395833, + "grad_norm": 14.892424583435059, + "learning_rate": 7.653949998519103e-06, + "loss": 4.9513, + "step": 63355 + }, + { + "epoch": 1.2890625, + "grad_norm": 16.182382583618164, + "learning_rate": 7.653611262051436e-06, + "loss": 5.0032, + "step": 63360 + }, + { + "epoch": 1.2891642252604167, + "grad_norm": 18.856576919555664, + "learning_rate": 7.653272508628032e-06, + "loss": 5.1972, + "step": 63365 + }, + { + "epoch": 1.2892659505208333, + "grad_norm": 14.891741752624512, + "learning_rate": 7.65293373825106e-06, + "loss": 4.9454, + "step": 63370 + }, + { + "epoch": 1.28936767578125, + "grad_norm": 20.79216194152832, + "learning_rate": 7.652594950922686e-06, + "loss": 4.9443, + "step": 63375 + }, + { + "epoch": 1.2894694010416667, + "grad_norm": 18.055442810058594, + "learning_rate": 7.65225614664507e-06, + "loss": 5.0958, + "step": 63380 + }, + { + "epoch": 1.2895711263020833, + "grad_norm": 18.975950241088867, + "learning_rate": 7.651917325420379e-06, + "loss": 4.9833, + "step": 63385 + }, + { + "epoch": 1.2896728515625, + "grad_norm": 12.793468475341797, + "learning_rate": 7.651578487250778e-06, + "loss": 4.9634, + "step": 63390 + }, + { + "epoch": 1.2897745768229167, + "grad_norm": 14.323960304260254, + "learning_rate": 7.651239632138433e-06, + "loss": 4.7316, + "step": 63395 + }, + { + "epoch": 1.2898763020833333, + "grad_norm": 19.186830520629883, + "learning_rate": 7.650900760085506e-06, + "loss": 4.8313, + "step": 63400 + }, + { + "epoch": 1.28997802734375, + "grad_norm": 20.435487747192383, + "learning_rate": 7.65056187109417e-06, + "loss": 4.8183, + "step": 63405 + }, + { + "epoch": 1.2900797526041667, + "grad_norm": 14.714106559753418, + "learning_rate": 7.65022296516658e-06, + "loss": 4.796, + "step": 63410 + }, + { + "epoch": 1.2901814778645833, + "grad_norm": 20.033103942871094, + "learning_rate": 7.649884042304909e-06, + "loss": 4.9878, + "step": 63415 + }, + { + "epoch": 1.290283203125, + "grad_norm": 17.90815544128418, + "learning_rate": 7.649545102511317e-06, + "loss": 5.1796, + "step": 63420 + }, + { + "epoch": 1.2903849283854167, + "grad_norm": 23.037071228027344, + "learning_rate": 7.649206145787978e-06, + "loss": 4.9684, + "step": 63425 + }, + { + "epoch": 1.2904866536458333, + "grad_norm": 15.127262115478516, + "learning_rate": 7.648867172137048e-06, + "loss": 4.7733, + "step": 63430 + }, + { + "epoch": 1.29058837890625, + "grad_norm": 17.314178466796875, + "learning_rate": 7.648528181560699e-06, + "loss": 4.9425, + "step": 63435 + }, + { + "epoch": 1.2906901041666667, + "grad_norm": 19.686429977416992, + "learning_rate": 7.648189174061097e-06, + "loss": 4.6486, + "step": 63440 + }, + { + "epoch": 1.2907918294270833, + "grad_norm": 14.557364463806152, + "learning_rate": 7.647850149640406e-06, + "loss": 5.0135, + "step": 63445 + }, + { + "epoch": 1.2908935546875, + "grad_norm": 23.25045394897461, + "learning_rate": 7.647511108300793e-06, + "loss": 5.1558, + "step": 63450 + }, + { + "epoch": 1.2909952799479167, + "grad_norm": 18.12040138244629, + "learning_rate": 7.647172050044425e-06, + "loss": 5.0048, + "step": 63455 + }, + { + "epoch": 1.2910970052083333, + "grad_norm": 15.557125091552734, + "learning_rate": 7.646832974873469e-06, + "loss": 4.6952, + "step": 63460 + }, + { + "epoch": 1.29119873046875, + "grad_norm": 20.995033264160156, + "learning_rate": 7.646493882790087e-06, + "loss": 4.7818, + "step": 63465 + }, + { + "epoch": 1.2913004557291667, + "grad_norm": 21.069229125976562, + "learning_rate": 7.646154773796453e-06, + "loss": 4.9425, + "step": 63470 + }, + { + "epoch": 1.2914021809895833, + "grad_norm": 18.96564292907715, + "learning_rate": 7.645815647894727e-06, + "loss": 5.0118, + "step": 63475 + }, + { + "epoch": 1.29150390625, + "grad_norm": 16.68157196044922, + "learning_rate": 7.645476505087082e-06, + "loss": 4.7736, + "step": 63480 + }, + { + "epoch": 1.2916056315104167, + "grad_norm": 18.225053787231445, + "learning_rate": 7.645137345375679e-06, + "loss": 4.9647, + "step": 63485 + }, + { + "epoch": 1.2917073567708333, + "grad_norm": 15.38809585571289, + "learning_rate": 7.64479816876269e-06, + "loss": 4.9282, + "step": 63490 + }, + { + "epoch": 1.29180908203125, + "grad_norm": 17.80666160583496, + "learning_rate": 7.644458975250279e-06, + "loss": 4.844, + "step": 63495 + }, + { + "epoch": 1.2919108072916667, + "grad_norm": 18.688528060913086, + "learning_rate": 7.644119764840614e-06, + "loss": 4.8202, + "step": 63500 + }, + { + "epoch": 1.2920125325520833, + "grad_norm": 24.731189727783203, + "learning_rate": 7.643780537535865e-06, + "loss": 4.973, + "step": 63505 + }, + { + "epoch": 1.2921142578125, + "grad_norm": 15.249688148498535, + "learning_rate": 7.643441293338197e-06, + "loss": 4.8141, + "step": 63510 + }, + { + "epoch": 1.2922159830729167, + "grad_norm": 18.240827560424805, + "learning_rate": 7.643102032249777e-06, + "loss": 5.0427, + "step": 63515 + }, + { + "epoch": 1.2923177083333333, + "grad_norm": 13.062031745910645, + "learning_rate": 7.642762754272777e-06, + "loss": 5.0159, + "step": 63520 + }, + { + "epoch": 1.29241943359375, + "grad_norm": 15.610978126525879, + "learning_rate": 7.642423459409359e-06, + "loss": 4.7358, + "step": 63525 + }, + { + "epoch": 1.2925211588541667, + "grad_norm": 18.949975967407227, + "learning_rate": 7.642084147661694e-06, + "loss": 4.8813, + "step": 63530 + }, + { + "epoch": 1.2926228841145833, + "grad_norm": 13.483814239501953, + "learning_rate": 7.641744819031953e-06, + "loss": 4.8743, + "step": 63535 + }, + { + "epoch": 1.292724609375, + "grad_norm": 18.0944881439209, + "learning_rate": 7.641405473522298e-06, + "loss": 5.27, + "step": 63540 + }, + { + "epoch": 1.2928263346354167, + "grad_norm": 21.306493759155273, + "learning_rate": 7.641066111134901e-06, + "loss": 5.1066, + "step": 63545 + }, + { + "epoch": 1.2929280598958333, + "grad_norm": 14.608094215393066, + "learning_rate": 7.64072673187193e-06, + "loss": 5.0899, + "step": 63550 + }, + { + "epoch": 1.29302978515625, + "grad_norm": 15.325492858886719, + "learning_rate": 7.640387335735556e-06, + "loss": 4.7871, + "step": 63555 + }, + { + "epoch": 1.2931315104166667, + "grad_norm": 17.380388259887695, + "learning_rate": 7.640047922727941e-06, + "loss": 4.9675, + "step": 63560 + }, + { + "epoch": 1.2932332356770833, + "grad_norm": 20.592897415161133, + "learning_rate": 7.639708492851261e-06, + "loss": 5.0849, + "step": 63565 + }, + { + "epoch": 1.2933349609375, + "grad_norm": 16.011608123779297, + "learning_rate": 7.639369046107681e-06, + "loss": 4.9548, + "step": 63570 + }, + { + "epoch": 1.2934366861979167, + "grad_norm": 16.736528396606445, + "learning_rate": 7.639029582499369e-06, + "loss": 4.7031, + "step": 63575 + }, + { + "epoch": 1.2935384114583333, + "grad_norm": 17.988649368286133, + "learning_rate": 7.638690102028498e-06, + "loss": 5.0276, + "step": 63580 + }, + { + "epoch": 1.29364013671875, + "grad_norm": 17.59498405456543, + "learning_rate": 7.638350604697234e-06, + "loss": 4.9309, + "step": 63585 + }, + { + "epoch": 1.2937418619791667, + "grad_norm": 21.54682731628418, + "learning_rate": 7.638011090507746e-06, + "loss": 4.76, + "step": 63590 + }, + { + "epoch": 1.2938435872395833, + "grad_norm": 17.048810958862305, + "learning_rate": 7.637671559462206e-06, + "loss": 5.1466, + "step": 63595 + }, + { + "epoch": 1.2939453125, + "grad_norm": 17.025527954101562, + "learning_rate": 7.637332011562782e-06, + "loss": 5.1328, + "step": 63600 + }, + { + "epoch": 1.2940470377604167, + "grad_norm": 16.070690155029297, + "learning_rate": 7.636992446811645e-06, + "loss": 4.7274, + "step": 63605 + }, + { + "epoch": 1.2941487630208333, + "grad_norm": 18.208166122436523, + "learning_rate": 7.63665286521096e-06, + "loss": 4.9129, + "step": 63610 + }, + { + "epoch": 1.29425048828125, + "grad_norm": 17.75841522216797, + "learning_rate": 7.636313266762902e-06, + "loss": 5.0742, + "step": 63615 + }, + { + "epoch": 1.2943522135416667, + "grad_norm": 20.476011276245117, + "learning_rate": 7.635973651469639e-06, + "loss": 4.7026, + "step": 63620 + }, + { + "epoch": 1.2944539388020833, + "grad_norm": 19.987438201904297, + "learning_rate": 7.63563401933334e-06, + "loss": 4.8605, + "step": 63625 + }, + { + "epoch": 1.2945556640625, + "grad_norm": 13.873090744018555, + "learning_rate": 7.635294370356179e-06, + "loss": 4.9904, + "step": 63630 + }, + { + "epoch": 1.2946573893229167, + "grad_norm": 19.086820602416992, + "learning_rate": 7.634954704540321e-06, + "loss": 4.8348, + "step": 63635 + }, + { + "epoch": 1.2947591145833333, + "grad_norm": 16.471750259399414, + "learning_rate": 7.63461502188794e-06, + "loss": 4.8785, + "step": 63640 + }, + { + "epoch": 1.29486083984375, + "grad_norm": 22.55637550354004, + "learning_rate": 7.634275322401206e-06, + "loss": 4.8154, + "step": 63645 + }, + { + "epoch": 1.2949625651041667, + "grad_norm": 16.992002487182617, + "learning_rate": 7.633935606082288e-06, + "loss": 5.1291, + "step": 63650 + }, + { + "epoch": 1.2950642903645833, + "grad_norm": 20.49782371520996, + "learning_rate": 7.633595872933357e-06, + "loss": 4.8913, + "step": 63655 + }, + { + "epoch": 1.295166015625, + "grad_norm": 59.410682678222656, + "learning_rate": 7.633256122956585e-06, + "loss": 5.3282, + "step": 63660 + }, + { + "epoch": 1.2952677408854167, + "grad_norm": 13.63537311553955, + "learning_rate": 7.632916356154144e-06, + "loss": 5.0428, + "step": 63665 + }, + { + "epoch": 1.2953694661458333, + "grad_norm": 22.001237869262695, + "learning_rate": 7.6325765725282e-06, + "loss": 5.1351, + "step": 63670 + }, + { + "epoch": 1.29547119140625, + "grad_norm": 15.338910102844238, + "learning_rate": 7.63223677208093e-06, + "loss": 4.9613, + "step": 63675 + }, + { + "epoch": 1.2955729166666667, + "grad_norm": 19.85097312927246, + "learning_rate": 7.6318969548145e-06, + "loss": 5.0769, + "step": 63680 + }, + { + "epoch": 1.2956746419270833, + "grad_norm": 21.5010929107666, + "learning_rate": 7.631557120731086e-06, + "loss": 5.0039, + "step": 63685 + }, + { + "epoch": 1.2957763671875, + "grad_norm": 18.124122619628906, + "learning_rate": 7.631217269832856e-06, + "loss": 4.8223, + "step": 63690 + }, + { + "epoch": 1.2958780924479167, + "grad_norm": 17.98585319519043, + "learning_rate": 7.630877402121984e-06, + "loss": 4.8368, + "step": 63695 + }, + { + "epoch": 1.2959798177083333, + "grad_norm": 15.324594497680664, + "learning_rate": 7.63053751760064e-06, + "loss": 4.9754, + "step": 63700 + }, + { + "epoch": 1.29608154296875, + "grad_norm": 20.511634826660156, + "learning_rate": 7.630197616270998e-06, + "loss": 4.7739, + "step": 63705 + }, + { + "epoch": 1.2961832682291667, + "grad_norm": 17.831575393676758, + "learning_rate": 7.629857698135226e-06, + "loss": 4.9992, + "step": 63710 + }, + { + "epoch": 1.2962849934895833, + "grad_norm": 18.375524520874023, + "learning_rate": 7.6295177631955e-06, + "loss": 4.9399, + "step": 63715 + }, + { + "epoch": 1.29638671875, + "grad_norm": 14.039156913757324, + "learning_rate": 7.629177811453988e-06, + "loss": 4.8562, + "step": 63720 + }, + { + "epoch": 1.2964884440104167, + "grad_norm": 19.456335067749023, + "learning_rate": 7.6288378429128664e-06, + "loss": 4.7962, + "step": 63725 + }, + { + "epoch": 1.2965901692708333, + "grad_norm": 14.970884323120117, + "learning_rate": 7.628497857574304e-06, + "loss": 4.9043, + "step": 63730 + }, + { + "epoch": 1.29669189453125, + "grad_norm": 17.278779983520508, + "learning_rate": 7.628157855440474e-06, + "loss": 4.8315, + "step": 63735 + }, + { + "epoch": 1.2967936197916667, + "grad_norm": 15.225500106811523, + "learning_rate": 7.627817836513553e-06, + "loss": 4.8739, + "step": 63740 + }, + { + "epoch": 1.2968953450520833, + "grad_norm": 21.861787796020508, + "learning_rate": 7.627477800795708e-06, + "loss": 4.903, + "step": 63745 + }, + { + "epoch": 1.2969970703125, + "grad_norm": 23.075807571411133, + "learning_rate": 7.627137748289113e-06, + "loss": 5.0325, + "step": 63750 + }, + { + "epoch": 1.2970987955729167, + "grad_norm": 22.158296585083008, + "learning_rate": 7.626797678995943e-06, + "loss": 5.0355, + "step": 63755 + }, + { + "epoch": 1.2972005208333333, + "grad_norm": 15.327608108520508, + "learning_rate": 7.62645759291837e-06, + "loss": 5.2228, + "step": 63760 + }, + { + "epoch": 1.29730224609375, + "grad_norm": 18.884620666503906, + "learning_rate": 7.626117490058567e-06, + "loss": 4.9708, + "step": 63765 + }, + { + "epoch": 1.2974039713541667, + "grad_norm": 15.882715225219727, + "learning_rate": 7.625777370418705e-06, + "loss": 4.9602, + "step": 63770 + }, + { + "epoch": 1.2975056966145833, + "grad_norm": 18.67342758178711, + "learning_rate": 7.625437234000961e-06, + "loss": 4.6635, + "step": 63775 + }, + { + "epoch": 1.297607421875, + "grad_norm": 14.839578628540039, + "learning_rate": 7.625097080807507e-06, + "loss": 4.9194, + "step": 63780 + }, + { + "epoch": 1.2977091471354167, + "grad_norm": 14.698177337646484, + "learning_rate": 7.624756910840514e-06, + "loss": 4.8887, + "step": 63785 + }, + { + "epoch": 1.2978108723958333, + "grad_norm": 22.61765480041504, + "learning_rate": 7.624416724102158e-06, + "loss": 4.8996, + "step": 63790 + }, + { + "epoch": 1.29791259765625, + "grad_norm": 20.03559684753418, + "learning_rate": 7.624076520594612e-06, + "loss": 4.8722, + "step": 63795 + }, + { + "epoch": 1.2980143229166667, + "grad_norm": 21.22085189819336, + "learning_rate": 7.62373630032005e-06, + "loss": 5.008, + "step": 63800 + }, + { + "epoch": 1.2981160481770833, + "grad_norm": 17.674280166625977, + "learning_rate": 7.623396063280648e-06, + "loss": 4.9787, + "step": 63805 + }, + { + "epoch": 1.2982177734375, + "grad_norm": 31.859695434570312, + "learning_rate": 7.623055809478577e-06, + "loss": 4.9064, + "step": 63810 + }, + { + "epoch": 1.2983194986979167, + "grad_norm": 21.91958236694336, + "learning_rate": 7.62271553891601e-06, + "loss": 4.9768, + "step": 63815 + }, + { + "epoch": 1.2984212239583333, + "grad_norm": 20.030227661132812, + "learning_rate": 7.622375251595125e-06, + "loss": 4.9259, + "step": 63820 + }, + { + "epoch": 1.29852294921875, + "grad_norm": 20.640466690063477, + "learning_rate": 7.622034947518094e-06, + "loss": 4.7353, + "step": 63825 + }, + { + "epoch": 1.2986246744791667, + "grad_norm": 22.349414825439453, + "learning_rate": 7.621694626687092e-06, + "loss": 5.0945, + "step": 63830 + }, + { + "epoch": 1.2987263997395833, + "grad_norm": 21.602445602416992, + "learning_rate": 7.621354289104294e-06, + "loss": 4.9585, + "step": 63835 + }, + { + "epoch": 1.298828125, + "grad_norm": 28.927284240722656, + "learning_rate": 7.6210139347718735e-06, + "loss": 5.4351, + "step": 63840 + }, + { + "epoch": 1.2989298502604167, + "grad_norm": 21.860118865966797, + "learning_rate": 7.620673563692007e-06, + "loss": 5.0165, + "step": 63845 + }, + { + "epoch": 1.2990315755208333, + "grad_norm": 12.280867576599121, + "learning_rate": 7.620333175866868e-06, + "loss": 4.7377, + "step": 63850 + }, + { + "epoch": 1.29913330078125, + "grad_norm": 18.445070266723633, + "learning_rate": 7.6199927712986325e-06, + "loss": 4.5622, + "step": 63855 + }, + { + "epoch": 1.2992350260416667, + "grad_norm": 13.74000072479248, + "learning_rate": 7.619652349989475e-06, + "loss": 4.8491, + "step": 63860 + }, + { + "epoch": 1.2993367513020833, + "grad_norm": 16.182172775268555, + "learning_rate": 7.619311911941568e-06, + "loss": 4.6875, + "step": 63865 + }, + { + "epoch": 1.2994384765625, + "grad_norm": 18.16469955444336, + "learning_rate": 7.618971457157092e-06, + "loss": 5.0208, + "step": 63870 + }, + { + "epoch": 1.2995402018229167, + "grad_norm": 17.18588638305664, + "learning_rate": 7.618630985638219e-06, + "loss": 4.9239, + "step": 63875 + }, + { + "epoch": 1.2996419270833333, + "grad_norm": 15.057616233825684, + "learning_rate": 7.618290497387124e-06, + "loss": 4.8017, + "step": 63880 + }, + { + "epoch": 1.29974365234375, + "grad_norm": 25.619400024414062, + "learning_rate": 7.617949992405987e-06, + "loss": 5.042, + "step": 63885 + }, + { + "epoch": 1.2998453776041667, + "grad_norm": 16.8629150390625, + "learning_rate": 7.617609470696977e-06, + "loss": 4.8126, + "step": 63890 + }, + { + "epoch": 1.2999471028645833, + "grad_norm": 14.014239311218262, + "learning_rate": 7.617268932262276e-06, + "loss": 4.9529, + "step": 63895 + }, + { + "epoch": 1.300048828125, + "grad_norm": 20.74308967590332, + "learning_rate": 7.616928377104056e-06, + "loss": 4.971, + "step": 63900 + }, + { + "epoch": 1.3001505533854167, + "grad_norm": 12.944037437438965, + "learning_rate": 7.616587805224495e-06, + "loss": 5.0346, + "step": 63905 + }, + { + "epoch": 1.3002522786458333, + "grad_norm": 19.46670913696289, + "learning_rate": 7.616247216625769e-06, + "loss": 5.0035, + "step": 63910 + }, + { + "epoch": 1.30035400390625, + "grad_norm": 20.679292678833008, + "learning_rate": 7.615906611310054e-06, + "loss": 4.9918, + "step": 63915 + }, + { + "epoch": 1.3004557291666667, + "grad_norm": 19.733701705932617, + "learning_rate": 7.615565989279523e-06, + "loss": 4.9503, + "step": 63920 + }, + { + "epoch": 1.3005574544270833, + "grad_norm": 15.369771957397461, + "learning_rate": 7.615225350536359e-06, + "loss": 4.9035, + "step": 63925 + }, + { + "epoch": 1.3006591796875, + "grad_norm": 21.42504119873047, + "learning_rate": 7.614884695082734e-06, + "loss": 4.8692, + "step": 63930 + }, + { + "epoch": 1.3007609049479167, + "grad_norm": 16.477163314819336, + "learning_rate": 7.614544022920826e-06, + "loss": 4.9535, + "step": 63935 + }, + { + "epoch": 1.3008626302083333, + "grad_norm": 21.46350860595703, + "learning_rate": 7.614203334052812e-06, + "loss": 4.8568, + "step": 63940 + }, + { + "epoch": 1.30096435546875, + "grad_norm": 15.380215644836426, + "learning_rate": 7.613862628480868e-06, + "loss": 4.9901, + "step": 63945 + }, + { + "epoch": 1.3010660807291667, + "grad_norm": 16.815176010131836, + "learning_rate": 7.613521906207171e-06, + "loss": 5.0697, + "step": 63950 + }, + { + "epoch": 1.3011678059895833, + "grad_norm": 17.454750061035156, + "learning_rate": 7.6131811672339e-06, + "loss": 5.0667, + "step": 63955 + }, + { + "epoch": 1.30126953125, + "grad_norm": 24.9890193939209, + "learning_rate": 7.612840411563229e-06, + "loss": 5.015, + "step": 63960 + }, + { + "epoch": 1.3013712565104167, + "grad_norm": 17.490991592407227, + "learning_rate": 7.61249963919734e-06, + "loss": 4.879, + "step": 63965 + }, + { + "epoch": 1.3014729817708333, + "grad_norm": 20.558197021484375, + "learning_rate": 7.6121588501384045e-06, + "loss": 4.7049, + "step": 63970 + }, + { + "epoch": 1.30157470703125, + "grad_norm": 16.565046310424805, + "learning_rate": 7.6118180443886055e-06, + "loss": 4.8446, + "step": 63975 + }, + { + "epoch": 1.3016764322916667, + "grad_norm": 21.219430923461914, + "learning_rate": 7.611477221950118e-06, + "loss": 4.7164, + "step": 63980 + }, + { + "epoch": 1.3017781575520833, + "grad_norm": 13.884797096252441, + "learning_rate": 7.611136382825119e-06, + "loss": 4.911, + "step": 63985 + }, + { + "epoch": 1.3018798828125, + "grad_norm": 17.891929626464844, + "learning_rate": 7.610795527015788e-06, + "loss": 5.0148, + "step": 63990 + }, + { + "epoch": 1.3019816080729167, + "grad_norm": 13.368635177612305, + "learning_rate": 7.610454654524302e-06, + "loss": 5.1986, + "step": 63995 + }, + { + "epoch": 1.3020833333333333, + "grad_norm": 12.204655647277832, + "learning_rate": 7.61011376535284e-06, + "loss": 4.7804, + "step": 64000 + }, + { + "epoch": 1.30218505859375, + "grad_norm": 15.14490795135498, + "learning_rate": 7.60977285950358e-06, + "loss": 4.9939, + "step": 64005 + }, + { + "epoch": 1.3022867838541667, + "grad_norm": 16.68805694580078, + "learning_rate": 7.609431936978699e-06, + "loss": 4.9239, + "step": 64010 + }, + { + "epoch": 1.3023885091145833, + "grad_norm": 14.30722713470459, + "learning_rate": 7.609090997780377e-06, + "loss": 5.0313, + "step": 64015 + }, + { + "epoch": 1.302490234375, + "grad_norm": 18.486112594604492, + "learning_rate": 7.608750041910791e-06, + "loss": 4.721, + "step": 64020 + }, + { + "epoch": 1.3025919596354167, + "grad_norm": 15.072474479675293, + "learning_rate": 7.60840906937212e-06, + "loss": 5.0495, + "step": 64025 + }, + { + "epoch": 1.3026936848958333, + "grad_norm": 20.12018394470215, + "learning_rate": 7.608068080166544e-06, + "loss": 4.8141, + "step": 64030 + }, + { + "epoch": 1.30279541015625, + "grad_norm": 19.230260848999023, + "learning_rate": 7.60772707429624e-06, + "loss": 4.6751, + "step": 64035 + }, + { + "epoch": 1.3028971354166667, + "grad_norm": 19.704133987426758, + "learning_rate": 7.60738605176339e-06, + "loss": 4.8097, + "step": 64040 + }, + { + "epoch": 1.3029988606770833, + "grad_norm": 15.926237106323242, + "learning_rate": 7.607045012570169e-06, + "loss": 4.8737, + "step": 64045 + }, + { + "epoch": 1.3031005859375, + "grad_norm": 18.252391815185547, + "learning_rate": 7.606703956718757e-06, + "loss": 5.0254, + "step": 64050 + }, + { + "epoch": 1.3032023111979167, + "grad_norm": 20.6734561920166, + "learning_rate": 7.606362884211337e-06, + "loss": 5.0224, + "step": 64055 + }, + { + "epoch": 1.3033040364583333, + "grad_norm": 17.373899459838867, + "learning_rate": 7.606021795050083e-06, + "loss": 4.8873, + "step": 64060 + }, + { + "epoch": 1.30340576171875, + "grad_norm": 15.804901123046875, + "learning_rate": 7.605680689237178e-06, + "loss": 4.9978, + "step": 64065 + }, + { + "epoch": 1.3035074869791667, + "grad_norm": 19.183164596557617, + "learning_rate": 7.605339566774801e-06, + "loss": 4.8162, + "step": 64070 + }, + { + "epoch": 1.3036092122395833, + "grad_norm": 18.553958892822266, + "learning_rate": 7.6049984276651304e-06, + "loss": 5.001, + "step": 64075 + }, + { + "epoch": 1.3037109375, + "grad_norm": 19.431884765625, + "learning_rate": 7.604657271910348e-06, + "loss": 5.0143, + "step": 64080 + }, + { + "epoch": 1.3038126627604167, + "grad_norm": 16.53635025024414, + "learning_rate": 7.60431609951263e-06, + "loss": 4.9414, + "step": 64085 + }, + { + "epoch": 1.3039143880208333, + "grad_norm": 15.338953971862793, + "learning_rate": 7.60397491047416e-06, + "loss": 4.8177, + "step": 64090 + }, + { + "epoch": 1.30401611328125, + "grad_norm": 21.149259567260742, + "learning_rate": 7.603633704797118e-06, + "loss": 4.7363, + "step": 64095 + }, + { + "epoch": 1.3041178385416667, + "grad_norm": 21.99663734436035, + "learning_rate": 7.603292482483681e-06, + "loss": 4.8829, + "step": 64100 + }, + { + "epoch": 1.3042195638020833, + "grad_norm": 26.284194946289062, + "learning_rate": 7.602951243536034e-06, + "loss": 5.2264, + "step": 64105 + }, + { + "epoch": 1.3043212890625, + "grad_norm": 17.39214324951172, + "learning_rate": 7.602609987956353e-06, + "loss": 5.1067, + "step": 64110 + }, + { + "epoch": 1.3044230143229167, + "grad_norm": 18.357751846313477, + "learning_rate": 7.6022687157468215e-06, + "loss": 4.9492, + "step": 64115 + }, + { + "epoch": 1.3045247395833333, + "grad_norm": 18.277973175048828, + "learning_rate": 7.601927426909619e-06, + "loss": 4.9397, + "step": 64120 + }, + { + "epoch": 1.30462646484375, + "grad_norm": 17.64722442626953, + "learning_rate": 7.601586121446925e-06, + "loss": 5.0208, + "step": 64125 + }, + { + "epoch": 1.3047281901041667, + "grad_norm": 17.796489715576172, + "learning_rate": 7.601244799360923e-06, + "loss": 4.9838, + "step": 64130 + }, + { + "epoch": 1.3048299153645833, + "grad_norm": 18.114017486572266, + "learning_rate": 7.600903460653792e-06, + "loss": 4.7728, + "step": 64135 + }, + { + "epoch": 1.304931640625, + "grad_norm": 14.621331214904785, + "learning_rate": 7.6005621053277134e-06, + "loss": 4.8498, + "step": 64140 + }, + { + "epoch": 1.3050333658854167, + "grad_norm": 19.574039459228516, + "learning_rate": 7.600220733384869e-06, + "loss": 4.7876, + "step": 64145 + }, + { + "epoch": 1.3051350911458333, + "grad_norm": 15.79472541809082, + "learning_rate": 7.599879344827438e-06, + "loss": 4.9017, + "step": 64150 + }, + { + "epoch": 1.30523681640625, + "grad_norm": 18.597747802734375, + "learning_rate": 7.5995379396576045e-06, + "loss": 5.0955, + "step": 64155 + }, + { + "epoch": 1.3053385416666667, + "grad_norm": 13.646483421325684, + "learning_rate": 7.599196517877548e-06, + "loss": 4.7216, + "step": 64160 + }, + { + "epoch": 1.3054402669270833, + "grad_norm": 20.63340950012207, + "learning_rate": 7.598855079489451e-06, + "loss": 4.7872, + "step": 64165 + }, + { + "epoch": 1.3055419921875, + "grad_norm": 19.74671173095703, + "learning_rate": 7.598513624495496e-06, + "loss": 5.0653, + "step": 64170 + }, + { + "epoch": 1.3056437174479167, + "grad_norm": 23.661020278930664, + "learning_rate": 7.598172152897863e-06, + "loss": 4.7387, + "step": 64175 + }, + { + "epoch": 1.3057454427083333, + "grad_norm": 18.901445388793945, + "learning_rate": 7.597830664698735e-06, + "loss": 4.7023, + "step": 64180 + }, + { + "epoch": 1.30584716796875, + "grad_norm": 19.249088287353516, + "learning_rate": 7.597489159900294e-06, + "loss": 4.852, + "step": 64185 + }, + { + "epoch": 1.3059488932291667, + "grad_norm": 18.868602752685547, + "learning_rate": 7.5971476385047205e-06, + "loss": 4.8577, + "step": 64190 + }, + { + "epoch": 1.3060506184895833, + "grad_norm": 18.44291114807129, + "learning_rate": 7.5968061005142e-06, + "loss": 5.0692, + "step": 64195 + }, + { + "epoch": 1.30615234375, + "grad_norm": 23.106576919555664, + "learning_rate": 7.5964645459309126e-06, + "loss": 5.0884, + "step": 64200 + }, + { + "epoch": 1.3062540690104167, + "grad_norm": 17.251779556274414, + "learning_rate": 7.596122974757039e-06, + "loss": 5.0185, + "step": 64205 + }, + { + "epoch": 1.3063557942708333, + "grad_norm": 16.714136123657227, + "learning_rate": 7.595781386994765e-06, + "loss": 4.7063, + "step": 64210 + }, + { + "epoch": 1.30645751953125, + "grad_norm": 20.070844650268555, + "learning_rate": 7.595439782646272e-06, + "loss": 4.6798, + "step": 64215 + }, + { + "epoch": 1.3065592447916667, + "grad_norm": 12.119759559631348, + "learning_rate": 7.595098161713743e-06, + "loss": 5.2135, + "step": 64220 + }, + { + "epoch": 1.3066609700520833, + "grad_norm": 18.807191848754883, + "learning_rate": 7.594756524199359e-06, + "loss": 4.8409, + "step": 64225 + }, + { + "epoch": 1.3067626953125, + "grad_norm": 20.530256271362305, + "learning_rate": 7.5944148701053044e-06, + "loss": 4.8742, + "step": 64230 + }, + { + "epoch": 1.3068644205729167, + "grad_norm": 13.79006576538086, + "learning_rate": 7.5940731994337644e-06, + "loss": 4.912, + "step": 64235 + }, + { + "epoch": 1.3069661458333333, + "grad_norm": 17.061147689819336, + "learning_rate": 7.59373151218692e-06, + "loss": 5.2628, + "step": 64240 + }, + { + "epoch": 1.30706787109375, + "grad_norm": 18.30512046813965, + "learning_rate": 7.593389808366954e-06, + "loss": 4.7821, + "step": 64245 + }, + { + "epoch": 1.3071695963541667, + "grad_norm": 20.31515884399414, + "learning_rate": 7.593048087976049e-06, + "loss": 5.3727, + "step": 64250 + }, + { + "epoch": 1.3072713216145833, + "grad_norm": 23.879623413085938, + "learning_rate": 7.592706351016391e-06, + "loss": 4.8605, + "step": 64255 + }, + { + "epoch": 1.307373046875, + "grad_norm": 42.72747039794922, + "learning_rate": 7.5923645974901624e-06, + "loss": 5.058, + "step": 64260 + }, + { + "epoch": 1.3074747721354167, + "grad_norm": 17.04574966430664, + "learning_rate": 7.592022827399547e-06, + "loss": 4.8202, + "step": 64265 + }, + { + "epoch": 1.3075764973958333, + "grad_norm": 15.991150856018066, + "learning_rate": 7.59168104074673e-06, + "loss": 4.724, + "step": 64270 + }, + { + "epoch": 1.30767822265625, + "grad_norm": 21.319446563720703, + "learning_rate": 7.591339237533892e-06, + "loss": 4.9629, + "step": 64275 + }, + { + "epoch": 1.3077799479166667, + "grad_norm": 16.664871215820312, + "learning_rate": 7.590997417763219e-06, + "loss": 4.7433, + "step": 64280 + }, + { + "epoch": 1.3078816731770833, + "grad_norm": 16.72453498840332, + "learning_rate": 7.590655581436897e-06, + "loss": 5.104, + "step": 64285 + }, + { + "epoch": 1.3079833984375, + "grad_norm": 13.366039276123047, + "learning_rate": 7.590313728557106e-06, + "loss": 5.0947, + "step": 64290 + }, + { + "epoch": 1.3080851236979167, + "grad_norm": 17.942598342895508, + "learning_rate": 7.589971859126033e-06, + "loss": 4.9981, + "step": 64295 + }, + { + "epoch": 1.3081868489583333, + "grad_norm": 19.102136611938477, + "learning_rate": 7.589629973145862e-06, + "loss": 4.8664, + "step": 64300 + }, + { + "epoch": 1.30828857421875, + "grad_norm": 16.837947845458984, + "learning_rate": 7.589288070618777e-06, + "loss": 4.9947, + "step": 64305 + }, + { + "epoch": 1.3083902994791667, + "grad_norm": 12.759050369262695, + "learning_rate": 7.588946151546965e-06, + "loss": 5.008, + "step": 64310 + }, + { + "epoch": 1.3084920247395833, + "grad_norm": 23.2030086517334, + "learning_rate": 7.588604215932608e-06, + "loss": 4.9765, + "step": 64315 + }, + { + "epoch": 1.30859375, + "grad_norm": 16.062711715698242, + "learning_rate": 7.58826226377789e-06, + "loss": 4.6442, + "step": 64320 + }, + { + "epoch": 1.3086954752604167, + "grad_norm": 15.818807601928711, + "learning_rate": 7.587920295085001e-06, + "loss": 5.1025, + "step": 64325 + }, + { + "epoch": 1.3087972005208333, + "grad_norm": 19.44411277770996, + "learning_rate": 7.587578309856121e-06, + "loss": 4.8877, + "step": 64330 + }, + { + "epoch": 1.30889892578125, + "grad_norm": 19.85701560974121, + "learning_rate": 7.587236308093438e-06, + "loss": 4.7862, + "step": 64335 + }, + { + "epoch": 1.3090006510416667, + "grad_norm": 17.413166046142578, + "learning_rate": 7.586894289799135e-06, + "loss": 4.816, + "step": 64340 + }, + { + "epoch": 1.3091023763020833, + "grad_norm": 19.394506454467773, + "learning_rate": 7.586552254975399e-06, + "loss": 4.8013, + "step": 64345 + }, + { + "epoch": 1.3092041015625, + "grad_norm": 19.89971351623535, + "learning_rate": 7.586210203624416e-06, + "loss": 4.9327, + "step": 64350 + }, + { + "epoch": 1.3093058268229167, + "grad_norm": 18.978500366210938, + "learning_rate": 7.585868135748369e-06, + "loss": 4.9167, + "step": 64355 + }, + { + "epoch": 1.3094075520833333, + "grad_norm": 22.672155380249023, + "learning_rate": 7.585526051349447e-06, + "loss": 4.971, + "step": 64360 + }, + { + "epoch": 1.30950927734375, + "grad_norm": 14.339287757873535, + "learning_rate": 7.585183950429833e-06, + "loss": 4.6397, + "step": 64365 + }, + { + "epoch": 1.3096110026041667, + "grad_norm": 15.190136909484863, + "learning_rate": 7.584841832991714e-06, + "loss": 5.1249, + "step": 64370 + }, + { + "epoch": 1.3097127278645833, + "grad_norm": 18.513429641723633, + "learning_rate": 7.584499699037278e-06, + "loss": 4.9327, + "step": 64375 + }, + { + "epoch": 1.309814453125, + "grad_norm": 13.153135299682617, + "learning_rate": 7.584157548568709e-06, + "loss": 5.1551, + "step": 64380 + }, + { + "epoch": 1.3099161783854167, + "grad_norm": 13.734125137329102, + "learning_rate": 7.583815381588193e-06, + "loss": 4.8349, + "step": 64385 + }, + { + "epoch": 1.3100179036458333, + "grad_norm": 19.145044326782227, + "learning_rate": 7.583473198097917e-06, + "loss": 5.2701, + "step": 64390 + }, + { + "epoch": 1.31011962890625, + "grad_norm": 16.350841522216797, + "learning_rate": 7.583130998100068e-06, + "loss": 4.9071, + "step": 64395 + }, + { + "epoch": 1.3102213541666667, + "grad_norm": 17.9826602935791, + "learning_rate": 7.582788781596831e-06, + "loss": 4.8138, + "step": 64400 + }, + { + "epoch": 1.3103230794270833, + "grad_norm": 18.652822494506836, + "learning_rate": 7.582446548590394e-06, + "loss": 4.6282, + "step": 64405 + }, + { + "epoch": 1.3104248046875, + "grad_norm": 20.937358856201172, + "learning_rate": 7.582104299082943e-06, + "loss": 4.7308, + "step": 64410 + }, + { + "epoch": 1.3105265299479167, + "grad_norm": 14.975748062133789, + "learning_rate": 7.5817620330766664e-06, + "loss": 5.1376, + "step": 64415 + }, + { + "epoch": 1.3106282552083333, + "grad_norm": 22.849346160888672, + "learning_rate": 7.581419750573749e-06, + "loss": 4.749, + "step": 64420 + }, + { + "epoch": 1.31072998046875, + "grad_norm": 20.109127044677734, + "learning_rate": 7.581077451576379e-06, + "loss": 5.0391, + "step": 64425 + }, + { + "epoch": 1.3108317057291667, + "grad_norm": 18.463455200195312, + "learning_rate": 7.5807351360867435e-06, + "loss": 5.0854, + "step": 64430 + }, + { + "epoch": 1.3109334309895833, + "grad_norm": 19.750322341918945, + "learning_rate": 7.5803928041070286e-06, + "loss": 4.9378, + "step": 64435 + }, + { + "epoch": 1.31103515625, + "grad_norm": 21.4440975189209, + "learning_rate": 7.580050455639425e-06, + "loss": 4.9983, + "step": 64440 + }, + { + "epoch": 1.3111368815104167, + "grad_norm": 22.945919036865234, + "learning_rate": 7.579708090686118e-06, + "loss": 4.9826, + "step": 64445 + }, + { + "epoch": 1.3112386067708333, + "grad_norm": 16.771738052368164, + "learning_rate": 7.579365709249293e-06, + "loss": 4.8668, + "step": 64450 + }, + { + "epoch": 1.31134033203125, + "grad_norm": 24.51841163635254, + "learning_rate": 7.579023311331141e-06, + "loss": 4.8371, + "step": 64455 + }, + { + "epoch": 1.3114420572916667, + "grad_norm": 16.257381439208984, + "learning_rate": 7.578680896933848e-06, + "loss": 4.935, + "step": 64460 + }, + { + "epoch": 1.3115437825520833, + "grad_norm": 16.5434627532959, + "learning_rate": 7.5783384660596035e-06, + "loss": 5.0356, + "step": 64465 + }, + { + "epoch": 1.3116455078125, + "grad_norm": 22.41364860534668, + "learning_rate": 7.577996018710595e-06, + "loss": 4.7127, + "step": 64470 + }, + { + "epoch": 1.3117472330729167, + "grad_norm": 24.583663940429688, + "learning_rate": 7.577653554889009e-06, + "loss": 4.9955, + "step": 64475 + }, + { + "epoch": 1.3118489583333333, + "grad_norm": 12.196186065673828, + "learning_rate": 7.577311074597036e-06, + "loss": 4.8533, + "step": 64480 + }, + { + "epoch": 1.31195068359375, + "grad_norm": 17.976839065551758, + "learning_rate": 7.576968577836863e-06, + "loss": 4.8189, + "step": 64485 + }, + { + "epoch": 1.3120524088541667, + "grad_norm": 24.57912254333496, + "learning_rate": 7.5766260646106795e-06, + "loss": 4.9741, + "step": 64490 + }, + { + "epoch": 1.3121541341145833, + "grad_norm": 14.89223861694336, + "learning_rate": 7.576283534920672e-06, + "loss": 5.2024, + "step": 64495 + }, + { + "epoch": 1.312255859375, + "grad_norm": 17.34563446044922, + "learning_rate": 7.575940988769033e-06, + "loss": 4.8596, + "step": 64500 + }, + { + "epoch": 1.3123575846354167, + "grad_norm": 17.349336624145508, + "learning_rate": 7.575598426157946e-06, + "loss": 4.7296, + "step": 64505 + }, + { + "epoch": 1.3124593098958333, + "grad_norm": 16.871295928955078, + "learning_rate": 7.575255847089604e-06, + "loss": 5.1553, + "step": 64510 + }, + { + "epoch": 1.31256103515625, + "grad_norm": 14.8646240234375, + "learning_rate": 7.574913251566194e-06, + "loss": 4.5994, + "step": 64515 + }, + { + "epoch": 1.3126627604166667, + "grad_norm": 20.480552673339844, + "learning_rate": 7.574570639589907e-06, + "loss": 4.5684, + "step": 64520 + }, + { + "epoch": 1.3127644856770833, + "grad_norm": 18.63302993774414, + "learning_rate": 7.57422801116293e-06, + "loss": 5.0883, + "step": 64525 + }, + { + "epoch": 1.3128662109375, + "grad_norm": 16.02709197998047, + "learning_rate": 7.5738853662874534e-06, + "loss": 4.9818, + "step": 64530 + }, + { + "epoch": 1.3129679361979167, + "grad_norm": 17.384464263916016, + "learning_rate": 7.573542704965666e-06, + "loss": 4.8657, + "step": 64535 + }, + { + "epoch": 1.3130696614583333, + "grad_norm": 14.166932106018066, + "learning_rate": 7.573200027199758e-06, + "loss": 4.823, + "step": 64540 + }, + { + "epoch": 1.31317138671875, + "grad_norm": 15.830931663513184, + "learning_rate": 7.5728573329919185e-06, + "loss": 4.9758, + "step": 64545 + }, + { + "epoch": 1.3132731119791667, + "grad_norm": 16.410696029663086, + "learning_rate": 7.572514622344338e-06, + "loss": 4.9486, + "step": 64550 + }, + { + "epoch": 1.3133748372395833, + "grad_norm": 18.31444549560547, + "learning_rate": 7.572171895259206e-06, + "loss": 5.0021, + "step": 64555 + }, + { + "epoch": 1.3134765625, + "grad_norm": 17.093515396118164, + "learning_rate": 7.571829151738711e-06, + "loss": 4.8685, + "step": 64560 + }, + { + "epoch": 1.3135782877604167, + "grad_norm": 16.363576889038086, + "learning_rate": 7.571486391785043e-06, + "loss": 4.8545, + "step": 64565 + }, + { + "epoch": 1.3136800130208333, + "grad_norm": 19.21088218688965, + "learning_rate": 7.571143615400396e-06, + "loss": 4.96, + "step": 64570 + }, + { + "epoch": 1.31378173828125, + "grad_norm": 13.428400993347168, + "learning_rate": 7.570800822586957e-06, + "loss": 4.6882, + "step": 64575 + }, + { + "epoch": 1.3138834635416667, + "grad_norm": 16.394798278808594, + "learning_rate": 7.570458013346915e-06, + "loss": 4.7941, + "step": 64580 + }, + { + "epoch": 1.3139851888020833, + "grad_norm": 20.328819274902344, + "learning_rate": 7.5701151876824645e-06, + "loss": 5.0642, + "step": 64585 + }, + { + "epoch": 1.3140869140625, + "grad_norm": 20.482995986938477, + "learning_rate": 7.569772345595792e-06, + "loss": 4.8037, + "step": 64590 + }, + { + "epoch": 1.3141886393229167, + "grad_norm": 19.302404403686523, + "learning_rate": 7.569429487089091e-06, + "loss": 4.8766, + "step": 64595 + }, + { + "epoch": 1.3142903645833333, + "grad_norm": 19.675865173339844, + "learning_rate": 7.569086612164552e-06, + "loss": 4.5692, + "step": 64600 + }, + { + "epoch": 1.31439208984375, + "grad_norm": 20.39691925048828, + "learning_rate": 7.5687437208243615e-06, + "loss": 4.532, + "step": 64605 + }, + { + "epoch": 1.3144938151041667, + "grad_norm": 17.365921020507812, + "learning_rate": 7.568400813070718e-06, + "loss": 5.0826, + "step": 64610 + }, + { + "epoch": 1.3145955403645833, + "grad_norm": 22.541955947875977, + "learning_rate": 7.568057888905806e-06, + "loss": 5.0479, + "step": 64615 + }, + { + "epoch": 1.314697265625, + "grad_norm": 24.387714385986328, + "learning_rate": 7.567714948331821e-06, + "loss": 5.1927, + "step": 64620 + }, + { + "epoch": 1.3147989908854167, + "grad_norm": 21.1790828704834, + "learning_rate": 7.5673719913509515e-06, + "loss": 5.1852, + "step": 64625 + }, + { + "epoch": 1.3149007161458333, + "grad_norm": 16.026426315307617, + "learning_rate": 7.567029017965391e-06, + "loss": 5.071, + "step": 64630 + }, + { + "epoch": 1.31500244140625, + "grad_norm": 17.661354064941406, + "learning_rate": 7.566686028177328e-06, + "loss": 4.7039, + "step": 64635 + }, + { + "epoch": 1.3151041666666667, + "grad_norm": 19.20322036743164, + "learning_rate": 7.5663430219889575e-06, + "loss": 4.963, + "step": 64640 + }, + { + "epoch": 1.3152058919270833, + "grad_norm": 14.429190635681152, + "learning_rate": 7.565999999402469e-06, + "loss": 4.8481, + "step": 64645 + }, + { + "epoch": 1.3153076171875, + "grad_norm": 21.515188217163086, + "learning_rate": 7.565656960420057e-06, + "loss": 5.4472, + "step": 64650 + }, + { + "epoch": 1.3154093424479167, + "grad_norm": 22.01344871520996, + "learning_rate": 7.5653139050439095e-06, + "loss": 4.6553, + "step": 64655 + }, + { + "epoch": 1.3155110677083333, + "grad_norm": 17.618093490600586, + "learning_rate": 7.564970833276222e-06, + "loss": 5.1761, + "step": 64660 + }, + { + "epoch": 1.31561279296875, + "grad_norm": 19.855224609375, + "learning_rate": 7.564627745119184e-06, + "loss": 5.0381, + "step": 64665 + }, + { + "epoch": 1.3157145182291667, + "grad_norm": 15.418363571166992, + "learning_rate": 7.564284640574988e-06, + "loss": 4.7709, + "step": 64670 + }, + { + "epoch": 1.3158162434895833, + "grad_norm": 14.715957641601562, + "learning_rate": 7.56394151964583e-06, + "loss": 4.8728, + "step": 64675 + }, + { + "epoch": 1.31591796875, + "grad_norm": 17.871789932250977, + "learning_rate": 7.563598382333896e-06, + "loss": 5.0945, + "step": 64680 + }, + { + "epoch": 1.3160196940104167, + "grad_norm": 14.420511245727539, + "learning_rate": 7.563255228641385e-06, + "loss": 4.7908, + "step": 64685 + }, + { + "epoch": 1.3161214192708333, + "grad_norm": 16.685317993164062, + "learning_rate": 7.562912058570486e-06, + "loss": 4.8484, + "step": 64690 + }, + { + "epoch": 1.31622314453125, + "grad_norm": 12.867263793945312, + "learning_rate": 7.562568872123392e-06, + "loss": 4.9354, + "step": 64695 + }, + { + "epoch": 1.3163248697916667, + "grad_norm": 20.196332931518555, + "learning_rate": 7.562225669302297e-06, + "loss": 4.8175, + "step": 64700 + }, + { + "epoch": 1.3164265950520833, + "grad_norm": 20.171022415161133, + "learning_rate": 7.5618824501093935e-06, + "loss": 5.1667, + "step": 64705 + }, + { + "epoch": 1.3165283203125, + "grad_norm": 13.498929023742676, + "learning_rate": 7.561539214546874e-06, + "loss": 4.9726, + "step": 64710 + }, + { + "epoch": 1.3166300455729167, + "grad_norm": 15.889510154724121, + "learning_rate": 7.5611959626169315e-06, + "loss": 5.115, + "step": 64715 + }, + { + "epoch": 1.3167317708333333, + "grad_norm": 20.2499942779541, + "learning_rate": 7.560852694321761e-06, + "loss": 4.8187, + "step": 64720 + }, + { + "epoch": 1.31683349609375, + "grad_norm": 19.55009651184082, + "learning_rate": 7.560509409663554e-06, + "loss": 4.9496, + "step": 64725 + }, + { + "epoch": 1.3169352213541667, + "grad_norm": 15.313591957092285, + "learning_rate": 7.560166108644504e-06, + "loss": 4.8575, + "step": 64730 + }, + { + "epoch": 1.3170369466145833, + "grad_norm": 14.562724113464355, + "learning_rate": 7.559822791266806e-06, + "loss": 4.7673, + "step": 64735 + }, + { + "epoch": 1.317138671875, + "grad_norm": 19.550548553466797, + "learning_rate": 7.559479457532654e-06, + "loss": 4.8403, + "step": 64740 + }, + { + "epoch": 1.3172403971354167, + "grad_norm": 14.063047409057617, + "learning_rate": 7.559136107444238e-06, + "loss": 5.2233, + "step": 64745 + }, + { + "epoch": 1.3173421223958333, + "grad_norm": 18.534290313720703, + "learning_rate": 7.558792741003758e-06, + "loss": 4.9735, + "step": 64750 + }, + { + "epoch": 1.31744384765625, + "grad_norm": 19.139822006225586, + "learning_rate": 7.558449358213402e-06, + "loss": 4.6979, + "step": 64755 + }, + { + "epoch": 1.3175455729166667, + "grad_norm": 13.624576568603516, + "learning_rate": 7.558105959075368e-06, + "loss": 5.008, + "step": 64760 + }, + { + "epoch": 1.3176472981770833, + "grad_norm": 19.604015350341797, + "learning_rate": 7.557762543591849e-06, + "loss": 4.665, + "step": 64765 + }, + { + "epoch": 1.3177490234375, + "grad_norm": 25.038938522338867, + "learning_rate": 7.557419111765039e-06, + "loss": 5.0702, + "step": 64770 + }, + { + "epoch": 1.3178507486979167, + "grad_norm": 15.552614212036133, + "learning_rate": 7.557075663597132e-06, + "loss": 5.3898, + "step": 64775 + }, + { + "epoch": 1.3179524739583333, + "grad_norm": 23.252031326293945, + "learning_rate": 7.556732199090324e-06, + "loss": 4.9528, + "step": 64780 + }, + { + "epoch": 1.31805419921875, + "grad_norm": 17.352245330810547, + "learning_rate": 7.556388718246809e-06, + "loss": 5.0844, + "step": 64785 + }, + { + "epoch": 1.3181559244791667, + "grad_norm": 14.863828659057617, + "learning_rate": 7.556045221068781e-06, + "loss": 4.9376, + "step": 64790 + }, + { + "epoch": 1.3182576497395833, + "grad_norm": 17.556129455566406, + "learning_rate": 7.555701707558436e-06, + "loss": 5.0771, + "step": 64795 + }, + { + "epoch": 1.318359375, + "grad_norm": 16.83469581604004, + "learning_rate": 7.555358177717969e-06, + "loss": 4.8309, + "step": 64800 + }, + { + "epoch": 1.3184611002604167, + "grad_norm": 17.576452255249023, + "learning_rate": 7.5550146315495735e-06, + "loss": 4.8619, + "step": 64805 + }, + { + "epoch": 1.3185628255208333, + "grad_norm": 17.792011260986328, + "learning_rate": 7.554671069055446e-06, + "loss": 4.9467, + "step": 64810 + }, + { + "epoch": 1.31866455078125, + "grad_norm": 15.473549842834473, + "learning_rate": 7.55432749023778e-06, + "loss": 4.9851, + "step": 64815 + }, + { + "epoch": 1.3187662760416667, + "grad_norm": 15.03830623626709, + "learning_rate": 7.553983895098774e-06, + "loss": 4.886, + "step": 64820 + }, + { + "epoch": 1.3188680013020833, + "grad_norm": 21.111116409301758, + "learning_rate": 7.553640283640621e-06, + "loss": 4.8424, + "step": 64825 + }, + { + "epoch": 1.3189697265625, + "grad_norm": 18.796783447265625, + "learning_rate": 7.553296655865517e-06, + "loss": 5.085, + "step": 64830 + }, + { + "epoch": 1.3190714518229167, + "grad_norm": 16.64827537536621, + "learning_rate": 7.552953011775658e-06, + "loss": 4.9155, + "step": 64835 + }, + { + "epoch": 1.3191731770833333, + "grad_norm": 18.142574310302734, + "learning_rate": 7.55260935137324e-06, + "loss": 4.9522, + "step": 64840 + }, + { + "epoch": 1.31927490234375, + "grad_norm": 17.096054077148438, + "learning_rate": 7.552265674660458e-06, + "loss": 4.9944, + "step": 64845 + }, + { + "epoch": 1.3193766276041667, + "grad_norm": 17.18451499938965, + "learning_rate": 7.5519219816395095e-06, + "loss": 4.7644, + "step": 64850 + }, + { + "epoch": 1.3194783528645833, + "grad_norm": 20.80530548095703, + "learning_rate": 7.551578272312589e-06, + "loss": 5.0529, + "step": 64855 + }, + { + "epoch": 1.319580078125, + "grad_norm": 20.175832748413086, + "learning_rate": 7.551234546681894e-06, + "loss": 5.2285, + "step": 64860 + }, + { + "epoch": 1.3196818033854167, + "grad_norm": 15.345505714416504, + "learning_rate": 7.5508908047496175e-06, + "loss": 4.9906, + "step": 64865 + }, + { + "epoch": 1.3197835286458333, + "grad_norm": 19.15015983581543, + "learning_rate": 7.550547046517962e-06, + "loss": 5.0206, + "step": 64870 + }, + { + "epoch": 1.31988525390625, + "grad_norm": 19.368452072143555, + "learning_rate": 7.550203271989118e-06, + "loss": 5.135, + "step": 64875 + }, + { + "epoch": 1.3199869791666667, + "grad_norm": 13.140389442443848, + "learning_rate": 7.549859481165286e-06, + "loss": 4.9878, + "step": 64880 + }, + { + "epoch": 1.3200887044270833, + "grad_norm": 15.621871948242188, + "learning_rate": 7.54951567404866e-06, + "loss": 5.0376, + "step": 64885 + }, + { + "epoch": 1.3201904296875, + "grad_norm": 20.81356430053711, + "learning_rate": 7.549171850641439e-06, + "loss": 4.8171, + "step": 64890 + }, + { + "epoch": 1.3202921549479167, + "grad_norm": 22.187044143676758, + "learning_rate": 7.548828010945819e-06, + "loss": 4.8766, + "step": 64895 + }, + { + "epoch": 1.3203938802083333, + "grad_norm": 19.64812660217285, + "learning_rate": 7.548484154963998e-06, + "loss": 4.8904, + "step": 64900 + }, + { + "epoch": 1.32049560546875, + "grad_norm": 15.179551124572754, + "learning_rate": 7.548140282698171e-06, + "loss": 4.8595, + "step": 64905 + }, + { + "epoch": 1.3205973307291667, + "grad_norm": 13.102404594421387, + "learning_rate": 7.5477963941505375e-06, + "loss": 4.9536, + "step": 64910 + }, + { + "epoch": 1.3206990559895833, + "grad_norm": 18.252107620239258, + "learning_rate": 7.547452489323292e-06, + "loss": 4.9597, + "step": 64915 + }, + { + "epoch": 1.32080078125, + "grad_norm": 14.846945762634277, + "learning_rate": 7.547108568218635e-06, + "loss": 5.0306, + "step": 64920 + }, + { + "epoch": 1.3209025065104167, + "grad_norm": 18.138444900512695, + "learning_rate": 7.546764630838764e-06, + "loss": 4.8079, + "step": 64925 + }, + { + "epoch": 1.3210042317708333, + "grad_norm": 20.32619285583496, + "learning_rate": 7.5464206771858736e-06, + "loss": 4.9599, + "step": 64930 + }, + { + "epoch": 1.32110595703125, + "grad_norm": 20.72696876525879, + "learning_rate": 7.546076707262164e-06, + "loss": 4.9159, + "step": 64935 + }, + { + "epoch": 1.3212076822916667, + "grad_norm": 17.49576759338379, + "learning_rate": 7.545732721069834e-06, + "loss": 4.7814, + "step": 64940 + }, + { + "epoch": 1.3213094075520833, + "grad_norm": 21.88996696472168, + "learning_rate": 7.545388718611077e-06, + "loss": 5.1539, + "step": 64945 + }, + { + "epoch": 1.3214111328125, + "grad_norm": 14.549007415771484, + "learning_rate": 7.545044699888097e-06, + "loss": 4.9013, + "step": 64950 + }, + { + "epoch": 1.3215128580729167, + "grad_norm": 17.407276153564453, + "learning_rate": 7.5447006649030886e-06, + "loss": 5.0057, + "step": 64955 + }, + { + "epoch": 1.3216145833333333, + "grad_norm": 14.260449409484863, + "learning_rate": 7.54435661365825e-06, + "loss": 4.8636, + "step": 64960 + }, + { + "epoch": 1.32171630859375, + "grad_norm": 17.678897857666016, + "learning_rate": 7.54401254615578e-06, + "loss": 5.0334, + "step": 64965 + }, + { + "epoch": 1.3218180338541667, + "grad_norm": 23.008947372436523, + "learning_rate": 7.543668462397878e-06, + "loss": 5.0253, + "step": 64970 + }, + { + "epoch": 1.3219197591145833, + "grad_norm": 14.850589752197266, + "learning_rate": 7.543324362386743e-06, + "loss": 5.009, + "step": 64975 + }, + { + "epoch": 1.322021484375, + "grad_norm": 17.20258331298828, + "learning_rate": 7.542980246124571e-06, + "loss": 4.8635, + "step": 64980 + }, + { + "epoch": 1.3221232096354167, + "grad_norm": 20.426963806152344, + "learning_rate": 7.542636113613563e-06, + "loss": 4.9594, + "step": 64985 + }, + { + "epoch": 1.3222249348958333, + "grad_norm": 23.398225784301758, + "learning_rate": 7.5422919648559165e-06, + "loss": 4.7421, + "step": 64990 + }, + { + "epoch": 1.32232666015625, + "grad_norm": 14.71312141418457, + "learning_rate": 7.541947799853833e-06, + "loss": 4.9431, + "step": 64995 + }, + { + "epoch": 1.3224283854166667, + "grad_norm": 17.329349517822266, + "learning_rate": 7.54160361860951e-06, + "loss": 5.1314, + "step": 65000 + }, + { + "epoch": 1.3225301106770833, + "grad_norm": 16.58082389831543, + "learning_rate": 7.541259421125146e-06, + "loss": 4.9391, + "step": 65005 + }, + { + "epoch": 1.3226318359375, + "grad_norm": 21.361026763916016, + "learning_rate": 7.540915207402941e-06, + "loss": 4.9906, + "step": 65010 + }, + { + "epoch": 1.3227335611979167, + "grad_norm": 21.601741790771484, + "learning_rate": 7.540570977445094e-06, + "loss": 4.77, + "step": 65015 + }, + { + "epoch": 1.3228352864583333, + "grad_norm": 20.268787384033203, + "learning_rate": 7.540226731253806e-06, + "loss": 4.9903, + "step": 65020 + }, + { + "epoch": 1.32293701171875, + "grad_norm": 18.014806747436523, + "learning_rate": 7.539882468831275e-06, + "loss": 4.9146, + "step": 65025 + }, + { + "epoch": 1.3230387369791667, + "grad_norm": 16.67662239074707, + "learning_rate": 7.539538190179701e-06, + "loss": 4.5563, + "step": 65030 + }, + { + "epoch": 1.3231404622395833, + "grad_norm": 14.603075981140137, + "learning_rate": 7.539193895301283e-06, + "loss": 4.7574, + "step": 65035 + }, + { + "epoch": 1.3232421875, + "grad_norm": 15.353713989257812, + "learning_rate": 7.538849584198224e-06, + "loss": 4.9007, + "step": 65040 + }, + { + "epoch": 1.3233439127604167, + "grad_norm": 19.5292911529541, + "learning_rate": 7.538505256872721e-06, + "loss": 5.1496, + "step": 65045 + }, + { + "epoch": 1.3234456380208333, + "grad_norm": 23.61636734008789, + "learning_rate": 7.538160913326976e-06, + "loss": 4.6528, + "step": 65050 + }, + { + "epoch": 1.32354736328125, + "grad_norm": 28.213836669921875, + "learning_rate": 7.537816553563188e-06, + "loss": 4.5937, + "step": 65055 + }, + { + "epoch": 1.3236490885416667, + "grad_norm": 17.855783462524414, + "learning_rate": 7.537472177583558e-06, + "loss": 4.7974, + "step": 65060 + }, + { + "epoch": 1.3237508138020833, + "grad_norm": 18.819246292114258, + "learning_rate": 7.537127785390287e-06, + "loss": 5.0551, + "step": 65065 + }, + { + "epoch": 1.3238525390625, + "grad_norm": 12.834308624267578, + "learning_rate": 7.536783376985574e-06, + "loss": 4.9743, + "step": 65070 + }, + { + "epoch": 1.3239542643229167, + "grad_norm": 18.33173370361328, + "learning_rate": 7.536438952371621e-06, + "loss": 4.577, + "step": 65075 + }, + { + "epoch": 1.3240559895833333, + "grad_norm": 15.099251747131348, + "learning_rate": 7.536094511550629e-06, + "loss": 4.9439, + "step": 65080 + }, + { + "epoch": 1.32415771484375, + "grad_norm": 16.873149871826172, + "learning_rate": 7.5357500545247956e-06, + "loss": 4.5863, + "step": 65085 + }, + { + "epoch": 1.3242594401041667, + "grad_norm": 23.72490882873535, + "learning_rate": 7.5354055812963265e-06, + "loss": 5.3748, + "step": 65090 + }, + { + "epoch": 1.3243611653645833, + "grad_norm": 22.059778213500977, + "learning_rate": 7.53506109186742e-06, + "loss": 4.7682, + "step": 65095 + }, + { + "epoch": 1.324462890625, + "grad_norm": 20.268266677856445, + "learning_rate": 7.5347165862402785e-06, + "loss": 4.998, + "step": 65100 + }, + { + "epoch": 1.3245646158854167, + "grad_norm": 18.812070846557617, + "learning_rate": 7.534372064417102e-06, + "loss": 5.1187, + "step": 65105 + }, + { + "epoch": 1.3246663411458333, + "grad_norm": 17.047039031982422, + "learning_rate": 7.534027526400092e-06, + "loss": 5.0661, + "step": 65110 + }, + { + "epoch": 1.32476806640625, + "grad_norm": 16.24330711364746, + "learning_rate": 7.533682972191451e-06, + "loss": 4.8266, + "step": 65115 + }, + { + "epoch": 1.3248697916666667, + "grad_norm": 15.499776840209961, + "learning_rate": 7.5333384017933796e-06, + "loss": 4.9091, + "step": 65120 + }, + { + "epoch": 1.3249715169270833, + "grad_norm": 15.272528648376465, + "learning_rate": 7.53299381520808e-06, + "loss": 4.8355, + "step": 65125 + }, + { + "epoch": 1.3250732421875, + "grad_norm": 22.974998474121094, + "learning_rate": 7.532649212437755e-06, + "loss": 5.2122, + "step": 65130 + }, + { + "epoch": 1.3251749674479167, + "grad_norm": 15.061535835266113, + "learning_rate": 7.532304593484605e-06, + "loss": 4.9857, + "step": 65135 + }, + { + "epoch": 1.3252766927083333, + "grad_norm": 15.392556190490723, + "learning_rate": 7.531959958350832e-06, + "loss": 4.7856, + "step": 65140 + }, + { + "epoch": 1.32537841796875, + "grad_norm": 27.30059242248535, + "learning_rate": 7.53161530703864e-06, + "loss": 4.9749, + "step": 65145 + }, + { + "epoch": 1.3254801432291667, + "grad_norm": 19.5353946685791, + "learning_rate": 7.531270639550228e-06, + "loss": 5.0277, + "step": 65150 + }, + { + "epoch": 1.3255818684895833, + "grad_norm": 18.53433609008789, + "learning_rate": 7.5309259558878e-06, + "loss": 4.9492, + "step": 65155 + }, + { + "epoch": 1.32568359375, + "grad_norm": 20.246002197265625, + "learning_rate": 7.5305812560535606e-06, + "loss": 4.8557, + "step": 65160 + }, + { + "epoch": 1.3257853190104167, + "grad_norm": 17.56491470336914, + "learning_rate": 7.530236540049709e-06, + "loss": 4.8923, + "step": 65165 + }, + { + "epoch": 1.3258870442708333, + "grad_norm": 17.615598678588867, + "learning_rate": 7.529891807878449e-06, + "loss": 4.8669, + "step": 65170 + }, + { + "epoch": 1.32598876953125, + "grad_norm": 17.245296478271484, + "learning_rate": 7.5295470595419836e-06, + "loss": 4.9645, + "step": 65175 + }, + { + "epoch": 1.3260904947916667, + "grad_norm": 20.363828659057617, + "learning_rate": 7.529202295042517e-06, + "loss": 4.696, + "step": 65180 + }, + { + "epoch": 1.3261922200520833, + "grad_norm": 18.935951232910156, + "learning_rate": 7.528857514382248e-06, + "loss": 4.8628, + "step": 65185 + }, + { + "epoch": 1.3262939453125, + "grad_norm": 17.145252227783203, + "learning_rate": 7.528512717563384e-06, + "loss": 4.9969, + "step": 65190 + }, + { + "epoch": 1.3263956705729167, + "grad_norm": 19.853775024414062, + "learning_rate": 7.528167904588126e-06, + "loss": 4.7444, + "step": 65195 + }, + { + "epoch": 1.3264973958333333, + "grad_norm": 20.395404815673828, + "learning_rate": 7.527823075458678e-06, + "loss": 5.1208, + "step": 65200 + }, + { + "epoch": 1.32659912109375, + "grad_norm": 19.716230392456055, + "learning_rate": 7.527478230177243e-06, + "loss": 4.7573, + "step": 65205 + }, + { + "epoch": 1.3267008463541667, + "grad_norm": 19.717634201049805, + "learning_rate": 7.527133368746024e-06, + "loss": 5.0218, + "step": 65210 + }, + { + "epoch": 1.3268025716145833, + "grad_norm": 17.173585891723633, + "learning_rate": 7.526788491167225e-06, + "loss": 5.087, + "step": 65215 + }, + { + "epoch": 1.326904296875, + "grad_norm": 21.78041648864746, + "learning_rate": 7.526443597443049e-06, + "loss": 5.1727, + "step": 65220 + }, + { + "epoch": 1.3270060221354167, + "grad_norm": 15.613587379455566, + "learning_rate": 7.526098687575701e-06, + "loss": 4.7595, + "step": 65225 + }, + { + "epoch": 1.3271077473958333, + "grad_norm": 18.760286331176758, + "learning_rate": 7.5257537615673845e-06, + "loss": 4.9059, + "step": 65230 + }, + { + "epoch": 1.32720947265625, + "grad_norm": 20.67729949951172, + "learning_rate": 7.5254088194203036e-06, + "loss": 5.2121, + "step": 65235 + }, + { + "epoch": 1.3273111979166667, + "grad_norm": 16.715473175048828, + "learning_rate": 7.525063861136661e-06, + "loss": 5.0062, + "step": 65240 + }, + { + "epoch": 1.3274129231770833, + "grad_norm": 17.20186424255371, + "learning_rate": 7.524718886718663e-06, + "loss": 5.1545, + "step": 65245 + }, + { + "epoch": 1.3275146484375, + "grad_norm": 20.031658172607422, + "learning_rate": 7.524373896168512e-06, + "loss": 4.7657, + "step": 65250 + }, + { + "epoch": 1.3276163736979167, + "grad_norm": 20.57283592224121, + "learning_rate": 7.524028889488414e-06, + "loss": 4.7694, + "step": 65255 + }, + { + "epoch": 1.3277180989583333, + "grad_norm": 16.18259620666504, + "learning_rate": 7.5236838666805725e-06, + "loss": 4.9474, + "step": 65260 + }, + { + "epoch": 1.32781982421875, + "grad_norm": 18.194534301757812, + "learning_rate": 7.523338827747191e-06, + "loss": 4.8874, + "step": 65265 + }, + { + "epoch": 1.3279215494791667, + "grad_norm": 21.47846221923828, + "learning_rate": 7.522993772690476e-06, + "loss": 5.0115, + "step": 65270 + }, + { + "epoch": 1.3280232747395833, + "grad_norm": 20.73676872253418, + "learning_rate": 7.522648701512631e-06, + "loss": 4.9339, + "step": 65275 + }, + { + "epoch": 1.328125, + "grad_norm": 18.538406372070312, + "learning_rate": 7.5223036142158635e-06, + "loss": 4.9246, + "step": 65280 + }, + { + "epoch": 1.3282267252604167, + "grad_norm": 13.183521270751953, + "learning_rate": 7.521958510802376e-06, + "loss": 5.1535, + "step": 65285 + }, + { + "epoch": 1.3283284505208333, + "grad_norm": 22.875808715820312, + "learning_rate": 7.5216133912743735e-06, + "loss": 4.6409, + "step": 65290 + }, + { + "epoch": 1.32843017578125, + "grad_norm": 19.1414794921875, + "learning_rate": 7.521268255634061e-06, + "loss": 4.8705, + "step": 65295 + }, + { + "epoch": 1.3285319010416667, + "grad_norm": 18.78167152404785, + "learning_rate": 7.520923103883646e-06, + "loss": 4.835, + "step": 65300 + }, + { + "epoch": 1.3286336263020833, + "grad_norm": 14.435739517211914, + "learning_rate": 7.520577936025333e-06, + "loss": 4.8814, + "step": 65305 + }, + { + "epoch": 1.3287353515625, + "grad_norm": 22.90046501159668, + "learning_rate": 7.520232752061326e-06, + "loss": 5.0311, + "step": 65310 + }, + { + "epoch": 1.3288370768229167, + "grad_norm": 21.071290969848633, + "learning_rate": 7.519887551993832e-06, + "loss": 4.8627, + "step": 65315 + }, + { + "epoch": 1.3289388020833333, + "grad_norm": 13.283681869506836, + "learning_rate": 7.519542335825056e-06, + "loss": 5.024, + "step": 65320 + }, + { + "epoch": 1.32904052734375, + "grad_norm": 16.18885612487793, + "learning_rate": 7.519197103557206e-06, + "loss": 4.9077, + "step": 65325 + }, + { + "epoch": 1.3291422526041667, + "grad_norm": 15.360523223876953, + "learning_rate": 7.518851855192485e-06, + "loss": 4.967, + "step": 65330 + }, + { + "epoch": 1.3292439778645833, + "grad_norm": 17.407554626464844, + "learning_rate": 7.518506590733102e-06, + "loss": 4.6744, + "step": 65335 + }, + { + "epoch": 1.329345703125, + "grad_norm": 15.671525001525879, + "learning_rate": 7.518161310181258e-06, + "loss": 4.718, + "step": 65340 + }, + { + "epoch": 1.3294474283854167, + "grad_norm": 16.278244018554688, + "learning_rate": 7.517816013539165e-06, + "loss": 5.0096, + "step": 65345 + }, + { + "epoch": 1.3295491536458333, + "grad_norm": 13.288898468017578, + "learning_rate": 7.517470700809026e-06, + "loss": 4.9163, + "step": 65350 + }, + { + "epoch": 1.32965087890625, + "grad_norm": 14.909443855285645, + "learning_rate": 7.517125371993049e-06, + "loss": 4.8799, + "step": 65355 + }, + { + "epoch": 1.3297526041666667, + "grad_norm": 24.021276473999023, + "learning_rate": 7.51678002709344e-06, + "loss": 4.9992, + "step": 65360 + }, + { + "epoch": 1.3298543294270833, + "grad_norm": 12.450483322143555, + "learning_rate": 7.5164346661124046e-06, + "loss": 5.2146, + "step": 65365 + }, + { + "epoch": 1.3299560546875, + "grad_norm": 19.0225772857666, + "learning_rate": 7.516089289052152e-06, + "loss": 5.1042, + "step": 65370 + }, + { + "epoch": 1.3300577799479167, + "grad_norm": 21.206445693969727, + "learning_rate": 7.515743895914887e-06, + "loss": 5.2213, + "step": 65375 + }, + { + "epoch": 1.3301595052083333, + "grad_norm": 17.364953994750977, + "learning_rate": 7.515398486702816e-06, + "loss": 4.9845, + "step": 65380 + }, + { + "epoch": 1.33026123046875, + "grad_norm": 19.000322341918945, + "learning_rate": 7.515053061418148e-06, + "loss": 4.8716, + "step": 65385 + }, + { + "epoch": 1.3303629557291667, + "grad_norm": 20.55972671508789, + "learning_rate": 7.514707620063091e-06, + "loss": 4.6312, + "step": 65390 + }, + { + "epoch": 1.3304646809895833, + "grad_norm": 22.053735733032227, + "learning_rate": 7.514362162639848e-06, + "loss": 4.9077, + "step": 65395 + }, + { + "epoch": 1.33056640625, + "grad_norm": 20.494823455810547, + "learning_rate": 7.51401668915063e-06, + "loss": 4.8643, + "step": 65400 + }, + { + "epoch": 1.3306681315104167, + "grad_norm": 15.161890983581543, + "learning_rate": 7.513671199597643e-06, + "loss": 5.1407, + "step": 65405 + }, + { + "epoch": 1.3307698567708333, + "grad_norm": 19.121646881103516, + "learning_rate": 7.513325693983096e-06, + "loss": 5.1386, + "step": 65410 + }, + { + "epoch": 1.33087158203125, + "grad_norm": 18.524974822998047, + "learning_rate": 7.5129801723091945e-06, + "loss": 5.1268, + "step": 65415 + }, + { + "epoch": 1.3309733072916667, + "grad_norm": 16.115049362182617, + "learning_rate": 7.5126346345781465e-06, + "loss": 5.1179, + "step": 65420 + }, + { + "epoch": 1.3310750325520833, + "grad_norm": 13.53056812286377, + "learning_rate": 7.512289080792162e-06, + "loss": 4.7683, + "step": 65425 + }, + { + "epoch": 1.3311767578125, + "grad_norm": 14.877435684204102, + "learning_rate": 7.511943510953448e-06, + "loss": 4.8738, + "step": 65430 + }, + { + "epoch": 1.3312784830729167, + "grad_norm": 17.358068466186523, + "learning_rate": 7.511597925064211e-06, + "loss": 5.27, + "step": 65435 + }, + { + "epoch": 1.3313802083333333, + "grad_norm": 16.847198486328125, + "learning_rate": 7.511252323126663e-06, + "loss": 4.9535, + "step": 65440 + }, + { + "epoch": 1.33148193359375, + "grad_norm": 22.05145263671875, + "learning_rate": 7.510906705143008e-06, + "loss": 4.9776, + "step": 65445 + }, + { + "epoch": 1.3315836588541667, + "grad_norm": 15.023238182067871, + "learning_rate": 7.510561071115458e-06, + "loss": 4.8891, + "step": 65450 + }, + { + "epoch": 1.3316853841145833, + "grad_norm": 18.12995147705078, + "learning_rate": 7.510215421046219e-06, + "loss": 4.9244, + "step": 65455 + }, + { + "epoch": 1.331787109375, + "grad_norm": 19.788328170776367, + "learning_rate": 7.509869754937499e-06, + "loss": 4.8029, + "step": 65460 + }, + { + "epoch": 1.3318888346354167, + "grad_norm": 18.335729598999023, + "learning_rate": 7.50952407279151e-06, + "loss": 5.24, + "step": 65465 + }, + { + "epoch": 1.3319905598958333, + "grad_norm": 13.715157508850098, + "learning_rate": 7.509178374610456e-06, + "loss": 5.1318, + "step": 65470 + }, + { + "epoch": 1.33209228515625, + "grad_norm": 17.369539260864258, + "learning_rate": 7.5088326603965515e-06, + "loss": 5.1736, + "step": 65475 + }, + { + "epoch": 1.3321940104166667, + "grad_norm": 22.292484283447266, + "learning_rate": 7.508486930152002e-06, + "loss": 4.9175, + "step": 65480 + }, + { + "epoch": 1.3322957356770833, + "grad_norm": 20.449310302734375, + "learning_rate": 7.508141183879016e-06, + "loss": 5.1552, + "step": 65485 + }, + { + "epoch": 1.3323974609375, + "grad_norm": 20.026662826538086, + "learning_rate": 7.507795421579804e-06, + "loss": 5.0444, + "step": 65490 + }, + { + "epoch": 1.3324991861979167, + "grad_norm": 23.1492862701416, + "learning_rate": 7.5074496432565765e-06, + "loss": 4.8908, + "step": 65495 + }, + { + "epoch": 1.3326009114583333, + "grad_norm": 18.605777740478516, + "learning_rate": 7.507103848911542e-06, + "loss": 4.7811, + "step": 65500 + }, + { + "epoch": 1.33270263671875, + "grad_norm": 17.39668083190918, + "learning_rate": 7.50675803854691e-06, + "loss": 4.7159, + "step": 65505 + }, + { + "epoch": 1.3328043619791667, + "grad_norm": 29.379497528076172, + "learning_rate": 7.506412212164888e-06, + "loss": 5.2704, + "step": 65510 + }, + { + "epoch": 1.3329060872395833, + "grad_norm": 23.537044525146484, + "learning_rate": 7.506066369767691e-06, + "loss": 5.0056, + "step": 65515 + }, + { + "epoch": 1.3330078125, + "grad_norm": 19.654367446899414, + "learning_rate": 7.505720511357522e-06, + "loss": 5.0051, + "step": 65520 + }, + { + "epoch": 1.3331095377604167, + "grad_norm": 18.388822555541992, + "learning_rate": 7.505374636936596e-06, + "loss": 5.0901, + "step": 65525 + }, + { + "epoch": 1.3332112630208333, + "grad_norm": 18.468860626220703, + "learning_rate": 7.505028746507121e-06, + "loss": 4.9826, + "step": 65530 + }, + { + "epoch": 1.33331298828125, + "grad_norm": 16.118606567382812, + "learning_rate": 7.5046828400713065e-06, + "loss": 4.9175, + "step": 65535 + }, + { + "epoch": 1.3334147135416667, + "grad_norm": 19.541118621826172, + "learning_rate": 7.504336917631365e-06, + "loss": 4.9108, + "step": 65540 + }, + { + "epoch": 1.3335164388020833, + "grad_norm": 13.757919311523438, + "learning_rate": 7.503990979189506e-06, + "loss": 4.5916, + "step": 65545 + }, + { + "epoch": 1.3336181640625, + "grad_norm": 16.573989868164062, + "learning_rate": 7.503645024747939e-06, + "loss": 4.8942, + "step": 65550 + }, + { + "epoch": 1.3337198893229167, + "grad_norm": 16.34235954284668, + "learning_rate": 7.503299054308875e-06, + "loss": 4.7512, + "step": 65555 + }, + { + "epoch": 1.3338216145833333, + "grad_norm": 15.315339088439941, + "learning_rate": 7.5029530678745235e-06, + "loss": 4.9155, + "step": 65560 + }, + { + "epoch": 1.33392333984375, + "grad_norm": 20.162607192993164, + "learning_rate": 7.502607065447096e-06, + "loss": 5.0711, + "step": 65565 + }, + { + "epoch": 1.3340250651041667, + "grad_norm": 24.102384567260742, + "learning_rate": 7.502261047028807e-06, + "loss": 4.9922, + "step": 65570 + }, + { + "epoch": 1.3341267903645833, + "grad_norm": 15.114381790161133, + "learning_rate": 7.501915012621861e-06, + "loss": 5.0108, + "step": 65575 + }, + { + "epoch": 1.334228515625, + "grad_norm": 17.441566467285156, + "learning_rate": 7.501568962228475e-06, + "loss": 4.9865, + "step": 65580 + }, + { + "epoch": 1.3343302408854167, + "grad_norm": 16.08074188232422, + "learning_rate": 7.501222895850855e-06, + "loss": 4.8105, + "step": 65585 + }, + { + "epoch": 1.3344319661458333, + "grad_norm": 17.299936294555664, + "learning_rate": 7.5008768134912165e-06, + "loss": 4.9175, + "step": 65590 + }, + { + "epoch": 1.33453369140625, + "grad_norm": 19.312389373779297, + "learning_rate": 7.500530715151769e-06, + "loss": 5.0194, + "step": 65595 + }, + { + "epoch": 1.3346354166666667, + "grad_norm": 24.302534103393555, + "learning_rate": 7.500184600834724e-06, + "loss": 4.9477, + "step": 65600 + }, + { + "epoch": 1.3347371419270833, + "grad_norm": 17.963285446166992, + "learning_rate": 7.499838470542292e-06, + "loss": 5.0688, + "step": 65605 + }, + { + "epoch": 1.3348388671875, + "grad_norm": 16.401968002319336, + "learning_rate": 7.499492324276687e-06, + "loss": 5.0371, + "step": 65610 + }, + { + "epoch": 1.3349405924479167, + "grad_norm": 18.91037940979004, + "learning_rate": 7.499146162040118e-06, + "loss": 4.7119, + "step": 65615 + }, + { + "epoch": 1.3350423177083333, + "grad_norm": 15.139300346374512, + "learning_rate": 7.498799983834799e-06, + "loss": 4.9267, + "step": 65620 + }, + { + "epoch": 1.33514404296875, + "grad_norm": 14.217245101928711, + "learning_rate": 7.498453789662941e-06, + "loss": 4.8979, + "step": 65625 + }, + { + "epoch": 1.3352457682291667, + "grad_norm": 18.83346176147461, + "learning_rate": 7.498107579526758e-06, + "loss": 4.7297, + "step": 65630 + }, + { + "epoch": 1.3353474934895833, + "grad_norm": 15.610614776611328, + "learning_rate": 7.497761353428458e-06, + "loss": 4.7525, + "step": 65635 + }, + { + "epoch": 1.33544921875, + "grad_norm": 18.93500518798828, + "learning_rate": 7.497415111370258e-06, + "loss": 4.9445, + "step": 65640 + }, + { + "epoch": 1.3355509440104167, + "grad_norm": 16.257036209106445, + "learning_rate": 7.497068853354368e-06, + "loss": 4.9962, + "step": 65645 + }, + { + "epoch": 1.3356526692708333, + "grad_norm": 17.361553192138672, + "learning_rate": 7.496722579383e-06, + "loss": 5.1139, + "step": 65650 + }, + { + "epoch": 1.33575439453125, + "grad_norm": 16.014572143554688, + "learning_rate": 7.4963762894583666e-06, + "loss": 4.7836, + "step": 65655 + }, + { + "epoch": 1.3358561197916667, + "grad_norm": 18.644651412963867, + "learning_rate": 7.496029983582683e-06, + "loss": 4.8658, + "step": 65660 + }, + { + "epoch": 1.3359578450520833, + "grad_norm": 17.878995895385742, + "learning_rate": 7.49568366175816e-06, + "loss": 4.8408, + "step": 65665 + }, + { + "epoch": 1.3360595703125, + "grad_norm": 15.272200584411621, + "learning_rate": 7.4953373239870085e-06, + "loss": 4.6489, + "step": 65670 + }, + { + "epoch": 1.3361612955729167, + "grad_norm": 14.321714401245117, + "learning_rate": 7.494990970271446e-06, + "loss": 4.8934, + "step": 65675 + }, + { + "epoch": 1.3362630208333333, + "grad_norm": 18.066390991210938, + "learning_rate": 7.4946446006136815e-06, + "loss": 4.6116, + "step": 65680 + }, + { + "epoch": 1.33636474609375, + "grad_norm": 25.85806655883789, + "learning_rate": 7.494298215015931e-06, + "loss": 4.865, + "step": 65685 + }, + { + "epoch": 1.3364664713541667, + "grad_norm": 18.592390060424805, + "learning_rate": 7.493951813480406e-06, + "loss": 4.906, + "step": 65690 + }, + { + "epoch": 1.3365681966145833, + "grad_norm": 17.007640838623047, + "learning_rate": 7.493605396009321e-06, + "loss": 4.7246, + "step": 65695 + }, + { + "epoch": 1.336669921875, + "grad_norm": 32.39685821533203, + "learning_rate": 7.493258962604889e-06, + "loss": 4.8924, + "step": 65700 + }, + { + "epoch": 1.3367716471354167, + "grad_norm": 16.680435180664062, + "learning_rate": 7.4929125132693224e-06, + "loss": 5.0327, + "step": 65705 + }, + { + "epoch": 1.3368733723958333, + "grad_norm": 12.750327110290527, + "learning_rate": 7.492566048004839e-06, + "loss": 5.0925, + "step": 65710 + }, + { + "epoch": 1.33697509765625, + "grad_norm": 18.208789825439453, + "learning_rate": 7.492219566813648e-06, + "loss": 5.0083, + "step": 65715 + }, + { + "epoch": 1.3370768229166667, + "grad_norm": 15.227665901184082, + "learning_rate": 7.491873069697964e-06, + "loss": 4.9108, + "step": 65720 + }, + { + "epoch": 1.3371785481770833, + "grad_norm": 18.39636993408203, + "learning_rate": 7.491526556660005e-06, + "loss": 5.1516, + "step": 65725 + }, + { + "epoch": 1.3372802734375, + "grad_norm": 16.875904083251953, + "learning_rate": 7.4911800277019794e-06, + "loss": 4.9488, + "step": 65730 + }, + { + "epoch": 1.3373819986979167, + "grad_norm": 16.123878479003906, + "learning_rate": 7.490833482826107e-06, + "loss": 4.9871, + "step": 65735 + }, + { + "epoch": 1.3374837239583333, + "grad_norm": 20.1278076171875, + "learning_rate": 7.490486922034597e-06, + "loss": 5.1891, + "step": 65740 + }, + { + "epoch": 1.33758544921875, + "grad_norm": 17.38740348815918, + "learning_rate": 7.4901403453296676e-06, + "loss": 4.9471, + "step": 65745 + }, + { + "epoch": 1.3376871744791667, + "grad_norm": 24.70487403869629, + "learning_rate": 7.489793752713531e-06, + "loss": 4.8934, + "step": 65750 + }, + { + "epoch": 1.3377888997395833, + "grad_norm": 18.16274070739746, + "learning_rate": 7.489447144188403e-06, + "loss": 4.8253, + "step": 65755 + }, + { + "epoch": 1.337890625, + "grad_norm": 17.764236450195312, + "learning_rate": 7.489100519756498e-06, + "loss": 5.0365, + "step": 65760 + }, + { + "epoch": 1.3379923502604167, + "grad_norm": 19.98246192932129, + "learning_rate": 7.488753879420032e-06, + "loss": 4.8555, + "step": 65765 + }, + { + "epoch": 1.3380940755208333, + "grad_norm": 20.6583309173584, + "learning_rate": 7.4884072231812166e-06, + "loss": 4.8599, + "step": 65770 + }, + { + "epoch": 1.33819580078125, + "grad_norm": 19.64220428466797, + "learning_rate": 7.4880605510422696e-06, + "loss": 4.9469, + "step": 65775 + }, + { + "epoch": 1.3382975260416667, + "grad_norm": 21.519166946411133, + "learning_rate": 7.487713863005406e-06, + "loss": 5.0056, + "step": 65780 + }, + { + "epoch": 1.3383992513020833, + "grad_norm": 16.513275146484375, + "learning_rate": 7.48736715907284e-06, + "loss": 5.3487, + "step": 65785 + }, + { + "epoch": 1.3385009765625, + "grad_norm": 16.306129455566406, + "learning_rate": 7.48702043924679e-06, + "loss": 4.9421, + "step": 65790 + }, + { + "epoch": 1.3386027018229167, + "grad_norm": 18.859355926513672, + "learning_rate": 7.486673703529466e-06, + "loss": 4.7085, + "step": 65795 + }, + { + "epoch": 1.3387044270833333, + "grad_norm": 14.364882469177246, + "learning_rate": 7.486326951923086e-06, + "loss": 4.6327, + "step": 65800 + }, + { + "epoch": 1.33880615234375, + "grad_norm": 20.147043228149414, + "learning_rate": 7.485980184429868e-06, + "loss": 4.9181, + "step": 65805 + }, + { + "epoch": 1.3389078776041667, + "grad_norm": 18.907543182373047, + "learning_rate": 7.485633401052025e-06, + "loss": 4.9927, + "step": 65810 + }, + { + "epoch": 1.3390096028645833, + "grad_norm": 20.456377029418945, + "learning_rate": 7.4852866017917735e-06, + "loss": 4.8204, + "step": 65815 + }, + { + "epoch": 1.339111328125, + "grad_norm": 20.080821990966797, + "learning_rate": 7.484939786651329e-06, + "loss": 4.7255, + "step": 65820 + }, + { + "epoch": 1.3392130533854167, + "grad_norm": 14.00926685333252, + "learning_rate": 7.484592955632909e-06, + "loss": 4.7811, + "step": 65825 + }, + { + "epoch": 1.3393147786458333, + "grad_norm": 11.380099296569824, + "learning_rate": 7.484246108738728e-06, + "loss": 4.8446, + "step": 65830 + }, + { + "epoch": 1.33941650390625, + "grad_norm": 14.282723426818848, + "learning_rate": 7.483899245971004e-06, + "loss": 4.8419, + "step": 65835 + }, + { + "epoch": 1.3395182291666667, + "grad_norm": 18.444801330566406, + "learning_rate": 7.483552367331951e-06, + "loss": 4.9854, + "step": 65840 + }, + { + "epoch": 1.3396199544270833, + "grad_norm": 20.14075469970703, + "learning_rate": 7.483205472823787e-06, + "loss": 5.0121, + "step": 65845 + }, + { + "epoch": 1.3397216796875, + "grad_norm": 15.64073371887207, + "learning_rate": 7.4828585624487295e-06, + "loss": 4.6369, + "step": 65850 + }, + { + "epoch": 1.3398234049479167, + "grad_norm": 19.562162399291992, + "learning_rate": 7.482511636208994e-06, + "loss": 4.9946, + "step": 65855 + }, + { + "epoch": 1.3399251302083333, + "grad_norm": 23.327871322631836, + "learning_rate": 7.482164694106795e-06, + "loss": 4.8482, + "step": 65860 + }, + { + "epoch": 1.34002685546875, + "grad_norm": 12.93952751159668, + "learning_rate": 7.481817736144353e-06, + "loss": 4.8829, + "step": 65865 + }, + { + "epoch": 1.3401285807291667, + "grad_norm": 23.46917724609375, + "learning_rate": 7.481470762323883e-06, + "loss": 5.1982, + "step": 65870 + }, + { + "epoch": 1.3402303059895833, + "grad_norm": 13.378828048706055, + "learning_rate": 7.481123772647604e-06, + "loss": 5.0314, + "step": 65875 + }, + { + "epoch": 1.34033203125, + "grad_norm": 17.609207153320312, + "learning_rate": 7.48077676711773e-06, + "loss": 5.0095, + "step": 65880 + }, + { + "epoch": 1.3404337565104167, + "grad_norm": 26.537883758544922, + "learning_rate": 7.48042974573648e-06, + "loss": 4.798, + "step": 65885 + }, + { + "epoch": 1.3405354817708333, + "grad_norm": 19.21218490600586, + "learning_rate": 7.4800827085060735e-06, + "loss": 5.0904, + "step": 65890 + }, + { + "epoch": 1.34063720703125, + "grad_norm": 23.601341247558594, + "learning_rate": 7.479735655428725e-06, + "loss": 4.8474, + "step": 65895 + }, + { + "epoch": 1.3407389322916667, + "grad_norm": 18.408069610595703, + "learning_rate": 7.479388586506651e-06, + "loss": 5.2725, + "step": 65900 + }, + { + "epoch": 1.3408406575520833, + "grad_norm": 17.046062469482422, + "learning_rate": 7.479041501742072e-06, + "loss": 5.0672, + "step": 65905 + }, + { + "epoch": 1.3409423828125, + "grad_norm": 15.70458698272705, + "learning_rate": 7.478694401137205e-06, + "loss": 4.8844, + "step": 65910 + }, + { + "epoch": 1.3410441080729167, + "grad_norm": 16.060136795043945, + "learning_rate": 7.478347284694267e-06, + "loss": 4.9992, + "step": 65915 + }, + { + "epoch": 1.3411458333333333, + "grad_norm": 24.94729232788086, + "learning_rate": 7.478000152415479e-06, + "loss": 4.9207, + "step": 65920 + }, + { + "epoch": 1.34124755859375, + "grad_norm": 20.750110626220703, + "learning_rate": 7.477653004303054e-06, + "loss": 4.8287, + "step": 65925 + }, + { + "epoch": 1.3413492838541667, + "grad_norm": 14.992231369018555, + "learning_rate": 7.4773058403592135e-06, + "loss": 5.1192, + "step": 65930 + }, + { + "epoch": 1.3414510091145833, + "grad_norm": 14.524530410766602, + "learning_rate": 7.476958660586174e-06, + "loss": 4.8379, + "step": 65935 + }, + { + "epoch": 1.341552734375, + "grad_norm": 16.001127243041992, + "learning_rate": 7.476611464986156e-06, + "loss": 4.9327, + "step": 65940 + }, + { + "epoch": 1.3416544596354167, + "grad_norm": 20.39908218383789, + "learning_rate": 7.476264253561378e-06, + "loss": 4.659, + "step": 65945 + }, + { + "epoch": 1.3417561848958333, + "grad_norm": 16.443859100341797, + "learning_rate": 7.4759170263140565e-06, + "loss": 5.0445, + "step": 65950 + }, + { + "epoch": 1.34185791015625, + "grad_norm": 22.125505447387695, + "learning_rate": 7.4755697832464114e-06, + "loss": 4.7081, + "step": 65955 + }, + { + "epoch": 1.3419596354166667, + "grad_norm": 22.311418533325195, + "learning_rate": 7.47522252436066e-06, + "loss": 4.8286, + "step": 65960 + }, + { + "epoch": 1.3420613606770833, + "grad_norm": 17.37091064453125, + "learning_rate": 7.4748752496590235e-06, + "loss": 4.8511, + "step": 65965 + }, + { + "epoch": 1.3421630859375, + "grad_norm": 14.986549377441406, + "learning_rate": 7.47452795914372e-06, + "loss": 4.901, + "step": 65970 + }, + { + "epoch": 1.3422648111979167, + "grad_norm": 16.071325302124023, + "learning_rate": 7.474180652816967e-06, + "loss": 5.222, + "step": 65975 + }, + { + "epoch": 1.3423665364583333, + "grad_norm": 17.100942611694336, + "learning_rate": 7.4738333306809864e-06, + "loss": 4.7529, + "step": 65980 + }, + { + "epoch": 1.34246826171875, + "grad_norm": 11.855311393737793, + "learning_rate": 7.473485992737997e-06, + "loss": 4.7156, + "step": 65985 + }, + { + "epoch": 1.3425699869791667, + "grad_norm": 14.912687301635742, + "learning_rate": 7.473138638990215e-06, + "loss": 4.9743, + "step": 65990 + }, + { + "epoch": 1.3426717122395833, + "grad_norm": 15.077157020568848, + "learning_rate": 7.472791269439863e-06, + "loss": 4.7903, + "step": 65995 + }, + { + "epoch": 1.3427734375, + "grad_norm": 24.72382164001465, + "learning_rate": 7.47244388408916e-06, + "loss": 5.0382, + "step": 66000 + }, + { + "epoch": 1.3428751627604167, + "grad_norm": 19.14982032775879, + "learning_rate": 7.472096482940325e-06, + "loss": 4.9195, + "step": 66005 + }, + { + "epoch": 1.3429768880208333, + "grad_norm": 24.865371704101562, + "learning_rate": 7.471749065995581e-06, + "loss": 5.0431, + "step": 66010 + }, + { + "epoch": 1.34307861328125, + "grad_norm": 23.007898330688477, + "learning_rate": 7.471401633257142e-06, + "loss": 4.6854, + "step": 66015 + }, + { + "epoch": 1.3431803385416667, + "grad_norm": 22.448028564453125, + "learning_rate": 7.471054184727231e-06, + "loss": 5.1559, + "step": 66020 + }, + { + "epoch": 1.3432820638020833, + "grad_norm": 21.402511596679688, + "learning_rate": 7.470706720408069e-06, + "loss": 5.0027, + "step": 66025 + }, + { + "epoch": 1.3433837890625, + "grad_norm": 16.76889419555664, + "learning_rate": 7.470359240301877e-06, + "loss": 4.6494, + "step": 66030 + }, + { + "epoch": 1.3434855143229167, + "grad_norm": 16.943565368652344, + "learning_rate": 7.470011744410871e-06, + "loss": 5.0961, + "step": 66035 + }, + { + "epoch": 1.3435872395833333, + "grad_norm": 20.846233367919922, + "learning_rate": 7.469664232737275e-06, + "loss": 4.7505, + "step": 66040 + }, + { + "epoch": 1.34368896484375, + "grad_norm": 17.62108039855957, + "learning_rate": 7.469316705283309e-06, + "loss": 5.0247, + "step": 66045 + }, + { + "epoch": 1.3437906901041667, + "grad_norm": 12.064324378967285, + "learning_rate": 7.468969162051193e-06, + "loss": 4.9157, + "step": 66050 + }, + { + "epoch": 1.3438924153645833, + "grad_norm": 21.544029235839844, + "learning_rate": 7.468621603043147e-06, + "loss": 5.0442, + "step": 66055 + }, + { + "epoch": 1.343994140625, + "grad_norm": 17.718591690063477, + "learning_rate": 7.468274028261394e-06, + "loss": 5.0, + "step": 66060 + }, + { + "epoch": 1.3440958658854167, + "grad_norm": 22.385360717773438, + "learning_rate": 7.467926437708152e-06, + "loss": 4.8566, + "step": 66065 + }, + { + "epoch": 1.3441975911458333, + "grad_norm": 18.95868492126465, + "learning_rate": 7.467578831385644e-06, + "loss": 4.927, + "step": 66070 + }, + { + "epoch": 1.34429931640625, + "grad_norm": 17.1023006439209, + "learning_rate": 7.467231209296092e-06, + "loss": 5.0112, + "step": 66075 + }, + { + "epoch": 1.3444010416666667, + "grad_norm": 19.866567611694336, + "learning_rate": 7.466883571441714e-06, + "loss": 4.7904, + "step": 66080 + }, + { + "epoch": 1.3445027669270833, + "grad_norm": 20.332014083862305, + "learning_rate": 7.466535917824733e-06, + "loss": 4.9153, + "step": 66085 + }, + { + "epoch": 1.3446044921875, + "grad_norm": 22.289594650268555, + "learning_rate": 7.466188248447371e-06, + "loss": 5.0289, + "step": 66090 + }, + { + "epoch": 1.3447062174479167, + "grad_norm": 17.39463996887207, + "learning_rate": 7.46584056331185e-06, + "loss": 5.0493, + "step": 66095 + }, + { + "epoch": 1.3448079427083333, + "grad_norm": 17.871667861938477, + "learning_rate": 7.465492862420389e-06, + "loss": 4.8166, + "step": 66100 + }, + { + "epoch": 1.34490966796875, + "grad_norm": 14.361316680908203, + "learning_rate": 7.465145145775212e-06, + "loss": 4.8771, + "step": 66105 + }, + { + "epoch": 1.3450113932291667, + "grad_norm": 16.844825744628906, + "learning_rate": 7.46479741337854e-06, + "loss": 4.8072, + "step": 66110 + }, + { + "epoch": 1.3451131184895833, + "grad_norm": 20.301742553710938, + "learning_rate": 7.4644496652325945e-06, + "loss": 5.0037, + "step": 66115 + }, + { + "epoch": 1.34521484375, + "grad_norm": 13.38963508605957, + "learning_rate": 7.464101901339598e-06, + "loss": 4.9792, + "step": 66120 + }, + { + "epoch": 1.3453165690104167, + "grad_norm": 15.458901405334473, + "learning_rate": 7.463754121701773e-06, + "loss": 4.6679, + "step": 66125 + }, + { + "epoch": 1.3454182942708333, + "grad_norm": 16.954105377197266, + "learning_rate": 7.4634063263213404e-06, + "loss": 4.9195, + "step": 66130 + }, + { + "epoch": 1.34552001953125, + "grad_norm": 25.643848419189453, + "learning_rate": 7.463058515200525e-06, + "loss": 4.7457, + "step": 66135 + }, + { + "epoch": 1.3456217447916667, + "grad_norm": 18.31765365600586, + "learning_rate": 7.462710688341547e-06, + "loss": 5.0033, + "step": 66140 + }, + { + "epoch": 1.3457234700520833, + "grad_norm": 19.182003021240234, + "learning_rate": 7.462362845746629e-06, + "loss": 4.8517, + "step": 66145 + }, + { + "epoch": 1.3458251953125, + "grad_norm": 16.64269256591797, + "learning_rate": 7.462014987417994e-06, + "loss": 4.9733, + "step": 66150 + }, + { + "epoch": 1.3459269205729167, + "grad_norm": 18.41656494140625, + "learning_rate": 7.461667113357864e-06, + "loss": 4.839, + "step": 66155 + }, + { + "epoch": 1.3460286458333333, + "grad_norm": 17.819541931152344, + "learning_rate": 7.461319223568465e-06, + "loss": 4.8576, + "step": 66160 + }, + { + "epoch": 1.34613037109375, + "grad_norm": 16.64744758605957, + "learning_rate": 7.460971318052016e-06, + "loss": 4.9375, + "step": 66165 + }, + { + "epoch": 1.3462320963541667, + "grad_norm": 19.401954650878906, + "learning_rate": 7.460623396810741e-06, + "loss": 4.7431, + "step": 66170 + }, + { + "epoch": 1.3463338216145833, + "grad_norm": 19.379993438720703, + "learning_rate": 7.460275459846865e-06, + "loss": 5.0714, + "step": 66175 + }, + { + "epoch": 1.346435546875, + "grad_norm": 18.274051666259766, + "learning_rate": 7.4599275071626094e-06, + "loss": 4.7925, + "step": 66180 + }, + { + "epoch": 1.3465372721354167, + "grad_norm": 19.3553409576416, + "learning_rate": 7.459579538760197e-06, + "loss": 5.0368, + "step": 66185 + }, + { + "epoch": 1.3466389973958333, + "grad_norm": 18.746551513671875, + "learning_rate": 7.459231554641852e-06, + "loss": 4.9351, + "step": 66190 + }, + { + "epoch": 1.34674072265625, + "grad_norm": 14.99452018737793, + "learning_rate": 7.458883554809799e-06, + "loss": 5.0775, + "step": 66195 + }, + { + "epoch": 1.3468424479166667, + "grad_norm": 17.43594741821289, + "learning_rate": 7.458535539266261e-06, + "loss": 4.8745, + "step": 66200 + }, + { + "epoch": 1.3469441731770833, + "grad_norm": 17.577531814575195, + "learning_rate": 7.458187508013461e-06, + "loss": 5.2676, + "step": 66205 + }, + { + "epoch": 1.3470458984375, + "grad_norm": 16.99700164794922, + "learning_rate": 7.457839461053622e-06, + "loss": 4.8266, + "step": 66210 + }, + { + "epoch": 1.3471476236979167, + "grad_norm": 16.387014389038086, + "learning_rate": 7.45749139838897e-06, + "loss": 4.8415, + "step": 66215 + }, + { + "epoch": 1.3472493489583333, + "grad_norm": 17.19719886779785, + "learning_rate": 7.457143320021728e-06, + "loss": 4.7531, + "step": 66220 + }, + { + "epoch": 1.34735107421875, + "grad_norm": 19.373109817504883, + "learning_rate": 7.45679522595412e-06, + "loss": 5.0427, + "step": 66225 + }, + { + "epoch": 1.3474527994791667, + "grad_norm": 19.703170776367188, + "learning_rate": 7.45644711618837e-06, + "loss": 4.6178, + "step": 66230 + }, + { + "epoch": 1.3475545247395833, + "grad_norm": 18.810144424438477, + "learning_rate": 7.456098990726703e-06, + "loss": 4.9482, + "step": 66235 + }, + { + "epoch": 1.34765625, + "grad_norm": 20.60226821899414, + "learning_rate": 7.455750849571344e-06, + "loss": 5.1055, + "step": 66240 + }, + { + "epoch": 1.3477579752604167, + "grad_norm": 15.434432029724121, + "learning_rate": 7.455402692724516e-06, + "loss": 4.8654, + "step": 66245 + }, + { + "epoch": 1.3478597005208333, + "grad_norm": 15.15556812286377, + "learning_rate": 7.455054520188444e-06, + "loss": 4.8577, + "step": 66250 + }, + { + "epoch": 1.34796142578125, + "grad_norm": 17.362226486206055, + "learning_rate": 7.4547063319653535e-06, + "loss": 4.8759, + "step": 66255 + }, + { + "epoch": 1.3480631510416667, + "grad_norm": 16.94116973876953, + "learning_rate": 7.4543581280574685e-06, + "loss": 5.0551, + "step": 66260 + }, + { + "epoch": 1.3481648763020833, + "grad_norm": 22.784507751464844, + "learning_rate": 7.454009908467013e-06, + "loss": 4.7132, + "step": 66265 + }, + { + "epoch": 1.3482666015625, + "grad_norm": 16.462589263916016, + "learning_rate": 7.453661673196215e-06, + "loss": 5.1525, + "step": 66270 + }, + { + "epoch": 1.3483683268229167, + "grad_norm": 15.484063148498535, + "learning_rate": 7.453313422247297e-06, + "loss": 5.0646, + "step": 66275 + }, + { + "epoch": 1.3484700520833333, + "grad_norm": 24.960195541381836, + "learning_rate": 7.452965155622486e-06, + "loss": 5.2449, + "step": 66280 + }, + { + "epoch": 1.34857177734375, + "grad_norm": 19.971052169799805, + "learning_rate": 7.452616873324005e-06, + "loss": 4.8927, + "step": 66285 + }, + { + "epoch": 1.3486735026041667, + "grad_norm": 21.33244514465332, + "learning_rate": 7.452268575354082e-06, + "loss": 4.8875, + "step": 66290 + }, + { + "epoch": 1.3487752278645833, + "grad_norm": 16.292139053344727, + "learning_rate": 7.45192026171494e-06, + "loss": 5.0136, + "step": 66295 + }, + { + "epoch": 1.348876953125, + "grad_norm": 28.84184455871582, + "learning_rate": 7.451571932408805e-06, + "loss": 5.083, + "step": 66300 + }, + { + "epoch": 1.3489786783854167, + "grad_norm": 16.79098129272461, + "learning_rate": 7.451223587437906e-06, + "loss": 4.7992, + "step": 66305 + }, + { + "epoch": 1.3490804036458333, + "grad_norm": 22.504608154296875, + "learning_rate": 7.450875226804465e-06, + "loss": 5.0726, + "step": 66310 + }, + { + "epoch": 1.34918212890625, + "grad_norm": 20.100296020507812, + "learning_rate": 7.450526850510711e-06, + "loss": 4.9107, + "step": 66315 + }, + { + "epoch": 1.3492838541666667, + "grad_norm": 17.34429931640625, + "learning_rate": 7.450178458558866e-06, + "loss": 4.9591, + "step": 66320 + }, + { + "epoch": 1.3493855794270833, + "grad_norm": 17.67999267578125, + "learning_rate": 7.449830050951159e-06, + "loss": 4.7303, + "step": 66325 + }, + { + "epoch": 1.3494873046875, + "grad_norm": 20.366483688354492, + "learning_rate": 7.449481627689818e-06, + "loss": 4.7045, + "step": 66330 + }, + { + "epoch": 1.3495890299479167, + "grad_norm": 25.94993019104004, + "learning_rate": 7.449133188777065e-06, + "loss": 4.967, + "step": 66335 + }, + { + "epoch": 1.3496907552083333, + "grad_norm": 18.466270446777344, + "learning_rate": 7.448784734215127e-06, + "loss": 4.852, + "step": 66340 + }, + { + "epoch": 1.34979248046875, + "grad_norm": 17.89414405822754, + "learning_rate": 7.448436264006233e-06, + "loss": 4.6841, + "step": 66345 + }, + { + "epoch": 1.3498942057291667, + "grad_norm": 17.706016540527344, + "learning_rate": 7.448087778152609e-06, + "loss": 4.7994, + "step": 66350 + }, + { + "epoch": 1.3499959309895833, + "grad_norm": 18.387834548950195, + "learning_rate": 7.4477392766564815e-06, + "loss": 4.8768, + "step": 66355 + }, + { + "epoch": 1.35009765625, + "grad_norm": 15.091099739074707, + "learning_rate": 7.447390759520077e-06, + "loss": 5.0752, + "step": 66360 + }, + { + "epoch": 1.3501993815104167, + "grad_norm": 19.95486068725586, + "learning_rate": 7.4470422267456224e-06, + "loss": 4.9004, + "step": 66365 + }, + { + "epoch": 1.3503011067708333, + "grad_norm": 14.575539588928223, + "learning_rate": 7.446693678335344e-06, + "loss": 4.9987, + "step": 66370 + }, + { + "epoch": 1.35040283203125, + "grad_norm": 18.29110336303711, + "learning_rate": 7.44634511429147e-06, + "loss": 4.9987, + "step": 66375 + }, + { + "epoch": 1.3505045572916667, + "grad_norm": 14.742834091186523, + "learning_rate": 7.4459965346162265e-06, + "loss": 4.9559, + "step": 66380 + }, + { + "epoch": 1.3506062825520833, + "grad_norm": 12.987332344055176, + "learning_rate": 7.445647939311843e-06, + "loss": 4.9613, + "step": 66385 + }, + { + "epoch": 1.3507080078125, + "grad_norm": 16.25368881225586, + "learning_rate": 7.445299328380544e-06, + "loss": 5.0808, + "step": 66390 + }, + { + "epoch": 1.3508097330729167, + "grad_norm": 18.368982315063477, + "learning_rate": 7.44495070182456e-06, + "loss": 4.9091, + "step": 66395 + }, + { + "epoch": 1.3509114583333333, + "grad_norm": 22.138635635375977, + "learning_rate": 7.444602059646117e-06, + "loss": 5.1155, + "step": 66400 + }, + { + "epoch": 1.35101318359375, + "grad_norm": 16.708362579345703, + "learning_rate": 7.444253401847444e-06, + "loss": 4.7905, + "step": 66405 + }, + { + "epoch": 1.3511149088541667, + "grad_norm": 16.976417541503906, + "learning_rate": 7.443904728430766e-06, + "loss": 4.9097, + "step": 66410 + }, + { + "epoch": 1.3512166341145833, + "grad_norm": 19.71514129638672, + "learning_rate": 7.443556039398312e-06, + "loss": 5.0404, + "step": 66415 + }, + { + "epoch": 1.351318359375, + "grad_norm": 14.520707130432129, + "learning_rate": 7.443207334752312e-06, + "loss": 4.8957, + "step": 66420 + }, + { + "epoch": 1.3514200846354167, + "grad_norm": 18.322046279907227, + "learning_rate": 7.442858614494992e-06, + "loss": 4.5084, + "step": 66425 + }, + { + "epoch": 1.3515218098958333, + "grad_norm": 17.898908615112305, + "learning_rate": 7.442509878628581e-06, + "loss": 4.6908, + "step": 66430 + }, + { + "epoch": 1.35162353515625, + "grad_norm": 20.611257553100586, + "learning_rate": 7.4421611271553075e-06, + "loss": 4.945, + "step": 66435 + }, + { + "epoch": 1.3517252604166667, + "grad_norm": 18.51050567626953, + "learning_rate": 7.441812360077398e-06, + "loss": 4.8062, + "step": 66440 + }, + { + "epoch": 1.3518269856770833, + "grad_norm": 24.243846893310547, + "learning_rate": 7.441463577397086e-06, + "loss": 5.1069, + "step": 66445 + }, + { + "epoch": 1.3519287109375, + "grad_norm": 12.566925048828125, + "learning_rate": 7.441114779116594e-06, + "loss": 5.3321, + "step": 66450 + }, + { + "epoch": 1.3520304361979167, + "grad_norm": 19.403867721557617, + "learning_rate": 7.440765965238154e-06, + "loss": 5.0187, + "step": 66455 + }, + { + "epoch": 1.3521321614583333, + "grad_norm": 18.328603744506836, + "learning_rate": 7.440417135763996e-06, + "loss": 4.7634, + "step": 66460 + }, + { + "epoch": 1.35223388671875, + "grad_norm": 19.369075775146484, + "learning_rate": 7.440068290696345e-06, + "loss": 4.854, + "step": 66465 + }, + { + "epoch": 1.3523356119791667, + "grad_norm": 14.273224830627441, + "learning_rate": 7.4397194300374334e-06, + "loss": 4.8155, + "step": 66470 + }, + { + "epoch": 1.3524373372395833, + "grad_norm": 29.204757690429688, + "learning_rate": 7.439370553789489e-06, + "loss": 5.1161, + "step": 66475 + }, + { + "epoch": 1.3525390625, + "grad_norm": 15.160664558410645, + "learning_rate": 7.43902166195474e-06, + "loss": 5.1636, + "step": 66480 + }, + { + "epoch": 1.3526407877604167, + "grad_norm": 18.323747634887695, + "learning_rate": 7.438672754535418e-06, + "loss": 4.9347, + "step": 66485 + }, + { + "epoch": 1.3527425130208333, + "grad_norm": 20.18071746826172, + "learning_rate": 7.43832383153375e-06, + "loss": 4.9353, + "step": 66490 + }, + { + "epoch": 1.35284423828125, + "grad_norm": 16.833961486816406, + "learning_rate": 7.437974892951969e-06, + "loss": 5.127, + "step": 66495 + }, + { + "epoch": 1.3529459635416667, + "grad_norm": 17.39961051940918, + "learning_rate": 7.4376259387923e-06, + "loss": 4.9057, + "step": 66500 + }, + { + "epoch": 1.3530476888020833, + "grad_norm": 17.664573669433594, + "learning_rate": 7.437276969056977e-06, + "loss": 4.9673, + "step": 66505 + }, + { + "epoch": 1.3531494140625, + "grad_norm": 18.242202758789062, + "learning_rate": 7.436927983748225e-06, + "loss": 5.1933, + "step": 66510 + }, + { + "epoch": 1.3532511393229167, + "grad_norm": 20.362205505371094, + "learning_rate": 7.43657898286828e-06, + "loss": 4.992, + "step": 66515 + }, + { + "epoch": 1.3533528645833333, + "grad_norm": 24.94879722595215, + "learning_rate": 7.436229966419367e-06, + "loss": 4.8778, + "step": 66520 + }, + { + "epoch": 1.35345458984375, + "grad_norm": 17.213768005371094, + "learning_rate": 7.435880934403719e-06, + "loss": 4.9248, + "step": 66525 + }, + { + "epoch": 1.3535563151041667, + "grad_norm": 18.970468521118164, + "learning_rate": 7.435531886823563e-06, + "loss": 4.863, + "step": 66530 + }, + { + "epoch": 1.3536580403645833, + "grad_norm": 17.61522102355957, + "learning_rate": 7.435182823681133e-06, + "loss": 5.0144, + "step": 66535 + }, + { + "epoch": 1.353759765625, + "grad_norm": 19.990997314453125, + "learning_rate": 7.434833744978659e-06, + "loss": 4.8658, + "step": 66540 + }, + { + "epoch": 1.3538614908854167, + "grad_norm": 16.19689178466797, + "learning_rate": 7.4344846507183675e-06, + "loss": 4.7076, + "step": 66545 + }, + { + "epoch": 1.3539632161458333, + "grad_norm": 18.058271408081055, + "learning_rate": 7.434135540902494e-06, + "loss": 4.906, + "step": 66550 + }, + { + "epoch": 1.35406494140625, + "grad_norm": 19.0432186126709, + "learning_rate": 7.433786415533268e-06, + "loss": 5.2196, + "step": 66555 + }, + { + "epoch": 1.3541666666666667, + "grad_norm": 16.3552188873291, + "learning_rate": 7.433437274612916e-06, + "loss": 4.962, + "step": 66560 + }, + { + "epoch": 1.3542683919270833, + "grad_norm": 16.223499298095703, + "learning_rate": 7.433088118143674e-06, + "loss": 4.8128, + "step": 66565 + }, + { + "epoch": 1.3543701171875, + "grad_norm": 14.607945442199707, + "learning_rate": 7.432738946127771e-06, + "loss": 4.8987, + "step": 66570 + }, + { + "epoch": 1.3544718424479167, + "grad_norm": 17.588733673095703, + "learning_rate": 7.43238975856744e-06, + "loss": 4.8452, + "step": 66575 + }, + { + "epoch": 1.3545735677083333, + "grad_norm": 20.905487060546875, + "learning_rate": 7.43204055546491e-06, + "loss": 4.7977, + "step": 66580 + }, + { + "epoch": 1.35467529296875, + "grad_norm": 20.956558227539062, + "learning_rate": 7.431691336822411e-06, + "loss": 4.8459, + "step": 66585 + }, + { + "epoch": 1.3547770182291667, + "grad_norm": 24.830049514770508, + "learning_rate": 7.431342102642178e-06, + "loss": 4.7485, + "step": 66590 + }, + { + "epoch": 1.3548787434895833, + "grad_norm": 15.839820861816406, + "learning_rate": 7.430992852926439e-06, + "loss": 5.017, + "step": 66595 + }, + { + "epoch": 1.35498046875, + "grad_norm": 15.040416717529297, + "learning_rate": 7.430643587677429e-06, + "loss": 4.6984, + "step": 66600 + }, + { + "epoch": 1.3550821940104167, + "grad_norm": 14.773219108581543, + "learning_rate": 7.430294306897379e-06, + "loss": 4.7434, + "step": 66605 + }, + { + "epoch": 1.3551839192708333, + "grad_norm": 16.95623207092285, + "learning_rate": 7.429945010588517e-06, + "loss": 4.8248, + "step": 66610 + }, + { + "epoch": 1.35528564453125, + "grad_norm": 28.71949005126953, + "learning_rate": 7.42959569875308e-06, + "loss": 4.6462, + "step": 66615 + }, + { + "epoch": 1.3553873697916667, + "grad_norm": 23.926952362060547, + "learning_rate": 7.429246371393298e-06, + "loss": 5.0565, + "step": 66620 + }, + { + "epoch": 1.3554890950520833, + "grad_norm": 18.815113067626953, + "learning_rate": 7.428897028511401e-06, + "loss": 5.0468, + "step": 66625 + }, + { + "epoch": 1.3555908203125, + "grad_norm": 14.185515403747559, + "learning_rate": 7.4285476701096235e-06, + "loss": 4.8286, + "step": 66630 + }, + { + "epoch": 1.3556925455729167, + "grad_norm": 13.535439491271973, + "learning_rate": 7.428198296190199e-06, + "loss": 4.6758, + "step": 66635 + }, + { + "epoch": 1.3557942708333333, + "grad_norm": 15.897189140319824, + "learning_rate": 7.427848906755355e-06, + "loss": 4.903, + "step": 66640 + }, + { + "epoch": 1.35589599609375, + "grad_norm": 17.806529998779297, + "learning_rate": 7.42749950180733e-06, + "loss": 5.3832, + "step": 66645 + }, + { + "epoch": 1.3559977213541667, + "grad_norm": 16.45578956604004, + "learning_rate": 7.427150081348352e-06, + "loss": 4.8518, + "step": 66650 + }, + { + "epoch": 1.3560994466145833, + "grad_norm": 17.806015014648438, + "learning_rate": 7.426800645380658e-06, + "loss": 4.8076, + "step": 66655 + }, + { + "epoch": 1.356201171875, + "grad_norm": 16.256696701049805, + "learning_rate": 7.4264511939064754e-06, + "loss": 4.7327, + "step": 66660 + }, + { + "epoch": 1.3563028971354167, + "grad_norm": 19.37310218811035, + "learning_rate": 7.426101726928041e-06, + "loss": 5.1961, + "step": 66665 + }, + { + "epoch": 1.3564046223958333, + "grad_norm": 14.22137451171875, + "learning_rate": 7.425752244447588e-06, + "loss": 4.6205, + "step": 66670 + }, + { + "epoch": 1.35650634765625, + "grad_norm": 27.590511322021484, + "learning_rate": 7.425402746467346e-06, + "loss": 4.9066, + "step": 66675 + }, + { + "epoch": 1.3566080729166667, + "grad_norm": 18.721221923828125, + "learning_rate": 7.425053232989552e-06, + "loss": 5.0348, + "step": 66680 + }, + { + "epoch": 1.3567097981770833, + "grad_norm": 22.18437385559082, + "learning_rate": 7.424703704016436e-06, + "loss": 4.9345, + "step": 66685 + }, + { + "epoch": 1.3568115234375, + "grad_norm": 21.486713409423828, + "learning_rate": 7.424354159550236e-06, + "loss": 5.2147, + "step": 66690 + }, + { + "epoch": 1.3569132486979167, + "grad_norm": 32.27192306518555, + "learning_rate": 7.4240045995931795e-06, + "loss": 4.786, + "step": 66695 + }, + { + "epoch": 1.3570149739583333, + "grad_norm": 15.117780685424805, + "learning_rate": 7.423655024147505e-06, + "loss": 4.8041, + "step": 66700 + }, + { + "epoch": 1.35711669921875, + "grad_norm": 17.52720069885254, + "learning_rate": 7.423305433215443e-06, + "loss": 5.0262, + "step": 66705 + }, + { + "epoch": 1.3572184244791667, + "grad_norm": 18.592233657836914, + "learning_rate": 7.422955826799229e-06, + "loss": 5.0191, + "step": 66710 + }, + { + "epoch": 1.3573201497395833, + "grad_norm": 14.705594062805176, + "learning_rate": 7.4226062049010965e-06, + "loss": 4.9295, + "step": 66715 + }, + { + "epoch": 1.357421875, + "grad_norm": 18.522884368896484, + "learning_rate": 7.42225656752328e-06, + "loss": 5.0585, + "step": 66720 + }, + { + "epoch": 1.3575236002604167, + "grad_norm": 18.22498893737793, + "learning_rate": 7.421906914668011e-06, + "loss": 4.8034, + "step": 66725 + }, + { + "epoch": 1.3576253255208333, + "grad_norm": 15.50082015991211, + "learning_rate": 7.421557246337528e-06, + "loss": 4.8374, + "step": 66730 + }, + { + "epoch": 1.35772705078125, + "grad_norm": 16.216169357299805, + "learning_rate": 7.421207562534062e-06, + "loss": 4.8135, + "step": 66735 + }, + { + "epoch": 1.3578287760416667, + "grad_norm": 19.845861434936523, + "learning_rate": 7.420857863259847e-06, + "loss": 4.8677, + "step": 66740 + }, + { + "epoch": 1.3579305013020833, + "grad_norm": 17.760009765625, + "learning_rate": 7.42050814851712e-06, + "loss": 4.7336, + "step": 66745 + }, + { + "epoch": 1.3580322265625, + "grad_norm": 17.548789978027344, + "learning_rate": 7.4201584183081144e-06, + "loss": 4.8919, + "step": 66750 + }, + { + "epoch": 1.3581339518229167, + "grad_norm": 17.40427589416504, + "learning_rate": 7.419808672635065e-06, + "loss": 4.9917, + "step": 66755 + }, + { + "epoch": 1.3582356770833333, + "grad_norm": 14.880545616149902, + "learning_rate": 7.419458911500205e-06, + "loss": 4.8581, + "step": 66760 + }, + { + "epoch": 1.35833740234375, + "grad_norm": 20.605308532714844, + "learning_rate": 7.419109134905772e-06, + "loss": 5.1424, + "step": 66765 + }, + { + "epoch": 1.3584391276041667, + "grad_norm": 15.316008567810059, + "learning_rate": 7.418759342853998e-06, + "loss": 5.1862, + "step": 66770 + }, + { + "epoch": 1.3585408528645833, + "grad_norm": 15.511534690856934, + "learning_rate": 7.418409535347122e-06, + "loss": 4.9286, + "step": 66775 + }, + { + "epoch": 1.358642578125, + "grad_norm": 16.792423248291016, + "learning_rate": 7.418059712387374e-06, + "loss": 4.7436, + "step": 66780 + }, + { + "epoch": 1.3587443033854167, + "grad_norm": 14.328646659851074, + "learning_rate": 7.417709873976994e-06, + "loss": 5.1844, + "step": 66785 + }, + { + "epoch": 1.3588460286458333, + "grad_norm": 17.394163131713867, + "learning_rate": 7.417360020118214e-06, + "loss": 5.0686, + "step": 66790 + }, + { + "epoch": 1.35894775390625, + "grad_norm": 17.835596084594727, + "learning_rate": 7.417010150813272e-06, + "loss": 4.9336, + "step": 66795 + }, + { + "epoch": 1.3590494791666667, + "grad_norm": 20.6325626373291, + "learning_rate": 7.416660266064403e-06, + "loss": 4.9449, + "step": 66800 + }, + { + "epoch": 1.3591512044270833, + "grad_norm": 15.996380805969238, + "learning_rate": 7.41631036587384e-06, + "loss": 4.7306, + "step": 66805 + }, + { + "epoch": 1.3592529296875, + "grad_norm": 21.98931121826172, + "learning_rate": 7.415960450243822e-06, + "loss": 5.0253, + "step": 66810 + }, + { + "epoch": 1.3593546549479167, + "grad_norm": 12.932878494262695, + "learning_rate": 7.415610519176584e-06, + "loss": 5.1234, + "step": 66815 + }, + { + "epoch": 1.3594563802083333, + "grad_norm": 18.620433807373047, + "learning_rate": 7.415260572674361e-06, + "loss": 4.9639, + "step": 66820 + }, + { + "epoch": 1.35955810546875, + "grad_norm": 18.988412857055664, + "learning_rate": 7.4149106107393896e-06, + "loss": 4.8847, + "step": 66825 + }, + { + "epoch": 1.3596598307291667, + "grad_norm": 17.17414093017578, + "learning_rate": 7.4145606333739065e-06, + "loss": 5.0534, + "step": 66830 + }, + { + "epoch": 1.3597615559895833, + "grad_norm": 16.8463134765625, + "learning_rate": 7.414210640580147e-06, + "loss": 4.8526, + "step": 66835 + }, + { + "epoch": 1.35986328125, + "grad_norm": 14.1018648147583, + "learning_rate": 7.413860632360349e-06, + "loss": 4.7134, + "step": 66840 + }, + { + "epoch": 1.3599650065104167, + "grad_norm": 21.034692764282227, + "learning_rate": 7.413510608716746e-06, + "loss": 4.9641, + "step": 66845 + }, + { + "epoch": 1.3600667317708333, + "grad_norm": 17.443588256835938, + "learning_rate": 7.413160569651578e-06, + "loss": 4.894, + "step": 66850 + }, + { + "epoch": 1.36016845703125, + "grad_norm": 19.483436584472656, + "learning_rate": 7.412810515167078e-06, + "loss": 4.9311, + "step": 66855 + }, + { + "epoch": 1.3602701822916667, + "grad_norm": 16.036035537719727, + "learning_rate": 7.4124604452654865e-06, + "loss": 4.9866, + "step": 66860 + }, + { + "epoch": 1.3603719075520833, + "grad_norm": 29.542606353759766, + "learning_rate": 7.412110359949038e-06, + "loss": 4.6835, + "step": 66865 + }, + { + "epoch": 1.3604736328125, + "grad_norm": 22.768945693969727, + "learning_rate": 7.41176025921997e-06, + "loss": 4.877, + "step": 66870 + }, + { + "epoch": 1.3605753580729167, + "grad_norm": 16.86309814453125, + "learning_rate": 7.41141014308052e-06, + "loss": 4.875, + "step": 66875 + }, + { + "epoch": 1.3606770833333333, + "grad_norm": 20.350860595703125, + "learning_rate": 7.4110600115329234e-06, + "loss": 5.2781, + "step": 66880 + }, + { + "epoch": 1.36077880859375, + "grad_norm": 13.696741104125977, + "learning_rate": 7.41070986457942e-06, + "loss": 4.7358, + "step": 66885 + }, + { + "epoch": 1.3608805338541667, + "grad_norm": 17.231374740600586, + "learning_rate": 7.410359702222245e-06, + "loss": 4.8201, + "step": 66890 + }, + { + "epoch": 1.3609822591145833, + "grad_norm": 20.389352798461914, + "learning_rate": 7.4100095244636375e-06, + "loss": 4.6408, + "step": 66895 + }, + { + "epoch": 1.361083984375, + "grad_norm": 15.800028800964355, + "learning_rate": 7.409659331305833e-06, + "loss": 4.9162, + "step": 66900 + }, + { + "epoch": 1.3611857096354167, + "grad_norm": 18.28310203552246, + "learning_rate": 7.409309122751072e-06, + "loss": 4.8031, + "step": 66905 + }, + { + "epoch": 1.3612874348958333, + "grad_norm": 19.804428100585938, + "learning_rate": 7.408958898801588e-06, + "loss": 4.7507, + "step": 66910 + }, + { + "epoch": 1.36138916015625, + "grad_norm": 21.29274559020996, + "learning_rate": 7.408608659459624e-06, + "loss": 4.8111, + "step": 66915 + }, + { + "epoch": 1.3614908854166667, + "grad_norm": 18.46656036376953, + "learning_rate": 7.4082584047274134e-06, + "loss": 5.0645, + "step": 66920 + }, + { + "epoch": 1.3615926106770833, + "grad_norm": 17.282678604125977, + "learning_rate": 7.407908134607197e-06, + "loss": 4.9634, + "step": 66925 + }, + { + "epoch": 1.3616943359375, + "grad_norm": 19.309932708740234, + "learning_rate": 7.407557849101213e-06, + "loss": 4.8555, + "step": 66930 + }, + { + "epoch": 1.3617960611979167, + "grad_norm": 15.617399215698242, + "learning_rate": 7.407207548211695e-06, + "loss": 4.8839, + "step": 66935 + }, + { + "epoch": 1.3618977864583333, + "grad_norm": 17.49759864807129, + "learning_rate": 7.406857231940888e-06, + "loss": 4.9568, + "step": 66940 + }, + { + "epoch": 1.36199951171875, + "grad_norm": 14.314361572265625, + "learning_rate": 7.406506900291026e-06, + "loss": 4.981, + "step": 66945 + }, + { + "epoch": 1.3621012369791667, + "grad_norm": 19.82388687133789, + "learning_rate": 7.406156553264349e-06, + "loss": 4.9299, + "step": 66950 + }, + { + "epoch": 1.3622029622395833, + "grad_norm": 25.168729782104492, + "learning_rate": 7.405806190863097e-06, + "loss": 5.1082, + "step": 66955 + }, + { + "epoch": 1.3623046875, + "grad_norm": 21.671146392822266, + "learning_rate": 7.405455813089506e-06, + "loss": 5.1745, + "step": 66960 + }, + { + "epoch": 1.3624064127604167, + "grad_norm": 16.76918601989746, + "learning_rate": 7.405105419945816e-06, + "loss": 4.8802, + "step": 66965 + }, + { + "epoch": 1.3625081380208333, + "grad_norm": 15.019890785217285, + "learning_rate": 7.4047550114342655e-06, + "loss": 4.9216, + "step": 66970 + }, + { + "epoch": 1.36260986328125, + "grad_norm": 16.2127628326416, + "learning_rate": 7.404404587557095e-06, + "loss": 5.1423, + "step": 66975 + }, + { + "epoch": 1.3627115885416667, + "grad_norm": 21.38418960571289, + "learning_rate": 7.404054148316541e-06, + "loss": 4.844, + "step": 66980 + }, + { + "epoch": 1.3628133138020833, + "grad_norm": 18.085773468017578, + "learning_rate": 7.4037036937148455e-06, + "loss": 5.0319, + "step": 66985 + }, + { + "epoch": 1.3629150390625, + "grad_norm": 15.254314422607422, + "learning_rate": 7.403353223754246e-06, + "loss": 4.886, + "step": 66990 + }, + { + "epoch": 1.3630167643229167, + "grad_norm": 20.901655197143555, + "learning_rate": 7.403002738436981e-06, + "loss": 4.9216, + "step": 66995 + }, + { + "epoch": 1.3631184895833333, + "grad_norm": 23.72600746154785, + "learning_rate": 7.402652237765293e-06, + "loss": 5.0325, + "step": 67000 + }, + { + "epoch": 1.36322021484375, + "grad_norm": 19.48319435119629, + "learning_rate": 7.40230172174142e-06, + "loss": 5.1822, + "step": 67005 + }, + { + "epoch": 1.3633219401041667, + "grad_norm": 18.53206443786621, + "learning_rate": 7.4019511903676e-06, + "loss": 5.0146, + "step": 67010 + }, + { + "epoch": 1.3634236653645833, + "grad_norm": 15.462141036987305, + "learning_rate": 7.4016006436460755e-06, + "loss": 4.8234, + "step": 67015 + }, + { + "epoch": 1.363525390625, + "grad_norm": 17.577531814575195, + "learning_rate": 7.401250081579085e-06, + "loss": 5.2832, + "step": 67020 + }, + { + "epoch": 1.3636271158854167, + "grad_norm": 15.491592407226562, + "learning_rate": 7.400899504168868e-06, + "loss": 4.8522, + "step": 67025 + }, + { + "epoch": 1.3637288411458333, + "grad_norm": 23.398269653320312, + "learning_rate": 7.400548911417667e-06, + "loss": 4.8522, + "step": 67030 + }, + { + "epoch": 1.36383056640625, + "grad_norm": 14.717081069946289, + "learning_rate": 7.400198303327719e-06, + "loss": 4.8533, + "step": 67035 + }, + { + "epoch": 1.3639322916666667, + "grad_norm": 17.756153106689453, + "learning_rate": 7.3998476799012665e-06, + "loss": 4.8155, + "step": 67040 + }, + { + "epoch": 1.3640340169270833, + "grad_norm": 20.875516891479492, + "learning_rate": 7.399497041140549e-06, + "loss": 5.0557, + "step": 67045 + }, + { + "epoch": 1.3641357421875, + "grad_norm": 18.751693725585938, + "learning_rate": 7.399146387047806e-06, + "loss": 4.898, + "step": 67050 + }, + { + "epoch": 1.3642374674479167, + "grad_norm": 21.674230575561523, + "learning_rate": 7.398795717625281e-06, + "loss": 4.869, + "step": 67055 + }, + { + "epoch": 1.3643391927083333, + "grad_norm": 20.1216983795166, + "learning_rate": 7.398445032875212e-06, + "loss": 4.9364, + "step": 67060 + }, + { + "epoch": 1.36444091796875, + "grad_norm": 18.386932373046875, + "learning_rate": 7.39809433279984e-06, + "loss": 5.135, + "step": 67065 + }, + { + "epoch": 1.3645426432291667, + "grad_norm": 16.840953826904297, + "learning_rate": 7.3977436174014085e-06, + "loss": 4.9644, + "step": 67070 + }, + { + "epoch": 1.3646443684895833, + "grad_norm": 19.25299835205078, + "learning_rate": 7.397392886682154e-06, + "loss": 4.8744, + "step": 67075 + }, + { + "epoch": 1.36474609375, + "grad_norm": 14.026302337646484, + "learning_rate": 7.3970421406443215e-06, + "loss": 4.9506, + "step": 67080 + }, + { + "epoch": 1.3648478190104167, + "grad_norm": 20.444656372070312, + "learning_rate": 7.39669137929015e-06, + "loss": 4.9997, + "step": 67085 + }, + { + "epoch": 1.3649495442708333, + "grad_norm": 18.210481643676758, + "learning_rate": 7.396340602621882e-06, + "loss": 4.8872, + "step": 67090 + }, + { + "epoch": 1.36505126953125, + "grad_norm": 20.199045181274414, + "learning_rate": 7.395989810641758e-06, + "loss": 4.8841, + "step": 67095 + }, + { + "epoch": 1.3651529947916667, + "grad_norm": 14.611749649047852, + "learning_rate": 7.3956390033520185e-06, + "loss": 4.8898, + "step": 67100 + }, + { + "epoch": 1.3652547200520833, + "grad_norm": 18.865291595458984, + "learning_rate": 7.395288180754908e-06, + "loss": 4.7382, + "step": 67105 + }, + { + "epoch": 1.3653564453125, + "grad_norm": 16.52170753479004, + "learning_rate": 7.394937342852666e-06, + "loss": 4.8034, + "step": 67110 + }, + { + "epoch": 1.3654581705729167, + "grad_norm": 18.972549438476562, + "learning_rate": 7.394586489647534e-06, + "loss": 4.8177, + "step": 67115 + }, + { + "epoch": 1.3655598958333333, + "grad_norm": 18.0881404876709, + "learning_rate": 7.394235621141755e-06, + "loss": 4.9156, + "step": 67120 + }, + { + "epoch": 1.36566162109375, + "grad_norm": 17.084077835083008, + "learning_rate": 7.39388473733757e-06, + "loss": 4.7692, + "step": 67125 + }, + { + "epoch": 1.3657633463541667, + "grad_norm": 15.2094087600708, + "learning_rate": 7.393533838237222e-06, + "loss": 4.8434, + "step": 67130 + }, + { + "epoch": 1.3658650716145833, + "grad_norm": 20.294498443603516, + "learning_rate": 7.393182923842952e-06, + "loss": 4.7622, + "step": 67135 + }, + { + "epoch": 1.365966796875, + "grad_norm": 15.251999855041504, + "learning_rate": 7.392831994157004e-06, + "loss": 4.7613, + "step": 67140 + }, + { + "epoch": 1.3660685221354167, + "grad_norm": 15.604182243347168, + "learning_rate": 7.3924810491816175e-06, + "loss": 4.9642, + "step": 67145 + }, + { + "epoch": 1.3661702473958333, + "grad_norm": 20.222768783569336, + "learning_rate": 7.392130088919038e-06, + "loss": 4.746, + "step": 67150 + }, + { + "epoch": 1.36627197265625, + "grad_norm": 24.211925506591797, + "learning_rate": 7.391779113371505e-06, + "loss": 4.9785, + "step": 67155 + }, + { + "epoch": 1.3663736979166667, + "grad_norm": 19.38014030456543, + "learning_rate": 7.3914281225412645e-06, + "loss": 4.8076, + "step": 67160 + }, + { + "epoch": 1.3664754231770833, + "grad_norm": 17.678810119628906, + "learning_rate": 7.391077116430556e-06, + "loss": 5.0826, + "step": 67165 + }, + { + "epoch": 1.3665771484375, + "grad_norm": 15.548529624938965, + "learning_rate": 7.390726095041625e-06, + "loss": 4.7725, + "step": 67170 + }, + { + "epoch": 1.3666788736979167, + "grad_norm": 20.98904800415039, + "learning_rate": 7.390375058376711e-06, + "loss": 5.004, + "step": 67175 + }, + { + "epoch": 1.3667805989583333, + "grad_norm": 20.334951400756836, + "learning_rate": 7.3900240064380605e-06, + "loss": 4.842, + "step": 67180 + }, + { + "epoch": 1.36688232421875, + "grad_norm": 19.731557846069336, + "learning_rate": 7.389672939227916e-06, + "loss": 4.7737, + "step": 67185 + }, + { + "epoch": 1.3669840494791667, + "grad_norm": 18.383384704589844, + "learning_rate": 7.389321856748519e-06, + "loss": 4.8842, + "step": 67190 + }, + { + "epoch": 1.3670857747395833, + "grad_norm": 14.684517860412598, + "learning_rate": 7.3889707590021144e-06, + "loss": 4.8859, + "step": 67195 + }, + { + "epoch": 1.3671875, + "grad_norm": 18.496795654296875, + "learning_rate": 7.388619645990945e-06, + "loss": 5.0928, + "step": 67200 + }, + { + "epoch": 1.3672892252604167, + "grad_norm": 17.49211311340332, + "learning_rate": 7.388268517717254e-06, + "loss": 4.8687, + "step": 67205 + }, + { + "epoch": 1.3673909505208333, + "grad_norm": 15.963650703430176, + "learning_rate": 7.387917374183286e-06, + "loss": 4.8533, + "step": 67210 + }, + { + "epoch": 1.36749267578125, + "grad_norm": 17.079505920410156, + "learning_rate": 7.3875662153912845e-06, + "loss": 5.231, + "step": 67215 + }, + { + "epoch": 1.3675944010416667, + "grad_norm": 28.88792610168457, + "learning_rate": 7.3872150413434905e-06, + "loss": 4.7294, + "step": 67220 + }, + { + "epoch": 1.3676961263020833, + "grad_norm": 15.38591480255127, + "learning_rate": 7.386863852042152e-06, + "loss": 4.9851, + "step": 67225 + }, + { + "epoch": 1.3677978515625, + "grad_norm": 14.176165580749512, + "learning_rate": 7.38651264748951e-06, + "loss": 5.1236, + "step": 67230 + }, + { + "epoch": 1.3678995768229167, + "grad_norm": 19.978744506835938, + "learning_rate": 7.38616142768781e-06, + "loss": 5.3902, + "step": 67235 + }, + { + "epoch": 1.3680013020833333, + "grad_norm": 19.452255249023438, + "learning_rate": 7.3858101926392975e-06, + "loss": 4.7176, + "step": 67240 + }, + { + "epoch": 1.36810302734375, + "grad_norm": 17.9226131439209, + "learning_rate": 7.385458942346213e-06, + "loss": 5.1185, + "step": 67245 + }, + { + "epoch": 1.3682047526041667, + "grad_norm": 20.722185134887695, + "learning_rate": 7.3851076768108055e-06, + "loss": 4.9432, + "step": 67250 + }, + { + "epoch": 1.3683064778645833, + "grad_norm": 16.56012535095215, + "learning_rate": 7.384756396035315e-06, + "loss": 4.6417, + "step": 67255 + }, + { + "epoch": 1.368408203125, + "grad_norm": 24.106876373291016, + "learning_rate": 7.384405100021989e-06, + "loss": 4.8187, + "step": 67260 + }, + { + "epoch": 1.3685099283854167, + "grad_norm": 16.094207763671875, + "learning_rate": 7.384053788773072e-06, + "loss": 4.9208, + "step": 67265 + }, + { + "epoch": 1.3686116536458333, + "grad_norm": 20.29306411743164, + "learning_rate": 7.383702462290807e-06, + "loss": 4.974, + "step": 67270 + }, + { + "epoch": 1.36871337890625, + "grad_norm": 27.61029052734375, + "learning_rate": 7.38335112057744e-06, + "loss": 4.7839, + "step": 67275 + }, + { + "epoch": 1.3688151041666667, + "grad_norm": 17.35218048095703, + "learning_rate": 7.382999763635217e-06, + "loss": 4.8608, + "step": 67280 + }, + { + "epoch": 1.3689168294270833, + "grad_norm": 16.03403091430664, + "learning_rate": 7.382648391466381e-06, + "loss": 5.0335, + "step": 67285 + }, + { + "epoch": 1.3690185546875, + "grad_norm": 17.014841079711914, + "learning_rate": 7.382297004073178e-06, + "loss": 4.7321, + "step": 67290 + }, + { + "epoch": 1.3691202799479167, + "grad_norm": 22.07147216796875, + "learning_rate": 7.381945601457854e-06, + "loss": 4.8805, + "step": 67295 + }, + { + "epoch": 1.3692220052083333, + "grad_norm": 20.924026489257812, + "learning_rate": 7.3815941836226534e-06, + "loss": 4.9362, + "step": 67300 + }, + { + "epoch": 1.36932373046875, + "grad_norm": 18.88220977783203, + "learning_rate": 7.381242750569821e-06, + "loss": 4.7728, + "step": 67305 + }, + { + "epoch": 1.3694254557291667, + "grad_norm": 17.61874008178711, + "learning_rate": 7.380891302301605e-06, + "loss": 5.001, + "step": 67310 + }, + { + "epoch": 1.3695271809895833, + "grad_norm": 15.380510330200195, + "learning_rate": 7.380539838820249e-06, + "loss": 5.175, + "step": 67315 + }, + { + "epoch": 1.36962890625, + "grad_norm": 16.127347946166992, + "learning_rate": 7.380188360127998e-06, + "loss": 4.912, + "step": 67320 + }, + { + "epoch": 1.3697306315104167, + "grad_norm": 17.15536880493164, + "learning_rate": 7.3798368662271e-06, + "loss": 5.2801, + "step": 67325 + }, + { + "epoch": 1.3698323567708333, + "grad_norm": 16.958494186401367, + "learning_rate": 7.3794853571198e-06, + "loss": 4.9472, + "step": 67330 + }, + { + "epoch": 1.36993408203125, + "grad_norm": 20.487642288208008, + "learning_rate": 7.379133832808343e-06, + "loss": 4.8299, + "step": 67335 + }, + { + "epoch": 1.3700358072916667, + "grad_norm": 18.564481735229492, + "learning_rate": 7.378782293294977e-06, + "loss": 4.8843, + "step": 67340 + }, + { + "epoch": 1.3701375325520833, + "grad_norm": 18.661487579345703, + "learning_rate": 7.378430738581946e-06, + "loss": 4.7592, + "step": 67345 + }, + { + "epoch": 1.3702392578125, + "grad_norm": 18.350881576538086, + "learning_rate": 7.378079168671499e-06, + "loss": 5.0298, + "step": 67350 + }, + { + "epoch": 1.3703409830729167, + "grad_norm": 17.385164260864258, + "learning_rate": 7.377727583565881e-06, + "loss": 4.9619, + "step": 67355 + }, + { + "epoch": 1.3704427083333333, + "grad_norm": 15.987424850463867, + "learning_rate": 7.3773759832673385e-06, + "loss": 4.9499, + "step": 67360 + }, + { + "epoch": 1.37054443359375, + "grad_norm": 16.838022232055664, + "learning_rate": 7.377024367778118e-06, + "loss": 4.8468, + "step": 67365 + }, + { + "epoch": 1.3706461588541667, + "grad_norm": 17.116140365600586, + "learning_rate": 7.376672737100466e-06, + "loss": 4.9234, + "step": 67370 + }, + { + "epoch": 1.3707478841145833, + "grad_norm": 23.44621467590332, + "learning_rate": 7.376321091236629e-06, + "loss": 4.73, + "step": 67375 + }, + { + "epoch": 1.370849609375, + "grad_norm": 15.686812400817871, + "learning_rate": 7.375969430188857e-06, + "loss": 5.0703, + "step": 67380 + }, + { + "epoch": 1.3709513346354167, + "grad_norm": 26.632211685180664, + "learning_rate": 7.375617753959392e-06, + "loss": 4.8431, + "step": 67385 + }, + { + "epoch": 1.3710530598958333, + "grad_norm": 17.27474594116211, + "learning_rate": 7.3752660625504844e-06, + "loss": 4.8629, + "step": 67390 + }, + { + "epoch": 1.37115478515625, + "grad_norm": 14.84122085571289, + "learning_rate": 7.374914355964381e-06, + "loss": 4.8353, + "step": 67395 + }, + { + "epoch": 1.3712565104166667, + "grad_norm": 16.042909622192383, + "learning_rate": 7.374562634203328e-06, + "loss": 5.052, + "step": 67400 + }, + { + "epoch": 1.3713582356770833, + "grad_norm": 18.979276657104492, + "learning_rate": 7.374210897269575e-06, + "loss": 4.7112, + "step": 67405 + }, + { + "epoch": 1.3714599609375, + "grad_norm": 22.5435733795166, + "learning_rate": 7.373859145165366e-06, + "loss": 4.6408, + "step": 67410 + }, + { + "epoch": 1.3715616861979167, + "grad_norm": 18.41382598876953, + "learning_rate": 7.373507377892952e-06, + "loss": 5.04, + "step": 67415 + }, + { + "epoch": 1.3716634114583333, + "grad_norm": 18.225061416625977, + "learning_rate": 7.3731555954545805e-06, + "loss": 4.7789, + "step": 67420 + }, + { + "epoch": 1.37176513671875, + "grad_norm": 13.26913833618164, + "learning_rate": 7.3728037978524955e-06, + "loss": 4.7045, + "step": 67425 + }, + { + "epoch": 1.3718668619791667, + "grad_norm": 19.870615005493164, + "learning_rate": 7.372451985088949e-06, + "loss": 4.9193, + "step": 67430 + }, + { + "epoch": 1.3719685872395833, + "grad_norm": 15.704529762268066, + "learning_rate": 7.372100157166188e-06, + "loss": 4.7827, + "step": 67435 + }, + { + "epoch": 1.3720703125, + "grad_norm": 19.815900802612305, + "learning_rate": 7.371748314086459e-06, + "loss": 5.2114, + "step": 67440 + }, + { + "epoch": 1.3721720377604167, + "grad_norm": 19.665260314941406, + "learning_rate": 7.371396455852012e-06, + "loss": 4.745, + "step": 67445 + }, + { + "epoch": 1.3722737630208333, + "grad_norm": 17.438913345336914, + "learning_rate": 7.371044582465093e-06, + "loss": 4.9029, + "step": 67450 + }, + { + "epoch": 1.37237548828125, + "grad_norm": 16.498910903930664, + "learning_rate": 7.370692693927952e-06, + "loss": 4.7598, + "step": 67455 + }, + { + "epoch": 1.3724772135416667, + "grad_norm": 14.766287803649902, + "learning_rate": 7.370340790242838e-06, + "loss": 4.8247, + "step": 67460 + }, + { + "epoch": 1.3725789388020833, + "grad_norm": 20.574115753173828, + "learning_rate": 7.369988871411997e-06, + "loss": 4.7629, + "step": 67465 + }, + { + "epoch": 1.3726806640625, + "grad_norm": 17.798315048217773, + "learning_rate": 7.369636937437682e-06, + "loss": 4.7101, + "step": 67470 + }, + { + "epoch": 1.3727823893229167, + "grad_norm": 18.273866653442383, + "learning_rate": 7.3692849883221386e-06, + "loss": 4.8689, + "step": 67475 + }, + { + "epoch": 1.3728841145833333, + "grad_norm": 22.257530212402344, + "learning_rate": 7.368933024067615e-06, + "loss": 5.0761, + "step": 67480 + }, + { + "epoch": 1.37298583984375, + "grad_norm": 27.97083854675293, + "learning_rate": 7.368581044676363e-06, + "loss": 5.1757, + "step": 67485 + }, + { + "epoch": 1.3730875651041667, + "grad_norm": 15.78947639465332, + "learning_rate": 7.368229050150628e-06, + "loss": 4.9633, + "step": 67490 + }, + { + "epoch": 1.3731892903645833, + "grad_norm": 16.307605743408203, + "learning_rate": 7.367877040492664e-06, + "loss": 5.061, + "step": 67495 + }, + { + "epoch": 1.373291015625, + "grad_norm": 22.760595321655273, + "learning_rate": 7.367525015704715e-06, + "loss": 4.9559, + "step": 67500 + }, + { + "epoch": 1.3733927408854167, + "grad_norm": 18.928380966186523, + "learning_rate": 7.367172975789034e-06, + "loss": 4.9929, + "step": 67505 + }, + { + "epoch": 1.3734944661458333, + "grad_norm": 18.07764434814453, + "learning_rate": 7.366820920747868e-06, + "loss": 4.8448, + "step": 67510 + }, + { + "epoch": 1.37359619140625, + "grad_norm": 19.405046463012695, + "learning_rate": 7.3664688505834685e-06, + "loss": 4.7588, + "step": 67515 + }, + { + "epoch": 1.3736979166666667, + "grad_norm": 21.98055648803711, + "learning_rate": 7.366116765298084e-06, + "loss": 4.9437, + "step": 67520 + }, + { + "epoch": 1.3737996419270833, + "grad_norm": 17.59876823425293, + "learning_rate": 7.365764664893965e-06, + "loss": 4.627, + "step": 67525 + }, + { + "epoch": 1.3739013671875, + "grad_norm": 19.030033111572266, + "learning_rate": 7.36541254937336e-06, + "loss": 4.8415, + "step": 67530 + }, + { + "epoch": 1.3740030924479167, + "grad_norm": 12.95854377746582, + "learning_rate": 7.36506041873852e-06, + "loss": 4.7095, + "step": 67535 + }, + { + "epoch": 1.3741048177083333, + "grad_norm": 19.492977142333984, + "learning_rate": 7.364708272991694e-06, + "loss": 5.0852, + "step": 67540 + }, + { + "epoch": 1.37420654296875, + "grad_norm": 17.662500381469727, + "learning_rate": 7.364356112135133e-06, + "loss": 5.0029, + "step": 67545 + }, + { + "epoch": 1.3743082682291667, + "grad_norm": 18.832077026367188, + "learning_rate": 7.364003936171089e-06, + "loss": 5.0492, + "step": 67550 + }, + { + "epoch": 1.3744099934895833, + "grad_norm": 16.518665313720703, + "learning_rate": 7.3636517451018075e-06, + "loss": 4.8335, + "step": 67555 + }, + { + "epoch": 1.37451171875, + "grad_norm": 17.674800872802734, + "learning_rate": 7.363299538929543e-06, + "loss": 4.8468, + "step": 67560 + }, + { + "epoch": 1.3746134440104167, + "grad_norm": 16.35729217529297, + "learning_rate": 7.362947317656544e-06, + "loss": 4.9091, + "step": 67565 + }, + { + "epoch": 1.3747151692708333, + "grad_norm": 18.098247528076172, + "learning_rate": 7.362595081285063e-06, + "loss": 4.821, + "step": 67570 + }, + { + "epoch": 1.37481689453125, + "grad_norm": 25.020727157592773, + "learning_rate": 7.3622428298173485e-06, + "loss": 4.9246, + "step": 67575 + }, + { + "epoch": 1.3749186197916667, + "grad_norm": 25.56439781188965, + "learning_rate": 7.361890563255653e-06, + "loss": 5.1614, + "step": 67580 + }, + { + "epoch": 1.3750203450520833, + "grad_norm": 21.320236206054688, + "learning_rate": 7.361538281602225e-06, + "loss": 4.9076, + "step": 67585 + }, + { + "epoch": 1.3751220703125, + "grad_norm": 20.433067321777344, + "learning_rate": 7.361185984859318e-06, + "loss": 5.0713, + "step": 67590 + }, + { + "epoch": 1.3752237955729167, + "grad_norm": 17.4444580078125, + "learning_rate": 7.3608336730291815e-06, + "loss": 4.8141, + "step": 67595 + }, + { + "epoch": 1.3753255208333333, + "grad_norm": 19.12575912475586, + "learning_rate": 7.360481346114069e-06, + "loss": 4.9782, + "step": 67600 + }, + { + "epoch": 1.37542724609375, + "grad_norm": 28.72795295715332, + "learning_rate": 7.360129004116228e-06, + "loss": 4.9763, + "step": 67605 + }, + { + "epoch": 1.3755289713541667, + "grad_norm": 19.129459381103516, + "learning_rate": 7.359776647037913e-06, + "loss": 5.0055, + "step": 67610 + }, + { + "epoch": 1.3756306966145833, + "grad_norm": 16.885761260986328, + "learning_rate": 7.359424274881375e-06, + "loss": 4.944, + "step": 67615 + }, + { + "epoch": 1.375732421875, + "grad_norm": 21.17777442932129, + "learning_rate": 7.359071887648863e-06, + "loss": 5.0991, + "step": 67620 + }, + { + "epoch": 1.3758341471354167, + "grad_norm": 16.65648078918457, + "learning_rate": 7.358719485342631e-06, + "loss": 4.9758, + "step": 67625 + }, + { + "epoch": 1.3759358723958333, + "grad_norm": 18.038278579711914, + "learning_rate": 7.358367067964931e-06, + "loss": 5.0646, + "step": 67630 + }, + { + "epoch": 1.37603759765625, + "grad_norm": 13.883397102355957, + "learning_rate": 7.3580146355180135e-06, + "loss": 4.6329, + "step": 67635 + }, + { + "epoch": 1.3761393229166667, + "grad_norm": 16.615863800048828, + "learning_rate": 7.357662188004132e-06, + "loss": 4.8695, + "step": 67640 + }, + { + "epoch": 1.3762410481770833, + "grad_norm": 20.01296043395996, + "learning_rate": 7.357309725425537e-06, + "loss": 4.8668, + "step": 67645 + }, + { + "epoch": 1.3763427734375, + "grad_norm": 19.375959396362305, + "learning_rate": 7.356957247784481e-06, + "loss": 4.936, + "step": 67650 + }, + { + "epoch": 1.3764444986979167, + "grad_norm": 24.619848251342773, + "learning_rate": 7.356604755083216e-06, + "loss": 4.8444, + "step": 67655 + }, + { + "epoch": 1.3765462239583333, + "grad_norm": 19.684770584106445, + "learning_rate": 7.356252247323995e-06, + "loss": 4.9668, + "step": 67660 + }, + { + "epoch": 1.37664794921875, + "grad_norm": 21.892953872680664, + "learning_rate": 7.35589972450907e-06, + "loss": 4.9637, + "step": 67665 + }, + { + "epoch": 1.3767496744791667, + "grad_norm": 16.419971466064453, + "learning_rate": 7.355547186640695e-06, + "loss": 4.8296, + "step": 67670 + }, + { + "epoch": 1.3768513997395833, + "grad_norm": 15.625718116760254, + "learning_rate": 7.355194633721119e-06, + "loss": 4.917, + "step": 67675 + }, + { + "epoch": 1.376953125, + "grad_norm": 17.05826759338379, + "learning_rate": 7.354842065752599e-06, + "loss": 5.0269, + "step": 67680 + }, + { + "epoch": 1.3770548502604167, + "grad_norm": 22.23468780517578, + "learning_rate": 7.354489482737384e-06, + "loss": 4.8212, + "step": 67685 + }, + { + "epoch": 1.3771565755208333, + "grad_norm": 14.098881721496582, + "learning_rate": 7.35413688467773e-06, + "loss": 4.9396, + "step": 67690 + }, + { + "epoch": 1.37725830078125, + "grad_norm": 13.53941535949707, + "learning_rate": 7.353784271575887e-06, + "loss": 4.9434, + "step": 67695 + }, + { + "epoch": 1.3773600260416667, + "grad_norm": 17.99506187438965, + "learning_rate": 7.3534316434341115e-06, + "loss": 5.0995, + "step": 67700 + }, + { + "epoch": 1.3774617513020833, + "grad_norm": 15.741464614868164, + "learning_rate": 7.353079000254653e-06, + "loss": 4.802, + "step": 67705 + }, + { + "epoch": 1.3775634765625, + "grad_norm": 19.115461349487305, + "learning_rate": 7.352726342039767e-06, + "loss": 4.5223, + "step": 67710 + }, + { + "epoch": 1.3776652018229167, + "grad_norm": 18.582483291625977, + "learning_rate": 7.352373668791708e-06, + "loss": 5.2433, + "step": 67715 + }, + { + "epoch": 1.3777669270833333, + "grad_norm": 14.23809814453125, + "learning_rate": 7.352020980512727e-06, + "loss": 4.8177, + "step": 67720 + }, + { + "epoch": 1.37786865234375, + "grad_norm": 16.259876251220703, + "learning_rate": 7.351668277205079e-06, + "loss": 4.9043, + "step": 67725 + }, + { + "epoch": 1.3779703776041667, + "grad_norm": 14.730375289916992, + "learning_rate": 7.351315558871017e-06, + "loss": 4.751, + "step": 67730 + }, + { + "epoch": 1.3780721028645833, + "grad_norm": 27.84194564819336, + "learning_rate": 7.350962825512794e-06, + "loss": 4.8725, + "step": 67735 + }, + { + "epoch": 1.378173828125, + "grad_norm": 17.60769271850586, + "learning_rate": 7.350610077132666e-06, + "loss": 4.8222, + "step": 67740 + }, + { + "epoch": 1.3782755533854167, + "grad_norm": 19.157533645629883, + "learning_rate": 7.350257313732887e-06, + "loss": 5.1273, + "step": 67745 + }, + { + "epoch": 1.3783772786458333, + "grad_norm": 21.536117553710938, + "learning_rate": 7.3499045353157076e-06, + "loss": 4.9437, + "step": 67750 + }, + { + "epoch": 1.37847900390625, + "grad_norm": 15.146317481994629, + "learning_rate": 7.349551741883385e-06, + "loss": 5.0527, + "step": 67755 + }, + { + "epoch": 1.3785807291666667, + "grad_norm": 15.490507125854492, + "learning_rate": 7.349198933438172e-06, + "loss": 4.6739, + "step": 67760 + }, + { + "epoch": 1.3786824544270833, + "grad_norm": 15.254097938537598, + "learning_rate": 7.3488461099823246e-06, + "loss": 5.2133, + "step": 67765 + }, + { + "epoch": 1.3787841796875, + "grad_norm": 22.364307403564453, + "learning_rate": 7.348493271518096e-06, + "loss": 5.0197, + "step": 67770 + }, + { + "epoch": 1.3788859049479167, + "grad_norm": 12.998689651489258, + "learning_rate": 7.34814041804774e-06, + "loss": 4.8132, + "step": 67775 + }, + { + "epoch": 1.3789876302083333, + "grad_norm": 17.479793548583984, + "learning_rate": 7.347787549573513e-06, + "loss": 4.9166, + "step": 67780 + }, + { + "epoch": 1.37908935546875, + "grad_norm": 16.55936622619629, + "learning_rate": 7.347434666097668e-06, + "loss": 5.1514, + "step": 67785 + }, + { + "epoch": 1.3791910807291667, + "grad_norm": 19.337360382080078, + "learning_rate": 7.3470817676224625e-06, + "loss": 5.0199, + "step": 67790 + }, + { + "epoch": 1.3792928059895833, + "grad_norm": 14.865171432495117, + "learning_rate": 7.3467288541501495e-06, + "loss": 5.1241, + "step": 67795 + }, + { + "epoch": 1.37939453125, + "grad_norm": 18.35223960876465, + "learning_rate": 7.346375925682982e-06, + "loss": 5.0893, + "step": 67800 + }, + { + "epoch": 1.3794962565104167, + "grad_norm": 18.845266342163086, + "learning_rate": 7.346022982223218e-06, + "loss": 4.8787, + "step": 67805 + }, + { + "epoch": 1.3795979817708333, + "grad_norm": 18.667724609375, + "learning_rate": 7.345670023773112e-06, + "loss": 4.9239, + "step": 67810 + }, + { + "epoch": 1.37969970703125, + "grad_norm": 22.237049102783203, + "learning_rate": 7.345317050334919e-06, + "loss": 4.833, + "step": 67815 + }, + { + "epoch": 1.3798014322916667, + "grad_norm": 19.978500366210938, + "learning_rate": 7.344964061910895e-06, + "loss": 5.4059, + "step": 67820 + }, + { + "epoch": 1.3799031575520833, + "grad_norm": 17.87544059753418, + "learning_rate": 7.344611058503295e-06, + "loss": 4.9918, + "step": 67825 + }, + { + "epoch": 1.3800048828125, + "grad_norm": 17.684925079345703, + "learning_rate": 7.3442580401143736e-06, + "loss": 5.3713, + "step": 67830 + }, + { + "epoch": 1.3801066080729167, + "grad_norm": 16.600929260253906, + "learning_rate": 7.343905006746388e-06, + "loss": 4.8702, + "step": 67835 + }, + { + "epoch": 1.3802083333333333, + "grad_norm": 17.045883178710938, + "learning_rate": 7.3435519584015934e-06, + "loss": 4.9774, + "step": 67840 + }, + { + "epoch": 1.38031005859375, + "grad_norm": 20.899980545043945, + "learning_rate": 7.343198895082246e-06, + "loss": 4.9167, + "step": 67845 + }, + { + "epoch": 1.3804117838541667, + "grad_norm": 17.005958557128906, + "learning_rate": 7.342845816790602e-06, + "loss": 4.6795, + "step": 67850 + }, + { + "epoch": 1.3805135091145833, + "grad_norm": 18.452205657958984, + "learning_rate": 7.342492723528916e-06, + "loss": 4.7401, + "step": 67855 + }, + { + "epoch": 1.380615234375, + "grad_norm": 20.7545108795166, + "learning_rate": 7.342139615299446e-06, + "loss": 4.8392, + "step": 67860 + }, + { + "epoch": 1.3807169596354167, + "grad_norm": 15.838035583496094, + "learning_rate": 7.341786492104447e-06, + "loss": 4.9163, + "step": 67865 + }, + { + "epoch": 1.3808186848958333, + "grad_norm": 38.111358642578125, + "learning_rate": 7.3414333539461755e-06, + "loss": 5.3123, + "step": 67870 + }, + { + "epoch": 1.38092041015625, + "grad_norm": 20.723350524902344, + "learning_rate": 7.341080200826889e-06, + "loss": 4.8778, + "step": 67875 + }, + { + "epoch": 1.3810221354166667, + "grad_norm": 18.532684326171875, + "learning_rate": 7.340727032748841e-06, + "loss": 5.0551, + "step": 67880 + }, + { + "epoch": 1.3811238606770833, + "grad_norm": 18.048906326293945, + "learning_rate": 7.340373849714293e-06, + "loss": 5.1987, + "step": 67885 + }, + { + "epoch": 1.3812255859375, + "grad_norm": 16.486143112182617, + "learning_rate": 7.340020651725497e-06, + "loss": 5.1901, + "step": 67890 + }, + { + "epoch": 1.3813273111979167, + "grad_norm": 17.600542068481445, + "learning_rate": 7.339667438784713e-06, + "loss": 5.0495, + "step": 67895 + }, + { + "epoch": 1.3814290364583333, + "grad_norm": 18.527530670166016, + "learning_rate": 7.339314210894197e-06, + "loss": 5.0087, + "step": 67900 + }, + { + "epoch": 1.38153076171875, + "grad_norm": 18.09511375427246, + "learning_rate": 7.338960968056204e-06, + "loss": 4.9709, + "step": 67905 + }, + { + "epoch": 1.3816324869791667, + "grad_norm": 34.822853088378906, + "learning_rate": 7.338607710272995e-06, + "loss": 4.7649, + "step": 67910 + }, + { + "epoch": 1.3817342122395833, + "grad_norm": 22.133636474609375, + "learning_rate": 7.338254437546824e-06, + "loss": 4.9151, + "step": 67915 + }, + { + "epoch": 1.3818359375, + "grad_norm": 19.996828079223633, + "learning_rate": 7.337901149879949e-06, + "loss": 4.6483, + "step": 67920 + }, + { + "epoch": 1.3819376627604167, + "grad_norm": 23.222209930419922, + "learning_rate": 7.33754784727463e-06, + "loss": 4.9421, + "step": 67925 + }, + { + "epoch": 1.3820393880208333, + "grad_norm": 19.760793685913086, + "learning_rate": 7.3371945297331195e-06, + "loss": 4.8234, + "step": 67930 + }, + { + "epoch": 1.38214111328125, + "grad_norm": 19.62024688720703, + "learning_rate": 7.33684119725768e-06, + "loss": 5.038, + "step": 67935 + }, + { + "epoch": 1.3822428385416667, + "grad_norm": 15.664224624633789, + "learning_rate": 7.336487849850566e-06, + "loss": 4.9992, + "step": 67940 + }, + { + "epoch": 1.3823445638020833, + "grad_norm": 14.498945236206055, + "learning_rate": 7.336134487514036e-06, + "loss": 4.6244, + "step": 67945 + }, + { + "epoch": 1.3824462890625, + "grad_norm": 20.198871612548828, + "learning_rate": 7.3357811102503495e-06, + "loss": 4.7665, + "step": 67950 + }, + { + "epoch": 1.3825480143229167, + "grad_norm": 19.41120147705078, + "learning_rate": 7.3354277180617615e-06, + "loss": 4.9105, + "step": 67955 + }, + { + "epoch": 1.3826497395833333, + "grad_norm": 14.701106071472168, + "learning_rate": 7.335074310950533e-06, + "loss": 4.9597, + "step": 67960 + }, + { + "epoch": 1.38275146484375, + "grad_norm": 15.359785079956055, + "learning_rate": 7.3347208889189204e-06, + "loss": 4.9529, + "step": 67965 + }, + { + "epoch": 1.3828531901041667, + "grad_norm": 16.44038963317871, + "learning_rate": 7.334367451969182e-06, + "loss": 5.0407, + "step": 67970 + }, + { + "epoch": 1.3829549153645833, + "grad_norm": 18.62665557861328, + "learning_rate": 7.334014000103577e-06, + "loss": 4.8356, + "step": 67975 + }, + { + "epoch": 1.383056640625, + "grad_norm": 15.366589546203613, + "learning_rate": 7.333660533324365e-06, + "loss": 5.3416, + "step": 67980 + }, + { + "epoch": 1.3831583658854167, + "grad_norm": 14.723963737487793, + "learning_rate": 7.3333070516338e-06, + "loss": 4.8732, + "step": 67985 + }, + { + "epoch": 1.3832600911458333, + "grad_norm": 22.376256942749023, + "learning_rate": 7.332953555034146e-06, + "loss": 4.8467, + "step": 67990 + }, + { + "epoch": 1.38336181640625, + "grad_norm": 22.27486801147461, + "learning_rate": 7.332600043527658e-06, + "loss": 4.9616, + "step": 67995 + }, + { + "epoch": 1.3834635416666667, + "grad_norm": 18.00202178955078, + "learning_rate": 7.332246517116598e-06, + "loss": 4.8679, + "step": 68000 + }, + { + "epoch": 1.3835652669270833, + "grad_norm": 15.707840919494629, + "learning_rate": 7.331892975803221e-06, + "loss": 5.0913, + "step": 68005 + }, + { + "epoch": 1.3836669921875, + "grad_norm": 18.298171997070312, + "learning_rate": 7.331539419589789e-06, + "loss": 4.7802, + "step": 68010 + }, + { + "epoch": 1.3837687174479167, + "grad_norm": 18.009695053100586, + "learning_rate": 7.3311858484785615e-06, + "loss": 4.881, + "step": 68015 + }, + { + "epoch": 1.3838704427083333, + "grad_norm": 21.54038429260254, + "learning_rate": 7.3308322624717945e-06, + "loss": 4.9653, + "step": 68020 + }, + { + "epoch": 1.38397216796875, + "grad_norm": 13.145561218261719, + "learning_rate": 7.3304786615717506e-06, + "loss": 4.926, + "step": 68025 + }, + { + "epoch": 1.3840738932291667, + "grad_norm": 19.0977783203125, + "learning_rate": 7.330125045780687e-06, + "loss": 4.9646, + "step": 68030 + }, + { + "epoch": 1.3841756184895833, + "grad_norm": 15.025129318237305, + "learning_rate": 7.329771415100864e-06, + "loss": 5.1182, + "step": 68035 + }, + { + "epoch": 1.38427734375, + "grad_norm": 15.334320068359375, + "learning_rate": 7.329417769534542e-06, + "loss": 4.9584, + "step": 68040 + }, + { + "epoch": 1.3843790690104167, + "grad_norm": 16.733531951904297, + "learning_rate": 7.329064109083979e-06, + "loss": 4.9996, + "step": 68045 + }, + { + "epoch": 1.3844807942708333, + "grad_norm": 14.970011711120605, + "learning_rate": 7.328710433751436e-06, + "loss": 5.0075, + "step": 68050 + }, + { + "epoch": 1.38458251953125, + "grad_norm": 12.104331016540527, + "learning_rate": 7.328356743539173e-06, + "loss": 4.8217, + "step": 68055 + }, + { + "epoch": 1.3846842447916667, + "grad_norm": 16.49132537841797, + "learning_rate": 7.3280030384494494e-06, + "loss": 4.9398, + "step": 68060 + }, + { + "epoch": 1.3847859700520833, + "grad_norm": 17.228654861450195, + "learning_rate": 7.327649318484525e-06, + "loss": 5.0584, + "step": 68065 + }, + { + "epoch": 1.3848876953125, + "grad_norm": 16.316240310668945, + "learning_rate": 7.32729558364666e-06, + "loss": 4.9986, + "step": 68070 + }, + { + "epoch": 1.3849894205729167, + "grad_norm": 17.34557342529297, + "learning_rate": 7.3269418339381175e-06, + "loss": 4.8385, + "step": 68075 + }, + { + "epoch": 1.3850911458333333, + "grad_norm": 16.348018646240234, + "learning_rate": 7.326588069361152e-06, + "loss": 5.0643, + "step": 68080 + }, + { + "epoch": 1.38519287109375, + "grad_norm": 18.73133087158203, + "learning_rate": 7.326234289918028e-06, + "loss": 4.7455, + "step": 68085 + }, + { + "epoch": 1.3852945963541667, + "grad_norm": 18.972997665405273, + "learning_rate": 7.325880495611008e-06, + "loss": 4.8393, + "step": 68090 + }, + { + "epoch": 1.3853963216145833, + "grad_norm": 21.60887908935547, + "learning_rate": 7.325526686442348e-06, + "loss": 4.9401, + "step": 68095 + }, + { + "epoch": 1.385498046875, + "grad_norm": 13.431794166564941, + "learning_rate": 7.325172862414311e-06, + "loss": 5.1184, + "step": 68100 + }, + { + "epoch": 1.3855997721354167, + "grad_norm": 16.614797592163086, + "learning_rate": 7.324819023529155e-06, + "loss": 4.7105, + "step": 68105 + }, + { + "epoch": 1.3857014973958333, + "grad_norm": 23.368412017822266, + "learning_rate": 7.324465169789147e-06, + "loss": 4.7575, + "step": 68110 + }, + { + "epoch": 1.38580322265625, + "grad_norm": 14.094095230102539, + "learning_rate": 7.324111301196542e-06, + "loss": 4.8665, + "step": 68115 + }, + { + "epoch": 1.3859049479166667, + "grad_norm": 14.4821138381958, + "learning_rate": 7.323757417753605e-06, + "loss": 4.7984, + "step": 68120 + }, + { + "epoch": 1.3860066731770833, + "grad_norm": 19.22614097595215, + "learning_rate": 7.323403519462595e-06, + "loss": 4.8674, + "step": 68125 + }, + { + "epoch": 1.3861083984375, + "grad_norm": 19.65081214904785, + "learning_rate": 7.323049606325775e-06, + "loss": 4.7061, + "step": 68130 + }, + { + "epoch": 1.3862101236979167, + "grad_norm": 16.42061996459961, + "learning_rate": 7.322695678345404e-06, + "loss": 4.7909, + "step": 68135 + }, + { + "epoch": 1.3863118489583333, + "grad_norm": 18.320606231689453, + "learning_rate": 7.322341735523747e-06, + "loss": 5.0225, + "step": 68140 + }, + { + "epoch": 1.38641357421875, + "grad_norm": 20.66094970703125, + "learning_rate": 7.321987777863062e-06, + "loss": 4.9792, + "step": 68145 + }, + { + "epoch": 1.3865152994791667, + "grad_norm": 15.928886413574219, + "learning_rate": 7.3216338053656125e-06, + "loss": 5.0852, + "step": 68150 + }, + { + "epoch": 1.3866170247395833, + "grad_norm": 16.335229873657227, + "learning_rate": 7.321279818033661e-06, + "loss": 4.8523, + "step": 68155 + }, + { + "epoch": 1.38671875, + "grad_norm": 20.315256118774414, + "learning_rate": 7.320925815869467e-06, + "loss": 4.9917, + "step": 68160 + }, + { + "epoch": 1.3868204752604167, + "grad_norm": 19.442934036254883, + "learning_rate": 7.320571798875294e-06, + "loss": 4.9046, + "step": 68165 + }, + { + "epoch": 1.3869222005208333, + "grad_norm": 17.2354793548584, + "learning_rate": 7.320217767053405e-06, + "loss": 5.1419, + "step": 68170 + }, + { + "epoch": 1.38702392578125, + "grad_norm": 22.09217643737793, + "learning_rate": 7.319863720406059e-06, + "loss": 5.0057, + "step": 68175 + }, + { + "epoch": 1.3871256510416667, + "grad_norm": 18.255708694458008, + "learning_rate": 7.319509658935522e-06, + "loss": 4.9486, + "step": 68180 + }, + { + "epoch": 1.3872273763020833, + "grad_norm": 19.90134620666504, + "learning_rate": 7.319155582644054e-06, + "loss": 4.9827, + "step": 68185 + }, + { + "epoch": 1.3873291015625, + "grad_norm": 17.46381378173828, + "learning_rate": 7.318801491533918e-06, + "loss": 5.0532, + "step": 68190 + }, + { + "epoch": 1.3874308268229167, + "grad_norm": 18.75919532775879, + "learning_rate": 7.318447385607377e-06, + "loss": 4.9568, + "step": 68195 + }, + { + "epoch": 1.3875325520833333, + "grad_norm": 15.89238166809082, + "learning_rate": 7.318093264866691e-06, + "loss": 4.836, + "step": 68200 + }, + { + "epoch": 1.38763427734375, + "grad_norm": 20.556896209716797, + "learning_rate": 7.3177391293141275e-06, + "loss": 4.9786, + "step": 68205 + }, + { + "epoch": 1.3877360026041667, + "grad_norm": 23.54249382019043, + "learning_rate": 7.3173849789519456e-06, + "loss": 4.9014, + "step": 68210 + }, + { + "epoch": 1.3878377278645833, + "grad_norm": 22.818180084228516, + "learning_rate": 7.317030813782409e-06, + "loss": 4.6078, + "step": 68215 + }, + { + "epoch": 1.387939453125, + "grad_norm": 16.553133010864258, + "learning_rate": 7.316676633807782e-06, + "loss": 4.9667, + "step": 68220 + }, + { + "epoch": 1.3880411783854167, + "grad_norm": 21.265588760375977, + "learning_rate": 7.316322439030326e-06, + "loss": 4.8211, + "step": 68225 + }, + { + "epoch": 1.3881429036458333, + "grad_norm": 17.845428466796875, + "learning_rate": 7.315968229452305e-06, + "loss": 4.7781, + "step": 68230 + }, + { + "epoch": 1.38824462890625, + "grad_norm": 22.784440994262695, + "learning_rate": 7.315614005075982e-06, + "loss": 4.8024, + "step": 68235 + }, + { + "epoch": 1.3883463541666667, + "grad_norm": 13.472956657409668, + "learning_rate": 7.315259765903622e-06, + "loss": 4.8785, + "step": 68240 + }, + { + "epoch": 1.3884480794270833, + "grad_norm": 13.609721183776855, + "learning_rate": 7.3149055119374844e-06, + "loss": 4.8193, + "step": 68245 + }, + { + "epoch": 1.3885498046875, + "grad_norm": 14.829943656921387, + "learning_rate": 7.314551243179838e-06, + "loss": 4.7484, + "step": 68250 + }, + { + "epoch": 1.3886515299479167, + "grad_norm": 14.320131301879883, + "learning_rate": 7.314196959632942e-06, + "loss": 4.9069, + "step": 68255 + }, + { + "epoch": 1.3887532552083333, + "grad_norm": 17.974306106567383, + "learning_rate": 7.3138426612990634e-06, + "loss": 5.094, + "step": 68260 + }, + { + "epoch": 1.38885498046875, + "grad_norm": 19.5279541015625, + "learning_rate": 7.313488348180464e-06, + "loss": 5.0804, + "step": 68265 + }, + { + "epoch": 1.3889567057291667, + "grad_norm": 17.397062301635742, + "learning_rate": 7.313134020279409e-06, + "loss": 4.8091, + "step": 68270 + }, + { + "epoch": 1.3890584309895833, + "grad_norm": 19.49732208251953, + "learning_rate": 7.3127796775981615e-06, + "loss": 4.7913, + "step": 68275 + }, + { + "epoch": 1.38916015625, + "grad_norm": 19.4796199798584, + "learning_rate": 7.312425320138986e-06, + "loss": 4.9288, + "step": 68280 + }, + { + "epoch": 1.3892618815104167, + "grad_norm": 21.603517532348633, + "learning_rate": 7.312070947904147e-06, + "loss": 4.6838, + "step": 68285 + }, + { + "epoch": 1.3893636067708333, + "grad_norm": 18.22673225402832, + "learning_rate": 7.311716560895908e-06, + "loss": 4.9555, + "step": 68290 + }, + { + "epoch": 1.38946533203125, + "grad_norm": 29.78773307800293, + "learning_rate": 7.311362159116535e-06, + "loss": 5.1584, + "step": 68295 + }, + { + "epoch": 1.3895670572916667, + "grad_norm": 14.46401596069336, + "learning_rate": 7.3110077425682925e-06, + "loss": 4.894, + "step": 68300 + }, + { + "epoch": 1.3896687825520833, + "grad_norm": 18.80799102783203, + "learning_rate": 7.3106533112534415e-06, + "loss": 4.7514, + "step": 68305 + }, + { + "epoch": 1.3897705078125, + "grad_norm": 26.229965209960938, + "learning_rate": 7.310298865174251e-06, + "loss": 5.2047, + "step": 68310 + }, + { + "epoch": 1.3898722330729167, + "grad_norm": 26.280414581298828, + "learning_rate": 7.309944404332984e-06, + "loss": 4.9425, + "step": 68315 + }, + { + "epoch": 1.3899739583333333, + "grad_norm": 17.826812744140625, + "learning_rate": 7.309589928731905e-06, + "loss": 4.8601, + "step": 68320 + }, + { + "epoch": 1.39007568359375, + "grad_norm": 17.490928649902344, + "learning_rate": 7.30923543837328e-06, + "loss": 4.8582, + "step": 68325 + }, + { + "epoch": 1.3901774088541667, + "grad_norm": 11.873260498046875, + "learning_rate": 7.308880933259372e-06, + "loss": 4.9946, + "step": 68330 + }, + { + "epoch": 1.3902791341145833, + "grad_norm": 16.135540008544922, + "learning_rate": 7.308526413392449e-06, + "loss": 4.747, + "step": 68335 + }, + { + "epoch": 1.390380859375, + "grad_norm": 22.361494064331055, + "learning_rate": 7.308171878774777e-06, + "loss": 4.8992, + "step": 68340 + }, + { + "epoch": 1.3904825846354167, + "grad_norm": 17.787973403930664, + "learning_rate": 7.3078173294086165e-06, + "loss": 4.9431, + "step": 68345 + }, + { + "epoch": 1.3905843098958333, + "grad_norm": 13.625933647155762, + "learning_rate": 7.307462765296237e-06, + "loss": 4.9972, + "step": 68350 + }, + { + "epoch": 1.39068603515625, + "grad_norm": 22.743309020996094, + "learning_rate": 7.307108186439903e-06, + "loss": 5.1841, + "step": 68355 + }, + { + "epoch": 1.3907877604166667, + "grad_norm": 19.60562515258789, + "learning_rate": 7.30675359284188e-06, + "loss": 4.7707, + "step": 68360 + }, + { + "epoch": 1.3908894856770833, + "grad_norm": 20.81018829345703, + "learning_rate": 7.306398984504433e-06, + "loss": 4.7334, + "step": 68365 + }, + { + "epoch": 1.3909912109375, + "grad_norm": 22.173810958862305, + "learning_rate": 7.30604436142983e-06, + "loss": 5.0526, + "step": 68370 + }, + { + "epoch": 1.3910929361979167, + "grad_norm": 18.11367416381836, + "learning_rate": 7.305689723620334e-06, + "loss": 4.787, + "step": 68375 + }, + { + "epoch": 1.3911946614583333, + "grad_norm": 22.45439910888672, + "learning_rate": 7.305335071078214e-06, + "loss": 4.9946, + "step": 68380 + }, + { + "epoch": 1.39129638671875, + "grad_norm": 20.033496856689453, + "learning_rate": 7.304980403805734e-06, + "loss": 5.2751, + "step": 68385 + }, + { + "epoch": 1.3913981119791667, + "grad_norm": 19.773971557617188, + "learning_rate": 7.304625721805161e-06, + "loss": 4.9874, + "step": 68390 + }, + { + "epoch": 1.3914998372395833, + "grad_norm": 16.41878318786621, + "learning_rate": 7.3042710250787604e-06, + "loss": 5.0383, + "step": 68395 + }, + { + "epoch": 1.3916015625, + "grad_norm": 15.754941940307617, + "learning_rate": 7.3039163136288015e-06, + "loss": 5.2165, + "step": 68400 + }, + { + "epoch": 1.3917032877604167, + "grad_norm": 15.643532752990723, + "learning_rate": 7.303561587457548e-06, + "loss": 4.9672, + "step": 68405 + }, + { + "epoch": 1.3918050130208333, + "grad_norm": 19.25537872314453, + "learning_rate": 7.303206846567266e-06, + "loss": 4.7919, + "step": 68410 + }, + { + "epoch": 1.39190673828125, + "grad_norm": 17.123563766479492, + "learning_rate": 7.302852090960224e-06, + "loss": 4.7431, + "step": 68415 + }, + { + "epoch": 1.3920084635416667, + "grad_norm": 17.164215087890625, + "learning_rate": 7.302497320638689e-06, + "loss": 5.0501, + "step": 68420 + }, + { + "epoch": 1.3921101888020833, + "grad_norm": 17.769067764282227, + "learning_rate": 7.302142535604926e-06, + "loss": 4.7654, + "step": 68425 + }, + { + "epoch": 1.3922119140625, + "grad_norm": 18.685279846191406, + "learning_rate": 7.301787735861203e-06, + "loss": 4.9774, + "step": 68430 + }, + { + "epoch": 1.3923136393229167, + "grad_norm": 22.11290168762207, + "learning_rate": 7.301432921409788e-06, + "loss": 5.2631, + "step": 68435 + }, + { + "epoch": 1.3924153645833333, + "grad_norm": 20.722095489501953, + "learning_rate": 7.301078092252947e-06, + "loss": 4.8126, + "step": 68440 + }, + { + "epoch": 1.39251708984375, + "grad_norm": 13.116620063781738, + "learning_rate": 7.300723248392947e-06, + "loss": 4.9374, + "step": 68445 + }, + { + "epoch": 1.3926188151041667, + "grad_norm": 24.527523040771484, + "learning_rate": 7.300368389832056e-06, + "loss": 4.8413, + "step": 68450 + }, + { + "epoch": 1.3927205403645833, + "grad_norm": 19.733537673950195, + "learning_rate": 7.300013516572542e-06, + "loss": 4.9809, + "step": 68455 + }, + { + "epoch": 1.392822265625, + "grad_norm": 15.24419116973877, + "learning_rate": 7.29965862861667e-06, + "loss": 4.6764, + "step": 68460 + }, + { + "epoch": 1.3929239908854167, + "grad_norm": 17.9500732421875, + "learning_rate": 7.29930372596671e-06, + "loss": 5.0822, + "step": 68465 + }, + { + "epoch": 1.3930257161458333, + "grad_norm": 23.373498916625977, + "learning_rate": 7.29894880862493e-06, + "loss": 4.8526, + "step": 68470 + }, + { + "epoch": 1.39312744140625, + "grad_norm": 25.66497802734375, + "learning_rate": 7.298593876593596e-06, + "loss": 5.0887, + "step": 68475 + }, + { + "epoch": 1.3932291666666667, + "grad_norm": 15.835247993469238, + "learning_rate": 7.2982389298749765e-06, + "loss": 5.0412, + "step": 68480 + }, + { + "epoch": 1.3933308919270833, + "grad_norm": 21.680932998657227, + "learning_rate": 7.2978839684713394e-06, + "loss": 4.936, + "step": 68485 + }, + { + "epoch": 1.3934326171875, + "grad_norm": 17.752887725830078, + "learning_rate": 7.297528992384955e-06, + "loss": 5.0938, + "step": 68490 + }, + { + "epoch": 1.3935343424479167, + "grad_norm": 16.031780242919922, + "learning_rate": 7.2971740016180895e-06, + "loss": 4.6595, + "step": 68495 + }, + { + "epoch": 1.3936360677083333, + "grad_norm": 17.942462921142578, + "learning_rate": 7.29681899617301e-06, + "loss": 4.9718, + "step": 68500 + }, + { + "epoch": 1.39373779296875, + "grad_norm": 14.574947357177734, + "learning_rate": 7.296463976051988e-06, + "loss": 5.0231, + "step": 68505 + }, + { + "epoch": 1.3938395182291667, + "grad_norm": 18.95125389099121, + "learning_rate": 7.2961089412572876e-06, + "loss": 4.9355, + "step": 68510 + }, + { + "epoch": 1.3939412434895833, + "grad_norm": 15.185986518859863, + "learning_rate": 7.295753891791181e-06, + "loss": 4.9343, + "step": 68515 + }, + { + "epoch": 1.39404296875, + "grad_norm": 20.596954345703125, + "learning_rate": 7.295398827655936e-06, + "loss": 5.0643, + "step": 68520 + }, + { + "epoch": 1.3941446940104167, + "grad_norm": 12.807368278503418, + "learning_rate": 7.2950437488538206e-06, + "loss": 5.0205, + "step": 68525 + }, + { + "epoch": 1.3942464192708333, + "grad_norm": 24.36256980895996, + "learning_rate": 7.294688655387106e-06, + "loss": 4.973, + "step": 68530 + }, + { + "epoch": 1.39434814453125, + "grad_norm": 29.18276023864746, + "learning_rate": 7.294333547258059e-06, + "loss": 4.7741, + "step": 68535 + }, + { + "epoch": 1.3944498697916667, + "grad_norm": 15.55588150024414, + "learning_rate": 7.293978424468947e-06, + "loss": 5.0295, + "step": 68540 + }, + { + "epoch": 1.3945515950520833, + "grad_norm": 28.087724685668945, + "learning_rate": 7.2936232870220426e-06, + "loss": 4.8737, + "step": 68545 + }, + { + "epoch": 1.3946533203125, + "grad_norm": 19.21293830871582, + "learning_rate": 7.2932681349196134e-06, + "loss": 5.0675, + "step": 68550 + }, + { + "epoch": 1.3947550455729167, + "grad_norm": 20.30093765258789, + "learning_rate": 7.292912968163928e-06, + "loss": 4.8597, + "step": 68555 + }, + { + "epoch": 1.3948567708333333, + "grad_norm": 14.622529983520508, + "learning_rate": 7.292557786757257e-06, + "loss": 4.9854, + "step": 68560 + }, + { + "epoch": 1.39495849609375, + "grad_norm": 18.32581329345703, + "learning_rate": 7.29220259070187e-06, + "loss": 4.98, + "step": 68565 + }, + { + "epoch": 1.3950602213541667, + "grad_norm": 16.998687744140625, + "learning_rate": 7.291847380000035e-06, + "loss": 4.7639, + "step": 68570 + }, + { + "epoch": 1.3951619466145833, + "grad_norm": 19.619630813598633, + "learning_rate": 7.291492154654024e-06, + "loss": 5.2399, + "step": 68575 + }, + { + "epoch": 1.395263671875, + "grad_norm": 13.414730072021484, + "learning_rate": 7.291136914666106e-06, + "loss": 5.0131, + "step": 68580 + }, + { + "epoch": 1.3953653971354167, + "grad_norm": 27.770286560058594, + "learning_rate": 7.29078166003855e-06, + "loss": 5.164, + "step": 68585 + }, + { + "epoch": 1.3954671223958333, + "grad_norm": 17.95295524597168, + "learning_rate": 7.290426390773624e-06, + "loss": 5.112, + "step": 68590 + }, + { + "epoch": 1.39556884765625, + "grad_norm": 17.09737777709961, + "learning_rate": 7.2900711068736026e-06, + "loss": 4.7458, + "step": 68595 + }, + { + "epoch": 1.3956705729166667, + "grad_norm": 17.196422576904297, + "learning_rate": 7.289715808340755e-06, + "loss": 5.0304, + "step": 68600 + }, + { + "epoch": 1.3957722981770833, + "grad_norm": 19.342527389526367, + "learning_rate": 7.289360495177348e-06, + "loss": 4.4818, + "step": 68605 + }, + { + "epoch": 1.3958740234375, + "grad_norm": 15.227543830871582, + "learning_rate": 7.289005167385655e-06, + "loss": 4.9946, + "step": 68610 + }, + { + "epoch": 1.3959757486979167, + "grad_norm": 16.763381958007812, + "learning_rate": 7.288649824967945e-06, + "loss": 4.9021, + "step": 68615 + }, + { + "epoch": 1.3960774739583333, + "grad_norm": 16.124313354492188, + "learning_rate": 7.28829446792649e-06, + "loss": 4.9127, + "step": 68620 + }, + { + "epoch": 1.39617919921875, + "grad_norm": 18.817485809326172, + "learning_rate": 7.28793909626356e-06, + "loss": 4.9614, + "step": 68625 + }, + { + "epoch": 1.3962809244791667, + "grad_norm": 15.241779327392578, + "learning_rate": 7.287583709981424e-06, + "loss": 4.7588, + "step": 68630 + }, + { + "epoch": 1.3963826497395833, + "grad_norm": 16.97316551208496, + "learning_rate": 7.287228309082356e-06, + "loss": 4.9834, + "step": 68635 + }, + { + "epoch": 1.396484375, + "grad_norm": 15.581225395202637, + "learning_rate": 7.286872893568624e-06, + "loss": 4.9411, + "step": 68640 + }, + { + "epoch": 1.3965861002604167, + "grad_norm": 15.348530769348145, + "learning_rate": 7.286517463442501e-06, + "loss": 4.7883, + "step": 68645 + }, + { + "epoch": 1.3966878255208333, + "grad_norm": 19.17560386657715, + "learning_rate": 7.286162018706255e-06, + "loss": 4.9082, + "step": 68650 + }, + { + "epoch": 1.39678955078125, + "grad_norm": 16.323484420776367, + "learning_rate": 7.285806559362161e-06, + "loss": 5.01, + "step": 68655 + }, + { + "epoch": 1.3968912760416667, + "grad_norm": 25.61153221130371, + "learning_rate": 7.285451085412489e-06, + "loss": 5.1481, + "step": 68660 + }, + { + "epoch": 1.3969930013020833, + "grad_norm": 16.22964859008789, + "learning_rate": 7.285095596859509e-06, + "loss": 5.056, + "step": 68665 + }, + { + "epoch": 1.3970947265625, + "grad_norm": 15.10871410369873, + "learning_rate": 7.284740093705496e-06, + "loss": 5.0153, + "step": 68670 + }, + { + "epoch": 1.3971964518229167, + "grad_norm": 21.44094467163086, + "learning_rate": 7.284384575952718e-06, + "loss": 5.0025, + "step": 68675 + }, + { + "epoch": 1.3972981770833333, + "grad_norm": 15.785333633422852, + "learning_rate": 7.284029043603446e-06, + "loss": 4.9994, + "step": 68680 + }, + { + "epoch": 1.39739990234375, + "grad_norm": 16.030466079711914, + "learning_rate": 7.283673496659955e-06, + "loss": 4.6715, + "step": 68685 + }, + { + "epoch": 1.3975016276041667, + "grad_norm": 24.84304428100586, + "learning_rate": 7.283317935124515e-06, + "loss": 4.9181, + "step": 68690 + }, + { + "epoch": 1.3976033528645833, + "grad_norm": 16.640775680541992, + "learning_rate": 7.282962358999398e-06, + "loss": 4.6545, + "step": 68695 + }, + { + "epoch": 1.397705078125, + "grad_norm": 18.926002502441406, + "learning_rate": 7.282606768286878e-06, + "loss": 5.1053, + "step": 68700 + }, + { + "epoch": 1.3978068033854167, + "grad_norm": 20.651838302612305, + "learning_rate": 7.282251162989222e-06, + "loss": 5.1178, + "step": 68705 + }, + { + "epoch": 1.3979085286458333, + "grad_norm": 19.42313003540039, + "learning_rate": 7.281895543108708e-06, + "loss": 4.9911, + "step": 68710 + }, + { + "epoch": 1.39801025390625, + "grad_norm": 19.192039489746094, + "learning_rate": 7.281539908647606e-06, + "loss": 5.3603, + "step": 68715 + }, + { + "epoch": 1.3981119791666667, + "grad_norm": 22.115154266357422, + "learning_rate": 7.281184259608186e-06, + "loss": 5.2496, + "step": 68720 + }, + { + "epoch": 1.3982137044270833, + "grad_norm": 16.717185974121094, + "learning_rate": 7.280828595992725e-06, + "loss": 4.9346, + "step": 68725 + }, + { + "epoch": 1.3983154296875, + "grad_norm": 17.195465087890625, + "learning_rate": 7.280472917803492e-06, + "loss": 4.9415, + "step": 68730 + }, + { + "epoch": 1.3984171549479167, + "grad_norm": 16.64679718017578, + "learning_rate": 7.280117225042762e-06, + "loss": 5.0437, + "step": 68735 + }, + { + "epoch": 1.3985188802083333, + "grad_norm": 19.337175369262695, + "learning_rate": 7.279761517712806e-06, + "loss": 5.0021, + "step": 68740 + }, + { + "epoch": 1.39862060546875, + "grad_norm": 18.949848175048828, + "learning_rate": 7.279405795815897e-06, + "loss": 4.9472, + "step": 68745 + }, + { + "epoch": 1.3987223307291667, + "grad_norm": 23.623329162597656, + "learning_rate": 7.27905005935431e-06, + "loss": 4.9173, + "step": 68750 + }, + { + "epoch": 1.3988240559895833, + "grad_norm": 15.417791366577148, + "learning_rate": 7.278694308330316e-06, + "loss": 5.0083, + "step": 68755 + }, + { + "epoch": 1.39892578125, + "grad_norm": 15.083940505981445, + "learning_rate": 7.278338542746188e-06, + "loss": 4.8636, + "step": 68760 + }, + { + "epoch": 1.3990275065104167, + "grad_norm": 19.36907386779785, + "learning_rate": 7.277982762604201e-06, + "loss": 4.957, + "step": 68765 + }, + { + "epoch": 1.3991292317708333, + "grad_norm": 16.708574295043945, + "learning_rate": 7.277626967906627e-06, + "loss": 4.956, + "step": 68770 + }, + { + "epoch": 1.39923095703125, + "grad_norm": 18.485118865966797, + "learning_rate": 7.27727115865574e-06, + "loss": 4.7365, + "step": 68775 + }, + { + "epoch": 1.3993326822916667, + "grad_norm": 16.147132873535156, + "learning_rate": 7.276915334853811e-06, + "loss": 4.8465, + "step": 68780 + }, + { + "epoch": 1.3994344075520833, + "grad_norm": 16.97020149230957, + "learning_rate": 7.276559496503118e-06, + "loss": 4.6428, + "step": 68785 + }, + { + "epoch": 1.3995361328125, + "grad_norm": 18.56277084350586, + "learning_rate": 7.276203643605932e-06, + "loss": 4.6997, + "step": 68790 + }, + { + "epoch": 1.3996378580729167, + "grad_norm": 15.402055740356445, + "learning_rate": 7.275847776164526e-06, + "loss": 4.9723, + "step": 68795 + }, + { + "epoch": 1.3997395833333333, + "grad_norm": 20.965599060058594, + "learning_rate": 7.275491894181178e-06, + "loss": 4.8058, + "step": 68800 + }, + { + "epoch": 1.39984130859375, + "grad_norm": 16.09123992919922, + "learning_rate": 7.275135997658158e-06, + "loss": 4.888, + "step": 68805 + }, + { + "epoch": 1.3999430338541667, + "grad_norm": 18.86332893371582, + "learning_rate": 7.27478008659774e-06, + "loss": 4.9466, + "step": 68810 + }, + { + "epoch": 1.4000447591145833, + "grad_norm": 15.842578887939453, + "learning_rate": 7.274424161002201e-06, + "loss": 5.079, + "step": 68815 + }, + { + "epoch": 1.400146484375, + "grad_norm": 16.07919692993164, + "learning_rate": 7.2740682208738134e-06, + "loss": 4.9193, + "step": 68820 + }, + { + "epoch": 1.4002482096354167, + "grad_norm": 17.264724731445312, + "learning_rate": 7.27371226621485e-06, + "loss": 4.7709, + "step": 68825 + }, + { + "epoch": 1.4003499348958333, + "grad_norm": 17.259220123291016, + "learning_rate": 7.2733562970275886e-06, + "loss": 4.9602, + "step": 68830 + }, + { + "epoch": 1.40045166015625, + "grad_norm": 18.775497436523438, + "learning_rate": 7.273000313314301e-06, + "loss": 4.9939, + "step": 68835 + }, + { + "epoch": 1.4005533854166667, + "grad_norm": 19.959373474121094, + "learning_rate": 7.272644315077264e-06, + "loss": 4.7928, + "step": 68840 + }, + { + "epoch": 1.4006551106770833, + "grad_norm": 20.088607788085938, + "learning_rate": 7.27228830231875e-06, + "loss": 4.9753, + "step": 68845 + }, + { + "epoch": 1.4007568359375, + "grad_norm": 17.93311309814453, + "learning_rate": 7.271932275041036e-06, + "loss": 5.056, + "step": 68850 + }, + { + "epoch": 1.4008585611979167, + "grad_norm": 18.751506805419922, + "learning_rate": 7.2715762332463964e-06, + "loss": 4.7235, + "step": 68855 + }, + { + "epoch": 1.4009602864583333, + "grad_norm": 17.239049911499023, + "learning_rate": 7.271220176937105e-06, + "loss": 4.8876, + "step": 68860 + }, + { + "epoch": 1.40106201171875, + "grad_norm": 19.092466354370117, + "learning_rate": 7.270864106115438e-06, + "loss": 4.799, + "step": 68865 + }, + { + "epoch": 1.4011637369791667, + "grad_norm": 22.37923240661621, + "learning_rate": 7.270508020783671e-06, + "loss": 4.9564, + "step": 68870 + }, + { + "epoch": 1.4012654622395833, + "grad_norm": 15.220450401306152, + "learning_rate": 7.270151920944077e-06, + "loss": 5.0467, + "step": 68875 + }, + { + "epoch": 1.4013671875, + "grad_norm": 14.390111923217773, + "learning_rate": 7.269795806598934e-06, + "loss": 4.9391, + "step": 68880 + }, + { + "epoch": 1.4014689127604167, + "grad_norm": 24.20328712463379, + "learning_rate": 7.269439677750515e-06, + "loss": 4.7983, + "step": 68885 + }, + { + "epoch": 1.4015706380208333, + "grad_norm": 16.25421905517578, + "learning_rate": 7.269083534401098e-06, + "loss": 4.9856, + "step": 68890 + }, + { + "epoch": 1.40167236328125, + "grad_norm": 15.50675106048584, + "learning_rate": 7.268727376552957e-06, + "loss": 5.0076, + "step": 68895 + }, + { + "epoch": 1.4017740885416667, + "grad_norm": 45.09336853027344, + "learning_rate": 7.268371204208369e-06, + "loss": 4.9983, + "step": 68900 + }, + { + "epoch": 1.4018758138020833, + "grad_norm": 20.78011703491211, + "learning_rate": 7.26801501736961e-06, + "loss": 4.8639, + "step": 68905 + }, + { + "epoch": 1.4019775390625, + "grad_norm": 18.23455238342285, + "learning_rate": 7.267658816038951e-06, + "loss": 5.1658, + "step": 68910 + }, + { + "epoch": 1.4020792643229167, + "grad_norm": 14.707791328430176, + "learning_rate": 7.267302600218675e-06, + "loss": 5.0488, + "step": 68915 + }, + { + "epoch": 1.4021809895833333, + "grad_norm": 23.74500274658203, + "learning_rate": 7.266946369911056e-06, + "loss": 4.9571, + "step": 68920 + }, + { + "epoch": 1.40228271484375, + "grad_norm": 18.35683250427246, + "learning_rate": 7.266590125118367e-06, + "loss": 5.2509, + "step": 68925 + }, + { + "epoch": 1.4023844401041667, + "grad_norm": 18.542083740234375, + "learning_rate": 7.266233865842888e-06, + "loss": 4.8831, + "step": 68930 + }, + { + "epoch": 1.4024861653645833, + "grad_norm": 17.79410743713379, + "learning_rate": 7.265877592086894e-06, + "loss": 4.813, + "step": 68935 + }, + { + "epoch": 1.402587890625, + "grad_norm": 14.632681846618652, + "learning_rate": 7.265521303852661e-06, + "loss": 5.2174, + "step": 68940 + }, + { + "epoch": 1.4026896158854167, + "grad_norm": 19.558326721191406, + "learning_rate": 7.265165001142466e-06, + "loss": 4.7931, + "step": 68945 + }, + { + "epoch": 1.4027913411458333, + "grad_norm": 22.65430450439453, + "learning_rate": 7.264808683958585e-06, + "loss": 5.0425, + "step": 68950 + }, + { + "epoch": 1.40289306640625, + "grad_norm": 16.559825897216797, + "learning_rate": 7.2644523523032975e-06, + "loss": 4.9972, + "step": 68955 + }, + { + "epoch": 1.4029947916666667, + "grad_norm": 16.852611541748047, + "learning_rate": 7.264096006178877e-06, + "loss": 5.0223, + "step": 68960 + }, + { + "epoch": 1.4030965169270833, + "grad_norm": 14.830911636352539, + "learning_rate": 7.2637396455876005e-06, + "loss": 4.7381, + "step": 68965 + }, + { + "epoch": 1.4031982421875, + "grad_norm": 22.89476776123047, + "learning_rate": 7.263383270531749e-06, + "loss": 5.0149, + "step": 68970 + }, + { + "epoch": 1.4032999674479167, + "grad_norm": 15.647021293640137, + "learning_rate": 7.263026881013595e-06, + "loss": 5.0007, + "step": 68975 + }, + { + "epoch": 1.4034016927083333, + "grad_norm": 14.855598449707031, + "learning_rate": 7.262670477035418e-06, + "loss": 4.8927, + "step": 68980 + }, + { + "epoch": 1.40350341796875, + "grad_norm": 19.163578033447266, + "learning_rate": 7.262314058599496e-06, + "loss": 4.6762, + "step": 68985 + }, + { + "epoch": 1.4036051432291667, + "grad_norm": 16.52606201171875, + "learning_rate": 7.2619576257081035e-06, + "loss": 4.953, + "step": 68990 + }, + { + "epoch": 1.4037068684895833, + "grad_norm": 17.924407958984375, + "learning_rate": 7.261601178363521e-06, + "loss": 4.7671, + "step": 68995 + }, + { + "epoch": 1.40380859375, + "grad_norm": 18.131044387817383, + "learning_rate": 7.261244716568025e-06, + "loss": 4.9283, + "step": 69000 + }, + { + "epoch": 1.4039103190104167, + "grad_norm": 18.370981216430664, + "learning_rate": 7.260888240323893e-06, + "loss": 5.1354, + "step": 69005 + }, + { + "epoch": 1.4040120442708333, + "grad_norm": 23.906522750854492, + "learning_rate": 7.260531749633402e-06, + "loss": 4.9732, + "step": 69010 + }, + { + "epoch": 1.40411376953125, + "grad_norm": 16.648944854736328, + "learning_rate": 7.26017524449883e-06, + "loss": 5.0528, + "step": 69015 + }, + { + "epoch": 1.4042154947916667, + "grad_norm": 13.332876205444336, + "learning_rate": 7.259818724922458e-06, + "loss": 4.9613, + "step": 69020 + }, + { + "epoch": 1.4043172200520833, + "grad_norm": 14.678479194641113, + "learning_rate": 7.259462190906561e-06, + "loss": 4.806, + "step": 69025 + }, + { + "epoch": 1.4044189453125, + "grad_norm": 19.882108688354492, + "learning_rate": 7.259105642453417e-06, + "loss": 4.9735, + "step": 69030 + }, + { + "epoch": 1.4045206705729167, + "grad_norm": 26.073070526123047, + "learning_rate": 7.258749079565306e-06, + "loss": 4.9311, + "step": 69035 + }, + { + "epoch": 1.4046223958333333, + "grad_norm": 19.182340621948242, + "learning_rate": 7.2583925022445045e-06, + "loss": 5.0404, + "step": 69040 + }, + { + "epoch": 1.40472412109375, + "grad_norm": 19.21050262451172, + "learning_rate": 7.258035910493293e-06, + "loss": 4.7691, + "step": 69045 + }, + { + "epoch": 1.4048258463541667, + "grad_norm": 19.75845718383789, + "learning_rate": 7.257679304313947e-06, + "loss": 4.7985, + "step": 69050 + }, + { + "epoch": 1.4049275716145833, + "grad_norm": 16.39754867553711, + "learning_rate": 7.257322683708748e-06, + "loss": 5.1601, + "step": 69055 + }, + { + "epoch": 1.405029296875, + "grad_norm": 15.511577606201172, + "learning_rate": 7.256966048679973e-06, + "loss": 5.2813, + "step": 69060 + }, + { + "epoch": 1.4051310221354167, + "grad_norm": 24.666479110717773, + "learning_rate": 7.2566093992299025e-06, + "loss": 4.9531, + "step": 69065 + }, + { + "epoch": 1.4052327473958333, + "grad_norm": 17.231355667114258, + "learning_rate": 7.256252735360814e-06, + "loss": 4.9452, + "step": 69070 + }, + { + "epoch": 1.40533447265625, + "grad_norm": 25.007173538208008, + "learning_rate": 7.255896057074987e-06, + "loss": 4.767, + "step": 69075 + }, + { + "epoch": 1.4054361979166667, + "grad_norm": 18.048187255859375, + "learning_rate": 7.255539364374699e-06, + "loss": 5.0867, + "step": 69080 + }, + { + "epoch": 1.4055379231770833, + "grad_norm": 15.593840599060059, + "learning_rate": 7.255182657262232e-06, + "loss": 4.9427, + "step": 69085 + }, + { + "epoch": 1.4056396484375, + "grad_norm": 17.070337295532227, + "learning_rate": 7.254825935739863e-06, + "loss": 4.8748, + "step": 69090 + }, + { + "epoch": 1.4057413736979167, + "grad_norm": 18.7763614654541, + "learning_rate": 7.254469199809871e-06, + "loss": 4.8912, + "step": 69095 + }, + { + "epoch": 1.4058430989583333, + "grad_norm": 18.130807876586914, + "learning_rate": 7.254112449474539e-06, + "loss": 5.2274, + "step": 69100 + }, + { + "epoch": 1.40594482421875, + "grad_norm": 17.683927536010742, + "learning_rate": 7.253755684736142e-06, + "loss": 4.8746, + "step": 69105 + }, + { + "epoch": 1.4060465494791667, + "grad_norm": 17.526357650756836, + "learning_rate": 7.253398905596962e-06, + "loss": 4.8855, + "step": 69110 + }, + { + "epoch": 1.4061482747395833, + "grad_norm": 16.83051872253418, + "learning_rate": 7.253042112059279e-06, + "loss": 4.8298, + "step": 69115 + }, + { + "epoch": 1.40625, + "grad_norm": 19.21038246154785, + "learning_rate": 7.252685304125371e-06, + "loss": 5.0242, + "step": 69120 + }, + { + "epoch": 1.4063517252604167, + "grad_norm": 21.737089157104492, + "learning_rate": 7.252328481797521e-06, + "loss": 4.9475, + "step": 69125 + }, + { + "epoch": 1.4064534505208333, + "grad_norm": 19.67099380493164, + "learning_rate": 7.2519716450780055e-06, + "loss": 4.9432, + "step": 69130 + }, + { + "epoch": 1.40655517578125, + "grad_norm": 18.995615005493164, + "learning_rate": 7.251614793969106e-06, + "loss": 4.6601, + "step": 69135 + }, + { + "epoch": 1.4066569010416667, + "grad_norm": 17.171279907226562, + "learning_rate": 7.251257928473104e-06, + "loss": 5.0962, + "step": 69140 + }, + { + "epoch": 1.4067586263020833, + "grad_norm": 19.316783905029297, + "learning_rate": 7.250901048592276e-06, + "loss": 5.1993, + "step": 69145 + }, + { + "epoch": 1.4068603515625, + "grad_norm": 18.571866989135742, + "learning_rate": 7.2505441543289064e-06, + "loss": 5.1388, + "step": 69150 + }, + { + "epoch": 1.4069620768229167, + "grad_norm": 17.470447540283203, + "learning_rate": 7.250187245685274e-06, + "loss": 4.8998, + "step": 69155 + }, + { + "epoch": 1.4070638020833333, + "grad_norm": 19.094858169555664, + "learning_rate": 7.249830322663659e-06, + "loss": 5.3519, + "step": 69160 + }, + { + "epoch": 1.40716552734375, + "grad_norm": 15.676965713500977, + "learning_rate": 7.249473385266344e-06, + "loss": 5.1978, + "step": 69165 + }, + { + "epoch": 1.4072672526041667, + "grad_norm": 16.735754013061523, + "learning_rate": 7.249116433495605e-06, + "loss": 4.9822, + "step": 69170 + }, + { + "epoch": 1.4073689778645833, + "grad_norm": 28.41473388671875, + "learning_rate": 7.248759467353729e-06, + "loss": 4.9962, + "step": 69175 + }, + { + "epoch": 1.407470703125, + "grad_norm": 22.492238998413086, + "learning_rate": 7.248402486842992e-06, + "loss": 5.2886, + "step": 69180 + }, + { + "epoch": 1.4075724283854167, + "grad_norm": 19.72259521484375, + "learning_rate": 7.248045491965678e-06, + "loss": 4.8155, + "step": 69185 + }, + { + "epoch": 1.4076741536458333, + "grad_norm": 19.72377586364746, + "learning_rate": 7.247688482724065e-06, + "loss": 4.9755, + "step": 69190 + }, + { + "epoch": 1.40777587890625, + "grad_norm": 24.866527557373047, + "learning_rate": 7.2473314591204395e-06, + "loss": 5.3231, + "step": 69195 + }, + { + "epoch": 1.4078776041666667, + "grad_norm": 16.764850616455078, + "learning_rate": 7.246974421157076e-06, + "loss": 4.9025, + "step": 69200 + }, + { + "epoch": 1.4079793294270833, + "grad_norm": 20.816162109375, + "learning_rate": 7.246617368836262e-06, + "loss": 4.8889, + "step": 69205 + }, + { + "epoch": 1.4080810546875, + "grad_norm": 14.285648345947266, + "learning_rate": 7.246260302160273e-06, + "loss": 4.9678, + "step": 69210 + }, + { + "epoch": 1.4081827799479167, + "grad_norm": 20.316024780273438, + "learning_rate": 7.245903221131396e-06, + "loss": 4.895, + "step": 69215 + }, + { + "epoch": 1.4082845052083333, + "grad_norm": 14.89846134185791, + "learning_rate": 7.245546125751911e-06, + "loss": 4.814, + "step": 69220 + }, + { + "epoch": 1.40838623046875, + "grad_norm": 21.237211227416992, + "learning_rate": 7.245189016024097e-06, + "loss": 4.9827, + "step": 69225 + }, + { + "epoch": 1.4084879557291667, + "grad_norm": 21.224756240844727, + "learning_rate": 7.244831891950239e-06, + "loss": 4.9513, + "step": 69230 + }, + { + "epoch": 1.4085896809895833, + "grad_norm": 16.632448196411133, + "learning_rate": 7.244474753532618e-06, + "loss": 5.2205, + "step": 69235 + }, + { + "epoch": 1.40869140625, + "grad_norm": 17.201873779296875, + "learning_rate": 7.244117600773516e-06, + "loss": 4.7744, + "step": 69240 + }, + { + "epoch": 1.4087931315104167, + "grad_norm": 17.22106170654297, + "learning_rate": 7.243760433675216e-06, + "loss": 4.801, + "step": 69245 + }, + { + "epoch": 1.4088948567708333, + "grad_norm": 18.67239761352539, + "learning_rate": 7.243403252239995e-06, + "loss": 5.1177, + "step": 69250 + }, + { + "epoch": 1.40899658203125, + "grad_norm": 15.695194244384766, + "learning_rate": 7.243046056470143e-06, + "loss": 4.9372, + "step": 69255 + }, + { + "epoch": 1.4090983072916667, + "grad_norm": 16.204586029052734, + "learning_rate": 7.242688846367938e-06, + "loss": 4.7444, + "step": 69260 + }, + { + "epoch": 1.4092000325520833, + "grad_norm": 17.81111717224121, + "learning_rate": 7.2423316219356634e-06, + "loss": 4.6393, + "step": 69265 + }, + { + "epoch": 1.4093017578125, + "grad_norm": 16.53125, + "learning_rate": 7.2419743831756e-06, + "loss": 4.741, + "step": 69270 + }, + { + "epoch": 1.4094034830729167, + "grad_norm": 20.66327476501465, + "learning_rate": 7.241617130090033e-06, + "loss": 4.9286, + "step": 69275 + }, + { + "epoch": 1.4095052083333333, + "grad_norm": 18.498754501342773, + "learning_rate": 7.241259862681245e-06, + "loss": 4.831, + "step": 69280 + }, + { + "epoch": 1.40960693359375, + "grad_norm": 19.044946670532227, + "learning_rate": 7.240902580951518e-06, + "loss": 4.8165, + "step": 69285 + }, + { + "epoch": 1.4097086588541667, + "grad_norm": 17.331350326538086, + "learning_rate": 7.2405452849031335e-06, + "loss": 4.8073, + "step": 69290 + }, + { + "epoch": 1.4098103841145833, + "grad_norm": 25.8282413482666, + "learning_rate": 7.240187974538376e-06, + "loss": 5.1362, + "step": 69295 + }, + { + "epoch": 1.409912109375, + "grad_norm": 16.370464324951172, + "learning_rate": 7.239830649859529e-06, + "loss": 4.7902, + "step": 69300 + }, + { + "epoch": 1.4100138346354167, + "grad_norm": 18.647953033447266, + "learning_rate": 7.239473310868874e-06, + "loss": 5.3263, + "step": 69305 + }, + { + "epoch": 1.4101155598958333, + "grad_norm": 19.048751831054688, + "learning_rate": 7.239115957568696e-06, + "loss": 4.943, + "step": 69310 + }, + { + "epoch": 1.41021728515625, + "grad_norm": 17.19240379333496, + "learning_rate": 7.238758589961279e-06, + "loss": 5.2692, + "step": 69315 + }, + { + "epoch": 1.4103190104166667, + "grad_norm": 16.210498809814453, + "learning_rate": 7.2384012080489045e-06, + "loss": 5.0116, + "step": 69320 + }, + { + "epoch": 1.4104207356770833, + "grad_norm": 16.299951553344727, + "learning_rate": 7.238043811833857e-06, + "loss": 5.0511, + "step": 69325 + }, + { + "epoch": 1.4105224609375, + "grad_norm": 17.205411911010742, + "learning_rate": 7.237686401318419e-06, + "loss": 4.8846, + "step": 69330 + }, + { + "epoch": 1.4106241861979167, + "grad_norm": 19.702634811401367, + "learning_rate": 7.237328976504877e-06, + "loss": 4.8489, + "step": 69335 + }, + { + "epoch": 1.4107259114583333, + "grad_norm": 16.798595428466797, + "learning_rate": 7.236971537395512e-06, + "loss": 5.2199, + "step": 69340 + }, + { + "epoch": 1.41082763671875, + "grad_norm": 15.026069641113281, + "learning_rate": 7.236614083992609e-06, + "loss": 4.8007, + "step": 69345 + }, + { + "epoch": 1.4109293619791667, + "grad_norm": 19.205604553222656, + "learning_rate": 7.236256616298453e-06, + "loss": 4.9366, + "step": 69350 + }, + { + "epoch": 1.4110310872395833, + "grad_norm": 14.903715133666992, + "learning_rate": 7.235899134315325e-06, + "loss": 4.8876, + "step": 69355 + }, + { + "epoch": 1.4111328125, + "grad_norm": 18.166128158569336, + "learning_rate": 7.235541638045514e-06, + "loss": 4.9053, + "step": 69360 + }, + { + "epoch": 1.4112345377604167, + "grad_norm": 18.8953914642334, + "learning_rate": 7.235184127491299e-06, + "loss": 4.7194, + "step": 69365 + }, + { + "epoch": 1.4113362630208333, + "grad_norm": 17.70001792907715, + "learning_rate": 7.2348266026549685e-06, + "loss": 4.6234, + "step": 69370 + }, + { + "epoch": 1.41143798828125, + "grad_norm": 22.002485275268555, + "learning_rate": 7.2344690635388045e-06, + "loss": 5.0426, + "step": 69375 + }, + { + "epoch": 1.4115397135416667, + "grad_norm": 19.129945755004883, + "learning_rate": 7.234111510145093e-06, + "loss": 4.7996, + "step": 69380 + }, + { + "epoch": 1.4116414388020833, + "grad_norm": 29.96672821044922, + "learning_rate": 7.233753942476119e-06, + "loss": 5.1046, + "step": 69385 + }, + { + "epoch": 1.4117431640625, + "grad_norm": 18.011852264404297, + "learning_rate": 7.233396360534165e-06, + "loss": 5.2643, + "step": 69390 + }, + { + "epoch": 1.4118448893229167, + "grad_norm": 17.897422790527344, + "learning_rate": 7.233038764321518e-06, + "loss": 4.904, + "step": 69395 + }, + { + "epoch": 1.4119466145833333, + "grad_norm": 17.37318992614746, + "learning_rate": 7.232681153840463e-06, + "loss": 4.8718, + "step": 69400 + }, + { + "epoch": 1.41204833984375, + "grad_norm": 25.03412437438965, + "learning_rate": 7.232323529093282e-06, + "loss": 4.9164, + "step": 69405 + }, + { + "epoch": 1.4121500651041667, + "grad_norm": 21.750423431396484, + "learning_rate": 7.231965890082264e-06, + "loss": 5.0697, + "step": 69410 + }, + { + "epoch": 1.4122517903645833, + "grad_norm": 14.81002140045166, + "learning_rate": 7.2316082368096925e-06, + "loss": 4.7126, + "step": 69415 + }, + { + "epoch": 1.412353515625, + "grad_norm": 15.221976280212402, + "learning_rate": 7.2312505692778525e-06, + "loss": 4.944, + "step": 69420 + }, + { + "epoch": 1.4124552408854167, + "grad_norm": 14.785808563232422, + "learning_rate": 7.230892887489029e-06, + "loss": 4.9424, + "step": 69425 + }, + { + "epoch": 1.4125569661458333, + "grad_norm": 21.234081268310547, + "learning_rate": 7.230535191445508e-06, + "loss": 4.8541, + "step": 69430 + }, + { + "epoch": 1.41265869140625, + "grad_norm": 18.93303871154785, + "learning_rate": 7.2301774811495765e-06, + "loss": 5.2639, + "step": 69435 + }, + { + "epoch": 1.4127604166666667, + "grad_norm": 13.752723693847656, + "learning_rate": 7.229819756603518e-06, + "loss": 4.8936, + "step": 69440 + }, + { + "epoch": 1.4128621419270833, + "grad_norm": 21.25027847290039, + "learning_rate": 7.229462017809618e-06, + "loss": 4.9562, + "step": 69445 + }, + { + "epoch": 1.4129638671875, + "grad_norm": 16.08287239074707, + "learning_rate": 7.229104264770165e-06, + "loss": 5.1601, + "step": 69450 + }, + { + "epoch": 1.4130655924479167, + "grad_norm": 18.3336181640625, + "learning_rate": 7.228746497487443e-06, + "loss": 4.9264, + "step": 69455 + }, + { + "epoch": 1.4131673177083333, + "grad_norm": 21.58991813659668, + "learning_rate": 7.228388715963738e-06, + "loss": 4.853, + "step": 69460 + }, + { + "epoch": 1.41326904296875, + "grad_norm": 18.75337791442871, + "learning_rate": 7.228030920201337e-06, + "loss": 4.7263, + "step": 69465 + }, + { + "epoch": 1.4133707682291667, + "grad_norm": 23.447914123535156, + "learning_rate": 7.227673110202524e-06, + "loss": 4.605, + "step": 69470 + }, + { + "epoch": 1.4134724934895833, + "grad_norm": 21.213132858276367, + "learning_rate": 7.22731528596959e-06, + "loss": 4.9736, + "step": 69475 + }, + { + "epoch": 1.41357421875, + "grad_norm": 15.232315063476562, + "learning_rate": 7.226957447504816e-06, + "loss": 4.6827, + "step": 69480 + }, + { + "epoch": 1.4136759440104167, + "grad_norm": 16.804685592651367, + "learning_rate": 7.226599594810492e-06, + "loss": 4.9481, + "step": 69485 + }, + { + "epoch": 1.4137776692708333, + "grad_norm": 21.75676918029785, + "learning_rate": 7.226241727888904e-06, + "loss": 5.1223, + "step": 69490 + }, + { + "epoch": 1.41387939453125, + "grad_norm": 30.025266647338867, + "learning_rate": 7.225883846742336e-06, + "loss": 5.0074, + "step": 69495 + }, + { + "epoch": 1.4139811197916667, + "grad_norm": 16.664175033569336, + "learning_rate": 7.225525951373079e-06, + "loss": 4.8695, + "step": 69500 + }, + { + "epoch": 1.4140828450520833, + "grad_norm": 16.264312744140625, + "learning_rate": 7.225168041783417e-06, + "loss": 5.2332, + "step": 69505 + }, + { + "epoch": 1.4141845703125, + "grad_norm": 24.9761905670166, + "learning_rate": 7.2248101179756354e-06, + "loss": 5.0887, + "step": 69510 + }, + { + "epoch": 1.4142862955729167, + "grad_norm": 12.551529884338379, + "learning_rate": 7.224452179952026e-06, + "loss": 4.8517, + "step": 69515 + }, + { + "epoch": 1.4143880208333333, + "grad_norm": 16.9583797454834, + "learning_rate": 7.224094227714873e-06, + "loss": 5.0298, + "step": 69520 + }, + { + "epoch": 1.41448974609375, + "grad_norm": 22.239002227783203, + "learning_rate": 7.223736261266463e-06, + "loss": 4.7168, + "step": 69525 + }, + { + "epoch": 1.4145914713541667, + "grad_norm": 16.315311431884766, + "learning_rate": 7.2233782806090854e-06, + "loss": 5.0563, + "step": 69530 + }, + { + "epoch": 1.4146931966145833, + "grad_norm": 19.079444885253906, + "learning_rate": 7.223020285745025e-06, + "loss": 4.9926, + "step": 69535 + }, + { + "epoch": 1.414794921875, + "grad_norm": 26.762849807739258, + "learning_rate": 7.222662276676572e-06, + "loss": 4.7146, + "step": 69540 + }, + { + "epoch": 1.4148966471354167, + "grad_norm": 24.058818817138672, + "learning_rate": 7.222304253406012e-06, + "loss": 4.8239, + "step": 69545 + }, + { + "epoch": 1.4149983723958333, + "grad_norm": 16.918127059936523, + "learning_rate": 7.2219462159356325e-06, + "loss": 4.8698, + "step": 69550 + }, + { + "epoch": 1.41510009765625, + "grad_norm": 20.181509017944336, + "learning_rate": 7.221588164267723e-06, + "loss": 5.0064, + "step": 69555 + }, + { + "epoch": 1.4152018229166667, + "grad_norm": 13.83988094329834, + "learning_rate": 7.22123009840457e-06, + "loss": 4.9227, + "step": 69560 + }, + { + "epoch": 1.4153035481770833, + "grad_norm": 22.745624542236328, + "learning_rate": 7.220872018348461e-06, + "loss": 4.7266, + "step": 69565 + }, + { + "epoch": 1.4154052734375, + "grad_norm": 16.7616024017334, + "learning_rate": 7.220513924101686e-06, + "loss": 5.0558, + "step": 69570 + }, + { + "epoch": 1.4155069986979167, + "grad_norm": 17.734546661376953, + "learning_rate": 7.220155815666532e-06, + "loss": 5.0045, + "step": 69575 + }, + { + "epoch": 1.4156087239583333, + "grad_norm": 16.74460792541504, + "learning_rate": 7.2197976930452865e-06, + "loss": 5.0329, + "step": 69580 + }, + { + "epoch": 1.41571044921875, + "grad_norm": 20.19151496887207, + "learning_rate": 7.219439556240239e-06, + "loss": 4.9603, + "step": 69585 + }, + { + "epoch": 1.4158121744791667, + "grad_norm": 17.9749698638916, + "learning_rate": 7.2190814052536765e-06, + "loss": 4.6552, + "step": 69590 + }, + { + "epoch": 1.4159138997395833, + "grad_norm": 20.30743408203125, + "learning_rate": 7.218723240087888e-06, + "loss": 4.9941, + "step": 69595 + }, + { + "epoch": 1.416015625, + "grad_norm": 15.958647727966309, + "learning_rate": 7.218365060745163e-06, + "loss": 4.9116, + "step": 69600 + }, + { + "epoch": 1.4161173502604167, + "grad_norm": 14.457865715026855, + "learning_rate": 7.218006867227789e-06, + "loss": 4.9956, + "step": 69605 + }, + { + "epoch": 1.4162190755208333, + "grad_norm": 18.557769775390625, + "learning_rate": 7.2176486595380545e-06, + "loss": 4.8954, + "step": 69610 + }, + { + "epoch": 1.41632080078125, + "grad_norm": 16.561553955078125, + "learning_rate": 7.21729043767825e-06, + "loss": 4.6903, + "step": 69615 + }, + { + "epoch": 1.4164225260416667, + "grad_norm": 24.788362503051758, + "learning_rate": 7.216932201650665e-06, + "loss": 4.9882, + "step": 69620 + }, + { + "epoch": 1.4165242513020833, + "grad_norm": 22.92694664001465, + "learning_rate": 7.216573951457585e-06, + "loss": 5.0996, + "step": 69625 + }, + { + "epoch": 1.4166259765625, + "grad_norm": 18.24521827697754, + "learning_rate": 7.216215687101302e-06, + "loss": 4.9764, + "step": 69630 + }, + { + "epoch": 1.4167277018229167, + "grad_norm": 20.474241256713867, + "learning_rate": 7.215857408584104e-06, + "loss": 4.7398, + "step": 69635 + }, + { + "epoch": 1.4168294270833333, + "grad_norm": 26.065488815307617, + "learning_rate": 7.21549911590828e-06, + "loss": 4.8577, + "step": 69640 + }, + { + "epoch": 1.41693115234375, + "grad_norm": 17.81722068786621, + "learning_rate": 7.215140809076121e-06, + "loss": 5.2462, + "step": 69645 + }, + { + "epoch": 1.4170328776041667, + "grad_norm": 15.501335144042969, + "learning_rate": 7.214782488089914e-06, + "loss": 4.9111, + "step": 69650 + }, + { + "epoch": 1.4171346028645833, + "grad_norm": 20.951412200927734, + "learning_rate": 7.214424152951951e-06, + "loss": 4.9695, + "step": 69655 + }, + { + "epoch": 1.417236328125, + "grad_norm": 18.323938369750977, + "learning_rate": 7.214065803664522e-06, + "loss": 4.8656, + "step": 69660 + }, + { + "epoch": 1.4173380533854167, + "grad_norm": 22.230018615722656, + "learning_rate": 7.213707440229913e-06, + "loss": 5.1525, + "step": 69665 + }, + { + "epoch": 1.4174397786458333, + "grad_norm": 19.240224838256836, + "learning_rate": 7.213349062650417e-06, + "loss": 4.8318, + "step": 69670 + }, + { + "epoch": 1.41754150390625, + "grad_norm": 18.876750946044922, + "learning_rate": 7.212990670928322e-06, + "loss": 4.9956, + "step": 69675 + }, + { + "epoch": 1.4176432291666667, + "grad_norm": 16.776357650756836, + "learning_rate": 7.212632265065921e-06, + "loss": 4.756, + "step": 69680 + }, + { + "epoch": 1.4177449544270833, + "grad_norm": 15.633723258972168, + "learning_rate": 7.212273845065501e-06, + "loss": 5.1318, + "step": 69685 + }, + { + "epoch": 1.4178466796875, + "grad_norm": 26.099475860595703, + "learning_rate": 7.2119154109293545e-06, + "loss": 4.7952, + "step": 69690 + }, + { + "epoch": 1.4179484049479167, + "grad_norm": 19.17020606994629, + "learning_rate": 7.211556962659769e-06, + "loss": 5.0277, + "step": 69695 + }, + { + "epoch": 1.4180501302083333, + "grad_norm": 17.463354110717773, + "learning_rate": 7.211198500259039e-06, + "loss": 4.9206, + "step": 69700 + }, + { + "epoch": 1.41815185546875, + "grad_norm": 16.802974700927734, + "learning_rate": 7.210840023729451e-06, + "loss": 5.1143, + "step": 69705 + }, + { + "epoch": 1.4182535807291667, + "grad_norm": 20.428768157958984, + "learning_rate": 7.210481533073296e-06, + "loss": 4.9142, + "step": 69710 + }, + { + "epoch": 1.4183553059895833, + "grad_norm": 13.969022750854492, + "learning_rate": 7.210123028292867e-06, + "loss": 4.9807, + "step": 69715 + }, + { + "epoch": 1.41845703125, + "grad_norm": 21.034753799438477, + "learning_rate": 7.2097645093904524e-06, + "loss": 5.0366, + "step": 69720 + }, + { + "epoch": 1.4185587565104167, + "grad_norm": 20.019258499145508, + "learning_rate": 7.2094059763683454e-06, + "loss": 4.955, + "step": 69725 + }, + { + "epoch": 1.4186604817708333, + "grad_norm": 12.22304916381836, + "learning_rate": 7.209047429228835e-06, + "loss": 4.8411, + "step": 69730 + }, + { + "epoch": 1.41876220703125, + "grad_norm": 21.303497314453125, + "learning_rate": 7.208688867974213e-06, + "loss": 4.708, + "step": 69735 + }, + { + "epoch": 1.4188639322916667, + "grad_norm": 18.149314880371094, + "learning_rate": 7.208330292606769e-06, + "loss": 4.6114, + "step": 69740 + }, + { + "epoch": 1.4189656575520833, + "grad_norm": 17.72163963317871, + "learning_rate": 7.207971703128798e-06, + "loss": 4.8809, + "step": 69745 + }, + { + "epoch": 1.4190673828125, + "grad_norm": 18.18993377685547, + "learning_rate": 7.207613099542586e-06, + "loss": 5.3685, + "step": 69750 + }, + { + "epoch": 1.4191691080729167, + "grad_norm": 19.01230812072754, + "learning_rate": 7.207254481850429e-06, + "loss": 5.0474, + "step": 69755 + }, + { + "epoch": 1.4192708333333333, + "grad_norm": 15.884658813476562, + "learning_rate": 7.206895850054616e-06, + "loss": 4.8207, + "step": 69760 + }, + { + "epoch": 1.41937255859375, + "grad_norm": 20.913564682006836, + "learning_rate": 7.206537204157437e-06, + "loss": 4.8547, + "step": 69765 + }, + { + "epoch": 1.4194742838541667, + "grad_norm": 12.91111946105957, + "learning_rate": 7.2061785441611885e-06, + "loss": 5.0035, + "step": 69770 + }, + { + "epoch": 1.4195760091145833, + "grad_norm": 19.18583106994629, + "learning_rate": 7.205819870068161e-06, + "loss": 4.8988, + "step": 69775 + }, + { + "epoch": 1.419677734375, + "grad_norm": 19.068241119384766, + "learning_rate": 7.205461181880641e-06, + "loss": 4.9162, + "step": 69780 + }, + { + "epoch": 1.4197794596354167, + "grad_norm": 17.558609008789062, + "learning_rate": 7.205102479600925e-06, + "loss": 5.1943, + "step": 69785 + }, + { + "epoch": 1.4198811848958333, + "grad_norm": 13.927706718444824, + "learning_rate": 7.2047437632313055e-06, + "loss": 4.9164, + "step": 69790 + }, + { + "epoch": 1.41998291015625, + "grad_norm": 12.540220260620117, + "learning_rate": 7.204385032774072e-06, + "loss": 4.6257, + "step": 69795 + }, + { + "epoch": 1.4200846354166667, + "grad_norm": 20.688594818115234, + "learning_rate": 7.204026288231518e-06, + "loss": 4.8182, + "step": 69800 + }, + { + "epoch": 1.4201863606770833, + "grad_norm": 15.835996627807617, + "learning_rate": 7.203667529605936e-06, + "loss": 4.969, + "step": 69805 + }, + { + "epoch": 1.4202880859375, + "grad_norm": 18.342754364013672, + "learning_rate": 7.203308756899619e-06, + "loss": 4.8928, + "step": 69810 + }, + { + "epoch": 1.4203898111979167, + "grad_norm": 13.415194511413574, + "learning_rate": 7.2029499701148576e-06, + "loss": 4.9797, + "step": 69815 + }, + { + "epoch": 1.4204915364583333, + "grad_norm": 19.130231857299805, + "learning_rate": 7.202591169253944e-06, + "loss": 4.9779, + "step": 69820 + }, + { + "epoch": 1.42059326171875, + "grad_norm": 20.830799102783203, + "learning_rate": 7.202232354319174e-06, + "loss": 4.941, + "step": 69825 + }, + { + "epoch": 1.4206949869791667, + "grad_norm": 17.36247444152832, + "learning_rate": 7.201873525312837e-06, + "loss": 4.9165, + "step": 69830 + }, + { + "epoch": 1.4207967122395833, + "grad_norm": 23.362789154052734, + "learning_rate": 7.201514682237227e-06, + "loss": 4.8765, + "step": 69835 + }, + { + "epoch": 1.4208984375, + "grad_norm": 22.617887496948242, + "learning_rate": 7.201155825094639e-06, + "loss": 4.9069, + "step": 69840 + }, + { + "epoch": 1.4210001627604167, + "grad_norm": 12.854693412780762, + "learning_rate": 7.200796953887361e-06, + "loss": 5.132, + "step": 69845 + }, + { + "epoch": 1.4211018880208333, + "grad_norm": 19.40606689453125, + "learning_rate": 7.20043806861769e-06, + "loss": 5.0253, + "step": 69850 + }, + { + "epoch": 1.42120361328125, + "grad_norm": 18.959686279296875, + "learning_rate": 7.20007916928792e-06, + "loss": 4.7575, + "step": 69855 + }, + { + "epoch": 1.4213053385416667, + "grad_norm": 14.42251205444336, + "learning_rate": 7.199720255900341e-06, + "loss": 5.1113, + "step": 69860 + }, + { + "epoch": 1.4214070638020833, + "grad_norm": 17.0681209564209, + "learning_rate": 7.199361328457248e-06, + "loss": 5.0996, + "step": 69865 + }, + { + "epoch": 1.4215087890625, + "grad_norm": 17.761005401611328, + "learning_rate": 7.199002386960935e-06, + "loss": 4.8968, + "step": 69870 + }, + { + "epoch": 1.4216105143229167, + "grad_norm": 14.286431312561035, + "learning_rate": 7.198643431413694e-06, + "loss": 4.7011, + "step": 69875 + }, + { + "epoch": 1.4217122395833333, + "grad_norm": 18.730693817138672, + "learning_rate": 7.19828446181782e-06, + "loss": 4.9915, + "step": 69880 + }, + { + "epoch": 1.42181396484375, + "grad_norm": 19.892478942871094, + "learning_rate": 7.197925478175606e-06, + "loss": 4.7851, + "step": 69885 + }, + { + "epoch": 1.4219156901041667, + "grad_norm": 14.59469223022461, + "learning_rate": 7.197566480489346e-06, + "loss": 5.179, + "step": 69890 + }, + { + "epoch": 1.4220174153645833, + "grad_norm": 15.453774452209473, + "learning_rate": 7.197207468761333e-06, + "loss": 5.1035, + "step": 69895 + }, + { + "epoch": 1.422119140625, + "grad_norm": 23.821041107177734, + "learning_rate": 7.196848442993863e-06, + "loss": 5.0309, + "step": 69900 + }, + { + "epoch": 1.4222208658854167, + "grad_norm": 16.56389808654785, + "learning_rate": 7.196489403189228e-06, + "loss": 5.052, + "step": 69905 + }, + { + "epoch": 1.4223225911458333, + "grad_norm": 20.362062454223633, + "learning_rate": 7.196130349349723e-06, + "loss": 4.7609, + "step": 69910 + }, + { + "epoch": 1.42242431640625, + "grad_norm": 18.079896926879883, + "learning_rate": 7.195771281477644e-06, + "loss": 5.2088, + "step": 69915 + }, + { + "epoch": 1.4225260416666667, + "grad_norm": 16.63059425354004, + "learning_rate": 7.195412199575282e-06, + "loss": 5.1144, + "step": 69920 + }, + { + "epoch": 1.4226277669270833, + "grad_norm": 16.361387252807617, + "learning_rate": 7.195053103644932e-06, + "loss": 5.085, + "step": 69925 + }, + { + "epoch": 1.4227294921875, + "grad_norm": 17.131393432617188, + "learning_rate": 7.194693993688891e-06, + "loss": 4.8513, + "step": 69930 + }, + { + "epoch": 1.4228312174479167, + "grad_norm": 21.13148307800293, + "learning_rate": 7.19433486970945e-06, + "loss": 4.9272, + "step": 69935 + }, + { + "epoch": 1.4229329427083333, + "grad_norm": 17.226530075073242, + "learning_rate": 7.193975731708908e-06, + "loss": 5.1498, + "step": 69940 + }, + { + "epoch": 1.42303466796875, + "grad_norm": 19.096099853515625, + "learning_rate": 7.193616579689558e-06, + "loss": 4.8896, + "step": 69945 + }, + { + "epoch": 1.4231363932291667, + "grad_norm": 16.1047420501709, + "learning_rate": 7.193257413653692e-06, + "loss": 4.9244, + "step": 69950 + }, + { + "epoch": 1.4232381184895833, + "grad_norm": 15.994318962097168, + "learning_rate": 7.192898233603609e-06, + "loss": 4.8973, + "step": 69955 + }, + { + "epoch": 1.42333984375, + "grad_norm": 16.974794387817383, + "learning_rate": 7.192539039541601e-06, + "loss": 4.9646, + "step": 69960 + }, + { + "epoch": 1.4234415690104167, + "grad_norm": 17.251033782958984, + "learning_rate": 7.192179831469967e-06, + "loss": 5.1696, + "step": 69965 + }, + { + "epoch": 1.4235432942708333, + "grad_norm": 20.025922775268555, + "learning_rate": 7.191820609390998e-06, + "loss": 5.334, + "step": 69970 + }, + { + "epoch": 1.42364501953125, + "grad_norm": 17.506641387939453, + "learning_rate": 7.19146137330699e-06, + "loss": 5.1227, + "step": 69975 + }, + { + "epoch": 1.4237467447916667, + "grad_norm": 23.794179916381836, + "learning_rate": 7.191102123220242e-06, + "loss": 5.072, + "step": 69980 + }, + { + "epoch": 1.4238484700520833, + "grad_norm": 15.338414192199707, + "learning_rate": 7.190742859133045e-06, + "loss": 4.8247, + "step": 69985 + }, + { + "epoch": 1.4239501953125, + "grad_norm": 19.619016647338867, + "learning_rate": 7.1903835810476975e-06, + "loss": 4.7988, + "step": 69990 + }, + { + "epoch": 1.4240519205729167, + "grad_norm": 15.30238151550293, + "learning_rate": 7.190024288966494e-06, + "loss": 4.7264, + "step": 69995 + }, + { + "epoch": 1.4241536458333333, + "grad_norm": 16.836870193481445, + "learning_rate": 7.189664982891729e-06, + "loss": 4.7117, + "step": 70000 + }, + { + "epoch": 1.42425537109375, + "grad_norm": 29.725473403930664, + "learning_rate": 7.189305662825703e-06, + "loss": 5.0607, + "step": 70005 + }, + { + "epoch": 1.4243570963541667, + "grad_norm": 22.261323928833008, + "learning_rate": 7.188946328770707e-06, + "loss": 5.1496, + "step": 70010 + }, + { + "epoch": 1.4244588216145833, + "grad_norm": 15.625893592834473, + "learning_rate": 7.188586980729039e-06, + "loss": 4.7504, + "step": 70015 + }, + { + "epoch": 1.424560546875, + "grad_norm": 15.779646873474121, + "learning_rate": 7.188227618702995e-06, + "loss": 4.8734, + "step": 70020 + }, + { + "epoch": 1.4246622721354167, + "grad_norm": 25.75416374206543, + "learning_rate": 7.187868242694871e-06, + "loss": 4.8306, + "step": 70025 + }, + { + "epoch": 1.4247639973958333, + "grad_norm": 24.742111206054688, + "learning_rate": 7.187508852706964e-06, + "loss": 4.96, + "step": 70030 + }, + { + "epoch": 1.42486572265625, + "grad_norm": 15.302083969116211, + "learning_rate": 7.1871494487415706e-06, + "loss": 4.9812, + "step": 70035 + }, + { + "epoch": 1.4249674479166667, + "grad_norm": 15.485260963439941, + "learning_rate": 7.186790030800984e-06, + "loss": 4.6548, + "step": 70040 + }, + { + "epoch": 1.4250691731770833, + "grad_norm": 13.799511909484863, + "learning_rate": 7.186430598887505e-06, + "loss": 4.5684, + "step": 70045 + }, + { + "epoch": 1.4251708984375, + "grad_norm": 18.650503158569336, + "learning_rate": 7.1860711530034275e-06, + "loss": 4.8933, + "step": 70050 + }, + { + "epoch": 1.4252726236979167, + "grad_norm": 18.72798728942871, + "learning_rate": 7.185711693151051e-06, + "loss": 4.9396, + "step": 70055 + }, + { + "epoch": 1.4253743489583333, + "grad_norm": 18.365633010864258, + "learning_rate": 7.18535221933267e-06, + "loss": 5.0391, + "step": 70060 + }, + { + "epoch": 1.42547607421875, + "grad_norm": 16.648488998413086, + "learning_rate": 7.1849927315505805e-06, + "loss": 4.8834, + "step": 70065 + }, + { + "epoch": 1.4255777994791667, + "grad_norm": 18.94120216369629, + "learning_rate": 7.1846332298070834e-06, + "loss": 4.6723, + "step": 70070 + }, + { + "epoch": 1.4256795247395833, + "grad_norm": 17.094982147216797, + "learning_rate": 7.184273714104472e-06, + "loss": 5.1158, + "step": 70075 + }, + { + "epoch": 1.42578125, + "grad_norm": 31.187931060791016, + "learning_rate": 7.183914184445044e-06, + "loss": 5.0915, + "step": 70080 + }, + { + "epoch": 1.4258829752604167, + "grad_norm": 22.83160972595215, + "learning_rate": 7.183554640831101e-06, + "loss": 5.1053, + "step": 70085 + }, + { + "epoch": 1.4259847005208333, + "grad_norm": 20.826244354248047, + "learning_rate": 7.1831950832649334e-06, + "loss": 4.8801, + "step": 70090 + }, + { + "epoch": 1.42608642578125, + "grad_norm": 22.047107696533203, + "learning_rate": 7.182835511748845e-06, + "loss": 4.7862, + "step": 70095 + }, + { + "epoch": 1.4261881510416667, + "grad_norm": 26.319791793823242, + "learning_rate": 7.182475926285129e-06, + "loss": 4.901, + "step": 70100 + }, + { + "epoch": 1.4262898763020833, + "grad_norm": 18.880874633789062, + "learning_rate": 7.182116326876084e-06, + "loss": 5.0449, + "step": 70105 + }, + { + "epoch": 1.4263916015625, + "grad_norm": 17.752592086791992, + "learning_rate": 7.18175671352401e-06, + "loss": 4.9096, + "step": 70110 + }, + { + "epoch": 1.4264933268229167, + "grad_norm": 17.199087142944336, + "learning_rate": 7.181397086231202e-06, + "loss": 5.0233, + "step": 70115 + }, + { + "epoch": 1.4265950520833333, + "grad_norm": 18.45903778076172, + "learning_rate": 7.18103744499996e-06, + "loss": 4.8017, + "step": 70120 + }, + { + "epoch": 1.42669677734375, + "grad_norm": 14.324954986572266, + "learning_rate": 7.1806777898325795e-06, + "loss": 4.8928, + "step": 70125 + }, + { + "epoch": 1.4267985026041667, + "grad_norm": 23.460533142089844, + "learning_rate": 7.180318120731361e-06, + "loss": 4.8644, + "step": 70130 + }, + { + "epoch": 1.4269002278645833, + "grad_norm": 19.935504913330078, + "learning_rate": 7.179958437698602e-06, + "loss": 4.8362, + "step": 70135 + }, + { + "epoch": 1.427001953125, + "grad_norm": 34.12960433959961, + "learning_rate": 7.179598740736602e-06, + "loss": 4.8817, + "step": 70140 + }, + { + "epoch": 1.4271036783854167, + "grad_norm": 15.40085220336914, + "learning_rate": 7.179239029847655e-06, + "loss": 4.8531, + "step": 70145 + }, + { + "epoch": 1.4272054036458333, + "grad_norm": 15.583209991455078, + "learning_rate": 7.178879305034066e-06, + "loss": 4.7122, + "step": 70150 + }, + { + "epoch": 1.42730712890625, + "grad_norm": 18.73274040222168, + "learning_rate": 7.178519566298127e-06, + "loss": 5.2897, + "step": 70155 + }, + { + "epoch": 1.4274088541666667, + "grad_norm": 19.99887466430664, + "learning_rate": 7.178159813642141e-06, + "loss": 4.6845, + "step": 70160 + }, + { + "epoch": 1.4275105794270833, + "grad_norm": 18.188392639160156, + "learning_rate": 7.177800047068405e-06, + "loss": 4.809, + "step": 70165 + }, + { + "epoch": 1.4276123046875, + "grad_norm": 17.123870849609375, + "learning_rate": 7.177440266579217e-06, + "loss": 4.8937, + "step": 70170 + }, + { + "epoch": 1.4277140299479167, + "grad_norm": 16.98628044128418, + "learning_rate": 7.177080472176878e-06, + "loss": 5.1005, + "step": 70175 + }, + { + "epoch": 1.4278157552083333, + "grad_norm": 20.04388999938965, + "learning_rate": 7.176720663863685e-06, + "loss": 4.9326, + "step": 70180 + }, + { + "epoch": 1.42791748046875, + "grad_norm": 18.96234893798828, + "learning_rate": 7.176360841641939e-06, + "loss": 4.919, + "step": 70185 + }, + { + "epoch": 1.4280192057291667, + "grad_norm": 15.861912727355957, + "learning_rate": 7.176001005513939e-06, + "loss": 5.1279, + "step": 70190 + }, + { + "epoch": 1.4281209309895833, + "grad_norm": 18.398693084716797, + "learning_rate": 7.17564115548198e-06, + "loss": 4.8273, + "step": 70195 + }, + { + "epoch": 1.42822265625, + "grad_norm": 15.9738187789917, + "learning_rate": 7.175281291548367e-06, + "loss": 4.6582, + "step": 70200 + }, + { + "epoch": 1.4283243815104167, + "grad_norm": 19.84161376953125, + "learning_rate": 7.174921413715397e-06, + "loss": 4.9145, + "step": 70205 + }, + { + "epoch": 1.4284261067708333, + "grad_norm": 19.358156204223633, + "learning_rate": 7.174561521985369e-06, + "loss": 4.8879, + "step": 70210 + }, + { + "epoch": 1.42852783203125, + "grad_norm": 15.03115177154541, + "learning_rate": 7.174201616360584e-06, + "loss": 4.9696, + "step": 70215 + }, + { + "epoch": 1.4286295572916667, + "grad_norm": 22.66036605834961, + "learning_rate": 7.17384169684334e-06, + "loss": 5.2007, + "step": 70220 + }, + { + "epoch": 1.4287312825520833, + "grad_norm": 18.626380920410156, + "learning_rate": 7.173481763435937e-06, + "loss": 4.9405, + "step": 70225 + }, + { + "epoch": 1.4288330078125, + "grad_norm": 18.95661163330078, + "learning_rate": 7.173121816140678e-06, + "loss": 4.9438, + "step": 70230 + }, + { + "epoch": 1.4289347330729167, + "grad_norm": 15.123517990112305, + "learning_rate": 7.172761854959858e-06, + "loss": 4.9735, + "step": 70235 + }, + { + "epoch": 1.4290364583333333, + "grad_norm": 19.50908088684082, + "learning_rate": 7.17240187989578e-06, + "loss": 4.9662, + "step": 70240 + }, + { + "epoch": 1.42913818359375, + "grad_norm": 15.141057014465332, + "learning_rate": 7.172041890950744e-06, + "loss": 5.0095, + "step": 70245 + }, + { + "epoch": 1.4292399088541667, + "grad_norm": 17.59922218322754, + "learning_rate": 7.17168188812705e-06, + "loss": 4.8651, + "step": 70250 + }, + { + "epoch": 1.4293416341145833, + "grad_norm": 17.20194435119629, + "learning_rate": 7.1713218714269974e-06, + "loss": 5.0259, + "step": 70255 + }, + { + "epoch": 1.429443359375, + "grad_norm": 18.564369201660156, + "learning_rate": 7.170961840852887e-06, + "loss": 4.9234, + "step": 70260 + }, + { + "epoch": 1.4295450846354167, + "grad_norm": 16.55963706970215, + "learning_rate": 7.170601796407021e-06, + "loss": 4.9117, + "step": 70265 + }, + { + "epoch": 1.4296468098958333, + "grad_norm": 14.815430641174316, + "learning_rate": 7.170241738091696e-06, + "loss": 5.1234, + "step": 70270 + }, + { + "epoch": 1.42974853515625, + "grad_norm": 15.690359115600586, + "learning_rate": 7.169881665909218e-06, + "loss": 4.7466, + "step": 70275 + }, + { + "epoch": 1.4298502604166667, + "grad_norm": 16.743961334228516, + "learning_rate": 7.169521579861884e-06, + "loss": 4.7323, + "step": 70280 + }, + { + "epoch": 1.4299519856770833, + "grad_norm": 18.21429443359375, + "learning_rate": 7.169161479951996e-06, + "loss": 4.771, + "step": 70285 + }, + { + "epoch": 1.4300537109375, + "grad_norm": 28.823177337646484, + "learning_rate": 7.168801366181855e-06, + "loss": 5.2136, + "step": 70290 + }, + { + "epoch": 1.4301554361979167, + "grad_norm": 16.620527267456055, + "learning_rate": 7.168441238553763e-06, + "loss": 4.8366, + "step": 70295 + }, + { + "epoch": 1.4302571614583333, + "grad_norm": 14.275616645812988, + "learning_rate": 7.168081097070016e-06, + "loss": 5.2772, + "step": 70300 + }, + { + "epoch": 1.43035888671875, + "grad_norm": 19.304176330566406, + "learning_rate": 7.167720941732923e-06, + "loss": 4.8344, + "step": 70305 + }, + { + "epoch": 1.4304606119791667, + "grad_norm": 15.66132640838623, + "learning_rate": 7.167360772544778e-06, + "loss": 4.9188, + "step": 70310 + }, + { + "epoch": 1.4305623372395833, + "grad_norm": 17.429845809936523, + "learning_rate": 7.167000589507889e-06, + "loss": 5.1323, + "step": 70315 + }, + { + "epoch": 1.4306640625, + "grad_norm": 22.581315994262695, + "learning_rate": 7.166640392624552e-06, + "loss": 4.8308, + "step": 70320 + }, + { + "epoch": 1.4307657877604167, + "grad_norm": 17.079322814941406, + "learning_rate": 7.166280181897071e-06, + "loss": 4.9109, + "step": 70325 + }, + { + "epoch": 1.4308675130208333, + "grad_norm": 19.891700744628906, + "learning_rate": 7.165919957327747e-06, + "loss": 4.9546, + "step": 70330 + }, + { + "epoch": 1.43096923828125, + "grad_norm": 16.18413543701172, + "learning_rate": 7.165559718918882e-06, + "loss": 4.7928, + "step": 70335 + }, + { + "epoch": 1.4310709635416667, + "grad_norm": 14.840444564819336, + "learning_rate": 7.165199466672779e-06, + "loss": 5.0157, + "step": 70340 + }, + { + "epoch": 1.4311726888020833, + "grad_norm": 15.848177909851074, + "learning_rate": 7.1648392005917375e-06, + "loss": 4.8215, + "step": 70345 + }, + { + "epoch": 1.4312744140625, + "grad_norm": 16.996522903442383, + "learning_rate": 7.16447892067806e-06, + "loss": 4.8727, + "step": 70350 + }, + { + "epoch": 1.4313761393229167, + "grad_norm": 18.38343620300293, + "learning_rate": 7.164118626934051e-06, + "loss": 4.9991, + "step": 70355 + }, + { + "epoch": 1.4314778645833333, + "grad_norm": 19.58608627319336, + "learning_rate": 7.163758319362011e-06, + "loss": 5.0181, + "step": 70360 + }, + { + "epoch": 1.43157958984375, + "grad_norm": 15.282454490661621, + "learning_rate": 7.163397997964241e-06, + "loss": 5.0597, + "step": 70365 + }, + { + "epoch": 1.4316813151041667, + "grad_norm": 18.415306091308594, + "learning_rate": 7.163037662743046e-06, + "loss": 4.9464, + "step": 70370 + }, + { + "epoch": 1.4317830403645833, + "grad_norm": 16.51983642578125, + "learning_rate": 7.162677313700725e-06, + "loss": 5.089, + "step": 70375 + }, + { + "epoch": 1.431884765625, + "grad_norm": 14.60922622680664, + "learning_rate": 7.162316950839585e-06, + "loss": 5.0281, + "step": 70380 + }, + { + "epoch": 1.4319864908854167, + "grad_norm": 21.2652587890625, + "learning_rate": 7.1619565741619235e-06, + "loss": 5.1977, + "step": 70385 + }, + { + "epoch": 1.4320882161458333, + "grad_norm": 26.87403678894043, + "learning_rate": 7.1615961836700475e-06, + "loss": 4.7474, + "step": 70390 + }, + { + "epoch": 1.43218994140625, + "grad_norm": 15.975127220153809, + "learning_rate": 7.161235779366257e-06, + "loss": 5.1411, + "step": 70395 + }, + { + "epoch": 1.4322916666666667, + "grad_norm": 18.445762634277344, + "learning_rate": 7.160875361252856e-06, + "loss": 5.0353, + "step": 70400 + }, + { + "epoch": 1.4323933919270833, + "grad_norm": 20.03849983215332, + "learning_rate": 7.1605149293321484e-06, + "loss": 4.6696, + "step": 70405 + }, + { + "epoch": 1.4324951171875, + "grad_norm": 17.8513126373291, + "learning_rate": 7.1601544836064364e-06, + "loss": 4.6993, + "step": 70410 + }, + { + "epoch": 1.4325968424479167, + "grad_norm": 15.6367769241333, + "learning_rate": 7.159794024078022e-06, + "loss": 4.7474, + "step": 70415 + }, + { + "epoch": 1.4326985677083333, + "grad_norm": 17.44361114501953, + "learning_rate": 7.1594335507492095e-06, + "loss": 4.7862, + "step": 70420 + }, + { + "epoch": 1.43280029296875, + "grad_norm": 14.918737411499023, + "learning_rate": 7.1590730636223016e-06, + "loss": 4.8038, + "step": 70425 + }, + { + "epoch": 1.4329020182291667, + "grad_norm": 23.380443572998047, + "learning_rate": 7.158712562699604e-06, + "loss": 4.8591, + "step": 70430 + }, + { + "epoch": 1.4330037434895833, + "grad_norm": 17.292821884155273, + "learning_rate": 7.158352047983416e-06, + "loss": 5.1387, + "step": 70435 + }, + { + "epoch": 1.43310546875, + "grad_norm": 16.922157287597656, + "learning_rate": 7.157991519476046e-06, + "loss": 4.7092, + "step": 70440 + }, + { + "epoch": 1.4332071940104167, + "grad_norm": 22.69989585876465, + "learning_rate": 7.1576309771797945e-06, + "loss": 4.9203, + "step": 70445 + }, + { + "epoch": 1.4333089192708333, + "grad_norm": 17.30035400390625, + "learning_rate": 7.157270421096966e-06, + "loss": 4.915, + "step": 70450 + }, + { + "epoch": 1.43341064453125, + "grad_norm": 17.134050369262695, + "learning_rate": 7.156909851229863e-06, + "loss": 4.7398, + "step": 70455 + }, + { + "epoch": 1.4335123697916667, + "grad_norm": 14.688288688659668, + "learning_rate": 7.156549267580792e-06, + "loss": 5.1657, + "step": 70460 + }, + { + "epoch": 1.4336140950520833, + "grad_norm": 19.58487892150879, + "learning_rate": 7.156188670152056e-06, + "loss": 4.9768, + "step": 70465 + }, + { + "epoch": 1.4337158203125, + "grad_norm": 22.260957717895508, + "learning_rate": 7.155828058945957e-06, + "loss": 4.8436, + "step": 70470 + }, + { + "epoch": 1.4338175455729167, + "grad_norm": 20.975492477416992, + "learning_rate": 7.155467433964803e-06, + "loss": 4.8702, + "step": 70475 + }, + { + "epoch": 1.4339192708333333, + "grad_norm": 16.179481506347656, + "learning_rate": 7.155106795210896e-06, + "loss": 4.838, + "step": 70480 + }, + { + "epoch": 1.43402099609375, + "grad_norm": 15.181025505065918, + "learning_rate": 7.154746142686541e-06, + "loss": 4.9008, + "step": 70485 + }, + { + "epoch": 1.4341227213541667, + "grad_norm": 19.214902877807617, + "learning_rate": 7.1543854763940405e-06, + "loss": 5.2106, + "step": 70490 + }, + { + "epoch": 1.4342244466145833, + "grad_norm": 18.247169494628906, + "learning_rate": 7.154024796335701e-06, + "loss": 4.9774, + "step": 70495 + }, + { + "epoch": 1.434326171875, + "grad_norm": 15.593696594238281, + "learning_rate": 7.153664102513829e-06, + "loss": 4.795, + "step": 70500 + }, + { + "epoch": 1.4344278971354167, + "grad_norm": 16.950786590576172, + "learning_rate": 7.153303394930725e-06, + "loss": 4.9231, + "step": 70505 + }, + { + "epoch": 1.4345296223958333, + "grad_norm": 15.552152633666992, + "learning_rate": 7.152942673588696e-06, + "loss": 5.0031, + "step": 70510 + }, + { + "epoch": 1.43463134765625, + "grad_norm": 14.0167818069458, + "learning_rate": 7.152581938490048e-06, + "loss": 5.0801, + "step": 70515 + }, + { + "epoch": 1.4347330729166667, + "grad_norm": 14.291637420654297, + "learning_rate": 7.152221189637084e-06, + "loss": 5.0427, + "step": 70520 + }, + { + "epoch": 1.4348347981770833, + "grad_norm": 18.653352737426758, + "learning_rate": 7.151860427032109e-06, + "loss": 4.8832, + "step": 70525 + }, + { + "epoch": 1.4349365234375, + "grad_norm": 18.839889526367188, + "learning_rate": 7.15149965067743e-06, + "loss": 4.8877, + "step": 70530 + }, + { + "epoch": 1.4350382486979167, + "grad_norm": 15.769993782043457, + "learning_rate": 7.1511388605753505e-06, + "loss": 4.8156, + "step": 70535 + }, + { + "epoch": 1.4351399739583333, + "grad_norm": 23.12473487854004, + "learning_rate": 7.150778056728177e-06, + "loss": 4.9639, + "step": 70540 + }, + { + "epoch": 1.43524169921875, + "grad_norm": 23.410568237304688, + "learning_rate": 7.150417239138214e-06, + "loss": 5.0608, + "step": 70545 + }, + { + "epoch": 1.4353434244791667, + "grad_norm": 20.35376739501953, + "learning_rate": 7.150056407807768e-06, + "loss": 5.035, + "step": 70550 + }, + { + "epoch": 1.4354451497395833, + "grad_norm": 16.4883975982666, + "learning_rate": 7.149695562739144e-06, + "loss": 4.7171, + "step": 70555 + }, + { + "epoch": 1.435546875, + "grad_norm": 22.616188049316406, + "learning_rate": 7.149334703934648e-06, + "loss": 5.1237, + "step": 70560 + }, + { + "epoch": 1.4356486002604167, + "grad_norm": 15.974313735961914, + "learning_rate": 7.148973831396586e-06, + "loss": 4.9622, + "step": 70565 + }, + { + "epoch": 1.4357503255208333, + "grad_norm": 20.768526077270508, + "learning_rate": 7.148612945127262e-06, + "loss": 5.0179, + "step": 70570 + }, + { + "epoch": 1.43585205078125, + "grad_norm": 16.398120880126953, + "learning_rate": 7.148252045128985e-06, + "loss": 4.9361, + "step": 70575 + }, + { + "epoch": 1.4359537760416667, + "grad_norm": 21.551319122314453, + "learning_rate": 7.147891131404057e-06, + "loss": 5.1565, + "step": 70580 + }, + { + "epoch": 1.4360555013020833, + "grad_norm": 16.57314682006836, + "learning_rate": 7.147530203954789e-06, + "loss": 5.0739, + "step": 70585 + }, + { + "epoch": 1.4361572265625, + "grad_norm": 14.087358474731445, + "learning_rate": 7.147169262783483e-06, + "loss": 4.8733, + "step": 70590 + }, + { + "epoch": 1.4362589518229167, + "grad_norm": 27.23332977294922, + "learning_rate": 7.146808307892448e-06, + "loss": 4.8117, + "step": 70595 + }, + { + "epoch": 1.4363606770833333, + "grad_norm": 17.65122413635254, + "learning_rate": 7.14644733928399e-06, + "loss": 5.1719, + "step": 70600 + }, + { + "epoch": 1.43646240234375, + "grad_norm": 21.647567749023438, + "learning_rate": 7.1460863569604135e-06, + "loss": 4.9889, + "step": 70605 + }, + { + "epoch": 1.4365641276041667, + "grad_norm": 25.84885025024414, + "learning_rate": 7.145725360924026e-06, + "loss": 4.7936, + "step": 70610 + }, + { + "epoch": 1.4366658528645833, + "grad_norm": 17.33889389038086, + "learning_rate": 7.145364351177135e-06, + "loss": 5.1284, + "step": 70615 + }, + { + "epoch": 1.436767578125, + "grad_norm": 16.551822662353516, + "learning_rate": 7.145003327722047e-06, + "loss": 4.7475, + "step": 70620 + }, + { + "epoch": 1.4368693033854167, + "grad_norm": 25.49342155456543, + "learning_rate": 7.1446422905610704e-06, + "loss": 5.0163, + "step": 70625 + }, + { + "epoch": 1.4369710286458333, + "grad_norm": 14.990520477294922, + "learning_rate": 7.144281239696509e-06, + "loss": 4.7569, + "step": 70630 + }, + { + "epoch": 1.43707275390625, + "grad_norm": 20.333770751953125, + "learning_rate": 7.14392017513067e-06, + "loss": 4.6921, + "step": 70635 + }, + { + "epoch": 1.4371744791666667, + "grad_norm": 15.360723495483398, + "learning_rate": 7.143559096865864e-06, + "loss": 4.7509, + "step": 70640 + }, + { + "epoch": 1.4372762044270833, + "grad_norm": 23.425622940063477, + "learning_rate": 7.143198004904394e-06, + "loss": 4.9242, + "step": 70645 + }, + { + "epoch": 1.4373779296875, + "grad_norm": 19.015453338623047, + "learning_rate": 7.14283689924857e-06, + "loss": 4.6915, + "step": 70650 + }, + { + "epoch": 1.4374796549479167, + "grad_norm": 19.328956604003906, + "learning_rate": 7.142475779900697e-06, + "loss": 4.8753, + "step": 70655 + }, + { + "epoch": 1.4375813802083333, + "grad_norm": 19.2026424407959, + "learning_rate": 7.142114646863086e-06, + "loss": 5.1529, + "step": 70660 + }, + { + "epoch": 1.43768310546875, + "grad_norm": 16.998760223388672, + "learning_rate": 7.14175350013804e-06, + "loss": 4.9914, + "step": 70665 + }, + { + "epoch": 1.4377848307291667, + "grad_norm": 18.30072784423828, + "learning_rate": 7.141392339727871e-06, + "loss": 4.9551, + "step": 70670 + }, + { + "epoch": 1.4378865559895833, + "grad_norm": 17.726444244384766, + "learning_rate": 7.141031165634883e-06, + "loss": 4.9013, + "step": 70675 + }, + { + "epoch": 1.43798828125, + "grad_norm": 19.577526092529297, + "learning_rate": 7.1406699778613875e-06, + "loss": 5.0237, + "step": 70680 + }, + { + "epoch": 1.4380900065104167, + "grad_norm": 16.459592819213867, + "learning_rate": 7.140308776409689e-06, + "loss": 4.9813, + "step": 70685 + }, + { + "epoch": 1.4381917317708333, + "grad_norm": 17.286481857299805, + "learning_rate": 7.139947561282097e-06, + "loss": 4.8618, + "step": 70690 + }, + { + "epoch": 1.43829345703125, + "grad_norm": 15.62448787689209, + "learning_rate": 7.139586332480921e-06, + "loss": 5.0486, + "step": 70695 + }, + { + "epoch": 1.4383951822916667, + "grad_norm": 19.490758895874023, + "learning_rate": 7.139225090008466e-06, + "loss": 4.88, + "step": 70700 + }, + { + "epoch": 1.4384969075520833, + "grad_norm": 16.243892669677734, + "learning_rate": 7.1388638338670415e-06, + "loss": 4.9266, + "step": 70705 + }, + { + "epoch": 1.4385986328125, + "grad_norm": 12.960511207580566, + "learning_rate": 7.138502564058956e-06, + "loss": 4.8888, + "step": 70710 + }, + { + "epoch": 1.4387003580729167, + "grad_norm": 22.277311325073242, + "learning_rate": 7.138141280586519e-06, + "loss": 4.795, + "step": 70715 + }, + { + "epoch": 1.4388020833333333, + "grad_norm": 37.61860656738281, + "learning_rate": 7.137779983452038e-06, + "loss": 4.8817, + "step": 70720 + }, + { + "epoch": 1.43890380859375, + "grad_norm": 16.297903060913086, + "learning_rate": 7.137418672657821e-06, + "loss": 4.8185, + "step": 70725 + }, + { + "epoch": 1.4390055338541667, + "grad_norm": 15.603626251220703, + "learning_rate": 7.137057348206177e-06, + "loss": 4.7611, + "step": 70730 + }, + { + "epoch": 1.4391072591145833, + "grad_norm": 17.983762741088867, + "learning_rate": 7.136696010099416e-06, + "loss": 4.7475, + "step": 70735 + }, + { + "epoch": 1.439208984375, + "grad_norm": 25.016611099243164, + "learning_rate": 7.136334658339844e-06, + "loss": 4.7243, + "step": 70740 + }, + { + "epoch": 1.4393107096354167, + "grad_norm": 12.83775806427002, + "learning_rate": 7.135973292929773e-06, + "loss": 4.7308, + "step": 70745 + }, + { + "epoch": 1.4394124348958333, + "grad_norm": 18.895187377929688, + "learning_rate": 7.13561191387151e-06, + "loss": 5.1667, + "step": 70750 + }, + { + "epoch": 1.43951416015625, + "grad_norm": 18.133838653564453, + "learning_rate": 7.135250521167365e-06, + "loss": 4.8183, + "step": 70755 + }, + { + "epoch": 1.4396158854166667, + "grad_norm": 21.959741592407227, + "learning_rate": 7.134889114819648e-06, + "loss": 4.7448, + "step": 70760 + }, + { + "epoch": 1.4397176106770833, + "grad_norm": 18.49034881591797, + "learning_rate": 7.134527694830665e-06, + "loss": 4.8669, + "step": 70765 + }, + { + "epoch": 1.4398193359375, + "grad_norm": 23.266128540039062, + "learning_rate": 7.1341662612027294e-06, + "loss": 5.0873, + "step": 70770 + }, + { + "epoch": 1.4399210611979167, + "grad_norm": 15.432271003723145, + "learning_rate": 7.133804813938148e-06, + "loss": 4.7636, + "step": 70775 + }, + { + "epoch": 1.4400227864583333, + "grad_norm": 18.462228775024414, + "learning_rate": 7.133443353039232e-06, + "loss": 5.0721, + "step": 70780 + }, + { + "epoch": 1.44012451171875, + "grad_norm": 15.615392684936523, + "learning_rate": 7.13308187850829e-06, + "loss": 5.1831, + "step": 70785 + }, + { + "epoch": 1.4402262369791667, + "grad_norm": 17.628067016601562, + "learning_rate": 7.132720390347632e-06, + "loss": 4.7202, + "step": 70790 + }, + { + "epoch": 1.4403279622395833, + "grad_norm": 18.3174991607666, + "learning_rate": 7.132358888559567e-06, + "loss": 4.97, + "step": 70795 + }, + { + "epoch": 1.4404296875, + "grad_norm": 30.600753784179688, + "learning_rate": 7.1319973731464064e-06, + "loss": 5.0754, + "step": 70800 + }, + { + "epoch": 1.4405314127604167, + "grad_norm": 22.195219039916992, + "learning_rate": 7.131635844110459e-06, + "loss": 4.7078, + "step": 70805 + }, + { + "epoch": 1.4406331380208333, + "grad_norm": 21.566301345825195, + "learning_rate": 7.1312743014540356e-06, + "loss": 4.981, + "step": 70810 + }, + { + "epoch": 1.44073486328125, + "grad_norm": 21.656396865844727, + "learning_rate": 7.1309127451794445e-06, + "loss": 4.8611, + "step": 70815 + }, + { + "epoch": 1.4408365885416667, + "grad_norm": 13.81451416015625, + "learning_rate": 7.130551175288999e-06, + "loss": 4.8214, + "step": 70820 + }, + { + "epoch": 1.4409383138020833, + "grad_norm": 15.529086112976074, + "learning_rate": 7.130189591785008e-06, + "loss": 4.7155, + "step": 70825 + }, + { + "epoch": 1.4410400390625, + "grad_norm": 19.5197811126709, + "learning_rate": 7.12982799466978e-06, + "loss": 5.1401, + "step": 70830 + }, + { + "epoch": 1.4411417643229167, + "grad_norm": 19.265974044799805, + "learning_rate": 7.129466383945629e-06, + "loss": 5.1834, + "step": 70835 + }, + { + "epoch": 1.4412434895833333, + "grad_norm": 20.056753158569336, + "learning_rate": 7.129104759614863e-06, + "loss": 4.8207, + "step": 70840 + }, + { + "epoch": 1.44134521484375, + "grad_norm": 19.19114112854004, + "learning_rate": 7.128743121679794e-06, + "loss": 4.8886, + "step": 70845 + }, + { + "epoch": 1.4414469401041667, + "grad_norm": 18.982378005981445, + "learning_rate": 7.128381470142732e-06, + "loss": 4.9215, + "step": 70850 + }, + { + "epoch": 1.4415486653645833, + "grad_norm": 13.908430099487305, + "learning_rate": 7.128019805005988e-06, + "loss": 4.714, + "step": 70855 + }, + { + "epoch": 1.441650390625, + "grad_norm": 21.511354446411133, + "learning_rate": 7.1276581262718725e-06, + "loss": 4.9734, + "step": 70860 + }, + { + "epoch": 1.4417521158854167, + "grad_norm": 16.07010269165039, + "learning_rate": 7.1272964339426964e-06, + "loss": 4.8445, + "step": 70865 + }, + { + "epoch": 1.4418538411458333, + "grad_norm": 17.557750701904297, + "learning_rate": 7.1269347280207736e-06, + "loss": 4.7227, + "step": 70870 + }, + { + "epoch": 1.44195556640625, + "grad_norm": 18.968204498291016, + "learning_rate": 7.126573008508412e-06, + "loss": 4.9406, + "step": 70875 + }, + { + "epoch": 1.4420572916666667, + "grad_norm": 19.715770721435547, + "learning_rate": 7.126211275407923e-06, + "loss": 4.9686, + "step": 70880 + }, + { + "epoch": 1.4421590169270833, + "grad_norm": 12.360221862792969, + "learning_rate": 7.12584952872162e-06, + "loss": 5.222, + "step": 70885 + }, + { + "epoch": 1.4422607421875, + "grad_norm": 14.935615539550781, + "learning_rate": 7.125487768451813e-06, + "loss": 4.9094, + "step": 70890 + }, + { + "epoch": 1.4423624674479167, + "grad_norm": 16.08841323852539, + "learning_rate": 7.125125994600813e-06, + "loss": 5.0361, + "step": 70895 + }, + { + "epoch": 1.4424641927083333, + "grad_norm": 18.95411491394043, + "learning_rate": 7.124764207170934e-06, + "loss": 5.1116, + "step": 70900 + }, + { + "epoch": 1.44256591796875, + "grad_norm": 16.852264404296875, + "learning_rate": 7.124402406164486e-06, + "loss": 4.9996, + "step": 70905 + }, + { + "epoch": 1.4426676432291667, + "grad_norm": 18.651050567626953, + "learning_rate": 7.1240405915837805e-06, + "loss": 5.4376, + "step": 70910 + }, + { + "epoch": 1.4427693684895833, + "grad_norm": 16.329988479614258, + "learning_rate": 7.123678763431129e-06, + "loss": 4.8128, + "step": 70915 + }, + { + "epoch": 1.44287109375, + "grad_norm": 18.892301559448242, + "learning_rate": 7.123316921708846e-06, + "loss": 4.7743, + "step": 70920 + }, + { + "epoch": 1.4429728190104167, + "grad_norm": 24.438922882080078, + "learning_rate": 7.12295506641924e-06, + "loss": 5.2302, + "step": 70925 + }, + { + "epoch": 1.4430745442708333, + "grad_norm": 17.448570251464844, + "learning_rate": 7.122593197564627e-06, + "loss": 4.7853, + "step": 70930 + }, + { + "epoch": 1.44317626953125, + "grad_norm": 19.430688858032227, + "learning_rate": 7.1222313151473155e-06, + "loss": 5.145, + "step": 70935 + }, + { + "epoch": 1.4432779947916667, + "grad_norm": 12.112936973571777, + "learning_rate": 7.121869419169621e-06, + "loss": 4.9896, + "step": 70940 + }, + { + "epoch": 1.4433797200520833, + "grad_norm": 20.467315673828125, + "learning_rate": 7.121507509633854e-06, + "loss": 4.8107, + "step": 70945 + }, + { + "epoch": 1.4434814453125, + "grad_norm": 16.172983169555664, + "learning_rate": 7.1211455865423275e-06, + "loss": 4.9915, + "step": 70950 + }, + { + "epoch": 1.4435831705729167, + "grad_norm": 19.74553680419922, + "learning_rate": 7.120783649897355e-06, + "loss": 4.8181, + "step": 70955 + }, + { + "epoch": 1.4436848958333333, + "grad_norm": 16.506643295288086, + "learning_rate": 7.120421699701246e-06, + "loss": 4.9618, + "step": 70960 + }, + { + "epoch": 1.44378662109375, + "grad_norm": 25.35239028930664, + "learning_rate": 7.120059735956317e-06, + "loss": 4.778, + "step": 70965 + }, + { + "epoch": 1.4438883463541667, + "grad_norm": 18.12531852722168, + "learning_rate": 7.119697758664879e-06, + "loss": 5.0301, + "step": 70970 + }, + { + "epoch": 1.4439900716145833, + "grad_norm": 24.169708251953125, + "learning_rate": 7.119335767829245e-06, + "loss": 4.812, + "step": 70975 + }, + { + "epoch": 1.444091796875, + "grad_norm": 22.952672958374023, + "learning_rate": 7.118973763451728e-06, + "loss": 5.2085, + "step": 70980 + }, + { + "epoch": 1.4441935221354167, + "grad_norm": 18.243186950683594, + "learning_rate": 7.118611745534643e-06, + "loss": 5.009, + "step": 70985 + }, + { + "epoch": 1.4442952473958333, + "grad_norm": 14.558023452758789, + "learning_rate": 7.1182497140803e-06, + "loss": 4.9346, + "step": 70990 + }, + { + "epoch": 1.44439697265625, + "grad_norm": 17.325529098510742, + "learning_rate": 7.117887669091013e-06, + "loss": 4.6483, + "step": 70995 + }, + { + "epoch": 1.4444986979166667, + "grad_norm": 16.67292594909668, + "learning_rate": 7.117525610569098e-06, + "loss": 4.7345, + "step": 71000 + }, + { + "epoch": 1.4446004231770833, + "grad_norm": 16.118234634399414, + "learning_rate": 7.117163538516866e-06, + "loss": 4.8736, + "step": 71005 + }, + { + "epoch": 1.4447021484375, + "grad_norm": 13.215571403503418, + "learning_rate": 7.11680145293663e-06, + "loss": 4.8391, + "step": 71010 + }, + { + "epoch": 1.4448038736979167, + "grad_norm": 18.854915618896484, + "learning_rate": 7.116439353830706e-06, + "loss": 4.8586, + "step": 71015 + }, + { + "epoch": 1.4449055989583333, + "grad_norm": 14.870781898498535, + "learning_rate": 7.116077241201404e-06, + "loss": 4.8693, + "step": 71020 + }, + { + "epoch": 1.44500732421875, + "grad_norm": 17.938440322875977, + "learning_rate": 7.115715115051043e-06, + "loss": 4.7478, + "step": 71025 + }, + { + "epoch": 1.4451090494791667, + "grad_norm": 18.639711380004883, + "learning_rate": 7.115352975381933e-06, + "loss": 4.897, + "step": 71030 + }, + { + "epoch": 1.4452107747395833, + "grad_norm": 12.650525093078613, + "learning_rate": 7.114990822196388e-06, + "loss": 4.6773, + "step": 71035 + }, + { + "epoch": 1.4453125, + "grad_norm": 13.065707206726074, + "learning_rate": 7.114628655496725e-06, + "loss": 4.8986, + "step": 71040 + }, + { + "epoch": 1.4454142252604167, + "grad_norm": 20.965669631958008, + "learning_rate": 7.114266475285254e-06, + "loss": 4.7689, + "step": 71045 + }, + { + "epoch": 1.4455159505208333, + "grad_norm": 17.57986831665039, + "learning_rate": 7.113904281564294e-06, + "loss": 4.9073, + "step": 71050 + }, + { + "epoch": 1.44561767578125, + "grad_norm": 20.7705135345459, + "learning_rate": 7.113542074336154e-06, + "loss": 4.8997, + "step": 71055 + }, + { + "epoch": 1.4457194010416667, + "grad_norm": 15.069085121154785, + "learning_rate": 7.113179853603152e-06, + "loss": 4.8891, + "step": 71060 + }, + { + "epoch": 1.4458211263020833, + "grad_norm": 12.871495246887207, + "learning_rate": 7.1128176193676025e-06, + "loss": 4.9482, + "step": 71065 + }, + { + "epoch": 1.4459228515625, + "grad_norm": 23.400461196899414, + "learning_rate": 7.112455371631818e-06, + "loss": 4.9676, + "step": 71070 + }, + { + "epoch": 1.4460245768229167, + "grad_norm": 24.50309944152832, + "learning_rate": 7.112093110398114e-06, + "loss": 4.9316, + "step": 71075 + }, + { + "epoch": 1.4461263020833333, + "grad_norm": 20.296083450317383, + "learning_rate": 7.111730835668806e-06, + "loss": 4.7495, + "step": 71080 + }, + { + "epoch": 1.44622802734375, + "grad_norm": 15.118896484375, + "learning_rate": 7.111368547446209e-06, + "loss": 4.8031, + "step": 71085 + }, + { + "epoch": 1.4463297526041667, + "grad_norm": 14.79344654083252, + "learning_rate": 7.111006245732637e-06, + "loss": 4.6669, + "step": 71090 + }, + { + "epoch": 1.4464314778645833, + "grad_norm": 17.28891944885254, + "learning_rate": 7.110643930530405e-06, + "loss": 4.8015, + "step": 71095 + }, + { + "epoch": 1.446533203125, + "grad_norm": 15.705299377441406, + "learning_rate": 7.110281601841827e-06, + "loss": 4.8605, + "step": 71100 + }, + { + "epoch": 1.4466349283854167, + "grad_norm": 21.48638153076172, + "learning_rate": 7.109919259669221e-06, + "loss": 5.105, + "step": 71105 + }, + { + "epoch": 1.4467366536458333, + "grad_norm": 14.385643005371094, + "learning_rate": 7.109556904014902e-06, + "loss": 4.8291, + "step": 71110 + }, + { + "epoch": 1.44683837890625, + "grad_norm": 18.105064392089844, + "learning_rate": 7.1091945348811806e-06, + "loss": 4.7317, + "step": 71115 + }, + { + "epoch": 1.4469401041666667, + "grad_norm": 15.592559814453125, + "learning_rate": 7.108832152270378e-06, + "loss": 5.0348, + "step": 71120 + }, + { + "epoch": 1.4470418294270833, + "grad_norm": 16.877368927001953, + "learning_rate": 7.108469756184807e-06, + "loss": 4.9378, + "step": 71125 + }, + { + "epoch": 1.4471435546875, + "grad_norm": 20.460359573364258, + "learning_rate": 7.108107346626784e-06, + "loss": 4.8519, + "step": 71130 + }, + { + "epoch": 1.4472452799479167, + "grad_norm": 13.917352676391602, + "learning_rate": 7.107744923598624e-06, + "loss": 4.8608, + "step": 71135 + }, + { + "epoch": 1.4473470052083333, + "grad_norm": 18.75516700744629, + "learning_rate": 7.107382487102642e-06, + "loss": 4.9501, + "step": 71140 + }, + { + "epoch": 1.44744873046875, + "grad_norm": 18.386783599853516, + "learning_rate": 7.107020037141156e-06, + "loss": 4.7849, + "step": 71145 + }, + { + "epoch": 1.4475504557291667, + "grad_norm": 17.691415786743164, + "learning_rate": 7.10665757371648e-06, + "loss": 4.7274, + "step": 71150 + }, + { + "epoch": 1.4476521809895833, + "grad_norm": 16.92501449584961, + "learning_rate": 7.106295096830931e-06, + "loss": 4.8833, + "step": 71155 + }, + { + "epoch": 1.44775390625, + "grad_norm": 22.46967315673828, + "learning_rate": 7.105932606486826e-06, + "loss": 4.8684, + "step": 71160 + }, + { + "epoch": 1.4478556315104167, + "grad_norm": 19.101287841796875, + "learning_rate": 7.105570102686479e-06, + "loss": 4.9997, + "step": 71165 + }, + { + "epoch": 1.4479573567708333, + "grad_norm": 22.051801681518555, + "learning_rate": 7.105207585432208e-06, + "loss": 5.3084, + "step": 71170 + }, + { + "epoch": 1.44805908203125, + "grad_norm": 18.027021408081055, + "learning_rate": 7.104845054726329e-06, + "loss": 5.0046, + "step": 71175 + }, + { + "epoch": 1.4481608072916667, + "grad_norm": 13.276820182800293, + "learning_rate": 7.1044825105711575e-06, + "loss": 4.9136, + "step": 71180 + }, + { + "epoch": 1.4482625325520833, + "grad_norm": 13.76736831665039, + "learning_rate": 7.104119952969011e-06, + "loss": 4.8998, + "step": 71185 + }, + { + "epoch": 1.4483642578125, + "grad_norm": 24.500303268432617, + "learning_rate": 7.103757381922206e-06, + "loss": 5.058, + "step": 71190 + }, + { + "epoch": 1.4484659830729167, + "grad_norm": 19.381914138793945, + "learning_rate": 7.103394797433058e-06, + "loss": 4.9265, + "step": 71195 + }, + { + "epoch": 1.4485677083333333, + "grad_norm": 13.456445693969727, + "learning_rate": 7.103032199503887e-06, + "loss": 4.849, + "step": 71200 + }, + { + "epoch": 1.44866943359375, + "grad_norm": 17.439741134643555, + "learning_rate": 7.102669588137007e-06, + "loss": 4.8368, + "step": 71205 + }, + { + "epoch": 1.4487711588541667, + "grad_norm": 18.81690216064453, + "learning_rate": 7.102306963334736e-06, + "loss": 5.0566, + "step": 71210 + }, + { + "epoch": 1.4488728841145833, + "grad_norm": 16.053478240966797, + "learning_rate": 7.10194432509939e-06, + "loss": 4.6743, + "step": 71215 + }, + { + "epoch": 1.448974609375, + "grad_norm": 17.468082427978516, + "learning_rate": 7.1015816734332885e-06, + "loss": 4.8716, + "step": 71220 + }, + { + "epoch": 1.4490763346354167, + "grad_norm": 17.121257781982422, + "learning_rate": 7.1012190083387465e-06, + "loss": 4.8812, + "step": 71225 + }, + { + "epoch": 1.4491780598958333, + "grad_norm": 21.328369140625, + "learning_rate": 7.1008563298180826e-06, + "loss": 5.0648, + "step": 71230 + }, + { + "epoch": 1.44927978515625, + "grad_norm": 22.888797760009766, + "learning_rate": 7.100493637873613e-06, + "loss": 4.8006, + "step": 71235 + }, + { + "epoch": 1.4493815104166667, + "grad_norm": 24.984724044799805, + "learning_rate": 7.100130932507655e-06, + "loss": 4.9812, + "step": 71240 + }, + { + "epoch": 1.4494832356770833, + "grad_norm": 18.56355094909668, + "learning_rate": 7.099768213722528e-06, + "loss": 4.6982, + "step": 71245 + }, + { + "epoch": 1.4495849609375, + "grad_norm": 21.79134750366211, + "learning_rate": 7.099405481520549e-06, + "loss": 4.8674, + "step": 71250 + }, + { + "epoch": 1.4496866861979167, + "grad_norm": 20.536685943603516, + "learning_rate": 7.099042735904035e-06, + "loss": 4.7256, + "step": 71255 + }, + { + "epoch": 1.4497884114583333, + "grad_norm": 23.381540298461914, + "learning_rate": 7.098679976875305e-06, + "loss": 4.9893, + "step": 71260 + }, + { + "epoch": 1.44989013671875, + "grad_norm": 12.625761032104492, + "learning_rate": 7.0983172044366756e-06, + "loss": 4.7786, + "step": 71265 + }, + { + "epoch": 1.4499918619791667, + "grad_norm": 19.462419509887695, + "learning_rate": 7.097954418590465e-06, + "loss": 5.0869, + "step": 71270 + }, + { + "epoch": 1.4500935872395833, + "grad_norm": 18.73729133605957, + "learning_rate": 7.0975916193389926e-06, + "loss": 5.13, + "step": 71275 + }, + { + "epoch": 1.4501953125, + "grad_norm": 21.923574447631836, + "learning_rate": 7.097228806684574e-06, + "loss": 5.2021, + "step": 71280 + }, + { + "epoch": 1.4502970377604167, + "grad_norm": 17.081769943237305, + "learning_rate": 7.096865980629531e-06, + "loss": 4.8663, + "step": 71285 + }, + { + "epoch": 1.4503987630208333, + "grad_norm": 14.311968803405762, + "learning_rate": 7.096503141176179e-06, + "loss": 5.0708, + "step": 71290 + }, + { + "epoch": 1.45050048828125, + "grad_norm": 14.64067554473877, + "learning_rate": 7.096140288326836e-06, + "loss": 4.7519, + "step": 71295 + }, + { + "epoch": 1.4506022135416667, + "grad_norm": 15.965209007263184, + "learning_rate": 7.095777422083824e-06, + "loss": 5.2265, + "step": 71300 + }, + { + "epoch": 1.4507039388020833, + "grad_norm": 16.34051513671875, + "learning_rate": 7.095414542449459e-06, + "loss": 4.6334, + "step": 71305 + }, + { + "epoch": 1.4508056640625, + "grad_norm": 17.810060501098633, + "learning_rate": 7.0950516494260604e-06, + "loss": 4.7334, + "step": 71310 + }, + { + "epoch": 1.4509073893229167, + "grad_norm": 21.344030380249023, + "learning_rate": 7.0946887430159475e-06, + "loss": 5.013, + "step": 71315 + }, + { + "epoch": 1.4510091145833333, + "grad_norm": 14.71314525604248, + "learning_rate": 7.094325823221437e-06, + "loss": 4.9278, + "step": 71320 + }, + { + "epoch": 1.45111083984375, + "grad_norm": 21.26120376586914, + "learning_rate": 7.093962890044849e-06, + "loss": 4.8126, + "step": 71325 + }, + { + "epoch": 1.4512125651041667, + "grad_norm": 15.77908706665039, + "learning_rate": 7.093599943488504e-06, + "loss": 4.8255, + "step": 71330 + }, + { + "epoch": 1.4513142903645833, + "grad_norm": 17.419387817382812, + "learning_rate": 7.093236983554719e-06, + "loss": 5.0872, + "step": 71335 + }, + { + "epoch": 1.451416015625, + "grad_norm": 17.243648529052734, + "learning_rate": 7.092874010245815e-06, + "loss": 4.832, + "step": 71340 + }, + { + "epoch": 1.4515177408854167, + "grad_norm": 20.655826568603516, + "learning_rate": 7.0925110235641105e-06, + "loss": 4.9216, + "step": 71345 + }, + { + "epoch": 1.4516194661458333, + "grad_norm": 13.361841201782227, + "learning_rate": 7.092148023511924e-06, + "loss": 4.8569, + "step": 71350 + }, + { + "epoch": 1.45172119140625, + "grad_norm": 13.402573585510254, + "learning_rate": 7.091785010091578e-06, + "loss": 4.9731, + "step": 71355 + }, + { + "epoch": 1.4518229166666667, + "grad_norm": 14.072593688964844, + "learning_rate": 7.091421983305387e-06, + "loss": 4.8765, + "step": 71360 + }, + { + "epoch": 1.4519246419270833, + "grad_norm": 20.23179817199707, + "learning_rate": 7.091058943155675e-06, + "loss": 4.836, + "step": 71365 + }, + { + "epoch": 1.4520263671875, + "grad_norm": 20.975984573364258, + "learning_rate": 7.0906958896447585e-06, + "loss": 4.9133, + "step": 71370 + }, + { + "epoch": 1.4521280924479167, + "grad_norm": 19.210283279418945, + "learning_rate": 7.090332822774961e-06, + "loss": 4.9377, + "step": 71375 + }, + { + "epoch": 1.4522298177083333, + "grad_norm": 15.90380859375, + "learning_rate": 7.0899697425486e-06, + "loss": 4.7735, + "step": 71380 + }, + { + "epoch": 1.45233154296875, + "grad_norm": 17.969959259033203, + "learning_rate": 7.089606648967995e-06, + "loss": 5.1164, + "step": 71385 + }, + { + "epoch": 1.4524332682291667, + "grad_norm": 16.089672088623047, + "learning_rate": 7.089243542035467e-06, + "loss": 4.7981, + "step": 71390 + }, + { + "epoch": 1.4525349934895833, + "grad_norm": 16.107803344726562, + "learning_rate": 7.088880421753337e-06, + "loss": 4.7885, + "step": 71395 + }, + { + "epoch": 1.45263671875, + "grad_norm": 15.108043670654297, + "learning_rate": 7.088517288123923e-06, + "loss": 4.8912, + "step": 71400 + }, + { + "epoch": 1.4527384440104167, + "grad_norm": 27.74049949645996, + "learning_rate": 7.088154141149547e-06, + "loss": 4.832, + "step": 71405 + }, + { + "epoch": 1.4528401692708333, + "grad_norm": 18.846479415893555, + "learning_rate": 7.087790980832528e-06, + "loss": 4.9712, + "step": 71410 + }, + { + "epoch": 1.45294189453125, + "grad_norm": 20.76891326904297, + "learning_rate": 7.087427807175189e-06, + "loss": 5.1719, + "step": 71415 + }, + { + "epoch": 1.4530436197916667, + "grad_norm": 16.015499114990234, + "learning_rate": 7.087064620179849e-06, + "loss": 4.8578, + "step": 71420 + }, + { + "epoch": 1.4531453450520833, + "grad_norm": 12.305472373962402, + "learning_rate": 7.086701419848826e-06, + "loss": 4.9407, + "step": 71425 + }, + { + "epoch": 1.4532470703125, + "grad_norm": 20.29728126525879, + "learning_rate": 7.086338206184446e-06, + "loss": 4.8086, + "step": 71430 + }, + { + "epoch": 1.4533487955729167, + "grad_norm": 18.900901794433594, + "learning_rate": 7.085974979189025e-06, + "loss": 4.8375, + "step": 71435 + }, + { + "epoch": 1.4534505208333333, + "grad_norm": 22.276987075805664, + "learning_rate": 7.085611738864888e-06, + "loss": 5.1241, + "step": 71440 + }, + { + "epoch": 1.45355224609375, + "grad_norm": 17.125707626342773, + "learning_rate": 7.0852484852143535e-06, + "loss": 4.9868, + "step": 71445 + }, + { + "epoch": 1.4536539713541667, + "grad_norm": 19.883460998535156, + "learning_rate": 7.0848852182397434e-06, + "loss": 4.8146, + "step": 71450 + }, + { + "epoch": 1.4537556966145833, + "grad_norm": 17.70098304748535, + "learning_rate": 7.084521937943379e-06, + "loss": 5.0125, + "step": 71455 + }, + { + "epoch": 1.453857421875, + "grad_norm": 16.302690505981445, + "learning_rate": 7.084158644327579e-06, + "loss": 4.7599, + "step": 71460 + }, + { + "epoch": 1.4539591471354167, + "grad_norm": 16.25685691833496, + "learning_rate": 7.0837953373946676e-06, + "loss": 4.8936, + "step": 71465 + }, + { + "epoch": 1.4540608723958333, + "grad_norm": 18.316129684448242, + "learning_rate": 7.083432017146966e-06, + "loss": 4.7376, + "step": 71470 + }, + { + "epoch": 1.45416259765625, + "grad_norm": 19.38409996032715, + "learning_rate": 7.083068683586795e-06, + "loss": 4.8726, + "step": 71475 + }, + { + "epoch": 1.4542643229166667, + "grad_norm": 17.437570571899414, + "learning_rate": 7.082705336716477e-06, + "loss": 5.0425, + "step": 71480 + }, + { + "epoch": 1.4543660481770833, + "grad_norm": 22.832656860351562, + "learning_rate": 7.082341976538332e-06, + "loss": 4.9722, + "step": 71485 + }, + { + "epoch": 1.4544677734375, + "grad_norm": 19.285524368286133, + "learning_rate": 7.081978603054683e-06, + "loss": 5.0322, + "step": 71490 + }, + { + "epoch": 1.4545694986979167, + "grad_norm": 24.666120529174805, + "learning_rate": 7.081615216267852e-06, + "loss": 4.9326, + "step": 71495 + }, + { + "epoch": 1.4546712239583333, + "grad_norm": 15.598616600036621, + "learning_rate": 7.081251816180161e-06, + "loss": 4.9331, + "step": 71500 + }, + { + "epoch": 1.45477294921875, + "grad_norm": 21.116626739501953, + "learning_rate": 7.08088840279393e-06, + "loss": 5.019, + "step": 71505 + }, + { + "epoch": 1.4548746744791667, + "grad_norm": 17.9234676361084, + "learning_rate": 7.080524976111486e-06, + "loss": 4.9163, + "step": 71510 + }, + { + "epoch": 1.4549763997395833, + "grad_norm": 18.063913345336914, + "learning_rate": 7.080161536135144e-06, + "loss": 4.8822, + "step": 71515 + }, + { + "epoch": 1.455078125, + "grad_norm": 19.425552368164062, + "learning_rate": 7.079798082867233e-06, + "loss": 4.9454, + "step": 71520 + }, + { + "epoch": 1.4551798502604167, + "grad_norm": 25.857446670532227, + "learning_rate": 7.0794346163100705e-06, + "loss": 4.5852, + "step": 71525 + }, + { + "epoch": 1.4552815755208333, + "grad_norm": 17.52020263671875, + "learning_rate": 7.0790711364659825e-06, + "loss": 4.8608, + "step": 71530 + }, + { + "epoch": 1.45538330078125, + "grad_norm": 16.728107452392578, + "learning_rate": 7.078707643337289e-06, + "loss": 4.9154, + "step": 71535 + }, + { + "epoch": 1.4554850260416667, + "grad_norm": 17.944856643676758, + "learning_rate": 7.078344136926313e-06, + "loss": 5.181, + "step": 71540 + }, + { + "epoch": 1.4555867513020833, + "grad_norm": 16.372209548950195, + "learning_rate": 7.077980617235378e-06, + "loss": 5.0844, + "step": 71545 + }, + { + "epoch": 1.4556884765625, + "grad_norm": 16.129165649414062, + "learning_rate": 7.077617084266808e-06, + "loss": 4.9957, + "step": 71550 + }, + { + "epoch": 1.4557902018229167, + "grad_norm": 21.480911254882812, + "learning_rate": 7.077253538022922e-06, + "loss": 4.8959, + "step": 71555 + }, + { + "epoch": 1.4558919270833333, + "grad_norm": 15.282299041748047, + "learning_rate": 7.076889978506048e-06, + "loss": 4.73, + "step": 71560 + }, + { + "epoch": 1.45599365234375, + "grad_norm": 15.872671127319336, + "learning_rate": 7.076526405718503e-06, + "loss": 4.8881, + "step": 71565 + }, + { + "epoch": 1.4560953776041667, + "grad_norm": 25.62542724609375, + "learning_rate": 7.076162819662616e-06, + "loss": 4.9422, + "step": 71570 + }, + { + "epoch": 1.4561971028645833, + "grad_norm": 15.270153045654297, + "learning_rate": 7.075799220340707e-06, + "loss": 4.8457, + "step": 71575 + }, + { + "epoch": 1.456298828125, + "grad_norm": 18.649986267089844, + "learning_rate": 7.0754356077551e-06, + "loss": 5.0722, + "step": 71580 + }, + { + "epoch": 1.4564005533854167, + "grad_norm": 19.420515060424805, + "learning_rate": 7.0750719819081186e-06, + "loss": 4.9804, + "step": 71585 + }, + { + "epoch": 1.4565022786458333, + "grad_norm": 17.624042510986328, + "learning_rate": 7.0747083428020845e-06, + "loss": 4.9354, + "step": 71590 + }, + { + "epoch": 1.45660400390625, + "grad_norm": 16.996662139892578, + "learning_rate": 7.074344690439323e-06, + "loss": 5.1204, + "step": 71595 + }, + { + "epoch": 1.4567057291666667, + "grad_norm": 16.21567153930664, + "learning_rate": 7.073981024822158e-06, + "loss": 5.0993, + "step": 71600 + }, + { + "epoch": 1.4568074544270833, + "grad_norm": 22.619426727294922, + "learning_rate": 7.073617345952913e-06, + "loss": 5.0332, + "step": 71605 + }, + { + "epoch": 1.4569091796875, + "grad_norm": 19.393125534057617, + "learning_rate": 7.0732536538339116e-06, + "loss": 4.7989, + "step": 71610 + }, + { + "epoch": 1.4570109049479167, + "grad_norm": 23.188243865966797, + "learning_rate": 7.0728899484674775e-06, + "loss": 5.1705, + "step": 71615 + }, + { + "epoch": 1.4571126302083333, + "grad_norm": 20.103885650634766, + "learning_rate": 7.072526229855934e-06, + "loss": 5.0151, + "step": 71620 + }, + { + "epoch": 1.45721435546875, + "grad_norm": 16.894268035888672, + "learning_rate": 7.072162498001606e-06, + "loss": 5.1711, + "step": 71625 + }, + { + "epoch": 1.4573160807291667, + "grad_norm": 18.1129093170166, + "learning_rate": 7.071798752906817e-06, + "loss": 4.8067, + "step": 71630 + }, + { + "epoch": 1.4574178059895833, + "grad_norm": 14.307183265686035, + "learning_rate": 7.071434994573892e-06, + "loss": 5.453, + "step": 71635 + }, + { + "epoch": 1.45751953125, + "grad_norm": 21.73395347595215, + "learning_rate": 7.0710712230051545e-06, + "loss": 5.1303, + "step": 71640 + }, + { + "epoch": 1.4576212565104167, + "grad_norm": 17.409400939941406, + "learning_rate": 7.0707074382029305e-06, + "loss": 4.9578, + "step": 71645 + }, + { + "epoch": 1.4577229817708333, + "grad_norm": 18.054798126220703, + "learning_rate": 7.070343640169543e-06, + "loss": 4.7883, + "step": 71650 + }, + { + "epoch": 1.45782470703125, + "grad_norm": 15.03065299987793, + "learning_rate": 7.0699798289073164e-06, + "loss": 4.8577, + "step": 71655 + }, + { + "epoch": 1.4579264322916667, + "grad_norm": 16.46623420715332, + "learning_rate": 7.069616004418576e-06, + "loss": 4.975, + "step": 71660 + }, + { + "epoch": 1.4580281575520833, + "grad_norm": 18.54180335998535, + "learning_rate": 7.069252166705646e-06, + "loss": 5.4279, + "step": 71665 + }, + { + "epoch": 1.4581298828125, + "grad_norm": 16.322856903076172, + "learning_rate": 7.068888315770851e-06, + "loss": 4.9614, + "step": 71670 + }, + { + "epoch": 1.4582316080729167, + "grad_norm": 17.697343826293945, + "learning_rate": 7.068524451616517e-06, + "loss": 4.8644, + "step": 71675 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 16.402650833129883, + "learning_rate": 7.0681605742449686e-06, + "loss": 4.7497, + "step": 71680 + }, + { + "epoch": 1.45843505859375, + "grad_norm": 20.15351104736328, + "learning_rate": 7.067796683658531e-06, + "loss": 4.6353, + "step": 71685 + }, + { + "epoch": 1.4585367838541667, + "grad_norm": 20.451683044433594, + "learning_rate": 7.067432779859529e-06, + "loss": 4.9243, + "step": 71690 + }, + { + "epoch": 1.4586385091145833, + "grad_norm": 18.6600284576416, + "learning_rate": 7.067068862850287e-06, + "loss": 5.0954, + "step": 71695 + }, + { + "epoch": 1.458740234375, + "grad_norm": 15.15963363647461, + "learning_rate": 7.066704932633132e-06, + "loss": 4.8219, + "step": 71700 + }, + { + "epoch": 1.4588419596354167, + "grad_norm": 15.314087867736816, + "learning_rate": 7.0663409892103885e-06, + "loss": 4.8157, + "step": 71705 + }, + { + "epoch": 1.4589436848958333, + "grad_norm": 18.190332412719727, + "learning_rate": 7.065977032584381e-06, + "loss": 4.8384, + "step": 71710 + }, + { + "epoch": 1.45904541015625, + "grad_norm": 18.68830108642578, + "learning_rate": 7.065613062757437e-06, + "loss": 4.8312, + "step": 71715 + }, + { + "epoch": 1.4591471354166667, + "grad_norm": 21.65420150756836, + "learning_rate": 7.065249079731881e-06, + "loss": 4.8353, + "step": 71720 + }, + { + "epoch": 1.4592488606770833, + "grad_norm": 14.85754108428955, + "learning_rate": 7.064885083510038e-06, + "loss": 4.9439, + "step": 71725 + }, + { + "epoch": 1.4593505859375, + "grad_norm": 20.020551681518555, + "learning_rate": 7.064521074094236e-06, + "loss": 4.871, + "step": 71730 + }, + { + "epoch": 1.4594523111979167, + "grad_norm": 15.314082145690918, + "learning_rate": 7.0641570514867985e-06, + "loss": 5.1616, + "step": 71735 + }, + { + "epoch": 1.4595540364583333, + "grad_norm": 49.546024322509766, + "learning_rate": 7.063793015690055e-06, + "loss": 4.9751, + "step": 71740 + }, + { + "epoch": 1.45965576171875, + "grad_norm": 13.244540214538574, + "learning_rate": 7.063428966706326e-06, + "loss": 4.9052, + "step": 71745 + }, + { + "epoch": 1.4597574869791667, + "grad_norm": 16.237213134765625, + "learning_rate": 7.063064904537944e-06, + "loss": 4.8906, + "step": 71750 + }, + { + "epoch": 1.4598592122395833, + "grad_norm": 19.5422306060791, + "learning_rate": 7.062700829187231e-06, + "loss": 4.9032, + "step": 71755 + }, + { + "epoch": 1.4599609375, + "grad_norm": 31.93084716796875, + "learning_rate": 7.062336740656513e-06, + "loss": 4.8437, + "step": 71760 + }, + { + "epoch": 1.4600626627604167, + "grad_norm": 12.913262367248535, + "learning_rate": 7.06197263894812e-06, + "loss": 4.9961, + "step": 71765 + }, + { + "epoch": 1.4601643880208333, + "grad_norm": 17.202129364013672, + "learning_rate": 7.0616085240643765e-06, + "loss": 4.9834, + "step": 71770 + }, + { + "epoch": 1.46026611328125, + "grad_norm": 19.656583786010742, + "learning_rate": 7.061244396007608e-06, + "loss": 4.9747, + "step": 71775 + }, + { + "epoch": 1.4603678385416667, + "grad_norm": 20.563920974731445, + "learning_rate": 7.060880254780142e-06, + "loss": 4.9559, + "step": 71780 + }, + { + "epoch": 1.4604695638020833, + "grad_norm": 19.172576904296875, + "learning_rate": 7.0605161003843056e-06, + "loss": 4.8237, + "step": 71785 + }, + { + "epoch": 1.4605712890625, + "grad_norm": 37.84503173828125, + "learning_rate": 7.060151932822427e-06, + "loss": 4.7181, + "step": 71790 + }, + { + "epoch": 1.4606730143229167, + "grad_norm": 18.669097900390625, + "learning_rate": 7.05978775209683e-06, + "loss": 4.8961, + "step": 71795 + }, + { + "epoch": 1.4607747395833333, + "grad_norm": 20.3231258392334, + "learning_rate": 7.059423558209842e-06, + "loss": 4.773, + "step": 71800 + }, + { + "epoch": 1.46087646484375, + "grad_norm": 16.728010177612305, + "learning_rate": 7.059059351163793e-06, + "loss": 4.9773, + "step": 71805 + }, + { + "epoch": 1.4609781901041667, + "grad_norm": 16.742815017700195, + "learning_rate": 7.058695130961008e-06, + "loss": 5.0187, + "step": 71810 + }, + { + "epoch": 1.4610799153645833, + "grad_norm": 17.856136322021484, + "learning_rate": 7.058330897603815e-06, + "loss": 4.7067, + "step": 71815 + }, + { + "epoch": 1.461181640625, + "grad_norm": 20.520492553710938, + "learning_rate": 7.05796665109454e-06, + "loss": 4.747, + "step": 71820 + }, + { + "epoch": 1.4612833658854167, + "grad_norm": 22.44304084777832, + "learning_rate": 7.057602391435512e-06, + "loss": 5.0825, + "step": 71825 + }, + { + "epoch": 1.4613850911458333, + "grad_norm": 16.843374252319336, + "learning_rate": 7.057238118629057e-06, + "loss": 4.9021, + "step": 71830 + }, + { + "epoch": 1.46148681640625, + "grad_norm": 17.64876365661621, + "learning_rate": 7.0568738326775045e-06, + "loss": 4.772, + "step": 71835 + }, + { + "epoch": 1.4615885416666667, + "grad_norm": 17.8752384185791, + "learning_rate": 7.05650953358318e-06, + "loss": 4.978, + "step": 71840 + }, + { + "epoch": 1.4616902669270833, + "grad_norm": 19.40576934814453, + "learning_rate": 7.056145221348414e-06, + "loss": 4.7223, + "step": 71845 + }, + { + "epoch": 1.4617919921875, + "grad_norm": 17.67828369140625, + "learning_rate": 7.055780895975531e-06, + "loss": 5.0038, + "step": 71850 + }, + { + "epoch": 1.4618937174479167, + "grad_norm": 16.57682991027832, + "learning_rate": 7.0554165574668634e-06, + "loss": 5.0458, + "step": 71855 + }, + { + "epoch": 1.4619954427083333, + "grad_norm": 43.036949157714844, + "learning_rate": 7.055052205824733e-06, + "loss": 5.0272, + "step": 71860 + }, + { + "epoch": 1.46209716796875, + "grad_norm": 17.60871696472168, + "learning_rate": 7.054687841051474e-06, + "loss": 4.9187, + "step": 71865 + }, + { + "epoch": 1.4621988932291667, + "grad_norm": 13.98741340637207, + "learning_rate": 7.05432346314941e-06, + "loss": 4.8884, + "step": 71870 + }, + { + "epoch": 1.4623006184895833, + "grad_norm": 19.175615310668945, + "learning_rate": 7.053959072120872e-06, + "loss": 4.9911, + "step": 71875 + }, + { + "epoch": 1.46240234375, + "grad_norm": 15.587203979492188, + "learning_rate": 7.053594667968189e-06, + "loss": 4.9752, + "step": 71880 + }, + { + "epoch": 1.4625040690104167, + "grad_norm": 16.329845428466797, + "learning_rate": 7.053230250693686e-06, + "loss": 4.9029, + "step": 71885 + }, + { + "epoch": 1.4626057942708333, + "grad_norm": 20.965883255004883, + "learning_rate": 7.052865820299694e-06, + "loss": 5.0501, + "step": 71890 + }, + { + "epoch": 1.46270751953125, + "grad_norm": 19.773317337036133, + "learning_rate": 7.052501376788542e-06, + "loss": 4.9096, + "step": 71895 + }, + { + "epoch": 1.4628092447916667, + "grad_norm": 16.937971115112305, + "learning_rate": 7.052136920162556e-06, + "loss": 4.845, + "step": 71900 + }, + { + "epoch": 1.4629109700520833, + "grad_norm": 23.102466583251953, + "learning_rate": 7.0517724504240685e-06, + "loss": 4.8245, + "step": 71905 + }, + { + "epoch": 1.4630126953125, + "grad_norm": 17.869693756103516, + "learning_rate": 7.051407967575405e-06, + "loss": 4.9811, + "step": 71910 + }, + { + "epoch": 1.4631144205729167, + "grad_norm": 15.824360847473145, + "learning_rate": 7.051043471618895e-06, + "loss": 4.7146, + "step": 71915 + }, + { + "epoch": 1.4632161458333333, + "grad_norm": 16.75447654724121, + "learning_rate": 7.050678962556871e-06, + "loss": 4.7892, + "step": 71920 + }, + { + "epoch": 1.46331787109375, + "grad_norm": 15.213130950927734, + "learning_rate": 7.050314440391658e-06, + "loss": 4.8378, + "step": 71925 + }, + { + "epoch": 1.4634195963541667, + "grad_norm": 24.218978881835938, + "learning_rate": 7.049949905125586e-06, + "loss": 4.6921, + "step": 71930 + }, + { + "epoch": 1.4635213216145833, + "grad_norm": 13.917548179626465, + "learning_rate": 7.049585356760986e-06, + "loss": 4.903, + "step": 71935 + }, + { + "epoch": 1.463623046875, + "grad_norm": 15.268401145935059, + "learning_rate": 7.049220795300185e-06, + "loss": 4.8129, + "step": 71940 + }, + { + "epoch": 1.4637247721354167, + "grad_norm": 16.472084045410156, + "learning_rate": 7.048856220745515e-06, + "loss": 4.7745, + "step": 71945 + }, + { + "epoch": 1.4638264973958333, + "grad_norm": 17.99079132080078, + "learning_rate": 7.048491633099304e-06, + "loss": 4.8643, + "step": 71950 + }, + { + "epoch": 1.46392822265625, + "grad_norm": 19.4393253326416, + "learning_rate": 7.0481270323638796e-06, + "loss": 4.7331, + "step": 71955 + }, + { + "epoch": 1.4640299479166667, + "grad_norm": 25.602018356323242, + "learning_rate": 7.047762418541575e-06, + "loss": 4.6782, + "step": 71960 + }, + { + "epoch": 1.4641316731770833, + "grad_norm": 18.705896377563477, + "learning_rate": 7.047397791634718e-06, + "loss": 4.9961, + "step": 71965 + }, + { + "epoch": 1.4642333984375, + "grad_norm": 18.221614837646484, + "learning_rate": 7.04703315164564e-06, + "loss": 4.8202, + "step": 71970 + }, + { + "epoch": 1.4643351236979167, + "grad_norm": 18.62617301940918, + "learning_rate": 7.046668498576671e-06, + "loss": 4.7768, + "step": 71975 + }, + { + "epoch": 1.4644368489583333, + "grad_norm": 20.519948959350586, + "learning_rate": 7.046303832430138e-06, + "loss": 5.1111, + "step": 71980 + }, + { + "epoch": 1.46453857421875, + "grad_norm": 20.20980453491211, + "learning_rate": 7.045939153208374e-06, + "loss": 4.7657, + "step": 71985 + }, + { + "epoch": 1.4646402994791667, + "grad_norm": 20.027240753173828, + "learning_rate": 7.045574460913708e-06, + "loss": 5.0778, + "step": 71990 + }, + { + "epoch": 1.4647420247395833, + "grad_norm": 22.10848045349121, + "learning_rate": 7.04520975554847e-06, + "loss": 5.142, + "step": 71995 + }, + { + "epoch": 1.46484375, + "grad_norm": 21.941211700439453, + "learning_rate": 7.044845037114991e-06, + "loss": 4.945, + "step": 72000 + }, + { + "epoch": 1.4649454752604167, + "grad_norm": 17.833894729614258, + "learning_rate": 7.044480305615602e-06, + "loss": 4.8997, + "step": 72005 + }, + { + "epoch": 1.4650472005208333, + "grad_norm": 15.518209457397461, + "learning_rate": 7.044115561052632e-06, + "loss": 4.7365, + "step": 72010 + }, + { + "epoch": 1.46514892578125, + "grad_norm": 17.686317443847656, + "learning_rate": 7.043750803428414e-06, + "loss": 4.8932, + "step": 72015 + }, + { + "epoch": 1.4652506510416667, + "grad_norm": 21.569211959838867, + "learning_rate": 7.043386032745277e-06, + "loss": 5.258, + "step": 72020 + }, + { + "epoch": 1.4653523763020833, + "grad_norm": 13.944156646728516, + "learning_rate": 7.04302124900555e-06, + "loss": 4.9278, + "step": 72025 + }, + { + "epoch": 1.4654541015625, + "grad_norm": 17.147235870361328, + "learning_rate": 7.042656452211567e-06, + "loss": 5.1017, + "step": 72030 + }, + { + "epoch": 1.4655558268229167, + "grad_norm": 25.340959548950195, + "learning_rate": 7.042291642365658e-06, + "loss": 4.9584, + "step": 72035 + }, + { + "epoch": 1.4656575520833333, + "grad_norm": 18.86572265625, + "learning_rate": 7.041926819470154e-06, + "loss": 4.7773, + "step": 72040 + }, + { + "epoch": 1.46575927734375, + "grad_norm": 13.51602840423584, + "learning_rate": 7.041561983527384e-06, + "loss": 4.9802, + "step": 72045 + }, + { + "epoch": 1.4658610026041667, + "grad_norm": 20.314807891845703, + "learning_rate": 7.041197134539683e-06, + "loss": 4.9894, + "step": 72050 + }, + { + "epoch": 1.4659627278645833, + "grad_norm": 18.02315330505371, + "learning_rate": 7.040832272509379e-06, + "loss": 4.8908, + "step": 72055 + }, + { + "epoch": 1.466064453125, + "grad_norm": 18.945032119750977, + "learning_rate": 7.0404673974388045e-06, + "loss": 4.9882, + "step": 72060 + }, + { + "epoch": 1.4661661783854167, + "grad_norm": 16.265531539916992, + "learning_rate": 7.040102509330293e-06, + "loss": 5.0026, + "step": 72065 + }, + { + "epoch": 1.4662679036458333, + "grad_norm": 18.054302215576172, + "learning_rate": 7.0397376081861725e-06, + "loss": 4.8742, + "step": 72070 + }, + { + "epoch": 1.46636962890625, + "grad_norm": 16.02395248413086, + "learning_rate": 7.039372694008776e-06, + "loss": 4.9785, + "step": 72075 + }, + { + "epoch": 1.4664713541666667, + "grad_norm": 16.166263580322266, + "learning_rate": 7.039007766800436e-06, + "loss": 4.9043, + "step": 72080 + }, + { + "epoch": 1.4665730794270833, + "grad_norm": 18.185842514038086, + "learning_rate": 7.0386428265634835e-06, + "loss": 5.1578, + "step": 72085 + }, + { + "epoch": 1.4666748046875, + "grad_norm": 19.502422332763672, + "learning_rate": 7.038277873300251e-06, + "loss": 4.9178, + "step": 72090 + }, + { + "epoch": 1.4667765299479167, + "grad_norm": 43.3030891418457, + "learning_rate": 7.037912907013068e-06, + "loss": 4.9495, + "step": 72095 + }, + { + "epoch": 1.4668782552083333, + "grad_norm": 14.52762222290039, + "learning_rate": 7.0375479277042705e-06, + "loss": 5.0637, + "step": 72100 + }, + { + "epoch": 1.46697998046875, + "grad_norm": 14.50600528717041, + "learning_rate": 7.037182935376187e-06, + "loss": 4.8809, + "step": 72105 + }, + { + "epoch": 1.4670817057291667, + "grad_norm": 16.714990615844727, + "learning_rate": 7.036817930031152e-06, + "loss": 5.0537, + "step": 72110 + }, + { + "epoch": 1.4671834309895833, + "grad_norm": 19.375953674316406, + "learning_rate": 7.036452911671498e-06, + "loss": 5.143, + "step": 72115 + }, + { + "epoch": 1.46728515625, + "grad_norm": 18.78704833984375, + "learning_rate": 7.036087880299554e-06, + "loss": 4.7764, + "step": 72120 + }, + { + "epoch": 1.4673868815104167, + "grad_norm": 20.350311279296875, + "learning_rate": 7.035722835917658e-06, + "loss": 4.9354, + "step": 72125 + }, + { + "epoch": 1.4674886067708333, + "grad_norm": 24.770557403564453, + "learning_rate": 7.0353577785281364e-06, + "loss": 4.8128, + "step": 72130 + }, + { + "epoch": 1.46759033203125, + "grad_norm": 21.12101936340332, + "learning_rate": 7.034992708133326e-06, + "loss": 5.0263, + "step": 72135 + }, + { + "epoch": 1.4676920572916667, + "grad_norm": 18.348161697387695, + "learning_rate": 7.034627624735557e-06, + "loss": 5.1974, + "step": 72140 + }, + { + "epoch": 1.4677937825520833, + "grad_norm": 19.49179458618164, + "learning_rate": 7.034262528337166e-06, + "loss": 4.9604, + "step": 72145 + }, + { + "epoch": 1.4678955078125, + "grad_norm": 15.339620590209961, + "learning_rate": 7.0338974189404804e-06, + "loss": 4.831, + "step": 72150 + }, + { + "epoch": 1.4679972330729167, + "grad_norm": 19.855606079101562, + "learning_rate": 7.033532296547838e-06, + "loss": 4.7565, + "step": 72155 + }, + { + "epoch": 1.4680989583333333, + "grad_norm": 18.017221450805664, + "learning_rate": 7.033167161161568e-06, + "loss": 5.1467, + "step": 72160 + }, + { + "epoch": 1.46820068359375, + "grad_norm": 16.844900131225586, + "learning_rate": 7.032802012784006e-06, + "loss": 5.393, + "step": 72165 + }, + { + "epoch": 1.4683024088541667, + "grad_norm": 19.22842025756836, + "learning_rate": 7.032436851417485e-06, + "loss": 4.9039, + "step": 72170 + }, + { + "epoch": 1.4684041341145833, + "grad_norm": 14.812942504882812, + "learning_rate": 7.032071677064337e-06, + "loss": 5.038, + "step": 72175 + }, + { + "epoch": 1.468505859375, + "grad_norm": 21.167869567871094, + "learning_rate": 7.0317064897268975e-06, + "loss": 4.9413, + "step": 72180 + }, + { + "epoch": 1.4686075846354167, + "grad_norm": 18.649084091186523, + "learning_rate": 7.031341289407496e-06, + "loss": 4.989, + "step": 72185 + }, + { + "epoch": 1.4687093098958333, + "grad_norm": 16.065595626831055, + "learning_rate": 7.03097607610847e-06, + "loss": 4.8491, + "step": 72190 + }, + { + "epoch": 1.46881103515625, + "grad_norm": 18.11307716369629, + "learning_rate": 7.030610849832151e-06, + "loss": 4.8678, + "step": 72195 + }, + { + "epoch": 1.4689127604166667, + "grad_norm": 24.874826431274414, + "learning_rate": 7.030245610580873e-06, + "loss": 5.0427, + "step": 72200 + }, + { + "epoch": 1.4690144856770833, + "grad_norm": 13.57904052734375, + "learning_rate": 7.029880358356972e-06, + "loss": 5.0097, + "step": 72205 + }, + { + "epoch": 1.4691162109375, + "grad_norm": 16.590511322021484, + "learning_rate": 7.029515093162778e-06, + "loss": 4.8313, + "step": 72210 + }, + { + "epoch": 1.4692179361979167, + "grad_norm": 22.27662467956543, + "learning_rate": 7.029149815000628e-06, + "loss": 5.2684, + "step": 72215 + }, + { + "epoch": 1.4693196614583333, + "grad_norm": 19.298490524291992, + "learning_rate": 7.028784523872853e-06, + "loss": 5.2625, + "step": 72220 + }, + { + "epoch": 1.46942138671875, + "grad_norm": 16.950454711914062, + "learning_rate": 7.02841921978179e-06, + "loss": 4.8213, + "step": 72225 + }, + { + "epoch": 1.4695231119791667, + "grad_norm": 16.900415420532227, + "learning_rate": 7.0280539027297735e-06, + "loss": 5.0425, + "step": 72230 + }, + { + "epoch": 1.4696248372395833, + "grad_norm": 15.156232833862305, + "learning_rate": 7.027688572719135e-06, + "loss": 4.7881, + "step": 72235 + }, + { + "epoch": 1.4697265625, + "grad_norm": 19.56195068359375, + "learning_rate": 7.02732322975221e-06, + "loss": 4.9332, + "step": 72240 + }, + { + "epoch": 1.4698282877604167, + "grad_norm": 17.066362380981445, + "learning_rate": 7.026957873831334e-06, + "loss": 4.9387, + "step": 72245 + }, + { + "epoch": 1.4699300130208333, + "grad_norm": 19.52692985534668, + "learning_rate": 7.026592504958839e-06, + "loss": 4.8243, + "step": 72250 + }, + { + "epoch": 1.47003173828125, + "grad_norm": 19.477331161499023, + "learning_rate": 7.026227123137063e-06, + "loss": 5.1461, + "step": 72255 + }, + { + "epoch": 1.4701334635416667, + "grad_norm": 15.38021183013916, + "learning_rate": 7.025861728368337e-06, + "loss": 4.9547, + "step": 72260 + }, + { + "epoch": 1.4702351888020833, + "grad_norm": 22.929296493530273, + "learning_rate": 7.025496320654999e-06, + "loss": 5.0245, + "step": 72265 + }, + { + "epoch": 1.4703369140625, + "grad_norm": 27.462055206298828, + "learning_rate": 7.025130899999381e-06, + "loss": 4.6598, + "step": 72270 + }, + { + "epoch": 1.4704386393229167, + "grad_norm": 19.417200088500977, + "learning_rate": 7.024765466403822e-06, + "loss": 4.7664, + "step": 72275 + }, + { + "epoch": 1.4705403645833333, + "grad_norm": 18.57391357421875, + "learning_rate": 7.024400019870652e-06, + "loss": 4.896, + "step": 72280 + }, + { + "epoch": 1.47064208984375, + "grad_norm": 22.389841079711914, + "learning_rate": 7.024034560402211e-06, + "loss": 5.0676, + "step": 72285 + }, + { + "epoch": 1.4707438151041667, + "grad_norm": 16.576358795166016, + "learning_rate": 7.023669088000828e-06, + "loss": 4.8226, + "step": 72290 + }, + { + "epoch": 1.4708455403645833, + "grad_norm": 13.452556610107422, + "learning_rate": 7.023303602668845e-06, + "loss": 5.0035, + "step": 72295 + }, + { + "epoch": 1.470947265625, + "grad_norm": 14.739838600158691, + "learning_rate": 7.022938104408595e-06, + "loss": 4.6939, + "step": 72300 + }, + { + "epoch": 1.4710489908854167, + "grad_norm": 18.91817283630371, + "learning_rate": 7.02257259322241e-06, + "loss": 4.8002, + "step": 72305 + }, + { + "epoch": 1.4711507161458333, + "grad_norm": 17.644865036010742, + "learning_rate": 7.0222070691126295e-06, + "loss": 5.0901, + "step": 72310 + }, + { + "epoch": 1.47125244140625, + "grad_norm": 21.623119354248047, + "learning_rate": 7.021841532081587e-06, + "loss": 4.7184, + "step": 72315 + }, + { + "epoch": 1.4713541666666667, + "grad_norm": 19.776752471923828, + "learning_rate": 7.02147598213162e-06, + "loss": 4.9334, + "step": 72320 + }, + { + "epoch": 1.4714558919270833, + "grad_norm": 20.35223960876465, + "learning_rate": 7.021110419265062e-06, + "loss": 4.8707, + "step": 72325 + }, + { + "epoch": 1.4715576171875, + "grad_norm": 17.440303802490234, + "learning_rate": 7.020744843484251e-06, + "loss": 4.8141, + "step": 72330 + }, + { + "epoch": 1.4716593424479167, + "grad_norm": 17.537891387939453, + "learning_rate": 7.020379254791523e-06, + "loss": 4.9894, + "step": 72335 + }, + { + "epoch": 1.4717610677083333, + "grad_norm": 14.064760208129883, + "learning_rate": 7.02001365318921e-06, + "loss": 4.9633, + "step": 72340 + }, + { + "epoch": 1.47186279296875, + "grad_norm": 16.99104118347168, + "learning_rate": 7.019648038679653e-06, + "loss": 4.8194, + "step": 72345 + }, + { + "epoch": 1.4719645182291667, + "grad_norm": 13.464895248413086, + "learning_rate": 7.019282411265186e-06, + "loss": 4.8749, + "step": 72350 + }, + { + "epoch": 1.4720662434895833, + "grad_norm": 22.57817840576172, + "learning_rate": 7.0189167709481444e-06, + "loss": 4.6463, + "step": 72355 + }, + { + "epoch": 1.47216796875, + "grad_norm": 18.010705947875977, + "learning_rate": 7.018551117730867e-06, + "loss": 4.6788, + "step": 72360 + }, + { + "epoch": 1.4722696940104167, + "grad_norm": 17.68086814880371, + "learning_rate": 7.018185451615688e-06, + "loss": 5.0406, + "step": 72365 + }, + { + "epoch": 1.4723714192708333, + "grad_norm": 17.331417083740234, + "learning_rate": 7.017819772604943e-06, + "loss": 4.9377, + "step": 72370 + }, + { + "epoch": 1.47247314453125, + "grad_norm": 21.16865348815918, + "learning_rate": 7.0174540807009736e-06, + "loss": 5.0953, + "step": 72375 + }, + { + "epoch": 1.4725748697916667, + "grad_norm": 16.725738525390625, + "learning_rate": 7.01708837590611e-06, + "loss": 4.8588, + "step": 72380 + }, + { + "epoch": 1.4726765950520833, + "grad_norm": 24.83305549621582, + "learning_rate": 7.016722658222694e-06, + "loss": 4.6929, + "step": 72385 + }, + { + "epoch": 1.4727783203125, + "grad_norm": 16.902238845825195, + "learning_rate": 7.016356927653059e-06, + "loss": 4.877, + "step": 72390 + }, + { + "epoch": 1.4728800455729167, + "grad_norm": 24.647396087646484, + "learning_rate": 7.0159911841995445e-06, + "loss": 4.8479, + "step": 72395 + }, + { + "epoch": 1.4729817708333333, + "grad_norm": 18.02705955505371, + "learning_rate": 7.0156254278644854e-06, + "loss": 4.8939, + "step": 72400 + }, + { + "epoch": 1.47308349609375, + "grad_norm": 16.996973037719727, + "learning_rate": 7.01525965865022e-06, + "loss": 4.6943, + "step": 72405 + }, + { + "epoch": 1.4731852213541667, + "grad_norm": 19.53432273864746, + "learning_rate": 7.014893876559084e-06, + "loss": 4.8951, + "step": 72410 + }, + { + "epoch": 1.4732869466145833, + "grad_norm": 14.951403617858887, + "learning_rate": 7.014528081593417e-06, + "loss": 4.9246, + "step": 72415 + }, + { + "epoch": 1.473388671875, + "grad_norm": 19.257875442504883, + "learning_rate": 7.014162273755555e-06, + "loss": 5.0347, + "step": 72420 + }, + { + "epoch": 1.4734903971354167, + "grad_norm": 20.084020614624023, + "learning_rate": 7.013796453047836e-06, + "loss": 5.0022, + "step": 72425 + }, + { + "epoch": 1.4735921223958333, + "grad_norm": 13.420949935913086, + "learning_rate": 7.013430619472597e-06, + "loss": 4.8794, + "step": 72430 + }, + { + "epoch": 1.47369384765625, + "grad_norm": 16.211925506591797, + "learning_rate": 7.0130647730321735e-06, + "loss": 5.115, + "step": 72435 + }, + { + "epoch": 1.4737955729166667, + "grad_norm": 14.98132038116455, + "learning_rate": 7.012698913728906e-06, + "loss": 4.8893, + "step": 72440 + }, + { + "epoch": 1.4738972981770833, + "grad_norm": 14.220857620239258, + "learning_rate": 7.0123330415651316e-06, + "loss": 4.966, + "step": 72445 + }, + { + "epoch": 1.4739990234375, + "grad_norm": 23.893569946289062, + "learning_rate": 7.011967156543188e-06, + "loss": 4.998, + "step": 72450 + }, + { + "epoch": 1.4741007486979167, + "grad_norm": 19.454816818237305, + "learning_rate": 7.011601258665414e-06, + "loss": 5.008, + "step": 72455 + }, + { + "epoch": 1.4742024739583333, + "grad_norm": 16.485855102539062, + "learning_rate": 7.011235347934145e-06, + "loss": 4.8399, + "step": 72460 + }, + { + "epoch": 1.47430419921875, + "grad_norm": 14.770404815673828, + "learning_rate": 7.01086942435172e-06, + "loss": 4.5472, + "step": 72465 + }, + { + "epoch": 1.4744059244791667, + "grad_norm": 15.37906551361084, + "learning_rate": 7.010503487920479e-06, + "loss": 4.9034, + "step": 72470 + }, + { + "epoch": 1.4745076497395833, + "grad_norm": 24.649158477783203, + "learning_rate": 7.010137538642759e-06, + "loss": 4.9119, + "step": 72475 + }, + { + "epoch": 1.474609375, + "grad_norm": 15.092602729797363, + "learning_rate": 7.009771576520899e-06, + "loss": 5.0632, + "step": 72480 + }, + { + "epoch": 1.4747111002604167, + "grad_norm": 16.342172622680664, + "learning_rate": 7.009405601557235e-06, + "loss": 4.9249, + "step": 72485 + }, + { + "epoch": 1.4748128255208333, + "grad_norm": 18.286958694458008, + "learning_rate": 7.009039613754107e-06, + "loss": 4.9421, + "step": 72490 + }, + { + "epoch": 1.47491455078125, + "grad_norm": 17.118961334228516, + "learning_rate": 7.008673613113854e-06, + "loss": 5.0174, + "step": 72495 + }, + { + "epoch": 1.4750162760416667, + "grad_norm": 21.888036727905273, + "learning_rate": 7.008307599638815e-06, + "loss": 5.0715, + "step": 72500 + }, + { + "epoch": 1.4751180013020833, + "grad_norm": 17.64079475402832, + "learning_rate": 7.007941573331327e-06, + "loss": 4.8557, + "step": 72505 + }, + { + "epoch": 1.4752197265625, + "grad_norm": 15.661858558654785, + "learning_rate": 7.00757553419373e-06, + "loss": 4.7501, + "step": 72510 + }, + { + "epoch": 1.4753214518229167, + "grad_norm": 21.246726989746094, + "learning_rate": 7.007209482228363e-06, + "loss": 5.1021, + "step": 72515 + }, + { + "epoch": 1.4754231770833333, + "grad_norm": 12.29672622680664, + "learning_rate": 7.006843417437564e-06, + "loss": 4.7304, + "step": 72520 + }, + { + "epoch": 1.47552490234375, + "grad_norm": 19.885944366455078, + "learning_rate": 7.006477339823674e-06, + "loss": 4.7955, + "step": 72525 + }, + { + "epoch": 1.4756266276041667, + "grad_norm": 20.457887649536133, + "learning_rate": 7.00611124938903e-06, + "loss": 4.8417, + "step": 72530 + }, + { + "epoch": 1.4757283528645833, + "grad_norm": 15.285040855407715, + "learning_rate": 7.0057451461359714e-06, + "loss": 4.712, + "step": 72535 + }, + { + "epoch": 1.475830078125, + "grad_norm": 13.848225593566895, + "learning_rate": 7.005379030066837e-06, + "loss": 5.0054, + "step": 72540 + }, + { + "epoch": 1.4759318033854167, + "grad_norm": 17.46442985534668, + "learning_rate": 7.00501290118397e-06, + "loss": 4.9143, + "step": 72545 + }, + { + "epoch": 1.4760335286458333, + "grad_norm": 18.00299072265625, + "learning_rate": 7.004646759489704e-06, + "loss": 5.0253, + "step": 72550 + }, + { + "epoch": 1.47613525390625, + "grad_norm": 14.193243980407715, + "learning_rate": 7.004280604986385e-06, + "loss": 4.953, + "step": 72555 + }, + { + "epoch": 1.4762369791666667, + "grad_norm": 20.1549072265625, + "learning_rate": 7.003914437676345e-06, + "loss": 4.7964, + "step": 72560 + }, + { + "epoch": 1.4763387044270833, + "grad_norm": 16.47673988342285, + "learning_rate": 7.003548257561931e-06, + "loss": 4.802, + "step": 72565 + }, + { + "epoch": 1.4764404296875, + "grad_norm": 23.50078010559082, + "learning_rate": 7.003182064645479e-06, + "loss": 5.0524, + "step": 72570 + }, + { + "epoch": 1.4765421549479167, + "grad_norm": 21.050739288330078, + "learning_rate": 7.002815858929328e-06, + "loss": 5.1169, + "step": 72575 + }, + { + "epoch": 1.4766438802083333, + "grad_norm": 16.6168212890625, + "learning_rate": 7.002449640415823e-06, + "loss": 4.8358, + "step": 72580 + }, + { + "epoch": 1.47674560546875, + "grad_norm": 17.235509872436523, + "learning_rate": 7.002083409107298e-06, + "loss": 5.0513, + "step": 72585 + }, + { + "epoch": 1.4768473307291667, + "grad_norm": 20.478307723999023, + "learning_rate": 7.001717165006094e-06, + "loss": 4.9043, + "step": 72590 + }, + { + "epoch": 1.4769490559895833, + "grad_norm": 16.890050888061523, + "learning_rate": 7.001350908114556e-06, + "loss": 4.8426, + "step": 72595 + }, + { + "epoch": 1.47705078125, + "grad_norm": 18.665889739990234, + "learning_rate": 7.000984638435019e-06, + "loss": 5.0507, + "step": 72600 + }, + { + "epoch": 1.4771525065104167, + "grad_norm": 16.75867462158203, + "learning_rate": 7.000618355969826e-06, + "loss": 4.8362, + "step": 72605 + }, + { + "epoch": 1.4772542317708333, + "grad_norm": 18.124662399291992, + "learning_rate": 7.000252060721318e-06, + "loss": 4.7364, + "step": 72610 + }, + { + "epoch": 1.47735595703125, + "grad_norm": 17.850440979003906, + "learning_rate": 6.999885752691832e-06, + "loss": 4.7052, + "step": 72615 + }, + { + "epoch": 1.4774576822916667, + "grad_norm": 26.632062911987305, + "learning_rate": 6.999519431883712e-06, + "loss": 5.1868, + "step": 72620 + }, + { + "epoch": 1.4775594075520833, + "grad_norm": 23.338788986206055, + "learning_rate": 6.999153098299296e-06, + "loss": 4.8486, + "step": 72625 + }, + { + "epoch": 1.4776611328125, + "grad_norm": 19.843875885009766, + "learning_rate": 6.998786751940929e-06, + "loss": 4.8234, + "step": 72630 + }, + { + "epoch": 1.4777628580729167, + "grad_norm": 13.115408897399902, + "learning_rate": 6.9984203928109486e-06, + "loss": 4.6704, + "step": 72635 + }, + { + "epoch": 1.4778645833333333, + "grad_norm": 26.0207576751709, + "learning_rate": 6.998054020911694e-06, + "loss": 4.9996, + "step": 72640 + }, + { + "epoch": 1.47796630859375, + "grad_norm": 19.221351623535156, + "learning_rate": 6.997687636245511e-06, + "loss": 5.004, + "step": 72645 + }, + { + "epoch": 1.4780680338541667, + "grad_norm": 12.506311416625977, + "learning_rate": 6.997321238814738e-06, + "loss": 4.7667, + "step": 72650 + }, + { + "epoch": 1.4781697591145833, + "grad_norm": 14.479881286621094, + "learning_rate": 6.996954828621715e-06, + "loss": 4.8149, + "step": 72655 + }, + { + "epoch": 1.478271484375, + "grad_norm": 21.45755958557129, + "learning_rate": 6.996588405668784e-06, + "loss": 4.7705, + "step": 72660 + }, + { + "epoch": 1.4783732096354167, + "grad_norm": 12.121599197387695, + "learning_rate": 6.996221969958288e-06, + "loss": 4.888, + "step": 72665 + }, + { + "epoch": 1.4784749348958333, + "grad_norm": 22.905176162719727, + "learning_rate": 6.9958555214925676e-06, + "loss": 5.1298, + "step": 72670 + }, + { + "epoch": 1.47857666015625, + "grad_norm": 20.08753204345703, + "learning_rate": 6.995489060273963e-06, + "loss": 4.9854, + "step": 72675 + }, + { + "epoch": 1.4786783854166667, + "grad_norm": 14.738089561462402, + "learning_rate": 6.995122586304818e-06, + "loss": 5.1079, + "step": 72680 + }, + { + "epoch": 1.4787801106770833, + "grad_norm": 17.77048110961914, + "learning_rate": 6.994756099587472e-06, + "loss": 4.8791, + "step": 72685 + }, + { + "epoch": 1.4788818359375, + "grad_norm": 20.29347801208496, + "learning_rate": 6.994389600124267e-06, + "loss": 4.8824, + "step": 72690 + }, + { + "epoch": 1.4789835611979167, + "grad_norm": 22.088951110839844, + "learning_rate": 6.9940230879175475e-06, + "loss": 4.7752, + "step": 72695 + }, + { + "epoch": 1.4790852864583333, + "grad_norm": 17.939483642578125, + "learning_rate": 6.993656562969654e-06, + "loss": 4.9504, + "step": 72700 + }, + { + "epoch": 1.47918701171875, + "grad_norm": 20.602245330810547, + "learning_rate": 6.993290025282926e-06, + "loss": 5.0509, + "step": 72705 + }, + { + "epoch": 1.4792887369791667, + "grad_norm": 22.4880428314209, + "learning_rate": 6.992923474859709e-06, + "loss": 4.7459, + "step": 72710 + }, + { + "epoch": 1.4793904622395833, + "grad_norm": 20.019514083862305, + "learning_rate": 6.992556911702342e-06, + "loss": 4.528, + "step": 72715 + }, + { + "epoch": 1.4794921875, + "grad_norm": 21.2738037109375, + "learning_rate": 6.992190335813169e-06, + "loss": 4.8021, + "step": 72720 + }, + { + "epoch": 1.4795939127604167, + "grad_norm": 16.500516891479492, + "learning_rate": 6.991823747194534e-06, + "loss": 4.7332, + "step": 72725 + }, + { + "epoch": 1.4796956380208333, + "grad_norm": 17.440168380737305, + "learning_rate": 6.991457145848776e-06, + "loss": 4.98, + "step": 72730 + }, + { + "epoch": 1.47979736328125, + "grad_norm": 20.54827117919922, + "learning_rate": 6.99109053177824e-06, + "loss": 4.7051, + "step": 72735 + }, + { + "epoch": 1.4798990885416667, + "grad_norm": 15.414922714233398, + "learning_rate": 6.990723904985266e-06, + "loss": 5.1093, + "step": 72740 + }, + { + "epoch": 1.4800008138020833, + "grad_norm": 17.20567512512207, + "learning_rate": 6.990357265472198e-06, + "loss": 5.0661, + "step": 72745 + }, + { + "epoch": 1.4801025390625, + "grad_norm": 19.58013343811035, + "learning_rate": 6.989990613241381e-06, + "loss": 4.8386, + "step": 72750 + }, + { + "epoch": 1.4802042643229167, + "grad_norm": 23.765958786010742, + "learning_rate": 6.9896239482951546e-06, + "loss": 5.3217, + "step": 72755 + }, + { + "epoch": 1.4803059895833333, + "grad_norm": 20.77876853942871, + "learning_rate": 6.989257270635863e-06, + "loss": 4.7151, + "step": 72760 + }, + { + "epoch": 1.48040771484375, + "grad_norm": 17.169042587280273, + "learning_rate": 6.988890580265849e-06, + "loss": 4.8383, + "step": 72765 + }, + { + "epoch": 1.4805094401041667, + "grad_norm": 18.860485076904297, + "learning_rate": 6.9885238771874545e-06, + "loss": 4.922, + "step": 72770 + }, + { + "epoch": 1.4806111653645833, + "grad_norm": 15.139610290527344, + "learning_rate": 6.988157161403025e-06, + "loss": 4.949, + "step": 72775 + }, + { + "epoch": 1.480712890625, + "grad_norm": 21.600326538085938, + "learning_rate": 6.987790432914902e-06, + "loss": 4.7109, + "step": 72780 + }, + { + "epoch": 1.4808146158854167, + "grad_norm": 15.408936500549316, + "learning_rate": 6.987423691725429e-06, + "loss": 4.9897, + "step": 72785 + }, + { + "epoch": 1.4809163411458333, + "grad_norm": 14.274620056152344, + "learning_rate": 6.987056937836949e-06, + "loss": 5.1721, + "step": 72790 + }, + { + "epoch": 1.48101806640625, + "grad_norm": 19.02145767211914, + "learning_rate": 6.986690171251806e-06, + "loss": 4.7976, + "step": 72795 + }, + { + "epoch": 1.4811197916666667, + "grad_norm": 15.920692443847656, + "learning_rate": 6.986323391972344e-06, + "loss": 4.9034, + "step": 72800 + }, + { + "epoch": 1.4812215169270833, + "grad_norm": 22.317468643188477, + "learning_rate": 6.985956600000906e-06, + "loss": 5.2043, + "step": 72805 + }, + { + "epoch": 1.4813232421875, + "grad_norm": 38.202476501464844, + "learning_rate": 6.9855897953398356e-06, + "loss": 4.6373, + "step": 72810 + }, + { + "epoch": 1.4814249674479167, + "grad_norm": 16.212411880493164, + "learning_rate": 6.985222977991477e-06, + "loss": 5.0135, + "step": 72815 + }, + { + "epoch": 1.4815266927083333, + "grad_norm": 17.713958740234375, + "learning_rate": 6.984856147958172e-06, + "loss": 4.7264, + "step": 72820 + }, + { + "epoch": 1.48162841796875, + "grad_norm": 16.817689895629883, + "learning_rate": 6.9844893052422695e-06, + "loss": 4.7431, + "step": 72825 + }, + { + "epoch": 1.4817301432291667, + "grad_norm": 17.728023529052734, + "learning_rate": 6.984122449846109e-06, + "loss": 4.6998, + "step": 72830 + }, + { + "epoch": 1.4818318684895833, + "grad_norm": 20.83949089050293, + "learning_rate": 6.983755581772035e-06, + "loss": 4.9808, + "step": 72835 + }, + { + "epoch": 1.48193359375, + "grad_norm": 16.432178497314453, + "learning_rate": 6.983388701022393e-06, + "loss": 4.9778, + "step": 72840 + }, + { + "epoch": 1.4820353190104167, + "grad_norm": 19.6307315826416, + "learning_rate": 6.983021807599526e-06, + "loss": 4.8707, + "step": 72845 + }, + { + "epoch": 1.4821370442708333, + "grad_norm": 16.886220932006836, + "learning_rate": 6.982654901505781e-06, + "loss": 4.8839, + "step": 72850 + }, + { + "epoch": 1.48223876953125, + "grad_norm": 20.415681838989258, + "learning_rate": 6.982287982743501e-06, + "loss": 4.8116, + "step": 72855 + }, + { + "epoch": 1.4823404947916667, + "grad_norm": 18.560710906982422, + "learning_rate": 6.981921051315027e-06, + "loss": 4.9239, + "step": 72860 + }, + { + "epoch": 1.4824422200520833, + "grad_norm": 14.972562789916992, + "learning_rate": 6.9815541072227085e-06, + "loss": 4.8783, + "step": 72865 + }, + { + "epoch": 1.4825439453125, + "grad_norm": 24.092205047607422, + "learning_rate": 6.981187150468889e-06, + "loss": 4.7194, + "step": 72870 + }, + { + "epoch": 1.4826456705729167, + "grad_norm": 14.930098533630371, + "learning_rate": 6.980820181055911e-06, + "loss": 4.7695, + "step": 72875 + }, + { + "epoch": 1.4827473958333333, + "grad_norm": 23.177194595336914, + "learning_rate": 6.980453198986123e-06, + "loss": 4.7909, + "step": 72880 + }, + { + "epoch": 1.48284912109375, + "grad_norm": 15.73789119720459, + "learning_rate": 6.980086204261864e-06, + "loss": 4.6964, + "step": 72885 + }, + { + "epoch": 1.4829508463541667, + "grad_norm": 16.710844039916992, + "learning_rate": 6.979719196885486e-06, + "loss": 4.7082, + "step": 72890 + }, + { + "epoch": 1.4830525716145833, + "grad_norm": 15.977572441101074, + "learning_rate": 6.97935217685933e-06, + "loss": 4.8251, + "step": 72895 + }, + { + "epoch": 1.483154296875, + "grad_norm": 15.385875701904297, + "learning_rate": 6.9789851441857415e-06, + "loss": 4.9982, + "step": 72900 + }, + { + "epoch": 1.4832560221354167, + "grad_norm": 20.54866600036621, + "learning_rate": 6.978618098867066e-06, + "loss": 4.8652, + "step": 72905 + }, + { + "epoch": 1.4833577473958333, + "grad_norm": 17.068256378173828, + "learning_rate": 6.978251040905649e-06, + "loss": 4.824, + "step": 72910 + }, + { + "epoch": 1.48345947265625, + "grad_norm": 18.878416061401367, + "learning_rate": 6.977883970303836e-06, + "loss": 4.7619, + "step": 72915 + }, + { + "epoch": 1.4835611979166667, + "grad_norm": 16.110336303710938, + "learning_rate": 6.9775168870639735e-06, + "loss": 4.9522, + "step": 72920 + }, + { + "epoch": 1.4836629231770833, + "grad_norm": 30.885725021362305, + "learning_rate": 6.977149791188403e-06, + "loss": 5.2295, + "step": 72925 + }, + { + "epoch": 1.4837646484375, + "grad_norm": 18.632396697998047, + "learning_rate": 6.976782682679475e-06, + "loss": 4.9763, + "step": 72930 + }, + { + "epoch": 1.4838663736979167, + "grad_norm": 22.06801986694336, + "learning_rate": 6.9764155615395336e-06, + "loss": 5.1641, + "step": 72935 + }, + { + "epoch": 1.4839680989583333, + "grad_norm": 16.851455688476562, + "learning_rate": 6.976048427770923e-06, + "loss": 4.6608, + "step": 72940 + }, + { + "epoch": 1.48406982421875, + "grad_norm": 17.327356338500977, + "learning_rate": 6.97568128137599e-06, + "loss": 4.9673, + "step": 72945 + }, + { + "epoch": 1.4841715494791667, + "grad_norm": 25.840370178222656, + "learning_rate": 6.9753141223570804e-06, + "loss": 4.9451, + "step": 72950 + }, + { + "epoch": 1.4842732747395833, + "grad_norm": 18.61471939086914, + "learning_rate": 6.974946950716543e-06, + "loss": 4.7263, + "step": 72955 + }, + { + "epoch": 1.484375, + "grad_norm": 18.24013900756836, + "learning_rate": 6.974579766456721e-06, + "loss": 4.9458, + "step": 72960 + }, + { + "epoch": 1.4844767252604167, + "grad_norm": 16.201269149780273, + "learning_rate": 6.974212569579959e-06, + "loss": 4.6825, + "step": 72965 + }, + { + "epoch": 1.4845784505208333, + "grad_norm": 20.836322784423828, + "learning_rate": 6.973845360088607e-06, + "loss": 5.1499, + "step": 72970 + }, + { + "epoch": 1.48468017578125, + "grad_norm": 27.867721557617188, + "learning_rate": 6.973478137985008e-06, + "loss": 5.0133, + "step": 72975 + }, + { + "epoch": 1.4847819010416667, + "grad_norm": 13.505186080932617, + "learning_rate": 6.973110903271513e-06, + "loss": 4.9946, + "step": 72980 + }, + { + "epoch": 1.4848836263020833, + "grad_norm": 18.510528564453125, + "learning_rate": 6.972743655950465e-06, + "loss": 4.96, + "step": 72985 + }, + { + "epoch": 1.4849853515625, + "grad_norm": 14.419594764709473, + "learning_rate": 6.97237639602421e-06, + "loss": 4.8393, + "step": 72990 + }, + { + "epoch": 1.4850870768229167, + "grad_norm": 15.179572105407715, + "learning_rate": 6.972009123495096e-06, + "loss": 4.7238, + "step": 72995 + }, + { + "epoch": 1.4851888020833333, + "grad_norm": 18.123905181884766, + "learning_rate": 6.971641838365469e-06, + "loss": 4.8096, + "step": 73000 + }, + { + "epoch": 1.48529052734375, + "grad_norm": 12.44006061553955, + "learning_rate": 6.971274540637678e-06, + "loss": 4.777, + "step": 73005 + }, + { + "epoch": 1.4853922526041667, + "grad_norm": 18.08283233642578, + "learning_rate": 6.970907230314069e-06, + "loss": 5.0599, + "step": 73010 + }, + { + "epoch": 1.4854939778645833, + "grad_norm": 13.61880111694336, + "learning_rate": 6.970539907396987e-06, + "loss": 4.922, + "step": 73015 + }, + { + "epoch": 1.485595703125, + "grad_norm": 15.989480972290039, + "learning_rate": 6.970172571888781e-06, + "loss": 4.6549, + "step": 73020 + }, + { + "epoch": 1.4856974283854167, + "grad_norm": 17.694604873657227, + "learning_rate": 6.969805223791799e-06, + "loss": 4.7324, + "step": 73025 + }, + { + "epoch": 1.4857991536458333, + "grad_norm": 20.26369285583496, + "learning_rate": 6.969437863108385e-06, + "loss": 4.9781, + "step": 73030 + }, + { + "epoch": 1.48590087890625, + "grad_norm": 18.275821685791016, + "learning_rate": 6.969070489840889e-06, + "loss": 4.8888, + "step": 73035 + }, + { + "epoch": 1.4860026041666667, + "grad_norm": 14.67530345916748, + "learning_rate": 6.968703103991656e-06, + "loss": 4.735, + "step": 73040 + }, + { + "epoch": 1.4861043294270833, + "grad_norm": 21.38918685913086, + "learning_rate": 6.968335705563038e-06, + "loss": 4.8654, + "step": 73045 + }, + { + "epoch": 1.4862060546875, + "grad_norm": 19.940460205078125, + "learning_rate": 6.967968294557379e-06, + "loss": 4.9714, + "step": 73050 + }, + { + "epoch": 1.4863077799479167, + "grad_norm": 19.190969467163086, + "learning_rate": 6.967600870977026e-06, + "loss": 4.8863, + "step": 73055 + }, + { + "epoch": 1.4864095052083333, + "grad_norm": 23.114912033081055, + "learning_rate": 6.967233434824329e-06, + "loss": 5.1066, + "step": 73060 + }, + { + "epoch": 1.48651123046875, + "grad_norm": 20.523950576782227, + "learning_rate": 6.966865986101634e-06, + "loss": 4.8294, + "step": 73065 + }, + { + "epoch": 1.4866129557291667, + "grad_norm": 14.912291526794434, + "learning_rate": 6.966498524811289e-06, + "loss": 5.0555, + "step": 73070 + }, + { + "epoch": 1.4867146809895833, + "grad_norm": 18.248821258544922, + "learning_rate": 6.9661310509556445e-06, + "loss": 4.9152, + "step": 73075 + }, + { + "epoch": 1.48681640625, + "grad_norm": 19.45579719543457, + "learning_rate": 6.965763564537046e-06, + "loss": 5.0994, + "step": 73080 + }, + { + "epoch": 1.4869181315104167, + "grad_norm": 16.397188186645508, + "learning_rate": 6.965396065557842e-06, + "loss": 4.7606, + "step": 73085 + }, + { + "epoch": 1.4870198567708333, + "grad_norm": 19.12653923034668, + "learning_rate": 6.965028554020382e-06, + "loss": 4.5924, + "step": 73090 + }, + { + "epoch": 1.48712158203125, + "grad_norm": 14.663079261779785, + "learning_rate": 6.964661029927013e-06, + "loss": 4.9587, + "step": 73095 + }, + { + "epoch": 1.4872233072916667, + "grad_norm": 15.663877487182617, + "learning_rate": 6.964293493280083e-06, + "loss": 4.9628, + "step": 73100 + }, + { + "epoch": 1.4873250325520833, + "grad_norm": 16.26332664489746, + "learning_rate": 6.9639259440819416e-06, + "loss": 4.7209, + "step": 73105 + }, + { + "epoch": 1.4874267578125, + "grad_norm": 17.132526397705078, + "learning_rate": 6.963558382334938e-06, + "loss": 4.8567, + "step": 73110 + }, + { + "epoch": 1.4875284830729167, + "grad_norm": 15.974034309387207, + "learning_rate": 6.963190808041418e-06, + "loss": 4.7272, + "step": 73115 + }, + { + "epoch": 1.4876302083333333, + "grad_norm": 16.43229866027832, + "learning_rate": 6.962823221203732e-06, + "loss": 4.6877, + "step": 73120 + }, + { + "epoch": 1.48773193359375, + "grad_norm": 22.163860321044922, + "learning_rate": 6.96245562182423e-06, + "loss": 5.0036, + "step": 73125 + }, + { + "epoch": 1.4878336588541667, + "grad_norm": 22.939050674438477, + "learning_rate": 6.962088009905258e-06, + "loss": 4.8747, + "step": 73130 + }, + { + "epoch": 1.4879353841145833, + "grad_norm": 23.531330108642578, + "learning_rate": 6.9617203854491675e-06, + "loss": 4.6875, + "step": 73135 + }, + { + "epoch": 1.488037109375, + "grad_norm": 18.077016830444336, + "learning_rate": 6.961352748458307e-06, + "loss": 4.9346, + "step": 73140 + }, + { + "epoch": 1.4881388346354167, + "grad_norm": 18.34637451171875, + "learning_rate": 6.9609850989350234e-06, + "loss": 5.0736, + "step": 73145 + }, + { + "epoch": 1.4882405598958333, + "grad_norm": 14.197152137756348, + "learning_rate": 6.960617436881669e-06, + "loss": 4.8271, + "step": 73150 + }, + { + "epoch": 1.48834228515625, + "grad_norm": 19.931489944458008, + "learning_rate": 6.9602497623005906e-06, + "loss": 4.6065, + "step": 73155 + }, + { + "epoch": 1.4884440104166667, + "grad_norm": 16.28819465637207, + "learning_rate": 6.959882075194139e-06, + "loss": 4.7671, + "step": 73160 + }, + { + "epoch": 1.4885457356770833, + "grad_norm": 17.33098793029785, + "learning_rate": 6.959514375564662e-06, + "loss": 4.9815, + "step": 73165 + }, + { + "epoch": 1.4886474609375, + "grad_norm": 14.791911125183105, + "learning_rate": 6.959146663414511e-06, + "loss": 4.9833, + "step": 73170 + }, + { + "epoch": 1.4887491861979167, + "grad_norm": 21.54263687133789, + "learning_rate": 6.958778938746036e-06, + "loss": 5.1616, + "step": 73175 + }, + { + "epoch": 1.4888509114583333, + "grad_norm": 18.02345848083496, + "learning_rate": 6.958411201561583e-06, + "loss": 4.6732, + "step": 73180 + }, + { + "epoch": 1.48895263671875, + "grad_norm": 20.231128692626953, + "learning_rate": 6.9580434518635055e-06, + "loss": 4.9101, + "step": 73185 + }, + { + "epoch": 1.4890543619791667, + "grad_norm": 18.49529266357422, + "learning_rate": 6.957675689654152e-06, + "loss": 5.1877, + "step": 73190 + }, + { + "epoch": 1.4891560872395833, + "grad_norm": 20.12556266784668, + "learning_rate": 6.9573079149358714e-06, + "loss": 4.9381, + "step": 73195 + }, + { + "epoch": 1.4892578125, + "grad_norm": 19.534833908081055, + "learning_rate": 6.956940127711016e-06, + "loss": 4.8272, + "step": 73200 + }, + { + "epoch": 1.4893595377604167, + "grad_norm": 16.871416091918945, + "learning_rate": 6.956572327981933e-06, + "loss": 4.8246, + "step": 73205 + }, + { + "epoch": 1.4894612630208333, + "grad_norm": 16.712451934814453, + "learning_rate": 6.956204515750974e-06, + "loss": 5.0621, + "step": 73210 + }, + { + "epoch": 1.48956298828125, + "grad_norm": 17.056257247924805, + "learning_rate": 6.95583669102049e-06, + "loss": 4.8252, + "step": 73215 + }, + { + "epoch": 1.4896647135416667, + "grad_norm": 19.75987434387207, + "learning_rate": 6.955468853792828e-06, + "loss": 5.1335, + "step": 73220 + }, + { + "epoch": 1.4897664388020833, + "grad_norm": 17.446762084960938, + "learning_rate": 6.955101004070344e-06, + "loss": 5.1272, + "step": 73225 + }, + { + "epoch": 1.4898681640625, + "grad_norm": 15.577166557312012, + "learning_rate": 6.954733141855384e-06, + "loss": 5.0311, + "step": 73230 + }, + { + "epoch": 1.4899698893229167, + "grad_norm": 16.423330307006836, + "learning_rate": 6.9543652671503e-06, + "loss": 4.9067, + "step": 73235 + }, + { + "epoch": 1.4900716145833333, + "grad_norm": 20.442466735839844, + "learning_rate": 6.953997379957443e-06, + "loss": 5.158, + "step": 73240 + }, + { + "epoch": 1.49017333984375, + "grad_norm": 21.640851974487305, + "learning_rate": 6.953629480279162e-06, + "loss": 4.9569, + "step": 73245 + }, + { + "epoch": 1.4902750651041667, + "grad_norm": 25.150684356689453, + "learning_rate": 6.953261568117808e-06, + "loss": 5.1442, + "step": 73250 + }, + { + "epoch": 1.4903767903645833, + "grad_norm": 17.673351287841797, + "learning_rate": 6.952893643475733e-06, + "loss": 4.6806, + "step": 73255 + }, + { + "epoch": 1.490478515625, + "grad_norm": 15.939491271972656, + "learning_rate": 6.952525706355289e-06, + "loss": 4.9083, + "step": 73260 + }, + { + "epoch": 1.4905802408854167, + "grad_norm": 59.45942306518555, + "learning_rate": 6.9521577567588235e-06, + "loss": 5.1585, + "step": 73265 + }, + { + "epoch": 1.4906819661458333, + "grad_norm": 16.68368148803711, + "learning_rate": 6.951789794688692e-06, + "loss": 5.0467, + "step": 73270 + }, + { + "epoch": 1.49078369140625, + "grad_norm": 15.950935363769531, + "learning_rate": 6.9514218201472414e-06, + "loss": 4.7719, + "step": 73275 + }, + { + "epoch": 1.4908854166666667, + "grad_norm": 26.34566879272461, + "learning_rate": 6.951053833136826e-06, + "loss": 5.0129, + "step": 73280 + }, + { + "epoch": 1.4909871419270833, + "grad_norm": 12.721968650817871, + "learning_rate": 6.950685833659795e-06, + "loss": 4.9711, + "step": 73285 + }, + { + "epoch": 1.4910888671875, + "grad_norm": 19.53141212463379, + "learning_rate": 6.950317821718502e-06, + "loss": 5.0654, + "step": 73290 + }, + { + "epoch": 1.4911905924479167, + "grad_norm": 17.6025447845459, + "learning_rate": 6.9499497973152964e-06, + "loss": 5.1622, + "step": 73295 + }, + { + "epoch": 1.4912923177083333, + "grad_norm": 16.291717529296875, + "learning_rate": 6.949581760452529e-06, + "loss": 4.9221, + "step": 73300 + }, + { + "epoch": 1.49139404296875, + "grad_norm": 17.08668327331543, + "learning_rate": 6.949213711132556e-06, + "loss": 4.8833, + "step": 73305 + }, + { + "epoch": 1.4914957682291667, + "grad_norm": 17.814632415771484, + "learning_rate": 6.948845649357725e-06, + "loss": 4.7497, + "step": 73310 + }, + { + "epoch": 1.4915974934895833, + "grad_norm": 17.874887466430664, + "learning_rate": 6.948477575130388e-06, + "loss": 5.0585, + "step": 73315 + }, + { + "epoch": 1.49169921875, + "grad_norm": 20.5736141204834, + "learning_rate": 6.9481094884528985e-06, + "loss": 4.9964, + "step": 73320 + }, + { + "epoch": 1.4918009440104167, + "grad_norm": 16.848175048828125, + "learning_rate": 6.947741389327608e-06, + "loss": 4.9612, + "step": 73325 + }, + { + "epoch": 1.4919026692708333, + "grad_norm": 17.641826629638672, + "learning_rate": 6.947373277756868e-06, + "loss": 4.9195, + "step": 73330 + }, + { + "epoch": 1.49200439453125, + "grad_norm": 17.083805084228516, + "learning_rate": 6.94700515374303e-06, + "loss": 5.2092, + "step": 73335 + }, + { + "epoch": 1.4921061197916667, + "grad_norm": 13.29108715057373, + "learning_rate": 6.946637017288449e-06, + "loss": 4.9738, + "step": 73340 + }, + { + "epoch": 1.4922078450520833, + "grad_norm": 18.518339157104492, + "learning_rate": 6.946268868395475e-06, + "loss": 4.9669, + "step": 73345 + }, + { + "epoch": 1.4923095703125, + "grad_norm": 20.382314682006836, + "learning_rate": 6.94590070706646e-06, + "loss": 4.8223, + "step": 73350 + }, + { + "epoch": 1.4924112955729167, + "grad_norm": 21.42458152770996, + "learning_rate": 6.945532533303757e-06, + "loss": 4.7352, + "step": 73355 + }, + { + "epoch": 1.4925130208333333, + "grad_norm": 21.9563045501709, + "learning_rate": 6.94516434710972e-06, + "loss": 5.1553, + "step": 73360 + }, + { + "epoch": 1.49261474609375, + "grad_norm": 16.17803192138672, + "learning_rate": 6.944796148486699e-06, + "loss": 4.8068, + "step": 73365 + }, + { + "epoch": 1.4927164713541667, + "grad_norm": 23.5169620513916, + "learning_rate": 6.9444279374370495e-06, + "loss": 5.1249, + "step": 73370 + }, + { + "epoch": 1.4928181966145833, + "grad_norm": 16.110506057739258, + "learning_rate": 6.944059713963121e-06, + "loss": 4.7605, + "step": 73375 + }, + { + "epoch": 1.492919921875, + "grad_norm": 22.694135665893555, + "learning_rate": 6.9436914780672696e-06, + "loss": 5.1372, + "step": 73380 + }, + { + "epoch": 1.4930216471354167, + "grad_norm": 14.16240119934082, + "learning_rate": 6.943323229751846e-06, + "loss": 5.1936, + "step": 73385 + }, + { + "epoch": 1.4931233723958333, + "grad_norm": 18.47466278076172, + "learning_rate": 6.942954969019202e-06, + "loss": 4.7632, + "step": 73390 + }, + { + "epoch": 1.49322509765625, + "grad_norm": 17.19047737121582, + "learning_rate": 6.942586695871696e-06, + "loss": 4.784, + "step": 73395 + }, + { + "epoch": 1.4933268229166667, + "grad_norm": 19.970521926879883, + "learning_rate": 6.942218410311677e-06, + "loss": 5.0752, + "step": 73400 + }, + { + "epoch": 1.4934285481770833, + "grad_norm": 20.6278076171875, + "learning_rate": 6.9418501123414964e-06, + "loss": 5.0034, + "step": 73405 + }, + { + "epoch": 1.4935302734375, + "grad_norm": 15.672682762145996, + "learning_rate": 6.9414818019635126e-06, + "loss": 5.0963, + "step": 73410 + }, + { + "epoch": 1.4936319986979167, + "grad_norm": 19.79712677001953, + "learning_rate": 6.941113479180074e-06, + "loss": 4.8406, + "step": 73415 + }, + { + "epoch": 1.4937337239583333, + "grad_norm": 33.2279167175293, + "learning_rate": 6.940745143993538e-06, + "loss": 5.4373, + "step": 73420 + }, + { + "epoch": 1.49383544921875, + "grad_norm": 15.366743087768555, + "learning_rate": 6.940376796406256e-06, + "loss": 4.9931, + "step": 73425 + }, + { + "epoch": 1.4939371744791667, + "grad_norm": 25.740497589111328, + "learning_rate": 6.9400084364205825e-06, + "loss": 4.8921, + "step": 73430 + }, + { + "epoch": 1.4940388997395833, + "grad_norm": 15.982891082763672, + "learning_rate": 6.939640064038871e-06, + "loss": 5.0359, + "step": 73435 + }, + { + "epoch": 1.494140625, + "grad_norm": 16.467254638671875, + "learning_rate": 6.939271679263475e-06, + "loss": 5.0511, + "step": 73440 + }, + { + "epoch": 1.4942423502604167, + "grad_norm": 19.459169387817383, + "learning_rate": 6.938903282096749e-06, + "loss": 4.706, + "step": 73445 + }, + { + "epoch": 1.4943440755208333, + "grad_norm": 20.09385871887207, + "learning_rate": 6.938534872541047e-06, + "loss": 4.9898, + "step": 73450 + }, + { + "epoch": 1.49444580078125, + "grad_norm": 19.663509368896484, + "learning_rate": 6.938166450598721e-06, + "loss": 4.69, + "step": 73455 + }, + { + "epoch": 1.4945475260416667, + "grad_norm": 11.722150802612305, + "learning_rate": 6.937798016272128e-06, + "loss": 4.9078, + "step": 73460 + }, + { + "epoch": 1.4946492513020833, + "grad_norm": 17.15929412841797, + "learning_rate": 6.93742956956362e-06, + "loss": 4.7718, + "step": 73465 + }, + { + "epoch": 1.4947509765625, + "grad_norm": 14.962903022766113, + "learning_rate": 6.937061110475553e-06, + "loss": 4.8045, + "step": 73470 + }, + { + "epoch": 1.4948527018229167, + "grad_norm": 19.976686477661133, + "learning_rate": 6.936692639010279e-06, + "loss": 4.9175, + "step": 73475 + }, + { + "epoch": 1.4949544270833333, + "grad_norm": 24.661672592163086, + "learning_rate": 6.936324155170154e-06, + "loss": 4.7738, + "step": 73480 + }, + { + "epoch": 1.49505615234375, + "grad_norm": 17.85993766784668, + "learning_rate": 6.935955658957533e-06, + "loss": 5.2132, + "step": 73485 + }, + { + "epoch": 1.4951578776041667, + "grad_norm": 17.889986038208008, + "learning_rate": 6.935587150374769e-06, + "loss": 4.7388, + "step": 73490 + }, + { + "epoch": 1.4952596028645833, + "grad_norm": 18.919572830200195, + "learning_rate": 6.9352186294242185e-06, + "loss": 4.7828, + "step": 73495 + }, + { + "epoch": 1.495361328125, + "grad_norm": 14.86476993560791, + "learning_rate": 6.934850096108237e-06, + "loss": 4.5931, + "step": 73500 + }, + { + "epoch": 1.4954630533854167, + "grad_norm": 17.15276336669922, + "learning_rate": 6.934481550429174e-06, + "loss": 4.7822, + "step": 73505 + }, + { + "epoch": 1.4955647786458333, + "grad_norm": 20.586301803588867, + "learning_rate": 6.93411299238939e-06, + "loss": 4.9426, + "step": 73510 + }, + { + "epoch": 1.49566650390625, + "grad_norm": 19.93338966369629, + "learning_rate": 6.933744421991239e-06, + "loss": 4.8491, + "step": 73515 + }, + { + "epoch": 1.4957682291666667, + "grad_norm": 18.047412872314453, + "learning_rate": 6.933375839237072e-06, + "loss": 5.056, + "step": 73520 + }, + { + "epoch": 1.4958699544270833, + "grad_norm": 17.126338958740234, + "learning_rate": 6.9330072441292496e-06, + "loss": 4.8472, + "step": 73525 + }, + { + "epoch": 1.4959716796875, + "grad_norm": 23.54718017578125, + "learning_rate": 6.932638636670124e-06, + "loss": 5.0038, + "step": 73530 + }, + { + "epoch": 1.4960734049479167, + "grad_norm": 22.070960998535156, + "learning_rate": 6.93227001686205e-06, + "loss": 4.9943, + "step": 73535 + }, + { + "epoch": 1.4961751302083333, + "grad_norm": 14.016714096069336, + "learning_rate": 6.931901384707385e-06, + "loss": 4.8376, + "step": 73540 + }, + { + "epoch": 1.49627685546875, + "grad_norm": 16.898473739624023, + "learning_rate": 6.931532740208483e-06, + "loss": 4.992, + "step": 73545 + }, + { + "epoch": 1.4963785807291667, + "grad_norm": 15.915619850158691, + "learning_rate": 6.9311640833677005e-06, + "loss": 5.1604, + "step": 73550 + }, + { + "epoch": 1.4964803059895833, + "grad_norm": 17.01117706298828, + "learning_rate": 6.930795414187392e-06, + "loss": 4.9668, + "step": 73555 + }, + { + "epoch": 1.49658203125, + "grad_norm": 18.03192138671875, + "learning_rate": 6.930426732669914e-06, + "loss": 4.869, + "step": 73560 + }, + { + "epoch": 1.4966837565104167, + "grad_norm": 18.847978591918945, + "learning_rate": 6.930058038817622e-06, + "loss": 4.8462, + "step": 73565 + }, + { + "epoch": 1.4967854817708333, + "grad_norm": 22.054306030273438, + "learning_rate": 6.929689332632871e-06, + "loss": 4.6954, + "step": 73570 + }, + { + "epoch": 1.49688720703125, + "grad_norm": 16.81905746459961, + "learning_rate": 6.929320614118018e-06, + "loss": 4.8883, + "step": 73575 + }, + { + "epoch": 1.4969889322916667, + "grad_norm": 17.27962875366211, + "learning_rate": 6.92895188327542e-06, + "loss": 5.0396, + "step": 73580 + }, + { + "epoch": 1.4970906575520833, + "grad_norm": 14.071355819702148, + "learning_rate": 6.92858314010743e-06, + "loss": 4.926, + "step": 73585 + }, + { + "epoch": 1.4971923828125, + "grad_norm": 18.878042221069336, + "learning_rate": 6.9282143846164076e-06, + "loss": 4.9852, + "step": 73590 + }, + { + "epoch": 1.4972941080729167, + "grad_norm": 13.411895751953125, + "learning_rate": 6.927845616804706e-06, + "loss": 4.6106, + "step": 73595 + }, + { + "epoch": 1.4973958333333333, + "grad_norm": 14.792037010192871, + "learning_rate": 6.927476836674683e-06, + "loss": 4.7128, + "step": 73600 + }, + { + "epoch": 1.49749755859375, + "grad_norm": 21.07665252685547, + "learning_rate": 6.927108044228696e-06, + "loss": 4.9833, + "step": 73605 + }, + { + "epoch": 1.4975992838541667, + "grad_norm": 22.325714111328125, + "learning_rate": 6.9267392394691e-06, + "loss": 4.8043, + "step": 73610 + }, + { + "epoch": 1.4977010091145833, + "grad_norm": 15.808566093444824, + "learning_rate": 6.926370422398251e-06, + "loss": 4.7091, + "step": 73615 + }, + { + "epoch": 1.497802734375, + "grad_norm": 16.736007690429688, + "learning_rate": 6.926001593018507e-06, + "loss": 4.828, + "step": 73620 + }, + { + "epoch": 1.4979044596354167, + "grad_norm": 17.77055549621582, + "learning_rate": 6.925632751332224e-06, + "loss": 4.9494, + "step": 73625 + }, + { + "epoch": 1.4980061848958333, + "grad_norm": 16.586965560913086, + "learning_rate": 6.925263897341759e-06, + "loss": 5.0856, + "step": 73630 + }, + { + "epoch": 1.49810791015625, + "grad_norm": 20.048938751220703, + "learning_rate": 6.924895031049469e-06, + "loss": 5.0302, + "step": 73635 + }, + { + "epoch": 1.4982096354166667, + "grad_norm": 15.784786224365234, + "learning_rate": 6.9245261524577116e-06, + "loss": 4.9297, + "step": 73640 + }, + { + "epoch": 1.4983113606770833, + "grad_norm": 23.736589431762695, + "learning_rate": 6.924157261568842e-06, + "loss": 4.8453, + "step": 73645 + }, + { + "epoch": 1.4984130859375, + "grad_norm": 27.334030151367188, + "learning_rate": 6.923788358385219e-06, + "loss": 4.8917, + "step": 73650 + }, + { + "epoch": 1.4985148111979167, + "grad_norm": 16.50396156311035, + "learning_rate": 6.923419442909198e-06, + "loss": 4.9141, + "step": 73655 + }, + { + "epoch": 1.4986165364583333, + "grad_norm": 18.88580894470215, + "learning_rate": 6.923050515143138e-06, + "loss": 4.8729, + "step": 73660 + }, + { + "epoch": 1.49871826171875, + "grad_norm": 17.534297943115234, + "learning_rate": 6.922681575089396e-06, + "loss": 4.8457, + "step": 73665 + }, + { + "epoch": 1.4988199869791667, + "grad_norm": 15.69845199584961, + "learning_rate": 6.92231262275033e-06, + "loss": 4.8982, + "step": 73670 + }, + { + "epoch": 1.4989217122395833, + "grad_norm": 15.615689277648926, + "learning_rate": 6.921943658128295e-06, + "loss": 5.0487, + "step": 73675 + }, + { + "epoch": 1.4990234375, + "grad_norm": 15.771432876586914, + "learning_rate": 6.921574681225651e-06, + "loss": 5.0952, + "step": 73680 + }, + { + "epoch": 1.4991251627604167, + "grad_norm": 20.061811447143555, + "learning_rate": 6.9212056920447545e-06, + "loss": 4.7564, + "step": 73685 + }, + { + "epoch": 1.4992268880208333, + "grad_norm": 20.25445556640625, + "learning_rate": 6.920836690587962e-06, + "loss": 5.1995, + "step": 73690 + }, + { + "epoch": 1.49932861328125, + "grad_norm": 20.219648361206055, + "learning_rate": 6.920467676857635e-06, + "loss": 4.9471, + "step": 73695 + }, + { + "epoch": 1.4994303385416667, + "grad_norm": 14.74786376953125, + "learning_rate": 6.920098650856128e-06, + "loss": 4.7787, + "step": 73700 + }, + { + "epoch": 1.4995320638020833, + "grad_norm": 20.585500717163086, + "learning_rate": 6.919729612585801e-06, + "loss": 4.7266, + "step": 73705 + }, + { + "epoch": 1.4996337890625, + "grad_norm": 21.864234924316406, + "learning_rate": 6.919360562049011e-06, + "loss": 4.693, + "step": 73710 + }, + { + "epoch": 1.4997355143229167, + "grad_norm": 33.437923431396484, + "learning_rate": 6.918991499248115e-06, + "loss": 5.0664, + "step": 73715 + }, + { + "epoch": 1.4998372395833333, + "grad_norm": 15.338980674743652, + "learning_rate": 6.918622424185474e-06, + "loss": 4.9644, + "step": 73720 + }, + { + "epoch": 1.49993896484375, + "grad_norm": 17.23502540588379, + "learning_rate": 6.918253336863443e-06, + "loss": 4.8209, + "step": 73725 + }, + { + "epoch": 1.5, + "eval_loss": 4.963383197784424, + "eval_runtime": 107.5237, + "eval_samples_per_second": 18.666, + "eval_steps_per_second": 9.337, + "step": 73728 + }, + { + "epoch": 1.5000406901041665, + "grad_norm": 18.61566925048828, + "learning_rate": 6.917884237284384e-06, + "loss": 5.0516, + "step": 73730 + }, + { + "epoch": 1.5001424153645835, + "grad_norm": 19.90483283996582, + "learning_rate": 6.917515125450652e-06, + "loss": 4.9355, + "step": 73735 + }, + { + "epoch": 1.500244140625, + "grad_norm": 21.726964950561523, + "learning_rate": 6.9171460013646084e-06, + "loss": 4.7312, + "step": 73740 + }, + { + "epoch": 1.5003458658854165, + "grad_norm": 19.104753494262695, + "learning_rate": 6.916776865028609e-06, + "loss": 4.9499, + "step": 73745 + }, + { + "epoch": 1.5004475911458335, + "grad_norm": 20.010883331298828, + "learning_rate": 6.916407716445016e-06, + "loss": 4.7724, + "step": 73750 + }, + { + "epoch": 1.50054931640625, + "grad_norm": 15.026531219482422, + "learning_rate": 6.916038555616185e-06, + "loss": 5.0481, + "step": 73755 + }, + { + "epoch": 1.5006510416666665, + "grad_norm": 21.793323516845703, + "learning_rate": 6.915669382544475e-06, + "loss": 4.7914, + "step": 73760 + }, + { + "epoch": 1.5007527669270835, + "grad_norm": 15.15608024597168, + "learning_rate": 6.9153001972322466e-06, + "loss": 4.8799, + "step": 73765 + }, + { + "epoch": 1.5008544921875, + "grad_norm": 18.172121047973633, + "learning_rate": 6.914930999681858e-06, + "loss": 4.7901, + "step": 73770 + }, + { + "epoch": 1.5009562174479165, + "grad_norm": 19.395374298095703, + "learning_rate": 6.914561789895669e-06, + "loss": 4.8109, + "step": 73775 + }, + { + "epoch": 1.5010579427083335, + "grad_norm": 17.5063419342041, + "learning_rate": 6.914192567876036e-06, + "loss": 4.7689, + "step": 73780 + }, + { + "epoch": 1.50115966796875, + "grad_norm": 16.172082901000977, + "learning_rate": 6.913823333625322e-06, + "loss": 5.042, + "step": 73785 + }, + { + "epoch": 1.5012613932291665, + "grad_norm": 19.381053924560547, + "learning_rate": 6.913454087145882e-06, + "loss": 4.9092, + "step": 73790 + }, + { + "epoch": 1.5013631184895835, + "grad_norm": 23.127593994140625, + "learning_rate": 6.91308482844008e-06, + "loss": 4.845, + "step": 73795 + }, + { + "epoch": 1.50146484375, + "grad_norm": 16.056745529174805, + "learning_rate": 6.9127155575102725e-06, + "loss": 4.8935, + "step": 73800 + }, + { + "epoch": 1.5015665690104165, + "grad_norm": 34.454856872558594, + "learning_rate": 6.912346274358819e-06, + "loss": 4.6832, + "step": 73805 + }, + { + "epoch": 1.5016682942708335, + "grad_norm": 21.009355545043945, + "learning_rate": 6.911976978988082e-06, + "loss": 4.895, + "step": 73810 + }, + { + "epoch": 1.50177001953125, + "grad_norm": 12.566206932067871, + "learning_rate": 6.9116076714004175e-06, + "loss": 4.6805, + "step": 73815 + }, + { + "epoch": 1.5018717447916665, + "grad_norm": 16.525808334350586, + "learning_rate": 6.911238351598186e-06, + "loss": 4.7155, + "step": 73820 + }, + { + "epoch": 1.5019734700520835, + "grad_norm": 16.40586280822754, + "learning_rate": 6.91086901958375e-06, + "loss": 5.0317, + "step": 73825 + }, + { + "epoch": 1.5020751953125, + "grad_norm": 26.113021850585938, + "learning_rate": 6.9104996753594655e-06, + "loss": 5.1692, + "step": 73830 + }, + { + "epoch": 1.5021769205729165, + "grad_norm": 17.24793815612793, + "learning_rate": 6.910130318927697e-06, + "loss": 4.9462, + "step": 73835 + }, + { + "epoch": 1.5022786458333335, + "grad_norm": 18.94293975830078, + "learning_rate": 6.909760950290801e-06, + "loss": 4.8291, + "step": 73840 + }, + { + "epoch": 1.50238037109375, + "grad_norm": 19.261764526367188, + "learning_rate": 6.909391569451138e-06, + "loss": 4.7739, + "step": 73845 + }, + { + "epoch": 1.5024820963541665, + "grad_norm": 17.03082275390625, + "learning_rate": 6.9090221764110696e-06, + "loss": 4.7534, + "step": 73850 + }, + { + "epoch": 1.5025838216145835, + "grad_norm": 23.46473503112793, + "learning_rate": 6.908652771172956e-06, + "loss": 5.1701, + "step": 73855 + }, + { + "epoch": 1.502685546875, + "grad_norm": 17.09911346435547, + "learning_rate": 6.908283353739157e-06, + "loss": 4.8389, + "step": 73860 + }, + { + "epoch": 1.5027872721354165, + "grad_norm": 18.924091339111328, + "learning_rate": 6.907913924112032e-06, + "loss": 4.9032, + "step": 73865 + }, + { + "epoch": 1.5028889973958335, + "grad_norm": 16.64251708984375, + "learning_rate": 6.907544482293944e-06, + "loss": 4.9489, + "step": 73870 + }, + { + "epoch": 1.50299072265625, + "grad_norm": 23.585527420043945, + "learning_rate": 6.907175028287251e-06, + "loss": 4.9404, + "step": 73875 + }, + { + "epoch": 1.5030924479166665, + "grad_norm": 18.679624557495117, + "learning_rate": 6.906805562094316e-06, + "loss": 5.1934, + "step": 73880 + }, + { + "epoch": 1.5031941731770835, + "grad_norm": 18.532474517822266, + "learning_rate": 6.906436083717498e-06, + "loss": 4.7452, + "step": 73885 + }, + { + "epoch": 1.5032958984375, + "grad_norm": 17.68333625793457, + "learning_rate": 6.90606659315916e-06, + "loss": 4.7289, + "step": 73890 + }, + { + "epoch": 1.5033976236979165, + "grad_norm": 16.589452743530273, + "learning_rate": 6.905697090421659e-06, + "loss": 5.1095, + "step": 73895 + }, + { + "epoch": 1.5034993489583335, + "grad_norm": 18.135007858276367, + "learning_rate": 6.905327575507361e-06, + "loss": 5.1817, + "step": 73900 + }, + { + "epoch": 1.50360107421875, + "grad_norm": 17.723989486694336, + "learning_rate": 6.904958048418624e-06, + "loss": 4.979, + "step": 73905 + }, + { + "epoch": 1.5037027994791665, + "grad_norm": 16.523324966430664, + "learning_rate": 6.9045885091578095e-06, + "loss": 4.9924, + "step": 73910 + }, + { + "epoch": 1.5038045247395835, + "grad_norm": 15.339942932128906, + "learning_rate": 6.9042189577272785e-06, + "loss": 4.8652, + "step": 73915 + }, + { + "epoch": 1.50390625, + "grad_norm": 15.946104049682617, + "learning_rate": 6.903849394129394e-06, + "loss": 4.812, + "step": 73920 + }, + { + "epoch": 1.5040079752604165, + "grad_norm": 15.554574012756348, + "learning_rate": 6.903479818366515e-06, + "loss": 4.883, + "step": 73925 + }, + { + "epoch": 1.5041097005208335, + "grad_norm": 20.230913162231445, + "learning_rate": 6.903110230441006e-06, + "loss": 4.9111, + "step": 73930 + }, + { + "epoch": 1.50421142578125, + "grad_norm": 15.828425407409668, + "learning_rate": 6.902740630355225e-06, + "loss": 4.9891, + "step": 73935 + }, + { + "epoch": 1.5043131510416665, + "grad_norm": 16.986976623535156, + "learning_rate": 6.902371018111536e-06, + "loss": 4.7028, + "step": 73940 + }, + { + "epoch": 1.5044148763020835, + "grad_norm": 20.669158935546875, + "learning_rate": 6.9020013937123e-06, + "loss": 5.0569, + "step": 73945 + }, + { + "epoch": 1.5045166015625, + "grad_norm": 14.969182968139648, + "learning_rate": 6.901631757159879e-06, + "loss": 4.8283, + "step": 73950 + }, + { + "epoch": 1.5046183268229165, + "grad_norm": 18.542762756347656, + "learning_rate": 6.901262108456636e-06, + "loss": 5.0976, + "step": 73955 + }, + { + "epoch": 1.5047200520833335, + "grad_norm": 16.904573440551758, + "learning_rate": 6.900892447604929e-06, + "loss": 4.7809, + "step": 73960 + }, + { + "epoch": 1.50482177734375, + "grad_norm": 20.634580612182617, + "learning_rate": 6.9005227746071245e-06, + "loss": 4.7022, + "step": 73965 + }, + { + "epoch": 1.5049235026041665, + "grad_norm": 14.00633716583252, + "learning_rate": 6.9001530894655835e-06, + "loss": 4.7641, + "step": 73970 + }, + { + "epoch": 1.5050252278645835, + "grad_norm": 19.09188461303711, + "learning_rate": 6.8997833921826655e-06, + "loss": 5.0561, + "step": 73975 + }, + { + "epoch": 1.505126953125, + "grad_norm": 17.10662841796875, + "learning_rate": 6.899413682760736e-06, + "loss": 4.7764, + "step": 73980 + }, + { + "epoch": 1.5052286783854165, + "grad_norm": 17.027416229248047, + "learning_rate": 6.899043961202155e-06, + "loss": 4.9114, + "step": 73985 + }, + { + "epoch": 1.5053304036458335, + "grad_norm": 16.8598575592041, + "learning_rate": 6.8986742275092865e-06, + "loss": 4.9226, + "step": 73990 + }, + { + "epoch": 1.50543212890625, + "grad_norm": 17.212297439575195, + "learning_rate": 6.898304481684492e-06, + "loss": 4.8934, + "step": 73995 + }, + { + "epoch": 1.5055338541666665, + "grad_norm": 14.031096458435059, + "learning_rate": 6.897934723730135e-06, + "loss": 4.8156, + "step": 74000 + }, + { + "epoch": 1.5056355794270835, + "grad_norm": 17.237773895263672, + "learning_rate": 6.897564953648577e-06, + "loss": 5.2063, + "step": 74005 + }, + { + "epoch": 1.5057373046875, + "grad_norm": 17.06839942932129, + "learning_rate": 6.897195171442181e-06, + "loss": 4.9754, + "step": 74010 + }, + { + "epoch": 1.5058390299479165, + "grad_norm": 14.587942123413086, + "learning_rate": 6.896825377113311e-06, + "loss": 4.8965, + "step": 74015 + }, + { + "epoch": 1.5059407552083335, + "grad_norm": 17.766300201416016, + "learning_rate": 6.896455570664329e-06, + "loss": 4.8655, + "step": 74020 + }, + { + "epoch": 1.50604248046875, + "grad_norm": 17.26036262512207, + "learning_rate": 6.896085752097597e-06, + "loss": 5.0459, + "step": 74025 + }, + { + "epoch": 1.5061442057291665, + "grad_norm": 15.550636291503906, + "learning_rate": 6.89571592141548e-06, + "loss": 5.1178, + "step": 74030 + }, + { + "epoch": 1.5062459309895835, + "grad_norm": 15.607887268066406, + "learning_rate": 6.895346078620341e-06, + "loss": 4.9957, + "step": 74035 + }, + { + "epoch": 1.50634765625, + "grad_norm": 15.432278633117676, + "learning_rate": 6.89497622371454e-06, + "loss": 4.841, + "step": 74040 + }, + { + "epoch": 1.5064493815104165, + "grad_norm": 20.247257232666016, + "learning_rate": 6.894606356700444e-06, + "loss": 4.5577, + "step": 74045 + }, + { + "epoch": 1.5065511067708335, + "grad_norm": 19.661819458007812, + "learning_rate": 6.894236477580414e-06, + "loss": 4.8502, + "step": 74050 + }, + { + "epoch": 1.50665283203125, + "grad_norm": 16.408597946166992, + "learning_rate": 6.893866586356814e-06, + "loss": 4.7649, + "step": 74055 + }, + { + "epoch": 1.5067545572916665, + "grad_norm": 15.62978744506836, + "learning_rate": 6.893496683032009e-06, + "loss": 4.7481, + "step": 74060 + }, + { + "epoch": 1.5068562825520835, + "grad_norm": 18.055139541625977, + "learning_rate": 6.893126767608359e-06, + "loss": 5.0821, + "step": 74065 + }, + { + "epoch": 1.5069580078125, + "grad_norm": 21.060611724853516, + "learning_rate": 6.8927568400882325e-06, + "loss": 4.9446, + "step": 74070 + }, + { + "epoch": 1.5070597330729165, + "grad_norm": 18.321640014648438, + "learning_rate": 6.892386900473988e-06, + "loss": 4.8776, + "step": 74075 + }, + { + "epoch": 1.5071614583333335, + "grad_norm": 13.571856498718262, + "learning_rate": 6.892016948767994e-06, + "loss": 4.7208, + "step": 74080 + }, + { + "epoch": 1.50726318359375, + "grad_norm": 18.749530792236328, + "learning_rate": 6.891646984972613e-06, + "loss": 4.7015, + "step": 74085 + }, + { + "epoch": 1.5073649088541665, + "grad_norm": 19.180788040161133, + "learning_rate": 6.891277009090206e-06, + "loss": 4.8056, + "step": 74090 + }, + { + "epoch": 1.5074666341145835, + "grad_norm": 17.486602783203125, + "learning_rate": 6.890907021123141e-06, + "loss": 4.8696, + "step": 74095 + }, + { + "epoch": 1.507568359375, + "grad_norm": 14.356998443603516, + "learning_rate": 6.890537021073779e-06, + "loss": 4.8888, + "step": 74100 + }, + { + "epoch": 1.5076700846354165, + "grad_norm": 16.72743034362793, + "learning_rate": 6.890167008944486e-06, + "loss": 4.8007, + "step": 74105 + }, + { + "epoch": 1.5077718098958335, + "grad_norm": 15.336177825927734, + "learning_rate": 6.8897969847376265e-06, + "loss": 4.8595, + "step": 74110 + }, + { + "epoch": 1.50787353515625, + "grad_norm": 18.837799072265625, + "learning_rate": 6.8894269484555635e-06, + "loss": 5.0176, + "step": 74115 + }, + { + "epoch": 1.5079752604166665, + "grad_norm": 23.324874877929688, + "learning_rate": 6.889056900100662e-06, + "loss": 5.1548, + "step": 74120 + }, + { + "epoch": 1.5080769856770835, + "grad_norm": 19.623912811279297, + "learning_rate": 6.888686839675287e-06, + "loss": 4.7832, + "step": 74125 + }, + { + "epoch": 1.5081787109375, + "grad_norm": 16.1142635345459, + "learning_rate": 6.888316767181802e-06, + "loss": 5.1021, + "step": 74130 + }, + { + "epoch": 1.5082804361979165, + "grad_norm": 16.177698135375977, + "learning_rate": 6.887946682622573e-06, + "loss": 4.8815, + "step": 74135 + }, + { + "epoch": 1.5083821614583335, + "grad_norm": 14.440707206726074, + "learning_rate": 6.887576585999965e-06, + "loss": 4.8385, + "step": 74140 + }, + { + "epoch": 1.50848388671875, + "grad_norm": 14.868057250976562, + "learning_rate": 6.887206477316339e-06, + "loss": 4.8907, + "step": 74145 + }, + { + "epoch": 1.5085856119791665, + "grad_norm": 20.343172073364258, + "learning_rate": 6.886836356574065e-06, + "loss": 4.8703, + "step": 74150 + }, + { + "epoch": 1.5086873372395835, + "grad_norm": 19.30673599243164, + "learning_rate": 6.8864662237755044e-06, + "loss": 4.9317, + "step": 74155 + }, + { + "epoch": 1.5087890625, + "grad_norm": 19.97107696533203, + "learning_rate": 6.886096078923023e-06, + "loss": 4.9365, + "step": 74160 + }, + { + "epoch": 1.5088907877604165, + "grad_norm": 17.866798400878906, + "learning_rate": 6.885725922018988e-06, + "loss": 5.095, + "step": 74165 + }, + { + "epoch": 1.5089925130208335, + "grad_norm": 19.961965560913086, + "learning_rate": 6.885355753065764e-06, + "loss": 4.9125, + "step": 74170 + }, + { + "epoch": 1.50909423828125, + "grad_norm": 26.87385368347168, + "learning_rate": 6.884985572065712e-06, + "loss": 5.1281, + "step": 74175 + }, + { + "epoch": 1.5091959635416665, + "grad_norm": 19.045589447021484, + "learning_rate": 6.884615379021202e-06, + "loss": 4.6171, + "step": 74180 + }, + { + "epoch": 1.5092976888020835, + "grad_norm": 17.736597061157227, + "learning_rate": 6.884245173934599e-06, + "loss": 4.8161, + "step": 74185 + }, + { + "epoch": 1.5093994140625, + "grad_norm": 34.135013580322266, + "learning_rate": 6.883874956808267e-06, + "loss": 4.9155, + "step": 74190 + }, + { + "epoch": 1.5095011393229165, + "grad_norm": 16.53053092956543, + "learning_rate": 6.883504727644571e-06, + "loss": 4.8681, + "step": 74195 + }, + { + "epoch": 1.5096028645833335, + "grad_norm": 16.61832046508789, + "learning_rate": 6.883134486445879e-06, + "loss": 5.2516, + "step": 74200 + }, + { + "epoch": 1.50970458984375, + "grad_norm": 15.762777328491211, + "learning_rate": 6.882764233214554e-06, + "loss": 4.9816, + "step": 74205 + }, + { + "epoch": 1.5098063151041665, + "grad_norm": 14.181014060974121, + "learning_rate": 6.882393967952965e-06, + "loss": 4.7649, + "step": 74210 + }, + { + "epoch": 1.5099080403645835, + "grad_norm": 19.78288459777832, + "learning_rate": 6.8820236906634755e-06, + "loss": 4.8015, + "step": 74215 + }, + { + "epoch": 1.510009765625, + "grad_norm": 15.070982933044434, + "learning_rate": 6.88165340134845e-06, + "loss": 4.9495, + "step": 74220 + }, + { + "epoch": 1.5101114908854165, + "grad_norm": 17.464868545532227, + "learning_rate": 6.881283100010259e-06, + "loss": 5.1985, + "step": 74225 + }, + { + "epoch": 1.5102132161458335, + "grad_norm": 17.333412170410156, + "learning_rate": 6.880912786651265e-06, + "loss": 4.8622, + "step": 74230 + }, + { + "epoch": 1.51031494140625, + "grad_norm": 19.05852508544922, + "learning_rate": 6.880542461273836e-06, + "loss": 5.0537, + "step": 74235 + }, + { + "epoch": 1.5104166666666665, + "grad_norm": 16.081098556518555, + "learning_rate": 6.880172123880338e-06, + "loss": 4.8017, + "step": 74240 + }, + { + "epoch": 1.5105183919270835, + "grad_norm": 16.570484161376953, + "learning_rate": 6.879801774473136e-06, + "loss": 4.7805, + "step": 74245 + }, + { + "epoch": 1.5106201171875, + "grad_norm": 21.671247482299805, + "learning_rate": 6.879431413054598e-06, + "loss": 4.8716, + "step": 74250 + }, + { + "epoch": 1.5107218424479165, + "grad_norm": 19.248291015625, + "learning_rate": 6.87906103962709e-06, + "loss": 5.3431, + "step": 74255 + }, + { + "epoch": 1.5108235677083335, + "grad_norm": 14.63965892791748, + "learning_rate": 6.878690654192978e-06, + "loss": 5.162, + "step": 74260 + }, + { + "epoch": 1.51092529296875, + "grad_norm": 16.335676193237305, + "learning_rate": 6.878320256754629e-06, + "loss": 4.8184, + "step": 74265 + }, + { + "epoch": 1.5110270182291665, + "grad_norm": 18.30893325805664, + "learning_rate": 6.87794984731441e-06, + "loss": 4.8809, + "step": 74270 + }, + { + "epoch": 1.5111287434895835, + "grad_norm": 14.05101490020752, + "learning_rate": 6.877579425874687e-06, + "loss": 4.7529, + "step": 74275 + }, + { + "epoch": 1.51123046875, + "grad_norm": 18.684484481811523, + "learning_rate": 6.877208992437829e-06, + "loss": 4.7675, + "step": 74280 + }, + { + "epoch": 1.5113321940104165, + "grad_norm": 26.152034759521484, + "learning_rate": 6.8768385470061996e-06, + "loss": 5.2382, + "step": 74285 + }, + { + "epoch": 1.5114339192708335, + "grad_norm": 23.904584884643555, + "learning_rate": 6.876468089582169e-06, + "loss": 4.8681, + "step": 74290 + }, + { + "epoch": 1.51153564453125, + "grad_norm": 16.702016830444336, + "learning_rate": 6.8760976201681025e-06, + "loss": 4.9135, + "step": 74295 + }, + { + "epoch": 1.5116373697916665, + "grad_norm": 16.370603561401367, + "learning_rate": 6.875727138766368e-06, + "loss": 4.8829, + "step": 74300 + }, + { + "epoch": 1.5117390950520835, + "grad_norm": 19.79442024230957, + "learning_rate": 6.875356645379333e-06, + "loss": 5.0304, + "step": 74305 + }, + { + "epoch": 1.5118408203125, + "grad_norm": 20.2983341217041, + "learning_rate": 6.8749861400093635e-06, + "loss": 4.9349, + "step": 74310 + }, + { + "epoch": 1.5119425455729165, + "grad_norm": 21.81562614440918, + "learning_rate": 6.874615622658829e-06, + "loss": 5.0894, + "step": 74315 + }, + { + "epoch": 1.5120442708333335, + "grad_norm": 20.45145606994629, + "learning_rate": 6.874245093330094e-06, + "loss": 5.2727, + "step": 74320 + }, + { + "epoch": 1.51214599609375, + "grad_norm": 15.87985897064209, + "learning_rate": 6.873874552025529e-06, + "loss": 4.489, + "step": 74325 + }, + { + "epoch": 1.5122477213541665, + "grad_norm": 14.526932716369629, + "learning_rate": 6.8735039987475005e-06, + "loss": 4.8361, + "step": 74330 + }, + { + "epoch": 1.5123494466145835, + "grad_norm": 15.17785358428955, + "learning_rate": 6.873133433498375e-06, + "loss": 4.9642, + "step": 74335 + }, + { + "epoch": 1.512451171875, + "grad_norm": 17.22098159790039, + "learning_rate": 6.8727628562805225e-06, + "loss": 4.7795, + "step": 74340 + }, + { + "epoch": 1.5125528971354165, + "grad_norm": 15.232708930969238, + "learning_rate": 6.872392267096311e-06, + "loss": 5.0958, + "step": 74345 + }, + { + "epoch": 1.5126546223958335, + "grad_norm": 18.817222595214844, + "learning_rate": 6.872021665948104e-06, + "loss": 4.8413, + "step": 74350 + }, + { + "epoch": 1.51275634765625, + "grad_norm": 27.880521774291992, + "learning_rate": 6.871651052838275e-06, + "loss": 5.1798, + "step": 74355 + }, + { + "epoch": 1.5128580729166665, + "grad_norm": 17.974409103393555, + "learning_rate": 6.871280427769189e-06, + "loss": 4.9267, + "step": 74360 + }, + { + "epoch": 1.5129597981770835, + "grad_norm": 22.7706298828125, + "learning_rate": 6.870909790743216e-06, + "loss": 5.1392, + "step": 74365 + }, + { + "epoch": 1.5130615234375, + "grad_norm": 16.104259490966797, + "learning_rate": 6.8705391417627245e-06, + "loss": 4.6333, + "step": 74370 + }, + { + "epoch": 1.5131632486979165, + "grad_norm": 15.973647117614746, + "learning_rate": 6.8701684808300795e-06, + "loss": 4.6991, + "step": 74375 + }, + { + "epoch": 1.5132649739583335, + "grad_norm": 15.112855911254883, + "learning_rate": 6.869797807947652e-06, + "loss": 4.7928, + "step": 74380 + }, + { + "epoch": 1.51336669921875, + "grad_norm": 12.41349983215332, + "learning_rate": 6.86942712311781e-06, + "loss": 5.0686, + "step": 74385 + }, + { + "epoch": 1.5134684244791665, + "grad_norm": 21.734882354736328, + "learning_rate": 6.869056426342922e-06, + "loss": 4.8666, + "step": 74390 + }, + { + "epoch": 1.5135701497395835, + "grad_norm": 15.870635032653809, + "learning_rate": 6.868685717625358e-06, + "loss": 5.3126, + "step": 74395 + }, + { + "epoch": 1.513671875, + "grad_norm": 22.823556900024414, + "learning_rate": 6.868314996967484e-06, + "loss": 5.0481, + "step": 74400 + }, + { + "epoch": 1.5137736002604165, + "grad_norm": 15.048409461975098, + "learning_rate": 6.867944264371671e-06, + "loss": 4.9888, + "step": 74405 + }, + { + "epoch": 1.5138753255208335, + "grad_norm": 19.50459861755371, + "learning_rate": 6.8675735198402875e-06, + "loss": 5.1262, + "step": 74410 + }, + { + "epoch": 1.51397705078125, + "grad_norm": 18.828794479370117, + "learning_rate": 6.867202763375701e-06, + "loss": 4.8178, + "step": 74415 + }, + { + "epoch": 1.5140787760416665, + "grad_norm": 14.521666526794434, + "learning_rate": 6.866831994980283e-06, + "loss": 4.8001, + "step": 74420 + }, + { + "epoch": 1.5141805013020835, + "grad_norm": 16.852426528930664, + "learning_rate": 6.866461214656399e-06, + "loss": 4.9005, + "step": 74425 + }, + { + "epoch": 1.5142822265625, + "grad_norm": 12.8623628616333, + "learning_rate": 6.866090422406423e-06, + "loss": 4.77, + "step": 74430 + }, + { + "epoch": 1.5143839518229165, + "grad_norm": 15.720499038696289, + "learning_rate": 6.86571961823272e-06, + "loss": 4.9124, + "step": 74435 + }, + { + "epoch": 1.5144856770833335, + "grad_norm": 19.025354385375977, + "learning_rate": 6.8653488021376614e-06, + "loss": 5.1599, + "step": 74440 + }, + { + "epoch": 1.51458740234375, + "grad_norm": 16.159818649291992, + "learning_rate": 6.864977974123615e-06, + "loss": 5.0849, + "step": 74445 + }, + { + "epoch": 1.5146891276041665, + "grad_norm": 17.4603271484375, + "learning_rate": 6.864607134192952e-06, + "loss": 5.0672, + "step": 74450 + }, + { + "epoch": 1.5147908528645835, + "grad_norm": 14.63106632232666, + "learning_rate": 6.864236282348042e-06, + "loss": 5.4531, + "step": 74455 + }, + { + "epoch": 1.514892578125, + "grad_norm": 18.935562133789062, + "learning_rate": 6.863865418591255e-06, + "loss": 4.6206, + "step": 74460 + }, + { + "epoch": 1.5149943033854165, + "grad_norm": 16.7497501373291, + "learning_rate": 6.863494542924957e-06, + "loss": 4.9463, + "step": 74465 + }, + { + "epoch": 1.5150960286458335, + "grad_norm": 18.012216567993164, + "learning_rate": 6.863123655351522e-06, + "loss": 5.0446, + "step": 74470 + }, + { + "epoch": 1.51519775390625, + "grad_norm": 16.654844284057617, + "learning_rate": 6.8627527558733185e-06, + "loss": 4.9987, + "step": 74475 + }, + { + "epoch": 1.5152994791666665, + "grad_norm": 14.498893737792969, + "learning_rate": 6.862381844492715e-06, + "loss": 5.0284, + "step": 74480 + }, + { + "epoch": 1.5154012044270835, + "grad_norm": 20.686126708984375, + "learning_rate": 6.862010921212083e-06, + "loss": 4.6611, + "step": 74485 + }, + { + "epoch": 1.5155029296875, + "grad_norm": 14.006009101867676, + "learning_rate": 6.861639986033792e-06, + "loss": 4.7042, + "step": 74490 + }, + { + "epoch": 1.5156046549479165, + "grad_norm": 21.208833694458008, + "learning_rate": 6.861269038960215e-06, + "loss": 4.7867, + "step": 74495 + }, + { + "epoch": 1.5157063802083335, + "grad_norm": 15.92988395690918, + "learning_rate": 6.860898079993716e-06, + "loss": 4.8023, + "step": 74500 + }, + { + "epoch": 1.51580810546875, + "grad_norm": 19.18795394897461, + "learning_rate": 6.860527109136671e-06, + "loss": 4.8112, + "step": 74505 + }, + { + "epoch": 1.5159098307291665, + "grad_norm": 24.472314834594727, + "learning_rate": 6.860156126391448e-06, + "loss": 4.8497, + "step": 74510 + }, + { + "epoch": 1.5160115559895835, + "grad_norm": 15.701223373413086, + "learning_rate": 6.859785131760418e-06, + "loss": 4.7814, + "step": 74515 + }, + { + "epoch": 1.51611328125, + "grad_norm": 52.51200866699219, + "learning_rate": 6.8594141252459515e-06, + "loss": 5.0973, + "step": 74520 + }, + { + "epoch": 1.5162150065104165, + "grad_norm": 20.951488494873047, + "learning_rate": 6.8590431068504196e-06, + "loss": 5.0601, + "step": 74525 + }, + { + "epoch": 1.5163167317708335, + "grad_norm": 18.989656448364258, + "learning_rate": 6.858672076576191e-06, + "loss": 4.971, + "step": 74530 + }, + { + "epoch": 1.51641845703125, + "grad_norm": 13.969698905944824, + "learning_rate": 6.8583010344256385e-06, + "loss": 4.9851, + "step": 74535 + }, + { + "epoch": 1.5165201822916665, + "grad_norm": 19.799562454223633, + "learning_rate": 6.857929980401131e-06, + "loss": 5.1485, + "step": 74540 + }, + { + "epoch": 1.5166219075520835, + "grad_norm": 20.41676139831543, + "learning_rate": 6.857558914505043e-06, + "loss": 5.0997, + "step": 74545 + }, + { + "epoch": 1.5167236328125, + "grad_norm": 16.784210205078125, + "learning_rate": 6.8571878367397415e-06, + "loss": 5.0054, + "step": 74550 + }, + { + "epoch": 1.5168253580729165, + "grad_norm": 14.907124519348145, + "learning_rate": 6.8568167471075995e-06, + "loss": 5.1533, + "step": 74555 + }, + { + "epoch": 1.5169270833333335, + "grad_norm": 19.483596801757812, + "learning_rate": 6.856445645610988e-06, + "loss": 5.0634, + "step": 74560 + }, + { + "epoch": 1.51702880859375, + "grad_norm": 17.842815399169922, + "learning_rate": 6.856074532252279e-06, + "loss": 4.9085, + "step": 74565 + }, + { + "epoch": 1.5171305338541665, + "grad_norm": 14.85669994354248, + "learning_rate": 6.855703407033843e-06, + "loss": 5.0108, + "step": 74570 + }, + { + "epoch": 1.5172322591145835, + "grad_norm": 15.606411933898926, + "learning_rate": 6.8553322699580505e-06, + "loss": 4.8571, + "step": 74575 + }, + { + "epoch": 1.517333984375, + "grad_norm": 15.000596046447754, + "learning_rate": 6.854961121027274e-06, + "loss": 4.9798, + "step": 74580 + }, + { + "epoch": 1.5174357096354165, + "grad_norm": 16.97732925415039, + "learning_rate": 6.854589960243885e-06, + "loss": 4.8122, + "step": 74585 + }, + { + "epoch": 1.5175374348958335, + "grad_norm": 17.558860778808594, + "learning_rate": 6.854218787610255e-06, + "loss": 5.0493, + "step": 74590 + }, + { + "epoch": 1.51763916015625, + "grad_norm": 19.873838424682617, + "learning_rate": 6.853847603128754e-06, + "loss": 4.7053, + "step": 74595 + }, + { + "epoch": 1.5177408854166665, + "grad_norm": 16.32404327392578, + "learning_rate": 6.853476406801758e-06, + "loss": 4.8319, + "step": 74600 + }, + { + "epoch": 1.5178426106770835, + "grad_norm": 22.82706642150879, + "learning_rate": 6.853105198631633e-06, + "loss": 4.915, + "step": 74605 + }, + { + "epoch": 1.5179443359375, + "grad_norm": 16.642507553100586, + "learning_rate": 6.8527339786207556e-06, + "loss": 4.8956, + "step": 74610 + }, + { + "epoch": 1.5180460611979165, + "grad_norm": 17.910642623901367, + "learning_rate": 6.852362746771497e-06, + "loss": 4.9884, + "step": 74615 + }, + { + "epoch": 1.5181477864583335, + "grad_norm": 13.643001556396484, + "learning_rate": 6.851991503086228e-06, + "loss": 4.8279, + "step": 74620 + }, + { + "epoch": 1.51824951171875, + "grad_norm": 19.279619216918945, + "learning_rate": 6.851620247567321e-06, + "loss": 5.0351, + "step": 74625 + }, + { + "epoch": 1.5183512369791665, + "grad_norm": 32.09348678588867, + "learning_rate": 6.851248980217149e-06, + "loss": 5.0027, + "step": 74630 + }, + { + "epoch": 1.5184529622395835, + "grad_norm": 17.597854614257812, + "learning_rate": 6.850877701038082e-06, + "loss": 4.8789, + "step": 74635 + }, + { + "epoch": 1.5185546875, + "grad_norm": 18.803497314453125, + "learning_rate": 6.850506410032496e-06, + "loss": 4.8218, + "step": 74640 + }, + { + "epoch": 1.5186564127604165, + "grad_norm": 15.549327850341797, + "learning_rate": 6.85013510720276e-06, + "loss": 5.0452, + "step": 74645 + }, + { + "epoch": 1.5187581380208335, + "grad_norm": 17.79375457763672, + "learning_rate": 6.849763792551248e-06, + "loss": 4.8794, + "step": 74650 + }, + { + "epoch": 1.51885986328125, + "grad_norm": 21.53700065612793, + "learning_rate": 6.849392466080334e-06, + "loss": 4.9169, + "step": 74655 + }, + { + "epoch": 1.5189615885416665, + "grad_norm": 18.504079818725586, + "learning_rate": 6.8490211277923875e-06, + "loss": 4.8406, + "step": 74660 + }, + { + "epoch": 1.5190633138020835, + "grad_norm": 17.991304397583008, + "learning_rate": 6.848649777689784e-06, + "loss": 4.7801, + "step": 74665 + }, + { + "epoch": 1.5191650390625, + "grad_norm": 19.92142105102539, + "learning_rate": 6.848278415774895e-06, + "loss": 4.9629, + "step": 74670 + }, + { + "epoch": 1.5192667643229165, + "grad_norm": 14.327018737792969, + "learning_rate": 6.8479070420500936e-06, + "loss": 4.8471, + "step": 74675 + }, + { + "epoch": 1.5193684895833335, + "grad_norm": 20.605379104614258, + "learning_rate": 6.847535656517753e-06, + "loss": 5.06, + "step": 74680 + }, + { + "epoch": 1.51947021484375, + "grad_norm": 11.8859281539917, + "learning_rate": 6.8471642591802455e-06, + "loss": 4.6358, + "step": 74685 + }, + { + "epoch": 1.5195719401041665, + "grad_norm": 20.92430877685547, + "learning_rate": 6.846792850039945e-06, + "loss": 5.057, + "step": 74690 + }, + { + "epoch": 1.5196736653645835, + "grad_norm": 16.67367172241211, + "learning_rate": 6.846421429099225e-06, + "loss": 5.0507, + "step": 74695 + }, + { + "epoch": 1.519775390625, + "grad_norm": 17.117887496948242, + "learning_rate": 6.846049996360457e-06, + "loss": 5.2085, + "step": 74700 + }, + { + "epoch": 1.5198771158854165, + "grad_norm": 23.030977249145508, + "learning_rate": 6.845678551826017e-06, + "loss": 5.2215, + "step": 74705 + }, + { + "epoch": 1.5199788411458335, + "grad_norm": 13.304411888122559, + "learning_rate": 6.845307095498276e-06, + "loss": 5.1587, + "step": 74710 + }, + { + "epoch": 1.52008056640625, + "grad_norm": 24.080778121948242, + "learning_rate": 6.844935627379609e-06, + "loss": 4.7321, + "step": 74715 + }, + { + "epoch": 1.5201822916666665, + "grad_norm": 16.62262725830078, + "learning_rate": 6.844564147472389e-06, + "loss": 4.7656, + "step": 74720 + }, + { + "epoch": 1.5202840169270835, + "grad_norm": 22.78026580810547, + "learning_rate": 6.84419265577899e-06, + "loss": 4.797, + "step": 74725 + }, + { + "epoch": 1.5203857421875, + "grad_norm": 55.14432907104492, + "learning_rate": 6.8438211523017835e-06, + "loss": 4.9372, + "step": 74730 + }, + { + "epoch": 1.5204874674479165, + "grad_norm": 16.710887908935547, + "learning_rate": 6.843449637043147e-06, + "loss": 4.6053, + "step": 74735 + }, + { + "epoch": 1.5205891927083335, + "grad_norm": 22.6101131439209, + "learning_rate": 6.8430781100054525e-06, + "loss": 5.0437, + "step": 74740 + }, + { + "epoch": 1.52069091796875, + "grad_norm": 15.125260353088379, + "learning_rate": 6.842706571191074e-06, + "loss": 4.9218, + "step": 74745 + }, + { + "epoch": 1.5207926432291665, + "grad_norm": 18.403335571289062, + "learning_rate": 6.842335020602383e-06, + "loss": 4.7733, + "step": 74750 + }, + { + "epoch": 1.5208943684895835, + "grad_norm": 24.32311248779297, + "learning_rate": 6.8419634582417585e-06, + "loss": 4.971, + "step": 74755 + }, + { + "epoch": 1.52099609375, + "grad_norm": 18.46474838256836, + "learning_rate": 6.841591884111572e-06, + "loss": 4.7882, + "step": 74760 + }, + { + "epoch": 1.5210978190104165, + "grad_norm": 15.579151153564453, + "learning_rate": 6.841220298214197e-06, + "loss": 4.8764, + "step": 74765 + }, + { + "epoch": 1.5211995442708335, + "grad_norm": 17.25452423095703, + "learning_rate": 6.8408487005520095e-06, + "loss": 4.9694, + "step": 74770 + }, + { + "epoch": 1.52130126953125, + "grad_norm": 19.943723678588867, + "learning_rate": 6.840477091127382e-06, + "loss": 5.016, + "step": 74775 + }, + { + "epoch": 1.5214029947916665, + "grad_norm": 15.19292163848877, + "learning_rate": 6.840105469942692e-06, + "loss": 4.8076, + "step": 74780 + }, + { + "epoch": 1.5215047200520835, + "grad_norm": 17.465797424316406, + "learning_rate": 6.839733837000311e-06, + "loss": 5.0314, + "step": 74785 + }, + { + "epoch": 1.5216064453125, + "grad_norm": 15.745357513427734, + "learning_rate": 6.8393621923026146e-06, + "loss": 4.8568, + "step": 74790 + }, + { + "epoch": 1.5217081705729165, + "grad_norm": 19.45064353942871, + "learning_rate": 6.838990535851979e-06, + "loss": 4.5943, + "step": 74795 + }, + { + "epoch": 1.5218098958333335, + "grad_norm": 18.643447875976562, + "learning_rate": 6.838618867650776e-06, + "loss": 4.6917, + "step": 74800 + }, + { + "epoch": 1.52191162109375, + "grad_norm": 21.712100982666016, + "learning_rate": 6.838247187701383e-06, + "loss": 4.9632, + "step": 74805 + }, + { + "epoch": 1.5220133463541665, + "grad_norm": 23.092164993286133, + "learning_rate": 6.837875496006174e-06, + "loss": 4.9859, + "step": 74810 + }, + { + "epoch": 1.5221150716145835, + "grad_norm": 18.783910751342773, + "learning_rate": 6.8375037925675235e-06, + "loss": 4.8103, + "step": 74815 + }, + { + "epoch": 1.522216796875, + "grad_norm": 19.06719207763672, + "learning_rate": 6.837132077387807e-06, + "loss": 4.8822, + "step": 74820 + }, + { + "epoch": 1.5223185221354165, + "grad_norm": 16.5234432220459, + "learning_rate": 6.8367603504694e-06, + "loss": 4.7474, + "step": 74825 + }, + { + "epoch": 1.5224202473958335, + "grad_norm": 20.80901336669922, + "learning_rate": 6.836388611814679e-06, + "loss": 4.9521, + "step": 74830 + }, + { + "epoch": 1.52252197265625, + "grad_norm": 22.027729034423828, + "learning_rate": 6.8360168614260156e-06, + "loss": 4.8722, + "step": 74835 + }, + { + "epoch": 1.5226236979166665, + "grad_norm": 18.07360076904297, + "learning_rate": 6.835645099305788e-06, + "loss": 5.0029, + "step": 74840 + }, + { + "epoch": 1.5227254231770835, + "grad_norm": 16.668657302856445, + "learning_rate": 6.8352733254563705e-06, + "loss": 4.8957, + "step": 74845 + }, + { + "epoch": 1.5228271484375, + "grad_norm": 16.573081970214844, + "learning_rate": 6.834901539880141e-06, + "loss": 5.14, + "step": 74850 + }, + { + "epoch": 1.5229288736979165, + "grad_norm": 12.937106132507324, + "learning_rate": 6.83452974257947e-06, + "loss": 4.7707, + "step": 74855 + }, + { + "epoch": 1.5230305989583335, + "grad_norm": 15.63602352142334, + "learning_rate": 6.834157933556739e-06, + "loss": 4.727, + "step": 74860 + }, + { + "epoch": 1.52313232421875, + "grad_norm": 16.943330764770508, + "learning_rate": 6.833786112814319e-06, + "loss": 4.9461, + "step": 74865 + }, + { + "epoch": 1.5232340494791665, + "grad_norm": 23.37097930908203, + "learning_rate": 6.833414280354589e-06, + "loss": 5.0729, + "step": 74870 + }, + { + "epoch": 1.5233357747395835, + "grad_norm": 22.600631713867188, + "learning_rate": 6.833042436179924e-06, + "loss": 5.2816, + "step": 74875 + }, + { + "epoch": 1.5234375, + "grad_norm": 20.346176147460938, + "learning_rate": 6.832670580292699e-06, + "loss": 4.9441, + "step": 74880 + }, + { + "epoch": 1.5235392252604165, + "grad_norm": 18.71855926513672, + "learning_rate": 6.8322987126952914e-06, + "loss": 4.8333, + "step": 74885 + }, + { + "epoch": 1.5236409505208335, + "grad_norm": 18.328031539916992, + "learning_rate": 6.831926833390076e-06, + "loss": 4.722, + "step": 74890 + }, + { + "epoch": 1.52374267578125, + "grad_norm": 13.694019317626953, + "learning_rate": 6.83155494237943e-06, + "loss": 4.8123, + "step": 74895 + }, + { + "epoch": 1.5238444010416665, + "grad_norm": 15.403268814086914, + "learning_rate": 6.83118303966573e-06, + "loss": 4.8732, + "step": 74900 + }, + { + "epoch": 1.5239461263020835, + "grad_norm": 16.912647247314453, + "learning_rate": 6.830811125251351e-06, + "loss": 5.0148, + "step": 74905 + }, + { + "epoch": 1.5240478515625, + "grad_norm": 18.969451904296875, + "learning_rate": 6.83043919913867e-06, + "loss": 4.8794, + "step": 74910 + }, + { + "epoch": 1.5241495768229165, + "grad_norm": 16.768346786499023, + "learning_rate": 6.830067261330065e-06, + "loss": 4.9422, + "step": 74915 + }, + { + "epoch": 1.5242513020833335, + "grad_norm": 20.503585815429688, + "learning_rate": 6.829695311827908e-06, + "loss": 4.8519, + "step": 74920 + }, + { + "epoch": 1.52435302734375, + "grad_norm": 18.31321144104004, + "learning_rate": 6.829323350634581e-06, + "loss": 4.679, + "step": 74925 + }, + { + "epoch": 1.5244547526041665, + "grad_norm": 20.29840087890625, + "learning_rate": 6.828951377752458e-06, + "loss": 4.8966, + "step": 74930 + }, + { + "epoch": 1.5245564778645835, + "grad_norm": 17.204208374023438, + "learning_rate": 6.8285793931839165e-06, + "loss": 4.7946, + "step": 74935 + }, + { + "epoch": 1.524658203125, + "grad_norm": 15.386311531066895, + "learning_rate": 6.8282073969313325e-06, + "loss": 5.0103, + "step": 74940 + }, + { + "epoch": 1.5247599283854165, + "grad_norm": 17.35848045349121, + "learning_rate": 6.827835388997084e-06, + "loss": 4.7065, + "step": 74945 + }, + { + "epoch": 1.5248616536458335, + "grad_norm": 19.23098373413086, + "learning_rate": 6.827463369383548e-06, + "loss": 4.8396, + "step": 74950 + }, + { + "epoch": 1.52496337890625, + "grad_norm": 16.94078254699707, + "learning_rate": 6.827091338093099e-06, + "loss": 4.8016, + "step": 74955 + }, + { + "epoch": 1.5250651041666665, + "grad_norm": 18.141305923461914, + "learning_rate": 6.826719295128119e-06, + "loss": 4.9495, + "step": 74960 + }, + { + "epoch": 1.5251668294270835, + "grad_norm": 14.596761703491211, + "learning_rate": 6.826347240490982e-06, + "loss": 4.9664, + "step": 74965 + }, + { + "epoch": 1.5252685546875, + "grad_norm": 19.19036293029785, + "learning_rate": 6.825975174184065e-06, + "loss": 4.7405, + "step": 74970 + }, + { + "epoch": 1.5253702799479165, + "grad_norm": 13.339612007141113, + "learning_rate": 6.825603096209749e-06, + "loss": 4.8689, + "step": 74975 + }, + { + "epoch": 1.5254720052083335, + "grad_norm": 14.355705261230469, + "learning_rate": 6.825231006570405e-06, + "loss": 4.9945, + "step": 74980 + }, + { + "epoch": 1.52557373046875, + "grad_norm": 24.795866012573242, + "learning_rate": 6.824858905268416e-06, + "loss": 4.8078, + "step": 74985 + }, + { + "epoch": 1.5256754557291665, + "grad_norm": 19.47315216064453, + "learning_rate": 6.82448679230616e-06, + "loss": 4.8811, + "step": 74990 + }, + { + "epoch": 1.5257771809895835, + "grad_norm": 18.559551239013672, + "learning_rate": 6.824114667686009e-06, + "loss": 4.9209, + "step": 74995 + }, + { + "epoch": 1.52587890625, + "grad_norm": 14.820694923400879, + "learning_rate": 6.823742531410347e-06, + "loss": 5.0623, + "step": 75000 + }, + { + "epoch": 1.5259806315104165, + "grad_norm": 15.422422409057617, + "learning_rate": 6.823370383481548e-06, + "loss": 4.824, + "step": 75005 + }, + { + "epoch": 1.5260823567708335, + "grad_norm": 21.03640365600586, + "learning_rate": 6.82299822390199e-06, + "loss": 5.2397, + "step": 75010 + }, + { + "epoch": 1.52618408203125, + "grad_norm": 15.687496185302734, + "learning_rate": 6.8226260526740525e-06, + "loss": 4.7471, + "step": 75015 + }, + { + "epoch": 1.5262858072916665, + "grad_norm": 18.89000701904297, + "learning_rate": 6.822253869800114e-06, + "loss": 4.5748, + "step": 75020 + }, + { + "epoch": 1.5263875325520835, + "grad_norm": 15.529324531555176, + "learning_rate": 6.821881675282551e-06, + "loss": 4.9134, + "step": 75025 + }, + { + "epoch": 1.5264892578125, + "grad_norm": 16.13465690612793, + "learning_rate": 6.821509469123744e-06, + "loss": 4.8149, + "step": 75030 + }, + { + "epoch": 1.5265909830729165, + "grad_norm": 18.9518985748291, + "learning_rate": 6.821137251326067e-06, + "loss": 5.2734, + "step": 75035 + }, + { + "epoch": 1.5266927083333335, + "grad_norm": 20.00338363647461, + "learning_rate": 6.8207650218919046e-06, + "loss": 4.9121, + "step": 75040 + }, + { + "epoch": 1.52679443359375, + "grad_norm": 50.835777282714844, + "learning_rate": 6.820392780823628e-06, + "loss": 5.0021, + "step": 75045 + }, + { + "epoch": 1.5268961588541665, + "grad_norm": 16.03130340576172, + "learning_rate": 6.820020528123622e-06, + "loss": 4.9303, + "step": 75050 + }, + { + "epoch": 1.5269978841145835, + "grad_norm": 19.770444869995117, + "learning_rate": 6.819648263794262e-06, + "loss": 4.9065, + "step": 75055 + }, + { + "epoch": 1.527099609375, + "grad_norm": 15.465803146362305, + "learning_rate": 6.819275987837927e-06, + "loss": 4.8511, + "step": 75060 + }, + { + "epoch": 1.5272013346354165, + "grad_norm": 15.42260456085205, + "learning_rate": 6.818903700256996e-06, + "loss": 4.7836, + "step": 75065 + }, + { + "epoch": 1.5273030598958335, + "grad_norm": 20.630596160888672, + "learning_rate": 6.8185314010538475e-06, + "loss": 5.2533, + "step": 75070 + }, + { + "epoch": 1.52740478515625, + "grad_norm": 21.04111671447754, + "learning_rate": 6.81815909023086e-06, + "loss": 4.9417, + "step": 75075 + }, + { + "epoch": 1.5275065104166665, + "grad_norm": 23.59731101989746, + "learning_rate": 6.817786767790413e-06, + "loss": 5.1051, + "step": 75080 + }, + { + "epoch": 1.5276082356770835, + "grad_norm": 18.24223518371582, + "learning_rate": 6.817414433734888e-06, + "loss": 5.0338, + "step": 75085 + }, + { + "epoch": 1.5277099609375, + "grad_norm": 15.395711898803711, + "learning_rate": 6.8170420880666584e-06, + "loss": 4.8249, + "step": 75090 + }, + { + "epoch": 1.5278116861979165, + "grad_norm": 17.689334869384766, + "learning_rate": 6.8166697307881095e-06, + "loss": 4.7521, + "step": 75095 + }, + { + "epoch": 1.5279134114583335, + "grad_norm": 14.700702667236328, + "learning_rate": 6.816297361901616e-06, + "loss": 4.7901, + "step": 75100 + }, + { + "epoch": 1.52801513671875, + "grad_norm": 16.214500427246094, + "learning_rate": 6.815924981409559e-06, + "loss": 5.0756, + "step": 75105 + }, + { + "epoch": 1.5281168619791665, + "grad_norm": 17.977367401123047, + "learning_rate": 6.815552589314318e-06, + "loss": 4.7982, + "step": 75110 + }, + { + "epoch": 1.5282185872395835, + "grad_norm": 14.3431978225708, + "learning_rate": 6.815180185618273e-06, + "loss": 4.8762, + "step": 75115 + }, + { + "epoch": 1.5283203125, + "grad_norm": 16.813859939575195, + "learning_rate": 6.814807770323803e-06, + "loss": 4.7769, + "step": 75120 + }, + { + "epoch": 1.5284220377604165, + "grad_norm": 15.01757526397705, + "learning_rate": 6.8144353434332865e-06, + "loss": 4.822, + "step": 75125 + }, + { + "epoch": 1.5285237630208335, + "grad_norm": 16.73737907409668, + "learning_rate": 6.8140629049491045e-06, + "loss": 4.8487, + "step": 75130 + }, + { + "epoch": 1.52862548828125, + "grad_norm": 16.38226318359375, + "learning_rate": 6.813690454873637e-06, + "loss": 4.9729, + "step": 75135 + }, + { + "epoch": 1.5287272135416665, + "grad_norm": 19.838403701782227, + "learning_rate": 6.813317993209263e-06, + "loss": 5.2059, + "step": 75140 + }, + { + "epoch": 1.5288289388020835, + "grad_norm": 22.942665100097656, + "learning_rate": 6.812945519958363e-06, + "loss": 4.7975, + "step": 75145 + }, + { + "epoch": 1.5289306640625, + "grad_norm": 19.074583053588867, + "learning_rate": 6.812573035123315e-06, + "loss": 4.5875, + "step": 75150 + }, + { + "epoch": 1.5290323893229165, + "grad_norm": 20.02288055419922, + "learning_rate": 6.8122005387065015e-06, + "loss": 4.9269, + "step": 75155 + }, + { + "epoch": 1.5291341145833335, + "grad_norm": 32.6577033996582, + "learning_rate": 6.811828030710302e-06, + "loss": 5.218, + "step": 75160 + }, + { + "epoch": 1.52923583984375, + "grad_norm": 17.14379119873047, + "learning_rate": 6.8114555111370965e-06, + "loss": 5.1703, + "step": 75165 + }, + { + "epoch": 1.5293375651041665, + "grad_norm": 15.462634086608887, + "learning_rate": 6.811082979989265e-06, + "loss": 4.9434, + "step": 75170 + }, + { + "epoch": 1.5294392903645835, + "grad_norm": 18.38283348083496, + "learning_rate": 6.810710437269188e-06, + "loss": 5.0935, + "step": 75175 + }, + { + "epoch": 1.529541015625, + "grad_norm": 17.43380355834961, + "learning_rate": 6.810337882979248e-06, + "loss": 4.9266, + "step": 75180 + }, + { + "epoch": 1.5296427408854165, + "grad_norm": 21.507129669189453, + "learning_rate": 6.809965317121823e-06, + "loss": 5.1774, + "step": 75185 + }, + { + "epoch": 1.5297444661458335, + "grad_norm": 18.353710174560547, + "learning_rate": 6.809592739699293e-06, + "loss": 5.0626, + "step": 75190 + }, + { + "epoch": 1.52984619140625, + "grad_norm": 21.54132652282715, + "learning_rate": 6.80922015071404e-06, + "loss": 4.8616, + "step": 75195 + }, + { + "epoch": 1.5299479166666665, + "grad_norm": 20.07134246826172, + "learning_rate": 6.808847550168443e-06, + "loss": 4.9973, + "step": 75200 + }, + { + "epoch": 1.5300496419270835, + "grad_norm": 20.126131057739258, + "learning_rate": 6.8084749380648874e-06, + "loss": 4.9229, + "step": 75205 + }, + { + "epoch": 1.5301513671875, + "grad_norm": 19.428558349609375, + "learning_rate": 6.808102314405749e-06, + "loss": 5.0274, + "step": 75210 + }, + { + "epoch": 1.5302530924479165, + "grad_norm": 23.692331314086914, + "learning_rate": 6.807729679193412e-06, + "loss": 4.9337, + "step": 75215 + }, + { + "epoch": 1.5303548177083335, + "grad_norm": 14.908543586730957, + "learning_rate": 6.807357032430255e-06, + "loss": 4.9126, + "step": 75220 + }, + { + "epoch": 1.53045654296875, + "grad_norm": 19.31720542907715, + "learning_rate": 6.806984374118661e-06, + "loss": 4.9689, + "step": 75225 + }, + { + "epoch": 1.5305582682291665, + "grad_norm": 13.00231647491455, + "learning_rate": 6.80661170426101e-06, + "loss": 4.886, + "step": 75230 + }, + { + "epoch": 1.5306599934895835, + "grad_norm": 18.174381256103516, + "learning_rate": 6.8062390228596855e-06, + "loss": 5.038, + "step": 75235 + }, + { + "epoch": 1.53076171875, + "grad_norm": 17.183969497680664, + "learning_rate": 6.8058663299170644e-06, + "loss": 4.7707, + "step": 75240 + }, + { + "epoch": 1.5308634440104165, + "grad_norm": 15.180994987487793, + "learning_rate": 6.805493625435533e-06, + "loss": 4.9844, + "step": 75245 + }, + { + "epoch": 1.5309651692708335, + "grad_norm": 18.06219482421875, + "learning_rate": 6.805120909417469e-06, + "loss": 4.8466, + "step": 75250 + }, + { + "epoch": 1.53106689453125, + "grad_norm": 17.95558738708496, + "learning_rate": 6.8047481818652565e-06, + "loss": 4.9736, + "step": 75255 + }, + { + "epoch": 1.5311686197916665, + "grad_norm": 23.306514739990234, + "learning_rate": 6.804375442781276e-06, + "loss": 5.0786, + "step": 75260 + }, + { + "epoch": 1.5312703450520835, + "grad_norm": 17.23859214782715, + "learning_rate": 6.804002692167908e-06, + "loss": 4.9443, + "step": 75265 + }, + { + "epoch": 1.5313720703125, + "grad_norm": 20.601594924926758, + "learning_rate": 6.803629930027536e-06, + "loss": 5.06, + "step": 75270 + }, + { + "epoch": 1.5314737955729165, + "grad_norm": 18.32597541809082, + "learning_rate": 6.803257156362542e-06, + "loss": 5.2668, + "step": 75275 + }, + { + "epoch": 1.5315755208333335, + "grad_norm": 15.346505165100098, + "learning_rate": 6.802884371175306e-06, + "loss": 4.885, + "step": 75280 + }, + { + "epoch": 1.53167724609375, + "grad_norm": 23.861953735351562, + "learning_rate": 6.802511574468212e-06, + "loss": 4.9878, + "step": 75285 + }, + { + "epoch": 1.5317789713541665, + "grad_norm": 17.375131607055664, + "learning_rate": 6.802138766243642e-06, + "loss": 4.9741, + "step": 75290 + }, + { + "epoch": 1.5318806966145835, + "grad_norm": 23.118593215942383, + "learning_rate": 6.801765946503975e-06, + "loss": 4.6844, + "step": 75295 + }, + { + "epoch": 1.531982421875, + "grad_norm": 18.972915649414062, + "learning_rate": 6.801393115251598e-06, + "loss": 4.8612, + "step": 75300 + }, + { + "epoch": 1.5320841471354165, + "grad_norm": 16.67854118347168, + "learning_rate": 6.801020272488889e-06, + "loss": 4.9597, + "step": 75305 + }, + { + "epoch": 1.5321858723958335, + "grad_norm": 19.10824203491211, + "learning_rate": 6.800647418218233e-06, + "loss": 4.9883, + "step": 75310 + }, + { + "epoch": 1.53228759765625, + "grad_norm": 17.200929641723633, + "learning_rate": 6.800274552442012e-06, + "loss": 4.7315, + "step": 75315 + }, + { + "epoch": 1.5323893229166665, + "grad_norm": 23.371305465698242, + "learning_rate": 6.799901675162608e-06, + "loss": 4.8074, + "step": 75320 + }, + { + "epoch": 1.5324910481770835, + "grad_norm": 19.04886245727539, + "learning_rate": 6.799528786382403e-06, + "loss": 4.9431, + "step": 75325 + }, + { + "epoch": 1.5325927734375, + "grad_norm": 20.066791534423828, + "learning_rate": 6.79915588610378e-06, + "loss": 4.8072, + "step": 75330 + }, + { + "epoch": 1.5326944986979165, + "grad_norm": 17.801509857177734, + "learning_rate": 6.7987829743291234e-06, + "loss": 4.723, + "step": 75335 + }, + { + "epoch": 1.5327962239583335, + "grad_norm": 19.759920120239258, + "learning_rate": 6.798410051060813e-06, + "loss": 4.8484, + "step": 75340 + }, + { + "epoch": 1.53289794921875, + "grad_norm": 14.359951972961426, + "learning_rate": 6.798037116301235e-06, + "loss": 4.8225, + "step": 75345 + }, + { + "epoch": 1.5329996744791665, + "grad_norm": 16.633495330810547, + "learning_rate": 6.797664170052769e-06, + "loss": 5.0535, + "step": 75350 + }, + { + "epoch": 1.5331013997395835, + "grad_norm": 14.220829963684082, + "learning_rate": 6.7972912123178e-06, + "loss": 4.874, + "step": 75355 + }, + { + "epoch": 1.533203125, + "grad_norm": 19.691104888916016, + "learning_rate": 6.796918243098711e-06, + "loss": 5.0994, + "step": 75360 + }, + { + "epoch": 1.5333048502604165, + "grad_norm": 17.090049743652344, + "learning_rate": 6.7965452623978845e-06, + "loss": 4.9189, + "step": 75365 + }, + { + "epoch": 1.5334065755208335, + "grad_norm": 20.007394790649414, + "learning_rate": 6.796172270217704e-06, + "loss": 4.9923, + "step": 75370 + }, + { + "epoch": 1.53350830078125, + "grad_norm": 16.489595413208008, + "learning_rate": 6.7957992665605544e-06, + "loss": 5.0364, + "step": 75375 + }, + { + "epoch": 1.5336100260416665, + "grad_norm": 20.084627151489258, + "learning_rate": 6.795426251428816e-06, + "loss": 4.7434, + "step": 75380 + }, + { + "epoch": 1.5337117513020835, + "grad_norm": 22.091732025146484, + "learning_rate": 6.795053224824875e-06, + "loss": 4.8945, + "step": 75385 + }, + { + "epoch": 1.5338134765625, + "grad_norm": 18.651235580444336, + "learning_rate": 6.794680186751113e-06, + "loss": 5.13, + "step": 75390 + }, + { + "epoch": 1.5339152018229165, + "grad_norm": 15.287589073181152, + "learning_rate": 6.794307137209914e-06, + "loss": 4.75, + "step": 75395 + }, + { + "epoch": 1.5340169270833335, + "grad_norm": 18.16973304748535, + "learning_rate": 6.793934076203662e-06, + "loss": 5.0011, + "step": 75400 + }, + { + "epoch": 1.53411865234375, + "grad_norm": 31.09516143798828, + "learning_rate": 6.793561003734741e-06, + "loss": 4.9639, + "step": 75405 + }, + { + "epoch": 1.5342203776041665, + "grad_norm": 20.680644989013672, + "learning_rate": 6.7931879198055355e-06, + "loss": 4.7893, + "step": 75410 + }, + { + "epoch": 1.5343221028645835, + "grad_norm": 17.230154037475586, + "learning_rate": 6.792814824418427e-06, + "loss": 4.7006, + "step": 75415 + }, + { + "epoch": 1.534423828125, + "grad_norm": 17.41826057434082, + "learning_rate": 6.7924417175758e-06, + "loss": 4.9696, + "step": 75420 + }, + { + "epoch": 1.5345255533854165, + "grad_norm": 14.961630821228027, + "learning_rate": 6.792068599280042e-06, + "loss": 5.0114, + "step": 75425 + }, + { + "epoch": 1.5346272786458335, + "grad_norm": 18.318248748779297, + "learning_rate": 6.7916954695335335e-06, + "loss": 4.6595, + "step": 75430 + }, + { + "epoch": 1.53472900390625, + "grad_norm": 17.14917755126953, + "learning_rate": 6.791322328338658e-06, + "loss": 4.9284, + "step": 75435 + }, + { + "epoch": 1.5348307291666665, + "grad_norm": 18.18790626525879, + "learning_rate": 6.790949175697804e-06, + "loss": 4.9659, + "step": 75440 + }, + { + "epoch": 1.5349324544270835, + "grad_norm": 13.836126327514648, + "learning_rate": 6.790576011613352e-06, + "loss": 4.7673, + "step": 75445 + }, + { + "epoch": 1.5350341796875, + "grad_norm": 19.47946548461914, + "learning_rate": 6.790202836087688e-06, + "loss": 4.9323, + "step": 75450 + }, + { + "epoch": 1.5351359049479165, + "grad_norm": 14.6814546585083, + "learning_rate": 6.789829649123196e-06, + "loss": 4.9074, + "step": 75455 + }, + { + "epoch": 1.5352376302083335, + "grad_norm": 18.661542892456055, + "learning_rate": 6.78945645072226e-06, + "loss": 4.8769, + "step": 75460 + }, + { + "epoch": 1.53533935546875, + "grad_norm": 16.42062759399414, + "learning_rate": 6.789083240887267e-06, + "loss": 4.741, + "step": 75465 + }, + { + "epoch": 1.5354410807291665, + "grad_norm": 15.490864753723145, + "learning_rate": 6.788710019620599e-06, + "loss": 4.7861, + "step": 75470 + }, + { + "epoch": 1.5355428059895835, + "grad_norm": 22.509653091430664, + "learning_rate": 6.788336786924643e-06, + "loss": 4.9536, + "step": 75475 + }, + { + "epoch": 1.53564453125, + "grad_norm": 13.388818740844727, + "learning_rate": 6.787963542801781e-06, + "loss": 4.9334, + "step": 75480 + }, + { + "epoch": 1.5357462565104165, + "grad_norm": 19.83963394165039, + "learning_rate": 6.7875902872544e-06, + "loss": 5.052, + "step": 75485 + }, + { + "epoch": 1.5358479817708335, + "grad_norm": 20.78998374938965, + "learning_rate": 6.787217020284884e-06, + "loss": 5.0078, + "step": 75490 + }, + { + "epoch": 1.53594970703125, + "grad_norm": 14.816184043884277, + "learning_rate": 6.78684374189562e-06, + "loss": 4.996, + "step": 75495 + }, + { + "epoch": 1.5360514322916665, + "grad_norm": 19.766515731811523, + "learning_rate": 6.78647045208899e-06, + "loss": 5.0977, + "step": 75500 + }, + { + "epoch": 1.5361531575520835, + "grad_norm": 17.618511199951172, + "learning_rate": 6.786097150867382e-06, + "loss": 5.2207, + "step": 75505 + }, + { + "epoch": 1.5362548828125, + "grad_norm": 18.02480125427246, + "learning_rate": 6.785723838233182e-06, + "loss": 4.7069, + "step": 75510 + }, + { + "epoch": 1.5363566080729165, + "grad_norm": 18.22066879272461, + "learning_rate": 6.78535051418877e-06, + "loss": 4.832, + "step": 75515 + }, + { + "epoch": 1.5364583333333335, + "grad_norm": 32.4595947265625, + "learning_rate": 6.784977178736537e-06, + "loss": 4.8696, + "step": 75520 + }, + { + "epoch": 1.53656005859375, + "grad_norm": 19.78407859802246, + "learning_rate": 6.7846038318788655e-06, + "loss": 5.0509, + "step": 75525 + }, + { + "epoch": 1.5366617838541665, + "grad_norm": 14.670494079589844, + "learning_rate": 6.784230473618143e-06, + "loss": 4.827, + "step": 75530 + }, + { + "epoch": 1.5367635091145835, + "grad_norm": 15.831177711486816, + "learning_rate": 6.783857103956754e-06, + "loss": 4.991, + "step": 75535 + }, + { + "epoch": 1.536865234375, + "grad_norm": 16.66312026977539, + "learning_rate": 6.783483722897084e-06, + "loss": 4.7425, + "step": 75540 + }, + { + "epoch": 1.5369669596354165, + "grad_norm": 13.695281982421875, + "learning_rate": 6.783110330441519e-06, + "loss": 4.8643, + "step": 75545 + }, + { + "epoch": 1.5370686848958335, + "grad_norm": 26.310840606689453, + "learning_rate": 6.782736926592445e-06, + "loss": 4.6532, + "step": 75550 + }, + { + "epoch": 1.53717041015625, + "grad_norm": 16.09027671813965, + "learning_rate": 6.782363511352248e-06, + "loss": 4.8472, + "step": 75555 + }, + { + "epoch": 1.5372721354166665, + "grad_norm": 17.381038665771484, + "learning_rate": 6.781990084723316e-06, + "loss": 4.6676, + "step": 75560 + }, + { + "epoch": 1.5373738606770835, + "grad_norm": 18.703824996948242, + "learning_rate": 6.781616646708029e-06, + "loss": 5.1826, + "step": 75565 + }, + { + "epoch": 1.5374755859375, + "grad_norm": 14.346389770507812, + "learning_rate": 6.78124319730878e-06, + "loss": 5.0776, + "step": 75570 + }, + { + "epoch": 1.5375773111979165, + "grad_norm": 20.23275375366211, + "learning_rate": 6.7808697365279515e-06, + "loss": 5.0532, + "step": 75575 + }, + { + "epoch": 1.5376790364583335, + "grad_norm": 13.680510520935059, + "learning_rate": 6.780496264367931e-06, + "loss": 5.0554, + "step": 75580 + }, + { + "epoch": 1.53778076171875, + "grad_norm": 18.400619506835938, + "learning_rate": 6.780122780831105e-06, + "loss": 4.7818, + "step": 75585 + }, + { + "epoch": 1.5378824869791665, + "grad_norm": 18.244306564331055, + "learning_rate": 6.779749285919859e-06, + "loss": 4.7667, + "step": 75590 + }, + { + "epoch": 1.5379842122395835, + "grad_norm": 16.50611114501953, + "learning_rate": 6.779375779636581e-06, + "loss": 4.7912, + "step": 75595 + }, + { + "epoch": 1.5380859375, + "grad_norm": 14.296647071838379, + "learning_rate": 6.779002261983656e-06, + "loss": 4.7713, + "step": 75600 + }, + { + "epoch": 1.5381876627604165, + "grad_norm": 15.801284790039062, + "learning_rate": 6.7786287329634706e-06, + "loss": 4.8037, + "step": 75605 + }, + { + "epoch": 1.5382893880208335, + "grad_norm": 18.08022117614746, + "learning_rate": 6.778255192578411e-06, + "loss": 5.1662, + "step": 75610 + }, + { + "epoch": 1.53839111328125, + "grad_norm": 18.586015701293945, + "learning_rate": 6.7778816408308676e-06, + "loss": 4.9981, + "step": 75615 + }, + { + "epoch": 1.5384928385416665, + "grad_norm": 20.916641235351562, + "learning_rate": 6.777508077723224e-06, + "loss": 4.8755, + "step": 75620 + }, + { + "epoch": 1.5385945638020835, + "grad_norm": 20.104936599731445, + "learning_rate": 6.777134503257868e-06, + "loss": 5.1263, + "step": 75625 + }, + { + "epoch": 1.5386962890625, + "grad_norm": 18.147808074951172, + "learning_rate": 6.776760917437186e-06, + "loss": 4.9323, + "step": 75630 + }, + { + "epoch": 1.5387980143229165, + "grad_norm": 19.2969970703125, + "learning_rate": 6.776387320263568e-06, + "loss": 5.4192, + "step": 75635 + }, + { + "epoch": 1.5388997395833335, + "grad_norm": 18.132137298583984, + "learning_rate": 6.7760137117393965e-06, + "loss": 4.742, + "step": 75640 + }, + { + "epoch": 1.53900146484375, + "grad_norm": 37.29319763183594, + "learning_rate": 6.7756400918670626e-06, + "loss": 5.059, + "step": 75645 + }, + { + "epoch": 1.5391031901041665, + "grad_norm": 17.566329956054688, + "learning_rate": 6.775266460648953e-06, + "loss": 5.1534, + "step": 75650 + }, + { + "epoch": 1.5392049153645835, + "grad_norm": 21.011484146118164, + "learning_rate": 6.774892818087452e-06, + "loss": 5.0168, + "step": 75655 + }, + { + "epoch": 1.539306640625, + "grad_norm": 15.594938278198242, + "learning_rate": 6.77451916418495e-06, + "loss": 4.6165, + "step": 75660 + }, + { + "epoch": 1.5394083658854165, + "grad_norm": 22.644804000854492, + "learning_rate": 6.774145498943835e-06, + "loss": 4.9832, + "step": 75665 + }, + { + "epoch": 1.5395100911458335, + "grad_norm": 13.015871047973633, + "learning_rate": 6.7737718223664924e-06, + "loss": 4.8568, + "step": 75670 + }, + { + "epoch": 1.53961181640625, + "grad_norm": 18.837934494018555, + "learning_rate": 6.773398134455311e-06, + "loss": 4.9786, + "step": 75675 + }, + { + "epoch": 1.5397135416666665, + "grad_norm": 15.77898120880127, + "learning_rate": 6.773024435212678e-06, + "loss": 4.8592, + "step": 75680 + }, + { + "epoch": 1.5398152669270835, + "grad_norm": 18.155399322509766, + "learning_rate": 6.772650724640984e-06, + "loss": 4.8385, + "step": 75685 + }, + { + "epoch": 1.5399169921875, + "grad_norm": 27.2852783203125, + "learning_rate": 6.772277002742613e-06, + "loss": 5.2727, + "step": 75690 + }, + { + "epoch": 1.5400187174479165, + "grad_norm": 19.754436492919922, + "learning_rate": 6.771903269519954e-06, + "loss": 5.1651, + "step": 75695 + }, + { + "epoch": 1.5401204427083335, + "grad_norm": 19.716550827026367, + "learning_rate": 6.771529524975395e-06, + "loss": 4.8968, + "step": 75700 + }, + { + "epoch": 1.54022216796875, + "grad_norm": 18.76214027404785, + "learning_rate": 6.771155769111326e-06, + "loss": 4.8241, + "step": 75705 + }, + { + "epoch": 1.5403238932291665, + "grad_norm": 17.274625778198242, + "learning_rate": 6.770782001930134e-06, + "loss": 4.8776, + "step": 75710 + }, + { + "epoch": 1.5404256184895835, + "grad_norm": 17.101381301879883, + "learning_rate": 6.770408223434207e-06, + "loss": 5.1657, + "step": 75715 + }, + { + "epoch": 1.54052734375, + "grad_norm": 16.541950225830078, + "learning_rate": 6.770034433625933e-06, + "loss": 4.9301, + "step": 75720 + }, + { + "epoch": 1.5406290690104165, + "grad_norm": 24.091218948364258, + "learning_rate": 6.7696606325077015e-06, + "loss": 4.8147, + "step": 75725 + }, + { + "epoch": 1.5407307942708335, + "grad_norm": 20.320310592651367, + "learning_rate": 6.7692868200819e-06, + "loss": 4.844, + "step": 75730 + }, + { + "epoch": 1.54083251953125, + "grad_norm": 21.272865295410156, + "learning_rate": 6.768912996350917e-06, + "loss": 5.0057, + "step": 75735 + }, + { + "epoch": 1.5409342447916665, + "grad_norm": 18.118751525878906, + "learning_rate": 6.7685391613171416e-06, + "loss": 5.0746, + "step": 75740 + }, + { + "epoch": 1.5410359700520835, + "grad_norm": 20.17093849182129, + "learning_rate": 6.7681653149829614e-06, + "loss": 5.0224, + "step": 75745 + }, + { + "epoch": 1.5411376953125, + "grad_norm": 22.30824089050293, + "learning_rate": 6.767791457350768e-06, + "loss": 5.0498, + "step": 75750 + }, + { + "epoch": 1.5412394205729165, + "grad_norm": 14.005391120910645, + "learning_rate": 6.767417588422948e-06, + "loss": 5.0687, + "step": 75755 + }, + { + "epoch": 1.5413411458333335, + "grad_norm": 23.751317977905273, + "learning_rate": 6.767043708201889e-06, + "loss": 5.059, + "step": 75760 + }, + { + "epoch": 1.54144287109375, + "grad_norm": 19.526844024658203, + "learning_rate": 6.7666698166899834e-06, + "loss": 5.0089, + "step": 75765 + }, + { + "epoch": 1.5415445963541665, + "grad_norm": 22.54624366760254, + "learning_rate": 6.766295913889618e-06, + "loss": 4.9071, + "step": 75770 + }, + { + "epoch": 1.5416463216145835, + "grad_norm": 20.150371551513672, + "learning_rate": 6.765921999803182e-06, + "loss": 5.2302, + "step": 75775 + }, + { + "epoch": 1.541748046875, + "grad_norm": 18.976905822753906, + "learning_rate": 6.765548074433066e-06, + "loss": 4.9971, + "step": 75780 + }, + { + "epoch": 1.5418497721354165, + "grad_norm": 21.669452667236328, + "learning_rate": 6.765174137781657e-06, + "loss": 4.7712, + "step": 75785 + }, + { + "epoch": 1.5419514973958335, + "grad_norm": 16.44671058654785, + "learning_rate": 6.7648001898513475e-06, + "loss": 4.9523, + "step": 75790 + }, + { + "epoch": 1.54205322265625, + "grad_norm": 14.610180854797363, + "learning_rate": 6.764426230644523e-06, + "loss": 4.7306, + "step": 75795 + }, + { + "epoch": 1.5421549479166665, + "grad_norm": 19.840919494628906, + "learning_rate": 6.764052260163576e-06, + "loss": 4.8391, + "step": 75800 + }, + { + "epoch": 1.5422566731770835, + "grad_norm": 20.533170700073242, + "learning_rate": 6.763678278410896e-06, + "loss": 4.9365, + "step": 75805 + }, + { + "epoch": 1.5423583984375, + "grad_norm": 19.41121482849121, + "learning_rate": 6.76330428538887e-06, + "loss": 4.9553, + "step": 75810 + }, + { + "epoch": 1.5424601236979165, + "grad_norm": 20.227684020996094, + "learning_rate": 6.76293028109989e-06, + "loss": 4.761, + "step": 75815 + }, + { + "epoch": 1.5425618489583335, + "grad_norm": 23.622596740722656, + "learning_rate": 6.762556265546345e-06, + "loss": 5.0815, + "step": 75820 + }, + { + "epoch": 1.54266357421875, + "grad_norm": 19.02068519592285, + "learning_rate": 6.762182238730624e-06, + "loss": 4.8302, + "step": 75825 + }, + { + "epoch": 1.5427652994791665, + "grad_norm": 24.152355194091797, + "learning_rate": 6.76180820065512e-06, + "loss": 5.0304, + "step": 75830 + }, + { + "epoch": 1.5428670247395835, + "grad_norm": 14.569042205810547, + "learning_rate": 6.761434151322218e-06, + "loss": 5.0101, + "step": 75835 + }, + { + "epoch": 1.54296875, + "grad_norm": 20.484601974487305, + "learning_rate": 6.761060090734313e-06, + "loss": 4.9404, + "step": 75840 + }, + { + "epoch": 1.5430704752604165, + "grad_norm": 18.749406814575195, + "learning_rate": 6.760686018893794e-06, + "loss": 4.979, + "step": 75845 + }, + { + "epoch": 1.5431722005208335, + "grad_norm": 19.91989517211914, + "learning_rate": 6.7603119358030475e-06, + "loss": 4.992, + "step": 75850 + }, + { + "epoch": 1.54327392578125, + "grad_norm": 14.530909538269043, + "learning_rate": 6.759937841464469e-06, + "loss": 4.8794, + "step": 75855 + }, + { + "epoch": 1.5433756510416665, + "grad_norm": 17.914470672607422, + "learning_rate": 6.759563735880444e-06, + "loss": 5.0335, + "step": 75860 + }, + { + "epoch": 1.5434773763020835, + "grad_norm": 17.27333641052246, + "learning_rate": 6.759189619053368e-06, + "loss": 4.8382, + "step": 75865 + }, + { + "epoch": 1.5435791015625, + "grad_norm": 16.873233795166016, + "learning_rate": 6.7588154909856275e-06, + "loss": 4.7823, + "step": 75870 + }, + { + "epoch": 1.5436808268229165, + "grad_norm": 25.675479888916016, + "learning_rate": 6.7584413516796145e-06, + "loss": 5.0066, + "step": 75875 + }, + { + "epoch": 1.5437825520833335, + "grad_norm": 18.063636779785156, + "learning_rate": 6.758067201137718e-06, + "loss": 4.7258, + "step": 75880 + }, + { + "epoch": 1.54388427734375, + "grad_norm": 16.31781578063965, + "learning_rate": 6.757693039362332e-06, + "loss": 4.9148, + "step": 75885 + }, + { + "epoch": 1.5439860026041665, + "grad_norm": 23.118698120117188, + "learning_rate": 6.757318866355845e-06, + "loss": 4.8698, + "step": 75890 + }, + { + "epoch": 1.5440877278645835, + "grad_norm": 16.585025787353516, + "learning_rate": 6.756944682120648e-06, + "loss": 4.8678, + "step": 75895 + }, + { + "epoch": 1.544189453125, + "grad_norm": 23.00611114501953, + "learning_rate": 6.756570486659131e-06, + "loss": 4.9648, + "step": 75900 + }, + { + "epoch": 1.5442911783854165, + "grad_norm": 23.18418312072754, + "learning_rate": 6.756196279973688e-06, + "loss": 4.5679, + "step": 75905 + }, + { + "epoch": 1.5443929036458335, + "grad_norm": 22.145326614379883, + "learning_rate": 6.7558220620667085e-06, + "loss": 5.1179, + "step": 75910 + }, + { + "epoch": 1.54449462890625, + "grad_norm": 18.290264129638672, + "learning_rate": 6.755447832940582e-06, + "loss": 4.783, + "step": 75915 + }, + { + "epoch": 1.5445963541666665, + "grad_norm": 18.418853759765625, + "learning_rate": 6.755073592597702e-06, + "loss": 4.7784, + "step": 75920 + }, + { + "epoch": 1.5446980794270835, + "grad_norm": 21.870023727416992, + "learning_rate": 6.7546993410404585e-06, + "loss": 5.0162, + "step": 75925 + }, + { + "epoch": 1.5447998046875, + "grad_norm": 17.923625946044922, + "learning_rate": 6.754325078271243e-06, + "loss": 4.9677, + "step": 75930 + }, + { + "epoch": 1.5449015299479165, + "grad_norm": 14.811519622802734, + "learning_rate": 6.753950804292448e-06, + "loss": 4.8414, + "step": 75935 + }, + { + "epoch": 1.5450032552083335, + "grad_norm": 21.06867790222168, + "learning_rate": 6.753576519106464e-06, + "loss": 4.8121, + "step": 75940 + }, + { + "epoch": 1.54510498046875, + "grad_norm": 16.387828826904297, + "learning_rate": 6.7532022227156825e-06, + "loss": 4.7855, + "step": 75945 + }, + { + "epoch": 1.5452067057291665, + "grad_norm": 16.21847915649414, + "learning_rate": 6.752827915122495e-06, + "loss": 5.0596, + "step": 75950 + }, + { + "epoch": 1.5453084309895835, + "grad_norm": 23.86589241027832, + "learning_rate": 6.752453596329293e-06, + "loss": 4.9682, + "step": 75955 + }, + { + "epoch": 1.54541015625, + "grad_norm": 16.187450408935547, + "learning_rate": 6.75207926633847e-06, + "loss": 4.8529, + "step": 75960 + }, + { + "epoch": 1.5455118815104165, + "grad_norm": 14.86487865447998, + "learning_rate": 6.751704925152416e-06, + "loss": 4.9939, + "step": 75965 + }, + { + "epoch": 1.5456136067708335, + "grad_norm": 15.868630409240723, + "learning_rate": 6.7513305727735245e-06, + "loss": 4.8866, + "step": 75970 + }, + { + "epoch": 1.54571533203125, + "grad_norm": 13.280143737792969, + "learning_rate": 6.750956209204187e-06, + "loss": 5.0581, + "step": 75975 + }, + { + "epoch": 1.5458170572916665, + "grad_norm": 19.901491165161133, + "learning_rate": 6.750581834446793e-06, + "loss": 4.8745, + "step": 75980 + }, + { + "epoch": 1.5459187825520835, + "grad_norm": 15.216782569885254, + "learning_rate": 6.750207448503738e-06, + "loss": 4.8195, + "step": 75985 + }, + { + "epoch": 1.5460205078125, + "grad_norm": 16.785980224609375, + "learning_rate": 6.749833051377412e-06, + "loss": 4.8779, + "step": 75990 + }, + { + "epoch": 1.5461222330729165, + "grad_norm": 23.099607467651367, + "learning_rate": 6.74945864307021e-06, + "loss": 4.9795, + "step": 75995 + }, + { + "epoch": 1.5462239583333335, + "grad_norm": 17.500694274902344, + "learning_rate": 6.7490842235845226e-06, + "loss": 4.9381, + "step": 76000 + }, + { + "epoch": 1.54632568359375, + "grad_norm": 16.813852310180664, + "learning_rate": 6.748709792922741e-06, + "loss": 5.0347, + "step": 76005 + }, + { + "epoch": 1.5464274088541665, + "grad_norm": 19.046558380126953, + "learning_rate": 6.74833535108726e-06, + "loss": 4.9175, + "step": 76010 + }, + { + "epoch": 1.5465291341145835, + "grad_norm": 25.56309700012207, + "learning_rate": 6.747960898080469e-06, + "loss": 4.8631, + "step": 76015 + }, + { + "epoch": 1.546630859375, + "grad_norm": 15.912497520446777, + "learning_rate": 6.747586433904764e-06, + "loss": 4.6966, + "step": 76020 + }, + { + "epoch": 1.5467325846354165, + "grad_norm": 20.232572555541992, + "learning_rate": 6.747211958562537e-06, + "loss": 4.8082, + "step": 76025 + }, + { + "epoch": 1.5468343098958335, + "grad_norm": 22.585121154785156, + "learning_rate": 6.746837472056179e-06, + "loss": 4.8595, + "step": 76030 + }, + { + "epoch": 1.54693603515625, + "grad_norm": 17.51320457458496, + "learning_rate": 6.746462974388086e-06, + "loss": 5.2416, + "step": 76035 + }, + { + "epoch": 1.5470377604166665, + "grad_norm": 15.473162651062012, + "learning_rate": 6.746088465560647e-06, + "loss": 5.051, + "step": 76040 + }, + { + "epoch": 1.5471394856770835, + "grad_norm": 17.224367141723633, + "learning_rate": 6.745713945576257e-06, + "loss": 4.9031, + "step": 76045 + }, + { + "epoch": 1.5472412109375, + "grad_norm": 19.569547653198242, + "learning_rate": 6.745339414437312e-06, + "loss": 4.9495, + "step": 76050 + }, + { + "epoch": 1.5473429361979165, + "grad_norm": 22.189743041992188, + "learning_rate": 6.744964872146199e-06, + "loss": 5.0183, + "step": 76055 + }, + { + "epoch": 1.5474446614583335, + "grad_norm": 21.657306671142578, + "learning_rate": 6.744590318705316e-06, + "loss": 4.7291, + "step": 76060 + }, + { + "epoch": 1.54754638671875, + "grad_norm": 13.746254920959473, + "learning_rate": 6.744215754117055e-06, + "loss": 5.117, + "step": 76065 + }, + { + "epoch": 1.5476481119791665, + "grad_norm": 17.445072174072266, + "learning_rate": 6.743841178383807e-06, + "loss": 4.7919, + "step": 76070 + }, + { + "epoch": 1.5477498372395835, + "grad_norm": 14.770665168762207, + "learning_rate": 6.74346659150797e-06, + "loss": 4.9579, + "step": 76075 + }, + { + "epoch": 1.5478515625, + "grad_norm": 15.773245811462402, + "learning_rate": 6.743091993491933e-06, + "loss": 4.5993, + "step": 76080 + }, + { + "epoch": 1.5479532877604165, + "grad_norm": 17.741104125976562, + "learning_rate": 6.7427173843380926e-06, + "loss": 5.1169, + "step": 76085 + }, + { + "epoch": 1.5480550130208335, + "grad_norm": 16.211153030395508, + "learning_rate": 6.742342764048841e-06, + "loss": 4.8084, + "step": 76090 + }, + { + "epoch": 1.54815673828125, + "grad_norm": 14.504094123840332, + "learning_rate": 6.7419681326265715e-06, + "loss": 5.0485, + "step": 76095 + }, + { + "epoch": 1.5482584635416665, + "grad_norm": 25.14191246032715, + "learning_rate": 6.741593490073679e-06, + "loss": 5.0083, + "step": 76100 + }, + { + "epoch": 1.5483601888020835, + "grad_norm": 16.224035263061523, + "learning_rate": 6.741218836392558e-06, + "loss": 4.9207, + "step": 76105 + }, + { + "epoch": 1.5484619140625, + "grad_norm": 15.998515129089355, + "learning_rate": 6.7408441715856e-06, + "loss": 5.1846, + "step": 76110 + }, + { + "epoch": 1.5485636393229165, + "grad_norm": 14.371600151062012, + "learning_rate": 6.740469495655202e-06, + "loss": 4.8533, + "step": 76115 + }, + { + "epoch": 1.5486653645833335, + "grad_norm": 16.96851921081543, + "learning_rate": 6.740094808603754e-06, + "loss": 4.9111, + "step": 76120 + }, + { + "epoch": 1.54876708984375, + "grad_norm": 18.18484115600586, + "learning_rate": 6.739720110433654e-06, + "loss": 4.8224, + "step": 76125 + }, + { + "epoch": 1.5488688151041665, + "grad_norm": 15.380146026611328, + "learning_rate": 6.739345401147295e-06, + "loss": 4.9732, + "step": 76130 + }, + { + "epoch": 1.5489705403645835, + "grad_norm": 20.815471649169922, + "learning_rate": 6.73897068074707e-06, + "loss": 4.6201, + "step": 76135 + }, + { + "epoch": 1.549072265625, + "grad_norm": 25.163970947265625, + "learning_rate": 6.738595949235376e-06, + "loss": 5.0285, + "step": 76140 + }, + { + "epoch": 1.5491739908854165, + "grad_norm": 16.106128692626953, + "learning_rate": 6.738221206614604e-06, + "loss": 5.1208, + "step": 76145 + }, + { + "epoch": 1.5492757161458335, + "grad_norm": 17.007034301757812, + "learning_rate": 6.7378464528871514e-06, + "loss": 4.6999, + "step": 76150 + }, + { + "epoch": 1.54937744140625, + "grad_norm": 17.076828002929688, + "learning_rate": 6.7374716880554104e-06, + "loss": 4.8077, + "step": 76155 + }, + { + "epoch": 1.5494791666666665, + "grad_norm": 15.316368103027344, + "learning_rate": 6.737096912121778e-06, + "loss": 4.6809, + "step": 76160 + }, + { + "epoch": 1.5495808919270835, + "grad_norm": 13.90898323059082, + "learning_rate": 6.736722125088647e-06, + "loss": 4.8995, + "step": 76165 + }, + { + "epoch": 1.5496826171875, + "grad_norm": 22.452476501464844, + "learning_rate": 6.7363473269584145e-06, + "loss": 4.7981, + "step": 76170 + }, + { + "epoch": 1.5497843424479165, + "grad_norm": 17.983177185058594, + "learning_rate": 6.735972517733473e-06, + "loss": 4.7477, + "step": 76175 + }, + { + "epoch": 1.5498860677083335, + "grad_norm": 17.734533309936523, + "learning_rate": 6.7355976974162175e-06, + "loss": 4.8851, + "step": 76180 + }, + { + "epoch": 1.54998779296875, + "grad_norm": 18.163633346557617, + "learning_rate": 6.735222866009044e-06, + "loss": 5.0014, + "step": 76185 + }, + { + "epoch": 1.5500895182291665, + "grad_norm": 23.09613037109375, + "learning_rate": 6.734848023514347e-06, + "loss": 5.211, + "step": 76190 + }, + { + "epoch": 1.5501912434895835, + "grad_norm": 13.785858154296875, + "learning_rate": 6.734473169934522e-06, + "loss": 4.8766, + "step": 76195 + }, + { + "epoch": 1.55029296875, + "grad_norm": 17.98517417907715, + "learning_rate": 6.734098305271964e-06, + "loss": 4.9378, + "step": 76200 + }, + { + "epoch": 1.5503946940104165, + "grad_norm": 20.53366470336914, + "learning_rate": 6.733723429529068e-06, + "loss": 5.0341, + "step": 76205 + }, + { + "epoch": 1.5504964192708335, + "grad_norm": 15.54620361328125, + "learning_rate": 6.733348542708231e-06, + "loss": 5.0859, + "step": 76210 + }, + { + "epoch": 1.55059814453125, + "grad_norm": 17.890499114990234, + "learning_rate": 6.732973644811847e-06, + "loss": 4.7355, + "step": 76215 + }, + { + "epoch": 1.5506998697916665, + "grad_norm": 15.643623352050781, + "learning_rate": 6.732598735842311e-06, + "loss": 4.7422, + "step": 76220 + }, + { + "epoch": 1.5508015950520835, + "grad_norm": 18.26169204711914, + "learning_rate": 6.732223815802017e-06, + "loss": 4.6663, + "step": 76225 + }, + { + "epoch": 1.5509033203125, + "grad_norm": 33.6732177734375, + "learning_rate": 6.731848884693366e-06, + "loss": 5.1397, + "step": 76230 + }, + { + "epoch": 1.5510050455729165, + "grad_norm": 14.881658554077148, + "learning_rate": 6.731473942518748e-06, + "loss": 4.9379, + "step": 76235 + }, + { + "epoch": 1.5511067708333335, + "grad_norm": 18.097143173217773, + "learning_rate": 6.731098989280562e-06, + "loss": 4.8666, + "step": 76240 + }, + { + "epoch": 1.55120849609375, + "grad_norm": 28.520347595214844, + "learning_rate": 6.7307240249812035e-06, + "loss": 4.8144, + "step": 76245 + }, + { + "epoch": 1.5513102213541665, + "grad_norm": 16.07003402709961, + "learning_rate": 6.730349049623067e-06, + "loss": 4.8103, + "step": 76250 + }, + { + "epoch": 1.5514119466145835, + "grad_norm": 16.45796012878418, + "learning_rate": 6.72997406320855e-06, + "loss": 5.1394, + "step": 76255 + }, + { + "epoch": 1.551513671875, + "grad_norm": 15.16385555267334, + "learning_rate": 6.729599065740048e-06, + "loss": 5.0409, + "step": 76260 + }, + { + "epoch": 1.5516153971354165, + "grad_norm": 19.804319381713867, + "learning_rate": 6.729224057219956e-06, + "loss": 5.0801, + "step": 76265 + }, + { + "epoch": 1.5517171223958335, + "grad_norm": 16.732215881347656, + "learning_rate": 6.7288490376506735e-06, + "loss": 4.8264, + "step": 76270 + }, + { + "epoch": 1.55181884765625, + "grad_norm": 20.346393585205078, + "learning_rate": 6.7284740070345914e-06, + "loss": 4.4601, + "step": 76275 + }, + { + "epoch": 1.5519205729166665, + "grad_norm": 18.063772201538086, + "learning_rate": 6.72809896537411e-06, + "loss": 4.9156, + "step": 76280 + }, + { + "epoch": 1.5520222981770835, + "grad_norm": 13.994226455688477, + "learning_rate": 6.7277239126716255e-06, + "loss": 4.8806, + "step": 76285 + }, + { + "epoch": 1.5521240234375, + "grad_norm": 14.83841609954834, + "learning_rate": 6.727348848929533e-06, + "loss": 4.6429, + "step": 76290 + }, + { + "epoch": 1.5522257486979165, + "grad_norm": 21.8773136138916, + "learning_rate": 6.726973774150231e-06, + "loss": 5.1457, + "step": 76295 + }, + { + "epoch": 1.5523274739583335, + "grad_norm": 19.127504348754883, + "learning_rate": 6.726598688336112e-06, + "loss": 5.1162, + "step": 76300 + }, + { + "epoch": 1.55242919921875, + "grad_norm": 14.154664993286133, + "learning_rate": 6.726223591489579e-06, + "loss": 4.7981, + "step": 76305 + }, + { + "epoch": 1.5525309244791665, + "grad_norm": 17.239593505859375, + "learning_rate": 6.725848483613023e-06, + "loss": 4.8147, + "step": 76310 + }, + { + "epoch": 1.5526326497395835, + "grad_norm": 25.90461540222168, + "learning_rate": 6.7254733647088435e-06, + "loss": 4.81, + "step": 76315 + }, + { + "epoch": 1.552734375, + "grad_norm": 15.505881309509277, + "learning_rate": 6.725098234779438e-06, + "loss": 4.7029, + "step": 76320 + }, + { + "epoch": 1.5528361002604165, + "grad_norm": 16.51976203918457, + "learning_rate": 6.724723093827201e-06, + "loss": 4.9115, + "step": 76325 + }, + { + "epoch": 1.5529378255208335, + "grad_norm": 18.871349334716797, + "learning_rate": 6.724347941854532e-06, + "loss": 4.8407, + "step": 76330 + }, + { + "epoch": 1.55303955078125, + "grad_norm": 14.91750431060791, + "learning_rate": 6.723972778863827e-06, + "loss": 5.0807, + "step": 76335 + }, + { + "epoch": 1.5531412760416665, + "grad_norm": 18.746355056762695, + "learning_rate": 6.723597604857482e-06, + "loss": 4.9319, + "step": 76340 + }, + { + "epoch": 1.5532430013020835, + "grad_norm": 13.411162376403809, + "learning_rate": 6.7232224198378955e-06, + "loss": 4.9186, + "step": 76345 + }, + { + "epoch": 1.5533447265625, + "grad_norm": 19.48465347290039, + "learning_rate": 6.722847223807467e-06, + "loss": 4.7663, + "step": 76350 + }, + { + "epoch": 1.5534464518229165, + "grad_norm": 20.022336959838867, + "learning_rate": 6.722472016768589e-06, + "loss": 4.924, + "step": 76355 + }, + { + "epoch": 1.5535481770833335, + "grad_norm": 18.796812057495117, + "learning_rate": 6.722096798723663e-06, + "loss": 5.0216, + "step": 76360 + }, + { + "epoch": 1.55364990234375, + "grad_norm": 15.946523666381836, + "learning_rate": 6.7217215696750846e-06, + "loss": 5.0436, + "step": 76365 + }, + { + "epoch": 1.5537516276041665, + "grad_norm": 13.177233695983887, + "learning_rate": 6.721346329625252e-06, + "loss": 4.9865, + "step": 76370 + }, + { + "epoch": 1.5538533528645835, + "grad_norm": 19.579435348510742, + "learning_rate": 6.7209710785765635e-06, + "loss": 4.9736, + "step": 76375 + }, + { + "epoch": 1.553955078125, + "grad_norm": 18.73345375061035, + "learning_rate": 6.720595816531415e-06, + "loss": 5.0096, + "step": 76380 + }, + { + "epoch": 1.5540568033854165, + "grad_norm": 20.164369583129883, + "learning_rate": 6.720220543492206e-06, + "loss": 4.7862, + "step": 76385 + }, + { + "epoch": 1.5541585286458335, + "grad_norm": 14.44796371459961, + "learning_rate": 6.719845259461335e-06, + "loss": 5.0584, + "step": 76390 + }, + { + "epoch": 1.55426025390625, + "grad_norm": 20.13172721862793, + "learning_rate": 6.719469964441196e-06, + "loss": 4.8724, + "step": 76395 + }, + { + "epoch": 1.5543619791666665, + "grad_norm": 18.369827270507812, + "learning_rate": 6.719094658434192e-06, + "loss": 5.0413, + "step": 76400 + }, + { + "epoch": 1.5544637044270835, + "grad_norm": 28.504270553588867, + "learning_rate": 6.718719341442717e-06, + "loss": 4.7676, + "step": 76405 + }, + { + "epoch": 1.5545654296875, + "grad_norm": 26.30895233154297, + "learning_rate": 6.718344013469173e-06, + "loss": 4.9221, + "step": 76410 + }, + { + "epoch": 1.5546671549479165, + "grad_norm": 16.903064727783203, + "learning_rate": 6.717968674515954e-06, + "loss": 4.9895, + "step": 76415 + }, + { + "epoch": 1.5547688802083335, + "grad_norm": 19.68203353881836, + "learning_rate": 6.717593324585463e-06, + "loss": 4.8524, + "step": 76420 + }, + { + "epoch": 1.55487060546875, + "grad_norm": 19.152751922607422, + "learning_rate": 6.717217963680094e-06, + "loss": 4.9394, + "step": 76425 + }, + { + "epoch": 1.5549723307291665, + "grad_norm": 15.055410385131836, + "learning_rate": 6.716842591802248e-06, + "loss": 5.1144, + "step": 76430 + }, + { + "epoch": 1.5550740559895835, + "grad_norm": 29.60721778869629, + "learning_rate": 6.716467208954323e-06, + "loss": 4.8821, + "step": 76435 + }, + { + "epoch": 1.55517578125, + "grad_norm": 18.61847496032715, + "learning_rate": 6.716091815138718e-06, + "loss": 4.991, + "step": 76440 + }, + { + "epoch": 1.5552775065104165, + "grad_norm": 17.944047927856445, + "learning_rate": 6.71571641035783e-06, + "loss": 5.0878, + "step": 76445 + }, + { + "epoch": 1.5553792317708335, + "grad_norm": 22.797164916992188, + "learning_rate": 6.715340994614059e-06, + "loss": 4.9612, + "step": 76450 + }, + { + "epoch": 1.55548095703125, + "grad_norm": 19.70838737487793, + "learning_rate": 6.714965567909803e-06, + "loss": 4.802, + "step": 76455 + }, + { + "epoch": 1.5555826822916665, + "grad_norm": 19.012958526611328, + "learning_rate": 6.714590130247463e-06, + "loss": 4.7556, + "step": 76460 + }, + { + "epoch": 1.5556844075520835, + "grad_norm": 19.088157653808594, + "learning_rate": 6.714214681629436e-06, + "loss": 4.8669, + "step": 76465 + }, + { + "epoch": 1.5557861328125, + "grad_norm": 23.56322479248047, + "learning_rate": 6.71383922205812e-06, + "loss": 4.902, + "step": 76470 + }, + { + "epoch": 1.5558878580729165, + "grad_norm": 14.951395988464355, + "learning_rate": 6.7134637515359166e-06, + "loss": 4.9355, + "step": 76475 + }, + { + "epoch": 1.5559895833333335, + "grad_norm": 19.752870559692383, + "learning_rate": 6.713088270065224e-06, + "loss": 4.881, + "step": 76480 + }, + { + "epoch": 1.55609130859375, + "grad_norm": 20.5246639251709, + "learning_rate": 6.71271277764844e-06, + "loss": 4.9175, + "step": 76485 + }, + { + "epoch": 1.5561930338541665, + "grad_norm": 18.794960021972656, + "learning_rate": 6.712337274287965e-06, + "loss": 4.932, + "step": 76490 + }, + { + "epoch": 1.5562947591145835, + "grad_norm": 19.68052101135254, + "learning_rate": 6.711961759986199e-06, + "loss": 4.9869, + "step": 76495 + }, + { + "epoch": 1.556396484375, + "grad_norm": 19.22032928466797, + "learning_rate": 6.7115862347455415e-06, + "loss": 5.339, + "step": 76500 + }, + { + "epoch": 1.5564982096354165, + "grad_norm": 17.642501831054688, + "learning_rate": 6.711210698568391e-06, + "loss": 4.6538, + "step": 76505 + }, + { + "epoch": 1.5565999348958335, + "grad_norm": 17.07622528076172, + "learning_rate": 6.7108351514571466e-06, + "loss": 4.6971, + "step": 76510 + }, + { + "epoch": 1.55670166015625, + "grad_norm": 16.691225051879883, + "learning_rate": 6.710459593414209e-06, + "loss": 4.8875, + "step": 76515 + }, + { + "epoch": 1.5568033854166665, + "grad_norm": 17.326461791992188, + "learning_rate": 6.7100840244419776e-06, + "loss": 4.8966, + "step": 76520 + }, + { + "epoch": 1.5569051106770835, + "grad_norm": 15.965960502624512, + "learning_rate": 6.7097084445428524e-06, + "loss": 4.9402, + "step": 76525 + }, + { + "epoch": 1.5570068359375, + "grad_norm": 17.883331298828125, + "learning_rate": 6.7093328537192325e-06, + "loss": 5.0768, + "step": 76530 + }, + { + "epoch": 1.5571085611979165, + "grad_norm": 20.537750244140625, + "learning_rate": 6.708957251973518e-06, + "loss": 4.9258, + "step": 76535 + }, + { + "epoch": 1.5572102864583335, + "grad_norm": 18.803632736206055, + "learning_rate": 6.70858163930811e-06, + "loss": 4.8451, + "step": 76540 + }, + { + "epoch": 1.55731201171875, + "grad_norm": 20.114896774291992, + "learning_rate": 6.708206015725407e-06, + "loss": 4.832, + "step": 76545 + }, + { + "epoch": 1.5574137369791665, + "grad_norm": 19.357379913330078, + "learning_rate": 6.707830381227812e-06, + "loss": 4.8065, + "step": 76550 + }, + { + "epoch": 1.5575154622395835, + "grad_norm": 20.497426986694336, + "learning_rate": 6.707454735817719e-06, + "loss": 5.1578, + "step": 76555 + }, + { + "epoch": 1.5576171875, + "grad_norm": 16.716535568237305, + "learning_rate": 6.707079079497535e-06, + "loss": 4.9343, + "step": 76560 + }, + { + "epoch": 1.5577189127604165, + "grad_norm": 19.056360244750977, + "learning_rate": 6.706703412269656e-06, + "loss": 4.6811, + "step": 76565 + }, + { + "epoch": 1.5578206380208335, + "grad_norm": 13.475566864013672, + "learning_rate": 6.706327734136486e-06, + "loss": 4.9955, + "step": 76570 + }, + { + "epoch": 1.55792236328125, + "grad_norm": 25.154993057250977, + "learning_rate": 6.705952045100421e-06, + "loss": 5.0478, + "step": 76575 + }, + { + "epoch": 1.5580240885416665, + "grad_norm": 25.691726684570312, + "learning_rate": 6.705576345163867e-06, + "loss": 4.8464, + "step": 76580 + }, + { + "epoch": 1.5581258138020835, + "grad_norm": 19.412485122680664, + "learning_rate": 6.705200634329219e-06, + "loss": 4.8713, + "step": 76585 + }, + { + "epoch": 1.5582275390625, + "grad_norm": 21.414657592773438, + "learning_rate": 6.704824912598881e-06, + "loss": 4.5687, + "step": 76590 + }, + { + "epoch": 1.5583292643229165, + "grad_norm": 16.058815002441406, + "learning_rate": 6.704449179975255e-06, + "loss": 5.0049, + "step": 76595 + }, + { + "epoch": 1.5584309895833335, + "grad_norm": 15.055623054504395, + "learning_rate": 6.7040734364607365e-06, + "loss": 4.9872, + "step": 76600 + }, + { + "epoch": 1.55853271484375, + "grad_norm": 21.824914932250977, + "learning_rate": 6.703697682057732e-06, + "loss": 4.9416, + "step": 76605 + }, + { + "epoch": 1.5586344401041665, + "grad_norm": 22.65589714050293, + "learning_rate": 6.703321916768639e-06, + "loss": 5.1395, + "step": 76610 + }, + { + "epoch": 1.5587361653645835, + "grad_norm": 14.747269630432129, + "learning_rate": 6.70294614059586e-06, + "loss": 5.0444, + "step": 76615 + }, + { + "epoch": 1.558837890625, + "grad_norm": 19.978458404541016, + "learning_rate": 6.7025703535417965e-06, + "loss": 5.0435, + "step": 76620 + }, + { + "epoch": 1.5589396158854165, + "grad_norm": 18.3420467376709, + "learning_rate": 6.702194555608847e-06, + "loss": 4.8185, + "step": 76625 + }, + { + "epoch": 1.5590413411458335, + "grad_norm": 18.874109268188477, + "learning_rate": 6.701818746799415e-06, + "loss": 5.0542, + "step": 76630 + }, + { + "epoch": 1.55914306640625, + "grad_norm": 15.5546875, + "learning_rate": 6.701442927115904e-06, + "loss": 4.9845, + "step": 76635 + }, + { + "epoch": 1.5592447916666665, + "grad_norm": 15.679448127746582, + "learning_rate": 6.7010670965607106e-06, + "loss": 4.9763, + "step": 76640 + }, + { + "epoch": 1.5593465169270835, + "grad_norm": 18.10806655883789, + "learning_rate": 6.700691255136239e-06, + "loss": 4.8609, + "step": 76645 + }, + { + "epoch": 1.5594482421875, + "grad_norm": 20.529258728027344, + "learning_rate": 6.700315402844889e-06, + "loss": 4.9226, + "step": 76650 + }, + { + "epoch": 1.5595499674479165, + "grad_norm": 18.985990524291992, + "learning_rate": 6.699939539689065e-06, + "loss": 5.0806, + "step": 76655 + }, + { + "epoch": 1.5596516927083335, + "grad_norm": 22.296648025512695, + "learning_rate": 6.699563665671167e-06, + "loss": 4.8182, + "step": 76660 + }, + { + "epoch": 1.55975341796875, + "grad_norm": 16.883657455444336, + "learning_rate": 6.699187780793596e-06, + "loss": 4.9065, + "step": 76665 + }, + { + "epoch": 1.5598551432291665, + "grad_norm": 19.168312072753906, + "learning_rate": 6.698811885058754e-06, + "loss": 4.9581, + "step": 76670 + }, + { + "epoch": 1.5599568684895835, + "grad_norm": 15.501960754394531, + "learning_rate": 6.698435978469044e-06, + "loss": 4.7881, + "step": 76675 + }, + { + "epoch": 1.56005859375, + "grad_norm": 17.8190975189209, + "learning_rate": 6.698060061026867e-06, + "loss": 4.5447, + "step": 76680 + }, + { + "epoch": 1.5601603190104165, + "grad_norm": 16.646574020385742, + "learning_rate": 6.697684132734626e-06, + "loss": 4.8627, + "step": 76685 + }, + { + "epoch": 1.5602620442708335, + "grad_norm": 20.42961883544922, + "learning_rate": 6.69730819359472e-06, + "loss": 4.8558, + "step": 76690 + }, + { + "epoch": 1.56036376953125, + "grad_norm": 16.297542572021484, + "learning_rate": 6.696932243609555e-06, + "loss": 4.7461, + "step": 76695 + }, + { + "epoch": 1.5604654947916665, + "grad_norm": 18.094282150268555, + "learning_rate": 6.696556282781532e-06, + "loss": 4.9303, + "step": 76700 + }, + { + "epoch": 1.5605672200520835, + "grad_norm": 26.73900032043457, + "learning_rate": 6.6961803111130516e-06, + "loss": 5.184, + "step": 76705 + }, + { + "epoch": 1.5606689453125, + "grad_norm": 18.140865325927734, + "learning_rate": 6.6958043286065196e-06, + "loss": 5.03, + "step": 76710 + }, + { + "epoch": 1.5607706705729165, + "grad_norm": 24.829618453979492, + "learning_rate": 6.695428335264335e-06, + "loss": 5.3421, + "step": 76715 + }, + { + "epoch": 1.5608723958333335, + "grad_norm": 19.507436752319336, + "learning_rate": 6.695052331088902e-06, + "loss": 4.9911, + "step": 76720 + }, + { + "epoch": 1.56097412109375, + "grad_norm": 22.31227684020996, + "learning_rate": 6.694676316082623e-06, + "loss": 5.3476, + "step": 76725 + }, + { + "epoch": 1.5610758463541665, + "grad_norm": 16.97702980041504, + "learning_rate": 6.694300290247899e-06, + "loss": 4.9265, + "step": 76730 + }, + { + "epoch": 1.5611775716145835, + "grad_norm": 16.074010848999023, + "learning_rate": 6.6939242535871355e-06, + "loss": 4.7608, + "step": 76735 + }, + { + "epoch": 1.561279296875, + "grad_norm": 14.137917518615723, + "learning_rate": 6.693548206102732e-06, + "loss": 4.9212, + "step": 76740 + }, + { + "epoch": 1.5613810221354165, + "grad_norm": 15.392752647399902, + "learning_rate": 6.693172147797093e-06, + "loss": 4.8769, + "step": 76745 + }, + { + "epoch": 1.5614827473958335, + "grad_norm": 49.76026916503906, + "learning_rate": 6.692796078672624e-06, + "loss": 4.7142, + "step": 76750 + }, + { + "epoch": 1.56158447265625, + "grad_norm": 18.22812271118164, + "learning_rate": 6.6924199987317225e-06, + "loss": 4.7415, + "step": 76755 + }, + { + "epoch": 1.5616861979166665, + "grad_norm": 14.680009841918945, + "learning_rate": 6.692043907976797e-06, + "loss": 4.8355, + "step": 76760 + }, + { + "epoch": 1.5617879231770835, + "grad_norm": 25.939002990722656, + "learning_rate": 6.691667806410247e-06, + "loss": 5.005, + "step": 76765 + }, + { + "epoch": 1.5618896484375, + "grad_norm": 20.936119079589844, + "learning_rate": 6.691291694034476e-06, + "loss": 5.1099, + "step": 76770 + }, + { + "epoch": 1.5619913736979165, + "grad_norm": 19.030536651611328, + "learning_rate": 6.690915570851889e-06, + "loss": 4.9423, + "step": 76775 + }, + { + "epoch": 1.5620930989583335, + "grad_norm": 14.66305923461914, + "learning_rate": 6.690539436864887e-06, + "loss": 5.011, + "step": 76780 + }, + { + "epoch": 1.56219482421875, + "grad_norm": 18.131877899169922, + "learning_rate": 6.6901632920758765e-06, + "loss": 4.9512, + "step": 76785 + }, + { + "epoch": 1.5622965494791665, + "grad_norm": 19.434247970581055, + "learning_rate": 6.689787136487258e-06, + "loss": 5.3473, + "step": 76790 + }, + { + "epoch": 1.5623982747395835, + "grad_norm": 17.743741989135742, + "learning_rate": 6.689410970101435e-06, + "loss": 5.1248, + "step": 76795 + }, + { + "epoch": 1.5625, + "grad_norm": 13.668832778930664, + "learning_rate": 6.689034792920814e-06, + "loss": 4.7177, + "step": 76800 + }, + { + "epoch": 1.5626017252604165, + "grad_norm": 16.455116271972656, + "learning_rate": 6.688658604947796e-06, + "loss": 4.971, + "step": 76805 + }, + { + "epoch": 1.5627034505208335, + "grad_norm": 25.02837562561035, + "learning_rate": 6.688282406184786e-06, + "loss": 4.8797, + "step": 76810 + }, + { + "epoch": 1.56280517578125, + "grad_norm": 21.941463470458984, + "learning_rate": 6.687906196634187e-06, + "loss": 5.0432, + "step": 76815 + }, + { + "epoch": 1.5629069010416665, + "grad_norm": 14.787580490112305, + "learning_rate": 6.687529976298404e-06, + "loss": 4.8201, + "step": 76820 + }, + { + "epoch": 1.5630086263020835, + "grad_norm": 15.51940631866455, + "learning_rate": 6.687153745179838e-06, + "loss": 5.1685, + "step": 76825 + }, + { + "epoch": 1.5631103515625, + "grad_norm": 19.065317153930664, + "learning_rate": 6.686777503280898e-06, + "loss": 4.703, + "step": 76830 + }, + { + "epoch": 1.5632120768229165, + "grad_norm": 17.758129119873047, + "learning_rate": 6.686401250603983e-06, + "loss": 5.0746, + "step": 76835 + }, + { + "epoch": 1.5633138020833335, + "grad_norm": 17.384679794311523, + "learning_rate": 6.6860249871515004e-06, + "loss": 4.9958, + "step": 76840 + }, + { + "epoch": 1.56341552734375, + "grad_norm": 20.382604598999023, + "learning_rate": 6.685648712925853e-06, + "loss": 4.865, + "step": 76845 + }, + { + "epoch": 1.5635172526041665, + "grad_norm": 23.404104232788086, + "learning_rate": 6.685272427929445e-06, + "loss": 5.107, + "step": 76850 + }, + { + "epoch": 1.5636189778645835, + "grad_norm": 18.940250396728516, + "learning_rate": 6.684896132164684e-06, + "loss": 4.8044, + "step": 76855 + }, + { + "epoch": 1.563720703125, + "grad_norm": 18.246566772460938, + "learning_rate": 6.684519825633968e-06, + "loss": 4.8754, + "step": 76860 + }, + { + "epoch": 1.5638224283854165, + "grad_norm": 14.902541160583496, + "learning_rate": 6.684143508339707e-06, + "loss": 4.6848, + "step": 76865 + }, + { + "epoch": 1.5639241536458335, + "grad_norm": 19.07648468017578, + "learning_rate": 6.683767180284302e-06, + "loss": 5.0537, + "step": 76870 + }, + { + "epoch": 1.56402587890625, + "grad_norm": 16.93868064880371, + "learning_rate": 6.683390841470161e-06, + "loss": 4.9815, + "step": 76875 + }, + { + "epoch": 1.5641276041666665, + "grad_norm": 21.118663787841797, + "learning_rate": 6.683014491899687e-06, + "loss": 4.8042, + "step": 76880 + }, + { + "epoch": 1.5642293294270835, + "grad_norm": 21.049781799316406, + "learning_rate": 6.682638131575282e-06, + "loss": 4.8234, + "step": 76885 + }, + { + "epoch": 1.5643310546875, + "grad_norm": 17.39873695373535, + "learning_rate": 6.682261760499355e-06, + "loss": 4.6757, + "step": 76890 + }, + { + "epoch": 1.5644327799479165, + "grad_norm": 15.499115943908691, + "learning_rate": 6.68188537867431e-06, + "loss": 4.8801, + "step": 76895 + }, + { + "epoch": 1.5645345052083335, + "grad_norm": 16.915822982788086, + "learning_rate": 6.681508986102551e-06, + "loss": 5.0154, + "step": 76900 + }, + { + "epoch": 1.56463623046875, + "grad_norm": 16.387117385864258, + "learning_rate": 6.681132582786484e-06, + "loss": 4.7307, + "step": 76905 + }, + { + "epoch": 1.5647379557291665, + "grad_norm": 20.514822006225586, + "learning_rate": 6.680756168728512e-06, + "loss": 4.8464, + "step": 76910 + }, + { + "epoch": 1.5648396809895835, + "grad_norm": 13.837306022644043, + "learning_rate": 6.680379743931043e-06, + "loss": 4.842, + "step": 76915 + }, + { + "epoch": 1.56494140625, + "grad_norm": 16.25389862060547, + "learning_rate": 6.68000330839648e-06, + "loss": 4.9236, + "step": 76920 + }, + { + "epoch": 1.5650431315104165, + "grad_norm": 25.256772994995117, + "learning_rate": 6.679626862127228e-06, + "loss": 4.7533, + "step": 76925 + }, + { + "epoch": 1.5651448567708335, + "grad_norm": 16.936243057250977, + "learning_rate": 6.6792504051256964e-06, + "loss": 4.9479, + "step": 76930 + }, + { + "epoch": 1.56524658203125, + "grad_norm": 17.759658813476562, + "learning_rate": 6.678873937394286e-06, + "loss": 4.8611, + "step": 76935 + }, + { + "epoch": 1.5653483072916665, + "grad_norm": 16.595430374145508, + "learning_rate": 6.678497458935404e-06, + "loss": 4.8859, + "step": 76940 + }, + { + "epoch": 1.5654500325520835, + "grad_norm": 20.444995880126953, + "learning_rate": 6.678120969751456e-06, + "loss": 4.8183, + "step": 76945 + }, + { + "epoch": 1.5655517578125, + "grad_norm": 23.99357795715332, + "learning_rate": 6.677744469844849e-06, + "loss": 4.8461, + "step": 76950 + }, + { + "epoch": 1.5656534830729165, + "grad_norm": 15.482669830322266, + "learning_rate": 6.677367959217987e-06, + "loss": 4.7244, + "step": 76955 + }, + { + "epoch": 1.5657552083333335, + "grad_norm": 17.945602416992188, + "learning_rate": 6.676991437873275e-06, + "loss": 4.8853, + "step": 76960 + }, + { + "epoch": 1.56585693359375, + "grad_norm": 15.07337760925293, + "learning_rate": 6.67661490581312e-06, + "loss": 4.8728, + "step": 76965 + }, + { + "epoch": 1.5659586588541665, + "grad_norm": 13.92265510559082, + "learning_rate": 6.67623836303993e-06, + "loss": 4.6765, + "step": 76970 + }, + { + "epoch": 1.5660603841145835, + "grad_norm": 24.654909133911133, + "learning_rate": 6.6758618095561065e-06, + "loss": 4.8423, + "step": 76975 + }, + { + "epoch": 1.566162109375, + "grad_norm": 21.516799926757812, + "learning_rate": 6.6754852453640594e-06, + "loss": 4.7964, + "step": 76980 + }, + { + "epoch": 1.5662638346354165, + "grad_norm": 18.040958404541016, + "learning_rate": 6.675108670466195e-06, + "loss": 4.6792, + "step": 76985 + }, + { + "epoch": 1.5663655598958335, + "grad_norm": 16.248817443847656, + "learning_rate": 6.674732084864915e-06, + "loss": 5.0866, + "step": 76990 + }, + { + "epoch": 1.56646728515625, + "grad_norm": 21.734691619873047, + "learning_rate": 6.67435548856263e-06, + "loss": 5.1965, + "step": 76995 + }, + { + "epoch": 1.5665690104166665, + "grad_norm": 17.32826042175293, + "learning_rate": 6.6739788815617445e-06, + "loss": 5.0247, + "step": 77000 + }, + { + "epoch": 1.5666707356770835, + "grad_norm": 18.76344108581543, + "learning_rate": 6.673602263864667e-06, + "loss": 5.0259, + "step": 77005 + }, + { + "epoch": 1.5667724609375, + "grad_norm": 19.877363204956055, + "learning_rate": 6.673225635473801e-06, + "loss": 4.9686, + "step": 77010 + }, + { + "epoch": 1.5668741861979165, + "grad_norm": 13.572972297668457, + "learning_rate": 6.672848996391555e-06, + "loss": 4.9486, + "step": 77015 + }, + { + "epoch": 1.5669759114583335, + "grad_norm": 24.330577850341797, + "learning_rate": 6.672472346620333e-06, + "loss": 5.1522, + "step": 77020 + }, + { + "epoch": 1.56707763671875, + "grad_norm": 17.15358543395996, + "learning_rate": 6.6720956861625455e-06, + "loss": 4.6497, + "step": 77025 + }, + { + "epoch": 1.5671793619791665, + "grad_norm": 19.662216186523438, + "learning_rate": 6.6717190150205965e-06, + "loss": 4.7488, + "step": 77030 + }, + { + "epoch": 1.5672810872395835, + "grad_norm": 13.19690227508545, + "learning_rate": 6.6713423331968955e-06, + "loss": 4.9743, + "step": 77035 + }, + { + "epoch": 1.5673828125, + "grad_norm": 17.28420066833496, + "learning_rate": 6.670965640693845e-06, + "loss": 4.8724, + "step": 77040 + }, + { + "epoch": 1.5674845377604165, + "grad_norm": 15.984939575195312, + "learning_rate": 6.670588937513855e-06, + "loss": 4.906, + "step": 77045 + }, + { + "epoch": 1.5675862630208335, + "grad_norm": 14.329710006713867, + "learning_rate": 6.670212223659334e-06, + "loss": 4.8266, + "step": 77050 + }, + { + "epoch": 1.56768798828125, + "grad_norm": 18.675655364990234, + "learning_rate": 6.6698354991326845e-06, + "loss": 4.9273, + "step": 77055 + }, + { + "epoch": 1.5677897135416665, + "grad_norm": 21.920639038085938, + "learning_rate": 6.669458763936318e-06, + "loss": 5.0283, + "step": 77060 + }, + { + "epoch": 1.5678914388020835, + "grad_norm": 22.761289596557617, + "learning_rate": 6.669082018072638e-06, + "loss": 4.9626, + "step": 77065 + }, + { + "epoch": 1.5679931640625, + "grad_norm": 15.971258163452148, + "learning_rate": 6.668705261544055e-06, + "loss": 5.3157, + "step": 77070 + }, + { + "epoch": 1.5680948893229165, + "grad_norm": 17.59691619873047, + "learning_rate": 6.668328494352976e-06, + "loss": 4.8535, + "step": 77075 + }, + { + "epoch": 1.5681966145833335, + "grad_norm": 14.7621488571167, + "learning_rate": 6.667951716501806e-06, + "loss": 4.8655, + "step": 77080 + }, + { + "epoch": 1.56829833984375, + "grad_norm": 14.647316932678223, + "learning_rate": 6.667574927992954e-06, + "loss": 4.9633, + "step": 77085 + }, + { + "epoch": 1.5684000651041665, + "grad_norm": 35.37171173095703, + "learning_rate": 6.667198128828829e-06, + "loss": 5.4106, + "step": 77090 + }, + { + "epoch": 1.5685017903645835, + "grad_norm": 20.683181762695312, + "learning_rate": 6.666821319011836e-06, + "loss": 4.9168, + "step": 77095 + }, + { + "epoch": 1.568603515625, + "grad_norm": 13.150004386901855, + "learning_rate": 6.666444498544384e-06, + "loss": 4.7297, + "step": 77100 + }, + { + "epoch": 1.5687052408854165, + "grad_norm": 15.156237602233887, + "learning_rate": 6.666067667428882e-06, + "loss": 4.8932, + "step": 77105 + }, + { + "epoch": 1.5688069661458335, + "grad_norm": 17.449796676635742, + "learning_rate": 6.665690825667734e-06, + "loss": 4.7956, + "step": 77110 + }, + { + "epoch": 1.56890869140625, + "grad_norm": 20.085630416870117, + "learning_rate": 6.665313973263352e-06, + "loss": 5.1111, + "step": 77115 + }, + { + "epoch": 1.5690104166666665, + "grad_norm": 17.30594825744629, + "learning_rate": 6.664937110218143e-06, + "loss": 4.8045, + "step": 77120 + }, + { + "epoch": 1.5691121419270835, + "grad_norm": 20.32632064819336, + "learning_rate": 6.664560236534514e-06, + "loss": 4.5741, + "step": 77125 + }, + { + "epoch": 1.5692138671875, + "grad_norm": 15.051185607910156, + "learning_rate": 6.664183352214872e-06, + "loss": 4.7458, + "step": 77130 + }, + { + "epoch": 1.5693155924479165, + "grad_norm": 14.053901672363281, + "learning_rate": 6.663806457261628e-06, + "loss": 4.7697, + "step": 77135 + }, + { + "epoch": 1.5694173177083335, + "grad_norm": 19.088768005371094, + "learning_rate": 6.66342955167719e-06, + "loss": 4.8923, + "step": 77140 + }, + { + "epoch": 1.56951904296875, + "grad_norm": 18.21845817565918, + "learning_rate": 6.663052635463962e-06, + "loss": 4.9712, + "step": 77145 + }, + { + "epoch": 1.5696207682291665, + "grad_norm": 20.416812896728516, + "learning_rate": 6.662675708624359e-06, + "loss": 4.7975, + "step": 77150 + }, + { + "epoch": 1.5697224934895835, + "grad_norm": 23.767131805419922, + "learning_rate": 6.662298771160783e-06, + "loss": 4.7764, + "step": 77155 + }, + { + "epoch": 1.56982421875, + "grad_norm": 17.172060012817383, + "learning_rate": 6.6619218230756474e-06, + "loss": 4.8257, + "step": 77160 + }, + { + "epoch": 1.5699259440104165, + "grad_norm": 15.078773498535156, + "learning_rate": 6.6615448643713585e-06, + "loss": 4.879, + "step": 77165 + }, + { + "epoch": 1.5700276692708335, + "grad_norm": 15.761359214782715, + "learning_rate": 6.661167895050326e-06, + "loss": 4.6966, + "step": 77170 + }, + { + "epoch": 1.57012939453125, + "grad_norm": 15.276300430297852, + "learning_rate": 6.660790915114958e-06, + "loss": 5.0056, + "step": 77175 + }, + { + "epoch": 1.5702311197916665, + "grad_norm": 53.92129898071289, + "learning_rate": 6.660413924567661e-06, + "loss": 5.2943, + "step": 77180 + }, + { + "epoch": 1.5703328450520835, + "grad_norm": 21.43948745727539, + "learning_rate": 6.660036923410848e-06, + "loss": 4.8768, + "step": 77185 + }, + { + "epoch": 1.5704345703125, + "grad_norm": 16.00575065612793, + "learning_rate": 6.659659911646926e-06, + "loss": 5.0055, + "step": 77190 + }, + { + "epoch": 1.5705362955729165, + "grad_norm": 13.893599510192871, + "learning_rate": 6.659282889278304e-06, + "loss": 4.8836, + "step": 77195 + }, + { + "epoch": 1.5706380208333335, + "grad_norm": 16.819311141967773, + "learning_rate": 6.658905856307392e-06, + "loss": 4.9692, + "step": 77200 + }, + { + "epoch": 1.57073974609375, + "grad_norm": 20.82662010192871, + "learning_rate": 6.6585288127365966e-06, + "loss": 4.7414, + "step": 77205 + }, + { + "epoch": 1.5708414713541665, + "grad_norm": 18.317386627197266, + "learning_rate": 6.658151758568328e-06, + "loss": 4.7496, + "step": 77210 + }, + { + "epoch": 1.5709431966145835, + "grad_norm": 22.11166763305664, + "learning_rate": 6.657774693804997e-06, + "loss": 4.7705, + "step": 77215 + }, + { + "epoch": 1.571044921875, + "grad_norm": 20.051393508911133, + "learning_rate": 6.657397618449012e-06, + "loss": 4.7324, + "step": 77220 + }, + { + "epoch": 1.5711466471354165, + "grad_norm": 16.454753875732422, + "learning_rate": 6.657020532502781e-06, + "loss": 4.7521, + "step": 77225 + }, + { + "epoch": 1.5712483723958335, + "grad_norm": 19.554540634155273, + "learning_rate": 6.656643435968717e-06, + "loss": 4.8683, + "step": 77230 + }, + { + "epoch": 1.57135009765625, + "grad_norm": 15.4120454788208, + "learning_rate": 6.656266328849226e-06, + "loss": 4.9872, + "step": 77235 + }, + { + "epoch": 1.5714518229166665, + "grad_norm": 15.056573867797852, + "learning_rate": 6.65588921114672e-06, + "loss": 4.9009, + "step": 77240 + }, + { + "epoch": 1.5715535481770835, + "grad_norm": 20.33596420288086, + "learning_rate": 6.6555120828636065e-06, + "loss": 5.18, + "step": 77245 + }, + { + "epoch": 1.5716552734375, + "grad_norm": 17.819833755493164, + "learning_rate": 6.655134944002297e-06, + "loss": 5.0803, + "step": 77250 + }, + { + "epoch": 1.5717569986979165, + "grad_norm": 18.437294006347656, + "learning_rate": 6.654757794565201e-06, + "loss": 4.9004, + "step": 77255 + }, + { + "epoch": 1.5718587239583335, + "grad_norm": 16.42464256286621, + "learning_rate": 6.654380634554727e-06, + "loss": 4.8669, + "step": 77260 + }, + { + "epoch": 1.57196044921875, + "grad_norm": 18.537385940551758, + "learning_rate": 6.654003463973286e-06, + "loss": 4.943, + "step": 77265 + }, + { + "epoch": 1.5720621744791665, + "grad_norm": 16.75611114501953, + "learning_rate": 6.653626282823289e-06, + "loss": 4.9543, + "step": 77270 + }, + { + "epoch": 1.5721638997395835, + "grad_norm": 27.799327850341797, + "learning_rate": 6.653249091107143e-06, + "loss": 5.4173, + "step": 77275 + }, + { + "epoch": 1.572265625, + "grad_norm": 15.965200424194336, + "learning_rate": 6.652871888827262e-06, + "loss": 4.6258, + "step": 77280 + }, + { + "epoch": 1.5723673502604165, + "grad_norm": 21.83049964904785, + "learning_rate": 6.652494675986053e-06, + "loss": 4.7848, + "step": 77285 + }, + { + "epoch": 1.5724690755208335, + "grad_norm": 21.203584671020508, + "learning_rate": 6.652117452585929e-06, + "loss": 5.1301, + "step": 77290 + }, + { + "epoch": 1.57257080078125, + "grad_norm": 14.219804763793945, + "learning_rate": 6.651740218629299e-06, + "loss": 4.8281, + "step": 77295 + }, + { + "epoch": 1.5726725260416665, + "grad_norm": 14.234814643859863, + "learning_rate": 6.651362974118571e-06, + "loss": 4.9706, + "step": 77300 + }, + { + "epoch": 1.5727742513020835, + "grad_norm": 16.277894973754883, + "learning_rate": 6.65098571905616e-06, + "loss": 4.9603, + "step": 77305 + }, + { + "epoch": 1.5728759765625, + "grad_norm": 17.002607345581055, + "learning_rate": 6.650608453444473e-06, + "loss": 4.8717, + "step": 77310 + }, + { + "epoch": 1.5729777018229165, + "grad_norm": 14.225690841674805, + "learning_rate": 6.650231177285924e-06, + "loss": 4.7246, + "step": 77315 + }, + { + "epoch": 1.5730794270833335, + "grad_norm": 24.582258224487305, + "learning_rate": 6.64985389058292e-06, + "loss": 4.8287, + "step": 77320 + }, + { + "epoch": 1.57318115234375, + "grad_norm": 40.107574462890625, + "learning_rate": 6.6494765933378735e-06, + "loss": 5.0005, + "step": 77325 + }, + { + "epoch": 1.5732828776041665, + "grad_norm": 18.931095123291016, + "learning_rate": 6.649099285553196e-06, + "loss": 4.8337, + "step": 77330 + }, + { + "epoch": 1.5733846028645835, + "grad_norm": 17.930971145629883, + "learning_rate": 6.648721967231295e-06, + "loss": 4.6975, + "step": 77335 + }, + { + "epoch": 1.573486328125, + "grad_norm": 20.601770401000977, + "learning_rate": 6.648344638374587e-06, + "loss": 5.2505, + "step": 77340 + }, + { + "epoch": 1.5735880533854165, + "grad_norm": 18.314041137695312, + "learning_rate": 6.647967298985479e-06, + "loss": 4.9197, + "step": 77345 + }, + { + "epoch": 1.5736897786458335, + "grad_norm": 14.572474479675293, + "learning_rate": 6.647589949066382e-06, + "loss": 4.7883, + "step": 77350 + }, + { + "epoch": 1.57379150390625, + "grad_norm": 13.086615562438965, + "learning_rate": 6.647212588619709e-06, + "loss": 5.0806, + "step": 77355 + }, + { + "epoch": 1.5738932291666665, + "grad_norm": 23.99964714050293, + "learning_rate": 6.6468352176478716e-06, + "loss": 5.1527, + "step": 77360 + }, + { + "epoch": 1.5739949544270835, + "grad_norm": 18.141218185424805, + "learning_rate": 6.646457836153278e-06, + "loss": 4.9524, + "step": 77365 + }, + { + "epoch": 1.5740966796875, + "grad_norm": 17.225749969482422, + "learning_rate": 6.646080444138343e-06, + "loss": 4.8529, + "step": 77370 + }, + { + "epoch": 1.5741984049479165, + "grad_norm": 16.026010513305664, + "learning_rate": 6.645703041605475e-06, + "loss": 4.8233, + "step": 77375 + }, + { + "epoch": 1.5743001302083335, + "grad_norm": 19.14385986328125, + "learning_rate": 6.645325628557089e-06, + "loss": 4.996, + "step": 77380 + }, + { + "epoch": 1.57440185546875, + "grad_norm": 18.615066528320312, + "learning_rate": 6.644948204995594e-06, + "loss": 4.8104, + "step": 77385 + }, + { + "epoch": 1.5745035807291665, + "grad_norm": 16.22861671447754, + "learning_rate": 6.644570770923401e-06, + "loss": 4.9981, + "step": 77390 + }, + { + "epoch": 1.5746053059895835, + "grad_norm": 15.076242446899414, + "learning_rate": 6.644193326342923e-06, + "loss": 5.0413, + "step": 77395 + }, + { + "epoch": 1.57470703125, + "grad_norm": 14.269573211669922, + "learning_rate": 6.643815871256572e-06, + "loss": 4.6518, + "step": 77400 + }, + { + "epoch": 1.5748087565104165, + "grad_norm": 19.455801010131836, + "learning_rate": 6.64343840566676e-06, + "loss": 5.0163, + "step": 77405 + }, + { + "epoch": 1.5749104817708335, + "grad_norm": 32.4377326965332, + "learning_rate": 6.6430609295758986e-06, + "loss": 4.9752, + "step": 77410 + }, + { + "epoch": 1.57501220703125, + "grad_norm": 16.345338821411133, + "learning_rate": 6.642683442986398e-06, + "loss": 5.2928, + "step": 77415 + }, + { + "epoch": 1.5751139322916665, + "grad_norm": 15.369772911071777, + "learning_rate": 6.642305945900672e-06, + "loss": 4.9042, + "step": 77420 + }, + { + "epoch": 1.5752156575520835, + "grad_norm": 20.659915924072266, + "learning_rate": 6.641928438321134e-06, + "loss": 5.3429, + "step": 77425 + }, + { + "epoch": 1.5753173828125, + "grad_norm": 18.664201736450195, + "learning_rate": 6.641550920250191e-06, + "loss": 4.8629, + "step": 77430 + }, + { + "epoch": 1.5754191080729165, + "grad_norm": 17.74066734313965, + "learning_rate": 6.641173391690262e-06, + "loss": 4.7563, + "step": 77435 + }, + { + "epoch": 1.5755208333333335, + "grad_norm": 20.164222717285156, + "learning_rate": 6.640795852643754e-06, + "loss": 4.9036, + "step": 77440 + }, + { + "epoch": 1.57562255859375, + "grad_norm": 19.931501388549805, + "learning_rate": 6.640418303113083e-06, + "loss": 4.871, + "step": 77445 + }, + { + "epoch": 1.5757242838541665, + "grad_norm": 16.978355407714844, + "learning_rate": 6.64004074310066e-06, + "loss": 4.7451, + "step": 77450 + }, + { + "epoch": 1.5758260091145835, + "grad_norm": 17.429311752319336, + "learning_rate": 6.639663172608895e-06, + "loss": 5.1935, + "step": 77455 + }, + { + "epoch": 1.575927734375, + "grad_norm": 17.719810485839844, + "learning_rate": 6.6392855916402045e-06, + "loss": 4.8746, + "step": 77460 + }, + { + "epoch": 1.5760294596354165, + "grad_norm": 17.451316833496094, + "learning_rate": 6.638908000196997e-06, + "loss": 4.8438, + "step": 77465 + }, + { + "epoch": 1.5761311848958335, + "grad_norm": 23.49443244934082, + "learning_rate": 6.6385303982816905e-06, + "loss": 4.8065, + "step": 77470 + }, + { + "epoch": 1.57623291015625, + "grad_norm": 18.236867904663086, + "learning_rate": 6.638152785896696e-06, + "loss": 4.857, + "step": 77475 + }, + { + "epoch": 1.5763346354166665, + "grad_norm": 22.60857582092285, + "learning_rate": 6.637775163044422e-06, + "loss": 4.8117, + "step": 77480 + }, + { + "epoch": 1.5764363606770835, + "grad_norm": 14.52153205871582, + "learning_rate": 6.637397529727286e-06, + "loss": 4.6585, + "step": 77485 + }, + { + "epoch": 1.5765380859375, + "grad_norm": 28.365310668945312, + "learning_rate": 6.637019885947699e-06, + "loss": 5.1931, + "step": 77490 + }, + { + "epoch": 1.5766398111979165, + "grad_norm": 23.148710250854492, + "learning_rate": 6.636642231708074e-06, + "loss": 4.8336, + "step": 77495 + }, + { + "epoch": 1.5767415364583335, + "grad_norm": 16.678871154785156, + "learning_rate": 6.636264567010826e-06, + "loss": 4.9599, + "step": 77500 + }, + { + "epoch": 1.57684326171875, + "grad_norm": 19.665889739990234, + "learning_rate": 6.6358868918583665e-06, + "loss": 4.8561, + "step": 77505 + }, + { + "epoch": 1.5769449869791665, + "grad_norm": 22.080692291259766, + "learning_rate": 6.635509206253109e-06, + "loss": 4.9915, + "step": 77510 + }, + { + "epoch": 1.5770467122395835, + "grad_norm": 16.66773796081543, + "learning_rate": 6.6351315101974675e-06, + "loss": 5.0044, + "step": 77515 + }, + { + "epoch": 1.5771484375, + "grad_norm": 18.764965057373047, + "learning_rate": 6.6347538036938545e-06, + "loss": 4.9324, + "step": 77520 + }, + { + "epoch": 1.5772501627604165, + "grad_norm": 14.415276527404785, + "learning_rate": 6.634376086744683e-06, + "loss": 4.9151, + "step": 77525 + }, + { + "epoch": 1.5773518880208335, + "grad_norm": 14.457918167114258, + "learning_rate": 6.633998359352367e-06, + "loss": 5.0574, + "step": 77530 + }, + { + "epoch": 1.57745361328125, + "grad_norm": 15.51650619506836, + "learning_rate": 6.6336206215193215e-06, + "loss": 4.748, + "step": 77535 + }, + { + "epoch": 1.5775553385416665, + "grad_norm": 18.052026748657227, + "learning_rate": 6.633242873247958e-06, + "loss": 4.9214, + "step": 77540 + }, + { + "epoch": 1.5776570638020835, + "grad_norm": 27.23762321472168, + "learning_rate": 6.63286511454069e-06, + "loss": 5.1739, + "step": 77545 + }, + { + "epoch": 1.5777587890625, + "grad_norm": 19.140602111816406, + "learning_rate": 6.632487345399934e-06, + "loss": 4.989, + "step": 77550 + }, + { + "epoch": 1.5778605143229165, + "grad_norm": 16.49518394470215, + "learning_rate": 6.6321095658280995e-06, + "loss": 4.8902, + "step": 77555 + }, + { + "epoch": 1.5779622395833335, + "grad_norm": 21.209842681884766, + "learning_rate": 6.6317317758276055e-06, + "loss": 4.8084, + "step": 77560 + }, + { + "epoch": 1.57806396484375, + "grad_norm": 18.01194953918457, + "learning_rate": 6.631353975400863e-06, + "loss": 4.6831, + "step": 77565 + }, + { + "epoch": 1.5781656901041665, + "grad_norm": 14.607440948486328, + "learning_rate": 6.630976164550286e-06, + "loss": 5.0457, + "step": 77570 + }, + { + "epoch": 1.5782674153645835, + "grad_norm": 17.221723556518555, + "learning_rate": 6.630598343278289e-06, + "loss": 4.952, + "step": 77575 + }, + { + "epoch": 1.578369140625, + "grad_norm": 14.538558959960938, + "learning_rate": 6.630220511587288e-06, + "loss": 4.8661, + "step": 77580 + }, + { + "epoch": 1.5784708658854165, + "grad_norm": 16.962881088256836, + "learning_rate": 6.629842669479693e-06, + "loss": 4.8994, + "step": 77585 + }, + { + "epoch": 1.5785725911458335, + "grad_norm": 15.887263298034668, + "learning_rate": 6.6294648169579215e-06, + "loss": 4.6908, + "step": 77590 + }, + { + "epoch": 1.57867431640625, + "grad_norm": 15.106478691101074, + "learning_rate": 6.629086954024387e-06, + "loss": 5.0457, + "step": 77595 + }, + { + "epoch": 1.5787760416666665, + "grad_norm": 19.506431579589844, + "learning_rate": 6.628709080681504e-06, + "loss": 4.8369, + "step": 77600 + }, + { + "epoch": 1.5788777669270835, + "grad_norm": 19.467586517333984, + "learning_rate": 6.628331196931688e-06, + "loss": 4.7952, + "step": 77605 + }, + { + "epoch": 1.5789794921875, + "grad_norm": 18.929115295410156, + "learning_rate": 6.62795330277735e-06, + "loss": 4.9676, + "step": 77610 + }, + { + "epoch": 1.5790812174479165, + "grad_norm": 16.161827087402344, + "learning_rate": 6.62757539822091e-06, + "loss": 4.7683, + "step": 77615 + }, + { + "epoch": 1.5791829427083335, + "grad_norm": 19.14144515991211, + "learning_rate": 6.6271974832647775e-06, + "loss": 4.9417, + "step": 77620 + }, + { + "epoch": 1.57928466796875, + "grad_norm": 17.14496612548828, + "learning_rate": 6.626819557911371e-06, + "loss": 4.7634, + "step": 77625 + }, + { + "epoch": 1.5793863932291665, + "grad_norm": 24.611003875732422, + "learning_rate": 6.626441622163103e-06, + "loss": 4.7916, + "step": 77630 + }, + { + "epoch": 1.5794881184895835, + "grad_norm": 21.676376342773438, + "learning_rate": 6.62606367602239e-06, + "loss": 4.9618, + "step": 77635 + }, + { + "epoch": 1.57958984375, + "grad_norm": 20.479822158813477, + "learning_rate": 6.625685719491647e-06, + "loss": 5.0592, + "step": 77640 + }, + { + "epoch": 1.5796915690104165, + "grad_norm": 15.640852928161621, + "learning_rate": 6.625307752573289e-06, + "loss": 5.0899, + "step": 77645 + }, + { + "epoch": 1.5797932942708335, + "grad_norm": 18.899658203125, + "learning_rate": 6.624929775269728e-06, + "loss": 5.0607, + "step": 77650 + }, + { + "epoch": 1.57989501953125, + "grad_norm": 16.688844680786133, + "learning_rate": 6.6245517875833835e-06, + "loss": 4.8923, + "step": 77655 + }, + { + "epoch": 1.5799967447916665, + "grad_norm": 18.497323989868164, + "learning_rate": 6.624173789516667e-06, + "loss": 4.9274, + "step": 77660 + }, + { + "epoch": 1.5800984700520835, + "grad_norm": 16.93980598449707, + "learning_rate": 6.623795781071997e-06, + "loss": 4.8244, + "step": 77665 + }, + { + "epoch": 1.5802001953125, + "grad_norm": 22.385883331298828, + "learning_rate": 6.623417762251787e-06, + "loss": 4.7979, + "step": 77670 + }, + { + "epoch": 1.5803019205729165, + "grad_norm": 19.751649856567383, + "learning_rate": 6.623039733058452e-06, + "loss": 4.7964, + "step": 77675 + }, + { + "epoch": 1.5804036458333335, + "grad_norm": 23.129222869873047, + "learning_rate": 6.62266169349441e-06, + "loss": 5.1402, + "step": 77680 + }, + { + "epoch": 1.58050537109375, + "grad_norm": 19.337034225463867, + "learning_rate": 6.622283643562073e-06, + "loss": 5.2993, + "step": 77685 + }, + { + "epoch": 1.5806070963541665, + "grad_norm": 13.123196601867676, + "learning_rate": 6.62190558326386e-06, + "loss": 4.9882, + "step": 77690 + }, + { + "epoch": 1.5807088216145835, + "grad_norm": 13.228594779968262, + "learning_rate": 6.621527512602185e-06, + "loss": 4.6583, + "step": 77695 + }, + { + "epoch": 1.580810546875, + "grad_norm": 21.695676803588867, + "learning_rate": 6.621149431579463e-06, + "loss": 4.9929, + "step": 77700 + }, + { + "epoch": 1.5809122721354165, + "grad_norm": 22.617515563964844, + "learning_rate": 6.6207713401981125e-06, + "loss": 4.7694, + "step": 77705 + }, + { + "epoch": 1.5810139973958335, + "grad_norm": 19.52504539489746, + "learning_rate": 6.620393238460547e-06, + "loss": 4.9238, + "step": 77710 + }, + { + "epoch": 1.58111572265625, + "grad_norm": 18.595849990844727, + "learning_rate": 6.620015126369181e-06, + "loss": 4.8109, + "step": 77715 + }, + { + "epoch": 1.5812174479166665, + "grad_norm": 16.176525115966797, + "learning_rate": 6.619637003926436e-06, + "loss": 5.0899, + "step": 77720 + }, + { + "epoch": 1.5813191731770835, + "grad_norm": 17.047338485717773, + "learning_rate": 6.619258871134721e-06, + "loss": 4.9459, + "step": 77725 + }, + { + "epoch": 1.5814208984375, + "grad_norm": 13.650521278381348, + "learning_rate": 6.618880727996458e-06, + "loss": 5.1281, + "step": 77730 + }, + { + "epoch": 1.5815226236979165, + "grad_norm": 20.211393356323242, + "learning_rate": 6.618502574514062e-06, + "loss": 5.0344, + "step": 77735 + }, + { + "epoch": 1.5816243489583335, + "grad_norm": 18.023235321044922, + "learning_rate": 6.618124410689945e-06, + "loss": 4.7396, + "step": 77740 + }, + { + "epoch": 1.58172607421875, + "grad_norm": 13.948060035705566, + "learning_rate": 6.61774623652653e-06, + "loss": 4.6664, + "step": 77745 + }, + { + "epoch": 1.5818277994791665, + "grad_norm": 16.577945709228516, + "learning_rate": 6.6173680520262285e-06, + "loss": 4.8052, + "step": 77750 + }, + { + "epoch": 1.5819295247395835, + "grad_norm": 14.454174995422363, + "learning_rate": 6.616989857191459e-06, + "loss": 4.7987, + "step": 77755 + }, + { + "epoch": 1.58203125, + "grad_norm": 15.34245491027832, + "learning_rate": 6.616611652024638e-06, + "loss": 5.0759, + "step": 77760 + }, + { + "epoch": 1.5821329752604165, + "grad_norm": 17.884828567504883, + "learning_rate": 6.61623343652818e-06, + "loss": 5.0209, + "step": 77765 + }, + { + "epoch": 1.5822347005208335, + "grad_norm": 20.670536041259766, + "learning_rate": 6.615855210704505e-06, + "loss": 4.7749, + "step": 77770 + }, + { + "epoch": 1.58233642578125, + "grad_norm": 14.572765350341797, + "learning_rate": 6.615476974556027e-06, + "loss": 5.1979, + "step": 77775 + }, + { + "epoch": 1.5824381510416665, + "grad_norm": 16.54180908203125, + "learning_rate": 6.615098728085165e-06, + "loss": 4.9896, + "step": 77780 + }, + { + "epoch": 1.5825398763020835, + "grad_norm": 14.514839172363281, + "learning_rate": 6.614720471294336e-06, + "loss": 4.7059, + "step": 77785 + }, + { + "epoch": 1.5826416015625, + "grad_norm": 41.79184341430664, + "learning_rate": 6.614342204185954e-06, + "loss": 4.8889, + "step": 77790 + }, + { + "epoch": 1.5827433268229165, + "grad_norm": 17.944751739501953, + "learning_rate": 6.6139639267624376e-06, + "loss": 5.1593, + "step": 77795 + }, + { + "epoch": 1.5828450520833335, + "grad_norm": 19.717296600341797, + "learning_rate": 6.613585639026205e-06, + "loss": 4.936, + "step": 77800 + }, + { + "epoch": 1.58294677734375, + "grad_norm": 14.94243335723877, + "learning_rate": 6.613207340979673e-06, + "loss": 4.6594, + "step": 77805 + }, + { + "epoch": 1.5830485026041665, + "grad_norm": 21.33932113647461, + "learning_rate": 6.612829032625257e-06, + "loss": 4.9789, + "step": 77810 + }, + { + "epoch": 1.5831502278645835, + "grad_norm": 15.058452606201172, + "learning_rate": 6.6124507139653745e-06, + "loss": 4.8601, + "step": 77815 + }, + { + "epoch": 1.583251953125, + "grad_norm": 17.630775451660156, + "learning_rate": 6.612072385002446e-06, + "loss": 4.704, + "step": 77820 + }, + { + "epoch": 1.5833536783854165, + "grad_norm": 19.48297119140625, + "learning_rate": 6.6116940457388855e-06, + "loss": 4.9279, + "step": 77825 + }, + { + "epoch": 1.5834554036458335, + "grad_norm": 17.960893630981445, + "learning_rate": 6.611315696177112e-06, + "loss": 4.7972, + "step": 77830 + }, + { + "epoch": 1.58355712890625, + "grad_norm": 17.1477108001709, + "learning_rate": 6.610937336319543e-06, + "loss": 5.093, + "step": 77835 + }, + { + "epoch": 1.5836588541666665, + "grad_norm": 17.76117706298828, + "learning_rate": 6.610558966168595e-06, + "loss": 4.8251, + "step": 77840 + }, + { + "epoch": 1.5837605794270835, + "grad_norm": 27.7379150390625, + "learning_rate": 6.6101805857266865e-06, + "loss": 4.8586, + "step": 77845 + }, + { + "epoch": 1.5838623046875, + "grad_norm": 21.52096939086914, + "learning_rate": 6.609802194996235e-06, + "loss": 5.0328, + "step": 77850 + }, + { + "epoch": 1.5839640299479165, + "grad_norm": 26.95650291442871, + "learning_rate": 6.609423793979659e-06, + "loss": 4.9718, + "step": 77855 + }, + { + "epoch": 1.5840657552083335, + "grad_norm": 20.346927642822266, + "learning_rate": 6.609045382679377e-06, + "loss": 4.9219, + "step": 77860 + }, + { + "epoch": 1.58416748046875, + "grad_norm": 18.13079261779785, + "learning_rate": 6.608666961097805e-06, + "loss": 4.6425, + "step": 77865 + }, + { + "epoch": 1.5842692057291665, + "grad_norm": 17.36900520324707, + "learning_rate": 6.60828852923736e-06, + "loss": 4.9962, + "step": 77870 + }, + { + "epoch": 1.5843709309895835, + "grad_norm": 15.326240539550781, + "learning_rate": 6.607910087100465e-06, + "loss": 5.0464, + "step": 77875 + }, + { + "epoch": 1.58447265625, + "grad_norm": 16.710857391357422, + "learning_rate": 6.6075316346895314e-06, + "loss": 5.0364, + "step": 77880 + }, + { + "epoch": 1.5845743815104165, + "grad_norm": 25.794906616210938, + "learning_rate": 6.607153172006984e-06, + "loss": 5.1554, + "step": 77885 + }, + { + "epoch": 1.5846761067708335, + "grad_norm": 29.89504623413086, + "learning_rate": 6.606774699055235e-06, + "loss": 4.5934, + "step": 77890 + }, + { + "epoch": 1.58477783203125, + "grad_norm": 15.245182037353516, + "learning_rate": 6.606396215836708e-06, + "loss": 4.9074, + "step": 77895 + }, + { + "epoch": 1.5848795572916665, + "grad_norm": 18.782485961914062, + "learning_rate": 6.606017722353818e-06, + "loss": 4.9797, + "step": 77900 + }, + { + "epoch": 1.5849812825520835, + "grad_norm": 25.071495056152344, + "learning_rate": 6.605639218608985e-06, + "loss": 5.1173, + "step": 77905 + }, + { + "epoch": 1.5850830078125, + "grad_norm": 19.678476333618164, + "learning_rate": 6.605260704604628e-06, + "loss": 4.6668, + "step": 77910 + }, + { + "epoch": 1.5851847330729165, + "grad_norm": 14.368860244750977, + "learning_rate": 6.604882180343164e-06, + "loss": 5.03, + "step": 77915 + }, + { + "epoch": 1.5852864583333335, + "grad_norm": 16.759225845336914, + "learning_rate": 6.604503645827012e-06, + "loss": 5.0981, + "step": 77920 + }, + { + "epoch": 1.58538818359375, + "grad_norm": 21.398752212524414, + "learning_rate": 6.604125101058591e-06, + "loss": 5.0934, + "step": 77925 + }, + { + "epoch": 1.5854899088541665, + "grad_norm": 18.119768142700195, + "learning_rate": 6.6037465460403205e-06, + "loss": 5.219, + "step": 77930 + }, + { + "epoch": 1.5855916341145835, + "grad_norm": 17.31157875061035, + "learning_rate": 6.603367980774617e-06, + "loss": 4.8956, + "step": 77935 + }, + { + "epoch": 1.585693359375, + "grad_norm": 15.577363014221191, + "learning_rate": 6.602989405263905e-06, + "loss": 5.0001, + "step": 77940 + }, + { + "epoch": 1.5857950846354165, + "grad_norm": 15.22318172454834, + "learning_rate": 6.602610819510595e-06, + "loss": 5.0439, + "step": 77945 + }, + { + "epoch": 1.5858968098958335, + "grad_norm": 15.499859809875488, + "learning_rate": 6.602232223517113e-06, + "loss": 4.7979, + "step": 77950 + }, + { + "epoch": 1.58599853515625, + "grad_norm": 22.96691131591797, + "learning_rate": 6.601853617285876e-06, + "loss": 5.0319, + "step": 77955 + }, + { + "epoch": 1.5861002604166665, + "grad_norm": 14.643075942993164, + "learning_rate": 6.601475000819302e-06, + "loss": 4.8932, + "step": 77960 + }, + { + "epoch": 1.5862019856770835, + "grad_norm": 15.523459434509277, + "learning_rate": 6.601096374119811e-06, + "loss": 5.1281, + "step": 77965 + }, + { + "epoch": 1.5863037109375, + "grad_norm": 16.172819137573242, + "learning_rate": 6.600717737189822e-06, + "loss": 4.8612, + "step": 77970 + }, + { + "epoch": 1.5864054361979165, + "grad_norm": 21.869779586791992, + "learning_rate": 6.6003390900317555e-06, + "loss": 4.7814, + "step": 77975 + }, + { + "epoch": 1.5865071614583335, + "grad_norm": 14.84055233001709, + "learning_rate": 6.599960432648032e-06, + "loss": 4.8629, + "step": 77980 + }, + { + "epoch": 1.58660888671875, + "grad_norm": 19.87416648864746, + "learning_rate": 6.599581765041065e-06, + "loss": 4.9495, + "step": 77985 + }, + { + "epoch": 1.5867106119791665, + "grad_norm": 17.335220336914062, + "learning_rate": 6.599203087213281e-06, + "loss": 4.897, + "step": 77990 + }, + { + "epoch": 1.5868123372395835, + "grad_norm": 14.514521598815918, + "learning_rate": 6.598824399167097e-06, + "loss": 4.9176, + "step": 77995 + }, + { + "epoch": 1.5869140625, + "grad_norm": 17.481840133666992, + "learning_rate": 6.598445700904933e-06, + "loss": 4.767, + "step": 78000 + }, + { + "epoch": 1.5870157877604165, + "grad_norm": 17.801822662353516, + "learning_rate": 6.598066992429209e-06, + "loss": 4.8558, + "step": 78005 + }, + { + "epoch": 1.5871175130208335, + "grad_norm": 15.426238059997559, + "learning_rate": 6.597688273742342e-06, + "loss": 4.9405, + "step": 78010 + }, + { + "epoch": 1.58721923828125, + "grad_norm": 19.654869079589844, + "learning_rate": 6.597309544846755e-06, + "loss": 4.7839, + "step": 78015 + }, + { + "epoch": 1.5873209635416665, + "grad_norm": 18.33132553100586, + "learning_rate": 6.596930805744866e-06, + "loss": 4.8876, + "step": 78020 + }, + { + "epoch": 1.5874226888020835, + "grad_norm": 20.6184139251709, + "learning_rate": 6.596552056439099e-06, + "loss": 4.7466, + "step": 78025 + }, + { + "epoch": 1.5875244140625, + "grad_norm": 22.487611770629883, + "learning_rate": 6.596173296931867e-06, + "loss": 4.8928, + "step": 78030 + }, + { + "epoch": 1.5876261393229165, + "grad_norm": 20.06707000732422, + "learning_rate": 6.595794527225598e-06, + "loss": 4.701, + "step": 78035 + }, + { + "epoch": 1.5877278645833335, + "grad_norm": 18.28289031982422, + "learning_rate": 6.595415747322707e-06, + "loss": 4.8026, + "step": 78040 + }, + { + "epoch": 1.58782958984375, + "grad_norm": 15.163468360900879, + "learning_rate": 6.595036957225615e-06, + "loss": 4.5931, + "step": 78045 + }, + { + "epoch": 1.5879313151041665, + "grad_norm": 17.500581741333008, + "learning_rate": 6.594658156936744e-06, + "loss": 4.828, + "step": 78050 + }, + { + "epoch": 1.5880330403645835, + "grad_norm": 15.045737266540527, + "learning_rate": 6.594279346458514e-06, + "loss": 4.9214, + "step": 78055 + }, + { + "epoch": 1.588134765625, + "grad_norm": 18.46704864501953, + "learning_rate": 6.593900525793344e-06, + "loss": 4.769, + "step": 78060 + }, + { + "epoch": 1.5882364908854165, + "grad_norm": 18.45603370666504, + "learning_rate": 6.593521694943657e-06, + "loss": 4.9173, + "step": 78065 + }, + { + "epoch": 1.5883382161458335, + "grad_norm": 16.590702056884766, + "learning_rate": 6.593142853911872e-06, + "loss": 4.8965, + "step": 78070 + }, + { + "epoch": 1.58843994140625, + "grad_norm": 19.947805404663086, + "learning_rate": 6.592764002700409e-06, + "loss": 4.8081, + "step": 78075 + }, + { + "epoch": 1.5885416666666665, + "grad_norm": 15.450380325317383, + "learning_rate": 6.592385141311691e-06, + "loss": 5.001, + "step": 78080 + }, + { + "epoch": 1.5886433919270835, + "grad_norm": 15.676187515258789, + "learning_rate": 6.592006269748137e-06, + "loss": 4.9616, + "step": 78085 + }, + { + "epoch": 1.5887451171875, + "grad_norm": 19.1705322265625, + "learning_rate": 6.591627388012167e-06, + "loss": 4.9529, + "step": 78090 + }, + { + "epoch": 1.5888468424479165, + "grad_norm": 22.491634368896484, + "learning_rate": 6.5912484961062055e-06, + "loss": 4.5458, + "step": 78095 + }, + { + "epoch": 1.5889485677083335, + "grad_norm": 17.38690948486328, + "learning_rate": 6.5908695940326695e-06, + "loss": 4.6812, + "step": 78100 + }, + { + "epoch": 1.58905029296875, + "grad_norm": 16.00044822692871, + "learning_rate": 6.590490681793982e-06, + "loss": 4.8226, + "step": 78105 + }, + { + "epoch": 1.5891520182291665, + "grad_norm": 17.75666618347168, + "learning_rate": 6.590111759392565e-06, + "loss": 4.9106, + "step": 78110 + }, + { + "epoch": 1.5892537434895835, + "grad_norm": 20.661163330078125, + "learning_rate": 6.589732826830837e-06, + "loss": 4.8017, + "step": 78115 + }, + { + "epoch": 1.58935546875, + "grad_norm": 22.928852081298828, + "learning_rate": 6.589353884111222e-06, + "loss": 4.9422, + "step": 78120 + }, + { + "epoch": 1.5894571940104165, + "grad_norm": 19.802444458007812, + "learning_rate": 6.58897493123614e-06, + "loss": 5.0005, + "step": 78125 + }, + { + "epoch": 1.5895589192708335, + "grad_norm": 22.846681594848633, + "learning_rate": 6.588595968208012e-06, + "loss": 5.0117, + "step": 78130 + }, + { + "epoch": 1.58966064453125, + "grad_norm": 16.900224685668945, + "learning_rate": 6.588216995029262e-06, + "loss": 4.9466, + "step": 78135 + }, + { + "epoch": 1.5897623697916665, + "grad_norm": 22.55743980407715, + "learning_rate": 6.587838011702307e-06, + "loss": 4.9945, + "step": 78140 + }, + { + "epoch": 1.5898640950520835, + "grad_norm": 15.183404922485352, + "learning_rate": 6.5874590182295724e-06, + "loss": 5.0189, + "step": 78145 + }, + { + "epoch": 1.5899658203125, + "grad_norm": 17.20765495300293, + "learning_rate": 6.5870800146134786e-06, + "loss": 4.9951, + "step": 78150 + }, + { + "epoch": 1.5900675455729165, + "grad_norm": 20.928382873535156, + "learning_rate": 6.586701000856447e-06, + "loss": 4.9231, + "step": 78155 + }, + { + "epoch": 1.5901692708333335, + "grad_norm": 13.278340339660645, + "learning_rate": 6.586321976960899e-06, + "loss": 4.9398, + "step": 78160 + }, + { + "epoch": 1.59027099609375, + "grad_norm": 19.529481887817383, + "learning_rate": 6.585942942929259e-06, + "loss": 5.1349, + "step": 78165 + }, + { + "epoch": 1.5903727213541665, + "grad_norm": 17.56586456298828, + "learning_rate": 6.585563898763945e-06, + "loss": 4.7781, + "step": 78170 + }, + { + "epoch": 1.5904744466145835, + "grad_norm": 19.033937454223633, + "learning_rate": 6.585184844467382e-06, + "loss": 4.9456, + "step": 78175 + }, + { + "epoch": 1.590576171875, + "grad_norm": 16.54722023010254, + "learning_rate": 6.584805780041991e-06, + "loss": 4.8288, + "step": 78180 + }, + { + "epoch": 1.5906778971354165, + "grad_norm": 24.16669464111328, + "learning_rate": 6.584426705490195e-06, + "loss": 5.0026, + "step": 78185 + }, + { + "epoch": 1.5907796223958335, + "grad_norm": 22.049579620361328, + "learning_rate": 6.584047620814413e-06, + "loss": 4.8534, + "step": 78190 + }, + { + "epoch": 1.59088134765625, + "grad_norm": 17.41502571105957, + "learning_rate": 6.583668526017073e-06, + "loss": 4.9735, + "step": 78195 + }, + { + "epoch": 1.5909830729166665, + "grad_norm": 21.529617309570312, + "learning_rate": 6.583289421100592e-06, + "loss": 4.7112, + "step": 78200 + }, + { + "epoch": 1.5910847981770835, + "grad_norm": 17.26789665222168, + "learning_rate": 6.582910306067394e-06, + "loss": 4.8637, + "step": 78205 + }, + { + "epoch": 1.5911865234375, + "grad_norm": 20.3911075592041, + "learning_rate": 6.582531180919901e-06, + "loss": 4.9982, + "step": 78210 + }, + { + "epoch": 1.5912882486979165, + "grad_norm": 12.936078071594238, + "learning_rate": 6.582152045660537e-06, + "loss": 4.7912, + "step": 78215 + }, + { + "epoch": 1.5913899739583335, + "grad_norm": 16.8448429107666, + "learning_rate": 6.581772900291724e-06, + "loss": 5.2541, + "step": 78220 + }, + { + "epoch": 1.59149169921875, + "grad_norm": 19.612445831298828, + "learning_rate": 6.581393744815884e-06, + "loss": 5.074, + "step": 78225 + }, + { + "epoch": 1.5915934244791665, + "grad_norm": 18.340177536010742, + "learning_rate": 6.581014579235439e-06, + "loss": 4.8858, + "step": 78230 + }, + { + "epoch": 1.5916951497395835, + "grad_norm": 16.69034767150879, + "learning_rate": 6.580635403552814e-06, + "loss": 5.0611, + "step": 78235 + }, + { + "epoch": 1.591796875, + "grad_norm": 18.581798553466797, + "learning_rate": 6.580256217770431e-06, + "loss": 4.9919, + "step": 78240 + }, + { + "epoch": 1.5918986002604165, + "grad_norm": 14.18588924407959, + "learning_rate": 6.57987702189071e-06, + "loss": 4.9274, + "step": 78245 + }, + { + "epoch": 1.5920003255208335, + "grad_norm": 14.918359756469727, + "learning_rate": 6.579497815916079e-06, + "loss": 4.74, + "step": 78250 + }, + { + "epoch": 1.59210205078125, + "grad_norm": 30.97678565979004, + "learning_rate": 6.579118599848957e-06, + "loss": 4.637, + "step": 78255 + }, + { + "epoch": 1.5922037760416665, + "grad_norm": 16.86899757385254, + "learning_rate": 6.578739373691769e-06, + "loss": 4.8601, + "step": 78260 + }, + { + "epoch": 1.5923055013020835, + "grad_norm": 13.345468521118164, + "learning_rate": 6.578360137446937e-06, + "loss": 5.012, + "step": 78265 + }, + { + "epoch": 1.5924072265625, + "grad_norm": 15.457182884216309, + "learning_rate": 6.577980891116884e-06, + "loss": 4.9481, + "step": 78270 + }, + { + "epoch": 1.5925089518229165, + "grad_norm": 15.503886222839355, + "learning_rate": 6.577601634704036e-06, + "loss": 4.9091, + "step": 78275 + }, + { + "epoch": 1.5926106770833335, + "grad_norm": 19.253496170043945, + "learning_rate": 6.577222368210813e-06, + "loss": 4.9176, + "step": 78280 + }, + { + "epoch": 1.59271240234375, + "grad_norm": 24.707260131835938, + "learning_rate": 6.5768430916396395e-06, + "loss": 4.987, + "step": 78285 + }, + { + "epoch": 1.5928141276041665, + "grad_norm": 15.375958442687988, + "learning_rate": 6.5764638049929395e-06, + "loss": 4.8154, + "step": 78290 + }, + { + "epoch": 1.5929158528645835, + "grad_norm": 18.74699592590332, + "learning_rate": 6.576084508273137e-06, + "loss": 4.8448, + "step": 78295 + }, + { + "epoch": 1.593017578125, + "grad_norm": 20.610393524169922, + "learning_rate": 6.575705201482654e-06, + "loss": 5.1134, + "step": 78300 + }, + { + "epoch": 1.5931193033854165, + "grad_norm": 14.822539329528809, + "learning_rate": 6.575325884623915e-06, + "loss": 4.7178, + "step": 78305 + }, + { + "epoch": 1.5932210286458335, + "grad_norm": 22.81618309020996, + "learning_rate": 6.574946557699344e-06, + "loss": 4.9526, + "step": 78310 + }, + { + "epoch": 1.59332275390625, + "grad_norm": 21.120195388793945, + "learning_rate": 6.574567220711365e-06, + "loss": 4.817, + "step": 78315 + }, + { + "epoch": 1.5934244791666665, + "grad_norm": 15.201175689697266, + "learning_rate": 6.5741878736624e-06, + "loss": 5.1022, + "step": 78320 + }, + { + "epoch": 1.5935262044270835, + "grad_norm": 17.431739807128906, + "learning_rate": 6.573808516554876e-06, + "loss": 4.6943, + "step": 78325 + }, + { + "epoch": 1.5936279296875, + "grad_norm": 14.245755195617676, + "learning_rate": 6.573429149391214e-06, + "loss": 5.1952, + "step": 78330 + }, + { + "epoch": 1.5937296549479165, + "grad_norm": 18.867773056030273, + "learning_rate": 6.573049772173838e-06, + "loss": 5.2725, + "step": 78335 + }, + { + "epoch": 1.5938313802083335, + "grad_norm": 20.308069229125977, + "learning_rate": 6.572670384905176e-06, + "loss": 4.9908, + "step": 78340 + }, + { + "epoch": 1.59393310546875, + "grad_norm": 18.983346939086914, + "learning_rate": 6.572290987587647e-06, + "loss": 4.6147, + "step": 78345 + }, + { + "epoch": 1.5940348307291665, + "grad_norm": 20.4641056060791, + "learning_rate": 6.571911580223679e-06, + "loss": 4.8744, + "step": 78350 + }, + { + "epoch": 1.5941365559895835, + "grad_norm": 17.22130012512207, + "learning_rate": 6.5715321628156945e-06, + "loss": 4.85, + "step": 78355 + }, + { + "epoch": 1.59423828125, + "grad_norm": 15.905156135559082, + "learning_rate": 6.571152735366119e-06, + "loss": 4.8098, + "step": 78360 + }, + { + "epoch": 1.5943400065104165, + "grad_norm": 34.11623764038086, + "learning_rate": 6.570773297877376e-06, + "loss": 4.9023, + "step": 78365 + }, + { + "epoch": 1.5944417317708335, + "grad_norm": 14.793914794921875, + "learning_rate": 6.570393850351891e-06, + "loss": 4.8038, + "step": 78370 + }, + { + "epoch": 1.59454345703125, + "grad_norm": 20.110225677490234, + "learning_rate": 6.570014392792085e-06, + "loss": 4.8665, + "step": 78375 + }, + { + "epoch": 1.5946451822916665, + "grad_norm": 22.583580017089844, + "learning_rate": 6.569634925200389e-06, + "loss": 4.9123, + "step": 78380 + }, + { + "epoch": 1.5947469075520835, + "grad_norm": 17.973337173461914, + "learning_rate": 6.569255447579221e-06, + "loss": 5.0502, + "step": 78385 + }, + { + "epoch": 1.5948486328125, + "grad_norm": 19.664628982543945, + "learning_rate": 6.568875959931012e-06, + "loss": 4.928, + "step": 78390 + }, + { + "epoch": 1.5949503580729165, + "grad_norm": 13.279293060302734, + "learning_rate": 6.5684964622581815e-06, + "loss": 4.8394, + "step": 78395 + }, + { + "epoch": 1.5950520833333335, + "grad_norm": 22.132915496826172, + "learning_rate": 6.568116954563156e-06, + "loss": 5.154, + "step": 78400 + }, + { + "epoch": 1.59515380859375, + "grad_norm": 23.40898895263672, + "learning_rate": 6.5677374368483625e-06, + "loss": 4.92, + "step": 78405 + }, + { + "epoch": 1.5952555338541665, + "grad_norm": 22.317665100097656, + "learning_rate": 6.567357909116223e-06, + "loss": 4.7366, + "step": 78410 + }, + { + "epoch": 1.5953572591145835, + "grad_norm": 18.04042625427246, + "learning_rate": 6.566978371369166e-06, + "loss": 4.9023, + "step": 78415 + }, + { + "epoch": 1.595458984375, + "grad_norm": 26.206586837768555, + "learning_rate": 6.5665988236096134e-06, + "loss": 4.8408, + "step": 78420 + }, + { + "epoch": 1.5955607096354165, + "grad_norm": 14.684491157531738, + "learning_rate": 6.566219265839991e-06, + "loss": 4.9279, + "step": 78425 + }, + { + "epoch": 1.5956624348958335, + "grad_norm": 19.297941207885742, + "learning_rate": 6.565839698062726e-06, + "loss": 5.2554, + "step": 78430 + }, + { + "epoch": 1.59576416015625, + "grad_norm": 17.182388305664062, + "learning_rate": 6.565460120280241e-06, + "loss": 5.0758, + "step": 78435 + }, + { + "epoch": 1.5958658854166665, + "grad_norm": 27.723657608032227, + "learning_rate": 6.565080532494963e-06, + "loss": 5.1039, + "step": 78440 + }, + { + "epoch": 1.5959676106770835, + "grad_norm": 13.49740219116211, + "learning_rate": 6.564700934709318e-06, + "loss": 4.8037, + "step": 78445 + }, + { + "epoch": 1.5960693359375, + "grad_norm": 18.40250587463379, + "learning_rate": 6.564321326925731e-06, + "loss": 5.0743, + "step": 78450 + }, + { + "epoch": 1.5961710611979165, + "grad_norm": 17.59928321838379, + "learning_rate": 6.563941709146626e-06, + "loss": 4.8672, + "step": 78455 + }, + { + "epoch": 1.5962727864583335, + "grad_norm": 16.112512588500977, + "learning_rate": 6.563562081374432e-06, + "loss": 4.7987, + "step": 78460 + }, + { + "epoch": 1.59637451171875, + "grad_norm": 24.68279457092285, + "learning_rate": 6.5631824436115706e-06, + "loss": 5.0782, + "step": 78465 + }, + { + "epoch": 1.5964762369791665, + "grad_norm": 18.683486938476562, + "learning_rate": 6.562802795860471e-06, + "loss": 4.7709, + "step": 78470 + }, + { + "epoch": 1.5965779622395835, + "grad_norm": 15.589475631713867, + "learning_rate": 6.562423138123557e-06, + "loss": 4.984, + "step": 78475 + }, + { + "epoch": 1.5966796875, + "grad_norm": 15.764464378356934, + "learning_rate": 6.562043470403256e-06, + "loss": 5.1528, + "step": 78480 + }, + { + "epoch": 1.5967814127604165, + "grad_norm": 15.510300636291504, + "learning_rate": 6.561663792701991e-06, + "loss": 4.7688, + "step": 78485 + }, + { + "epoch": 1.5968831380208335, + "grad_norm": 15.515043258666992, + "learning_rate": 6.561284105022192e-06, + "loss": 5.1185, + "step": 78490 + }, + { + "epoch": 1.59698486328125, + "grad_norm": 17.304292678833008, + "learning_rate": 6.560904407366282e-06, + "loss": 4.8075, + "step": 78495 + }, + { + "epoch": 1.5970865885416665, + "grad_norm": 18.010997772216797, + "learning_rate": 6.560524699736689e-06, + "loss": 4.9809, + "step": 78500 + }, + { + "epoch": 1.5971883138020835, + "grad_norm": 14.831347465515137, + "learning_rate": 6.560144982135839e-06, + "loss": 4.8961, + "step": 78505 + }, + { + "epoch": 1.5972900390625, + "grad_norm": 22.34608268737793, + "learning_rate": 6.5597652545661575e-06, + "loss": 4.7476, + "step": 78510 + }, + { + "epoch": 1.5973917643229165, + "grad_norm": 20.93677520751953, + "learning_rate": 6.55938551703007e-06, + "loss": 4.881, + "step": 78515 + }, + { + "epoch": 1.5974934895833335, + "grad_norm": 15.823053359985352, + "learning_rate": 6.559005769530005e-06, + "loss": 5.0445, + "step": 78520 + }, + { + "epoch": 1.59759521484375, + "grad_norm": 20.167692184448242, + "learning_rate": 6.558626012068389e-06, + "loss": 4.8135, + "step": 78525 + }, + { + "epoch": 1.5976969401041665, + "grad_norm": 22.717838287353516, + "learning_rate": 6.558246244647645e-06, + "loss": 5.2454, + "step": 78530 + }, + { + "epoch": 1.5977986653645835, + "grad_norm": 23.42510223388672, + "learning_rate": 6.557866467270204e-06, + "loss": 4.8026, + "step": 78535 + }, + { + "epoch": 1.597900390625, + "grad_norm": 12.940948486328125, + "learning_rate": 6.5574866799384904e-06, + "loss": 4.8861, + "step": 78540 + }, + { + "epoch": 1.5980021158854165, + "grad_norm": 21.568147659301758, + "learning_rate": 6.557106882654932e-06, + "loss": 4.8405, + "step": 78545 + }, + { + "epoch": 1.5981038411458335, + "grad_norm": 24.954811096191406, + "learning_rate": 6.556727075421954e-06, + "loss": 4.9794, + "step": 78550 + }, + { + "epoch": 1.59820556640625, + "grad_norm": 14.53658676147461, + "learning_rate": 6.556347258241984e-06, + "loss": 4.5782, + "step": 78555 + }, + { + "epoch": 1.5983072916666665, + "grad_norm": 12.566308975219727, + "learning_rate": 6.555967431117449e-06, + "loss": 4.8394, + "step": 78560 + }, + { + "epoch": 1.5984090169270835, + "grad_norm": 23.00605010986328, + "learning_rate": 6.555587594050777e-06, + "loss": 5.2722, + "step": 78565 + }, + { + "epoch": 1.5985107421875, + "grad_norm": 19.282623291015625, + "learning_rate": 6.5552077470443945e-06, + "loss": 4.9122, + "step": 78570 + }, + { + "epoch": 1.5986124674479165, + "grad_norm": 21.981775283813477, + "learning_rate": 6.554827890100727e-06, + "loss": 4.6497, + "step": 78575 + }, + { + "epoch": 1.5987141927083335, + "grad_norm": 24.33081817626953, + "learning_rate": 6.554448023222203e-06, + "loss": 4.889, + "step": 78580 + }, + { + "epoch": 1.59881591796875, + "grad_norm": 19.502710342407227, + "learning_rate": 6.55406814641125e-06, + "loss": 4.884, + "step": 78585 + }, + { + "epoch": 1.5989176432291665, + "grad_norm": 14.539149284362793, + "learning_rate": 6.553688259670295e-06, + "loss": 4.8645, + "step": 78590 + }, + { + "epoch": 1.5990193684895835, + "grad_norm": 15.702723503112793, + "learning_rate": 6.553308363001765e-06, + "loss": 4.922, + "step": 78595 + }, + { + "epoch": 1.59912109375, + "grad_norm": 20.544395446777344, + "learning_rate": 6.552928456408087e-06, + "loss": 4.7559, + "step": 78600 + }, + { + "epoch": 1.5992228190104165, + "grad_norm": 17.521793365478516, + "learning_rate": 6.55254853989169e-06, + "loss": 5.0857, + "step": 78605 + }, + { + "epoch": 1.5993245442708335, + "grad_norm": 15.384156227111816, + "learning_rate": 6.552168613455e-06, + "loss": 4.983, + "step": 78610 + }, + { + "epoch": 1.59942626953125, + "grad_norm": 16.58633041381836, + "learning_rate": 6.551788677100445e-06, + "loss": 4.856, + "step": 78615 + }, + { + "epoch": 1.5995279947916665, + "grad_norm": 15.862202644348145, + "learning_rate": 6.551408730830454e-06, + "loss": 4.7428, + "step": 78620 + }, + { + "epoch": 1.5996297200520835, + "grad_norm": 12.851487159729004, + "learning_rate": 6.551028774647454e-06, + "loss": 4.8463, + "step": 78625 + }, + { + "epoch": 1.5997314453125, + "grad_norm": 18.735374450683594, + "learning_rate": 6.550648808553871e-06, + "loss": 5.1153, + "step": 78630 + }, + { + "epoch": 1.5998331705729165, + "grad_norm": 18.402936935424805, + "learning_rate": 6.550268832552136e-06, + "loss": 4.7949, + "step": 78635 + }, + { + "epoch": 1.5999348958333335, + "grad_norm": 14.25478458404541, + "learning_rate": 6.549888846644674e-06, + "loss": 5.1413, + "step": 78640 + }, + { + "epoch": 1.60003662109375, + "grad_norm": 19.14142608642578, + "learning_rate": 6.549508850833914e-06, + "loss": 5.335, + "step": 78645 + }, + { + "epoch": 1.6001383463541665, + "grad_norm": 14.744706153869629, + "learning_rate": 6.549128845122285e-06, + "loss": 4.7027, + "step": 78650 + }, + { + "epoch": 1.6002400716145835, + "grad_norm": 18.165712356567383, + "learning_rate": 6.548748829512215e-06, + "loss": 4.9782, + "step": 78655 + }, + { + "epoch": 1.600341796875, + "grad_norm": 18.432239532470703, + "learning_rate": 6.5483688040061314e-06, + "loss": 4.9428, + "step": 78660 + }, + { + "epoch": 1.6004435221354165, + "grad_norm": 15.875091552734375, + "learning_rate": 6.5479887686064635e-06, + "loss": 4.9992, + "step": 78665 + }, + { + "epoch": 1.6005452473958335, + "grad_norm": 15.941271781921387, + "learning_rate": 6.547608723315636e-06, + "loss": 4.9498, + "step": 78670 + }, + { + "epoch": 1.60064697265625, + "grad_norm": 14.965075492858887, + "learning_rate": 6.5472286681360835e-06, + "loss": 4.607, + "step": 78675 + }, + { + "epoch": 1.6007486979166665, + "grad_norm": 18.11119270324707, + "learning_rate": 6.546848603070228e-06, + "loss": 4.9248, + "step": 78680 + }, + { + "epoch": 1.6008504231770835, + "grad_norm": 19.702516555786133, + "learning_rate": 6.546468528120502e-06, + "loss": 4.9103, + "step": 78685 + }, + { + "epoch": 1.6009521484375, + "grad_norm": 13.2982759475708, + "learning_rate": 6.5460884432893325e-06, + "loss": 4.7809, + "step": 78690 + }, + { + "epoch": 1.6010538736979165, + "grad_norm": 15.202537536621094, + "learning_rate": 6.5457083485791495e-06, + "loss": 4.6992, + "step": 78695 + }, + { + "epoch": 1.6011555989583335, + "grad_norm": 23.415977478027344, + "learning_rate": 6.54532824399238e-06, + "loss": 4.9351, + "step": 78700 + }, + { + "epoch": 1.60125732421875, + "grad_norm": 16.174081802368164, + "learning_rate": 6.544948129531454e-06, + "loss": 4.9373, + "step": 78705 + }, + { + "epoch": 1.6013590494791665, + "grad_norm": 27.67054557800293, + "learning_rate": 6.5445680051988e-06, + "loss": 5.0806, + "step": 78710 + }, + { + "epoch": 1.6014607747395835, + "grad_norm": 19.179155349731445, + "learning_rate": 6.544187870996848e-06, + "loss": 4.7671, + "step": 78715 + }, + { + "epoch": 1.6015625, + "grad_norm": 13.936979293823242, + "learning_rate": 6.543807726928023e-06, + "loss": 4.8736, + "step": 78720 + }, + { + "epoch": 1.6016642252604165, + "grad_norm": 16.07945442199707, + "learning_rate": 6.543427572994758e-06, + "loss": 4.9828, + "step": 78725 + }, + { + "epoch": 1.6017659505208335, + "grad_norm": 18.1291446685791, + "learning_rate": 6.543047409199482e-06, + "loss": 5.1453, + "step": 78730 + }, + { + "epoch": 1.60186767578125, + "grad_norm": 14.90963077545166, + "learning_rate": 6.542667235544621e-06, + "loss": 4.7862, + "step": 78735 + }, + { + "epoch": 1.6019694010416665, + "grad_norm": 15.241128921508789, + "learning_rate": 6.5422870520326074e-06, + "loss": 4.5929, + "step": 78740 + }, + { + "epoch": 1.6020711263020835, + "grad_norm": 16.166996002197266, + "learning_rate": 6.54190685866587e-06, + "loss": 4.8097, + "step": 78745 + }, + { + "epoch": 1.6021728515625, + "grad_norm": 23.0306453704834, + "learning_rate": 6.541526655446835e-06, + "loss": 4.7353, + "step": 78750 + }, + { + "epoch": 1.6022745768229165, + "grad_norm": 22.562515258789062, + "learning_rate": 6.541146442377935e-06, + "loss": 4.92, + "step": 78755 + }, + { + "epoch": 1.6023763020833335, + "grad_norm": 18.584991455078125, + "learning_rate": 6.540766219461597e-06, + "loss": 4.8469, + "step": 78760 + }, + { + "epoch": 1.60247802734375, + "grad_norm": 20.37003517150879, + "learning_rate": 6.540385986700254e-06, + "loss": 5.1583, + "step": 78765 + }, + { + "epoch": 1.6025797526041665, + "grad_norm": 15.004366874694824, + "learning_rate": 6.540005744096334e-06, + "loss": 4.9904, + "step": 78770 + }, + { + "epoch": 1.6026814778645835, + "grad_norm": 16.26982879638672, + "learning_rate": 6.539625491652264e-06, + "loss": 4.8059, + "step": 78775 + }, + { + "epoch": 1.602783203125, + "grad_norm": 15.6987943649292, + "learning_rate": 6.539245229370477e-06, + "loss": 5.0586, + "step": 78780 + }, + { + "epoch": 1.6028849283854165, + "grad_norm": 17.146127700805664, + "learning_rate": 6.538864957253401e-06, + "loss": 4.846, + "step": 78785 + }, + { + "epoch": 1.6029866536458335, + "grad_norm": 17.054243087768555, + "learning_rate": 6.538484675303467e-06, + "loss": 4.9589, + "step": 78790 + }, + { + "epoch": 1.60308837890625, + "grad_norm": 15.342422485351562, + "learning_rate": 6.538104383523105e-06, + "loss": 4.7372, + "step": 78795 + }, + { + "epoch": 1.6031901041666665, + "grad_norm": 17.84432029724121, + "learning_rate": 6.537724081914743e-06, + "loss": 4.7297, + "step": 78800 + }, + { + "epoch": 1.6032918294270835, + "grad_norm": 16.211524963378906, + "learning_rate": 6.5373437704808126e-06, + "loss": 4.8931, + "step": 78805 + }, + { + "epoch": 1.6033935546875, + "grad_norm": 18.046953201293945, + "learning_rate": 6.536963449223745e-06, + "loss": 4.875, + "step": 78810 + }, + { + "epoch": 1.6034952799479165, + "grad_norm": 14.824358940124512, + "learning_rate": 6.536583118145966e-06, + "loss": 5.021, + "step": 78815 + }, + { + "epoch": 1.6035970052083335, + "grad_norm": 22.884103775024414, + "learning_rate": 6.536202777249911e-06, + "loss": 4.9507, + "step": 78820 + }, + { + "epoch": 1.60369873046875, + "grad_norm": 23.815017700195312, + "learning_rate": 6.535822426538006e-06, + "loss": 5.1336, + "step": 78825 + }, + { + "epoch": 1.6038004557291665, + "grad_norm": 20.226451873779297, + "learning_rate": 6.535442066012683e-06, + "loss": 4.9542, + "step": 78830 + }, + { + "epoch": 1.6039021809895835, + "grad_norm": 13.827350616455078, + "learning_rate": 6.535061695676374e-06, + "loss": 5.2419, + "step": 78835 + }, + { + "epoch": 1.60400390625, + "grad_norm": 16.659509658813477, + "learning_rate": 6.5346813155315085e-06, + "loss": 4.8184, + "step": 78840 + }, + { + "epoch": 1.6041056315104165, + "grad_norm": 25.0316104888916, + "learning_rate": 6.534300925580516e-06, + "loss": 5.0024, + "step": 78845 + }, + { + "epoch": 1.6042073567708335, + "grad_norm": 18.813209533691406, + "learning_rate": 6.533920525825827e-06, + "loss": 5.0497, + "step": 78850 + }, + { + "epoch": 1.60430908203125, + "grad_norm": 20.777755737304688, + "learning_rate": 6.5335401162698745e-06, + "loss": 4.9671, + "step": 78855 + }, + { + "epoch": 1.6044108072916665, + "grad_norm": 19.73084831237793, + "learning_rate": 6.533159696915086e-06, + "loss": 4.7517, + "step": 78860 + }, + { + "epoch": 1.6045125325520835, + "grad_norm": 17.1376895904541, + "learning_rate": 6.532779267763893e-06, + "loss": 4.7572, + "step": 78865 + }, + { + "epoch": 1.6046142578125, + "grad_norm": 15.971083641052246, + "learning_rate": 6.532398828818729e-06, + "loss": 5.0983, + "step": 78870 + }, + { + "epoch": 1.6047159830729165, + "grad_norm": 15.490754127502441, + "learning_rate": 6.532018380082021e-06, + "loss": 5.09, + "step": 78875 + }, + { + "epoch": 1.6048177083333335, + "grad_norm": 27.108518600463867, + "learning_rate": 6.531637921556204e-06, + "loss": 4.6021, + "step": 78880 + }, + { + "epoch": 1.60491943359375, + "grad_norm": 14.923535346984863, + "learning_rate": 6.531257453243707e-06, + "loss": 4.9215, + "step": 78885 + }, + { + "epoch": 1.6050211588541665, + "grad_norm": 24.544879913330078, + "learning_rate": 6.530876975146958e-06, + "loss": 4.9031, + "step": 78890 + }, + { + "epoch": 1.6051228841145835, + "grad_norm": 15.812681198120117, + "learning_rate": 6.530496487268394e-06, + "loss": 5.2437, + "step": 78895 + }, + { + "epoch": 1.605224609375, + "grad_norm": 17.303333282470703, + "learning_rate": 6.530115989610441e-06, + "loss": 4.8292, + "step": 78900 + }, + { + "epoch": 1.6053263346354165, + "grad_norm": 17.21516227722168, + "learning_rate": 6.529735482175534e-06, + "loss": 4.9432, + "step": 78905 + }, + { + "epoch": 1.6054280598958335, + "grad_norm": 24.39499855041504, + "learning_rate": 6.529354964966104e-06, + "loss": 4.91, + "step": 78910 + }, + { + "epoch": 1.60552978515625, + "grad_norm": 18.201467514038086, + "learning_rate": 6.528974437984578e-06, + "loss": 5.0223, + "step": 78915 + }, + { + "epoch": 1.6056315104166665, + "grad_norm": 16.38945770263672, + "learning_rate": 6.528593901233393e-06, + "loss": 4.7463, + "step": 78920 + }, + { + "epoch": 1.6057332356770835, + "grad_norm": 19.142309188842773, + "learning_rate": 6.528213354714978e-06, + "loss": 4.6994, + "step": 78925 + }, + { + "epoch": 1.6058349609375, + "grad_norm": 16.681699752807617, + "learning_rate": 6.527832798431765e-06, + "loss": 4.7079, + "step": 78930 + }, + { + "epoch": 1.6059366861979165, + "grad_norm": 14.51612377166748, + "learning_rate": 6.527452232386185e-06, + "loss": 4.7709, + "step": 78935 + }, + { + "epoch": 1.6060384114583335, + "grad_norm": 18.612579345703125, + "learning_rate": 6.52707165658067e-06, + "loss": 4.8077, + "step": 78940 + }, + { + "epoch": 1.60614013671875, + "grad_norm": 14.115731239318848, + "learning_rate": 6.526691071017652e-06, + "loss": 4.8419, + "step": 78945 + }, + { + "epoch": 1.6062418619791665, + "grad_norm": 13.693761825561523, + "learning_rate": 6.526310475699563e-06, + "loss": 4.6959, + "step": 78950 + }, + { + "epoch": 1.6063435872395835, + "grad_norm": 13.872641563415527, + "learning_rate": 6.525929870628833e-06, + "loss": 4.9496, + "step": 78955 + }, + { + "epoch": 1.6064453125, + "grad_norm": 16.73817253112793, + "learning_rate": 6.525549255807898e-06, + "loss": 4.7265, + "step": 78960 + }, + { + "epoch": 1.6065470377604165, + "grad_norm": 15.111185073852539, + "learning_rate": 6.525168631239186e-06, + "loss": 5.0147, + "step": 78965 + }, + { + "epoch": 1.6066487630208335, + "grad_norm": 16.947818756103516, + "learning_rate": 6.52478799692513e-06, + "loss": 4.8356, + "step": 78970 + }, + { + "epoch": 1.60675048828125, + "grad_norm": 18.917423248291016, + "learning_rate": 6.524407352868164e-06, + "loss": 4.8155, + "step": 78975 + }, + { + "epoch": 1.6068522135416665, + "grad_norm": 23.392654418945312, + "learning_rate": 6.524026699070718e-06, + "loss": 4.7995, + "step": 78980 + }, + { + "epoch": 1.6069539388020835, + "grad_norm": 14.516361236572266, + "learning_rate": 6.523646035535226e-06, + "loss": 4.8528, + "step": 78985 + }, + { + "epoch": 1.6070556640625, + "grad_norm": 22.17662239074707, + "learning_rate": 6.5232653622641195e-06, + "loss": 5.0501, + "step": 78990 + }, + { + "epoch": 1.6071573893229165, + "grad_norm": 18.816251754760742, + "learning_rate": 6.52288467925983e-06, + "loss": 4.9195, + "step": 78995 + }, + { + "epoch": 1.6072591145833335, + "grad_norm": 16.20366096496582, + "learning_rate": 6.522503986524792e-06, + "loss": 4.8587, + "step": 79000 + }, + { + "epoch": 1.60736083984375, + "grad_norm": 24.663049697875977, + "learning_rate": 6.522123284061436e-06, + "loss": 4.8299, + "step": 79005 + }, + { + "epoch": 1.6074625651041665, + "grad_norm": 17.724376678466797, + "learning_rate": 6.521742571872196e-06, + "loss": 4.8233, + "step": 79010 + }, + { + "epoch": 1.6075642903645835, + "grad_norm": 18.5225830078125, + "learning_rate": 6.5213618499595045e-06, + "loss": 4.862, + "step": 79015 + }, + { + "epoch": 1.607666015625, + "grad_norm": 16.199655532836914, + "learning_rate": 6.520981118325792e-06, + "loss": 4.8224, + "step": 79020 + }, + { + "epoch": 1.6077677408854165, + "grad_norm": 18.63947105407715, + "learning_rate": 6.520600376973494e-06, + "loss": 4.7603, + "step": 79025 + }, + { + "epoch": 1.6078694661458335, + "grad_norm": 14.935831069946289, + "learning_rate": 6.520219625905042e-06, + "loss": 4.8188, + "step": 79030 + }, + { + "epoch": 1.60797119140625, + "grad_norm": 16.949954986572266, + "learning_rate": 6.519838865122869e-06, + "loss": 4.8822, + "step": 79035 + }, + { + "epoch": 1.6080729166666665, + "grad_norm": 16.574289321899414, + "learning_rate": 6.51945809462941e-06, + "loss": 4.7791, + "step": 79040 + }, + { + "epoch": 1.6081746419270835, + "grad_norm": 14.335553169250488, + "learning_rate": 6.519077314427094e-06, + "loss": 4.9875, + "step": 79045 + }, + { + "epoch": 1.6082763671875, + "grad_norm": 14.086782455444336, + "learning_rate": 6.518696524518357e-06, + "loss": 4.8595, + "step": 79050 + }, + { + "epoch": 1.6083780924479165, + "grad_norm": 16.29545021057129, + "learning_rate": 6.5183157249056305e-06, + "loss": 4.7123, + "step": 79055 + }, + { + "epoch": 1.6084798177083335, + "grad_norm": 18.84982681274414, + "learning_rate": 6.517934915591349e-06, + "loss": 4.8784, + "step": 79060 + }, + { + "epoch": 1.60858154296875, + "grad_norm": 17.20783233642578, + "learning_rate": 6.517554096577946e-06, + "loss": 4.7082, + "step": 79065 + }, + { + "epoch": 1.6086832682291665, + "grad_norm": 14.966687202453613, + "learning_rate": 6.517173267867852e-06, + "loss": 5.4743, + "step": 79070 + }, + { + "epoch": 1.6087849934895835, + "grad_norm": 18.076112747192383, + "learning_rate": 6.516792429463505e-06, + "loss": 4.7544, + "step": 79075 + }, + { + "epoch": 1.60888671875, + "grad_norm": 19.698741912841797, + "learning_rate": 6.516411581367335e-06, + "loss": 5.025, + "step": 79080 + }, + { + "epoch": 1.6089884440104165, + "grad_norm": 17.87499237060547, + "learning_rate": 6.516030723581776e-06, + "loss": 4.8865, + "step": 79085 + }, + { + "epoch": 1.6090901692708335, + "grad_norm": 20.698421478271484, + "learning_rate": 6.515649856109262e-06, + "loss": 4.6898, + "step": 79090 + }, + { + "epoch": 1.60919189453125, + "grad_norm": 16.20440673828125, + "learning_rate": 6.515268978952226e-06, + "loss": 4.8969, + "step": 79095 + }, + { + "epoch": 1.6092936197916665, + "grad_norm": 18.247121810913086, + "learning_rate": 6.514888092113102e-06, + "loss": 5.4993, + "step": 79100 + }, + { + "epoch": 1.6093953450520835, + "grad_norm": 17.36793327331543, + "learning_rate": 6.514507195594325e-06, + "loss": 4.9217, + "step": 79105 + }, + { + "epoch": 1.6094970703125, + "grad_norm": 17.12091064453125, + "learning_rate": 6.514126289398327e-06, + "loss": 5.1787, + "step": 79110 + }, + { + "epoch": 1.6095987955729165, + "grad_norm": 17.236125946044922, + "learning_rate": 6.513745373527543e-06, + "loss": 4.9962, + "step": 79115 + }, + { + "epoch": 1.6097005208333335, + "grad_norm": 15.600191116333008, + "learning_rate": 6.513364447984409e-06, + "loss": 4.6917, + "step": 79120 + }, + { + "epoch": 1.60980224609375, + "grad_norm": 20.868947982788086, + "learning_rate": 6.512983512771353e-06, + "loss": 4.8994, + "step": 79125 + }, + { + "epoch": 1.6099039713541665, + "grad_norm": 28.816566467285156, + "learning_rate": 6.512602567890814e-06, + "loss": 4.7514, + "step": 79130 + }, + { + "epoch": 1.6100056966145835, + "grad_norm": 18.705644607543945, + "learning_rate": 6.512221613345224e-06, + "loss": 4.8389, + "step": 79135 + }, + { + "epoch": 1.610107421875, + "grad_norm": 23.698768615722656, + "learning_rate": 6.51184064913702e-06, + "loss": 5.1594, + "step": 79140 + }, + { + "epoch": 1.6102091471354165, + "grad_norm": 17.867101669311523, + "learning_rate": 6.511459675268633e-06, + "loss": 5.624, + "step": 79145 + }, + { + "epoch": 1.6103108723958335, + "grad_norm": 13.60685920715332, + "learning_rate": 6.511078691742498e-06, + "loss": 5.0503, + "step": 79150 + }, + { + "epoch": 1.61041259765625, + "grad_norm": 18.975433349609375, + "learning_rate": 6.51069769856105e-06, + "loss": 4.7789, + "step": 79155 + }, + { + "epoch": 1.6105143229166665, + "grad_norm": 21.76375389099121, + "learning_rate": 6.510316695726724e-06, + "loss": 4.6104, + "step": 79160 + }, + { + "epoch": 1.6106160481770835, + "grad_norm": 23.149520874023438, + "learning_rate": 6.5099356832419525e-06, + "loss": 5.075, + "step": 79165 + }, + { + "epoch": 1.6107177734375, + "grad_norm": 16.792810440063477, + "learning_rate": 6.509554661109173e-06, + "loss": 4.6242, + "step": 79170 + }, + { + "epoch": 1.6108194986979165, + "grad_norm": 23.936805725097656, + "learning_rate": 6.509173629330816e-06, + "loss": 5.1413, + "step": 79175 + }, + { + "epoch": 1.6109212239583335, + "grad_norm": 23.532001495361328, + "learning_rate": 6.508792587909321e-06, + "loss": 5.3006, + "step": 79180 + }, + { + "epoch": 1.61102294921875, + "grad_norm": 16.99368667602539, + "learning_rate": 6.50841153684712e-06, + "loss": 4.8421, + "step": 79185 + }, + { + "epoch": 1.6111246744791665, + "grad_norm": 17.747957229614258, + "learning_rate": 6.508030476146647e-06, + "loss": 4.9842, + "step": 79190 + }, + { + "epoch": 1.6112263997395835, + "grad_norm": 18.624759674072266, + "learning_rate": 6.507649405810339e-06, + "loss": 4.9178, + "step": 79195 + }, + { + "epoch": 1.611328125, + "grad_norm": 16.405052185058594, + "learning_rate": 6.507268325840627e-06, + "loss": 5.1324, + "step": 79200 + }, + { + "epoch": 1.6114298502604165, + "grad_norm": 16.553945541381836, + "learning_rate": 6.506887236239953e-06, + "loss": 5.0863, + "step": 79205 + }, + { + "epoch": 1.6115315755208335, + "grad_norm": 15.826672554016113, + "learning_rate": 6.506506137010746e-06, + "loss": 4.9171, + "step": 79210 + }, + { + "epoch": 1.61163330078125, + "grad_norm": 16.863977432250977, + "learning_rate": 6.506125028155443e-06, + "loss": 4.7547, + "step": 79215 + }, + { + "epoch": 1.6117350260416665, + "grad_norm": 15.724366188049316, + "learning_rate": 6.50574390967648e-06, + "loss": 4.7382, + "step": 79220 + }, + { + "epoch": 1.6118367513020835, + "grad_norm": 14.976256370544434, + "learning_rate": 6.50536278157629e-06, + "loss": 4.9277, + "step": 79225 + }, + { + "epoch": 1.6119384765625, + "grad_norm": 21.218307495117188, + "learning_rate": 6.504981643857311e-06, + "loss": 4.8587, + "step": 79230 + }, + { + "epoch": 1.6120402018229165, + "grad_norm": 17.363540649414062, + "learning_rate": 6.504600496521975e-06, + "loss": 4.9177, + "step": 79235 + }, + { + "epoch": 1.6121419270833335, + "grad_norm": 23.531147003173828, + "learning_rate": 6.504219339572721e-06, + "loss": 4.9793, + "step": 79240 + }, + { + "epoch": 1.61224365234375, + "grad_norm": 15.550033569335938, + "learning_rate": 6.503838173011983e-06, + "loss": 4.7901, + "step": 79245 + }, + { + "epoch": 1.6123453776041665, + "grad_norm": 16.390005111694336, + "learning_rate": 6.503456996842197e-06, + "loss": 4.7968, + "step": 79250 + }, + { + "epoch": 1.6124471028645835, + "grad_norm": 19.145950317382812, + "learning_rate": 6.503075811065797e-06, + "loss": 4.8983, + "step": 79255 + }, + { + "epoch": 1.612548828125, + "grad_norm": 17.06300163269043, + "learning_rate": 6.50269461568522e-06, + "loss": 4.5919, + "step": 79260 + }, + { + "epoch": 1.6126505533854165, + "grad_norm": 15.658186912536621, + "learning_rate": 6.502313410702901e-06, + "loss": 5.0188, + "step": 79265 + }, + { + "epoch": 1.6127522786458335, + "grad_norm": 22.365421295166016, + "learning_rate": 6.501932196121276e-06, + "loss": 4.8102, + "step": 79270 + }, + { + "epoch": 1.61285400390625, + "grad_norm": 16.89535140991211, + "learning_rate": 6.501550971942783e-06, + "loss": 4.8623, + "step": 79275 + }, + { + "epoch": 1.6129557291666665, + "grad_norm": 16.256471633911133, + "learning_rate": 6.501169738169854e-06, + "loss": 4.7724, + "step": 79280 + }, + { + "epoch": 1.6130574544270835, + "grad_norm": 22.008317947387695, + "learning_rate": 6.500788494804928e-06, + "loss": 4.9456, + "step": 79285 + }, + { + "epoch": 1.6131591796875, + "grad_norm": 15.747754096984863, + "learning_rate": 6.50040724185044e-06, + "loss": 4.7601, + "step": 79290 + }, + { + "epoch": 1.6132609049479165, + "grad_norm": 15.846940040588379, + "learning_rate": 6.500025979308825e-06, + "loss": 4.8881, + "step": 79295 + }, + { + "epoch": 1.6133626302083335, + "grad_norm": 17.811141967773438, + "learning_rate": 6.49964470718252e-06, + "loss": 4.6681, + "step": 79300 + }, + { + "epoch": 1.61346435546875, + "grad_norm": 22.774883270263672, + "learning_rate": 6.499263425473963e-06, + "loss": 4.9829, + "step": 79305 + }, + { + "epoch": 1.6135660807291665, + "grad_norm": 18.413482666015625, + "learning_rate": 6.498882134185587e-06, + "loss": 4.8749, + "step": 79310 + }, + { + "epoch": 1.6136678059895835, + "grad_norm": 12.502001762390137, + "learning_rate": 6.49850083331983e-06, + "loss": 5.0, + "step": 79315 + }, + { + "epoch": 1.61376953125, + "grad_norm": 13.33310317993164, + "learning_rate": 6.4981195228791295e-06, + "loss": 4.8532, + "step": 79320 + }, + { + "epoch": 1.6138712565104165, + "grad_norm": 19.138072967529297, + "learning_rate": 6.497738202865921e-06, + "loss": 4.7978, + "step": 79325 + }, + { + "epoch": 1.6139729817708335, + "grad_norm": 15.042864799499512, + "learning_rate": 6.49735687328264e-06, + "loss": 5.0057, + "step": 79330 + }, + { + "epoch": 1.61407470703125, + "grad_norm": 14.65888786315918, + "learning_rate": 6.496975534131724e-06, + "loss": 5.1704, + "step": 79335 + }, + { + "epoch": 1.6141764322916665, + "grad_norm": 14.810722351074219, + "learning_rate": 6.4965941854156104e-06, + "loss": 4.8609, + "step": 79340 + }, + { + "epoch": 1.6142781575520835, + "grad_norm": 15.168237686157227, + "learning_rate": 6.496212827136733e-06, + "loss": 5.1015, + "step": 79345 + }, + { + "epoch": 1.6143798828125, + "grad_norm": 13.966534614562988, + "learning_rate": 6.495831459297533e-06, + "loss": 4.8588, + "step": 79350 + }, + { + "epoch": 1.6144816080729165, + "grad_norm": 20.66706085205078, + "learning_rate": 6.495450081900443e-06, + "loss": 4.8997, + "step": 79355 + }, + { + "epoch": 1.6145833333333335, + "grad_norm": 19.150705337524414, + "learning_rate": 6.495068694947903e-06, + "loss": 5.0216, + "step": 79360 + }, + { + "epoch": 1.61468505859375, + "grad_norm": 14.32753849029541, + "learning_rate": 6.494687298442347e-06, + "loss": 4.6721, + "step": 79365 + }, + { + "epoch": 1.6147867838541665, + "grad_norm": 17.033737182617188, + "learning_rate": 6.494305892386214e-06, + "loss": 4.8073, + "step": 79370 + }, + { + "epoch": 1.6148885091145835, + "grad_norm": 18.909549713134766, + "learning_rate": 6.493924476781941e-06, + "loss": 4.8808, + "step": 79375 + }, + { + "epoch": 1.614990234375, + "grad_norm": 16.79547119140625, + "learning_rate": 6.4935430516319655e-06, + "loss": 4.8899, + "step": 79380 + }, + { + "epoch": 1.6150919596354165, + "grad_norm": 15.674798011779785, + "learning_rate": 6.493161616938724e-06, + "loss": 5.0828, + "step": 79385 + }, + { + "epoch": 1.6151936848958335, + "grad_norm": 20.43111801147461, + "learning_rate": 6.492780172704654e-06, + "loss": 4.8021, + "step": 79390 + }, + { + "epoch": 1.61529541015625, + "grad_norm": 19.939125061035156, + "learning_rate": 6.492398718932192e-06, + "loss": 4.7796, + "step": 79395 + }, + { + "epoch": 1.6153971354166665, + "grad_norm": 14.803712844848633, + "learning_rate": 6.492017255623775e-06, + "loss": 4.9098, + "step": 79400 + }, + { + "epoch": 1.6154988606770835, + "grad_norm": 19.634672164916992, + "learning_rate": 6.491635782781843e-06, + "loss": 4.67, + "step": 79405 + }, + { + "epoch": 1.6156005859375, + "grad_norm": 19.94845199584961, + "learning_rate": 6.491254300408831e-06, + "loss": 4.6151, + "step": 79410 + }, + { + "epoch": 1.6157023111979165, + "grad_norm": 14.985784530639648, + "learning_rate": 6.490872808507178e-06, + "loss": 4.9374, + "step": 79415 + }, + { + "epoch": 1.6158040364583335, + "grad_norm": 23.477811813354492, + "learning_rate": 6.490491307079319e-06, + "loss": 4.7163, + "step": 79420 + }, + { + "epoch": 1.61590576171875, + "grad_norm": 17.842763900756836, + "learning_rate": 6.490109796127696e-06, + "loss": 4.9243, + "step": 79425 + }, + { + "epoch": 1.6160074869791665, + "grad_norm": 15.928811073303223, + "learning_rate": 6.489728275654744e-06, + "loss": 4.8791, + "step": 79430 + }, + { + "epoch": 1.6161092122395835, + "grad_norm": 17.410930633544922, + "learning_rate": 6.489346745662901e-06, + "loss": 5.1257, + "step": 79435 + }, + { + "epoch": 1.6162109375, + "grad_norm": 17.24591636657715, + "learning_rate": 6.488965206154605e-06, + "loss": 4.97, + "step": 79440 + }, + { + "epoch": 1.6163126627604165, + "grad_norm": 16.88422393798828, + "learning_rate": 6.488583657132293e-06, + "loss": 4.9461, + "step": 79445 + }, + { + "epoch": 1.6164143880208335, + "grad_norm": 15.673127174377441, + "learning_rate": 6.4882020985984065e-06, + "loss": 5.0945, + "step": 79450 + }, + { + "epoch": 1.61651611328125, + "grad_norm": 20.29756736755371, + "learning_rate": 6.48782053055538e-06, + "loss": 4.9068, + "step": 79455 + }, + { + "epoch": 1.6166178385416665, + "grad_norm": 16.04287338256836, + "learning_rate": 6.4874389530056515e-06, + "loss": 4.8449, + "step": 79460 + }, + { + "epoch": 1.6167195638020835, + "grad_norm": 21.382068634033203, + "learning_rate": 6.487057365951661e-06, + "loss": 4.8669, + "step": 79465 + }, + { + "epoch": 1.6168212890625, + "grad_norm": 16.351333618164062, + "learning_rate": 6.486675769395846e-06, + "loss": 4.7974, + "step": 79470 + }, + { + "epoch": 1.6169230143229165, + "grad_norm": 17.053565979003906, + "learning_rate": 6.486294163340644e-06, + "loss": 4.8521, + "step": 79475 + }, + { + "epoch": 1.6170247395833335, + "grad_norm": 12.610479354858398, + "learning_rate": 6.485912547788495e-06, + "loss": 4.9466, + "step": 79480 + }, + { + "epoch": 1.61712646484375, + "grad_norm": 14.360236167907715, + "learning_rate": 6.485530922741836e-06, + "loss": 5.1706, + "step": 79485 + }, + { + "epoch": 1.6172281901041665, + "grad_norm": 20.66261863708496, + "learning_rate": 6.485149288203107e-06, + "loss": 4.9525, + "step": 79490 + }, + { + "epoch": 1.6173299153645835, + "grad_norm": 18.332151412963867, + "learning_rate": 6.484767644174743e-06, + "loss": 4.6695, + "step": 79495 + }, + { + "epoch": 1.617431640625, + "grad_norm": 17.18271827697754, + "learning_rate": 6.484385990659188e-06, + "loss": 4.8061, + "step": 79500 + }, + { + "epoch": 1.6175333658854165, + "grad_norm": 16.201156616210938, + "learning_rate": 6.484004327658876e-06, + "loss": 4.9221, + "step": 79505 + }, + { + "epoch": 1.6176350911458335, + "grad_norm": 24.76453399658203, + "learning_rate": 6.483622655176248e-06, + "loss": 4.753, + "step": 79510 + }, + { + "epoch": 1.61773681640625, + "grad_norm": 15.630239486694336, + "learning_rate": 6.4832409732137415e-06, + "loss": 5.0609, + "step": 79515 + }, + { + "epoch": 1.6178385416666665, + "grad_norm": 21.397886276245117, + "learning_rate": 6.482859281773797e-06, + "loss": 5.0005, + "step": 79520 + }, + { + "epoch": 1.6179402669270835, + "grad_norm": 14.164992332458496, + "learning_rate": 6.482477580858851e-06, + "loss": 4.9101, + "step": 79525 + }, + { + "epoch": 1.6180419921875, + "grad_norm": 18.54135513305664, + "learning_rate": 6.482095870471346e-06, + "loss": 4.8984, + "step": 79530 + }, + { + "epoch": 1.6181437174479165, + "grad_norm": 15.808853149414062, + "learning_rate": 6.481714150613717e-06, + "loss": 4.8337, + "step": 79535 + }, + { + "epoch": 1.6182454427083335, + "grad_norm": 17.112333297729492, + "learning_rate": 6.481332421288407e-06, + "loss": 4.7316, + "step": 79540 + }, + { + "epoch": 1.61834716796875, + "grad_norm": 15.831459045410156, + "learning_rate": 6.480950682497852e-06, + "loss": 4.7729, + "step": 79545 + }, + { + "epoch": 1.6184488932291665, + "grad_norm": 18.83638572692871, + "learning_rate": 6.480568934244491e-06, + "loss": 4.7795, + "step": 79550 + }, + { + "epoch": 1.6185506184895835, + "grad_norm": 19.92467498779297, + "learning_rate": 6.4801871765307655e-06, + "loss": 5.0186, + "step": 79555 + }, + { + "epoch": 1.61865234375, + "grad_norm": 19.232070922851562, + "learning_rate": 6.479805409359115e-06, + "loss": 4.9355, + "step": 79560 + }, + { + "epoch": 1.6187540690104165, + "grad_norm": 22.297637939453125, + "learning_rate": 6.479423632731976e-06, + "loss": 5.0854, + "step": 79565 + }, + { + "epoch": 1.6188557942708335, + "grad_norm": 22.618080139160156, + "learning_rate": 6.47904184665179e-06, + "loss": 4.7742, + "step": 79570 + }, + { + "epoch": 1.61895751953125, + "grad_norm": 16.513233184814453, + "learning_rate": 6.4786600511209965e-06, + "loss": 4.8881, + "step": 79575 + }, + { + "epoch": 1.6190592447916665, + "grad_norm": 17.69224739074707, + "learning_rate": 6.478278246142034e-06, + "loss": 4.7137, + "step": 79580 + }, + { + "epoch": 1.6191609700520835, + "grad_norm": 14.50816822052002, + "learning_rate": 6.477896431717344e-06, + "loss": 4.76, + "step": 79585 + }, + { + "epoch": 1.6192626953125, + "grad_norm": 18.476778030395508, + "learning_rate": 6.477514607849364e-06, + "loss": 4.7089, + "step": 79590 + }, + { + "epoch": 1.6193644205729165, + "grad_norm": 22.342700958251953, + "learning_rate": 6.477132774540536e-06, + "loss": 5.0502, + "step": 79595 + }, + { + "epoch": 1.6194661458333335, + "grad_norm": 16.28926658630371, + "learning_rate": 6.476750931793297e-06, + "loss": 5.0536, + "step": 79600 + }, + { + "epoch": 1.61956787109375, + "grad_norm": 18.90643310546875, + "learning_rate": 6.476369079610088e-06, + "loss": 5.1006, + "step": 79605 + }, + { + "epoch": 1.6196695963541665, + "grad_norm": 17.4345703125, + "learning_rate": 6.47598721799335e-06, + "loss": 5.2282, + "step": 79610 + }, + { + "epoch": 1.6197713216145835, + "grad_norm": 16.219501495361328, + "learning_rate": 6.475605346945522e-06, + "loss": 4.8319, + "step": 79615 + }, + { + "epoch": 1.619873046875, + "grad_norm": 17.803775787353516, + "learning_rate": 6.475223466469045e-06, + "loss": 4.8831, + "step": 79620 + }, + { + "epoch": 1.6199747721354165, + "grad_norm": 18.204879760742188, + "learning_rate": 6.474841576566358e-06, + "loss": 5.0574, + "step": 79625 + }, + { + "epoch": 1.6200764973958335, + "grad_norm": 14.941680908203125, + "learning_rate": 6.474459677239901e-06, + "loss": 4.7653, + "step": 79630 + }, + { + "epoch": 1.62017822265625, + "grad_norm": 17.52024269104004, + "learning_rate": 6.4740777684921145e-06, + "loss": 4.7735, + "step": 79635 + }, + { + "epoch": 1.6202799479166665, + "grad_norm": 14.990971565246582, + "learning_rate": 6.473695850325439e-06, + "loss": 5.0676, + "step": 79640 + }, + { + "epoch": 1.6203816731770835, + "grad_norm": 22.76850700378418, + "learning_rate": 6.473313922742315e-06, + "loss": 5.3462, + "step": 79645 + }, + { + "epoch": 1.6204833984375, + "grad_norm": 13.757218360900879, + "learning_rate": 6.472931985745183e-06, + "loss": 4.8758, + "step": 79650 + }, + { + "epoch": 1.6205851236979165, + "grad_norm": 18.015766143798828, + "learning_rate": 6.472550039336482e-06, + "loss": 4.8368, + "step": 79655 + }, + { + "epoch": 1.6206868489583335, + "grad_norm": 24.10776138305664, + "learning_rate": 6.472168083518656e-06, + "loss": 4.9645, + "step": 79660 + }, + { + "epoch": 1.62078857421875, + "grad_norm": 18.35443115234375, + "learning_rate": 6.471786118294142e-06, + "loss": 4.6078, + "step": 79665 + }, + { + "epoch": 1.6208902994791665, + "grad_norm": 19.337791442871094, + "learning_rate": 6.471404143665383e-06, + "loss": 5.0773, + "step": 79670 + }, + { + "epoch": 1.6209920247395835, + "grad_norm": 16.163177490234375, + "learning_rate": 6.471022159634818e-06, + "loss": 4.9789, + "step": 79675 + }, + { + "epoch": 1.62109375, + "grad_norm": 17.392223358154297, + "learning_rate": 6.470640166204887e-06, + "loss": 5.0497, + "step": 79680 + }, + { + "epoch": 1.6211954752604165, + "grad_norm": 20.40245246887207, + "learning_rate": 6.470258163378034e-06, + "loss": 5.1385, + "step": 79685 + }, + { + "epoch": 1.6212972005208335, + "grad_norm": 17.178144454956055, + "learning_rate": 6.469876151156698e-06, + "loss": 5.0117, + "step": 79690 + }, + { + "epoch": 1.62139892578125, + "grad_norm": 21.492265701293945, + "learning_rate": 6.469494129543319e-06, + "loss": 4.9229, + "step": 79695 + }, + { + "epoch": 1.6215006510416665, + "grad_norm": 16.22516441345215, + "learning_rate": 6.4691120985403385e-06, + "loss": 4.8161, + "step": 79700 + }, + { + "epoch": 1.6216023763020835, + "grad_norm": 19.15074920654297, + "learning_rate": 6.468730058150199e-06, + "loss": 4.7192, + "step": 79705 + }, + { + "epoch": 1.6217041015625, + "grad_norm": 18.564590454101562, + "learning_rate": 6.468348008375341e-06, + "loss": 4.7598, + "step": 79710 + }, + { + "epoch": 1.6218058268229165, + "grad_norm": 23.01280403137207, + "learning_rate": 6.467965949218205e-06, + "loss": 4.8516, + "step": 79715 + }, + { + "epoch": 1.6219075520833335, + "grad_norm": 20.161495208740234, + "learning_rate": 6.467583880681232e-06, + "loss": 4.8666, + "step": 79720 + }, + { + "epoch": 1.62200927734375, + "grad_norm": 19.99164581298828, + "learning_rate": 6.467201802766865e-06, + "loss": 5.0447, + "step": 79725 + }, + { + "epoch": 1.6221110026041665, + "grad_norm": 26.518695831298828, + "learning_rate": 6.466819715477543e-06, + "loss": 4.9527, + "step": 79730 + }, + { + "epoch": 1.6222127278645835, + "grad_norm": 17.308137893676758, + "learning_rate": 6.466437618815708e-06, + "loss": 4.8251, + "step": 79735 + }, + { + "epoch": 1.622314453125, + "grad_norm": 27.346553802490234, + "learning_rate": 6.466055512783805e-06, + "loss": 4.9775, + "step": 79740 + }, + { + "epoch": 1.6224161783854165, + "grad_norm": 19.494064331054688, + "learning_rate": 6.46567339738427e-06, + "loss": 4.9831, + "step": 79745 + }, + { + "epoch": 1.6225179036458335, + "grad_norm": 15.605834007263184, + "learning_rate": 6.465291272619549e-06, + "loss": 4.9836, + "step": 79750 + }, + { + "epoch": 1.62261962890625, + "grad_norm": 18.527332305908203, + "learning_rate": 6.46490913849208e-06, + "loss": 4.9447, + "step": 79755 + }, + { + "epoch": 1.6227213541666665, + "grad_norm": 15.477921485900879, + "learning_rate": 6.464526995004309e-06, + "loss": 4.932, + "step": 79760 + }, + { + "epoch": 1.6228230794270835, + "grad_norm": 21.075191497802734, + "learning_rate": 6.4641448421586725e-06, + "loss": 4.712, + "step": 79765 + }, + { + "epoch": 1.6229248046875, + "grad_norm": 16.07645034790039, + "learning_rate": 6.463762679957618e-06, + "loss": 4.8009, + "step": 79770 + }, + { + "epoch": 1.6230265299479165, + "grad_norm": 19.45242691040039, + "learning_rate": 6.463380508403583e-06, + "loss": 4.9204, + "step": 79775 + }, + { + "epoch": 1.6231282552083335, + "grad_norm": 19.162099838256836, + "learning_rate": 6.462998327499013e-06, + "loss": 4.8226, + "step": 79780 + }, + { + "epoch": 1.62322998046875, + "grad_norm": 18.130842208862305, + "learning_rate": 6.4626161372463445e-06, + "loss": 5.2606, + "step": 79785 + }, + { + "epoch": 1.6233317057291665, + "grad_norm": 19.035083770751953, + "learning_rate": 6.462233937648027e-06, + "loss": 4.7833, + "step": 79790 + }, + { + "epoch": 1.6234334309895835, + "grad_norm": 17.585783004760742, + "learning_rate": 6.461851728706497e-06, + "loss": 4.9966, + "step": 79795 + }, + { + "epoch": 1.62353515625, + "grad_norm": 18.214862823486328, + "learning_rate": 6.461469510424199e-06, + "loss": 4.8352, + "step": 79800 + }, + { + "epoch": 1.6236368815104165, + "grad_norm": 21.993497848510742, + "learning_rate": 6.461087282803574e-06, + "loss": 4.6405, + "step": 79805 + }, + { + "epoch": 1.6237386067708335, + "grad_norm": 19.765789031982422, + "learning_rate": 6.460705045847065e-06, + "loss": 4.9051, + "step": 79810 + }, + { + "epoch": 1.62384033203125, + "grad_norm": 22.47144889831543, + "learning_rate": 6.4603227995571165e-06, + "loss": 4.9421, + "step": 79815 + }, + { + "epoch": 1.6239420572916665, + "grad_norm": 18.546411514282227, + "learning_rate": 6.459940543936166e-06, + "loss": 4.8674, + "step": 79820 + }, + { + "epoch": 1.6240437825520835, + "grad_norm": 19.893482208251953, + "learning_rate": 6.459558278986661e-06, + "loss": 5.0286, + "step": 79825 + }, + { + "epoch": 1.6241455078125, + "grad_norm": 17.03301239013672, + "learning_rate": 6.4591760047110415e-06, + "loss": 5.0392, + "step": 79830 + }, + { + "epoch": 1.6242472330729165, + "grad_norm": 19.25391960144043, + "learning_rate": 6.458793721111749e-06, + "loss": 4.9724, + "step": 79835 + }, + { + "epoch": 1.6243489583333335, + "grad_norm": 19.061738967895508, + "learning_rate": 6.458411428191231e-06, + "loss": 4.8283, + "step": 79840 + }, + { + "epoch": 1.62445068359375, + "grad_norm": 15.64952564239502, + "learning_rate": 6.458029125951925e-06, + "loss": 4.8616, + "step": 79845 + }, + { + "epoch": 1.6245524088541665, + "grad_norm": 22.988609313964844, + "learning_rate": 6.457646814396275e-06, + "loss": 4.6406, + "step": 79850 + }, + { + "epoch": 1.6246541341145835, + "grad_norm": 18.00729751586914, + "learning_rate": 6.457264493526725e-06, + "loss": 4.7843, + "step": 79855 + }, + { + "epoch": 1.624755859375, + "grad_norm": 18.055078506469727, + "learning_rate": 6.456882163345717e-06, + "loss": 5.0577, + "step": 79860 + }, + { + "epoch": 1.6248575846354165, + "grad_norm": 19.72256088256836, + "learning_rate": 6.456499823855695e-06, + "loss": 5.1287, + "step": 79865 + }, + { + "epoch": 1.6249593098958335, + "grad_norm": 18.31966781616211, + "learning_rate": 6.4561174750591024e-06, + "loss": 5.1498, + "step": 79870 + }, + { + "epoch": 1.62506103515625, + "grad_norm": 21.281253814697266, + "learning_rate": 6.45573511695838e-06, + "loss": 4.8352, + "step": 79875 + }, + { + "epoch": 1.6251627604166665, + "grad_norm": 20.072938919067383, + "learning_rate": 6.455352749555974e-06, + "loss": 5.1637, + "step": 79880 + }, + { + "epoch": 1.6252644856770835, + "grad_norm": 16.015905380249023, + "learning_rate": 6.454970372854323e-06, + "loss": 4.7509, + "step": 79885 + }, + { + "epoch": 1.6253662109375, + "grad_norm": 21.40315818786621, + "learning_rate": 6.454587986855876e-06, + "loss": 4.8015, + "step": 79890 + }, + { + "epoch": 1.6254679361979165, + "grad_norm": 17.854122161865234, + "learning_rate": 6.4542055915630715e-06, + "loss": 5.3212, + "step": 79895 + }, + { + "epoch": 1.6255696614583335, + "grad_norm": 21.31396484375, + "learning_rate": 6.453823186978356e-06, + "loss": 4.9846, + "step": 79900 + }, + { + "epoch": 1.62567138671875, + "grad_norm": 22.187000274658203, + "learning_rate": 6.453440773104172e-06, + "loss": 4.9646, + "step": 79905 + }, + { + "epoch": 1.6257731119791665, + "grad_norm": 21.868677139282227, + "learning_rate": 6.453058349942963e-06, + "loss": 5.0676, + "step": 79910 + }, + { + "epoch": 1.6258748372395835, + "grad_norm": 20.841535568237305, + "learning_rate": 6.452675917497171e-06, + "loss": 5.1026, + "step": 79915 + }, + { + "epoch": 1.6259765625, + "grad_norm": 18.464685440063477, + "learning_rate": 6.4522934757692425e-06, + "loss": 4.9122, + "step": 79920 + }, + { + "epoch": 1.6260782877604165, + "grad_norm": 17.142271041870117, + "learning_rate": 6.451911024761619e-06, + "loss": 5.0017, + "step": 79925 + }, + { + "epoch": 1.6261800130208335, + "grad_norm": 21.487943649291992, + "learning_rate": 6.4515285644767456e-06, + "loss": 4.9539, + "step": 79930 + }, + { + "epoch": 1.62628173828125, + "grad_norm": 18.309202194213867, + "learning_rate": 6.451146094917065e-06, + "loss": 4.8461, + "step": 79935 + }, + { + "epoch": 1.6263834635416665, + "grad_norm": 15.868523597717285, + "learning_rate": 6.450763616085022e-06, + "loss": 4.8921, + "step": 79940 + }, + { + "epoch": 1.6264851888020835, + "grad_norm": 16.626140594482422, + "learning_rate": 6.45038112798306e-06, + "loss": 4.9102, + "step": 79945 + }, + { + "epoch": 1.6265869140625, + "grad_norm": 17.228368759155273, + "learning_rate": 6.449998630613621e-06, + "loss": 4.8516, + "step": 79950 + }, + { + "epoch": 1.6266886393229165, + "grad_norm": 18.39137077331543, + "learning_rate": 6.449616123979153e-06, + "loss": 4.7127, + "step": 79955 + }, + { + "epoch": 1.6267903645833335, + "grad_norm": 18.95516586303711, + "learning_rate": 6.4492336080820995e-06, + "loss": 4.9737, + "step": 79960 + }, + { + "epoch": 1.62689208984375, + "grad_norm": 19.85725975036621, + "learning_rate": 6.448851082924901e-06, + "loss": 4.9772, + "step": 79965 + }, + { + "epoch": 1.6269938151041665, + "grad_norm": 19.681840896606445, + "learning_rate": 6.448468548510004e-06, + "loss": 5.0282, + "step": 79970 + }, + { + "epoch": 1.6270955403645835, + "grad_norm": 22.179441452026367, + "learning_rate": 6.448086004839853e-06, + "loss": 4.8544, + "step": 79975 + }, + { + "epoch": 1.627197265625, + "grad_norm": 13.833584785461426, + "learning_rate": 6.447703451916894e-06, + "loss": 4.8066, + "step": 79980 + }, + { + "epoch": 1.6272989908854165, + "grad_norm": 15.850955963134766, + "learning_rate": 6.447320889743567e-06, + "loss": 4.7865, + "step": 79985 + }, + { + "epoch": 1.6274007161458335, + "grad_norm": 17.83222198486328, + "learning_rate": 6.44693831832232e-06, + "loss": 4.8463, + "step": 79990 + }, + { + "epoch": 1.62750244140625, + "grad_norm": 24.220321655273438, + "learning_rate": 6.446555737655597e-06, + "loss": 5.0249, + "step": 79995 + }, + { + "epoch": 1.6276041666666665, + "grad_norm": 22.2767276763916, + "learning_rate": 6.446173147745841e-06, + "loss": 4.7656, + "step": 80000 + }, + { + "epoch": 1.6277058919270835, + "grad_norm": 22.764915466308594, + "learning_rate": 6.445790548595497e-06, + "loss": 4.886, + "step": 80005 + }, + { + "epoch": 1.6278076171875, + "grad_norm": 40.14106369018555, + "learning_rate": 6.445407940207012e-06, + "loss": 4.7217, + "step": 80010 + }, + { + "epoch": 1.6279093424479165, + "grad_norm": 20.464500427246094, + "learning_rate": 6.445025322582828e-06, + "loss": 4.7545, + "step": 80015 + }, + { + "epoch": 1.6280110677083335, + "grad_norm": 19.095458984375, + "learning_rate": 6.444642695725393e-06, + "loss": 5.2449, + "step": 80020 + }, + { + "epoch": 1.62811279296875, + "grad_norm": 18.174388885498047, + "learning_rate": 6.444260059637146e-06, + "loss": 4.9383, + "step": 80025 + }, + { + "epoch": 1.6282145182291665, + "grad_norm": 17.990154266357422, + "learning_rate": 6.44387741432054e-06, + "loss": 4.8902, + "step": 80030 + }, + { + "epoch": 1.6283162434895835, + "grad_norm": 16.869213104248047, + "learning_rate": 6.443494759778012e-06, + "loss": 4.9049, + "step": 80035 + }, + { + "epoch": 1.62841796875, + "grad_norm": 20.97643280029297, + "learning_rate": 6.443112096012013e-06, + "loss": 4.8837, + "step": 80040 + }, + { + "epoch": 1.6285196940104165, + "grad_norm": 16.77023696899414, + "learning_rate": 6.442729423024985e-06, + "loss": 4.8256, + "step": 80045 + }, + { + "epoch": 1.6286214192708335, + "grad_norm": 14.819687843322754, + "learning_rate": 6.442346740819374e-06, + "loss": 5.0516, + "step": 80050 + }, + { + "epoch": 1.62872314453125, + "grad_norm": 24.086366653442383, + "learning_rate": 6.441964049397624e-06, + "loss": 4.9114, + "step": 80055 + }, + { + "epoch": 1.6288248697916665, + "grad_norm": 20.526554107666016, + "learning_rate": 6.441581348762184e-06, + "loss": 4.8418, + "step": 80060 + }, + { + "epoch": 1.6289265950520835, + "grad_norm": 16.82411003112793, + "learning_rate": 6.441198638915496e-06, + "loss": 4.8452, + "step": 80065 + }, + { + "epoch": 1.6290283203125, + "grad_norm": 22.679855346679688, + "learning_rate": 6.440815919860004e-06, + "loss": 4.8925, + "step": 80070 + }, + { + "epoch": 1.6291300455729165, + "grad_norm": 16.184446334838867, + "learning_rate": 6.440433191598159e-06, + "loss": 5.1166, + "step": 80075 + }, + { + "epoch": 1.6292317708333335, + "grad_norm": 16.187679290771484, + "learning_rate": 6.4400504541324e-06, + "loss": 4.8875, + "step": 80080 + }, + { + "epoch": 1.62933349609375, + "grad_norm": 20.873281478881836, + "learning_rate": 6.439667707465178e-06, + "loss": 4.8066, + "step": 80085 + }, + { + "epoch": 1.6294352213541665, + "grad_norm": 19.708038330078125, + "learning_rate": 6.439284951598937e-06, + "loss": 5.1401, + "step": 80090 + }, + { + "epoch": 1.6295369466145835, + "grad_norm": 38.39317321777344, + "learning_rate": 6.438902186536119e-06, + "loss": 4.9448, + "step": 80095 + }, + { + "epoch": 1.629638671875, + "grad_norm": 18.833364486694336, + "learning_rate": 6.438519412279175e-06, + "loss": 5.1009, + "step": 80100 + }, + { + "epoch": 1.6297403971354165, + "grad_norm": 22.054887771606445, + "learning_rate": 6.438136628830547e-06, + "loss": 4.8395, + "step": 80105 + }, + { + "epoch": 1.6298421223958335, + "grad_norm": 20.037933349609375, + "learning_rate": 6.437753836192684e-06, + "loss": 5.2487, + "step": 80110 + }, + { + "epoch": 1.62994384765625, + "grad_norm": 18.947776794433594, + "learning_rate": 6.437371034368031e-06, + "loss": 4.874, + "step": 80115 + }, + { + "epoch": 1.6300455729166665, + "grad_norm": 21.380807876586914, + "learning_rate": 6.436988223359031e-06, + "loss": 4.9302, + "step": 80120 + }, + { + "epoch": 1.6301472981770835, + "grad_norm": 22.003881454467773, + "learning_rate": 6.436605403168135e-06, + "loss": 5.1496, + "step": 80125 + }, + { + "epoch": 1.6302490234375, + "grad_norm": 23.59336280822754, + "learning_rate": 6.4362225737977855e-06, + "loss": 4.8872, + "step": 80130 + }, + { + "epoch": 1.6303507486979165, + "grad_norm": 14.776691436767578, + "learning_rate": 6.435839735250429e-06, + "loss": 4.966, + "step": 80135 + }, + { + "epoch": 1.6304524739583335, + "grad_norm": 20.572660446166992, + "learning_rate": 6.435456887528513e-06, + "loss": 4.7821, + "step": 80140 + }, + { + "epoch": 1.63055419921875, + "grad_norm": 16.993242263793945, + "learning_rate": 6.4350740306344815e-06, + "loss": 4.964, + "step": 80145 + }, + { + "epoch": 1.6306559244791665, + "grad_norm": 21.82431411743164, + "learning_rate": 6.434691164570784e-06, + "loss": 4.7461, + "step": 80150 + }, + { + "epoch": 1.6307576497395835, + "grad_norm": 16.945966720581055, + "learning_rate": 6.434308289339866e-06, + "loss": 4.9662, + "step": 80155 + }, + { + "epoch": 1.630859375, + "grad_norm": 17.584758758544922, + "learning_rate": 6.433925404944172e-06, + "loss": 4.9441, + "step": 80160 + }, + { + "epoch": 1.6309611002604165, + "grad_norm": 23.023019790649414, + "learning_rate": 6.4335425113861505e-06, + "loss": 4.8025, + "step": 80165 + }, + { + "epoch": 1.6310628255208335, + "grad_norm": 19.963773727416992, + "learning_rate": 6.433159608668248e-06, + "loss": 4.9922, + "step": 80170 + }, + { + "epoch": 1.63116455078125, + "grad_norm": 16.303091049194336, + "learning_rate": 6.43277669679291e-06, + "loss": 4.7964, + "step": 80175 + }, + { + "epoch": 1.6312662760416665, + "grad_norm": 16.483985900878906, + "learning_rate": 6.432393775762583e-06, + "loss": 4.883, + "step": 80180 + }, + { + "epoch": 1.6313680013020835, + "grad_norm": 19.713199615478516, + "learning_rate": 6.432010845579715e-06, + "loss": 4.9816, + "step": 80185 + }, + { + "epoch": 1.6314697265625, + "grad_norm": 15.415249824523926, + "learning_rate": 6.431627906246752e-06, + "loss": 4.8723, + "step": 80190 + }, + { + "epoch": 1.6315714518229165, + "grad_norm": 17.816801071166992, + "learning_rate": 6.431244957766142e-06, + "loss": 4.9935, + "step": 80195 + }, + { + "epoch": 1.6316731770833335, + "grad_norm": 12.99370288848877, + "learning_rate": 6.430862000140331e-06, + "loss": 4.8395, + "step": 80200 + }, + { + "epoch": 1.63177490234375, + "grad_norm": 18.644527435302734, + "learning_rate": 6.430479033371766e-06, + "loss": 4.9557, + "step": 80205 + }, + { + "epoch": 1.6318766276041665, + "grad_norm": 20.267696380615234, + "learning_rate": 6.430096057462893e-06, + "loss": 5.1055, + "step": 80210 + }, + { + "epoch": 1.6319783528645835, + "grad_norm": 18.32171630859375, + "learning_rate": 6.429713072416161e-06, + "loss": 5.1088, + "step": 80215 + }, + { + "epoch": 1.632080078125, + "grad_norm": 16.087800979614258, + "learning_rate": 6.429330078234017e-06, + "loss": 5.0313, + "step": 80220 + }, + { + "epoch": 1.6321818033854165, + "grad_norm": 18.400449752807617, + "learning_rate": 6.428947074918906e-06, + "loss": 4.8394, + "step": 80225 + }, + { + "epoch": 1.6322835286458335, + "grad_norm": 22.76713752746582, + "learning_rate": 6.428564062473278e-06, + "loss": 5.0518, + "step": 80230 + }, + { + "epoch": 1.63238525390625, + "grad_norm": 17.115724563598633, + "learning_rate": 6.428181040899578e-06, + "loss": 4.9089, + "step": 80235 + }, + { + "epoch": 1.6324869791666665, + "grad_norm": 18.395465850830078, + "learning_rate": 6.427798010200255e-06, + "loss": 4.9853, + "step": 80240 + }, + { + "epoch": 1.6325887044270835, + "grad_norm": 20.805696487426758, + "learning_rate": 6.427414970377758e-06, + "loss": 4.9141, + "step": 80245 + }, + { + "epoch": 1.6326904296875, + "grad_norm": 20.68460464477539, + "learning_rate": 6.42703192143453e-06, + "loss": 4.7854, + "step": 80250 + }, + { + "epoch": 1.6327921549479165, + "grad_norm": 20.265033721923828, + "learning_rate": 6.426648863373022e-06, + "loss": 4.9712, + "step": 80255 + }, + { + "epoch": 1.6328938802083335, + "grad_norm": 18.78996467590332, + "learning_rate": 6.42626579619568e-06, + "loss": 4.9301, + "step": 80260 + }, + { + "epoch": 1.63299560546875, + "grad_norm": 21.349594116210938, + "learning_rate": 6.4258827199049525e-06, + "loss": 5.019, + "step": 80265 + }, + { + "epoch": 1.6330973307291665, + "grad_norm": 17.641357421875, + "learning_rate": 6.425499634503287e-06, + "loss": 4.8525, + "step": 80270 + }, + { + "epoch": 1.6331990559895835, + "grad_norm": 16.806190490722656, + "learning_rate": 6.425116539993131e-06, + "loss": 5.0181, + "step": 80275 + }, + { + "epoch": 1.63330078125, + "grad_norm": 23.82420539855957, + "learning_rate": 6.424733436376934e-06, + "loss": 4.97, + "step": 80280 + }, + { + "epoch": 1.6334025065104165, + "grad_norm": 20.265413284301758, + "learning_rate": 6.424350323657143e-06, + "loss": 4.6054, + "step": 80285 + }, + { + "epoch": 1.6335042317708335, + "grad_norm": 17.052162170410156, + "learning_rate": 6.423967201836203e-06, + "loss": 4.91, + "step": 80290 + }, + { + "epoch": 1.63360595703125, + "grad_norm": 19.153797149658203, + "learning_rate": 6.4235840709165665e-06, + "loss": 4.9566, + "step": 80295 + }, + { + "epoch": 1.6337076822916665, + "grad_norm": 16.369598388671875, + "learning_rate": 6.423200930900679e-06, + "loss": 4.9631, + "step": 80300 + }, + { + "epoch": 1.6338094075520835, + "grad_norm": 18.60525894165039, + "learning_rate": 6.422817781790989e-06, + "loss": 4.8859, + "step": 80305 + }, + { + "epoch": 1.6339111328125, + "grad_norm": 15.037546157836914, + "learning_rate": 6.422434623589946e-06, + "loss": 4.9013, + "step": 80310 + }, + { + "epoch": 1.6340128580729165, + "grad_norm": 16.953874588012695, + "learning_rate": 6.422051456299996e-06, + "loss": 4.704, + "step": 80315 + }, + { + "epoch": 1.6341145833333335, + "grad_norm": 25.255796432495117, + "learning_rate": 6.421668279923591e-06, + "loss": 5.0714, + "step": 80320 + }, + { + "epoch": 1.63421630859375, + "grad_norm": 18.789180755615234, + "learning_rate": 6.421285094463175e-06, + "loss": 4.6083, + "step": 80325 + }, + { + "epoch": 1.6343180338541665, + "grad_norm": 19.97476577758789, + "learning_rate": 6.420901899921199e-06, + "loss": 4.5849, + "step": 80330 + }, + { + "epoch": 1.6344197591145835, + "grad_norm": 17.352514266967773, + "learning_rate": 6.420518696300111e-06, + "loss": 4.9939, + "step": 80335 + }, + { + "epoch": 1.634521484375, + "grad_norm": 19.127592086791992, + "learning_rate": 6.420135483602359e-06, + "loss": 4.7658, + "step": 80340 + }, + { + "epoch": 1.6346232096354165, + "grad_norm": 18.89628791809082, + "learning_rate": 6.4197522618303924e-06, + "loss": 4.8924, + "step": 80345 + }, + { + "epoch": 1.6347249348958335, + "grad_norm": 15.67961311340332, + "learning_rate": 6.419369030986658e-06, + "loss": 4.9475, + "step": 80350 + }, + { + "epoch": 1.63482666015625, + "grad_norm": 16.645627975463867, + "learning_rate": 6.4189857910736085e-06, + "loss": 4.8691, + "step": 80355 + }, + { + "epoch": 1.6349283854166665, + "grad_norm": 20.68548583984375, + "learning_rate": 6.41860254209369e-06, + "loss": 5.0003, + "step": 80360 + }, + { + "epoch": 1.6350301106770835, + "grad_norm": 21.81602668762207, + "learning_rate": 6.4182192840493506e-06, + "loss": 5.0756, + "step": 80365 + }, + { + "epoch": 1.6351318359375, + "grad_norm": 14.657315254211426, + "learning_rate": 6.417836016943041e-06, + "loss": 5.0258, + "step": 80370 + }, + { + "epoch": 1.6352335611979165, + "grad_norm": 16.986730575561523, + "learning_rate": 6.4174527407772095e-06, + "loss": 4.9647, + "step": 80375 + }, + { + "epoch": 1.6353352864583335, + "grad_norm": 21.55415916442871, + "learning_rate": 6.417069455554303e-06, + "loss": 5.1395, + "step": 80380 + }, + { + "epoch": 1.63543701171875, + "grad_norm": 16.313125610351562, + "learning_rate": 6.416686161276775e-06, + "loss": 4.9742, + "step": 80385 + }, + { + "epoch": 1.6355387369791665, + "grad_norm": 15.933833122253418, + "learning_rate": 6.41630285794707e-06, + "loss": 4.7545, + "step": 80390 + }, + { + "epoch": 1.6356404622395835, + "grad_norm": 21.274398803710938, + "learning_rate": 6.415919545567641e-06, + "loss": 4.8121, + "step": 80395 + }, + { + "epoch": 1.6357421875, + "grad_norm": 23.952966690063477, + "learning_rate": 6.415536224140934e-06, + "loss": 4.8159, + "step": 80400 + }, + { + "epoch": 1.6358439127604165, + "grad_norm": 19.144977569580078, + "learning_rate": 6.415152893669401e-06, + "loss": 4.812, + "step": 80405 + }, + { + "epoch": 1.6359456380208335, + "grad_norm": 17.343252182006836, + "learning_rate": 6.41476955415549e-06, + "loss": 4.7942, + "step": 80410 + }, + { + "epoch": 1.63604736328125, + "grad_norm": 13.608453750610352, + "learning_rate": 6.414386205601651e-06, + "loss": 5.026, + "step": 80415 + }, + { + "epoch": 1.6361490885416665, + "grad_norm": 15.951788902282715, + "learning_rate": 6.414002848010332e-06, + "loss": 4.919, + "step": 80420 + }, + { + "epoch": 1.6362508138020835, + "grad_norm": 13.039834976196289, + "learning_rate": 6.413619481383986e-06, + "loss": 4.8113, + "step": 80425 + }, + { + "epoch": 1.6363525390625, + "grad_norm": 18.65108871459961, + "learning_rate": 6.413236105725057e-06, + "loss": 4.9881, + "step": 80430 + }, + { + "epoch": 1.6364542643229165, + "grad_norm": 15.573071479797363, + "learning_rate": 6.4128527210360006e-06, + "loss": 4.9957, + "step": 80435 + }, + { + "epoch": 1.6365559895833335, + "grad_norm": 19.92762565612793, + "learning_rate": 6.412469327319261e-06, + "loss": 5.1565, + "step": 80440 + }, + { + "epoch": 1.63665771484375, + "grad_norm": 16.337862014770508, + "learning_rate": 6.412085924577293e-06, + "loss": 4.9938, + "step": 80445 + }, + { + "epoch": 1.6367594401041665, + "grad_norm": 15.366190910339355, + "learning_rate": 6.411702512812544e-06, + "loss": 4.9221, + "step": 80450 + }, + { + "epoch": 1.6368611653645835, + "grad_norm": 23.882179260253906, + "learning_rate": 6.4113190920274635e-06, + "loss": 4.8239, + "step": 80455 + }, + { + "epoch": 1.636962890625, + "grad_norm": 14.545907020568848, + "learning_rate": 6.4109356622245026e-06, + "loss": 4.8155, + "step": 80460 + }, + { + "epoch": 1.6370646158854165, + "grad_norm": 19.461246490478516, + "learning_rate": 6.41055222340611e-06, + "loss": 4.9354, + "step": 80465 + }, + { + "epoch": 1.6371663411458335, + "grad_norm": 19.685134887695312, + "learning_rate": 6.410168775574736e-06, + "loss": 4.8928, + "step": 80470 + }, + { + "epoch": 1.63726806640625, + "grad_norm": 21.25920867919922, + "learning_rate": 6.409785318732833e-06, + "loss": 4.8602, + "step": 80475 + }, + { + "epoch": 1.6373697916666665, + "grad_norm": 19.873130798339844, + "learning_rate": 6.409401852882848e-06, + "loss": 4.8417, + "step": 80480 + }, + { + "epoch": 1.6374715169270835, + "grad_norm": 18.755388259887695, + "learning_rate": 6.409018378027234e-06, + "loss": 4.9416, + "step": 80485 + }, + { + "epoch": 1.6375732421875, + "grad_norm": 18.65032958984375, + "learning_rate": 6.40863489416844e-06, + "loss": 4.7616, + "step": 80490 + }, + { + "epoch": 1.6376749674479165, + "grad_norm": 16.79813575744629, + "learning_rate": 6.408251401308913e-06, + "loss": 4.9389, + "step": 80495 + }, + { + "epoch": 1.6377766927083335, + "grad_norm": 13.175562858581543, + "learning_rate": 6.40786789945111e-06, + "loss": 4.8388, + "step": 80500 + }, + { + "epoch": 1.63787841796875, + "grad_norm": 16.53670883178711, + "learning_rate": 6.407484388597477e-06, + "loss": 4.7499, + "step": 80505 + }, + { + "epoch": 1.6379801432291665, + "grad_norm": 17.829242706298828, + "learning_rate": 6.407100868750465e-06, + "loss": 4.8451, + "step": 80510 + }, + { + "epoch": 1.6380818684895835, + "grad_norm": 18.450292587280273, + "learning_rate": 6.406717339912525e-06, + "loss": 4.8574, + "step": 80515 + }, + { + "epoch": 1.63818359375, + "grad_norm": 14.151050567626953, + "learning_rate": 6.406333802086108e-06, + "loss": 5.0257, + "step": 80520 + }, + { + "epoch": 1.6382853190104165, + "grad_norm": 17.352699279785156, + "learning_rate": 6.405950255273664e-06, + "loss": 4.8077, + "step": 80525 + }, + { + "epoch": 1.6383870442708335, + "grad_norm": 19.21209144592285, + "learning_rate": 6.405566699477646e-06, + "loss": 5.0493, + "step": 80530 + }, + { + "epoch": 1.63848876953125, + "grad_norm": 20.328590393066406, + "learning_rate": 6.4051831347005014e-06, + "loss": 5.0632, + "step": 80535 + }, + { + "epoch": 1.6385904947916665, + "grad_norm": 11.77135181427002, + "learning_rate": 6.4047995609446825e-06, + "loss": 4.7668, + "step": 80540 + }, + { + "epoch": 1.6386922200520835, + "grad_norm": 18.270431518554688, + "learning_rate": 6.40441597821264e-06, + "loss": 4.7821, + "step": 80545 + }, + { + "epoch": 1.6387939453125, + "grad_norm": 16.409805297851562, + "learning_rate": 6.404032386506827e-06, + "loss": 4.8775, + "step": 80550 + }, + { + "epoch": 1.6388956705729165, + "grad_norm": 16.97492218017578, + "learning_rate": 6.403648785829693e-06, + "loss": 4.7142, + "step": 80555 + }, + { + "epoch": 1.6389973958333335, + "grad_norm": 19.022132873535156, + "learning_rate": 6.403265176183686e-06, + "loss": 5.1948, + "step": 80560 + }, + { + "epoch": 1.63909912109375, + "grad_norm": 20.30731773376465, + "learning_rate": 6.402881557571262e-06, + "loss": 4.7252, + "step": 80565 + }, + { + "epoch": 1.6392008463541665, + "grad_norm": 20.937074661254883, + "learning_rate": 6.402497929994869e-06, + "loss": 5.2222, + "step": 80570 + }, + { + "epoch": 1.6393025716145835, + "grad_norm": 19.292390823364258, + "learning_rate": 6.402114293456959e-06, + "loss": 4.7546, + "step": 80575 + }, + { + "epoch": 1.639404296875, + "grad_norm": 20.178415298461914, + "learning_rate": 6.4017306479599864e-06, + "loss": 4.8688, + "step": 80580 + }, + { + "epoch": 1.6395060221354165, + "grad_norm": 18.056495666503906, + "learning_rate": 6.401346993506397e-06, + "loss": 4.8254, + "step": 80585 + }, + { + "epoch": 1.6396077473958335, + "grad_norm": 15.970878601074219, + "learning_rate": 6.400963330098647e-06, + "loss": 5.16, + "step": 80590 + }, + { + "epoch": 1.63970947265625, + "grad_norm": 29.355566024780273, + "learning_rate": 6.4005796577391866e-06, + "loss": 5.044, + "step": 80595 + }, + { + "epoch": 1.6398111979166665, + "grad_norm": 17.166484832763672, + "learning_rate": 6.4001959764304655e-06, + "loss": 5.1454, + "step": 80600 + }, + { + "epoch": 1.6399129231770835, + "grad_norm": 18.590620040893555, + "learning_rate": 6.399812286174937e-06, + "loss": 4.7306, + "step": 80605 + }, + { + "epoch": 1.6400146484375, + "grad_norm": 19.957386016845703, + "learning_rate": 6.399428586975052e-06, + "loss": 4.8173, + "step": 80610 + }, + { + "epoch": 1.6401163736979165, + "grad_norm": 15.94602108001709, + "learning_rate": 6.399044878833262e-06, + "loss": 4.7067, + "step": 80615 + }, + { + "epoch": 1.6402180989583335, + "grad_norm": 15.20630931854248, + "learning_rate": 6.398661161752021e-06, + "loss": 5.096, + "step": 80620 + }, + { + "epoch": 1.64031982421875, + "grad_norm": 18.08597755432129, + "learning_rate": 6.3982774357337785e-06, + "loss": 4.7025, + "step": 80625 + }, + { + "epoch": 1.6404215494791665, + "grad_norm": 18.766305923461914, + "learning_rate": 6.397893700780987e-06, + "loss": 4.8801, + "step": 80630 + }, + { + "epoch": 1.6405232747395835, + "grad_norm": 16.544086456298828, + "learning_rate": 6.397509956896098e-06, + "loss": 4.9984, + "step": 80635 + }, + { + "epoch": 1.640625, + "grad_norm": 17.77387809753418, + "learning_rate": 6.397126204081565e-06, + "loss": 4.6905, + "step": 80640 + }, + { + "epoch": 1.6407267252604165, + "grad_norm": 16.53368377685547, + "learning_rate": 6.396742442339839e-06, + "loss": 4.7903, + "step": 80645 + }, + { + "epoch": 1.6408284505208335, + "grad_norm": 16.982990264892578, + "learning_rate": 6.396358671673371e-06, + "loss": 4.6737, + "step": 80650 + }, + { + "epoch": 1.64093017578125, + "grad_norm": 21.055984497070312, + "learning_rate": 6.3959748920846154e-06, + "loss": 4.5987, + "step": 80655 + }, + { + "epoch": 1.6410319010416665, + "grad_norm": 15.034321784973145, + "learning_rate": 6.395591103576023e-06, + "loss": 4.8948, + "step": 80660 + }, + { + "epoch": 1.6411336263020835, + "grad_norm": 18.17619514465332, + "learning_rate": 6.395207306150046e-06, + "loss": 4.8614, + "step": 80665 + }, + { + "epoch": 1.6412353515625, + "grad_norm": 15.231200218200684, + "learning_rate": 6.394823499809138e-06, + "loss": 4.9091, + "step": 80670 + }, + { + "epoch": 1.6413370768229165, + "grad_norm": 18.36714744567871, + "learning_rate": 6.39443968455575e-06, + "loss": 4.689, + "step": 80675 + }, + { + "epoch": 1.6414388020833335, + "grad_norm": 31.180469512939453, + "learning_rate": 6.394055860392336e-06, + "loss": 5.2506, + "step": 80680 + }, + { + "epoch": 1.64154052734375, + "grad_norm": 17.80890655517578, + "learning_rate": 6.393672027321347e-06, + "loss": 4.8716, + "step": 80685 + }, + { + "epoch": 1.6416422526041665, + "grad_norm": 15.907085418701172, + "learning_rate": 6.3932881853452355e-06, + "loss": 5.0533, + "step": 80690 + }, + { + "epoch": 1.6417439778645835, + "grad_norm": 16.4301815032959, + "learning_rate": 6.392904334466457e-06, + "loss": 5.0053, + "step": 80695 + }, + { + "epoch": 1.641845703125, + "grad_norm": 14.319703102111816, + "learning_rate": 6.3925204746874584e-06, + "loss": 4.9848, + "step": 80700 + }, + { + "epoch": 1.6419474283854165, + "grad_norm": 31.431838989257812, + "learning_rate": 6.392136606010699e-06, + "loss": 4.7706, + "step": 80705 + }, + { + "epoch": 1.6420491536458335, + "grad_norm": 21.196956634521484, + "learning_rate": 6.3917527284386275e-06, + "loss": 4.9668, + "step": 80710 + }, + { + "epoch": 1.64215087890625, + "grad_norm": 17.14997673034668, + "learning_rate": 6.391368841973696e-06, + "loss": 4.9131, + "step": 80715 + }, + { + "epoch": 1.6422526041666665, + "grad_norm": 23.51473045349121, + "learning_rate": 6.390984946618362e-06, + "loss": 4.7708, + "step": 80720 + }, + { + "epoch": 1.6423543294270835, + "grad_norm": 13.308450698852539, + "learning_rate": 6.390601042375075e-06, + "loss": 4.9197, + "step": 80725 + }, + { + "epoch": 1.6424560546875, + "grad_norm": 17.428943634033203, + "learning_rate": 6.390217129246289e-06, + "loss": 4.9939, + "step": 80730 + }, + { + "epoch": 1.6425577799479165, + "grad_norm": 19.864978790283203, + "learning_rate": 6.389833207234457e-06, + "loss": 5.1136, + "step": 80735 + }, + { + "epoch": 1.6426595052083335, + "grad_norm": 17.97338104248047, + "learning_rate": 6.38944927634203e-06, + "loss": 4.6842, + "step": 80740 + }, + { + "epoch": 1.64276123046875, + "grad_norm": 16.571027755737305, + "learning_rate": 6.389065336571466e-06, + "loss": 5.0518, + "step": 80745 + }, + { + "epoch": 1.6428629557291665, + "grad_norm": 41.82326126098633, + "learning_rate": 6.388681387925215e-06, + "loss": 5.449, + "step": 80750 + }, + { + "epoch": 1.6429646809895835, + "grad_norm": 12.66007137298584, + "learning_rate": 6.3882974304057285e-06, + "loss": 5.0176, + "step": 80755 + }, + { + "epoch": 1.64306640625, + "grad_norm": 13.119610786437988, + "learning_rate": 6.387913464015465e-06, + "loss": 5.0026, + "step": 80760 + }, + { + "epoch": 1.6431681315104165, + "grad_norm": 19.908994674682617, + "learning_rate": 6.387529488756873e-06, + "loss": 4.917, + "step": 80765 + }, + { + "epoch": 1.6432698567708335, + "grad_norm": 19.2406005859375, + "learning_rate": 6.387145504632409e-06, + "loss": 4.6219, + "step": 80770 + }, + { + "epoch": 1.64337158203125, + "grad_norm": 20.9529972076416, + "learning_rate": 6.386761511644526e-06, + "loss": 4.9498, + "step": 80775 + }, + { + "epoch": 1.6434733072916665, + "grad_norm": 28.6306209564209, + "learning_rate": 6.386377509795676e-06, + "loss": 5.0845, + "step": 80780 + }, + { + "epoch": 1.6435750325520835, + "grad_norm": 16.430273056030273, + "learning_rate": 6.385993499088314e-06, + "loss": 4.6116, + "step": 80785 + }, + { + "epoch": 1.6436767578125, + "grad_norm": 23.802947998046875, + "learning_rate": 6.385609479524893e-06, + "loss": 5.069, + "step": 80790 + }, + { + "epoch": 1.6437784830729165, + "grad_norm": 20.630538940429688, + "learning_rate": 6.385225451107868e-06, + "loss": 4.9475, + "step": 80795 + }, + { + "epoch": 1.6438802083333335, + "grad_norm": 30.469602584838867, + "learning_rate": 6.384841413839693e-06, + "loss": 4.9641, + "step": 80800 + }, + { + "epoch": 1.64398193359375, + "grad_norm": 17.774015426635742, + "learning_rate": 6.38445736772282e-06, + "loss": 4.9879, + "step": 80805 + }, + { + "epoch": 1.6440836588541665, + "grad_norm": 22.86528205871582, + "learning_rate": 6.384073312759704e-06, + "loss": 4.927, + "step": 80810 + }, + { + "epoch": 1.6441853841145835, + "grad_norm": 19.318899154663086, + "learning_rate": 6.3836892489528e-06, + "loss": 4.9622, + "step": 80815 + }, + { + "epoch": 1.644287109375, + "grad_norm": 17.162551879882812, + "learning_rate": 6.38330517630456e-06, + "loss": 4.8209, + "step": 80820 + }, + { + "epoch": 1.6443888346354165, + "grad_norm": 17.3750057220459, + "learning_rate": 6.3829210948174384e-06, + "loss": 4.8502, + "step": 80825 + }, + { + "epoch": 1.6444905598958335, + "grad_norm": 18.474512100219727, + "learning_rate": 6.382537004493889e-06, + "loss": 4.899, + "step": 80830 + }, + { + "epoch": 1.64459228515625, + "grad_norm": 17.502731323242188, + "learning_rate": 6.38215290533637e-06, + "loss": 4.961, + "step": 80835 + }, + { + "epoch": 1.6446940104166665, + "grad_norm": 19.049192428588867, + "learning_rate": 6.38176879734733e-06, + "loss": 4.923, + "step": 80840 + }, + { + "epoch": 1.6447957356770835, + "grad_norm": 16.95572280883789, + "learning_rate": 6.381384680529228e-06, + "loss": 4.8365, + "step": 80845 + }, + { + "epoch": 1.6448974609375, + "grad_norm": 17.47199058532715, + "learning_rate": 6.3810005548845155e-06, + "loss": 4.7705, + "step": 80850 + }, + { + "epoch": 1.6449991861979165, + "grad_norm": 14.259559631347656, + "learning_rate": 6.380616420415646e-06, + "loss": 5.1872, + "step": 80855 + }, + { + "epoch": 1.6451009114583335, + "grad_norm": 16.8327579498291, + "learning_rate": 6.380232277125079e-06, + "loss": 4.9075, + "step": 80860 + }, + { + "epoch": 1.64520263671875, + "grad_norm": 17.91512107849121, + "learning_rate": 6.3798481250152655e-06, + "loss": 4.7616, + "step": 80865 + }, + { + "epoch": 1.6453043619791665, + "grad_norm": 14.3180513381958, + "learning_rate": 6.379463964088659e-06, + "loss": 4.8466, + "step": 80870 + }, + { + "epoch": 1.6454060872395835, + "grad_norm": 21.422521591186523, + "learning_rate": 6.379079794347717e-06, + "loss": 5.027, + "step": 80875 + }, + { + "epoch": 1.6455078125, + "grad_norm": 21.34332847595215, + "learning_rate": 6.3786956157948924e-06, + "loss": 4.7641, + "step": 80880 + }, + { + "epoch": 1.6456095377604165, + "grad_norm": 16.040409088134766, + "learning_rate": 6.37831142843264e-06, + "loss": 4.8982, + "step": 80885 + }, + { + "epoch": 1.6457112630208335, + "grad_norm": 13.459758758544922, + "learning_rate": 6.3779272322634154e-06, + "loss": 5.2083, + "step": 80890 + }, + { + "epoch": 1.64581298828125, + "grad_norm": 23.152170181274414, + "learning_rate": 6.377543027289672e-06, + "loss": 4.8724, + "step": 80895 + }, + { + "epoch": 1.6459147135416665, + "grad_norm": 19.963716506958008, + "learning_rate": 6.377158813513869e-06, + "loss": 4.7053, + "step": 80900 + }, + { + "epoch": 1.6460164388020835, + "grad_norm": 22.603940963745117, + "learning_rate": 6.3767745909384575e-06, + "loss": 4.7269, + "step": 80905 + }, + { + "epoch": 1.6461181640625, + "grad_norm": 15.014775276184082, + "learning_rate": 6.376390359565891e-06, + "loss": 4.9468, + "step": 80910 + }, + { + "epoch": 1.6462198893229165, + "grad_norm": 14.574199676513672, + "learning_rate": 6.376006119398629e-06, + "loss": 4.7418, + "step": 80915 + }, + { + "epoch": 1.6463216145833335, + "grad_norm": 17.767648696899414, + "learning_rate": 6.375621870439123e-06, + "loss": 5.0078, + "step": 80920 + }, + { + "epoch": 1.64642333984375, + "grad_norm": 21.46990394592285, + "learning_rate": 6.375237612689832e-06, + "loss": 5.4304, + "step": 80925 + }, + { + "epoch": 1.6465250651041665, + "grad_norm": 17.75469398498535, + "learning_rate": 6.374853346153209e-06, + "loss": 5.1383, + "step": 80930 + }, + { + "epoch": 1.6466267903645835, + "grad_norm": 21.791345596313477, + "learning_rate": 6.374469070831708e-06, + "loss": 5.1395, + "step": 80935 + }, + { + "epoch": 1.646728515625, + "grad_norm": 20.495407104492188, + "learning_rate": 6.374084786727788e-06, + "loss": 4.8727, + "step": 80940 + }, + { + "epoch": 1.6468302408854165, + "grad_norm": 21.641590118408203, + "learning_rate": 6.3737004938439e-06, + "loss": 4.8696, + "step": 80945 + }, + { + "epoch": 1.6469319661458335, + "grad_norm": 18.400720596313477, + "learning_rate": 6.373316192182504e-06, + "loss": 4.7218, + "step": 80950 + }, + { + "epoch": 1.64703369140625, + "grad_norm": 28.448030471801758, + "learning_rate": 6.372931881746052e-06, + "loss": 5.1028, + "step": 80955 + }, + { + "epoch": 1.6471354166666665, + "grad_norm": 16.253934860229492, + "learning_rate": 6.372547562537002e-06, + "loss": 4.698, + "step": 80960 + }, + { + "epoch": 1.6472371419270835, + "grad_norm": 17.661788940429688, + "learning_rate": 6.3721632345578085e-06, + "loss": 4.9725, + "step": 80965 + }, + { + "epoch": 1.6473388671875, + "grad_norm": 17.534257888793945, + "learning_rate": 6.371778897810927e-06, + "loss": 4.7836, + "step": 80970 + }, + { + "epoch": 1.6474405924479165, + "grad_norm": 20.599491119384766, + "learning_rate": 6.371394552298815e-06, + "loss": 4.8187, + "step": 80975 + }, + { + "epoch": 1.6475423177083335, + "grad_norm": 18.604719161987305, + "learning_rate": 6.371010198023926e-06, + "loss": 4.6709, + "step": 80980 + }, + { + "epoch": 1.64764404296875, + "grad_norm": 23.21985626220703, + "learning_rate": 6.370625834988717e-06, + "loss": 5.1817, + "step": 80985 + }, + { + "epoch": 1.6477457682291665, + "grad_norm": 15.339303016662598, + "learning_rate": 6.370241463195644e-06, + "loss": 5.0602, + "step": 80990 + }, + { + "epoch": 1.6478474934895835, + "grad_norm": 20.48607063293457, + "learning_rate": 6.369857082647164e-06, + "loss": 5.0169, + "step": 80995 + }, + { + "epoch": 1.64794921875, + "grad_norm": 14.236988067626953, + "learning_rate": 6.3694726933457314e-06, + "loss": 4.7954, + "step": 81000 + }, + { + "epoch": 1.6480509440104165, + "grad_norm": 16.971261978149414, + "learning_rate": 6.369088295293804e-06, + "loss": 4.9755, + "step": 81005 + }, + { + "epoch": 1.6481526692708335, + "grad_norm": 15.959661483764648, + "learning_rate": 6.368703888493836e-06, + "loss": 4.898, + "step": 81010 + }, + { + "epoch": 1.64825439453125, + "grad_norm": 19.90723419189453, + "learning_rate": 6.368319472948285e-06, + "loss": 5.1855, + "step": 81015 + }, + { + "epoch": 1.6483561197916665, + "grad_norm": 18.128007888793945, + "learning_rate": 6.367935048659607e-06, + "loss": 4.9893, + "step": 81020 + }, + { + "epoch": 1.6484578450520835, + "grad_norm": 18.572139739990234, + "learning_rate": 6.367550615630258e-06, + "loss": 4.9982, + "step": 81025 + }, + { + "epoch": 1.6485595703125, + "grad_norm": 14.20007038116455, + "learning_rate": 6.367166173862695e-06, + "loss": 4.964, + "step": 81030 + }, + { + "epoch": 1.6486612955729165, + "grad_norm": 15.34067153930664, + "learning_rate": 6.3667817233593745e-06, + "loss": 4.8551, + "step": 81035 + }, + { + "epoch": 1.6487630208333335, + "grad_norm": 19.49968147277832, + "learning_rate": 6.366397264122751e-06, + "loss": 4.9305, + "step": 81040 + }, + { + "epoch": 1.64886474609375, + "grad_norm": 17.299442291259766, + "learning_rate": 6.366012796155284e-06, + "loss": 5.2323, + "step": 81045 + }, + { + "epoch": 1.6489664713541665, + "grad_norm": 14.583788871765137, + "learning_rate": 6.365628319459429e-06, + "loss": 4.7614, + "step": 81050 + }, + { + "epoch": 1.6490681966145835, + "grad_norm": 18.278379440307617, + "learning_rate": 6.365243834037642e-06, + "loss": 5.0017, + "step": 81055 + }, + { + "epoch": 1.649169921875, + "grad_norm": 18.971879959106445, + "learning_rate": 6.364859339892382e-06, + "loss": 5.0849, + "step": 81060 + }, + { + "epoch": 1.6492716471354165, + "grad_norm": 18.40877342224121, + "learning_rate": 6.364474837026101e-06, + "loss": 4.8954, + "step": 81065 + }, + { + "epoch": 1.6493733723958335, + "grad_norm": 18.695119857788086, + "learning_rate": 6.364090325441261e-06, + "loss": 4.9102, + "step": 81070 + }, + { + "epoch": 1.64947509765625, + "grad_norm": 20.76095199584961, + "learning_rate": 6.3637058051403164e-06, + "loss": 4.6822, + "step": 81075 + }, + { + "epoch": 1.6495768229166665, + "grad_norm": 19.483192443847656, + "learning_rate": 6.3633212761257245e-06, + "loss": 4.9358, + "step": 81080 + }, + { + "epoch": 1.6496785481770835, + "grad_norm": 19.371442794799805, + "learning_rate": 6.362936738399943e-06, + "loss": 4.8134, + "step": 81085 + }, + { + "epoch": 1.6497802734375, + "grad_norm": 23.279436111450195, + "learning_rate": 6.362552191965428e-06, + "loss": 4.6342, + "step": 81090 + }, + { + "epoch": 1.6498819986979165, + "grad_norm": 18.145849227905273, + "learning_rate": 6.362167636824636e-06, + "loss": 4.7175, + "step": 81095 + }, + { + "epoch": 1.6499837239583335, + "grad_norm": 20.077468872070312, + "learning_rate": 6.361783072980027e-06, + "loss": 5.0294, + "step": 81100 + }, + { + "epoch": 1.65008544921875, + "grad_norm": 17.56163215637207, + "learning_rate": 6.361398500434055e-06, + "loss": 4.826, + "step": 81105 + }, + { + "epoch": 1.6501871744791665, + "grad_norm": 17.780452728271484, + "learning_rate": 6.36101391918918e-06, + "loss": 5.005, + "step": 81110 + }, + { + "epoch": 1.6502888997395835, + "grad_norm": 24.29856300354004, + "learning_rate": 6.360629329247857e-06, + "loss": 4.8411, + "step": 81115 + }, + { + "epoch": 1.650390625, + "grad_norm": 14.079450607299805, + "learning_rate": 6.3602447306125445e-06, + "loss": 4.9531, + "step": 81120 + }, + { + "epoch": 1.6504923502604165, + "grad_norm": 17.48220443725586, + "learning_rate": 6.359860123285702e-06, + "loss": 5.1426, + "step": 81125 + }, + { + "epoch": 1.6505940755208335, + "grad_norm": 25.16912269592285, + "learning_rate": 6.3594755072697826e-06, + "loss": 4.7358, + "step": 81130 + }, + { + "epoch": 1.65069580078125, + "grad_norm": 18.183656692504883, + "learning_rate": 6.359090882567247e-06, + "loss": 4.8263, + "step": 81135 + }, + { + "epoch": 1.6507975260416665, + "grad_norm": 17.983339309692383, + "learning_rate": 6.358706249180552e-06, + "loss": 5.1446, + "step": 81140 + }, + { + "epoch": 1.6508992513020835, + "grad_norm": 18.93448829650879, + "learning_rate": 6.3583216071121555e-06, + "loss": 4.949, + "step": 81145 + }, + { + "epoch": 1.6510009765625, + "grad_norm": 18.048337936401367, + "learning_rate": 6.357936956364515e-06, + "loss": 5.0677, + "step": 81150 + }, + { + "epoch": 1.6511027018229165, + "grad_norm": 15.418044090270996, + "learning_rate": 6.3575522969400885e-06, + "loss": 4.8552, + "step": 81155 + }, + { + "epoch": 1.6512044270833335, + "grad_norm": 21.54730224609375, + "learning_rate": 6.357167628841334e-06, + "loss": 4.8746, + "step": 81160 + }, + { + "epoch": 1.65130615234375, + "grad_norm": 19.39606285095215, + "learning_rate": 6.35678295207071e-06, + "loss": 4.6817, + "step": 81165 + }, + { + "epoch": 1.6514078776041665, + "grad_norm": 18.66714859008789, + "learning_rate": 6.356398266630672e-06, + "loss": 4.7944, + "step": 81170 + }, + { + "epoch": 1.6515096028645835, + "grad_norm": 20.21660614013672, + "learning_rate": 6.35601357252368e-06, + "loss": 4.7856, + "step": 81175 + }, + { + "epoch": 1.651611328125, + "grad_norm": 13.549357414245605, + "learning_rate": 6.3556288697521926e-06, + "loss": 4.9764, + "step": 81180 + }, + { + "epoch": 1.6517130533854165, + "grad_norm": 20.12383460998535, + "learning_rate": 6.3552441583186665e-06, + "loss": 5.0659, + "step": 81185 + }, + { + "epoch": 1.6518147786458335, + "grad_norm": 18.5706787109375, + "learning_rate": 6.354859438225561e-06, + "loss": 4.7306, + "step": 81190 + }, + { + "epoch": 1.65191650390625, + "grad_norm": 15.046250343322754, + "learning_rate": 6.354474709475332e-06, + "loss": 4.9785, + "step": 81195 + }, + { + "epoch": 1.6520182291666665, + "grad_norm": 17.34564781188965, + "learning_rate": 6.354089972070442e-06, + "loss": 4.7347, + "step": 81200 + }, + { + "epoch": 1.6521199544270835, + "grad_norm": 15.587437629699707, + "learning_rate": 6.353705226013345e-06, + "loss": 5.0965, + "step": 81205 + }, + { + "epoch": 1.6522216796875, + "grad_norm": 16.42561912536621, + "learning_rate": 6.353320471306502e-06, + "loss": 4.8382, + "step": 81210 + }, + { + "epoch": 1.6523234049479165, + "grad_norm": 15.840572357177734, + "learning_rate": 6.352935707952372e-06, + "loss": 4.759, + "step": 81215 + }, + { + "epoch": 1.6524251302083335, + "grad_norm": 16.79633140563965, + "learning_rate": 6.35255093595341e-06, + "loss": 5.043, + "step": 81220 + }, + { + "epoch": 1.65252685546875, + "grad_norm": 14.530644416809082, + "learning_rate": 6.352166155312078e-06, + "loss": 4.9065, + "step": 81225 + }, + { + "epoch": 1.6526285807291665, + "grad_norm": 16.403554916381836, + "learning_rate": 6.351781366030834e-06, + "loss": 4.8684, + "step": 81230 + }, + { + "epoch": 1.6527303059895835, + "grad_norm": 19.680442810058594, + "learning_rate": 6.351396568112136e-06, + "loss": 4.7324, + "step": 81235 + }, + { + "epoch": 1.65283203125, + "grad_norm": 18.673748016357422, + "learning_rate": 6.3510117615584425e-06, + "loss": 4.8015, + "step": 81240 + }, + { + "epoch": 1.6529337565104165, + "grad_norm": 18.610750198364258, + "learning_rate": 6.3506269463722125e-06, + "loss": 4.8844, + "step": 81245 + }, + { + "epoch": 1.6530354817708335, + "grad_norm": 21.466955184936523, + "learning_rate": 6.350242122555905e-06, + "loss": 5.2614, + "step": 81250 + }, + { + "epoch": 1.65313720703125, + "grad_norm": 19.515018463134766, + "learning_rate": 6.34985729011198e-06, + "loss": 5.0048, + "step": 81255 + }, + { + "epoch": 1.6532389322916665, + "grad_norm": 13.566984176635742, + "learning_rate": 6.349472449042895e-06, + "loss": 5.1159, + "step": 81260 + }, + { + "epoch": 1.6533406575520835, + "grad_norm": 17.98697280883789, + "learning_rate": 6.349087599351109e-06, + "loss": 4.6136, + "step": 81265 + }, + { + "epoch": 1.6534423828125, + "grad_norm": 19.733985900878906, + "learning_rate": 6.348702741039081e-06, + "loss": 5.005, + "step": 81270 + }, + { + "epoch": 1.6535441080729165, + "grad_norm": 13.146855354309082, + "learning_rate": 6.3483178741092724e-06, + "loss": 4.8531, + "step": 81275 + }, + { + "epoch": 1.6536458333333335, + "grad_norm": 21.791034698486328, + "learning_rate": 6.3479329985641395e-06, + "loss": 4.8209, + "step": 81280 + }, + { + "epoch": 1.65374755859375, + "grad_norm": 14.045492172241211, + "learning_rate": 6.347548114406143e-06, + "loss": 4.7575, + "step": 81285 + }, + { + "epoch": 1.6538492838541665, + "grad_norm": 17.85917854309082, + "learning_rate": 6.347163221637741e-06, + "loss": 4.7438, + "step": 81290 + }, + { + "epoch": 1.6539510091145835, + "grad_norm": 15.612523078918457, + "learning_rate": 6.346778320261393e-06, + "loss": 4.8361, + "step": 81295 + }, + { + "epoch": 1.654052734375, + "grad_norm": 21.59895133972168, + "learning_rate": 6.346393410279561e-06, + "loss": 5.1065, + "step": 81300 + }, + { + "epoch": 1.6541544596354165, + "grad_norm": 24.232044219970703, + "learning_rate": 6.346008491694702e-06, + "loss": 5.1446, + "step": 81305 + }, + { + "epoch": 1.6542561848958335, + "grad_norm": 16.54058265686035, + "learning_rate": 6.345623564509275e-06, + "loss": 4.9207, + "step": 81310 + }, + { + "epoch": 1.65435791015625, + "grad_norm": 13.673179626464844, + "learning_rate": 6.345238628725743e-06, + "loss": 4.8331, + "step": 81315 + }, + { + "epoch": 1.6544596354166665, + "grad_norm": 19.406936645507812, + "learning_rate": 6.344853684346561e-06, + "loss": 4.8011, + "step": 81320 + }, + { + "epoch": 1.6545613606770835, + "grad_norm": 17.19759178161621, + "learning_rate": 6.344468731374191e-06, + "loss": 4.9844, + "step": 81325 + }, + { + "epoch": 1.6546630859375, + "grad_norm": 17.5775203704834, + "learning_rate": 6.344083769811093e-06, + "loss": 5.0159, + "step": 81330 + }, + { + "epoch": 1.6547648111979165, + "grad_norm": 17.338695526123047, + "learning_rate": 6.343698799659726e-06, + "loss": 5.0223, + "step": 81335 + }, + { + "epoch": 1.6548665364583335, + "grad_norm": 22.795528411865234, + "learning_rate": 6.343313820922552e-06, + "loss": 5.1445, + "step": 81340 + }, + { + "epoch": 1.65496826171875, + "grad_norm": 17.41619873046875, + "learning_rate": 6.3429288336020276e-06, + "loss": 4.9842, + "step": 81345 + }, + { + "epoch": 1.6550699869791665, + "grad_norm": 15.130722045898438, + "learning_rate": 6.342543837700614e-06, + "loss": 4.9455, + "step": 81350 + }, + { + "epoch": 1.6551717122395835, + "grad_norm": 17.885578155517578, + "learning_rate": 6.342158833220772e-06, + "loss": 4.9779, + "step": 81355 + }, + { + "epoch": 1.6552734375, + "grad_norm": 20.243146896362305, + "learning_rate": 6.34177382016496e-06, + "loss": 5.0165, + "step": 81360 + }, + { + "epoch": 1.6553751627604165, + "grad_norm": 20.909706115722656, + "learning_rate": 6.341388798535642e-06, + "loss": 5.072, + "step": 81365 + }, + { + "epoch": 1.6554768880208335, + "grad_norm": 27.43775177001953, + "learning_rate": 6.341003768335272e-06, + "loss": 5.0618, + "step": 81370 + }, + { + "epoch": 1.65557861328125, + "grad_norm": 16.363014221191406, + "learning_rate": 6.340618729566316e-06, + "loss": 4.6687, + "step": 81375 + }, + { + "epoch": 1.6556803385416665, + "grad_norm": 18.256288528442383, + "learning_rate": 6.34023368223123e-06, + "loss": 4.6235, + "step": 81380 + }, + { + "epoch": 1.6557820638020835, + "grad_norm": 13.885550498962402, + "learning_rate": 6.339848626332478e-06, + "loss": 5.2905, + "step": 81385 + }, + { + "epoch": 1.6558837890625, + "grad_norm": 16.63842010498047, + "learning_rate": 6.339463561872516e-06, + "loss": 4.8221, + "step": 81390 + }, + { + "epoch": 1.6559855143229165, + "grad_norm": 26.652450561523438, + "learning_rate": 6.339078488853811e-06, + "loss": 4.7906, + "step": 81395 + }, + { + "epoch": 1.6560872395833335, + "grad_norm": 18.638336181640625, + "learning_rate": 6.3386934072788164e-06, + "loss": 4.8952, + "step": 81400 + }, + { + "epoch": 1.65618896484375, + "grad_norm": 20.25375747680664, + "learning_rate": 6.338308317149997e-06, + "loss": 4.8874, + "step": 81405 + }, + { + "epoch": 1.6562906901041665, + "grad_norm": 18.460180282592773, + "learning_rate": 6.337923218469813e-06, + "loss": 5.0633, + "step": 81410 + }, + { + "epoch": 1.6563924153645835, + "grad_norm": 18.918088912963867, + "learning_rate": 6.337538111240723e-06, + "loss": 5.1738, + "step": 81415 + }, + { + "epoch": 1.656494140625, + "grad_norm": 17.88324546813965, + "learning_rate": 6.33715299546519e-06, + "loss": 4.7101, + "step": 81420 + }, + { + "epoch": 1.6565958658854165, + "grad_norm": 19.477937698364258, + "learning_rate": 6.336767871145673e-06, + "loss": 5.2146, + "step": 81425 + }, + { + "epoch": 1.6566975911458335, + "grad_norm": 19.45166015625, + "learning_rate": 6.336382738284634e-06, + "loss": 4.7346, + "step": 81430 + }, + { + "epoch": 1.65679931640625, + "grad_norm": 16.011672973632812, + "learning_rate": 6.335997596884535e-06, + "loss": 5.268, + "step": 81435 + }, + { + "epoch": 1.6569010416666665, + "grad_norm": 15.790066719055176, + "learning_rate": 6.335612446947833e-06, + "loss": 4.8434, + "step": 81440 + }, + { + "epoch": 1.6570027669270835, + "grad_norm": 18.13548469543457, + "learning_rate": 6.335227288476993e-06, + "loss": 4.9747, + "step": 81445 + }, + { + "epoch": 1.6571044921875, + "grad_norm": 20.141233444213867, + "learning_rate": 6.334842121474474e-06, + "loss": 5.0206, + "step": 81450 + }, + { + "epoch": 1.6572062174479165, + "grad_norm": 26.73401641845703, + "learning_rate": 6.334456945942738e-06, + "loss": 4.9614, + "step": 81455 + }, + { + "epoch": 1.6573079427083335, + "grad_norm": 16.917572021484375, + "learning_rate": 6.334071761884245e-06, + "loss": 4.9655, + "step": 81460 + }, + { + "epoch": 1.65740966796875, + "grad_norm": 20.01961898803711, + "learning_rate": 6.333686569301456e-06, + "loss": 4.8924, + "step": 81465 + }, + { + "epoch": 1.6575113932291665, + "grad_norm": 13.747541427612305, + "learning_rate": 6.333301368196836e-06, + "loss": 4.9353, + "step": 81470 + }, + { + "epoch": 1.6576131184895835, + "grad_norm": 16.179298400878906, + "learning_rate": 6.332916158572842e-06, + "loss": 5.0436, + "step": 81475 + }, + { + "epoch": 1.65771484375, + "grad_norm": 15.381621360778809, + "learning_rate": 6.332530940431936e-06, + "loss": 4.6896, + "step": 81480 + }, + { + "epoch": 1.6578165690104165, + "grad_norm": 18.311094284057617, + "learning_rate": 6.332145713776582e-06, + "loss": 5.1109, + "step": 81485 + }, + { + "epoch": 1.6579182942708335, + "grad_norm": 16.329191207885742, + "learning_rate": 6.331760478609238e-06, + "loss": 4.9213, + "step": 81490 + }, + { + "epoch": 1.65802001953125, + "grad_norm": 17.20502281188965, + "learning_rate": 6.331375234932369e-06, + "loss": 4.9559, + "step": 81495 + }, + { + "epoch": 1.6581217447916665, + "grad_norm": 18.692625045776367, + "learning_rate": 6.3309899827484336e-06, + "loss": 4.7978, + "step": 81500 + }, + { + "epoch": 1.6582234700520835, + "grad_norm": 24.413230895996094, + "learning_rate": 6.330604722059895e-06, + "loss": 4.9825, + "step": 81505 + }, + { + "epoch": 1.6583251953125, + "grad_norm": 20.67276954650879, + "learning_rate": 6.330219452869215e-06, + "loss": 4.8476, + "step": 81510 + }, + { + "epoch": 1.6584269205729165, + "grad_norm": 14.98585033416748, + "learning_rate": 6.329834175178855e-06, + "loss": 4.9279, + "step": 81515 + }, + { + "epoch": 1.6585286458333335, + "grad_norm": 17.961484909057617, + "learning_rate": 6.329448888991275e-06, + "loss": 4.8898, + "step": 81520 + }, + { + "epoch": 1.65863037109375, + "grad_norm": 13.961779594421387, + "learning_rate": 6.329063594308942e-06, + "loss": 4.6318, + "step": 81525 + }, + { + "epoch": 1.6587320963541665, + "grad_norm": 23.21220588684082, + "learning_rate": 6.328678291134312e-06, + "loss": 5.0139, + "step": 81530 + }, + { + "epoch": 1.6588338216145835, + "grad_norm": 16.479415893554688, + "learning_rate": 6.32829297946985e-06, + "loss": 4.9446, + "step": 81535 + }, + { + "epoch": 1.658935546875, + "grad_norm": 19.253427505493164, + "learning_rate": 6.327907659318018e-06, + "loss": 4.8605, + "step": 81540 + }, + { + "epoch": 1.6590372721354165, + "grad_norm": 15.24284553527832, + "learning_rate": 6.327522330681277e-06, + "loss": 5.0814, + "step": 81545 + }, + { + "epoch": 1.6591389973958335, + "grad_norm": 21.323915481567383, + "learning_rate": 6.327136993562091e-06, + "loss": 4.7812, + "step": 81550 + }, + { + "epoch": 1.65924072265625, + "grad_norm": 18.530086517333984, + "learning_rate": 6.326751647962919e-06, + "loss": 5.0106, + "step": 81555 + }, + { + "epoch": 1.6593424479166665, + "grad_norm": 14.993953704833984, + "learning_rate": 6.326366293886226e-06, + "loss": 4.8843, + "step": 81560 + }, + { + "epoch": 1.6594441731770835, + "grad_norm": 17.892427444458008, + "learning_rate": 6.325980931334474e-06, + "loss": 4.9927, + "step": 81565 + }, + { + "epoch": 1.6595458984375, + "grad_norm": 19.667678833007812, + "learning_rate": 6.325595560310123e-06, + "loss": 4.6565, + "step": 81570 + }, + { + "epoch": 1.6596476236979165, + "grad_norm": 16.376052856445312, + "learning_rate": 6.3252101808156395e-06, + "loss": 4.962, + "step": 81575 + }, + { + "epoch": 1.6597493489583335, + "grad_norm": 18.47883415222168, + "learning_rate": 6.324824792853482e-06, + "loss": 4.9154, + "step": 81580 + }, + { + "epoch": 1.65985107421875, + "grad_norm": 15.82834529876709, + "learning_rate": 6.324439396426116e-06, + "loss": 4.8389, + "step": 81585 + }, + { + "epoch": 1.6599527994791665, + "grad_norm": 22.1400089263916, + "learning_rate": 6.324053991536002e-06, + "loss": 5.0911, + "step": 81590 + }, + { + "epoch": 1.6600545247395835, + "grad_norm": 14.521541595458984, + "learning_rate": 6.323668578185602e-06, + "loss": 4.9274, + "step": 81595 + }, + { + "epoch": 1.66015625, + "grad_norm": 19.430469512939453, + "learning_rate": 6.323283156377382e-06, + "loss": 4.8042, + "step": 81600 + }, + { + "epoch": 1.6602579752604165, + "grad_norm": 16.119848251342773, + "learning_rate": 6.322897726113803e-06, + "loss": 5.1018, + "step": 81605 + }, + { + "epoch": 1.6603597005208335, + "grad_norm": 17.1912784576416, + "learning_rate": 6.322512287397326e-06, + "loss": 4.7607, + "step": 81610 + }, + { + "epoch": 1.66046142578125, + "grad_norm": 25.520849227905273, + "learning_rate": 6.322126840230416e-06, + "loss": 4.9718, + "step": 81615 + }, + { + "epoch": 1.6605631510416665, + "grad_norm": 11.817096710205078, + "learning_rate": 6.321741384615534e-06, + "loss": 4.9546, + "step": 81620 + }, + { + "epoch": 1.6606648763020835, + "grad_norm": 20.26352882385254, + "learning_rate": 6.321355920555146e-06, + "loss": 4.8129, + "step": 81625 + }, + { + "epoch": 1.6607666015625, + "grad_norm": 16.08393669128418, + "learning_rate": 6.320970448051712e-06, + "loss": 4.9397, + "step": 81630 + }, + { + "epoch": 1.6608683268229165, + "grad_norm": 20.400728225708008, + "learning_rate": 6.320584967107695e-06, + "loss": 4.5311, + "step": 81635 + }, + { + "epoch": 1.6609700520833335, + "grad_norm": 20.26088523864746, + "learning_rate": 6.3201994777255606e-06, + "loss": 5.0722, + "step": 81640 + }, + { + "epoch": 1.66107177734375, + "grad_norm": 19.338871002197266, + "learning_rate": 6.31981397990777e-06, + "loss": 4.9191, + "step": 81645 + }, + { + "epoch": 1.6611735026041665, + "grad_norm": 16.67496109008789, + "learning_rate": 6.319428473656788e-06, + "loss": 4.7027, + "step": 81650 + }, + { + "epoch": 1.6612752278645835, + "grad_norm": 20.883230209350586, + "learning_rate": 6.3190429589750765e-06, + "loss": 4.9564, + "step": 81655 + }, + { + "epoch": 1.661376953125, + "grad_norm": 14.7133150100708, + "learning_rate": 6.3186574358650985e-06, + "loss": 4.6484, + "step": 81660 + }, + { + "epoch": 1.6614786783854165, + "grad_norm": 14.857380867004395, + "learning_rate": 6.3182719043293195e-06, + "loss": 4.6328, + "step": 81665 + }, + { + "epoch": 1.6615804036458335, + "grad_norm": 17.862838745117188, + "learning_rate": 6.3178863643702e-06, + "loss": 5.0497, + "step": 81670 + }, + { + "epoch": 1.66168212890625, + "grad_norm": 20.393320083618164, + "learning_rate": 6.317500815990206e-06, + "loss": 4.8333, + "step": 81675 + }, + { + "epoch": 1.6617838541666665, + "grad_norm": 16.05791664123535, + "learning_rate": 6.317115259191801e-06, + "loss": 4.6739, + "step": 81680 + }, + { + "epoch": 1.6618855794270835, + "grad_norm": 16.028257369995117, + "learning_rate": 6.316729693977446e-06, + "loss": 4.7736, + "step": 81685 + }, + { + "epoch": 1.6619873046875, + "grad_norm": 22.160606384277344, + "learning_rate": 6.316344120349607e-06, + "loss": 4.8376, + "step": 81690 + }, + { + "epoch": 1.6620890299479165, + "grad_norm": 21.484905242919922, + "learning_rate": 6.315958538310747e-06, + "loss": 4.8652, + "step": 81695 + }, + { + "epoch": 1.6621907552083335, + "grad_norm": 17.665740966796875, + "learning_rate": 6.315572947863328e-06, + "loss": 5.0974, + "step": 81700 + }, + { + "epoch": 1.66229248046875, + "grad_norm": 20.534576416015625, + "learning_rate": 6.315187349009817e-06, + "loss": 5.1247, + "step": 81705 + }, + { + "epoch": 1.6623942057291665, + "grad_norm": 19.177820205688477, + "learning_rate": 6.3148017417526755e-06, + "loss": 5.0426, + "step": 81710 + }, + { + "epoch": 1.6624959309895835, + "grad_norm": 16.58751678466797, + "learning_rate": 6.314416126094369e-06, + "loss": 5.0824, + "step": 81715 + }, + { + "epoch": 1.66259765625, + "grad_norm": 16.335390090942383, + "learning_rate": 6.314030502037362e-06, + "loss": 4.9811, + "step": 81720 + }, + { + "epoch": 1.6626993815104165, + "grad_norm": 20.11294174194336, + "learning_rate": 6.313644869584115e-06, + "loss": 4.9333, + "step": 81725 + }, + { + "epoch": 1.6628011067708335, + "grad_norm": 13.646926879882812, + "learning_rate": 6.313259228737095e-06, + "loss": 4.5716, + "step": 81730 + }, + { + "epoch": 1.66290283203125, + "grad_norm": 23.137313842773438, + "learning_rate": 6.312873579498765e-06, + "loss": 4.6441, + "step": 81735 + }, + { + "epoch": 1.6630045572916665, + "grad_norm": 14.552658081054688, + "learning_rate": 6.312487921871591e-06, + "loss": 4.7767, + "step": 81740 + }, + { + "epoch": 1.6631062825520835, + "grad_norm": 17.16745376586914, + "learning_rate": 6.3121022558580345e-06, + "loss": 4.9047, + "step": 81745 + }, + { + "epoch": 1.6632080078125, + "grad_norm": 15.715396881103516, + "learning_rate": 6.311716581460561e-06, + "loss": 4.9002, + "step": 81750 + }, + { + "epoch": 1.6633097330729165, + "grad_norm": 16.775814056396484, + "learning_rate": 6.3113308986816355e-06, + "loss": 4.855, + "step": 81755 + }, + { + "epoch": 1.6634114583333335, + "grad_norm": 20.20855140686035, + "learning_rate": 6.310945207523722e-06, + "loss": 4.8816, + "step": 81760 + }, + { + "epoch": 1.66351318359375, + "grad_norm": 15.971122741699219, + "learning_rate": 6.3105595079892845e-06, + "loss": 4.878, + "step": 81765 + }, + { + "epoch": 1.6636149088541665, + "grad_norm": 21.97266960144043, + "learning_rate": 6.3101738000807875e-06, + "loss": 4.9992, + "step": 81770 + }, + { + "epoch": 1.6637166341145835, + "grad_norm": 18.93181037902832, + "learning_rate": 6.309788083800696e-06, + "loss": 5.3233, + "step": 81775 + }, + { + "epoch": 1.663818359375, + "grad_norm": 15.990411758422852, + "learning_rate": 6.309402359151474e-06, + "loss": 4.8733, + "step": 81780 + }, + { + "epoch": 1.6639200846354165, + "grad_norm": 12.520604133605957, + "learning_rate": 6.309016626135587e-06, + "loss": 4.7704, + "step": 81785 + }, + { + "epoch": 1.6640218098958335, + "grad_norm": 18.04343032836914, + "learning_rate": 6.308630884755499e-06, + "loss": 4.7473, + "step": 81790 + }, + { + "epoch": 1.66412353515625, + "grad_norm": 17.946287155151367, + "learning_rate": 6.308245135013676e-06, + "loss": 4.6115, + "step": 81795 + }, + { + "epoch": 1.6642252604166665, + "grad_norm": 17.573503494262695, + "learning_rate": 6.307859376912581e-06, + "loss": 4.7997, + "step": 81800 + }, + { + "epoch": 1.6643269856770835, + "grad_norm": 20.179800033569336, + "learning_rate": 6.30747361045468e-06, + "loss": 4.7888, + "step": 81805 + }, + { + "epoch": 1.6644287109375, + "grad_norm": 18.983877182006836, + "learning_rate": 6.307087835642438e-06, + "loss": 5.0576, + "step": 81810 + }, + { + "epoch": 1.6645304361979165, + "grad_norm": 18.522235870361328, + "learning_rate": 6.30670205247832e-06, + "loss": 4.9215, + "step": 81815 + }, + { + "epoch": 1.6646321614583335, + "grad_norm": 13.757329940795898, + "learning_rate": 6.30631626096479e-06, + "loss": 5.0138, + "step": 81820 + }, + { + "epoch": 1.66473388671875, + "grad_norm": 14.716339111328125, + "learning_rate": 6.305930461104315e-06, + "loss": 4.8926, + "step": 81825 + }, + { + "epoch": 1.6648356119791665, + "grad_norm": 23.58364486694336, + "learning_rate": 6.305544652899356e-06, + "loss": 4.8352, + "step": 81830 + }, + { + "epoch": 1.6649373372395835, + "grad_norm": 19.770488739013672, + "learning_rate": 6.305158836352384e-06, + "loss": 4.836, + "step": 81835 + }, + { + "epoch": 1.6650390625, + "grad_norm": 19.63115119934082, + "learning_rate": 6.304773011465859e-06, + "loss": 5.0177, + "step": 81840 + }, + { + "epoch": 1.6651407877604165, + "grad_norm": 22.660966873168945, + "learning_rate": 6.304387178242251e-06, + "loss": 4.8513, + "step": 81845 + }, + { + "epoch": 1.6652425130208335, + "grad_norm": 17.921886444091797, + "learning_rate": 6.304001336684022e-06, + "loss": 4.8762, + "step": 81850 + }, + { + "epoch": 1.66534423828125, + "grad_norm": 21.4168758392334, + "learning_rate": 6.303615486793637e-06, + "loss": 4.891, + "step": 81855 + }, + { + "epoch": 1.6654459635416665, + "grad_norm": 22.5517635345459, + "learning_rate": 6.303229628573565e-06, + "loss": 4.8322, + "step": 81860 + }, + { + "epoch": 1.6655476888020835, + "grad_norm": 21.161277770996094, + "learning_rate": 6.302843762026268e-06, + "loss": 5.1633, + "step": 81865 + }, + { + "epoch": 1.6656494140625, + "grad_norm": 15.342466354370117, + "learning_rate": 6.302457887154214e-06, + "loss": 4.9353, + "step": 81870 + }, + { + "epoch": 1.6657511393229165, + "grad_norm": 20.335542678833008, + "learning_rate": 6.3020720039598674e-06, + "loss": 4.852, + "step": 81875 + }, + { + "epoch": 1.6658528645833335, + "grad_norm": 16.19839859008789, + "learning_rate": 6.301686112445693e-06, + "loss": 4.7433, + "step": 81880 + }, + { + "epoch": 1.66595458984375, + "grad_norm": 26.297107696533203, + "learning_rate": 6.301300212614159e-06, + "loss": 4.8379, + "step": 81885 + }, + { + "epoch": 1.6660563151041665, + "grad_norm": 19.199756622314453, + "learning_rate": 6.300914304467728e-06, + "loss": 4.8959, + "step": 81890 + }, + { + "epoch": 1.6661580403645835, + "grad_norm": 17.738597869873047, + "learning_rate": 6.3005283880088686e-06, + "loss": 5.1261, + "step": 81895 + }, + { + "epoch": 1.666259765625, + "grad_norm": 16.79474639892578, + "learning_rate": 6.300142463240046e-06, + "loss": 4.8497, + "step": 81900 + }, + { + "epoch": 1.6663614908854165, + "grad_norm": 21.837404251098633, + "learning_rate": 6.299756530163725e-06, + "loss": 5.1557, + "step": 81905 + }, + { + "epoch": 1.6664632161458335, + "grad_norm": 17.648738861083984, + "learning_rate": 6.299370588782372e-06, + "loss": 4.7387, + "step": 81910 + }, + { + "epoch": 1.66656494140625, + "grad_norm": 15.45201301574707, + "learning_rate": 6.298984639098456e-06, + "loss": 4.7534, + "step": 81915 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 25.269290924072266, + "learning_rate": 6.2985986811144374e-06, + "loss": 4.6042, + "step": 81920 + }, + { + "epoch": 1.6667683919270835, + "grad_norm": 18.317211151123047, + "learning_rate": 6.298212714832786e-06, + "loss": 4.9531, + "step": 81925 + }, + { + "epoch": 1.6668701171875, + "grad_norm": 21.692825317382812, + "learning_rate": 6.2978267402559675e-06, + "loss": 4.7934, + "step": 81930 + }, + { + "epoch": 1.6669718424479165, + "grad_norm": 20.057174682617188, + "learning_rate": 6.297440757386451e-06, + "loss": 4.8912, + "step": 81935 + }, + { + "epoch": 1.6670735677083335, + "grad_norm": 22.418794631958008, + "learning_rate": 6.297054766226696e-06, + "loss": 4.9236, + "step": 81940 + }, + { + "epoch": 1.66717529296875, + "grad_norm": 16.65369987487793, + "learning_rate": 6.296668766779175e-06, + "loss": 4.8926, + "step": 81945 + }, + { + "epoch": 1.6672770182291665, + "grad_norm": 16.93004608154297, + "learning_rate": 6.296282759046353e-06, + "loss": 4.9631, + "step": 81950 + }, + { + "epoch": 1.6673787434895835, + "grad_norm": 20.665237426757812, + "learning_rate": 6.295896743030693e-06, + "loss": 4.9669, + "step": 81955 + }, + { + "epoch": 1.66748046875, + "grad_norm": 13.076620101928711, + "learning_rate": 6.295510718734665e-06, + "loss": 4.9206, + "step": 81960 + }, + { + "epoch": 1.6675821940104165, + "grad_norm": 20.319683074951172, + "learning_rate": 6.295124686160736e-06, + "loss": 4.9255, + "step": 81965 + }, + { + "epoch": 1.6676839192708335, + "grad_norm": 13.199480056762695, + "learning_rate": 6.2947386453113705e-06, + "loss": 4.9985, + "step": 81970 + }, + { + "epoch": 1.66778564453125, + "grad_norm": 19.22233009338379, + "learning_rate": 6.294352596189036e-06, + "loss": 4.7959, + "step": 81975 + }, + { + "epoch": 1.6678873697916665, + "grad_norm": 18.435565948486328, + "learning_rate": 6.2939665387962e-06, + "loss": 4.886, + "step": 81980 + }, + { + "epoch": 1.6679890950520835, + "grad_norm": 15.750076293945312, + "learning_rate": 6.293580473135328e-06, + "loss": 4.8081, + "step": 81985 + }, + { + "epoch": 1.6680908203125, + "grad_norm": 29.37137794494629, + "learning_rate": 6.293194399208888e-06, + "loss": 5.0008, + "step": 81990 + }, + { + "epoch": 1.6681925455729165, + "grad_norm": 14.727178573608398, + "learning_rate": 6.292808317019345e-06, + "loss": 4.8922, + "step": 81995 + }, + { + "epoch": 1.6682942708333335, + "grad_norm": 14.76876163482666, + "learning_rate": 6.292422226569168e-06, + "loss": 5.0604, + "step": 82000 + }, + { + "epoch": 1.66839599609375, + "grad_norm": 19.479084014892578, + "learning_rate": 6.292036127860825e-06, + "loss": 4.8129, + "step": 82005 + }, + { + "epoch": 1.6684977213541665, + "grad_norm": 17.436481475830078, + "learning_rate": 6.291650020896778e-06, + "loss": 4.923, + "step": 82010 + }, + { + "epoch": 1.6685994466145835, + "grad_norm": 18.99956703186035, + "learning_rate": 6.2912639056795e-06, + "loss": 4.861, + "step": 82015 + }, + { + "epoch": 1.668701171875, + "grad_norm": 15.401307106018066, + "learning_rate": 6.290877782211453e-06, + "loss": 4.9603, + "step": 82020 + }, + { + "epoch": 1.6688028971354165, + "grad_norm": 17.6651668548584, + "learning_rate": 6.2904916504951096e-06, + "loss": 4.8451, + "step": 82025 + }, + { + "epoch": 1.6689046223958335, + "grad_norm": 20.224050521850586, + "learning_rate": 6.290105510532934e-06, + "loss": 4.865, + "step": 82030 + }, + { + "epoch": 1.66900634765625, + "grad_norm": 16.096248626708984, + "learning_rate": 6.289719362327391e-06, + "loss": 4.8034, + "step": 82035 + }, + { + "epoch": 1.6691080729166665, + "grad_norm": 21.926891326904297, + "learning_rate": 6.289333205880953e-06, + "loss": 4.9351, + "step": 82040 + }, + { + "epoch": 1.6692097981770835, + "grad_norm": 16.51157569885254, + "learning_rate": 6.288947041196083e-06, + "loss": 4.6717, + "step": 82045 + }, + { + "epoch": 1.6693115234375, + "grad_norm": 15.594719886779785, + "learning_rate": 6.288560868275252e-06, + "loss": 5.1301, + "step": 82050 + }, + { + "epoch": 1.6694132486979165, + "grad_norm": 19.60822105407715, + "learning_rate": 6.288174687120927e-06, + "loss": 4.9054, + "step": 82055 + }, + { + "epoch": 1.6695149739583335, + "grad_norm": 18.10324478149414, + "learning_rate": 6.287788497735572e-06, + "loss": 4.9842, + "step": 82060 + }, + { + "epoch": 1.66961669921875, + "grad_norm": 16.451675415039062, + "learning_rate": 6.28740230012166e-06, + "loss": 4.9264, + "step": 82065 + }, + { + "epoch": 1.6697184244791665, + "grad_norm": 14.925625801086426, + "learning_rate": 6.287016094281655e-06, + "loss": 5.1752, + "step": 82070 + }, + { + "epoch": 1.6698201497395835, + "grad_norm": 23.909175872802734, + "learning_rate": 6.2866298802180255e-06, + "loss": 5.244, + "step": 82075 + }, + { + "epoch": 1.669921875, + "grad_norm": 18.107318878173828, + "learning_rate": 6.28624365793324e-06, + "loss": 5.0628, + "step": 82080 + }, + { + "epoch": 1.6700236002604165, + "grad_norm": 14.775787353515625, + "learning_rate": 6.285857427429764e-06, + "loss": 5.2483, + "step": 82085 + }, + { + "epoch": 1.6701253255208335, + "grad_norm": 22.868247985839844, + "learning_rate": 6.2854711887100706e-06, + "loss": 4.8819, + "step": 82090 + }, + { + "epoch": 1.67022705078125, + "grad_norm": 20.8818302154541, + "learning_rate": 6.285084941776622e-06, + "loss": 4.8264, + "step": 82095 + }, + { + "epoch": 1.6703287760416665, + "grad_norm": 17.850513458251953, + "learning_rate": 6.284698686631889e-06, + "loss": 5.1949, + "step": 82100 + }, + { + "epoch": 1.6704305013020835, + "grad_norm": 13.266267776489258, + "learning_rate": 6.28431242327834e-06, + "loss": 4.9667, + "step": 82105 + }, + { + "epoch": 1.6705322265625, + "grad_norm": 17.048677444458008, + "learning_rate": 6.283926151718441e-06, + "loss": 4.9681, + "step": 82110 + }, + { + "epoch": 1.6706339518229165, + "grad_norm": 20.814102172851562, + "learning_rate": 6.283539871954662e-06, + "loss": 4.697, + "step": 82115 + }, + { + "epoch": 1.6707356770833335, + "grad_norm": 15.860257148742676, + "learning_rate": 6.283153583989472e-06, + "loss": 4.8033, + "step": 82120 + }, + { + "epoch": 1.67083740234375, + "grad_norm": 13.802936553955078, + "learning_rate": 6.282767287825335e-06, + "loss": 4.7935, + "step": 82125 + }, + { + "epoch": 1.6709391276041665, + "grad_norm": 21.304397583007812, + "learning_rate": 6.282380983464725e-06, + "loss": 4.7564, + "step": 82130 + }, + { + "epoch": 1.6710408528645835, + "grad_norm": 21.234270095825195, + "learning_rate": 6.281994670910108e-06, + "loss": 4.7961, + "step": 82135 + }, + { + "epoch": 1.671142578125, + "grad_norm": 16.89966583251953, + "learning_rate": 6.281608350163949e-06, + "loss": 4.8334, + "step": 82140 + }, + { + "epoch": 1.6712443033854165, + "grad_norm": 18.535573959350586, + "learning_rate": 6.281222021228721e-06, + "loss": 4.9379, + "step": 82145 + }, + { + "epoch": 1.6713460286458335, + "grad_norm": 20.479244232177734, + "learning_rate": 6.280835684106891e-06, + "loss": 5.0773, + "step": 82150 + }, + { + "epoch": 1.67144775390625, + "grad_norm": 17.595840454101562, + "learning_rate": 6.280449338800928e-06, + "loss": 4.8167, + "step": 82155 + }, + { + "epoch": 1.6715494791666665, + "grad_norm": 18.362346649169922, + "learning_rate": 6.2800629853133e-06, + "loss": 4.7758, + "step": 82160 + }, + { + "epoch": 1.6716512044270835, + "grad_norm": 12.764823913574219, + "learning_rate": 6.279676623646476e-06, + "loss": 4.8066, + "step": 82165 + }, + { + "epoch": 1.6717529296875, + "grad_norm": 16.101640701293945, + "learning_rate": 6.279290253802923e-06, + "loss": 4.8999, + "step": 82170 + }, + { + "epoch": 1.6718546549479165, + "grad_norm": 15.222220420837402, + "learning_rate": 6.278903875785113e-06, + "loss": 5.0081, + "step": 82175 + }, + { + "epoch": 1.6719563802083335, + "grad_norm": 19.611906051635742, + "learning_rate": 6.278517489595513e-06, + "loss": 4.7648, + "step": 82180 + }, + { + "epoch": 1.67205810546875, + "grad_norm": 17.045841217041016, + "learning_rate": 6.2781310952365915e-06, + "loss": 4.9863, + "step": 82185 + }, + { + "epoch": 1.6721598307291665, + "grad_norm": 16.71947479248047, + "learning_rate": 6.277744692710817e-06, + "loss": 5.0014, + "step": 82190 + }, + { + "epoch": 1.6722615559895835, + "grad_norm": 17.52019691467285, + "learning_rate": 6.277358282020663e-06, + "loss": 4.6901, + "step": 82195 + }, + { + "epoch": 1.67236328125, + "grad_norm": 16.129201889038086, + "learning_rate": 6.276971863168593e-06, + "loss": 5.0026, + "step": 82200 + }, + { + "epoch": 1.6724650065104165, + "grad_norm": 14.257380485534668, + "learning_rate": 6.2765854361570775e-06, + "loss": 4.7317, + "step": 82205 + }, + { + "epoch": 1.6725667317708335, + "grad_norm": 15.88701343536377, + "learning_rate": 6.276199000988587e-06, + "loss": 4.7217, + "step": 82210 + }, + { + "epoch": 1.67266845703125, + "grad_norm": 18.5660400390625, + "learning_rate": 6.275812557665591e-06, + "loss": 5.0026, + "step": 82215 + }, + { + "epoch": 1.6727701822916665, + "grad_norm": 24.773942947387695, + "learning_rate": 6.275426106190557e-06, + "loss": 5.0545, + "step": 82220 + }, + { + "epoch": 1.6728719075520835, + "grad_norm": 21.724517822265625, + "learning_rate": 6.2750396465659545e-06, + "loss": 4.7248, + "step": 82225 + }, + { + "epoch": 1.6729736328125, + "grad_norm": 19.466964721679688, + "learning_rate": 6.274653178794252e-06, + "loss": 4.7522, + "step": 82230 + }, + { + "epoch": 1.6730753580729165, + "grad_norm": 19.56180763244629, + "learning_rate": 6.274266702877923e-06, + "loss": 4.914, + "step": 82235 + }, + { + "epoch": 1.6731770833333335, + "grad_norm": 16.513999938964844, + "learning_rate": 6.273880218819433e-06, + "loss": 4.7344, + "step": 82240 + }, + { + "epoch": 1.67327880859375, + "grad_norm": 15.535037994384766, + "learning_rate": 6.273493726621253e-06, + "loss": 4.7619, + "step": 82245 + }, + { + "epoch": 1.6733805338541665, + "grad_norm": 15.766763687133789, + "learning_rate": 6.273107226285854e-06, + "loss": 5.1164, + "step": 82250 + }, + { + "epoch": 1.6734822591145835, + "grad_norm": 19.62434959411621, + "learning_rate": 6.272720717815701e-06, + "loss": 5.1646, + "step": 82255 + }, + { + "epoch": 1.673583984375, + "grad_norm": 18.162532806396484, + "learning_rate": 6.272334201213269e-06, + "loss": 4.9638, + "step": 82260 + }, + { + "epoch": 1.6736857096354165, + "grad_norm": 19.32564353942871, + "learning_rate": 6.271947676481025e-06, + "loss": 4.9717, + "step": 82265 + }, + { + "epoch": 1.6737874348958335, + "grad_norm": 14.479071617126465, + "learning_rate": 6.271561143621437e-06, + "loss": 4.8916, + "step": 82270 + }, + { + "epoch": 1.67388916015625, + "grad_norm": 24.007654190063477, + "learning_rate": 6.27117460263698e-06, + "loss": 4.7917, + "step": 82275 + }, + { + "epoch": 1.6739908854166665, + "grad_norm": 20.206745147705078, + "learning_rate": 6.270788053530118e-06, + "loss": 4.8413, + "step": 82280 + }, + { + "epoch": 1.6740926106770835, + "grad_norm": 13.446179389953613, + "learning_rate": 6.2704014963033255e-06, + "loss": 4.8337, + "step": 82285 + }, + { + "epoch": 1.6741943359375, + "grad_norm": 28.231748580932617, + "learning_rate": 6.270014930959071e-06, + "loss": 5.1108, + "step": 82290 + }, + { + "epoch": 1.6742960611979165, + "grad_norm": 21.24274444580078, + "learning_rate": 6.269628357499823e-06, + "loss": 4.9271, + "step": 82295 + }, + { + "epoch": 1.6743977864583335, + "grad_norm": 20.534276962280273, + "learning_rate": 6.269241775928053e-06, + "loss": 4.7915, + "step": 82300 + }, + { + "epoch": 1.67449951171875, + "grad_norm": 21.973617553710938, + "learning_rate": 6.268855186246231e-06, + "loss": 4.9422, + "step": 82305 + }, + { + "epoch": 1.6746012369791665, + "grad_norm": 15.90302848815918, + "learning_rate": 6.268468588456829e-06, + "loss": 4.9943, + "step": 82310 + }, + { + "epoch": 1.6747029622395835, + "grad_norm": 16.362464904785156, + "learning_rate": 6.268081982562313e-06, + "loss": 4.7146, + "step": 82315 + }, + { + "epoch": 1.6748046875, + "grad_norm": 17.688783645629883, + "learning_rate": 6.267695368565157e-06, + "loss": 4.9939, + "step": 82320 + }, + { + "epoch": 1.6749064127604165, + "grad_norm": 16.242630004882812, + "learning_rate": 6.2673087464678284e-06, + "loss": 4.5987, + "step": 82325 + }, + { + "epoch": 1.6750081380208335, + "grad_norm": 17.958749771118164, + "learning_rate": 6.2669221162728e-06, + "loss": 4.6668, + "step": 82330 + }, + { + "epoch": 1.67510986328125, + "grad_norm": 16.28798484802246, + "learning_rate": 6.266535477982541e-06, + "loss": 5.0924, + "step": 82335 + }, + { + "epoch": 1.6752115885416665, + "grad_norm": 20.381206512451172, + "learning_rate": 6.266148831599525e-06, + "loss": 4.7683, + "step": 82340 + }, + { + "epoch": 1.6753133138020835, + "grad_norm": 18.427297592163086, + "learning_rate": 6.265762177126217e-06, + "loss": 4.9161, + "step": 82345 + }, + { + "epoch": 1.6754150390625, + "grad_norm": 24.998807907104492, + "learning_rate": 6.265375514565092e-06, + "loss": 4.9461, + "step": 82350 + }, + { + "epoch": 1.6755167643229165, + "grad_norm": 16.76968002319336, + "learning_rate": 6.264988843918619e-06, + "loss": 4.9656, + "step": 82355 + }, + { + "epoch": 1.6756184895833335, + "grad_norm": 18.3514461517334, + "learning_rate": 6.264602165189267e-06, + "loss": 4.9439, + "step": 82360 + }, + { + "epoch": 1.67572021484375, + "grad_norm": 14.827873229980469, + "learning_rate": 6.264215478379512e-06, + "loss": 4.6292, + "step": 82365 + }, + { + "epoch": 1.6758219401041665, + "grad_norm": 16.41152572631836, + "learning_rate": 6.263828783491818e-06, + "loss": 4.865, + "step": 82370 + }, + { + "epoch": 1.6759236653645835, + "grad_norm": 20.077211380004883, + "learning_rate": 6.2634420805286614e-06, + "loss": 4.9538, + "step": 82375 + }, + { + "epoch": 1.676025390625, + "grad_norm": 19.28754234313965, + "learning_rate": 6.26305536949251e-06, + "loss": 5.1253, + "step": 82380 + }, + { + "epoch": 1.6761271158854165, + "grad_norm": 22.932897567749023, + "learning_rate": 6.262668650385836e-06, + "loss": 4.7648, + "step": 82385 + }, + { + "epoch": 1.6762288411458335, + "grad_norm": 20.90911293029785, + "learning_rate": 6.262281923211111e-06, + "loss": 4.6634, + "step": 82390 + }, + { + "epoch": 1.67633056640625, + "grad_norm": 30.537525177001953, + "learning_rate": 6.261895187970804e-06, + "loss": 5.0808, + "step": 82395 + }, + { + "epoch": 1.6764322916666665, + "grad_norm": 18.63669776916504, + "learning_rate": 6.261508444667388e-06, + "loss": 4.812, + "step": 82400 + }, + { + "epoch": 1.6765340169270835, + "grad_norm": 15.450441360473633, + "learning_rate": 6.261121693303333e-06, + "loss": 4.6233, + "step": 82405 + }, + { + "epoch": 1.6766357421875, + "grad_norm": 26.886629104614258, + "learning_rate": 6.26073493388111e-06, + "loss": 4.717, + "step": 82410 + }, + { + "epoch": 1.6767374674479165, + "grad_norm": 19.65818214416504, + "learning_rate": 6.260348166403194e-06, + "loss": 4.9391, + "step": 82415 + }, + { + "epoch": 1.6768391927083335, + "grad_norm": 17.111005783081055, + "learning_rate": 6.259961390872051e-06, + "loss": 4.936, + "step": 82420 + }, + { + "epoch": 1.67694091796875, + "grad_norm": 17.296281814575195, + "learning_rate": 6.259574607290155e-06, + "loss": 5.16, + "step": 82425 + }, + { + "epoch": 1.6770426432291665, + "grad_norm": 15.960112571716309, + "learning_rate": 6.259187815659977e-06, + "loss": 4.9178, + "step": 82430 + }, + { + "epoch": 1.6771443684895835, + "grad_norm": 18.856414794921875, + "learning_rate": 6.258801015983988e-06, + "loss": 4.825, + "step": 82435 + }, + { + "epoch": 1.67724609375, + "grad_norm": 21.37535285949707, + "learning_rate": 6.258414208264661e-06, + "loss": 4.864, + "step": 82440 + }, + { + "epoch": 1.6773478190104165, + "grad_norm": 25.241344451904297, + "learning_rate": 6.258027392504467e-06, + "loss": 5.3183, + "step": 82445 + }, + { + "epoch": 1.6774495442708335, + "grad_norm": 17.79130744934082, + "learning_rate": 6.257640568705876e-06, + "loss": 4.89, + "step": 82450 + }, + { + "epoch": 1.67755126953125, + "grad_norm": 17.554662704467773, + "learning_rate": 6.2572537368713615e-06, + "loss": 4.8589, + "step": 82455 + }, + { + "epoch": 1.6776529947916665, + "grad_norm": 18.427593231201172, + "learning_rate": 6.256866897003396e-06, + "loss": 4.8372, + "step": 82460 + }, + { + "epoch": 1.6777547200520835, + "grad_norm": 19.345767974853516, + "learning_rate": 6.256480049104448e-06, + "loss": 4.9266, + "step": 82465 + }, + { + "epoch": 1.6778564453125, + "grad_norm": 17.401094436645508, + "learning_rate": 6.256093193176992e-06, + "loss": 4.9098, + "step": 82470 + }, + { + "epoch": 1.6779581705729165, + "grad_norm": 21.837451934814453, + "learning_rate": 6.255706329223499e-06, + "loss": 5.0763, + "step": 82475 + }, + { + "epoch": 1.6780598958333335, + "grad_norm": 16.27921485900879, + "learning_rate": 6.255319457246442e-06, + "loss": 5.0609, + "step": 82480 + }, + { + "epoch": 1.67816162109375, + "grad_norm": 14.849989891052246, + "learning_rate": 6.254932577248292e-06, + "loss": 4.9919, + "step": 82485 + }, + { + "epoch": 1.6782633463541665, + "grad_norm": 21.412757873535156, + "learning_rate": 6.254545689231521e-06, + "loss": 4.9509, + "step": 82490 + }, + { + "epoch": 1.6783650716145835, + "grad_norm": 15.504704475402832, + "learning_rate": 6.254158793198601e-06, + "loss": 4.6267, + "step": 82495 + }, + { + "epoch": 1.678466796875, + "grad_norm": 19.98439598083496, + "learning_rate": 6.253771889152005e-06, + "loss": 4.8084, + "step": 82500 + }, + { + "epoch": 1.6785685221354165, + "grad_norm": 17.870962142944336, + "learning_rate": 6.2533849770942055e-06, + "loss": 4.8002, + "step": 82505 + }, + { + "epoch": 1.6786702473958335, + "grad_norm": 23.69552230834961, + "learning_rate": 6.252998057027673e-06, + "loss": 4.4624, + "step": 82510 + }, + { + "epoch": 1.67877197265625, + "grad_norm": 15.159432411193848, + "learning_rate": 6.2526111289548795e-06, + "loss": 4.7926, + "step": 82515 + }, + { + "epoch": 1.6788736979166665, + "grad_norm": 20.765256881713867, + "learning_rate": 6.2522241928783e-06, + "loss": 4.8926, + "step": 82520 + }, + { + "epoch": 1.6789754231770835, + "grad_norm": 17.26991844177246, + "learning_rate": 6.251837248800404e-06, + "loss": 4.9227, + "step": 82525 + }, + { + "epoch": 1.6790771484375, + "grad_norm": 18.75740623474121, + "learning_rate": 6.251450296723667e-06, + "loss": 4.7354, + "step": 82530 + }, + { + "epoch": 1.6791788736979165, + "grad_norm": 19.808944702148438, + "learning_rate": 6.25106333665056e-06, + "loss": 4.6863, + "step": 82535 + }, + { + "epoch": 1.6792805989583335, + "grad_norm": 21.37550163269043, + "learning_rate": 6.250676368583554e-06, + "loss": 4.7942, + "step": 82540 + }, + { + "epoch": 1.67938232421875, + "grad_norm": 18.501445770263672, + "learning_rate": 6.250289392525124e-06, + "loss": 4.9415, + "step": 82545 + }, + { + "epoch": 1.6794840494791665, + "grad_norm": 20.640775680541992, + "learning_rate": 6.2499024084777405e-06, + "loss": 5.0934, + "step": 82550 + }, + { + "epoch": 1.6795857747395835, + "grad_norm": 28.254192352294922, + "learning_rate": 6.249515416443878e-06, + "loss": 4.9772, + "step": 82555 + }, + { + "epoch": 1.6796875, + "grad_norm": 21.32723045349121, + "learning_rate": 6.24912841642601e-06, + "loss": 4.8377, + "step": 82560 + }, + { + "epoch": 1.6797892252604165, + "grad_norm": 19.87507438659668, + "learning_rate": 6.248741408426606e-06, + "loss": 4.9283, + "step": 82565 + }, + { + "epoch": 1.6798909505208335, + "grad_norm": 17.917646408081055, + "learning_rate": 6.248354392448141e-06, + "loss": 4.6967, + "step": 82570 + }, + { + "epoch": 1.67999267578125, + "grad_norm": 19.770288467407227, + "learning_rate": 6.247967368493089e-06, + "loss": 4.7074, + "step": 82575 + }, + { + "epoch": 1.6800944010416665, + "grad_norm": 21.10219383239746, + "learning_rate": 6.247580336563921e-06, + "loss": 4.8719, + "step": 82580 + }, + { + "epoch": 1.6801961263020835, + "grad_norm": 17.123565673828125, + "learning_rate": 6.24719329666311e-06, + "loss": 4.61, + "step": 82585 + }, + { + "epoch": 1.6802978515625, + "grad_norm": 16.316165924072266, + "learning_rate": 6.246806248793131e-06, + "loss": 4.8869, + "step": 82590 + }, + { + "epoch": 1.6803995768229165, + "grad_norm": 19.275615692138672, + "learning_rate": 6.2464191929564545e-06, + "loss": 4.8723, + "step": 82595 + }, + { + "epoch": 1.6805013020833335, + "grad_norm": 16.006458282470703, + "learning_rate": 6.246032129155556e-06, + "loss": 4.9265, + "step": 82600 + }, + { + "epoch": 1.68060302734375, + "grad_norm": 19.925945281982422, + "learning_rate": 6.2456450573929075e-06, + "loss": 4.6616, + "step": 82605 + }, + { + "epoch": 1.6807047526041665, + "grad_norm": 23.185226440429688, + "learning_rate": 6.245257977670982e-06, + "loss": 5.1129, + "step": 82610 + }, + { + "epoch": 1.6808064778645835, + "grad_norm": 22.52745246887207, + "learning_rate": 6.244870889992254e-06, + "loss": 4.9342, + "step": 82615 + }, + { + "epoch": 1.680908203125, + "grad_norm": 18.22064781188965, + "learning_rate": 6.2444837943591966e-06, + "loss": 4.7343, + "step": 82620 + }, + { + "epoch": 1.6810099283854165, + "grad_norm": 18.275108337402344, + "learning_rate": 6.244096690774282e-06, + "loss": 4.883, + "step": 82625 + }, + { + "epoch": 1.6811116536458335, + "grad_norm": 14.561219215393066, + "learning_rate": 6.243709579239984e-06, + "loss": 4.6807, + "step": 82630 + }, + { + "epoch": 1.68121337890625, + "grad_norm": 20.61600112915039, + "learning_rate": 6.243322459758778e-06, + "loss": 5.1266, + "step": 82635 + }, + { + "epoch": 1.6813151041666665, + "grad_norm": 14.475400924682617, + "learning_rate": 6.242935332333135e-06, + "loss": 4.8944, + "step": 82640 + }, + { + "epoch": 1.6814168294270835, + "grad_norm": 19.22689437866211, + "learning_rate": 6.24254819696553e-06, + "loss": 4.9651, + "step": 82645 + }, + { + "epoch": 1.6815185546875, + "grad_norm": 20.62044334411621, + "learning_rate": 6.242161053658438e-06, + "loss": 4.7537, + "step": 82650 + }, + { + "epoch": 1.6816202799479165, + "grad_norm": 24.11125946044922, + "learning_rate": 6.241773902414328e-06, + "loss": 5.1148, + "step": 82655 + }, + { + "epoch": 1.6817220052083335, + "grad_norm": 17.31456756591797, + "learning_rate": 6.24138674323568e-06, + "loss": 4.9757, + "step": 82660 + }, + { + "epoch": 1.68182373046875, + "grad_norm": 16.805679321289062, + "learning_rate": 6.240999576124964e-06, + "loss": 4.9527, + "step": 82665 + }, + { + "epoch": 1.6819254557291665, + "grad_norm": 24.676193237304688, + "learning_rate": 6.240612401084654e-06, + "loss": 4.844, + "step": 82670 + }, + { + "epoch": 1.6820271809895835, + "grad_norm": 21.05967140197754, + "learning_rate": 6.240225218117225e-06, + "loss": 4.9118, + "step": 82675 + }, + { + "epoch": 1.68212890625, + "grad_norm": 19.06972312927246, + "learning_rate": 6.239838027225149e-06, + "loss": 5.0591, + "step": 82680 + }, + { + "epoch": 1.6822306315104165, + "grad_norm": 21.98724365234375, + "learning_rate": 6.239450828410904e-06, + "loss": 4.8107, + "step": 82685 + }, + { + "epoch": 1.6823323567708335, + "grad_norm": 16.437686920166016, + "learning_rate": 6.239063621676962e-06, + "loss": 5.0543, + "step": 82690 + }, + { + "epoch": 1.68243408203125, + "grad_norm": 18.550533294677734, + "learning_rate": 6.2386764070257944e-06, + "loss": 4.6993, + "step": 82695 + }, + { + "epoch": 1.6825358072916665, + "grad_norm": 17.67921257019043, + "learning_rate": 6.238289184459879e-06, + "loss": 4.7542, + "step": 82700 + }, + { + "epoch": 1.6826375325520835, + "grad_norm": 14.910987854003906, + "learning_rate": 6.237901953981687e-06, + "loss": 5.063, + "step": 82705 + }, + { + "epoch": 1.6827392578125, + "grad_norm": 17.339012145996094, + "learning_rate": 6.237514715593697e-06, + "loss": 4.996, + "step": 82710 + }, + { + "epoch": 1.6828409830729165, + "grad_norm": 19.514175415039062, + "learning_rate": 6.23712746929838e-06, + "loss": 4.7567, + "step": 82715 + }, + { + "epoch": 1.6829427083333335, + "grad_norm": 19.113832473754883, + "learning_rate": 6.236740215098212e-06, + "loss": 4.8919, + "step": 82720 + }, + { + "epoch": 1.68304443359375, + "grad_norm": 18.909948348999023, + "learning_rate": 6.236352952995665e-06, + "loss": 5.0044, + "step": 82725 + }, + { + "epoch": 1.6831461588541665, + "grad_norm": 21.19529151916504, + "learning_rate": 6.2359656829932155e-06, + "loss": 4.9093, + "step": 82730 + }, + { + "epoch": 1.6832478841145835, + "grad_norm": 17.634544372558594, + "learning_rate": 6.2355784050933375e-06, + "loss": 4.9586, + "step": 82735 + }, + { + "epoch": 1.683349609375, + "grad_norm": 19.31044578552246, + "learning_rate": 6.235191119298506e-06, + "loss": 5.1584, + "step": 82740 + }, + { + "epoch": 1.6834513346354165, + "grad_norm": 21.361913681030273, + "learning_rate": 6.234803825611195e-06, + "loss": 4.8252, + "step": 82745 + }, + { + "epoch": 1.6835530598958335, + "grad_norm": 16.043354034423828, + "learning_rate": 6.234416524033881e-06, + "loss": 4.8535, + "step": 82750 + }, + { + "epoch": 1.68365478515625, + "grad_norm": 15.344182014465332, + "learning_rate": 6.234029214569036e-06, + "loss": 4.9792, + "step": 82755 + }, + { + "epoch": 1.6837565104166665, + "grad_norm": 13.982007026672363, + "learning_rate": 6.233641897219136e-06, + "loss": 4.8666, + "step": 82760 + }, + { + "epoch": 1.6838582356770835, + "grad_norm": 20.40341567993164, + "learning_rate": 6.233254571986657e-06, + "loss": 5.0298, + "step": 82765 + }, + { + "epoch": 1.6839599609375, + "grad_norm": 20.5599308013916, + "learning_rate": 6.232867238874071e-06, + "loss": 5.2024, + "step": 82770 + }, + { + "epoch": 1.6840616861979165, + "grad_norm": 17.526615142822266, + "learning_rate": 6.232479897883856e-06, + "loss": 4.7235, + "step": 82775 + }, + { + "epoch": 1.6841634114583335, + "grad_norm": 16.348514556884766, + "learning_rate": 6.232092549018486e-06, + "loss": 5.095, + "step": 82780 + }, + { + "epoch": 1.68426513671875, + "grad_norm": 20.032949447631836, + "learning_rate": 6.231705192280435e-06, + "loss": 5.4328, + "step": 82785 + }, + { + "epoch": 1.6843668619791665, + "grad_norm": 17.570077896118164, + "learning_rate": 6.231317827672179e-06, + "loss": 4.8917, + "step": 82790 + }, + { + "epoch": 1.6844685872395835, + "grad_norm": 19.981842041015625, + "learning_rate": 6.230930455196194e-06, + "loss": 5.0005, + "step": 82795 + }, + { + "epoch": 1.6845703125, + "grad_norm": 16.653745651245117, + "learning_rate": 6.230543074854952e-06, + "loss": 4.7074, + "step": 82800 + }, + { + "epoch": 1.6846720377604165, + "grad_norm": 17.499359130859375, + "learning_rate": 6.230155686650932e-06, + "loss": 5.0378, + "step": 82805 + }, + { + "epoch": 1.6847737630208335, + "grad_norm": 21.206186294555664, + "learning_rate": 6.229768290586607e-06, + "loss": 4.6882, + "step": 82810 + }, + { + "epoch": 1.68487548828125, + "grad_norm": 22.754945755004883, + "learning_rate": 6.229380886664454e-06, + "loss": 4.9243, + "step": 82815 + }, + { + "epoch": 1.6849772135416665, + "grad_norm": 25.22353744506836, + "learning_rate": 6.228993474886947e-06, + "loss": 4.9273, + "step": 82820 + }, + { + "epoch": 1.6850789388020835, + "grad_norm": 22.20627784729004, + "learning_rate": 6.22860605525656e-06, + "loss": 4.9836, + "step": 82825 + }, + { + "epoch": 1.6851806640625, + "grad_norm": 16.139419555664062, + "learning_rate": 6.228218627775772e-06, + "loss": 5.0063, + "step": 82830 + }, + { + "epoch": 1.6852823893229165, + "grad_norm": 16.800670623779297, + "learning_rate": 6.227831192447056e-06, + "loss": 4.8474, + "step": 82835 + }, + { + "epoch": 1.6853841145833335, + "grad_norm": 13.413309097290039, + "learning_rate": 6.227443749272889e-06, + "loss": 4.7668, + "step": 82840 + }, + { + "epoch": 1.68548583984375, + "grad_norm": 23.752077102661133, + "learning_rate": 6.227056298255744e-06, + "loss": 4.9785, + "step": 82845 + }, + { + "epoch": 1.6855875651041665, + "grad_norm": 19.263744354248047, + "learning_rate": 6.226668839398102e-06, + "loss": 4.8075, + "step": 82850 + }, + { + "epoch": 1.6856892903645835, + "grad_norm": 35.7112922668457, + "learning_rate": 6.226281372702434e-06, + "loss": 4.8476, + "step": 82855 + }, + { + "epoch": 1.685791015625, + "grad_norm": 15.263946533203125, + "learning_rate": 6.225893898171218e-06, + "loss": 4.834, + "step": 82860 + }, + { + "epoch": 1.6858927408854165, + "grad_norm": 23.661806106567383, + "learning_rate": 6.225506415806928e-06, + "loss": 4.7837, + "step": 82865 + }, + { + "epoch": 1.6859944661458335, + "grad_norm": 21.81659507751465, + "learning_rate": 6.225118925612043e-06, + "loss": 4.8598, + "step": 82870 + }, + { + "epoch": 1.68609619140625, + "grad_norm": 18.618194580078125, + "learning_rate": 6.224731427589035e-06, + "loss": 4.8031, + "step": 82875 + }, + { + "epoch": 1.6861979166666665, + "grad_norm": 20.998916625976562, + "learning_rate": 6.224343921740383e-06, + "loss": 5.1089, + "step": 82880 + }, + { + "epoch": 1.6862996419270835, + "grad_norm": 18.832040786743164, + "learning_rate": 6.2239564080685625e-06, + "loss": 4.8822, + "step": 82885 + }, + { + "epoch": 1.6864013671875, + "grad_norm": 18.63092803955078, + "learning_rate": 6.223568886576047e-06, + "loss": 4.8281, + "step": 82890 + }, + { + "epoch": 1.6865030924479165, + "grad_norm": 19.208385467529297, + "learning_rate": 6.223181357265317e-06, + "loss": 5.0149, + "step": 82895 + }, + { + "epoch": 1.6866048177083335, + "grad_norm": 18.046689987182617, + "learning_rate": 6.222793820138845e-06, + "loss": 5.1538, + "step": 82900 + }, + { + "epoch": 1.68670654296875, + "grad_norm": 16.611526489257812, + "learning_rate": 6.2224062751991086e-06, + "loss": 5.164, + "step": 82905 + }, + { + "epoch": 1.6868082682291665, + "grad_norm": 19.47238540649414, + "learning_rate": 6.222018722448586e-06, + "loss": 4.6023, + "step": 82910 + }, + { + "epoch": 1.6869099934895835, + "grad_norm": 19.464557647705078, + "learning_rate": 6.221631161889749e-06, + "loss": 4.9467, + "step": 82915 + }, + { + "epoch": 1.68701171875, + "grad_norm": 16.984270095825195, + "learning_rate": 6.221243593525079e-06, + "loss": 4.8336, + "step": 82920 + }, + { + "epoch": 1.6871134440104165, + "grad_norm": 16.94786262512207, + "learning_rate": 6.22085601735705e-06, + "loss": 4.9511, + "step": 82925 + }, + { + "epoch": 1.6872151692708335, + "grad_norm": 16.73882484436035, + "learning_rate": 6.220468433388136e-06, + "loss": 5.0001, + "step": 82930 + }, + { + "epoch": 1.68731689453125, + "grad_norm": 19.49587631225586, + "learning_rate": 6.2200808416208195e-06, + "loss": 4.8959, + "step": 82935 + }, + { + "epoch": 1.6874186197916665, + "grad_norm": 12.147173881530762, + "learning_rate": 6.219693242057571e-06, + "loss": 4.9662, + "step": 82940 + }, + { + "epoch": 1.6875203450520835, + "grad_norm": 19.646562576293945, + "learning_rate": 6.2193056347008704e-06, + "loss": 5.073, + "step": 82945 + }, + { + "epoch": 1.6876220703125, + "grad_norm": 16.249414443969727, + "learning_rate": 6.218918019553195e-06, + "loss": 4.9339, + "step": 82950 + }, + { + "epoch": 1.6877237955729165, + "grad_norm": 15.255518913269043, + "learning_rate": 6.21853039661702e-06, + "loss": 4.7013, + "step": 82955 + }, + { + "epoch": 1.6878255208333335, + "grad_norm": 21.479639053344727, + "learning_rate": 6.218142765894822e-06, + "loss": 4.932, + "step": 82960 + }, + { + "epoch": 1.68792724609375, + "grad_norm": 17.867080688476562, + "learning_rate": 6.217755127389079e-06, + "loss": 5.0134, + "step": 82965 + }, + { + "epoch": 1.6880289713541665, + "grad_norm": 16.868762969970703, + "learning_rate": 6.2173674811022674e-06, + "loss": 4.8103, + "step": 82970 + }, + { + "epoch": 1.6881306966145835, + "grad_norm": 19.910186767578125, + "learning_rate": 6.2169798270368634e-06, + "loss": 5.1387, + "step": 82975 + }, + { + "epoch": 1.688232421875, + "grad_norm": 24.59853744506836, + "learning_rate": 6.2165921651953455e-06, + "loss": 4.8641, + "step": 82980 + }, + { + "epoch": 1.6883341471354165, + "grad_norm": 16.623943328857422, + "learning_rate": 6.2162044955801885e-06, + "loss": 4.8647, + "step": 82985 + }, + { + "epoch": 1.6884358723958335, + "grad_norm": 19.831499099731445, + "learning_rate": 6.215816818193872e-06, + "loss": 4.7582, + "step": 82990 + }, + { + "epoch": 1.68853759765625, + "grad_norm": 17.53365135192871, + "learning_rate": 6.215429133038871e-06, + "loss": 4.841, + "step": 82995 + }, + { + "epoch": 1.6886393229166665, + "grad_norm": 17.452123641967773, + "learning_rate": 6.215041440117665e-06, + "loss": 4.9938, + "step": 83000 + }, + { + "epoch": 1.6887410481770835, + "grad_norm": 22.633941650390625, + "learning_rate": 6.214653739432728e-06, + "loss": 5.1885, + "step": 83005 + }, + { + "epoch": 1.6888427734375, + "grad_norm": 22.038541793823242, + "learning_rate": 6.214266030986541e-06, + "loss": 5.0815, + "step": 83010 + }, + { + "epoch": 1.6889444986979165, + "grad_norm": 14.913360595703125, + "learning_rate": 6.213878314781579e-06, + "loss": 4.944, + "step": 83015 + }, + { + "epoch": 1.6890462239583335, + "grad_norm": 15.391146659851074, + "learning_rate": 6.21349059082032e-06, + "loss": 4.858, + "step": 83020 + }, + { + "epoch": 1.68914794921875, + "grad_norm": 15.027204513549805, + "learning_rate": 6.2131028591052415e-06, + "loss": 4.7609, + "step": 83025 + }, + { + "epoch": 1.6892496744791665, + "grad_norm": 20.42051887512207, + "learning_rate": 6.2127151196388205e-06, + "loss": 5.1969, + "step": 83030 + }, + { + "epoch": 1.6893513997395835, + "grad_norm": 23.451705932617188, + "learning_rate": 6.212327372423535e-06, + "loss": 4.8241, + "step": 83035 + }, + { + "epoch": 1.689453125, + "grad_norm": 17.476734161376953, + "learning_rate": 6.2119396174618615e-06, + "loss": 5.0098, + "step": 83040 + }, + { + "epoch": 1.6895548502604165, + "grad_norm": 24.822616577148438, + "learning_rate": 6.211551854756279e-06, + "loss": 4.8408, + "step": 83045 + }, + { + "epoch": 1.6896565755208335, + "grad_norm": 14.968707084655762, + "learning_rate": 6.211164084309266e-06, + "loss": 4.831, + "step": 83050 + }, + { + "epoch": 1.68975830078125, + "grad_norm": 17.67279624938965, + "learning_rate": 6.210776306123298e-06, + "loss": 5.0866, + "step": 83055 + }, + { + "epoch": 1.6898600260416665, + "grad_norm": 16.96710968017578, + "learning_rate": 6.210388520200854e-06, + "loss": 4.8487, + "step": 83060 + }, + { + "epoch": 1.6899617513020835, + "grad_norm": 22.98688507080078, + "learning_rate": 6.210000726544412e-06, + "loss": 4.9314, + "step": 83065 + }, + { + "epoch": 1.6900634765625, + "grad_norm": 23.698684692382812, + "learning_rate": 6.209612925156449e-06, + "loss": 4.7522, + "step": 83070 + }, + { + "epoch": 1.6901652018229165, + "grad_norm": 16.30589485168457, + "learning_rate": 6.209225116039442e-06, + "loss": 5.0376, + "step": 83075 + }, + { + "epoch": 1.6902669270833335, + "grad_norm": 24.32685089111328, + "learning_rate": 6.2088372991958735e-06, + "loss": 4.8766, + "step": 83080 + }, + { + "epoch": 1.69036865234375, + "grad_norm": 16.43683433532715, + "learning_rate": 6.208449474628215e-06, + "loss": 4.9536, + "step": 83085 + }, + { + "epoch": 1.6904703776041665, + "grad_norm": 23.388399124145508, + "learning_rate": 6.208061642338951e-06, + "loss": 5.0067, + "step": 83090 + }, + { + "epoch": 1.6905721028645835, + "grad_norm": 15.123551368713379, + "learning_rate": 6.207673802330554e-06, + "loss": 5.0767, + "step": 83095 + }, + { + "epoch": 1.690673828125, + "grad_norm": 14.91812801361084, + "learning_rate": 6.207285954605506e-06, + "loss": 5.1655, + "step": 83100 + }, + { + "epoch": 1.6907755533854165, + "grad_norm": 16.434139251708984, + "learning_rate": 6.206898099166284e-06, + "loss": 5.0237, + "step": 83105 + }, + { + "epoch": 1.6908772786458335, + "grad_norm": 15.614998817443848, + "learning_rate": 6.206510236015366e-06, + "loss": 4.8126, + "step": 83110 + }, + { + "epoch": 1.69097900390625, + "grad_norm": 17.076236724853516, + "learning_rate": 6.20612236515523e-06, + "loss": 4.7739, + "step": 83115 + }, + { + "epoch": 1.6910807291666665, + "grad_norm": 20.268651962280273, + "learning_rate": 6.205734486588356e-06, + "loss": 4.6771, + "step": 83120 + }, + { + "epoch": 1.6911824544270835, + "grad_norm": 17.02799415588379, + "learning_rate": 6.20534660031722e-06, + "loss": 5.0647, + "step": 83125 + }, + { + "epoch": 1.6912841796875, + "grad_norm": 16.268815994262695, + "learning_rate": 6.204958706344304e-06, + "loss": 4.7948, + "step": 83130 + }, + { + "epoch": 1.6913859049479165, + "grad_norm": 21.07646369934082, + "learning_rate": 6.204570804672083e-06, + "loss": 4.8064, + "step": 83135 + }, + { + "epoch": 1.6914876302083335, + "grad_norm": 15.368847846984863, + "learning_rate": 6.204182895303037e-06, + "loss": 4.8331, + "step": 83140 + }, + { + "epoch": 1.69158935546875, + "grad_norm": 15.68338394165039, + "learning_rate": 6.203794978239646e-06, + "loss": 4.8425, + "step": 83145 + }, + { + "epoch": 1.6916910807291665, + "grad_norm": 17.549755096435547, + "learning_rate": 6.203407053484385e-06, + "loss": 4.8628, + "step": 83150 + }, + { + "epoch": 1.6917928059895835, + "grad_norm": 18.207332611083984, + "learning_rate": 6.203019121039736e-06, + "loss": 4.9247, + "step": 83155 + }, + { + "epoch": 1.69189453125, + "grad_norm": 16.210067749023438, + "learning_rate": 6.202631180908177e-06, + "loss": 4.9822, + "step": 83160 + }, + { + "epoch": 1.6919962565104165, + "grad_norm": 20.663410186767578, + "learning_rate": 6.202243233092187e-06, + "loss": 4.9488, + "step": 83165 + }, + { + "epoch": 1.6920979817708335, + "grad_norm": 24.146442413330078, + "learning_rate": 6.201855277594243e-06, + "loss": 4.7896, + "step": 83170 + }, + { + "epoch": 1.69219970703125, + "grad_norm": 16.875200271606445, + "learning_rate": 6.201467314416825e-06, + "loss": 4.9831, + "step": 83175 + }, + { + "epoch": 1.6923014322916665, + "grad_norm": 15.119183540344238, + "learning_rate": 6.201079343562414e-06, + "loss": 5.1459, + "step": 83180 + }, + { + "epoch": 1.6924031575520835, + "grad_norm": 16.279884338378906, + "learning_rate": 6.2006913650334854e-06, + "loss": 4.9574, + "step": 83185 + }, + { + "epoch": 1.6925048828125, + "grad_norm": 18.51039695739746, + "learning_rate": 6.2003033788325215e-06, + "loss": 5.0263, + "step": 83190 + }, + { + "epoch": 1.6926066080729165, + "grad_norm": 19.738971710205078, + "learning_rate": 6.199915384962001e-06, + "loss": 4.8049, + "step": 83195 + }, + { + "epoch": 1.6927083333333335, + "grad_norm": 17.654024124145508, + "learning_rate": 6.1995273834243995e-06, + "loss": 4.9712, + "step": 83200 + }, + { + "epoch": 1.69281005859375, + "grad_norm": 14.583274841308594, + "learning_rate": 6.199139374222201e-06, + "loss": 4.9607, + "step": 83205 + }, + { + "epoch": 1.6929117838541665, + "grad_norm": 18.986133575439453, + "learning_rate": 6.19875135735788e-06, + "loss": 4.6822, + "step": 83210 + }, + { + "epoch": 1.6930135091145835, + "grad_norm": 13.782912254333496, + "learning_rate": 6.198363332833921e-06, + "loss": 4.9477, + "step": 83215 + }, + { + "epoch": 1.693115234375, + "grad_norm": 15.539193153381348, + "learning_rate": 6.1979753006528e-06, + "loss": 4.8672, + "step": 83220 + }, + { + "epoch": 1.6932169596354165, + "grad_norm": 19.454017639160156, + "learning_rate": 6.197587260816996e-06, + "loss": 4.6554, + "step": 83225 + }, + { + "epoch": 1.6933186848958335, + "grad_norm": 17.118825912475586, + "learning_rate": 6.197199213328991e-06, + "loss": 4.665, + "step": 83230 + }, + { + "epoch": 1.69342041015625, + "grad_norm": 19.287744522094727, + "learning_rate": 6.196811158191262e-06, + "loss": 4.8824, + "step": 83235 + }, + { + "epoch": 1.6935221354166665, + "grad_norm": 20.70615005493164, + "learning_rate": 6.196423095406291e-06, + "loss": 5.0195, + "step": 83240 + }, + { + "epoch": 1.6936238606770835, + "grad_norm": 15.018260955810547, + "learning_rate": 6.196035024976556e-06, + "loss": 4.8997, + "step": 83245 + }, + { + "epoch": 1.6937255859375, + "grad_norm": 24.3259220123291, + "learning_rate": 6.195646946904534e-06, + "loss": 4.9623, + "step": 83250 + }, + { + "epoch": 1.6938273111979165, + "grad_norm": 13.77193546295166, + "learning_rate": 6.19525886119271e-06, + "loss": 5.0124, + "step": 83255 + }, + { + "epoch": 1.6939290364583335, + "grad_norm": 15.298799514770508, + "learning_rate": 6.194870767843562e-06, + "loss": 4.9874, + "step": 83260 + }, + { + "epoch": 1.69403076171875, + "grad_norm": 19.594467163085938, + "learning_rate": 6.194482666859567e-06, + "loss": 4.522, + "step": 83265 + }, + { + "epoch": 1.6941324869791665, + "grad_norm": 18.7517147064209, + "learning_rate": 6.194094558243209e-06, + "loss": 4.8951, + "step": 83270 + }, + { + "epoch": 1.6942342122395835, + "grad_norm": 14.44775390625, + "learning_rate": 6.193706441996964e-06, + "loss": 4.7468, + "step": 83275 + }, + { + "epoch": 1.6943359375, + "grad_norm": 16.83201789855957, + "learning_rate": 6.193318318123315e-06, + "loss": 4.9897, + "step": 83280 + }, + { + "epoch": 1.6944376627604165, + "grad_norm": 20.319368362426758, + "learning_rate": 6.192930186624741e-06, + "loss": 4.7421, + "step": 83285 + }, + { + "epoch": 1.6945393880208335, + "grad_norm": 17.157974243164062, + "learning_rate": 6.1925420475037205e-06, + "loss": 4.9448, + "step": 83290 + }, + { + "epoch": 1.69464111328125, + "grad_norm": 15.976302146911621, + "learning_rate": 6.192153900762735e-06, + "loss": 4.9307, + "step": 83295 + }, + { + "epoch": 1.6947428385416665, + "grad_norm": 15.710837364196777, + "learning_rate": 6.191765746404266e-06, + "loss": 4.8661, + "step": 83300 + }, + { + "epoch": 1.6948445638020835, + "grad_norm": 27.098955154418945, + "learning_rate": 6.191377584430791e-06, + "loss": 5.1854, + "step": 83305 + }, + { + "epoch": 1.6949462890625, + "grad_norm": 17.073348999023438, + "learning_rate": 6.190989414844792e-06, + "loss": 5.0186, + "step": 83310 + }, + { + "epoch": 1.6950480143229165, + "grad_norm": 29.618131637573242, + "learning_rate": 6.190601237648748e-06, + "loss": 5.1927, + "step": 83315 + }, + { + "epoch": 1.6951497395833335, + "grad_norm": 17.07962417602539, + "learning_rate": 6.190213052845141e-06, + "loss": 4.8203, + "step": 83320 + }, + { + "epoch": 1.69525146484375, + "grad_norm": 18.08628273010254, + "learning_rate": 6.189824860436449e-06, + "loss": 4.9062, + "step": 83325 + }, + { + "epoch": 1.6953531901041665, + "grad_norm": 15.976516723632812, + "learning_rate": 6.1894366604251555e-06, + "loss": 5.2131, + "step": 83330 + }, + { + "epoch": 1.6954549153645835, + "grad_norm": 14.699098587036133, + "learning_rate": 6.189048452813738e-06, + "loss": 4.8138, + "step": 83335 + }, + { + "epoch": 1.695556640625, + "grad_norm": 15.694952011108398, + "learning_rate": 6.188660237604679e-06, + "loss": 4.802, + "step": 83340 + }, + { + "epoch": 1.6956583658854165, + "grad_norm": 18.295183181762695, + "learning_rate": 6.188272014800458e-06, + "loss": 4.8223, + "step": 83345 + }, + { + "epoch": 1.6957600911458335, + "grad_norm": 20.381248474121094, + "learning_rate": 6.187883784403558e-06, + "loss": 4.8734, + "step": 83350 + }, + { + "epoch": 1.69586181640625, + "grad_norm": 17.245773315429688, + "learning_rate": 6.187495546416454e-06, + "loss": 4.7217, + "step": 83355 + }, + { + "epoch": 1.6959635416666665, + "grad_norm": 16.209945678710938, + "learning_rate": 6.187107300841634e-06, + "loss": 4.8251, + "step": 83360 + }, + { + "epoch": 1.6960652669270835, + "grad_norm": 17.99249839782715, + "learning_rate": 6.186719047681574e-06, + "loss": 4.9776, + "step": 83365 + }, + { + "epoch": 1.6961669921875, + "grad_norm": 17.23807716369629, + "learning_rate": 6.186330786938755e-06, + "loss": 4.8811, + "step": 83370 + }, + { + "epoch": 1.6962687174479165, + "grad_norm": 18.923358917236328, + "learning_rate": 6.18594251861566e-06, + "loss": 4.9599, + "step": 83375 + }, + { + "epoch": 1.6963704427083335, + "grad_norm": 20.920242309570312, + "learning_rate": 6.185554242714767e-06, + "loss": 4.9071, + "step": 83380 + }, + { + "epoch": 1.69647216796875, + "grad_norm": 14.950749397277832, + "learning_rate": 6.18516595923856e-06, + "loss": 5.0114, + "step": 83385 + }, + { + "epoch": 1.6965738932291665, + "grad_norm": 24.211841583251953, + "learning_rate": 6.184777668189519e-06, + "loss": 4.9364, + "step": 83390 + }, + { + "epoch": 1.6966756184895835, + "grad_norm": 17.1594181060791, + "learning_rate": 6.184389369570124e-06, + "loss": 5.0734, + "step": 83395 + }, + { + "epoch": 1.69677734375, + "grad_norm": 20.354087829589844, + "learning_rate": 6.184001063382857e-06, + "loss": 5.0401, + "step": 83400 + }, + { + "epoch": 1.6968790690104165, + "grad_norm": 17.03594398498535, + "learning_rate": 6.183612749630199e-06, + "loss": 4.8802, + "step": 83405 + }, + { + "epoch": 1.6969807942708335, + "grad_norm": 22.512462615966797, + "learning_rate": 6.1832244283146325e-06, + "loss": 4.8704, + "step": 83410 + }, + { + "epoch": 1.69708251953125, + "grad_norm": 16.753862380981445, + "learning_rate": 6.182836099438637e-06, + "loss": 4.7675, + "step": 83415 + }, + { + "epoch": 1.6971842447916665, + "grad_norm": 15.63274097442627, + "learning_rate": 6.182447763004694e-06, + "loss": 4.9068, + "step": 83420 + }, + { + "epoch": 1.6972859700520835, + "grad_norm": 14.424443244934082, + "learning_rate": 6.182059419015286e-06, + "loss": 4.9127, + "step": 83425 + }, + { + "epoch": 1.6973876953125, + "grad_norm": 20.625635147094727, + "learning_rate": 6.1816710674728915e-06, + "loss": 4.8501, + "step": 83430 + }, + { + "epoch": 1.6974894205729165, + "grad_norm": 15.007439613342285, + "learning_rate": 6.181282708379997e-06, + "loss": 4.9035, + "step": 83435 + }, + { + "epoch": 1.6975911458333335, + "grad_norm": 18.57463264465332, + "learning_rate": 6.180894341739079e-06, + "loss": 5.1277, + "step": 83440 + }, + { + "epoch": 1.69769287109375, + "grad_norm": 15.739090919494629, + "learning_rate": 6.18050596755262e-06, + "loss": 4.9061, + "step": 83445 + }, + { + "epoch": 1.6977945963541665, + "grad_norm": 16.526973724365234, + "learning_rate": 6.180117585823104e-06, + "loss": 4.9792, + "step": 83450 + }, + { + "epoch": 1.6978963216145835, + "grad_norm": 17.31817054748535, + "learning_rate": 6.1797291965530125e-06, + "loss": 4.9343, + "step": 83455 + }, + { + "epoch": 1.697998046875, + "grad_norm": 15.445481300354004, + "learning_rate": 6.179340799744824e-06, + "loss": 4.7427, + "step": 83460 + }, + { + "epoch": 1.6980997721354165, + "grad_norm": 18.510068893432617, + "learning_rate": 6.178952395401023e-06, + "loss": 4.6566, + "step": 83465 + }, + { + "epoch": 1.6982014973958335, + "grad_norm": 12.482451438903809, + "learning_rate": 6.17856398352409e-06, + "loss": 4.9382, + "step": 83470 + }, + { + "epoch": 1.69830322265625, + "grad_norm": 18.226072311401367, + "learning_rate": 6.178175564116507e-06, + "loss": 4.9568, + "step": 83475 + }, + { + "epoch": 1.6984049479166665, + "grad_norm": 18.896541595458984, + "learning_rate": 6.1777871371807575e-06, + "loss": 5.0412, + "step": 83480 + }, + { + "epoch": 1.6985066731770835, + "grad_norm": 15.245565414428711, + "learning_rate": 6.177398702719321e-06, + "loss": 4.7122, + "step": 83485 + }, + { + "epoch": 1.6986083984375, + "grad_norm": 18.053878784179688, + "learning_rate": 6.177010260734681e-06, + "loss": 4.8639, + "step": 83490 + }, + { + "epoch": 1.6987101236979165, + "grad_norm": 22.320096969604492, + "learning_rate": 6.176621811229317e-06, + "loss": 4.9656, + "step": 83495 + }, + { + "epoch": 1.6988118489583335, + "grad_norm": 16.472671508789062, + "learning_rate": 6.176233354205716e-06, + "loss": 5.0102, + "step": 83500 + }, + { + "epoch": 1.69891357421875, + "grad_norm": 16.010704040527344, + "learning_rate": 6.175844889666357e-06, + "loss": 4.8423, + "step": 83505 + }, + { + "epoch": 1.6990152994791665, + "grad_norm": 21.75669288635254, + "learning_rate": 6.175456417613721e-06, + "loss": 4.8237, + "step": 83510 + }, + { + "epoch": 1.6991170247395835, + "grad_norm": 17.581012725830078, + "learning_rate": 6.175067938050292e-06, + "loss": 4.7192, + "step": 83515 + }, + { + "epoch": 1.69921875, + "grad_norm": 20.430356979370117, + "learning_rate": 6.174679450978553e-06, + "loss": 4.9435, + "step": 83520 + }, + { + "epoch": 1.6993204752604165, + "grad_norm": 15.13022232055664, + "learning_rate": 6.174290956400982e-06, + "loss": 4.8433, + "step": 83525 + }, + { + "epoch": 1.6994222005208335, + "grad_norm": 18.644926071166992, + "learning_rate": 6.173902454320068e-06, + "loss": 4.8262, + "step": 83530 + }, + { + "epoch": 1.69952392578125, + "grad_norm": 14.89896297454834, + "learning_rate": 6.173513944738288e-06, + "loss": 4.7597, + "step": 83535 + }, + { + "epoch": 1.6996256510416665, + "grad_norm": 47.77336120605469, + "learning_rate": 6.173125427658128e-06, + "loss": 4.893, + "step": 83540 + }, + { + "epoch": 1.6997273763020835, + "grad_norm": 15.792137145996094, + "learning_rate": 6.172736903082068e-06, + "loss": 5.1435, + "step": 83545 + }, + { + "epoch": 1.6998291015625, + "grad_norm": 21.146137237548828, + "learning_rate": 6.172348371012592e-06, + "loss": 4.9097, + "step": 83550 + }, + { + "epoch": 1.6999308268229165, + "grad_norm": 17.29098129272461, + "learning_rate": 6.171959831452181e-06, + "loss": 4.6054, + "step": 83555 + }, + { + "epoch": 1.7000325520833335, + "grad_norm": 15.67375373840332, + "learning_rate": 6.171571284403319e-06, + "loss": 4.7923, + "step": 83560 + }, + { + "epoch": 1.70013427734375, + "grad_norm": 18.926895141601562, + "learning_rate": 6.17118272986849e-06, + "loss": 5.152, + "step": 83565 + }, + { + "epoch": 1.7002360026041665, + "grad_norm": 23.479787826538086, + "learning_rate": 6.170794167850174e-06, + "loss": 4.8456, + "step": 83570 + }, + { + "epoch": 1.7003377278645835, + "grad_norm": 20.5338191986084, + "learning_rate": 6.170405598350855e-06, + "loss": 4.9399, + "step": 83575 + }, + { + "epoch": 1.700439453125, + "grad_norm": 24.269840240478516, + "learning_rate": 6.1700170213730155e-06, + "loss": 5.1052, + "step": 83580 + }, + { + "epoch": 1.7005411783854165, + "grad_norm": 19.046579360961914, + "learning_rate": 6.16962843691914e-06, + "loss": 5.1311, + "step": 83585 + }, + { + "epoch": 1.7006429036458335, + "grad_norm": 15.513418197631836, + "learning_rate": 6.169239844991708e-06, + "loss": 5.1798, + "step": 83590 + }, + { + "epoch": 1.70074462890625, + "grad_norm": 17.16634178161621, + "learning_rate": 6.1688512455932055e-06, + "loss": 4.7221, + "step": 83595 + }, + { + "epoch": 1.7008463541666665, + "grad_norm": 16.482595443725586, + "learning_rate": 6.168462638726114e-06, + "loss": 4.6634, + "step": 83600 + }, + { + "epoch": 1.7009480794270835, + "grad_norm": 22.708433151245117, + "learning_rate": 6.168074024392918e-06, + "loss": 4.835, + "step": 83605 + }, + { + "epoch": 1.7010498046875, + "grad_norm": 18.300260543823242, + "learning_rate": 6.1676854025961e-06, + "loss": 5.0881, + "step": 83610 + }, + { + "epoch": 1.7011515299479165, + "grad_norm": 15.487622261047363, + "learning_rate": 6.167296773338142e-06, + "loss": 4.8313, + "step": 83615 + }, + { + "epoch": 1.7012532552083335, + "grad_norm": 15.689990043640137, + "learning_rate": 6.166908136621529e-06, + "loss": 4.9459, + "step": 83620 + }, + { + "epoch": 1.70135498046875, + "grad_norm": 19.915973663330078, + "learning_rate": 6.166519492448741e-06, + "loss": 4.9992, + "step": 83625 + }, + { + "epoch": 1.7014567057291665, + "grad_norm": 20.755186080932617, + "learning_rate": 6.166130840822268e-06, + "loss": 4.9695, + "step": 83630 + }, + { + "epoch": 1.7015584309895835, + "grad_norm": 19.041885375976562, + "learning_rate": 6.1657421817445866e-06, + "loss": 5.0474, + "step": 83635 + }, + { + "epoch": 1.70166015625, + "grad_norm": 24.080947875976562, + "learning_rate": 6.165353515218182e-06, + "loss": 4.8866, + "step": 83640 + }, + { + "epoch": 1.7017618815104165, + "grad_norm": 20.189661026000977, + "learning_rate": 6.16496484124554e-06, + "loss": 4.5545, + "step": 83645 + }, + { + "epoch": 1.7018636067708335, + "grad_norm": 20.199077606201172, + "learning_rate": 6.164576159829141e-06, + "loss": 5.0812, + "step": 83650 + }, + { + "epoch": 1.70196533203125, + "grad_norm": 19.84073257446289, + "learning_rate": 6.164187470971471e-06, + "loss": 4.9069, + "step": 83655 + }, + { + "epoch": 1.7020670572916665, + "grad_norm": 17.435848236083984, + "learning_rate": 6.163798774675012e-06, + "loss": 4.9415, + "step": 83660 + }, + { + "epoch": 1.7021687825520835, + "grad_norm": 18.070972442626953, + "learning_rate": 6.163410070942248e-06, + "loss": 4.8742, + "step": 83665 + }, + { + "epoch": 1.7022705078125, + "grad_norm": 22.09352684020996, + "learning_rate": 6.163021359775665e-06, + "loss": 4.8841, + "step": 83670 + }, + { + "epoch": 1.7023722330729165, + "grad_norm": 15.253063201904297, + "learning_rate": 6.162632641177744e-06, + "loss": 4.7992, + "step": 83675 + }, + { + "epoch": 1.7024739583333335, + "grad_norm": 18.683311462402344, + "learning_rate": 6.162243915150967e-06, + "loss": 4.8143, + "step": 83680 + }, + { + "epoch": 1.70257568359375, + "grad_norm": 18.115476608276367, + "learning_rate": 6.161855181697823e-06, + "loss": 4.8077, + "step": 83685 + }, + { + "epoch": 1.7026774088541665, + "grad_norm": 16.97974967956543, + "learning_rate": 6.161466440820792e-06, + "loss": 5.0375, + "step": 83690 + }, + { + "epoch": 1.7027791341145835, + "grad_norm": 20.484172821044922, + "learning_rate": 6.16107769252236e-06, + "loss": 4.9112, + "step": 83695 + }, + { + "epoch": 1.702880859375, + "grad_norm": 19.26343536376953, + "learning_rate": 6.160688936805011e-06, + "loss": 4.8535, + "step": 83700 + }, + { + "epoch": 1.7029825846354165, + "grad_norm": 17.70234489440918, + "learning_rate": 6.160300173671225e-06, + "loss": 4.9293, + "step": 83705 + }, + { + "epoch": 1.7030843098958335, + "grad_norm": 23.572063446044922, + "learning_rate": 6.1599114031234915e-06, + "loss": 5.041, + "step": 83710 + }, + { + "epoch": 1.70318603515625, + "grad_norm": 16.786243438720703, + "learning_rate": 6.159522625164292e-06, + "loss": 5.07, + "step": 83715 + }, + { + "epoch": 1.7032877604166665, + "grad_norm": 16.873241424560547, + "learning_rate": 6.159133839796111e-06, + "loss": 4.8194, + "step": 83720 + }, + { + "epoch": 1.7033894856770835, + "grad_norm": 18.609338760375977, + "learning_rate": 6.158745047021433e-06, + "loss": 4.8989, + "step": 83725 + }, + { + "epoch": 1.7034912109375, + "grad_norm": 20.053434371948242, + "learning_rate": 6.15835624684274e-06, + "loss": 4.6281, + "step": 83730 + }, + { + "epoch": 1.7035929361979165, + "grad_norm": 22.303367614746094, + "learning_rate": 6.157967439262521e-06, + "loss": 4.912, + "step": 83735 + }, + { + "epoch": 1.7036946614583335, + "grad_norm": 19.76363182067871, + "learning_rate": 6.157578624283258e-06, + "loss": 4.8565, + "step": 83740 + }, + { + "epoch": 1.70379638671875, + "grad_norm": 26.794727325439453, + "learning_rate": 6.157189801907433e-06, + "loss": 4.915, + "step": 83745 + }, + { + "epoch": 1.7038981119791665, + "grad_norm": 16.4754695892334, + "learning_rate": 6.156800972137533e-06, + "loss": 4.8715, + "step": 83750 + }, + { + "epoch": 1.7039998372395835, + "grad_norm": 21.636302947998047, + "learning_rate": 6.156412134976042e-06, + "loss": 4.8278, + "step": 83755 + }, + { + "epoch": 1.7041015625, + "grad_norm": 14.668656349182129, + "learning_rate": 6.156023290425445e-06, + "loss": 4.797, + "step": 83760 + }, + { + "epoch": 1.7042032877604165, + "grad_norm": 16.821062088012695, + "learning_rate": 6.155634438488226e-06, + "loss": 5.0288, + "step": 83765 + }, + { + "epoch": 1.7043050130208335, + "grad_norm": 15.797085762023926, + "learning_rate": 6.155245579166868e-06, + "loss": 4.8242, + "step": 83770 + }, + { + "epoch": 1.70440673828125, + "grad_norm": 17.498363494873047, + "learning_rate": 6.1548567124638605e-06, + "loss": 4.9715, + "step": 83775 + }, + { + "epoch": 1.7045084635416665, + "grad_norm": 17.644437789916992, + "learning_rate": 6.154467838381684e-06, + "loss": 4.8081, + "step": 83780 + }, + { + "epoch": 1.7046101888020835, + "grad_norm": 17.79157066345215, + "learning_rate": 6.154078956922824e-06, + "loss": 4.9414, + "step": 83785 + }, + { + "epoch": 1.7047119140625, + "grad_norm": 17.5229549407959, + "learning_rate": 6.153690068089765e-06, + "loss": 5.1128, + "step": 83790 + }, + { + "epoch": 1.7048136393229165, + "grad_norm": 23.2213134765625, + "learning_rate": 6.1533011718849946e-06, + "loss": 4.7122, + "step": 83795 + }, + { + "epoch": 1.7049153645833335, + "grad_norm": 19.84525489807129, + "learning_rate": 6.152912268310994e-06, + "loss": 4.6949, + "step": 83800 + }, + { + "epoch": 1.70501708984375, + "grad_norm": 15.797211647033691, + "learning_rate": 6.1525233573702515e-06, + "loss": 5.0486, + "step": 83805 + }, + { + "epoch": 1.7051188151041665, + "grad_norm": 19.27643394470215, + "learning_rate": 6.1521344390652495e-06, + "loss": 5.1742, + "step": 83810 + }, + { + "epoch": 1.7052205403645835, + "grad_norm": 17.667020797729492, + "learning_rate": 6.151745513398475e-06, + "loss": 4.8515, + "step": 83815 + }, + { + "epoch": 1.705322265625, + "grad_norm": 21.352460861206055, + "learning_rate": 6.151356580372412e-06, + "loss": 4.7757, + "step": 83820 + }, + { + "epoch": 1.7054239908854165, + "grad_norm": 14.993484497070312, + "learning_rate": 6.150967639989547e-06, + "loss": 5.0839, + "step": 83825 + }, + { + "epoch": 1.7055257161458335, + "grad_norm": 16.28390121459961, + "learning_rate": 6.150578692252363e-06, + "loss": 4.666, + "step": 83830 + }, + { + "epoch": 1.70562744140625, + "grad_norm": 19.805095672607422, + "learning_rate": 6.150189737163346e-06, + "loss": 4.8746, + "step": 83835 + }, + { + "epoch": 1.7057291666666665, + "grad_norm": 20.82379722595215, + "learning_rate": 6.149800774724982e-06, + "loss": 4.7575, + "step": 83840 + }, + { + "epoch": 1.7058308919270835, + "grad_norm": 19.142269134521484, + "learning_rate": 6.149411804939756e-06, + "loss": 4.5684, + "step": 83845 + }, + { + "epoch": 1.7059326171875, + "grad_norm": 20.089473724365234, + "learning_rate": 6.149022827810154e-06, + "loss": 4.8442, + "step": 83850 + }, + { + "epoch": 1.7060343424479165, + "grad_norm": 21.05266761779785, + "learning_rate": 6.1486338433386615e-06, + "loss": 4.8241, + "step": 83855 + }, + { + "epoch": 1.7061360677083335, + "grad_norm": 16.98387908935547, + "learning_rate": 6.148244851527762e-06, + "loss": 4.8238, + "step": 83860 + }, + { + "epoch": 1.70623779296875, + "grad_norm": 16.381011962890625, + "learning_rate": 6.147855852379942e-06, + "loss": 4.7714, + "step": 83865 + }, + { + "epoch": 1.7063395182291665, + "grad_norm": 19.861900329589844, + "learning_rate": 6.14746684589769e-06, + "loss": 5.027, + "step": 83870 + }, + { + "epoch": 1.7064412434895835, + "grad_norm": 19.732481002807617, + "learning_rate": 6.147077832083487e-06, + "loss": 4.7554, + "step": 83875 + }, + { + "epoch": 1.70654296875, + "grad_norm": 14.906501770019531, + "learning_rate": 6.146688810939822e-06, + "loss": 5.1821, + "step": 83880 + }, + { + "epoch": 1.7066446940104165, + "grad_norm": 18.92152976989746, + "learning_rate": 6.146299782469178e-06, + "loss": 4.7295, + "step": 83885 + }, + { + "epoch": 1.7067464192708335, + "grad_norm": 31.729806900024414, + "learning_rate": 6.145910746674043e-06, + "loss": 4.7508, + "step": 83890 + }, + { + "epoch": 1.70684814453125, + "grad_norm": 19.187963485717773, + "learning_rate": 6.1455217035569025e-06, + "loss": 5.0224, + "step": 83895 + }, + { + "epoch": 1.7069498697916665, + "grad_norm": 19.286792755126953, + "learning_rate": 6.14513265312024e-06, + "loss": 4.8899, + "step": 83900 + }, + { + "epoch": 1.7070515950520835, + "grad_norm": 15.913158416748047, + "learning_rate": 6.144743595366547e-06, + "loss": 5.181, + "step": 83905 + }, + { + "epoch": 1.7071533203125, + "grad_norm": 21.41750717163086, + "learning_rate": 6.144354530298302e-06, + "loss": 4.7574, + "step": 83910 + }, + { + "epoch": 1.7072550455729165, + "grad_norm": 14.430695533752441, + "learning_rate": 6.143965457917998e-06, + "loss": 5.1198, + "step": 83915 + }, + { + "epoch": 1.7073567708333335, + "grad_norm": 21.522621154785156, + "learning_rate": 6.143576378228116e-06, + "loss": 4.7939, + "step": 83920 + }, + { + "epoch": 1.70745849609375, + "grad_norm": 17.67748260498047, + "learning_rate": 6.143187291231144e-06, + "loss": 5.0, + "step": 83925 + }, + { + "epoch": 1.7075602213541665, + "grad_norm": 15.334161758422852, + "learning_rate": 6.142798196929568e-06, + "loss": 4.8686, + "step": 83930 + }, + { + "epoch": 1.7076619466145835, + "grad_norm": 18.131271362304688, + "learning_rate": 6.142409095325874e-06, + "loss": 5.0705, + "step": 83935 + }, + { + "epoch": 1.707763671875, + "grad_norm": 19.225055694580078, + "learning_rate": 6.142019986422549e-06, + "loss": 5.1338, + "step": 83940 + }, + { + "epoch": 1.7078653971354165, + "grad_norm": 15.304327011108398, + "learning_rate": 6.14163087022208e-06, + "loss": 4.9458, + "step": 83945 + }, + { + "epoch": 1.7079671223958335, + "grad_norm": 22.96167755126953, + "learning_rate": 6.141241746726951e-06, + "loss": 4.9673, + "step": 83950 + }, + { + "epoch": 1.70806884765625, + "grad_norm": 21.276460647583008, + "learning_rate": 6.140852615939651e-06, + "loss": 4.7637, + "step": 83955 + }, + { + "epoch": 1.7081705729166665, + "grad_norm": 16.396711349487305, + "learning_rate": 6.1404634778626635e-06, + "loss": 4.8881, + "step": 83960 + }, + { + "epoch": 1.7082722981770835, + "grad_norm": 19.26045036315918, + "learning_rate": 6.140074332498476e-06, + "loss": 4.9041, + "step": 83965 + }, + { + "epoch": 1.7083740234375, + "grad_norm": 14.547274589538574, + "learning_rate": 6.139685179849577e-06, + "loss": 4.8606, + "step": 83970 + }, + { + "epoch": 1.7084757486979165, + "grad_norm": 18.122940063476562, + "learning_rate": 6.1392960199184496e-06, + "loss": 5.101, + "step": 83975 + }, + { + "epoch": 1.7085774739583335, + "grad_norm": 18.315998077392578, + "learning_rate": 6.138906852707584e-06, + "loss": 4.7923, + "step": 83980 + }, + { + "epoch": 1.70867919921875, + "grad_norm": 20.772817611694336, + "learning_rate": 6.138517678219464e-06, + "loss": 5.1763, + "step": 83985 + }, + { + "epoch": 1.7087809244791665, + "grad_norm": 15.311722755432129, + "learning_rate": 6.138128496456578e-06, + "loss": 4.8516, + "step": 83990 + }, + { + "epoch": 1.7088826497395835, + "grad_norm": 13.999305725097656, + "learning_rate": 6.137739307421412e-06, + "loss": 4.6886, + "step": 83995 + }, + { + "epoch": 1.708984375, + "grad_norm": 14.848372459411621, + "learning_rate": 6.1373501111164525e-06, + "loss": 4.7774, + "step": 84000 + }, + { + "epoch": 1.7090861002604165, + "grad_norm": 16.99158477783203, + "learning_rate": 6.136960907544189e-06, + "loss": 4.8749, + "step": 84005 + }, + { + "epoch": 1.7091878255208335, + "grad_norm": 19.56524085998535, + "learning_rate": 6.1365716967071044e-06, + "loss": 4.8852, + "step": 84010 + }, + { + "epoch": 1.70928955078125, + "grad_norm": 22.666358947753906, + "learning_rate": 6.136182478607686e-06, + "loss": 5.0338, + "step": 84015 + }, + { + "epoch": 1.7093912760416665, + "grad_norm": 16.320377349853516, + "learning_rate": 6.135793253248425e-06, + "loss": 4.8871, + "step": 84020 + }, + { + "epoch": 1.7094930013020835, + "grad_norm": 21.298484802246094, + "learning_rate": 6.135404020631806e-06, + "loss": 4.9395, + "step": 84025 + }, + { + "epoch": 1.7095947265625, + "grad_norm": 20.151365280151367, + "learning_rate": 6.135014780760312e-06, + "loss": 4.7411, + "step": 84030 + }, + { + "epoch": 1.7096964518229165, + "grad_norm": 15.312362670898438, + "learning_rate": 6.134625533636437e-06, + "loss": 4.8051, + "step": 84035 + }, + { + "epoch": 1.7097981770833335, + "grad_norm": 15.336492538452148, + "learning_rate": 6.1342362792626646e-06, + "loss": 4.8536, + "step": 84040 + }, + { + "epoch": 1.70989990234375, + "grad_norm": 21.06063461303711, + "learning_rate": 6.133847017641481e-06, + "loss": 4.9255, + "step": 84045 + }, + { + "epoch": 1.7100016276041665, + "grad_norm": 19.015775680541992, + "learning_rate": 6.133457748775375e-06, + "loss": 5.0196, + "step": 84050 + }, + { + "epoch": 1.7101033528645835, + "grad_norm": 16.407567977905273, + "learning_rate": 6.133068472666836e-06, + "loss": 5.116, + "step": 84055 + }, + { + "epoch": 1.710205078125, + "grad_norm": 19.782588958740234, + "learning_rate": 6.132679189318347e-06, + "loss": 5.2476, + "step": 84060 + }, + { + "epoch": 1.7103068033854165, + "grad_norm": 20.400182723999023, + "learning_rate": 6.132289898732399e-06, + "loss": 4.9647, + "step": 84065 + }, + { + "epoch": 1.7104085286458335, + "grad_norm": 14.872151374816895, + "learning_rate": 6.131900600911477e-06, + "loss": 4.7699, + "step": 84070 + }, + { + "epoch": 1.71051025390625, + "grad_norm": 15.574953079223633, + "learning_rate": 6.13151129585807e-06, + "loss": 4.9158, + "step": 84075 + }, + { + "epoch": 1.7106119791666665, + "grad_norm": 25.264081954956055, + "learning_rate": 6.131121983574664e-06, + "loss": 5.0875, + "step": 84080 + }, + { + "epoch": 1.7107137044270835, + "grad_norm": 20.24286651611328, + "learning_rate": 6.130732664063749e-06, + "loss": 4.614, + "step": 84085 + }, + { + "epoch": 1.7108154296875, + "grad_norm": 14.775066375732422, + "learning_rate": 6.13034333732781e-06, + "loss": 5.0173, + "step": 84090 + }, + { + "epoch": 1.7109171549479165, + "grad_norm": 16.215595245361328, + "learning_rate": 6.129954003369338e-06, + "loss": 4.5971, + "step": 84095 + }, + { + "epoch": 1.7110188802083335, + "grad_norm": 18.45758056640625, + "learning_rate": 6.129564662190817e-06, + "loss": 4.6924, + "step": 84100 + }, + { + "epoch": 1.71112060546875, + "grad_norm": 23.16010284423828, + "learning_rate": 6.129175313794735e-06, + "loss": 5.0379, + "step": 84105 + }, + { + "epoch": 1.7112223307291665, + "grad_norm": 16.086931228637695, + "learning_rate": 6.128785958183583e-06, + "loss": 4.9035, + "step": 84110 + }, + { + "epoch": 1.7113240559895835, + "grad_norm": 29.783845901489258, + "learning_rate": 6.128396595359848e-06, + "loss": 4.9372, + "step": 84115 + }, + { + "epoch": 1.71142578125, + "grad_norm": 21.74406623840332, + "learning_rate": 6.128007225326014e-06, + "loss": 5.1896, + "step": 84120 + }, + { + "epoch": 1.7115275065104165, + "grad_norm": 17.21973419189453, + "learning_rate": 6.127617848084575e-06, + "loss": 5.152, + "step": 84125 + }, + { + "epoch": 1.7116292317708335, + "grad_norm": 15.611642837524414, + "learning_rate": 6.127228463638014e-06, + "loss": 4.8618, + "step": 84130 + }, + { + "epoch": 1.71173095703125, + "grad_norm": 22.202978134155273, + "learning_rate": 6.1268390719888215e-06, + "loss": 4.846, + "step": 84135 + }, + { + "epoch": 1.7118326822916665, + "grad_norm": 16.2452335357666, + "learning_rate": 6.126449673139485e-06, + "loss": 5.0305, + "step": 84140 + }, + { + "epoch": 1.7119344075520835, + "grad_norm": 15.731160163879395, + "learning_rate": 6.126060267092493e-06, + "loss": 5.0402, + "step": 84145 + }, + { + "epoch": 1.7120361328125, + "grad_norm": 21.751399993896484, + "learning_rate": 6.125670853850333e-06, + "loss": 4.6331, + "step": 84150 + }, + { + "epoch": 1.7121378580729165, + "grad_norm": 20.68482780456543, + "learning_rate": 6.125281433415493e-06, + "loss": 4.8406, + "step": 84155 + }, + { + "epoch": 1.7122395833333335, + "grad_norm": 22.751220703125, + "learning_rate": 6.124892005790462e-06, + "loss": 5.1422, + "step": 84160 + }, + { + "epoch": 1.71234130859375, + "grad_norm": 19.200223922729492, + "learning_rate": 6.12450257097773e-06, + "loss": 4.7995, + "step": 84165 + }, + { + "epoch": 1.7124430338541665, + "grad_norm": 18.076303482055664, + "learning_rate": 6.124113128979781e-06, + "loss": 4.8575, + "step": 84170 + }, + { + "epoch": 1.7125447591145835, + "grad_norm": 17.227766036987305, + "learning_rate": 6.123723679799108e-06, + "loss": 4.8385, + "step": 84175 + }, + { + "epoch": 1.712646484375, + "grad_norm": 15.617029190063477, + "learning_rate": 6.123334223438197e-06, + "loss": 4.7926, + "step": 84180 + }, + { + "epoch": 1.7127482096354165, + "grad_norm": 17.774303436279297, + "learning_rate": 6.122944759899536e-06, + "loss": 4.906, + "step": 84185 + }, + { + "epoch": 1.7128499348958335, + "grad_norm": 18.155357360839844, + "learning_rate": 6.122555289185614e-06, + "loss": 4.7878, + "step": 84190 + }, + { + "epoch": 1.71295166015625, + "grad_norm": 20.00084686279297, + "learning_rate": 6.1221658112989215e-06, + "loss": 5.093, + "step": 84195 + }, + { + "epoch": 1.7130533854166665, + "grad_norm": 16.50215721130371, + "learning_rate": 6.121776326241945e-06, + "loss": 5.0496, + "step": 84200 + }, + { + "epoch": 1.7131551106770835, + "grad_norm": 18.521495819091797, + "learning_rate": 6.121386834017174e-06, + "loss": 5.1444, + "step": 84205 + }, + { + "epoch": 1.7132568359375, + "grad_norm": 15.462236404418945, + "learning_rate": 6.120997334627097e-06, + "loss": 4.9554, + "step": 84210 + }, + { + "epoch": 1.7133585611979165, + "grad_norm": 17.546493530273438, + "learning_rate": 6.120607828074203e-06, + "loss": 4.9199, + "step": 84215 + }, + { + "epoch": 1.7134602864583335, + "grad_norm": 17.49128532409668, + "learning_rate": 6.1202183143609805e-06, + "loss": 4.994, + "step": 84220 + }, + { + "epoch": 1.71356201171875, + "grad_norm": 17.91905975341797, + "learning_rate": 6.11982879348992e-06, + "loss": 4.9293, + "step": 84225 + }, + { + "epoch": 1.7136637369791665, + "grad_norm": 33.437774658203125, + "learning_rate": 6.119439265463507e-06, + "loss": 5.0352, + "step": 84230 + }, + { + "epoch": 1.7137654622395835, + "grad_norm": 18.56374740600586, + "learning_rate": 6.119049730284232e-06, + "loss": 4.9534, + "step": 84235 + }, + { + "epoch": 1.7138671875, + "grad_norm": 17.568016052246094, + "learning_rate": 6.118660187954587e-06, + "loss": 5.0017, + "step": 84240 + }, + { + "epoch": 1.7139689127604165, + "grad_norm": 17.350004196166992, + "learning_rate": 6.118270638477057e-06, + "loss": 5.0841, + "step": 84245 + }, + { + "epoch": 1.7140706380208335, + "grad_norm": 17.01819610595703, + "learning_rate": 6.1178810818541326e-06, + "loss": 4.8, + "step": 84250 + }, + { + "epoch": 1.71417236328125, + "grad_norm": 16.79500389099121, + "learning_rate": 6.117491518088303e-06, + "loss": 4.6565, + "step": 84255 + }, + { + "epoch": 1.7142740885416665, + "grad_norm": 30.21030616760254, + "learning_rate": 6.117101947182058e-06, + "loss": 5.1628, + "step": 84260 + }, + { + "epoch": 1.7143758138020835, + "grad_norm": 17.439903259277344, + "learning_rate": 6.116712369137886e-06, + "loss": 4.8994, + "step": 84265 + }, + { + "epoch": 1.7144775390625, + "grad_norm": 19.57935333251953, + "learning_rate": 6.116322783958276e-06, + "loss": 4.8984, + "step": 84270 + }, + { + "epoch": 1.7145792643229165, + "grad_norm": 22.513885498046875, + "learning_rate": 6.115933191645718e-06, + "loss": 4.914, + "step": 84275 + }, + { + "epoch": 1.7146809895833335, + "grad_norm": 14.196208953857422, + "learning_rate": 6.115543592202701e-06, + "loss": 4.8542, + "step": 84280 + }, + { + "epoch": 1.71478271484375, + "grad_norm": 19.582111358642578, + "learning_rate": 6.115153985631714e-06, + "loss": 4.6673, + "step": 84285 + }, + { + "epoch": 1.7148844401041665, + "grad_norm": 19.002151489257812, + "learning_rate": 6.114764371935248e-06, + "loss": 5.0528, + "step": 84290 + }, + { + "epoch": 1.7149861653645835, + "grad_norm": 17.34467887878418, + "learning_rate": 6.114374751115792e-06, + "loss": 5.0199, + "step": 84295 + }, + { + "epoch": 1.715087890625, + "grad_norm": 19.860273361206055, + "learning_rate": 6.113985123175833e-06, + "loss": 4.8259, + "step": 84300 + }, + { + "epoch": 1.7151896158854165, + "grad_norm": 18.18375015258789, + "learning_rate": 6.113595488117865e-06, + "loss": 4.905, + "step": 84305 + }, + { + "epoch": 1.7152913411458335, + "grad_norm": 13.919901847839355, + "learning_rate": 6.113205845944373e-06, + "loss": 5.2072, + "step": 84310 + }, + { + "epoch": 1.71539306640625, + "grad_norm": 21.959686279296875, + "learning_rate": 6.1128161966578516e-06, + "loss": 4.8032, + "step": 84315 + }, + { + "epoch": 1.7154947916666665, + "grad_norm": 15.49593448638916, + "learning_rate": 6.112426540260786e-06, + "loss": 4.9147, + "step": 84320 + }, + { + "epoch": 1.7155965169270835, + "grad_norm": 18.861101150512695, + "learning_rate": 6.112036876755668e-06, + "loss": 4.6356, + "step": 84325 + }, + { + "epoch": 1.7156982421875, + "grad_norm": 18.353124618530273, + "learning_rate": 6.111647206144988e-06, + "loss": 5.261, + "step": 84330 + }, + { + "epoch": 1.7157999674479165, + "grad_norm": 17.185958862304688, + "learning_rate": 6.111257528431236e-06, + "loss": 5.0876, + "step": 84335 + }, + { + "epoch": 1.7159016927083335, + "grad_norm": 18.390369415283203, + "learning_rate": 6.110867843616899e-06, + "loss": 4.8895, + "step": 84340 + }, + { + "epoch": 1.71600341796875, + "grad_norm": 18.277124404907227, + "learning_rate": 6.110478151704471e-06, + "loss": 4.7712, + "step": 84345 + }, + { + "epoch": 1.7161051432291665, + "grad_norm": 21.518896102905273, + "learning_rate": 6.110088452696439e-06, + "loss": 4.8808, + "step": 84350 + }, + { + "epoch": 1.7162068684895835, + "grad_norm": 23.092674255371094, + "learning_rate": 6.109698746595295e-06, + "loss": 4.9966, + "step": 84355 + }, + { + "epoch": 1.71630859375, + "grad_norm": 16.636157989501953, + "learning_rate": 6.109309033403528e-06, + "loss": 4.5325, + "step": 84360 + }, + { + "epoch": 1.7164103190104165, + "grad_norm": 18.70769500732422, + "learning_rate": 6.108919313123629e-06, + "loss": 5.0488, + "step": 84365 + }, + { + "epoch": 1.7165120442708335, + "grad_norm": 15.63332748413086, + "learning_rate": 6.108529585758087e-06, + "loss": 4.8251, + "step": 84370 + }, + { + "epoch": 1.71661376953125, + "grad_norm": 22.400299072265625, + "learning_rate": 6.108139851309392e-06, + "loss": 4.9562, + "step": 84375 + }, + { + "epoch": 1.7167154947916665, + "grad_norm": 15.598390579223633, + "learning_rate": 6.107750109780037e-06, + "loss": 4.8599, + "step": 84380 + }, + { + "epoch": 1.7168172200520835, + "grad_norm": 20.05901336669922, + "learning_rate": 6.10736036117251e-06, + "loss": 5.2229, + "step": 84385 + }, + { + "epoch": 1.7169189453125, + "grad_norm": 18.160160064697266, + "learning_rate": 6.1069706054893006e-06, + "loss": 4.6952, + "step": 84390 + }, + { + "epoch": 1.7170206705729165, + "grad_norm": 23.010000228881836, + "learning_rate": 6.106580842732902e-06, + "loss": 4.8547, + "step": 84395 + }, + { + "epoch": 1.7171223958333335, + "grad_norm": 17.382102966308594, + "learning_rate": 6.1061910729058025e-06, + "loss": 4.935, + "step": 84400 + }, + { + "epoch": 1.71722412109375, + "grad_norm": 20.80740737915039, + "learning_rate": 6.105801296010492e-06, + "loss": 4.8223, + "step": 84405 + }, + { + "epoch": 1.7173258463541665, + "grad_norm": 18.006166458129883, + "learning_rate": 6.105411512049464e-06, + "loss": 4.6462, + "step": 84410 + }, + { + "epoch": 1.7174275716145835, + "grad_norm": 22.000532150268555, + "learning_rate": 6.105021721025206e-06, + "loss": 5.155, + "step": 84415 + }, + { + "epoch": 1.717529296875, + "grad_norm": 19.597169876098633, + "learning_rate": 6.10463192294021e-06, + "loss": 4.8207, + "step": 84420 + }, + { + "epoch": 1.7176310221354165, + "grad_norm": 14.711776733398438, + "learning_rate": 6.104242117796968e-06, + "loss": 4.9277, + "step": 84425 + }, + { + "epoch": 1.7177327473958335, + "grad_norm": 16.343427658081055, + "learning_rate": 6.103852305597968e-06, + "loss": 4.7122, + "step": 84430 + }, + { + "epoch": 1.71783447265625, + "grad_norm": 19.815208435058594, + "learning_rate": 6.1034624863457035e-06, + "loss": 4.7698, + "step": 84435 + }, + { + "epoch": 1.7179361979166665, + "grad_norm": 18.680335998535156, + "learning_rate": 6.103072660042663e-06, + "loss": 4.5975, + "step": 84440 + }, + { + "epoch": 1.7180379231770835, + "grad_norm": 15.501835823059082, + "learning_rate": 6.102682826691338e-06, + "loss": 4.9826, + "step": 84445 + }, + { + "epoch": 1.7181396484375, + "grad_norm": 16.034711837768555, + "learning_rate": 6.10229298629422e-06, + "loss": 4.9729, + "step": 84450 + }, + { + "epoch": 1.7182413736979165, + "grad_norm": 19.3919734954834, + "learning_rate": 6.101903138853801e-06, + "loss": 4.627, + "step": 84455 + }, + { + "epoch": 1.7183430989583335, + "grad_norm": 12.888676643371582, + "learning_rate": 6.1015132843725686e-06, + "loss": 4.7836, + "step": 84460 + }, + { + "epoch": 1.71844482421875, + "grad_norm": 24.410348892211914, + "learning_rate": 6.1011234228530185e-06, + "loss": 5.0377, + "step": 84465 + }, + { + "epoch": 1.7185465494791665, + "grad_norm": 21.500524520874023, + "learning_rate": 6.100733554297637e-06, + "loss": 4.9048, + "step": 84470 + }, + { + "epoch": 1.7186482747395835, + "grad_norm": 13.015094757080078, + "learning_rate": 6.100343678708918e-06, + "loss": 5.1555, + "step": 84475 + }, + { + "epoch": 1.71875, + "grad_norm": 16.278427124023438, + "learning_rate": 6.099953796089353e-06, + "loss": 4.8639, + "step": 84480 + }, + { + "epoch": 1.7188517252604165, + "grad_norm": 14.248571395874023, + "learning_rate": 6.099563906441432e-06, + "loss": 4.9672, + "step": 84485 + }, + { + "epoch": 1.7189534505208335, + "grad_norm": 13.43848991394043, + "learning_rate": 6.099174009767647e-06, + "loss": 4.9449, + "step": 84490 + }, + { + "epoch": 1.71905517578125, + "grad_norm": 19.12261199951172, + "learning_rate": 6.098784106070489e-06, + "loss": 4.9616, + "step": 84495 + }, + { + "epoch": 1.7191569010416665, + "grad_norm": 18.079313278198242, + "learning_rate": 6.09839419535245e-06, + "loss": 5.0648, + "step": 84500 + }, + { + "epoch": 1.7192586263020835, + "grad_norm": 32.22506332397461, + "learning_rate": 6.098004277616018e-06, + "loss": 4.7443, + "step": 84505 + }, + { + "epoch": 1.7193603515625, + "grad_norm": 18.195249557495117, + "learning_rate": 6.09761435286369e-06, + "loss": 5.1014, + "step": 84510 + }, + { + "epoch": 1.7194620768229165, + "grad_norm": 18.17646598815918, + "learning_rate": 6.0972244210979545e-06, + "loss": 4.8662, + "step": 84515 + }, + { + "epoch": 1.7195638020833335, + "grad_norm": 16.50071144104004, + "learning_rate": 6.096834482321302e-06, + "loss": 4.6439, + "step": 84520 + }, + { + "epoch": 1.71966552734375, + "grad_norm": 15.08395004272461, + "learning_rate": 6.0964445365362265e-06, + "loss": 5.0242, + "step": 84525 + }, + { + "epoch": 1.7197672526041665, + "grad_norm": 21.05729866027832, + "learning_rate": 6.096054583745218e-06, + "loss": 4.6566, + "step": 84530 + }, + { + "epoch": 1.7198689778645835, + "grad_norm": 17.940298080444336, + "learning_rate": 6.095664623950769e-06, + "loss": 4.9988, + "step": 84535 + }, + { + "epoch": 1.719970703125, + "grad_norm": 19.48215103149414, + "learning_rate": 6.09527465715537e-06, + "loss": 5.2428, + "step": 84540 + }, + { + "epoch": 1.7200724283854165, + "grad_norm": 26.785898208618164, + "learning_rate": 6.094884683361514e-06, + "loss": 4.9354, + "step": 84545 + }, + { + "epoch": 1.7201741536458335, + "grad_norm": 18.050235748291016, + "learning_rate": 6.0944947025716925e-06, + "loss": 4.9127, + "step": 84550 + }, + { + "epoch": 1.72027587890625, + "grad_norm": 16.319936752319336, + "learning_rate": 6.094104714788398e-06, + "loss": 4.843, + "step": 84555 + }, + { + "epoch": 1.7203776041666665, + "grad_norm": 15.674389839172363, + "learning_rate": 6.09371472001412e-06, + "loss": 4.8134, + "step": 84560 + }, + { + "epoch": 1.7204793294270835, + "grad_norm": 19.651763916015625, + "learning_rate": 6.093324718251354e-06, + "loss": 4.9303, + "step": 84565 + }, + { + "epoch": 1.7205810546875, + "grad_norm": 16.68358612060547, + "learning_rate": 6.092934709502588e-06, + "loss": 5.1146, + "step": 84570 + }, + { + "epoch": 1.7206827799479165, + "grad_norm": 18.76952362060547, + "learning_rate": 6.092544693770318e-06, + "loss": 4.774, + "step": 84575 + }, + { + "epoch": 1.7207845052083335, + "grad_norm": 19.13603401184082, + "learning_rate": 6.092154671057034e-06, + "loss": 5.082, + "step": 84580 + }, + { + "epoch": 1.72088623046875, + "grad_norm": 18.719823837280273, + "learning_rate": 6.091764641365228e-06, + "loss": 5.0097, + "step": 84585 + }, + { + "epoch": 1.7209879557291665, + "grad_norm": 20.261520385742188, + "learning_rate": 6.091374604697392e-06, + "loss": 4.8128, + "step": 84590 + }, + { + "epoch": 1.7210896809895835, + "grad_norm": 17.189699172973633, + "learning_rate": 6.090984561056019e-06, + "loss": 4.7144, + "step": 84595 + }, + { + "epoch": 1.72119140625, + "grad_norm": 19.107906341552734, + "learning_rate": 6.0905945104436e-06, + "loss": 4.8695, + "step": 84600 + }, + { + "epoch": 1.7212931315104165, + "grad_norm": 17.952682495117188, + "learning_rate": 6.09020445286263e-06, + "loss": 4.8638, + "step": 84605 + }, + { + "epoch": 1.7213948567708335, + "grad_norm": 20.08997344970703, + "learning_rate": 6.089814388315599e-06, + "loss": 4.8757, + "step": 84610 + }, + { + "epoch": 1.72149658203125, + "grad_norm": 15.018961906433105, + "learning_rate": 6.089424316805001e-06, + "loss": 4.6574, + "step": 84615 + }, + { + "epoch": 1.7215983072916665, + "grad_norm": 15.829415321350098, + "learning_rate": 6.0890342383333265e-06, + "loss": 4.8443, + "step": 84620 + }, + { + "epoch": 1.7217000325520835, + "grad_norm": 18.010997772216797, + "learning_rate": 6.088644152903068e-06, + "loss": 5.0752, + "step": 84625 + }, + { + "epoch": 1.7218017578125, + "grad_norm": 25.01796531677246, + "learning_rate": 6.088254060516721e-06, + "loss": 5.1781, + "step": 84630 + }, + { + "epoch": 1.7219034830729165, + "grad_norm": 17.59847068786621, + "learning_rate": 6.087863961176774e-06, + "loss": 4.602, + "step": 84635 + }, + { + "epoch": 1.7220052083333335, + "grad_norm": 17.722431182861328, + "learning_rate": 6.087473854885724e-06, + "loss": 4.8972, + "step": 84640 + }, + { + "epoch": 1.72210693359375, + "grad_norm": 17.805051803588867, + "learning_rate": 6.087083741646059e-06, + "loss": 4.9395, + "step": 84645 + }, + { + "epoch": 1.7222086588541665, + "grad_norm": 17.0727481842041, + "learning_rate": 6.0866936214602745e-06, + "loss": 4.6489, + "step": 84650 + }, + { + "epoch": 1.7223103841145835, + "grad_norm": 14.385614395141602, + "learning_rate": 6.086303494330863e-06, + "loss": 4.8691, + "step": 84655 + }, + { + "epoch": 1.722412109375, + "grad_norm": 13.615102767944336, + "learning_rate": 6.085913360260317e-06, + "loss": 4.769, + "step": 84660 + }, + { + "epoch": 1.7225138346354165, + "grad_norm": 15.58966064453125, + "learning_rate": 6.085523219251129e-06, + "loss": 4.8531, + "step": 84665 + }, + { + "epoch": 1.7226155598958335, + "grad_norm": 17.379138946533203, + "learning_rate": 6.085133071305793e-06, + "loss": 4.846, + "step": 84670 + }, + { + "epoch": 1.72271728515625, + "grad_norm": 22.464468002319336, + "learning_rate": 6.084742916426801e-06, + "loss": 4.9344, + "step": 84675 + }, + { + "epoch": 1.7228190104166665, + "grad_norm": 15.591736793518066, + "learning_rate": 6.084352754616646e-06, + "loss": 4.6677, + "step": 84680 + }, + { + "epoch": 1.7229207356770835, + "grad_norm": 14.368048667907715, + "learning_rate": 6.0839625858778216e-06, + "loss": 4.9181, + "step": 84685 + }, + { + "epoch": 1.7230224609375, + "grad_norm": 18.039962768554688, + "learning_rate": 6.083572410212819e-06, + "loss": 4.6977, + "step": 84690 + }, + { + "epoch": 1.7231241861979165, + "grad_norm": 21.399808883666992, + "learning_rate": 6.083182227624133e-06, + "loss": 5.2081, + "step": 84695 + }, + { + "epoch": 1.7232259114583335, + "grad_norm": 18.966800689697266, + "learning_rate": 6.082792038114257e-06, + "loss": 5.1627, + "step": 84700 + }, + { + "epoch": 1.72332763671875, + "grad_norm": 24.8486328125, + "learning_rate": 6.082401841685683e-06, + "loss": 4.8853, + "step": 84705 + }, + { + "epoch": 1.7234293619791665, + "grad_norm": 18.166839599609375, + "learning_rate": 6.082011638340907e-06, + "loss": 4.8482, + "step": 84710 + }, + { + "epoch": 1.7235310872395835, + "grad_norm": 17.494056701660156, + "learning_rate": 6.081621428082417e-06, + "loss": 4.982, + "step": 84715 + }, + { + "epoch": 1.7236328125, + "grad_norm": 18.286867141723633, + "learning_rate": 6.08123121091271e-06, + "loss": 4.7066, + "step": 84720 + }, + { + "epoch": 1.7237345377604165, + "grad_norm": 18.41255760192871, + "learning_rate": 6.080840986834281e-06, + "loss": 5.0422, + "step": 84725 + }, + { + "epoch": 1.7238362630208335, + "grad_norm": 16.275615692138672, + "learning_rate": 6.080450755849619e-06, + "loss": 4.8763, + "step": 84730 + }, + { + "epoch": 1.72393798828125, + "grad_norm": 18.337247848510742, + "learning_rate": 6.08006051796122e-06, + "loss": 5.2246, + "step": 84735 + }, + { + "epoch": 1.7240397135416665, + "grad_norm": 16.98293685913086, + "learning_rate": 6.079670273171577e-06, + "loss": 4.8905, + "step": 84740 + }, + { + "epoch": 1.7241414388020835, + "grad_norm": 16.357498168945312, + "learning_rate": 6.079280021483185e-06, + "loss": 4.9942, + "step": 84745 + }, + { + "epoch": 1.7242431640625, + "grad_norm": 19.75905990600586, + "learning_rate": 6.078889762898535e-06, + "loss": 4.7321, + "step": 84750 + }, + { + "epoch": 1.7243448893229165, + "grad_norm": 18.617605209350586, + "learning_rate": 6.078499497420123e-06, + "loss": 4.8247, + "step": 84755 + }, + { + "epoch": 1.7244466145833335, + "grad_norm": 21.505414962768555, + "learning_rate": 6.078109225050441e-06, + "loss": 4.8602, + "step": 84760 + }, + { + "epoch": 1.72454833984375, + "grad_norm": 19.996278762817383, + "learning_rate": 6.0777189457919824e-06, + "loss": 5.0679, + "step": 84765 + }, + { + "epoch": 1.7246500651041665, + "grad_norm": 20.564125061035156, + "learning_rate": 6.077328659647243e-06, + "loss": 4.8711, + "step": 84770 + }, + { + "epoch": 1.7247517903645835, + "grad_norm": 15.384822845458984, + "learning_rate": 6.076938366618717e-06, + "loss": 4.8094, + "step": 84775 + }, + { + "epoch": 1.724853515625, + "grad_norm": 15.410236358642578, + "learning_rate": 6.0765480667088936e-06, + "loss": 4.8015, + "step": 84780 + }, + { + "epoch": 1.7249552408854165, + "grad_norm": 19.785594940185547, + "learning_rate": 6.076157759920272e-06, + "loss": 4.9885, + "step": 84785 + }, + { + "epoch": 1.7250569661458335, + "grad_norm": 19.959985733032227, + "learning_rate": 6.075767446255342e-06, + "loss": 4.6923, + "step": 84790 + }, + { + "epoch": 1.72515869140625, + "grad_norm": 15.990626335144043, + "learning_rate": 6.075377125716602e-06, + "loss": 4.969, + "step": 84795 + }, + { + "epoch": 1.7252604166666665, + "grad_norm": 15.358285903930664, + "learning_rate": 6.074986798306541e-06, + "loss": 4.7918, + "step": 84800 + }, + { + "epoch": 1.7253621419270835, + "grad_norm": 19.443593978881836, + "learning_rate": 6.074596464027657e-06, + "loss": 4.7617, + "step": 84805 + }, + { + "epoch": 1.7254638671875, + "grad_norm": 14.538213729858398, + "learning_rate": 6.0742061228824425e-06, + "loss": 4.8515, + "step": 84810 + }, + { + "epoch": 1.7255655924479165, + "grad_norm": 24.59913444519043, + "learning_rate": 6.073815774873392e-06, + "loss": 4.5711, + "step": 84815 + }, + { + "epoch": 1.7256673177083335, + "grad_norm": 16.773239135742188, + "learning_rate": 6.073425420002998e-06, + "loss": 4.7335, + "step": 84820 + }, + { + "epoch": 1.72576904296875, + "grad_norm": 17.9732608795166, + "learning_rate": 6.073035058273759e-06, + "loss": 4.8958, + "step": 84825 + }, + { + "epoch": 1.7258707682291665, + "grad_norm": 15.076618194580078, + "learning_rate": 6.072644689688165e-06, + "loss": 5.062, + "step": 84830 + }, + { + "epoch": 1.7259724934895835, + "grad_norm": 15.956180572509766, + "learning_rate": 6.072254314248712e-06, + "loss": 4.8857, + "step": 84835 + }, + { + "epoch": 1.72607421875, + "grad_norm": 15.944602012634277, + "learning_rate": 6.071863931957895e-06, + "loss": 4.569, + "step": 84840 + }, + { + "epoch": 1.7261759440104165, + "grad_norm": 16.855783462524414, + "learning_rate": 6.071473542818207e-06, + "loss": 4.8599, + "step": 84845 + }, + { + "epoch": 1.7262776692708335, + "grad_norm": 13.58051586151123, + "learning_rate": 6.071083146832144e-06, + "loss": 4.7137, + "step": 84850 + }, + { + "epoch": 1.72637939453125, + "grad_norm": 16.020742416381836, + "learning_rate": 6.070692744002198e-06, + "loss": 4.7358, + "step": 84855 + }, + { + "epoch": 1.7264811197916665, + "grad_norm": 20.81098747253418, + "learning_rate": 6.070302334330867e-06, + "loss": 5.1578, + "step": 84860 + }, + { + "epoch": 1.7265828450520835, + "grad_norm": 15.46779727935791, + "learning_rate": 6.069911917820644e-06, + "loss": 4.9263, + "step": 84865 + }, + { + "epoch": 1.7266845703125, + "grad_norm": 15.902974128723145, + "learning_rate": 6.069521494474021e-06, + "loss": 4.8004, + "step": 84870 + }, + { + "epoch": 1.7267862955729165, + "grad_norm": 16.77931022644043, + "learning_rate": 6.069131064293498e-06, + "loss": 4.7967, + "step": 84875 + }, + { + "epoch": 1.7268880208333335, + "grad_norm": 13.636069297790527, + "learning_rate": 6.068740627281564e-06, + "loss": 4.9896, + "step": 84880 + }, + { + "epoch": 1.72698974609375, + "grad_norm": 22.602231979370117, + "learning_rate": 6.0683501834407185e-06, + "loss": 5.3911, + "step": 84885 + }, + { + "epoch": 1.7270914713541665, + "grad_norm": 15.889906883239746, + "learning_rate": 6.0679597327734565e-06, + "loss": 4.9806, + "step": 84890 + }, + { + "epoch": 1.7271931966145835, + "grad_norm": 16.411306381225586, + "learning_rate": 6.0675692752822676e-06, + "loss": 4.6298, + "step": 84895 + }, + { + "epoch": 1.727294921875, + "grad_norm": 19.650033950805664, + "learning_rate": 6.067178810969651e-06, + "loss": 4.8762, + "step": 84900 + }, + { + "epoch": 1.7273966471354165, + "grad_norm": 15.963469505310059, + "learning_rate": 6.066788339838101e-06, + "loss": 4.7798, + "step": 84905 + }, + { + "epoch": 1.7274983723958335, + "grad_norm": 18.286148071289062, + "learning_rate": 6.066397861890111e-06, + "loss": 4.8381, + "step": 84910 + }, + { + "epoch": 1.72760009765625, + "grad_norm": 13.588241577148438, + "learning_rate": 6.066007377128179e-06, + "loss": 4.8421, + "step": 84915 + }, + { + "epoch": 1.7277018229166665, + "grad_norm": 16.333528518676758, + "learning_rate": 6.065616885554796e-06, + "loss": 4.9693, + "step": 84920 + }, + { + "epoch": 1.7278035481770835, + "grad_norm": 17.899263381958008, + "learning_rate": 6.065226387172461e-06, + "loss": 4.7649, + "step": 84925 + }, + { + "epoch": 1.7279052734375, + "grad_norm": 18.089628219604492, + "learning_rate": 6.064835881983669e-06, + "loss": 4.7564, + "step": 84930 + }, + { + "epoch": 1.7280069986979165, + "grad_norm": 21.768383026123047, + "learning_rate": 6.0644453699909104e-06, + "loss": 4.7882, + "step": 84935 + }, + { + "epoch": 1.7281087239583335, + "grad_norm": 16.487022399902344, + "learning_rate": 6.064054851196686e-06, + "loss": 5.0498, + "step": 84940 + }, + { + "epoch": 1.72821044921875, + "grad_norm": 20.565082550048828, + "learning_rate": 6.0636643256034866e-06, + "loss": 5.1841, + "step": 84945 + }, + { + "epoch": 1.7283121744791665, + "grad_norm": 16.29314613342285, + "learning_rate": 6.063273793213812e-06, + "loss": 4.8739, + "step": 84950 + }, + { + "epoch": 1.7284138997395835, + "grad_norm": 22.062725067138672, + "learning_rate": 6.062883254030155e-06, + "loss": 4.9225, + "step": 84955 + }, + { + "epoch": 1.728515625, + "grad_norm": 17.188600540161133, + "learning_rate": 6.062492708055011e-06, + "loss": 4.9755, + "step": 84960 + }, + { + "epoch": 1.7286173502604165, + "grad_norm": 21.54253578186035, + "learning_rate": 6.0621021552908764e-06, + "loss": 5.0574, + "step": 84965 + }, + { + "epoch": 1.7287190755208335, + "grad_norm": 17.35780143737793, + "learning_rate": 6.061711595740245e-06, + "loss": 4.7943, + "step": 84970 + }, + { + "epoch": 1.72882080078125, + "grad_norm": 15.329303741455078, + "learning_rate": 6.061321029405614e-06, + "loss": 4.8555, + "step": 84975 + }, + { + "epoch": 1.7289225260416665, + "grad_norm": 19.959571838378906, + "learning_rate": 6.06093045628948e-06, + "loss": 5.0589, + "step": 84980 + }, + { + "epoch": 1.7290242513020835, + "grad_norm": 19.096105575561523, + "learning_rate": 6.060539876394335e-06, + "loss": 4.6325, + "step": 84985 + }, + { + "epoch": 1.7291259765625, + "grad_norm": 21.181045532226562, + "learning_rate": 6.0601492897226785e-06, + "loss": 5.0686, + "step": 84990 + }, + { + "epoch": 1.7292277018229165, + "grad_norm": 15.539607048034668, + "learning_rate": 6.059758696277003e-06, + "loss": 5.0927, + "step": 84995 + }, + { + "epoch": 1.7293294270833335, + "grad_norm": 18.198001861572266, + "learning_rate": 6.059368096059807e-06, + "loss": 5.0267, + "step": 85000 + }, + { + "epoch": 1.72943115234375, + "grad_norm": 19.106473922729492, + "learning_rate": 6.058977489073585e-06, + "loss": 4.9028, + "step": 85005 + }, + { + "epoch": 1.7295328776041665, + "grad_norm": 19.081846237182617, + "learning_rate": 6.058586875320832e-06, + "loss": 5.0277, + "step": 85010 + }, + { + "epoch": 1.7296346028645835, + "grad_norm": 22.829853057861328, + "learning_rate": 6.058196254804046e-06, + "loss": 4.8008, + "step": 85015 + }, + { + "epoch": 1.729736328125, + "grad_norm": 20.29962921142578, + "learning_rate": 6.057805627525722e-06, + "loss": 5.1079, + "step": 85020 + }, + { + "epoch": 1.7298380533854165, + "grad_norm": 16.90874481201172, + "learning_rate": 6.0574149934883555e-06, + "loss": 5.0116, + "step": 85025 + }, + { + "epoch": 1.7299397786458335, + "grad_norm": 21.02300453186035, + "learning_rate": 6.057024352694444e-06, + "loss": 4.7839, + "step": 85030 + }, + { + "epoch": 1.73004150390625, + "grad_norm": 22.295711517333984, + "learning_rate": 6.0566337051464795e-06, + "loss": 4.9081, + "step": 85035 + }, + { + "epoch": 1.7301432291666665, + "grad_norm": 21.163288116455078, + "learning_rate": 6.056243050846964e-06, + "loss": 5.0085, + "step": 85040 + }, + { + "epoch": 1.7302449544270835, + "grad_norm": 16.466398239135742, + "learning_rate": 6.05585238979839e-06, + "loss": 5.0007, + "step": 85045 + }, + { + "epoch": 1.7303466796875, + "grad_norm": 20.860477447509766, + "learning_rate": 6.055461722003253e-06, + "loss": 5.0289, + "step": 85050 + }, + { + "epoch": 1.7304484049479165, + "grad_norm": 14.64479923248291, + "learning_rate": 6.0550710474640525e-06, + "loss": 4.9756, + "step": 85055 + }, + { + "epoch": 1.7305501302083335, + "grad_norm": 20.36655616760254, + "learning_rate": 6.054680366183282e-06, + "loss": 5.0675, + "step": 85060 + }, + { + "epoch": 1.73065185546875, + "grad_norm": 17.49422264099121, + "learning_rate": 6.054289678163438e-06, + "loss": 4.8173, + "step": 85065 + }, + { + "epoch": 1.7307535807291665, + "grad_norm": 17.32754898071289, + "learning_rate": 6.05389898340702e-06, + "loss": 4.9231, + "step": 85070 + }, + { + "epoch": 1.7308553059895835, + "grad_norm": 15.764788627624512, + "learning_rate": 6.053508281916519e-06, + "loss": 5.0114, + "step": 85075 + }, + { + "epoch": 1.73095703125, + "grad_norm": 23.698244094848633, + "learning_rate": 6.053117573694437e-06, + "loss": 5.0402, + "step": 85080 + }, + { + "epoch": 1.7310587565104165, + "grad_norm": 17.361448287963867, + "learning_rate": 6.0527268587432675e-06, + "loss": 4.8758, + "step": 85085 + }, + { + "epoch": 1.7311604817708335, + "grad_norm": 18.400779724121094, + "learning_rate": 6.052336137065507e-06, + "loss": 4.8909, + "step": 85090 + }, + { + "epoch": 1.73126220703125, + "grad_norm": 16.533052444458008, + "learning_rate": 6.051945408663652e-06, + "loss": 4.9416, + "step": 85095 + }, + { + "epoch": 1.7313639322916665, + "grad_norm": 19.317955017089844, + "learning_rate": 6.0515546735401995e-06, + "loss": 4.9441, + "step": 85100 + }, + { + "epoch": 1.7314656575520835, + "grad_norm": 27.853900909423828, + "learning_rate": 6.051163931697649e-06, + "loss": 5.0709, + "step": 85105 + }, + { + "epoch": 1.7315673828125, + "grad_norm": 18.64720916748047, + "learning_rate": 6.050773183138493e-06, + "loss": 4.8097, + "step": 85110 + }, + { + "epoch": 1.7316691080729165, + "grad_norm": 16.910484313964844, + "learning_rate": 6.050382427865229e-06, + "loss": 4.9783, + "step": 85115 + }, + { + "epoch": 1.7317708333333335, + "grad_norm": 22.26827049255371, + "learning_rate": 6.049991665880357e-06, + "loss": 4.6476, + "step": 85120 + }, + { + "epoch": 1.73187255859375, + "grad_norm": 19.24021339416504, + "learning_rate": 6.04960089718637e-06, + "loss": 4.803, + "step": 85125 + }, + { + "epoch": 1.7319742838541665, + "grad_norm": 17.597736358642578, + "learning_rate": 6.0492101217857664e-06, + "loss": 4.9212, + "step": 85130 + }, + { + "epoch": 1.7320760091145835, + "grad_norm": 17.072988510131836, + "learning_rate": 6.048819339681042e-06, + "loss": 4.8232, + "step": 85135 + }, + { + "epoch": 1.732177734375, + "grad_norm": 15.686797142028809, + "learning_rate": 6.048428550874696e-06, + "loss": 4.9869, + "step": 85140 + }, + { + "epoch": 1.7322794596354165, + "grad_norm": 20.80129623413086, + "learning_rate": 6.048037755369226e-06, + "loss": 5.1341, + "step": 85145 + }, + { + "epoch": 1.7323811848958335, + "grad_norm": 18.20208168029785, + "learning_rate": 6.047646953167127e-06, + "loss": 5.0551, + "step": 85150 + }, + { + "epoch": 1.73248291015625, + "grad_norm": 16.773216247558594, + "learning_rate": 6.047256144270895e-06, + "loss": 4.8647, + "step": 85155 + }, + { + "epoch": 1.7325846354166665, + "grad_norm": 13.399576187133789, + "learning_rate": 6.0468653286830305e-06, + "loss": 5.0028, + "step": 85160 + }, + { + "epoch": 1.7326863606770835, + "grad_norm": 20.311290740966797, + "learning_rate": 6.046474506406027e-06, + "loss": 4.6705, + "step": 85165 + }, + { + "epoch": 1.7327880859375, + "grad_norm": 22.66021728515625, + "learning_rate": 6.0460836774423845e-06, + "loss": 4.79, + "step": 85170 + }, + { + "epoch": 1.7328898111979165, + "grad_norm": 15.277825355529785, + "learning_rate": 6.0456928417946e-06, + "loss": 4.9082, + "step": 85175 + }, + { + "epoch": 1.7329915364583335, + "grad_norm": 15.674276351928711, + "learning_rate": 6.045301999465168e-06, + "loss": 4.7705, + "step": 85180 + }, + { + "epoch": 1.73309326171875, + "grad_norm": 16.51247215270996, + "learning_rate": 6.04491115045659e-06, + "loss": 4.7705, + "step": 85185 + }, + { + "epoch": 1.7331949869791665, + "grad_norm": 17.50090789794922, + "learning_rate": 6.044520294771361e-06, + "loss": 5.1419, + "step": 85190 + }, + { + "epoch": 1.7332967122395835, + "grad_norm": 13.254423141479492, + "learning_rate": 6.044129432411979e-06, + "loss": 5.0686, + "step": 85195 + }, + { + "epoch": 1.7333984375, + "grad_norm": 18.348264694213867, + "learning_rate": 6.043738563380942e-06, + "loss": 4.921, + "step": 85200 + }, + { + "epoch": 1.7335001627604165, + "grad_norm": 16.362140655517578, + "learning_rate": 6.043347687680746e-06, + "loss": 4.7347, + "step": 85205 + }, + { + "epoch": 1.7336018880208335, + "grad_norm": 21.832149505615234, + "learning_rate": 6.0429568053138896e-06, + "loss": 4.8228, + "step": 85210 + }, + { + "epoch": 1.73370361328125, + "grad_norm": 15.37947940826416, + "learning_rate": 6.042565916282872e-06, + "loss": 5.0203, + "step": 85215 + }, + { + "epoch": 1.7338053385416665, + "grad_norm": 19.06475067138672, + "learning_rate": 6.042175020590186e-06, + "loss": 4.764, + "step": 85220 + }, + { + "epoch": 1.7339070638020835, + "grad_norm": 20.26571273803711, + "learning_rate": 6.041784118238335e-06, + "loss": 4.5042, + "step": 85225 + }, + { + "epoch": 1.7340087890625, + "grad_norm": 15.622490882873535, + "learning_rate": 6.0413932092298124e-06, + "loss": 5.0631, + "step": 85230 + }, + { + "epoch": 1.7341105143229165, + "grad_norm": 17.64378547668457, + "learning_rate": 6.041002293567119e-06, + "loss": 4.9909, + "step": 85235 + }, + { + "epoch": 1.7342122395833335, + "grad_norm": 14.98162841796875, + "learning_rate": 6.040611371252751e-06, + "loss": 4.8351, + "step": 85240 + }, + { + "epoch": 1.73431396484375, + "grad_norm": 16.73639678955078, + "learning_rate": 6.040220442289206e-06, + "loss": 5.1368, + "step": 85245 + }, + { + "epoch": 1.7344156901041665, + "grad_norm": 18.924715042114258, + "learning_rate": 6.039829506678984e-06, + "loss": 4.8665, + "step": 85250 + }, + { + "epoch": 1.7345174153645835, + "grad_norm": 21.747026443481445, + "learning_rate": 6.03943856442458e-06, + "loss": 5.0784, + "step": 85255 + }, + { + "epoch": 1.734619140625, + "grad_norm": 19.95043182373047, + "learning_rate": 6.039047615528495e-06, + "loss": 5.0349, + "step": 85260 + }, + { + "epoch": 1.7347208658854165, + "grad_norm": 18.509157180786133, + "learning_rate": 6.038656659993224e-06, + "loss": 4.5739, + "step": 85265 + }, + { + "epoch": 1.7348225911458335, + "grad_norm": 17.227088928222656, + "learning_rate": 6.0382656978212686e-06, + "loss": 4.8051, + "step": 85270 + }, + { + "epoch": 1.73492431640625, + "grad_norm": 15.605267524719238, + "learning_rate": 6.037874729015123e-06, + "loss": 4.9637, + "step": 85275 + }, + { + "epoch": 1.7350260416666665, + "grad_norm": 15.332128524780273, + "learning_rate": 6.037483753577289e-06, + "loss": 4.9281, + "step": 85280 + }, + { + "epoch": 1.7351277669270835, + "grad_norm": 25.289493560791016, + "learning_rate": 6.0370927715102614e-06, + "loss": 4.6929, + "step": 85285 + }, + { + "epoch": 1.7352294921875, + "grad_norm": 14.640203475952148, + "learning_rate": 6.036701782816541e-06, + "loss": 4.7663, + "step": 85290 + }, + { + "epoch": 1.7353312174479165, + "grad_norm": 16.89731216430664, + "learning_rate": 6.036310787498625e-06, + "loss": 4.9185, + "step": 85295 + }, + { + "epoch": 1.7354329427083335, + "grad_norm": 20.570512771606445, + "learning_rate": 6.035919785559013e-06, + "loss": 4.7969, + "step": 85300 + }, + { + "epoch": 1.73553466796875, + "grad_norm": 15.741189956665039, + "learning_rate": 6.035528777000202e-06, + "loss": 4.7358, + "step": 85305 + }, + { + "epoch": 1.7356363932291665, + "grad_norm": 17.83517074584961, + "learning_rate": 6.035137761824691e-06, + "loss": 4.8384, + "step": 85310 + }, + { + "epoch": 1.7357381184895835, + "grad_norm": 14.936927795410156, + "learning_rate": 6.034746740034978e-06, + "loss": 4.8639, + "step": 85315 + }, + { + "epoch": 1.73583984375, + "grad_norm": 20.515127182006836, + "learning_rate": 6.034355711633561e-06, + "loss": 5.158, + "step": 85320 + }, + { + "epoch": 1.7359415690104165, + "grad_norm": 23.749149322509766, + "learning_rate": 6.0339646766229406e-06, + "loss": 5.0111, + "step": 85325 + }, + { + "epoch": 1.7360432942708335, + "grad_norm": 17.051389694213867, + "learning_rate": 6.033573635005615e-06, + "loss": 4.8104, + "step": 85330 + }, + { + "epoch": 1.73614501953125, + "grad_norm": 15.420918464660645, + "learning_rate": 6.03318258678408e-06, + "loss": 4.8192, + "step": 85335 + }, + { + "epoch": 1.7362467447916665, + "grad_norm": 20.484725952148438, + "learning_rate": 6.032791531960838e-06, + "loss": 4.7996, + "step": 85340 + }, + { + "epoch": 1.7363484700520835, + "grad_norm": 18.001834869384766, + "learning_rate": 6.0324004705383846e-06, + "loss": 4.7156, + "step": 85345 + }, + { + "epoch": 1.7364501953125, + "grad_norm": 14.854018211364746, + "learning_rate": 6.03200940251922e-06, + "loss": 4.8591, + "step": 85350 + }, + { + "epoch": 1.7365519205729165, + "grad_norm": 18.979875564575195, + "learning_rate": 6.031618327905843e-06, + "loss": 5.0835, + "step": 85355 + }, + { + "epoch": 1.7366536458333335, + "grad_norm": 19.82354736328125, + "learning_rate": 6.031227246700753e-06, + "loss": 4.9843, + "step": 85360 + }, + { + "epoch": 1.73675537109375, + "grad_norm": 16.68768310546875, + "learning_rate": 6.030836158906448e-06, + "loss": 5.3458, + "step": 85365 + }, + { + "epoch": 1.7368570963541665, + "grad_norm": 18.87885093688965, + "learning_rate": 6.030445064525427e-06, + "loss": 4.755, + "step": 85370 + }, + { + "epoch": 1.7369588216145835, + "grad_norm": 18.85016441345215, + "learning_rate": 6.0300539635601895e-06, + "loss": 4.9203, + "step": 85375 + }, + { + "epoch": 1.737060546875, + "grad_norm": 15.27229118347168, + "learning_rate": 6.029662856013234e-06, + "loss": 4.7125, + "step": 85380 + }, + { + "epoch": 1.7371622721354165, + "grad_norm": 35.859066009521484, + "learning_rate": 6.0292717418870585e-06, + "loss": 4.4874, + "step": 85385 + }, + { + "epoch": 1.7372639973958335, + "grad_norm": 17.300325393676758, + "learning_rate": 6.028880621184165e-06, + "loss": 4.9527, + "step": 85390 + }, + { + "epoch": 1.73736572265625, + "grad_norm": 23.909923553466797, + "learning_rate": 6.028489493907049e-06, + "loss": 4.9526, + "step": 85395 + }, + { + "epoch": 1.7374674479166665, + "grad_norm": 15.782244682312012, + "learning_rate": 6.028098360058214e-06, + "loss": 4.7985, + "step": 85400 + }, + { + "epoch": 1.7375691731770835, + "grad_norm": 15.256966590881348, + "learning_rate": 6.027707219640155e-06, + "loss": 4.9234, + "step": 85405 + }, + { + "epoch": 1.7376708984375, + "grad_norm": 16.98236083984375, + "learning_rate": 6.027316072655374e-06, + "loss": 4.7379, + "step": 85410 + }, + { + "epoch": 1.7377726236979165, + "grad_norm": 19.215476989746094, + "learning_rate": 6.02692491910637e-06, + "loss": 4.9688, + "step": 85415 + }, + { + "epoch": 1.7378743489583335, + "grad_norm": 17.93984603881836, + "learning_rate": 6.026533758995641e-06, + "loss": 4.8677, + "step": 85420 + }, + { + "epoch": 1.73797607421875, + "grad_norm": 22.075862884521484, + "learning_rate": 6.026142592325685e-06, + "loss": 4.8107, + "step": 85425 + }, + { + "epoch": 1.7380777994791665, + "grad_norm": 13.052663803100586, + "learning_rate": 6.025751419099007e-06, + "loss": 5.0712, + "step": 85430 + }, + { + "epoch": 1.7381795247395835, + "grad_norm": 18.297107696533203, + "learning_rate": 6.025360239318102e-06, + "loss": 4.9205, + "step": 85435 + }, + { + "epoch": 1.73828125, + "grad_norm": 15.827877044677734, + "learning_rate": 6.02496905298547e-06, + "loss": 5.1351, + "step": 85440 + }, + { + "epoch": 1.7383829752604165, + "grad_norm": 36.47446823120117, + "learning_rate": 6.024577860103612e-06, + "loss": 5.0897, + "step": 85445 + }, + { + "epoch": 1.7384847005208335, + "grad_norm": 18.11457061767578, + "learning_rate": 6.024186660675025e-06, + "loss": 4.6616, + "step": 85450 + }, + { + "epoch": 1.73858642578125, + "grad_norm": 24.239286422729492, + "learning_rate": 6.023795454702213e-06, + "loss": 4.7991, + "step": 85455 + }, + { + "epoch": 1.7386881510416665, + "grad_norm": 15.89610481262207, + "learning_rate": 6.023404242187672e-06, + "loss": 4.8429, + "step": 85460 + }, + { + "epoch": 1.7387898763020835, + "grad_norm": 20.5506591796875, + "learning_rate": 6.023013023133902e-06, + "loss": 4.9391, + "step": 85465 + }, + { + "epoch": 1.7388916015625, + "grad_norm": 19.62325096130371, + "learning_rate": 6.022621797543403e-06, + "loss": 4.8013, + "step": 85470 + }, + { + "epoch": 1.7389933268229165, + "grad_norm": 16.42041778564453, + "learning_rate": 6.022230565418676e-06, + "loss": 4.8609, + "step": 85475 + }, + { + "epoch": 1.7390950520833335, + "grad_norm": 14.246593475341797, + "learning_rate": 6.0218393267622215e-06, + "loss": 4.9167, + "step": 85480 + }, + { + "epoch": 1.73919677734375, + "grad_norm": 15.265469551086426, + "learning_rate": 6.021448081576536e-06, + "loss": 4.8, + "step": 85485 + }, + { + "epoch": 1.7392985026041665, + "grad_norm": 17.990074157714844, + "learning_rate": 6.0210568298641225e-06, + "loss": 4.9042, + "step": 85490 + }, + { + "epoch": 1.7394002278645835, + "grad_norm": 22.2513484954834, + "learning_rate": 6.020665571627481e-06, + "loss": 4.9056, + "step": 85495 + }, + { + "epoch": 1.739501953125, + "grad_norm": 20.518035888671875, + "learning_rate": 6.0202743068691095e-06, + "loss": 5.0624, + "step": 85500 + }, + { + "epoch": 1.7396036783854165, + "grad_norm": 18.694570541381836, + "learning_rate": 6.019883035591509e-06, + "loss": 4.714, + "step": 85505 + }, + { + "epoch": 1.7397054036458335, + "grad_norm": 17.896554946899414, + "learning_rate": 6.01949175779718e-06, + "loss": 4.9764, + "step": 85510 + }, + { + "epoch": 1.73980712890625, + "grad_norm": 15.882288932800293, + "learning_rate": 6.01910047348862e-06, + "loss": 4.811, + "step": 85515 + }, + { + "epoch": 1.7399088541666665, + "grad_norm": 17.95659637451172, + "learning_rate": 6.018709182668335e-06, + "loss": 4.8562, + "step": 85520 + }, + { + "epoch": 1.7400105794270835, + "grad_norm": 14.935317039489746, + "learning_rate": 6.018317885338819e-06, + "loss": 4.8165, + "step": 85525 + }, + { + "epoch": 1.7401123046875, + "grad_norm": 23.090852737426758, + "learning_rate": 6.017926581502577e-06, + "loss": 4.8297, + "step": 85530 + }, + { + "epoch": 1.7402140299479165, + "grad_norm": 16.462800979614258, + "learning_rate": 6.017535271162106e-06, + "loss": 5.1625, + "step": 85535 + }, + { + "epoch": 1.7403157552083335, + "grad_norm": 14.683582305908203, + "learning_rate": 6.017143954319909e-06, + "loss": 4.7565, + "step": 85540 + }, + { + "epoch": 1.74041748046875, + "grad_norm": 21.031091690063477, + "learning_rate": 6.016752630978483e-06, + "loss": 4.916, + "step": 85545 + }, + { + "epoch": 1.7405192057291665, + "grad_norm": 20.52033233642578, + "learning_rate": 6.016361301140332e-06, + "loss": 5.0988, + "step": 85550 + }, + { + "epoch": 1.7406209309895835, + "grad_norm": 19.7735652923584, + "learning_rate": 6.015969964807953e-06, + "loss": 4.864, + "step": 85555 + }, + { + "epoch": 1.74072265625, + "grad_norm": 23.204914093017578, + "learning_rate": 6.015578621983851e-06, + "loss": 4.8948, + "step": 85560 + }, + { + "epoch": 1.7408243815104165, + "grad_norm": 17.12640953063965, + "learning_rate": 6.0151872726705215e-06, + "loss": 4.9268, + "step": 85565 + }, + { + "epoch": 1.7409261067708335, + "grad_norm": 20.374048233032227, + "learning_rate": 6.014795916870469e-06, + "loss": 4.9002, + "step": 85570 + }, + { + "epoch": 1.74102783203125, + "grad_norm": 17.52140998840332, + "learning_rate": 6.014404554586194e-06, + "loss": 5.0475, + "step": 85575 + }, + { + "epoch": 1.7411295572916665, + "grad_norm": 17.04070281982422, + "learning_rate": 6.014013185820193e-06, + "loss": 4.7752, + "step": 85580 + }, + { + "epoch": 1.7412312825520835, + "grad_norm": 16.04717445373535, + "learning_rate": 6.013621810574972e-06, + "loss": 5.2361, + "step": 85585 + }, + { + "epoch": 1.7413330078125, + "grad_norm": 20.79758644104004, + "learning_rate": 6.01323042885303e-06, + "loss": 5.0097, + "step": 85590 + }, + { + "epoch": 1.7414347330729165, + "grad_norm": 25.42742347717285, + "learning_rate": 6.012839040656865e-06, + "loss": 5.1482, + "step": 85595 + }, + { + "epoch": 1.7415364583333335, + "grad_norm": 22.940731048583984, + "learning_rate": 6.0124476459889814e-06, + "loss": 4.7807, + "step": 85600 + }, + { + "epoch": 1.74163818359375, + "grad_norm": 22.879417419433594, + "learning_rate": 6.012056244851877e-06, + "loss": 5.368, + "step": 85605 + }, + { + "epoch": 1.7417399088541665, + "grad_norm": 15.946866035461426, + "learning_rate": 6.011664837248057e-06, + "loss": 4.774, + "step": 85610 + }, + { + "epoch": 1.7418416341145835, + "grad_norm": 18.577478408813477, + "learning_rate": 6.011273423180018e-06, + "loss": 4.8506, + "step": 85615 + }, + { + "epoch": 1.741943359375, + "grad_norm": 18.329219818115234, + "learning_rate": 6.010882002650264e-06, + "loss": 4.8805, + "step": 85620 + }, + { + "epoch": 1.7420450846354165, + "grad_norm": 14.231281280517578, + "learning_rate": 6.010490575661294e-06, + "loss": 4.8538, + "step": 85625 + }, + { + "epoch": 1.7421468098958335, + "grad_norm": 17.938243865966797, + "learning_rate": 6.01009914221561e-06, + "loss": 5.0241, + "step": 85630 + }, + { + "epoch": 1.74224853515625, + "grad_norm": 20.12211799621582, + "learning_rate": 6.009707702315713e-06, + "loss": 4.9489, + "step": 85635 + }, + { + "epoch": 1.7423502604166665, + "grad_norm": 16.317855834960938, + "learning_rate": 6.009316255964105e-06, + "loss": 5.0211, + "step": 85640 + }, + { + "epoch": 1.7424519856770835, + "grad_norm": 22.17862319946289, + "learning_rate": 6.008924803163286e-06, + "loss": 5.0447, + "step": 85645 + }, + { + "epoch": 1.7425537109375, + "grad_norm": 21.734132766723633, + "learning_rate": 6.0085333439157575e-06, + "loss": 4.5007, + "step": 85650 + }, + { + "epoch": 1.7426554361979165, + "grad_norm": 15.370392799377441, + "learning_rate": 6.008141878224022e-06, + "loss": 5.0917, + "step": 85655 + }, + { + "epoch": 1.7427571614583335, + "grad_norm": 17.0989990234375, + "learning_rate": 6.00775040609058e-06, + "loss": 4.7224, + "step": 85660 + }, + { + "epoch": 1.74285888671875, + "grad_norm": 17.865461349487305, + "learning_rate": 6.007358927517931e-06, + "loss": 4.994, + "step": 85665 + }, + { + "epoch": 1.7429606119791665, + "grad_norm": 15.442703247070312, + "learning_rate": 6.00696744250858e-06, + "loss": 4.7561, + "step": 85670 + }, + { + "epoch": 1.7430623372395835, + "grad_norm": 15.9050874710083, + "learning_rate": 6.006575951065026e-06, + "loss": 4.7763, + "step": 85675 + }, + { + "epoch": 1.7431640625, + "grad_norm": 17.487302780151367, + "learning_rate": 6.006184453189771e-06, + "loss": 4.9886, + "step": 85680 + }, + { + "epoch": 1.7432657877604165, + "grad_norm": 16.600234985351562, + "learning_rate": 6.0057929488853176e-06, + "loss": 4.8808, + "step": 85685 + }, + { + "epoch": 1.7433675130208335, + "grad_norm": 21.009275436401367, + "learning_rate": 6.0054014381541656e-06, + "loss": 4.9835, + "step": 85690 + }, + { + "epoch": 1.74346923828125, + "grad_norm": 16.10570526123047, + "learning_rate": 6.005009920998819e-06, + "loss": 4.7729, + "step": 85695 + }, + { + "epoch": 1.7435709635416665, + "grad_norm": 19.266498565673828, + "learning_rate": 6.0046183974217765e-06, + "loss": 4.9155, + "step": 85700 + }, + { + "epoch": 1.7436726888020835, + "grad_norm": 18.51091766357422, + "learning_rate": 6.004226867425542e-06, + "loss": 5.006, + "step": 85705 + }, + { + "epoch": 1.7437744140625, + "grad_norm": 20.153278350830078, + "learning_rate": 6.003835331012616e-06, + "loss": 4.6151, + "step": 85710 + }, + { + "epoch": 1.7438761393229165, + "grad_norm": 19.21755599975586, + "learning_rate": 6.003443788185502e-06, + "loss": 4.8949, + "step": 85715 + }, + { + "epoch": 1.7439778645833335, + "grad_norm": 15.811535835266113, + "learning_rate": 6.0030522389467015e-06, + "loss": 5.0117, + "step": 85720 + }, + { + "epoch": 1.74407958984375, + "grad_norm": 14.9879789352417, + "learning_rate": 6.002660683298714e-06, + "loss": 4.8618, + "step": 85725 + }, + { + "epoch": 1.7441813151041665, + "grad_norm": 19.815664291381836, + "learning_rate": 6.002269121244043e-06, + "loss": 5.0969, + "step": 85730 + }, + { + "epoch": 1.7442830403645835, + "grad_norm": 15.635452270507812, + "learning_rate": 6.00187755278519e-06, + "loss": 4.9357, + "step": 85735 + }, + { + "epoch": 1.744384765625, + "grad_norm": 27.851219177246094, + "learning_rate": 6.00148597792466e-06, + "loss": 4.9764, + "step": 85740 + }, + { + "epoch": 1.7444864908854165, + "grad_norm": 18.972618103027344, + "learning_rate": 6.00109439666495e-06, + "loss": 5.0122, + "step": 85745 + }, + { + "epoch": 1.7445882161458335, + "grad_norm": 18.280832290649414, + "learning_rate": 6.000702809008565e-06, + "loss": 5.1356, + "step": 85750 + }, + { + "epoch": 1.74468994140625, + "grad_norm": 21.35822105407715, + "learning_rate": 6.000311214958008e-06, + "loss": 5.0419, + "step": 85755 + }, + { + "epoch": 1.7447916666666665, + "grad_norm": 11.824041366577148, + "learning_rate": 5.999919614515778e-06, + "loss": 4.8735, + "step": 85760 + }, + { + "epoch": 1.7448933919270835, + "grad_norm": 25.434621810913086, + "learning_rate": 5.999528007684381e-06, + "loss": 5.1886, + "step": 85765 + }, + { + "epoch": 1.7449951171875, + "grad_norm": 20.095613479614258, + "learning_rate": 5.999136394466316e-06, + "loss": 5.0501, + "step": 85770 + }, + { + "epoch": 1.7450968424479165, + "grad_norm": 16.409095764160156, + "learning_rate": 5.998744774864087e-06, + "loss": 4.9584, + "step": 85775 + }, + { + "epoch": 1.7451985677083335, + "grad_norm": 22.180561065673828, + "learning_rate": 5.998353148880196e-06, + "loss": 5.0053, + "step": 85780 + }, + { + "epoch": 1.74530029296875, + "grad_norm": 18.54450798034668, + "learning_rate": 5.997961516517145e-06, + "loss": 4.9268, + "step": 85785 + }, + { + "epoch": 1.7454020182291665, + "grad_norm": 19.382457733154297, + "learning_rate": 5.997569877777437e-06, + "loss": 4.7822, + "step": 85790 + }, + { + "epoch": 1.7455037434895835, + "grad_norm": 17.38108253479004, + "learning_rate": 5.997178232663574e-06, + "loss": 5.0139, + "step": 85795 + }, + { + "epoch": 1.74560546875, + "grad_norm": 16.777544021606445, + "learning_rate": 5.996786581178059e-06, + "loss": 4.9367, + "step": 85800 + }, + { + "epoch": 1.7457071940104165, + "grad_norm": 16.768829345703125, + "learning_rate": 5.996394923323393e-06, + "loss": 4.8845, + "step": 85805 + }, + { + "epoch": 1.7458089192708335, + "grad_norm": 16.762067794799805, + "learning_rate": 5.996003259102081e-06, + "loss": 4.8889, + "step": 85810 + }, + { + "epoch": 1.74591064453125, + "grad_norm": 20.437646865844727, + "learning_rate": 5.995611588516623e-06, + "loss": 4.6283, + "step": 85815 + }, + { + "epoch": 1.7460123697916665, + "grad_norm": 16.61212158203125, + "learning_rate": 5.995219911569525e-06, + "loss": 4.779, + "step": 85820 + }, + { + "epoch": 1.7461140950520835, + "grad_norm": 17.790252685546875, + "learning_rate": 5.994828228263287e-06, + "loss": 4.7505, + "step": 85825 + }, + { + "epoch": 1.7462158203125, + "grad_norm": 14.374982833862305, + "learning_rate": 5.9944365386004125e-06, + "loss": 4.7119, + "step": 85830 + }, + { + "epoch": 1.7463175455729165, + "grad_norm": 17.175230026245117, + "learning_rate": 5.994044842583403e-06, + "loss": 4.9225, + "step": 85835 + }, + { + "epoch": 1.7464192708333335, + "grad_norm": 15.571799278259277, + "learning_rate": 5.993653140214763e-06, + "loss": 4.8001, + "step": 85840 + }, + { + "epoch": 1.74652099609375, + "grad_norm": 20.698862075805664, + "learning_rate": 5.993261431496996e-06, + "loss": 4.9308, + "step": 85845 + }, + { + "epoch": 1.7466227213541665, + "grad_norm": 16.116954803466797, + "learning_rate": 5.992869716432602e-06, + "loss": 4.723, + "step": 85850 + }, + { + "epoch": 1.7467244466145835, + "grad_norm": 16.825851440429688, + "learning_rate": 5.992477995024088e-06, + "loss": 5.0625, + "step": 85855 + }, + { + "epoch": 1.746826171875, + "grad_norm": 20.59319305419922, + "learning_rate": 5.992086267273953e-06, + "loss": 5.0008, + "step": 85860 + }, + { + "epoch": 1.7469278971354165, + "grad_norm": 13.80220890045166, + "learning_rate": 5.991694533184701e-06, + "loss": 4.8794, + "step": 85865 + }, + { + "epoch": 1.7470296223958335, + "grad_norm": 16.158113479614258, + "learning_rate": 5.991302792758837e-06, + "loss": 4.6485, + "step": 85870 + }, + { + "epoch": 1.74713134765625, + "grad_norm": 18.382413864135742, + "learning_rate": 5.990911045998863e-06, + "loss": 4.6385, + "step": 85875 + }, + { + "epoch": 1.7472330729166665, + "grad_norm": 19.959609985351562, + "learning_rate": 5.9905192929072805e-06, + "loss": 4.8821, + "step": 85880 + }, + { + "epoch": 1.7473347981770835, + "grad_norm": 21.722862243652344, + "learning_rate": 5.990127533486596e-06, + "loss": 4.8651, + "step": 85885 + }, + { + "epoch": 1.7474365234375, + "grad_norm": 21.222620010375977, + "learning_rate": 5.98973576773931e-06, + "loss": 5.3252, + "step": 85890 + }, + { + "epoch": 1.7475382486979165, + "grad_norm": 22.63051414489746, + "learning_rate": 5.989343995667927e-06, + "loss": 4.858, + "step": 85895 + }, + { + "epoch": 1.7476399739583335, + "grad_norm": 15.339709281921387, + "learning_rate": 5.988952217274951e-06, + "loss": 4.9389, + "step": 85900 + }, + { + "epoch": 1.74774169921875, + "grad_norm": 18.429288864135742, + "learning_rate": 5.988560432562881e-06, + "loss": 4.924, + "step": 85905 + }, + { + "epoch": 1.7478434244791665, + "grad_norm": 15.304901123046875, + "learning_rate": 5.988168641534225e-06, + "loss": 4.9546, + "step": 85910 + }, + { + "epoch": 1.7479451497395835, + "grad_norm": 14.205239295959473, + "learning_rate": 5.987776844191486e-06, + "loss": 5.1677, + "step": 85915 + }, + { + "epoch": 1.748046875, + "grad_norm": 17.40241813659668, + "learning_rate": 5.987385040537166e-06, + "loss": 4.7112, + "step": 85920 + }, + { + "epoch": 1.7481486002604165, + "grad_norm": 14.937131881713867, + "learning_rate": 5.986993230573769e-06, + "loss": 4.8649, + "step": 85925 + }, + { + "epoch": 1.7482503255208335, + "grad_norm": 19.5753116607666, + "learning_rate": 5.986601414303798e-06, + "loss": 5.1697, + "step": 85930 + }, + { + "epoch": 1.74835205078125, + "grad_norm": 18.93539047241211, + "learning_rate": 5.986209591729758e-06, + "loss": 4.718, + "step": 85935 + }, + { + "epoch": 1.7484537760416665, + "grad_norm": 17.876832962036133, + "learning_rate": 5.9858177628541525e-06, + "loss": 4.8262, + "step": 85940 + }, + { + "epoch": 1.7485555013020835, + "grad_norm": 18.603172302246094, + "learning_rate": 5.985425927679482e-06, + "loss": 4.8849, + "step": 85945 + }, + { + "epoch": 1.7486572265625, + "grad_norm": 14.281291007995605, + "learning_rate": 5.985034086208255e-06, + "loss": 4.5994, + "step": 85950 + }, + { + "epoch": 1.7487589518229165, + "grad_norm": 15.680105209350586, + "learning_rate": 5.9846422384429715e-06, + "loss": 4.6875, + "step": 85955 + }, + { + "epoch": 1.7488606770833335, + "grad_norm": 18.227115631103516, + "learning_rate": 5.984250384386138e-06, + "loss": 4.8002, + "step": 85960 + }, + { + "epoch": 1.74896240234375, + "grad_norm": 18.862146377563477, + "learning_rate": 5.983858524040257e-06, + "loss": 4.8474, + "step": 85965 + }, + { + "epoch": 1.7490641276041665, + "grad_norm": 18.15032386779785, + "learning_rate": 5.98346665740783e-06, + "loss": 4.8504, + "step": 85970 + }, + { + "epoch": 1.7491658528645835, + "grad_norm": 21.390838623046875, + "learning_rate": 5.983074784491366e-06, + "loss": 4.7316, + "step": 85975 + }, + { + "epoch": 1.749267578125, + "grad_norm": 20.556880950927734, + "learning_rate": 5.982682905293364e-06, + "loss": 4.6305, + "step": 85980 + }, + { + "epoch": 1.7493693033854165, + "grad_norm": 18.32866096496582, + "learning_rate": 5.982291019816332e-06, + "loss": 4.6649, + "step": 85985 + }, + { + "epoch": 1.7494710286458335, + "grad_norm": 17.408994674682617, + "learning_rate": 5.9818991280627716e-06, + "loss": 4.6915, + "step": 85990 + }, + { + "epoch": 1.74957275390625, + "grad_norm": 17.827625274658203, + "learning_rate": 5.981507230035187e-06, + "loss": 4.9408, + "step": 85995 + }, + { + "epoch": 1.7496744791666665, + "grad_norm": 17.890512466430664, + "learning_rate": 5.981115325736083e-06, + "loss": 4.8042, + "step": 86000 + }, + { + "epoch": 1.7497762044270835, + "grad_norm": 19.891611099243164, + "learning_rate": 5.980723415167964e-06, + "loss": 4.8517, + "step": 86005 + }, + { + "epoch": 1.7498779296875, + "grad_norm": 19.275602340698242, + "learning_rate": 5.980331498333333e-06, + "loss": 4.8582, + "step": 86010 + }, + { + "epoch": 1.7499796549479165, + "grad_norm": 27.884929656982422, + "learning_rate": 5.979939575234695e-06, + "loss": 4.8215, + "step": 86015 + }, + { + "epoch": 1.75, + "eval_loss": 4.941539287567139, + "eval_runtime": 107.6546, + "eval_samples_per_second": 18.643, + "eval_steps_per_second": 9.326, + "step": 86016 + }, + { + "epoch": 1.7500813802083335, + "grad_norm": 18.560243606567383, + "learning_rate": 5.979547645874555e-06, + "loss": 4.8209, + "step": 86020 + }, + { + "epoch": 1.75018310546875, + "grad_norm": 25.28578758239746, + "learning_rate": 5.979155710255415e-06, + "loss": 5.2039, + "step": 86025 + }, + { + "epoch": 1.7502848307291665, + "grad_norm": 17.151100158691406, + "learning_rate": 5.978763768379783e-06, + "loss": 5.0709, + "step": 86030 + }, + { + "epoch": 1.7503865559895835, + "grad_norm": 17.4923095703125, + "learning_rate": 5.978371820250159e-06, + "loss": 4.8451, + "step": 86035 + }, + { + "epoch": 1.75048828125, + "grad_norm": 16.33687973022461, + "learning_rate": 5.977979865869051e-06, + "loss": 4.985, + "step": 86040 + }, + { + "epoch": 1.7505900065104165, + "grad_norm": 15.739945411682129, + "learning_rate": 5.977587905238962e-06, + "loss": 4.7635, + "step": 86045 + }, + { + "epoch": 1.7506917317708335, + "grad_norm": 20.8831787109375, + "learning_rate": 5.977195938362397e-06, + "loss": 4.642, + "step": 86050 + }, + { + "epoch": 1.75079345703125, + "grad_norm": 20.173311233520508, + "learning_rate": 5.9768039652418575e-06, + "loss": 4.9695, + "step": 86055 + }, + { + "epoch": 1.7508951822916665, + "grad_norm": 23.41875457763672, + "learning_rate": 5.976411985879853e-06, + "loss": 4.9267, + "step": 86060 + }, + { + "epoch": 1.7509969075520835, + "grad_norm": 17.532617568969727, + "learning_rate": 5.976020000278886e-06, + "loss": 4.839, + "step": 86065 + }, + { + "epoch": 1.7510986328125, + "grad_norm": 16.99863624572754, + "learning_rate": 5.975628008441461e-06, + "loss": 5.0491, + "step": 86070 + }, + { + "epoch": 1.7512003580729165, + "grad_norm": 19.42047119140625, + "learning_rate": 5.975236010370081e-06, + "loss": 4.824, + "step": 86075 + }, + { + "epoch": 1.7513020833333335, + "grad_norm": 16.37110137939453, + "learning_rate": 5.974844006067255e-06, + "loss": 4.9847, + "step": 86080 + }, + { + "epoch": 1.75140380859375, + "grad_norm": 15.896268844604492, + "learning_rate": 5.974451995535483e-06, + "loss": 4.8437, + "step": 86085 + }, + { + "epoch": 1.7515055338541665, + "grad_norm": 17.83246421813965, + "learning_rate": 5.974059978777273e-06, + "loss": 4.8686, + "step": 86090 + }, + { + "epoch": 1.7516072591145835, + "grad_norm": 21.826641082763672, + "learning_rate": 5.97366795579513e-06, + "loss": 4.7275, + "step": 86095 + }, + { + "epoch": 1.751708984375, + "grad_norm": 16.307748794555664, + "learning_rate": 5.973275926591556e-06, + "loss": 4.9385, + "step": 86100 + }, + { + "epoch": 1.7518107096354165, + "grad_norm": 23.338478088378906, + "learning_rate": 5.97288389116906e-06, + "loss": 5.0613, + "step": 86105 + }, + { + "epoch": 1.7519124348958335, + "grad_norm": 32.345375061035156, + "learning_rate": 5.972491849530142e-06, + "loss": 4.5883, + "step": 86110 + }, + { + "epoch": 1.75201416015625, + "grad_norm": 19.32306480407715, + "learning_rate": 5.9720998016773125e-06, + "loss": 4.8012, + "step": 86115 + }, + { + "epoch": 1.7521158854166665, + "grad_norm": 22.055103302001953, + "learning_rate": 5.9717077476130725e-06, + "loss": 4.8275, + "step": 86120 + }, + { + "epoch": 1.7522176106770835, + "grad_norm": 17.32343292236328, + "learning_rate": 5.9713156873399285e-06, + "loss": 5.0089, + "step": 86125 + }, + { + "epoch": 1.7523193359375, + "grad_norm": 18.133155822753906, + "learning_rate": 5.970923620860386e-06, + "loss": 4.8789, + "step": 86130 + }, + { + "epoch": 1.7524210611979165, + "grad_norm": 17.122358322143555, + "learning_rate": 5.970531548176949e-06, + "loss": 4.7331, + "step": 86135 + }, + { + "epoch": 1.7525227864583335, + "grad_norm": 23.333736419677734, + "learning_rate": 5.9701394692921245e-06, + "loss": 4.8367, + "step": 86140 + }, + { + "epoch": 1.75262451171875, + "grad_norm": 14.997668266296387, + "learning_rate": 5.969747384208416e-06, + "loss": 5.061, + "step": 86145 + }, + { + "epoch": 1.7527262369791665, + "grad_norm": 19.294599533081055, + "learning_rate": 5.9693552929283295e-06, + "loss": 4.8678, + "step": 86150 + }, + { + "epoch": 1.7528279622395835, + "grad_norm": 16.009353637695312, + "learning_rate": 5.968963195454372e-06, + "loss": 5.0865, + "step": 86155 + }, + { + "epoch": 1.7529296875, + "grad_norm": 18.71373748779297, + "learning_rate": 5.968571091789046e-06, + "loss": 4.9602, + "step": 86160 + }, + { + "epoch": 1.7530314127604165, + "grad_norm": 22.78525733947754, + "learning_rate": 5.9681789819348565e-06, + "loss": 4.8119, + "step": 86165 + }, + { + "epoch": 1.7531331380208335, + "grad_norm": 27.997575759887695, + "learning_rate": 5.967786865894313e-06, + "loss": 4.8345, + "step": 86170 + }, + { + "epoch": 1.75323486328125, + "grad_norm": 19.160573959350586, + "learning_rate": 5.9673947436699174e-06, + "loss": 4.8269, + "step": 86175 + }, + { + "epoch": 1.7533365885416665, + "grad_norm": 19.93934440612793, + "learning_rate": 5.9670026152641765e-06, + "loss": 5.3198, + "step": 86180 + }, + { + "epoch": 1.7534383138020835, + "grad_norm": 18.569456100463867, + "learning_rate": 5.966610480679598e-06, + "loss": 5.1741, + "step": 86185 + }, + { + "epoch": 1.7535400390625, + "grad_norm": 20.17766761779785, + "learning_rate": 5.966218339918684e-06, + "loss": 4.9087, + "step": 86190 + }, + { + "epoch": 1.7536417643229165, + "grad_norm": 13.254290580749512, + "learning_rate": 5.965826192983939e-06, + "loss": 4.7374, + "step": 86195 + }, + { + "epoch": 1.7537434895833335, + "grad_norm": 17.263164520263672, + "learning_rate": 5.965434039877874e-06, + "loss": 4.739, + "step": 86200 + }, + { + "epoch": 1.75384521484375, + "grad_norm": 17.33690071105957, + "learning_rate": 5.965041880602991e-06, + "loss": 4.9967, + "step": 86205 + }, + { + "epoch": 1.7539469401041665, + "grad_norm": 15.389745712280273, + "learning_rate": 5.964649715161797e-06, + "loss": 4.9176, + "step": 86210 + }, + { + "epoch": 1.7540486653645835, + "grad_norm": 21.16498565673828, + "learning_rate": 5.964257543556798e-06, + "loss": 4.7181, + "step": 86215 + }, + { + "epoch": 1.754150390625, + "grad_norm": 27.0748233795166, + "learning_rate": 5.9638653657905e-06, + "loss": 5.0201, + "step": 86220 + }, + { + "epoch": 1.7542521158854165, + "grad_norm": 16.069923400878906, + "learning_rate": 5.963473181865406e-06, + "loss": 4.8575, + "step": 86225 + }, + { + "epoch": 1.7543538411458335, + "grad_norm": 15.87247371673584, + "learning_rate": 5.963080991784026e-06, + "loss": 4.8748, + "step": 86230 + }, + { + "epoch": 1.75445556640625, + "grad_norm": 20.778018951416016, + "learning_rate": 5.9626887955488644e-06, + "loss": 4.8481, + "step": 86235 + }, + { + "epoch": 1.7545572916666665, + "grad_norm": 15.329676628112793, + "learning_rate": 5.962296593162426e-06, + "loss": 4.97, + "step": 86240 + }, + { + "epoch": 1.7546590169270835, + "grad_norm": 19.00392723083496, + "learning_rate": 5.961904384627219e-06, + "loss": 4.8414, + "step": 86245 + }, + { + "epoch": 1.7547607421875, + "grad_norm": 20.781524658203125, + "learning_rate": 5.961512169945748e-06, + "loss": 4.9256, + "step": 86250 + }, + { + "epoch": 1.7548624674479165, + "grad_norm": 18.05843734741211, + "learning_rate": 5.961119949120517e-06, + "loss": 4.8547, + "step": 86255 + }, + { + "epoch": 1.7549641927083335, + "grad_norm": 19.80792236328125, + "learning_rate": 5.960727722154038e-06, + "loss": 5.1326, + "step": 86260 + }, + { + "epoch": 1.75506591796875, + "grad_norm": 16.339008331298828, + "learning_rate": 5.960335489048812e-06, + "loss": 4.8057, + "step": 86265 + }, + { + "epoch": 1.7551676432291665, + "grad_norm": 15.541213035583496, + "learning_rate": 5.959943249807347e-06, + "loss": 5.1224, + "step": 86270 + }, + { + "epoch": 1.7552693684895835, + "grad_norm": 21.60602378845215, + "learning_rate": 5.959551004432151e-06, + "loss": 4.9736, + "step": 86275 + }, + { + "epoch": 1.75537109375, + "grad_norm": 18.271900177001953, + "learning_rate": 5.9591587529257265e-06, + "loss": 4.8348, + "step": 86280 + }, + { + "epoch": 1.7554728190104165, + "grad_norm": 15.830063819885254, + "learning_rate": 5.9587664952905845e-06, + "loss": 4.6584, + "step": 86285 + }, + { + "epoch": 1.7555745442708335, + "grad_norm": 19.61239242553711, + "learning_rate": 5.958374231529227e-06, + "loss": 4.6794, + "step": 86290 + }, + { + "epoch": 1.75567626953125, + "grad_norm": 12.65583324432373, + "learning_rate": 5.957981961644162e-06, + "loss": 4.7525, + "step": 86295 + }, + { + "epoch": 1.7557779947916665, + "grad_norm": 17.15668487548828, + "learning_rate": 5.957589685637899e-06, + "loss": 4.9124, + "step": 86300 + }, + { + "epoch": 1.7558797200520835, + "grad_norm": 20.124557495117188, + "learning_rate": 5.95719740351294e-06, + "loss": 4.7029, + "step": 86305 + }, + { + "epoch": 1.7559814453125, + "grad_norm": 14.220940589904785, + "learning_rate": 5.956805115271793e-06, + "loss": 4.988, + "step": 86310 + }, + { + "epoch": 1.7560831705729165, + "grad_norm": 18.99559783935547, + "learning_rate": 5.956412820916967e-06, + "loss": 4.9152, + "step": 86315 + }, + { + "epoch": 1.7561848958333335, + "grad_norm": 17.72015380859375, + "learning_rate": 5.956020520450965e-06, + "loss": 4.9207, + "step": 86320 + }, + { + "epoch": 1.75628662109375, + "grad_norm": 17.330778121948242, + "learning_rate": 5.955628213876295e-06, + "loss": 4.9622, + "step": 86325 + }, + { + "epoch": 1.7563883463541665, + "grad_norm": 20.542434692382812, + "learning_rate": 5.955235901195464e-06, + "loss": 4.6092, + "step": 86330 + }, + { + "epoch": 1.7564900716145835, + "grad_norm": 17.84880256652832, + "learning_rate": 5.95484358241098e-06, + "loss": 4.8525, + "step": 86335 + }, + { + "epoch": 1.756591796875, + "grad_norm": 17.810346603393555, + "learning_rate": 5.9544512575253485e-06, + "loss": 5.0394, + "step": 86340 + }, + { + "epoch": 1.7566935221354165, + "grad_norm": 16.02992057800293, + "learning_rate": 5.954058926541075e-06, + "loss": 4.8851, + "step": 86345 + }, + { + "epoch": 1.7567952473958335, + "grad_norm": 15.493570327758789, + "learning_rate": 5.953666589460671e-06, + "loss": 4.7199, + "step": 86350 + }, + { + "epoch": 1.75689697265625, + "grad_norm": 15.775496482849121, + "learning_rate": 5.953274246286637e-06, + "loss": 4.8849, + "step": 86355 + }, + { + "epoch": 1.7569986979166665, + "grad_norm": 18.228008270263672, + "learning_rate": 5.952881897021484e-06, + "loss": 4.9376, + "step": 86360 + }, + { + "epoch": 1.7571004231770835, + "grad_norm": 17.651945114135742, + "learning_rate": 5.95248954166772e-06, + "loss": 4.8086, + "step": 86365 + }, + { + "epoch": 1.7572021484375, + "grad_norm": 13.349501609802246, + "learning_rate": 5.952097180227847e-06, + "loss": 4.8012, + "step": 86370 + }, + { + "epoch": 1.7573038736979165, + "grad_norm": 14.16867733001709, + "learning_rate": 5.951704812704378e-06, + "loss": 4.8443, + "step": 86375 + }, + { + "epoch": 1.7574055989583335, + "grad_norm": 16.23691177368164, + "learning_rate": 5.951312439099817e-06, + "loss": 4.8813, + "step": 86380 + }, + { + "epoch": 1.75750732421875, + "grad_norm": 21.603994369506836, + "learning_rate": 5.950920059416671e-06, + "loss": 4.9658, + "step": 86385 + }, + { + "epoch": 1.7576090494791665, + "grad_norm": 18.373014450073242, + "learning_rate": 5.950527673657447e-06, + "loss": 4.6961, + "step": 86390 + }, + { + "epoch": 1.7577107747395835, + "grad_norm": 18.24887466430664, + "learning_rate": 5.950135281824653e-06, + "loss": 4.6895, + "step": 86395 + }, + { + "epoch": 1.7578125, + "grad_norm": 17.748281478881836, + "learning_rate": 5.9497428839207974e-06, + "loss": 4.8358, + "step": 86400 + }, + { + "epoch": 1.7579142252604165, + "grad_norm": 14.772025108337402, + "learning_rate": 5.949350479948386e-06, + "loss": 5.1184, + "step": 86405 + }, + { + "epoch": 1.7580159505208335, + "grad_norm": 22.27665901184082, + "learning_rate": 5.948958069909925e-06, + "loss": 5.0066, + "step": 86410 + }, + { + "epoch": 1.75811767578125, + "grad_norm": 13.515464782714844, + "learning_rate": 5.948565653807925e-06, + "loss": 4.9195, + "step": 86415 + }, + { + "epoch": 1.7582194010416665, + "grad_norm": 16.76201057434082, + "learning_rate": 5.9481732316448894e-06, + "loss": 4.726, + "step": 86420 + }, + { + "epoch": 1.7583211263020835, + "grad_norm": 20.49966049194336, + "learning_rate": 5.94778080342333e-06, + "loss": 4.8404, + "step": 86425 + }, + { + "epoch": 1.7584228515625, + "grad_norm": 17.62663459777832, + "learning_rate": 5.947388369145751e-06, + "loss": 4.9463, + "step": 86430 + }, + { + "epoch": 1.7585245768229165, + "grad_norm": 15.999982833862305, + "learning_rate": 5.946995928814661e-06, + "loss": 4.866, + "step": 86435 + }, + { + "epoch": 1.7586263020833335, + "grad_norm": 15.917414665222168, + "learning_rate": 5.946603482432569e-06, + "loss": 4.8793, + "step": 86440 + }, + { + "epoch": 1.75872802734375, + "grad_norm": 15.930188179016113, + "learning_rate": 5.946211030001979e-06, + "loss": 4.6401, + "step": 86445 + }, + { + "epoch": 1.7588297526041665, + "grad_norm": 28.503679275512695, + "learning_rate": 5.945818571525402e-06, + "loss": 4.9823, + "step": 86450 + }, + { + "epoch": 1.7589314778645835, + "grad_norm": 14.739498138427734, + "learning_rate": 5.9454261070053454e-06, + "loss": 5.1325, + "step": 86455 + }, + { + "epoch": 1.759033203125, + "grad_norm": 19.70188331604004, + "learning_rate": 5.945033636444315e-06, + "loss": 4.6432, + "step": 86460 + }, + { + "epoch": 1.7591349283854165, + "grad_norm": 15.618892669677734, + "learning_rate": 5.94464115984482e-06, + "loss": 5.1692, + "step": 86465 + }, + { + "epoch": 1.7592366536458335, + "grad_norm": 15.657222747802734, + "learning_rate": 5.944248677209368e-06, + "loss": 4.7557, + "step": 86470 + }, + { + "epoch": 1.75933837890625, + "grad_norm": 15.374401092529297, + "learning_rate": 5.943856188540465e-06, + "loss": 5.0024, + "step": 86475 + }, + { + "epoch": 1.7594401041666665, + "grad_norm": 17.20415687561035, + "learning_rate": 5.943463693840623e-06, + "loss": 4.8665, + "step": 86480 + }, + { + "epoch": 1.7595418294270835, + "grad_norm": 22.406435012817383, + "learning_rate": 5.943071193112346e-06, + "loss": 4.9479, + "step": 86485 + }, + { + "epoch": 1.7596435546875, + "grad_norm": 16.923694610595703, + "learning_rate": 5.942678686358144e-06, + "loss": 4.6328, + "step": 86490 + }, + { + "epoch": 1.7597452799479165, + "grad_norm": 19.8585147857666, + "learning_rate": 5.942286173580526e-06, + "loss": 4.6065, + "step": 86495 + }, + { + "epoch": 1.7598470052083335, + "grad_norm": 14.30851936340332, + "learning_rate": 5.941893654781996e-06, + "loss": 4.8892, + "step": 86500 + }, + { + "epoch": 1.75994873046875, + "grad_norm": 15.814101219177246, + "learning_rate": 5.941501129965066e-06, + "loss": 4.7034, + "step": 86505 + }, + { + "epoch": 1.7600504557291665, + "grad_norm": 21.6844425201416, + "learning_rate": 5.9411085991322415e-06, + "loss": 4.8033, + "step": 86510 + }, + { + "epoch": 1.7601521809895835, + "grad_norm": 12.650962829589844, + "learning_rate": 5.940716062286033e-06, + "loss": 4.8695, + "step": 86515 + }, + { + "epoch": 1.76025390625, + "grad_norm": 18.759798049926758, + "learning_rate": 5.940323519428948e-06, + "loss": 4.8264, + "step": 86520 + }, + { + "epoch": 1.7603556315104165, + "grad_norm": 20.291316986083984, + "learning_rate": 5.939930970563492e-06, + "loss": 4.6817, + "step": 86525 + }, + { + "epoch": 1.7604573567708335, + "grad_norm": 18.47158432006836, + "learning_rate": 5.9395384156921764e-06, + "loss": 4.8595, + "step": 86530 + }, + { + "epoch": 1.76055908203125, + "grad_norm": 20.585235595703125, + "learning_rate": 5.93914585481751e-06, + "loss": 4.6605, + "step": 86535 + }, + { + "epoch": 1.7606608072916665, + "grad_norm": 23.500242233276367, + "learning_rate": 5.9387532879419975e-06, + "loss": 4.9308, + "step": 86540 + }, + { + "epoch": 1.7607625325520835, + "grad_norm": 21.123764038085938, + "learning_rate": 5.938360715068151e-06, + "loss": 4.8063, + "step": 86545 + }, + { + "epoch": 1.7608642578125, + "grad_norm": 19.843017578125, + "learning_rate": 5.937968136198476e-06, + "loss": 5.023, + "step": 86550 + }, + { + "epoch": 1.7609659830729165, + "grad_norm": 15.615704536437988, + "learning_rate": 5.937575551335484e-06, + "loss": 4.6898, + "step": 86555 + }, + { + "epoch": 1.7610677083333335, + "grad_norm": 15.919120788574219, + "learning_rate": 5.937182960481681e-06, + "loss": 4.7839, + "step": 86560 + }, + { + "epoch": 1.76116943359375, + "grad_norm": 20.34284782409668, + "learning_rate": 5.936790363639576e-06, + "loss": 4.8882, + "step": 86565 + }, + { + "epoch": 1.7612711588541665, + "grad_norm": 26.52699089050293, + "learning_rate": 5.9363977608116794e-06, + "loss": 4.8323, + "step": 86570 + }, + { + "epoch": 1.7613728841145835, + "grad_norm": 16.78786277770996, + "learning_rate": 5.936005152000497e-06, + "loss": 4.7467, + "step": 86575 + }, + { + "epoch": 1.761474609375, + "grad_norm": 18.052804946899414, + "learning_rate": 5.935612537208539e-06, + "loss": 5.3492, + "step": 86580 + }, + { + "epoch": 1.7615763346354165, + "grad_norm": 24.60805320739746, + "learning_rate": 5.935219916438315e-06, + "loss": 4.6908, + "step": 86585 + }, + { + "epoch": 1.7616780598958335, + "grad_norm": 16.26642417907715, + "learning_rate": 5.9348272896923305e-06, + "loss": 4.7475, + "step": 86590 + }, + { + "epoch": 1.76177978515625, + "grad_norm": 18.838863372802734, + "learning_rate": 5.934434656973098e-06, + "loss": 4.8759, + "step": 86595 + }, + { + "epoch": 1.7618815104166665, + "grad_norm": 29.760799407958984, + "learning_rate": 5.934042018283124e-06, + "loss": 4.9839, + "step": 86600 + }, + { + "epoch": 1.7619832356770835, + "grad_norm": 15.77631664276123, + "learning_rate": 5.933649373624919e-06, + "loss": 4.9065, + "step": 86605 + }, + { + "epoch": 1.7620849609375, + "grad_norm": 14.884183883666992, + "learning_rate": 5.93325672300099e-06, + "loss": 4.8287, + "step": 86610 + }, + { + "epoch": 1.7621866861979165, + "grad_norm": 17.50905990600586, + "learning_rate": 5.932864066413846e-06, + "loss": 4.6298, + "step": 86615 + }, + { + "epoch": 1.7622884114583335, + "grad_norm": 15.936980247497559, + "learning_rate": 5.932471403865998e-06, + "loss": 4.681, + "step": 86620 + }, + { + "epoch": 1.76239013671875, + "grad_norm": 16.01227378845215, + "learning_rate": 5.932078735359954e-06, + "loss": 4.9249, + "step": 86625 + }, + { + "epoch": 1.7624918619791665, + "grad_norm": 18.797924041748047, + "learning_rate": 5.931686060898222e-06, + "loss": 5.0613, + "step": 86630 + }, + { + "epoch": 1.7625935872395835, + "grad_norm": 24.0843448638916, + "learning_rate": 5.9312933804833115e-06, + "loss": 4.8357, + "step": 86635 + }, + { + "epoch": 1.7626953125, + "grad_norm": 21.818880081176758, + "learning_rate": 5.930900694117731e-06, + "loss": 5.1584, + "step": 86640 + }, + { + "epoch": 1.7627970377604165, + "grad_norm": 14.429359436035156, + "learning_rate": 5.930508001803994e-06, + "loss": 5.3479, + "step": 86645 + }, + { + "epoch": 1.7628987630208335, + "grad_norm": 19.590835571289062, + "learning_rate": 5.930115303544604e-06, + "loss": 5.0044, + "step": 86650 + }, + { + "epoch": 1.76300048828125, + "grad_norm": 26.211511611938477, + "learning_rate": 5.929722599342072e-06, + "loss": 4.8529, + "step": 86655 + }, + { + "epoch": 1.7631022135416665, + "grad_norm": 13.475521087646484, + "learning_rate": 5.929329889198909e-06, + "loss": 5.0183, + "step": 86660 + }, + { + "epoch": 1.7632039388020835, + "grad_norm": 19.319305419921875, + "learning_rate": 5.9289371731176225e-06, + "loss": 4.7651, + "step": 86665 + }, + { + "epoch": 1.7633056640625, + "grad_norm": 18.999202728271484, + "learning_rate": 5.928544451100723e-06, + "loss": 4.5601, + "step": 86670 + }, + { + "epoch": 1.7634073893229165, + "grad_norm": 17.17632293701172, + "learning_rate": 5.928151723150719e-06, + "loss": 4.8403, + "step": 86675 + }, + { + "epoch": 1.7635091145833335, + "grad_norm": 22.925962448120117, + "learning_rate": 5.927758989270118e-06, + "loss": 5.0686, + "step": 86680 + }, + { + "epoch": 1.76361083984375, + "grad_norm": 18.943214416503906, + "learning_rate": 5.927366249461435e-06, + "loss": 4.7485, + "step": 86685 + }, + { + "epoch": 1.7637125651041665, + "grad_norm": 15.70346736907959, + "learning_rate": 5.926973503727174e-06, + "loss": 5.0186, + "step": 86690 + }, + { + "epoch": 1.7638142903645835, + "grad_norm": 17.610763549804688, + "learning_rate": 5.926580752069846e-06, + "loss": 5.0651, + "step": 86695 + }, + { + "epoch": 1.763916015625, + "grad_norm": 22.006696701049805, + "learning_rate": 5.926187994491963e-06, + "loss": 4.9901, + "step": 86700 + }, + { + "epoch": 1.7640177408854165, + "grad_norm": 17.01923370361328, + "learning_rate": 5.925795230996031e-06, + "loss": 4.7807, + "step": 86705 + }, + { + "epoch": 1.7641194661458335, + "grad_norm": 16.426076889038086, + "learning_rate": 5.925402461584563e-06, + "loss": 4.8391, + "step": 86710 + }, + { + "epoch": 1.76422119140625, + "grad_norm": 18.726408004760742, + "learning_rate": 5.925009686260066e-06, + "loss": 4.6805, + "step": 86715 + }, + { + "epoch": 1.7643229166666665, + "grad_norm": 21.757057189941406, + "learning_rate": 5.92461690502505e-06, + "loss": 5.1584, + "step": 86720 + }, + { + "epoch": 1.7644246419270835, + "grad_norm": 15.462935447692871, + "learning_rate": 5.924224117882027e-06, + "loss": 4.7958, + "step": 86725 + }, + { + "epoch": 1.7645263671875, + "grad_norm": 25.043895721435547, + "learning_rate": 5.923831324833505e-06, + "loss": 4.8332, + "step": 86730 + }, + { + "epoch": 1.7646280924479165, + "grad_norm": 17.774005889892578, + "learning_rate": 5.923438525881994e-06, + "loss": 5.1059, + "step": 86735 + }, + { + "epoch": 1.7647298177083335, + "grad_norm": 25.95446014404297, + "learning_rate": 5.923045721030003e-06, + "loss": 4.914, + "step": 86740 + }, + { + "epoch": 1.76483154296875, + "grad_norm": 18.599708557128906, + "learning_rate": 5.922652910280042e-06, + "loss": 5.1107, + "step": 86745 + }, + { + "epoch": 1.7649332682291665, + "grad_norm": 23.248565673828125, + "learning_rate": 5.922260093634624e-06, + "loss": 4.8515, + "step": 86750 + }, + { + "epoch": 1.7650349934895835, + "grad_norm": 15.184300422668457, + "learning_rate": 5.921867271096256e-06, + "loss": 4.7503, + "step": 86755 + }, + { + "epoch": 1.76513671875, + "grad_norm": 17.4615478515625, + "learning_rate": 5.921474442667449e-06, + "loss": 4.8489, + "step": 86760 + }, + { + "epoch": 1.7652384440104165, + "grad_norm": 17.404489517211914, + "learning_rate": 5.9210816083507124e-06, + "loss": 4.8638, + "step": 86765 + }, + { + "epoch": 1.7653401692708335, + "grad_norm": 20.914697647094727, + "learning_rate": 5.920688768148557e-06, + "loss": 5.1003, + "step": 86770 + }, + { + "epoch": 1.76544189453125, + "grad_norm": 19.103342056274414, + "learning_rate": 5.920295922063494e-06, + "loss": 4.6256, + "step": 86775 + }, + { + "epoch": 1.7655436197916665, + "grad_norm": 19.969575881958008, + "learning_rate": 5.919903070098031e-06, + "loss": 4.7216, + "step": 86780 + }, + { + "epoch": 1.7656453450520835, + "grad_norm": 18.515605926513672, + "learning_rate": 5.919510212254677e-06, + "loss": 4.6663, + "step": 86785 + }, + { + "epoch": 1.7657470703125, + "grad_norm": 18.2778377532959, + "learning_rate": 5.919117348535949e-06, + "loss": 4.7834, + "step": 86790 + }, + { + "epoch": 1.7658487955729165, + "grad_norm": 18.080204010009766, + "learning_rate": 5.91872447894435e-06, + "loss": 4.9747, + "step": 86795 + }, + { + "epoch": 1.7659505208333335, + "grad_norm": 14.61758804321289, + "learning_rate": 5.918331603482395e-06, + "loss": 4.9929, + "step": 86800 + }, + { + "epoch": 1.76605224609375, + "grad_norm": 21.68010902404785, + "learning_rate": 5.917938722152591e-06, + "loss": 4.732, + "step": 86805 + }, + { + "epoch": 1.7661539713541665, + "grad_norm": 15.91997241973877, + "learning_rate": 5.91754583495745e-06, + "loss": 4.8463, + "step": 86810 + }, + { + "epoch": 1.7662556966145835, + "grad_norm": 19.393552780151367, + "learning_rate": 5.917152941899483e-06, + "loss": 4.6643, + "step": 86815 + }, + { + "epoch": 1.766357421875, + "grad_norm": 15.76963996887207, + "learning_rate": 5.916760042981201e-06, + "loss": 4.9676, + "step": 86820 + }, + { + "epoch": 1.7664591471354165, + "grad_norm": 22.490598678588867, + "learning_rate": 5.916367138205111e-06, + "loss": 5.0291, + "step": 86825 + }, + { + "epoch": 1.7665608723958335, + "grad_norm": 21.87406349182129, + "learning_rate": 5.915974227573727e-06, + "loss": 5.2198, + "step": 86830 + }, + { + "epoch": 1.76666259765625, + "grad_norm": 18.951276779174805, + "learning_rate": 5.915581311089559e-06, + "loss": 5.0195, + "step": 86835 + }, + { + "epoch": 1.7667643229166665, + "grad_norm": 19.778993606567383, + "learning_rate": 5.915188388755117e-06, + "loss": 4.9292, + "step": 86840 + }, + { + "epoch": 1.7668660481770835, + "grad_norm": 19.108125686645508, + "learning_rate": 5.914795460572911e-06, + "loss": 4.7906, + "step": 86845 + }, + { + "epoch": 1.7669677734375, + "grad_norm": 19.173755645751953, + "learning_rate": 5.914402526545452e-06, + "loss": 4.9272, + "step": 86850 + }, + { + "epoch": 1.7670694986979165, + "grad_norm": 18.05232048034668, + "learning_rate": 5.9140095866752524e-06, + "loss": 5.1528, + "step": 86855 + }, + { + "epoch": 1.7671712239583335, + "grad_norm": 19.953336715698242, + "learning_rate": 5.913616640964821e-06, + "loss": 4.8432, + "step": 86860 + }, + { + "epoch": 1.76727294921875, + "grad_norm": 21.417037963867188, + "learning_rate": 5.913223689416669e-06, + "loss": 5.0636, + "step": 86865 + }, + { + "epoch": 1.7673746744791665, + "grad_norm": 20.69256591796875, + "learning_rate": 5.9128307320333075e-06, + "loss": 5.4141, + "step": 86870 + }, + { + "epoch": 1.7674763997395835, + "grad_norm": 19.280231475830078, + "learning_rate": 5.912437768817249e-06, + "loss": 4.9206, + "step": 86875 + }, + { + "epoch": 1.767578125, + "grad_norm": 13.675802230834961, + "learning_rate": 5.912044799771001e-06, + "loss": 5.2761, + "step": 86880 + }, + { + "epoch": 1.7676798502604165, + "grad_norm": 22.174774169921875, + "learning_rate": 5.911651824897076e-06, + "loss": 4.6906, + "step": 86885 + }, + { + "epoch": 1.7677815755208335, + "grad_norm": 16.48046875, + "learning_rate": 5.911258844197985e-06, + "loss": 5.037, + "step": 86890 + }, + { + "epoch": 1.76788330078125, + "grad_norm": 17.880935668945312, + "learning_rate": 5.910865857676242e-06, + "loss": 5.0001, + "step": 86895 + }, + { + "epoch": 1.7679850260416665, + "grad_norm": 13.515945434570312, + "learning_rate": 5.910472865334352e-06, + "loss": 4.7298, + "step": 86900 + }, + { + "epoch": 1.7680867513020835, + "grad_norm": 19.961353302001953, + "learning_rate": 5.910079867174831e-06, + "loss": 5.0655, + "step": 86905 + }, + { + "epoch": 1.7681884765625, + "grad_norm": 19.126535415649414, + "learning_rate": 5.90968686320019e-06, + "loss": 4.7822, + "step": 86910 + }, + { + "epoch": 1.7682902018229165, + "grad_norm": 23.28045654296875, + "learning_rate": 5.909293853412937e-06, + "loss": 4.7384, + "step": 86915 + }, + { + "epoch": 1.7683919270833335, + "grad_norm": 14.503271102905273, + "learning_rate": 5.908900837815586e-06, + "loss": 4.9079, + "step": 86920 + }, + { + "epoch": 1.76849365234375, + "grad_norm": 18.175424575805664, + "learning_rate": 5.908507816410646e-06, + "loss": 4.5808, + "step": 86925 + }, + { + "epoch": 1.7685953776041665, + "grad_norm": 23.998388290405273, + "learning_rate": 5.9081147892006295e-06, + "loss": 4.8749, + "step": 86930 + }, + { + "epoch": 1.7686971028645835, + "grad_norm": 16.52667808532715, + "learning_rate": 5.907721756188048e-06, + "loss": 4.6889, + "step": 86935 + }, + { + "epoch": 1.768798828125, + "grad_norm": 22.468931198120117, + "learning_rate": 5.9073287173754135e-06, + "loss": 5.1506, + "step": 86940 + }, + { + "epoch": 1.7689005533854165, + "grad_norm": 14.99986457824707, + "learning_rate": 5.906935672765236e-06, + "loss": 4.8858, + "step": 86945 + }, + { + "epoch": 1.7690022786458335, + "grad_norm": 21.090579986572266, + "learning_rate": 5.906542622360026e-06, + "loss": 5.0823, + "step": 86950 + }, + { + "epoch": 1.76910400390625, + "grad_norm": 18.17315673828125, + "learning_rate": 5.906149566162298e-06, + "loss": 5.0562, + "step": 86955 + }, + { + "epoch": 1.7692057291666665, + "grad_norm": 18.069265365600586, + "learning_rate": 5.9057565041745625e-06, + "loss": 4.8838, + "step": 86960 + }, + { + "epoch": 1.7693074544270835, + "grad_norm": 17.946657180786133, + "learning_rate": 5.905363436399329e-06, + "loss": 4.8087, + "step": 86965 + }, + { + "epoch": 1.7694091796875, + "grad_norm": 19.298377990722656, + "learning_rate": 5.904970362839111e-06, + "loss": 4.6336, + "step": 86970 + }, + { + "epoch": 1.7695109049479165, + "grad_norm": 18.69222640991211, + "learning_rate": 5.90457728349642e-06, + "loss": 5.021, + "step": 86975 + }, + { + "epoch": 1.7696126302083335, + "grad_norm": 12.722639083862305, + "learning_rate": 5.904184198373767e-06, + "loss": 5.0124, + "step": 86980 + }, + { + "epoch": 1.76971435546875, + "grad_norm": 16.282594680786133, + "learning_rate": 5.903791107473664e-06, + "loss": 4.8312, + "step": 86985 + }, + { + "epoch": 1.7698160807291665, + "grad_norm": 14.585951805114746, + "learning_rate": 5.903398010798622e-06, + "loss": 4.9295, + "step": 86990 + }, + { + "epoch": 1.7699178059895835, + "grad_norm": 14.32020378112793, + "learning_rate": 5.903004908351155e-06, + "loss": 4.9205, + "step": 86995 + }, + { + "epoch": 1.77001953125, + "grad_norm": 23.461164474487305, + "learning_rate": 5.9026118001337716e-06, + "loss": 4.9231, + "step": 87000 + }, + { + "epoch": 1.7701212565104165, + "grad_norm": 20.84175682067871, + "learning_rate": 5.902218686148987e-06, + "loss": 4.7649, + "step": 87005 + }, + { + "epoch": 1.7702229817708335, + "grad_norm": 17.357345581054688, + "learning_rate": 5.9018255663993106e-06, + "loss": 5.1801, + "step": 87010 + }, + { + "epoch": 1.77032470703125, + "grad_norm": 18.297000885009766, + "learning_rate": 5.901432440887256e-06, + "loss": 4.7721, + "step": 87015 + }, + { + "epoch": 1.7704264322916665, + "grad_norm": 18.03997230529785, + "learning_rate": 5.901039309615331e-06, + "loss": 4.7509, + "step": 87020 + }, + { + "epoch": 1.7705281575520835, + "grad_norm": 19.443880081176758, + "learning_rate": 5.900646172586055e-06, + "loss": 4.9262, + "step": 87025 + }, + { + "epoch": 1.7706298828125, + "grad_norm": 18.548574447631836, + "learning_rate": 5.900253029801933e-06, + "loss": 4.7459, + "step": 87030 + }, + { + "epoch": 1.7707316080729165, + "grad_norm": 16.479873657226562, + "learning_rate": 5.899859881265482e-06, + "loss": 4.8189, + "step": 87035 + }, + { + "epoch": 1.7708333333333335, + "grad_norm": 18.38920783996582, + "learning_rate": 5.899466726979211e-06, + "loss": 5.2121, + "step": 87040 + }, + { + "epoch": 1.77093505859375, + "grad_norm": 23.26772117614746, + "learning_rate": 5.899073566945633e-06, + "loss": 5.01, + "step": 87045 + }, + { + "epoch": 1.7710367838541665, + "grad_norm": 20.238296508789062, + "learning_rate": 5.8986804011672605e-06, + "loss": 4.9361, + "step": 87050 + }, + { + "epoch": 1.7711385091145835, + "grad_norm": 17.404338836669922, + "learning_rate": 5.8982872296466055e-06, + "loss": 4.8282, + "step": 87055 + }, + { + "epoch": 1.771240234375, + "grad_norm": 13.891304016113281, + "learning_rate": 5.897894052386181e-06, + "loss": 4.6762, + "step": 87060 + }, + { + "epoch": 1.7713419596354165, + "grad_norm": 19.17892837524414, + "learning_rate": 5.897500869388497e-06, + "loss": 4.9804, + "step": 87065 + }, + { + "epoch": 1.7714436848958335, + "grad_norm": 19.76304054260254, + "learning_rate": 5.897107680656068e-06, + "loss": 4.9932, + "step": 87070 + }, + { + "epoch": 1.77154541015625, + "grad_norm": 15.364632606506348, + "learning_rate": 5.896714486191406e-06, + "loss": 4.9399, + "step": 87075 + }, + { + "epoch": 1.7716471354166665, + "grad_norm": 21.838329315185547, + "learning_rate": 5.896321285997023e-06, + "loss": 5.0634, + "step": 87080 + }, + { + "epoch": 1.7717488606770835, + "grad_norm": 18.12010955810547, + "learning_rate": 5.895928080075432e-06, + "loss": 5.2301, + "step": 87085 + }, + { + "epoch": 1.7718505859375, + "grad_norm": 16.582515716552734, + "learning_rate": 5.8955348684291446e-06, + "loss": 4.6874, + "step": 87090 + }, + { + "epoch": 1.7719523111979165, + "grad_norm": 18.315032958984375, + "learning_rate": 5.895141651060673e-06, + "loss": 4.7744, + "step": 87095 + }, + { + "epoch": 1.7720540364583335, + "grad_norm": 16.949848175048828, + "learning_rate": 5.894748427972532e-06, + "loss": 4.7596, + "step": 87100 + }, + { + "epoch": 1.77215576171875, + "grad_norm": 27.30653190612793, + "learning_rate": 5.89435519916723e-06, + "loss": 5.0832, + "step": 87105 + }, + { + "epoch": 1.7722574869791665, + "grad_norm": 22.714962005615234, + "learning_rate": 5.893961964647284e-06, + "loss": 5.0871, + "step": 87110 + }, + { + "epoch": 1.7723592122395835, + "grad_norm": 18.334592819213867, + "learning_rate": 5.893568724415205e-06, + "loss": 4.9559, + "step": 87115 + }, + { + "epoch": 1.7724609375, + "grad_norm": 14.518753051757812, + "learning_rate": 5.893175478473505e-06, + "loss": 4.7546, + "step": 87120 + }, + { + "epoch": 1.7725626627604165, + "grad_norm": 24.23743438720703, + "learning_rate": 5.892782226824697e-06, + "loss": 4.636, + "step": 87125 + }, + { + "epoch": 1.7726643880208335, + "grad_norm": 17.27254867553711, + "learning_rate": 5.892388969471294e-06, + "loss": 4.8798, + "step": 87130 + }, + { + "epoch": 1.77276611328125, + "grad_norm": 19.838218688964844, + "learning_rate": 5.891995706415809e-06, + "loss": 4.915, + "step": 87135 + }, + { + "epoch": 1.7728678385416665, + "grad_norm": 20.678815841674805, + "learning_rate": 5.891602437660755e-06, + "loss": 4.7654, + "step": 87140 + }, + { + "epoch": 1.7729695638020835, + "grad_norm": 16.49329376220703, + "learning_rate": 5.8912091632086445e-06, + "loss": 4.8856, + "step": 87145 + }, + { + "epoch": 1.7730712890625, + "grad_norm": 21.24469757080078, + "learning_rate": 5.890815883061989e-06, + "loss": 4.9504, + "step": 87150 + }, + { + "epoch": 1.7731730143229165, + "grad_norm": 16.464691162109375, + "learning_rate": 5.890422597223304e-06, + "loss": 4.9029, + "step": 87155 + }, + { + "epoch": 1.7732747395833335, + "grad_norm": 16.673322677612305, + "learning_rate": 5.8900293056951e-06, + "loss": 4.7922, + "step": 87160 + }, + { + "epoch": 1.77337646484375, + "grad_norm": 23.0777530670166, + "learning_rate": 5.889636008479893e-06, + "loss": 5.0793, + "step": 87165 + }, + { + "epoch": 1.7734781901041665, + "grad_norm": 18.591793060302734, + "learning_rate": 5.889242705580193e-06, + "loss": 4.716, + "step": 87170 + }, + { + "epoch": 1.7735799153645835, + "grad_norm": 23.04306983947754, + "learning_rate": 5.888849396998515e-06, + "loss": 4.9279, + "step": 87175 + }, + { + "epoch": 1.773681640625, + "grad_norm": 18.406991958618164, + "learning_rate": 5.888456082737371e-06, + "loss": 4.762, + "step": 87180 + }, + { + "epoch": 1.7737833658854165, + "grad_norm": 18.02263069152832, + "learning_rate": 5.888062762799276e-06, + "loss": 4.7827, + "step": 87185 + }, + { + "epoch": 1.7738850911458335, + "grad_norm": 19.60504150390625, + "learning_rate": 5.887669437186741e-06, + "loss": 4.7038, + "step": 87190 + }, + { + "epoch": 1.77398681640625, + "grad_norm": 19.430341720581055, + "learning_rate": 5.88727610590228e-06, + "loss": 5.0386, + "step": 87195 + }, + { + "epoch": 1.7740885416666665, + "grad_norm": 21.511144638061523, + "learning_rate": 5.886882768948406e-06, + "loss": 5.0687, + "step": 87200 + }, + { + "epoch": 1.7741902669270835, + "grad_norm": 19.044750213623047, + "learning_rate": 5.886489426327634e-06, + "loss": 4.9868, + "step": 87205 + }, + { + "epoch": 1.7742919921875, + "grad_norm": 21.798725128173828, + "learning_rate": 5.8860960780424734e-06, + "loss": 4.8199, + "step": 87210 + }, + { + "epoch": 1.7743937174479165, + "grad_norm": 22.714277267456055, + "learning_rate": 5.885702724095443e-06, + "loss": 4.9808, + "step": 87215 + }, + { + "epoch": 1.7744954427083335, + "grad_norm": 21.17294692993164, + "learning_rate": 5.885309364489052e-06, + "loss": 4.8478, + "step": 87220 + }, + { + "epoch": 1.77459716796875, + "grad_norm": 20.728073120117188, + "learning_rate": 5.884915999225815e-06, + "loss": 4.7104, + "step": 87225 + }, + { + "epoch": 1.7746988932291665, + "grad_norm": 15.374987602233887, + "learning_rate": 5.884522628308247e-06, + "loss": 4.7421, + "step": 87230 + }, + { + "epoch": 1.7748006184895835, + "grad_norm": 17.639982223510742, + "learning_rate": 5.884129251738858e-06, + "loss": 4.6775, + "step": 87235 + }, + { + "epoch": 1.77490234375, + "grad_norm": 16.900070190429688, + "learning_rate": 5.883735869520165e-06, + "loss": 4.768, + "step": 87240 + }, + { + "epoch": 1.7750040690104165, + "grad_norm": 18.6893253326416, + "learning_rate": 5.883342481654681e-06, + "loss": 4.9437, + "step": 87245 + }, + { + "epoch": 1.7751057942708335, + "grad_norm": 16.918296813964844, + "learning_rate": 5.882949088144918e-06, + "loss": 5.0552, + "step": 87250 + }, + { + "epoch": 1.77520751953125, + "grad_norm": 13.782252311706543, + "learning_rate": 5.882555688993391e-06, + "loss": 4.7413, + "step": 87255 + }, + { + "epoch": 1.7753092447916665, + "grad_norm": 21.327795028686523, + "learning_rate": 5.882162284202613e-06, + "loss": 4.9643, + "step": 87260 + }, + { + "epoch": 1.7754109700520835, + "grad_norm": 16.891176223754883, + "learning_rate": 5.881768873775098e-06, + "loss": 4.4927, + "step": 87265 + }, + { + "epoch": 1.7755126953125, + "grad_norm": 18.38629722595215, + "learning_rate": 5.881375457713359e-06, + "loss": 4.9713, + "step": 87270 + }, + { + "epoch": 1.7756144205729165, + "grad_norm": 22.139385223388672, + "learning_rate": 5.880982036019912e-06, + "loss": 5.0105, + "step": 87275 + }, + { + "epoch": 1.7757161458333335, + "grad_norm": 23.542877197265625, + "learning_rate": 5.8805886086972685e-06, + "loss": 5.1439, + "step": 87280 + }, + { + "epoch": 1.77581787109375, + "grad_norm": 16.101408004760742, + "learning_rate": 5.8801951757479445e-06, + "loss": 4.5946, + "step": 87285 + }, + { + "epoch": 1.7759195963541665, + "grad_norm": 18.16291046142578, + "learning_rate": 5.8798017371744505e-06, + "loss": 4.7637, + "step": 87290 + }, + { + "epoch": 1.7760213216145835, + "grad_norm": 22.553001403808594, + "learning_rate": 5.879408292979305e-06, + "loss": 4.8312, + "step": 87295 + }, + { + "epoch": 1.776123046875, + "grad_norm": 16.536535263061523, + "learning_rate": 5.879014843165019e-06, + "loss": 4.7749, + "step": 87300 + }, + { + "epoch": 1.7762247721354165, + "grad_norm": 18.931270599365234, + "learning_rate": 5.878621387734107e-06, + "loss": 5.0662, + "step": 87305 + }, + { + "epoch": 1.7763264973958335, + "grad_norm": 16.680341720581055, + "learning_rate": 5.878227926689084e-06, + "loss": 5.1873, + "step": 87310 + }, + { + "epoch": 1.77642822265625, + "grad_norm": 17.970115661621094, + "learning_rate": 5.8778344600324625e-06, + "loss": 5.0576, + "step": 87315 + }, + { + "epoch": 1.7765299479166665, + "grad_norm": 19.266395568847656, + "learning_rate": 5.877440987766758e-06, + "loss": 5.071, + "step": 87320 + }, + { + "epoch": 1.7766316731770835, + "grad_norm": 13.616511344909668, + "learning_rate": 5.877047509894484e-06, + "loss": 4.8317, + "step": 87325 + }, + { + "epoch": 1.7767333984375, + "grad_norm": 21.452192306518555, + "learning_rate": 5.876654026418154e-06, + "loss": 5.1539, + "step": 87330 + }, + { + "epoch": 1.7768351236979165, + "grad_norm": 17.437585830688477, + "learning_rate": 5.876260537340284e-06, + "loss": 5.1284, + "step": 87335 + }, + { + "epoch": 1.7769368489583335, + "grad_norm": 16.981918334960938, + "learning_rate": 5.8758670426633865e-06, + "loss": 4.9636, + "step": 87340 + }, + { + "epoch": 1.77703857421875, + "grad_norm": 14.716462135314941, + "learning_rate": 5.875473542389978e-06, + "loss": 4.9071, + "step": 87345 + }, + { + "epoch": 1.7771402994791665, + "grad_norm": 17.583702087402344, + "learning_rate": 5.875080036522571e-06, + "loss": 4.8666, + "step": 87350 + }, + { + "epoch": 1.7772420247395835, + "grad_norm": 21.04877471923828, + "learning_rate": 5.87468652506368e-06, + "loss": 5.0314, + "step": 87355 + }, + { + "epoch": 1.77734375, + "grad_norm": 15.7358980178833, + "learning_rate": 5.874293008015819e-06, + "loss": 5.0069, + "step": 87360 + }, + { + "epoch": 1.7774454752604165, + "grad_norm": 17.812355041503906, + "learning_rate": 5.873899485381503e-06, + "loss": 4.8693, + "step": 87365 + }, + { + "epoch": 1.7775472005208335, + "grad_norm": 22.59456443786621, + "learning_rate": 5.873505957163249e-06, + "loss": 4.8744, + "step": 87370 + }, + { + "epoch": 1.77764892578125, + "grad_norm": 17.724456787109375, + "learning_rate": 5.873112423363568e-06, + "loss": 4.9905, + "step": 87375 + }, + { + "epoch": 1.7777506510416665, + "grad_norm": 24.107351303100586, + "learning_rate": 5.8727188839849745e-06, + "loss": 4.8718, + "step": 87380 + }, + { + "epoch": 1.7778523763020835, + "grad_norm": 19.4588623046875, + "learning_rate": 5.872325339029986e-06, + "loss": 4.8937, + "step": 87385 + }, + { + "epoch": 1.7779541015625, + "grad_norm": 16.197105407714844, + "learning_rate": 5.871931788501115e-06, + "loss": 4.6682, + "step": 87390 + }, + { + "epoch": 1.7780558268229165, + "grad_norm": 17.792922973632812, + "learning_rate": 5.871538232400877e-06, + "loss": 5.2684, + "step": 87395 + }, + { + "epoch": 1.7781575520833335, + "grad_norm": 17.979114532470703, + "learning_rate": 5.871144670731786e-06, + "loss": 5.0028, + "step": 87400 + }, + { + "epoch": 1.77825927734375, + "grad_norm": 19.838703155517578, + "learning_rate": 5.870751103496357e-06, + "loss": 5.1441, + "step": 87405 + }, + { + "epoch": 1.7783610026041665, + "grad_norm": 13.574052810668945, + "learning_rate": 5.8703575306971035e-06, + "loss": 4.8091, + "step": 87410 + }, + { + "epoch": 1.7784627278645835, + "grad_norm": 20.345691680908203, + "learning_rate": 5.869963952336545e-06, + "loss": 5.3448, + "step": 87415 + }, + { + "epoch": 1.778564453125, + "grad_norm": 19.969120025634766, + "learning_rate": 5.86957036841719e-06, + "loss": 4.6762, + "step": 87420 + }, + { + "epoch": 1.7786661783854165, + "grad_norm": 20.485103607177734, + "learning_rate": 5.8691767789415575e-06, + "loss": 5.125, + "step": 87425 + }, + { + "epoch": 1.7787679036458335, + "grad_norm": 20.595600128173828, + "learning_rate": 5.8687831839121614e-06, + "loss": 4.7006, + "step": 87430 + }, + { + "epoch": 1.77886962890625, + "grad_norm": 15.801783561706543, + "learning_rate": 5.868389583331517e-06, + "loss": 4.6904, + "step": 87435 + }, + { + "epoch": 1.7789713541666665, + "grad_norm": 15.010241508483887, + "learning_rate": 5.867995977202139e-06, + "loss": 4.8243, + "step": 87440 + }, + { + "epoch": 1.7790730794270835, + "grad_norm": 18.123106002807617, + "learning_rate": 5.86760236552654e-06, + "loss": 4.9126, + "step": 87445 + }, + { + "epoch": 1.7791748046875, + "grad_norm": 16.588966369628906, + "learning_rate": 5.86720874830724e-06, + "loss": 4.9677, + "step": 87450 + }, + { + "epoch": 1.7792765299479165, + "grad_norm": 23.13456916809082, + "learning_rate": 5.86681512554675e-06, + "loss": 5.0167, + "step": 87455 + }, + { + "epoch": 1.7793782552083335, + "grad_norm": 12.17510986328125, + "learning_rate": 5.866421497247587e-06, + "loss": 4.9479, + "step": 87460 + }, + { + "epoch": 1.77947998046875, + "grad_norm": 15.723722457885742, + "learning_rate": 5.8660278634122655e-06, + "loss": 4.8967, + "step": 87465 + }, + { + "epoch": 1.7795817057291665, + "grad_norm": 20.69877052307129, + "learning_rate": 5.865634224043301e-06, + "loss": 5.192, + "step": 87470 + }, + { + "epoch": 1.7796834309895835, + "grad_norm": 18.07529640197754, + "learning_rate": 5.865240579143208e-06, + "loss": 4.7393, + "step": 87475 + }, + { + "epoch": 1.77978515625, + "grad_norm": 18.503894805908203, + "learning_rate": 5.864846928714504e-06, + "loss": 4.8574, + "step": 87480 + }, + { + "epoch": 1.7798868815104165, + "grad_norm": 17.66398811340332, + "learning_rate": 5.864453272759701e-06, + "loss": 4.8027, + "step": 87485 + }, + { + "epoch": 1.7799886067708335, + "grad_norm": 16.78095817565918, + "learning_rate": 5.864059611281316e-06, + "loss": 4.913, + "step": 87490 + }, + { + "epoch": 1.78009033203125, + "grad_norm": 17.64847755432129, + "learning_rate": 5.863665944281864e-06, + "loss": 4.9354, + "step": 87495 + }, + { + "epoch": 1.7801920572916665, + "grad_norm": 20.887712478637695, + "learning_rate": 5.863272271763862e-06, + "loss": 4.6955, + "step": 87500 + }, + { + "epoch": 1.7802937825520835, + "grad_norm": 19.568700790405273, + "learning_rate": 5.862878593729825e-06, + "loss": 4.6912, + "step": 87505 + }, + { + "epoch": 1.7803955078125, + "grad_norm": 24.89226531982422, + "learning_rate": 5.862484910182265e-06, + "loss": 4.7974, + "step": 87510 + }, + { + "epoch": 1.7804972330729165, + "grad_norm": 16.519933700561523, + "learning_rate": 5.862091221123703e-06, + "loss": 5.0342, + "step": 87515 + }, + { + "epoch": 1.7805989583333335, + "grad_norm": 26.78883171081543, + "learning_rate": 5.861697526556649e-06, + "loss": 5.0841, + "step": 87520 + }, + { + "epoch": 1.78070068359375, + "grad_norm": 15.58768367767334, + "learning_rate": 5.861303826483624e-06, + "loss": 4.635, + "step": 87525 + }, + { + "epoch": 1.7808024088541665, + "grad_norm": 19.40523338317871, + "learning_rate": 5.860910120907139e-06, + "loss": 4.8758, + "step": 87530 + }, + { + "epoch": 1.7809041341145835, + "grad_norm": 28.403514862060547, + "learning_rate": 5.8605164098297126e-06, + "loss": 4.6746, + "step": 87535 + }, + { + "epoch": 1.781005859375, + "grad_norm": 19.20328712463379, + "learning_rate": 5.860122693253859e-06, + "loss": 4.76, + "step": 87540 + }, + { + "epoch": 1.7811075846354165, + "grad_norm": 20.78742218017578, + "learning_rate": 5.8597289711820944e-06, + "loss": 4.5513, + "step": 87545 + }, + { + "epoch": 1.7812093098958335, + "grad_norm": 28.00459098815918, + "learning_rate": 5.859335243616935e-06, + "loss": 4.9419, + "step": 87550 + }, + { + "epoch": 1.78131103515625, + "grad_norm": 13.067527770996094, + "learning_rate": 5.858941510560896e-06, + "loss": 4.9008, + "step": 87555 + }, + { + "epoch": 1.7814127604166665, + "grad_norm": 19.05767822265625, + "learning_rate": 5.8585477720164926e-06, + "loss": 4.7613, + "step": 87560 + }, + { + "epoch": 1.7815144856770835, + "grad_norm": 15.791375160217285, + "learning_rate": 5.858154027986244e-06, + "loss": 4.8395, + "step": 87565 + }, + { + "epoch": 1.7816162109375, + "grad_norm": 17.20386505126953, + "learning_rate": 5.8577602784726615e-06, + "loss": 4.7564, + "step": 87570 + }, + { + "epoch": 1.7817179361979165, + "grad_norm": 17.52037239074707, + "learning_rate": 5.8573665234782626e-06, + "loss": 4.8727, + "step": 87575 + }, + { + "epoch": 1.7818196614583335, + "grad_norm": 20.438732147216797, + "learning_rate": 5.856972763005566e-06, + "loss": 4.8306, + "step": 87580 + }, + { + "epoch": 1.78192138671875, + "grad_norm": 18.407623291015625, + "learning_rate": 5.856578997057084e-06, + "loss": 4.812, + "step": 87585 + }, + { + "epoch": 1.7820231119791665, + "grad_norm": 28.446914672851562, + "learning_rate": 5.856185225635335e-06, + "loss": 4.8167, + "step": 87590 + }, + { + "epoch": 1.7821248372395835, + "grad_norm": 19.686634063720703, + "learning_rate": 5.855791448742834e-06, + "loss": 4.9989, + "step": 87595 + }, + { + "epoch": 1.7822265625, + "grad_norm": 20.94583511352539, + "learning_rate": 5.855397666382095e-06, + "loss": 5.1956, + "step": 87600 + }, + { + "epoch": 1.7823282877604165, + "grad_norm": 24.784732818603516, + "learning_rate": 5.85500387855564e-06, + "loss": 4.9504, + "step": 87605 + }, + { + "epoch": 1.7824300130208335, + "grad_norm": 15.080114364624023, + "learning_rate": 5.854610085265979e-06, + "loss": 5.125, + "step": 87610 + }, + { + "epoch": 1.78253173828125, + "grad_norm": 20.103351593017578, + "learning_rate": 5.854216286515633e-06, + "loss": 4.849, + "step": 87615 + }, + { + "epoch": 1.7826334635416665, + "grad_norm": 16.937158584594727, + "learning_rate": 5.8538224823071144e-06, + "loss": 4.7785, + "step": 87620 + }, + { + "epoch": 1.7827351888020835, + "grad_norm": 22.914220809936523, + "learning_rate": 5.853428672642942e-06, + "loss": 4.8438, + "step": 87625 + }, + { + "epoch": 1.7828369140625, + "grad_norm": 18.527864456176758, + "learning_rate": 5.85303485752563e-06, + "loss": 4.9139, + "step": 87630 + }, + { + "epoch": 1.7829386393229165, + "grad_norm": 16.332805633544922, + "learning_rate": 5.852641036957698e-06, + "loss": 4.7444, + "step": 87635 + }, + { + "epoch": 1.7830403645833335, + "grad_norm": 22.855735778808594, + "learning_rate": 5.85224721094166e-06, + "loss": 5.0162, + "step": 87640 + }, + { + "epoch": 1.78314208984375, + "grad_norm": 22.805192947387695, + "learning_rate": 5.851853379480032e-06, + "loss": 4.9154, + "step": 87645 + }, + { + "epoch": 1.7832438151041665, + "grad_norm": 15.988571166992188, + "learning_rate": 5.851459542575331e-06, + "loss": 4.9571, + "step": 87650 + }, + { + "epoch": 1.7833455403645835, + "grad_norm": 21.810453414916992, + "learning_rate": 5.851065700230074e-06, + "loss": 4.7043, + "step": 87655 + }, + { + "epoch": 1.783447265625, + "grad_norm": 25.06600570678711, + "learning_rate": 5.850671852446779e-06, + "loss": 4.9755, + "step": 87660 + }, + { + "epoch": 1.7835489908854165, + "grad_norm": 18.194053649902344, + "learning_rate": 5.850277999227959e-06, + "loss": 4.7572, + "step": 87665 + }, + { + "epoch": 1.7836507161458335, + "grad_norm": 16.63043212890625, + "learning_rate": 5.849884140576132e-06, + "loss": 4.8819, + "step": 87670 + }, + { + "epoch": 1.78375244140625, + "grad_norm": 21.346906661987305, + "learning_rate": 5.849490276493816e-06, + "loss": 4.7754, + "step": 87675 + }, + { + "epoch": 1.7838541666666665, + "grad_norm": 24.881765365600586, + "learning_rate": 5.8490964069835276e-06, + "loss": 5.1362, + "step": 87680 + }, + { + "epoch": 1.7839558919270835, + "grad_norm": 18.211620330810547, + "learning_rate": 5.848702532047783e-06, + "loss": 4.7052, + "step": 87685 + }, + { + "epoch": 1.7840576171875, + "grad_norm": 19.74892807006836, + "learning_rate": 5.8483086516890974e-06, + "loss": 4.9678, + "step": 87690 + }, + { + "epoch": 1.7841593424479165, + "grad_norm": 16.353740692138672, + "learning_rate": 5.84791476590999e-06, + "loss": 5.0883, + "step": 87695 + }, + { + "epoch": 1.7842610677083335, + "grad_norm": 13.72539234161377, + "learning_rate": 5.847520874712975e-06, + "loss": 4.8997, + "step": 87700 + }, + { + "epoch": 1.78436279296875, + "grad_norm": 21.224275588989258, + "learning_rate": 5.847126978100571e-06, + "loss": 4.9503, + "step": 87705 + }, + { + "epoch": 1.7844645182291665, + "grad_norm": 19.030752182006836, + "learning_rate": 5.846733076075295e-06, + "loss": 4.8773, + "step": 87710 + }, + { + "epoch": 1.7845662434895835, + "grad_norm": 13.817037582397461, + "learning_rate": 5.846339168639663e-06, + "loss": 4.7184, + "step": 87715 + }, + { + "epoch": 1.78466796875, + "grad_norm": 15.792333602905273, + "learning_rate": 5.845945255796193e-06, + "loss": 5.1347, + "step": 87720 + }, + { + "epoch": 1.7847696940104165, + "grad_norm": 25.1926326751709, + "learning_rate": 5.845551337547401e-06, + "loss": 4.6327, + "step": 87725 + }, + { + "epoch": 1.7848714192708335, + "grad_norm": 18.586469650268555, + "learning_rate": 5.845157413895804e-06, + "loss": 4.9238, + "step": 87730 + }, + { + "epoch": 1.78497314453125, + "grad_norm": 17.864505767822266, + "learning_rate": 5.8447634848439206e-06, + "loss": 4.8859, + "step": 87735 + }, + { + "epoch": 1.7850748697916665, + "grad_norm": 16.86371612548828, + "learning_rate": 5.844369550394265e-06, + "loss": 4.8378, + "step": 87740 + }, + { + "epoch": 1.7851765950520835, + "grad_norm": 15.173039436340332, + "learning_rate": 5.843975610549357e-06, + "loss": 5.0018, + "step": 87745 + }, + { + "epoch": 1.7852783203125, + "grad_norm": 20.2265625, + "learning_rate": 5.843581665311714e-06, + "loss": 4.8208, + "step": 87750 + }, + { + "epoch": 1.7853800455729165, + "grad_norm": 15.509832382202148, + "learning_rate": 5.84318771468385e-06, + "loss": 4.7608, + "step": 87755 + }, + { + "epoch": 1.7854817708333335, + "grad_norm": 12.543475151062012, + "learning_rate": 5.842793758668285e-06, + "loss": 5.0952, + "step": 87760 + }, + { + "epoch": 1.78558349609375, + "grad_norm": 16.65633773803711, + "learning_rate": 5.842399797267535e-06, + "loss": 5.1259, + "step": 87765 + }, + { + "epoch": 1.7856852213541665, + "grad_norm": 14.463776588439941, + "learning_rate": 5.842005830484118e-06, + "loss": 4.8394, + "step": 87770 + }, + { + "epoch": 1.7857869466145835, + "grad_norm": 16.750905990600586, + "learning_rate": 5.841611858320552e-06, + "loss": 4.8663, + "step": 87775 + }, + { + "epoch": 1.785888671875, + "grad_norm": 18.759244918823242, + "learning_rate": 5.8412178807793505e-06, + "loss": 4.7734, + "step": 87780 + }, + { + "epoch": 1.7859903971354165, + "grad_norm": 17.191877365112305, + "learning_rate": 5.8408238978630374e-06, + "loss": 4.848, + "step": 87785 + }, + { + "epoch": 1.7860921223958335, + "grad_norm": 23.488418579101562, + "learning_rate": 5.840429909574125e-06, + "loss": 4.9114, + "step": 87790 + }, + { + "epoch": 1.78619384765625, + "grad_norm": 16.532955169677734, + "learning_rate": 5.840035915915131e-06, + "loss": 4.6878, + "step": 87795 + }, + { + "epoch": 1.7862955729166665, + "grad_norm": 17.656999588012695, + "learning_rate": 5.839641916888576e-06, + "loss": 5.2102, + "step": 87800 + }, + { + "epoch": 1.7863972981770835, + "grad_norm": 16.658405303955078, + "learning_rate": 5.839247912496974e-06, + "loss": 4.908, + "step": 87805 + }, + { + "epoch": 1.7864990234375, + "grad_norm": 16.859039306640625, + "learning_rate": 5.838853902742844e-06, + "loss": 4.9509, + "step": 87810 + }, + { + "epoch": 1.7866007486979165, + "grad_norm": 17.457162857055664, + "learning_rate": 5.838459887628706e-06, + "loss": 4.8851, + "step": 87815 + }, + { + "epoch": 1.7867024739583335, + "grad_norm": 13.283500671386719, + "learning_rate": 5.838065867157073e-06, + "loss": 4.8948, + "step": 87820 + }, + { + "epoch": 1.78680419921875, + "grad_norm": 13.442310333251953, + "learning_rate": 5.837671841330468e-06, + "loss": 4.9393, + "step": 87825 + }, + { + "epoch": 1.7869059244791665, + "grad_norm": 18.223970413208008, + "learning_rate": 5.837277810151403e-06, + "loss": 4.8184, + "step": 87830 + }, + { + "epoch": 1.7870076497395835, + "grad_norm": 24.088029861450195, + "learning_rate": 5.8368837736224e-06, + "loss": 4.9643, + "step": 87835 + }, + { + "epoch": 1.787109375, + "grad_norm": 18.353212356567383, + "learning_rate": 5.836489731745976e-06, + "loss": 4.9005, + "step": 87840 + }, + { + "epoch": 1.7872111002604165, + "grad_norm": 17.379390716552734, + "learning_rate": 5.836095684524648e-06, + "loss": 4.8978, + "step": 87845 + }, + { + "epoch": 1.7873128255208335, + "grad_norm": 14.055397987365723, + "learning_rate": 5.835701631960933e-06, + "loss": 4.9531, + "step": 87850 + }, + { + "epoch": 1.78741455078125, + "grad_norm": 14.243861198425293, + "learning_rate": 5.8353075740573505e-06, + "loss": 5.087, + "step": 87855 + }, + { + "epoch": 1.7875162760416665, + "grad_norm": 14.861563682556152, + "learning_rate": 5.834913510816418e-06, + "loss": 4.9375, + "step": 87860 + }, + { + "epoch": 1.7876180013020835, + "grad_norm": 16.089401245117188, + "learning_rate": 5.834519442240653e-06, + "loss": 5.0649, + "step": 87865 + }, + { + "epoch": 1.7877197265625, + "grad_norm": 17.275686264038086, + "learning_rate": 5.8341253683325725e-06, + "loss": 5.0593, + "step": 87870 + }, + { + "epoch": 1.7878214518229165, + "grad_norm": 17.298242568969727, + "learning_rate": 5.8337312890946975e-06, + "loss": 4.6982, + "step": 87875 + }, + { + "epoch": 1.7879231770833335, + "grad_norm": 16.11362075805664, + "learning_rate": 5.833337204529543e-06, + "loss": 4.8218, + "step": 87880 + }, + { + "epoch": 1.78802490234375, + "grad_norm": 19.67045021057129, + "learning_rate": 5.8329431146396285e-06, + "loss": 4.8087, + "step": 87885 + }, + { + "epoch": 1.7881266276041665, + "grad_norm": 21.139001846313477, + "learning_rate": 5.832549019427473e-06, + "loss": 5.1357, + "step": 87890 + }, + { + "epoch": 1.7882283528645835, + "grad_norm": 17.479318618774414, + "learning_rate": 5.832154918895592e-06, + "loss": 4.8353, + "step": 87895 + }, + { + "epoch": 1.788330078125, + "grad_norm": 19.4914493560791, + "learning_rate": 5.831760813046505e-06, + "loss": 4.7219, + "step": 87900 + }, + { + "epoch": 1.7884318033854165, + "grad_norm": 17.035024642944336, + "learning_rate": 5.831366701882733e-06, + "loss": 4.7151, + "step": 87905 + }, + { + "epoch": 1.7885335286458335, + "grad_norm": 16.14463996887207, + "learning_rate": 5.830972585406789e-06, + "loss": 4.6478, + "step": 87910 + }, + { + "epoch": 1.78863525390625, + "grad_norm": 18.32030487060547, + "learning_rate": 5.8305784636211945e-06, + "loss": 5.0878, + "step": 87915 + }, + { + "epoch": 1.7887369791666665, + "grad_norm": 14.933008193969727, + "learning_rate": 5.830184336528469e-06, + "loss": 4.6684, + "step": 87920 + }, + { + "epoch": 1.7888387044270835, + "grad_norm": 12.487655639648438, + "learning_rate": 5.829790204131128e-06, + "loss": 5.1352, + "step": 87925 + }, + { + "epoch": 1.7889404296875, + "grad_norm": 18.269485473632812, + "learning_rate": 5.829396066431691e-06, + "loss": 4.6918, + "step": 87930 + }, + { + "epoch": 1.7890421549479165, + "grad_norm": 12.6039400100708, + "learning_rate": 5.829001923432675e-06, + "loss": 4.6464, + "step": 87935 + }, + { + "epoch": 1.7891438802083335, + "grad_norm": 19.319875717163086, + "learning_rate": 5.828607775136602e-06, + "loss": 5.1878, + "step": 87940 + }, + { + "epoch": 1.78924560546875, + "grad_norm": 16.99966812133789, + "learning_rate": 5.828213621545987e-06, + "loss": 4.7403, + "step": 87945 + }, + { + "epoch": 1.7893473307291665, + "grad_norm": 17.147424697875977, + "learning_rate": 5.82781946266335e-06, + "loss": 4.6664, + "step": 87950 + }, + { + "epoch": 1.7894490559895835, + "grad_norm": 18.313291549682617, + "learning_rate": 5.827425298491209e-06, + "loss": 4.9021, + "step": 87955 + }, + { + "epoch": 1.78955078125, + "grad_norm": 16.104507446289062, + "learning_rate": 5.827031129032084e-06, + "loss": 4.7623, + "step": 87960 + }, + { + "epoch": 1.7896525065104165, + "grad_norm": 11.465877532958984, + "learning_rate": 5.826636954288493e-06, + "loss": 5.0278, + "step": 87965 + }, + { + "epoch": 1.7897542317708335, + "grad_norm": 15.040751457214355, + "learning_rate": 5.826242774262954e-06, + "loss": 4.8385, + "step": 87970 + }, + { + "epoch": 1.78985595703125, + "grad_norm": 19.50025749206543, + "learning_rate": 5.825848588957986e-06, + "loss": 5.0599, + "step": 87975 + }, + { + "epoch": 1.7899576822916665, + "grad_norm": 18.500469207763672, + "learning_rate": 5.825454398376107e-06, + "loss": 4.8523, + "step": 87980 + }, + { + "epoch": 1.7900594075520835, + "grad_norm": 20.017345428466797, + "learning_rate": 5.825060202519837e-06, + "loss": 5.2488, + "step": 87985 + }, + { + "epoch": 1.7901611328125, + "grad_norm": 18.851821899414062, + "learning_rate": 5.8246660013916936e-06, + "loss": 4.8128, + "step": 87990 + }, + { + "epoch": 1.7902628580729165, + "grad_norm": 16.91675567626953, + "learning_rate": 5.8242717949941976e-06, + "loss": 4.6203, + "step": 87995 + }, + { + "epoch": 1.7903645833333335, + "grad_norm": 14.676960945129395, + "learning_rate": 5.823877583329864e-06, + "loss": 4.8953, + "step": 88000 + }, + { + "epoch": 1.79046630859375, + "grad_norm": 18.755821228027344, + "learning_rate": 5.823483366401217e-06, + "loss": 4.9214, + "step": 88005 + }, + { + "epoch": 1.7905680338541665, + "grad_norm": 15.604779243469238, + "learning_rate": 5.823089144210772e-06, + "loss": 4.8893, + "step": 88010 + }, + { + "epoch": 1.7906697591145835, + "grad_norm": 43.05686569213867, + "learning_rate": 5.822694916761047e-06, + "loss": 4.8324, + "step": 88015 + }, + { + "epoch": 1.790771484375, + "grad_norm": 22.33287811279297, + "learning_rate": 5.822300684054564e-06, + "loss": 4.7283, + "step": 88020 + }, + { + "epoch": 1.7908732096354165, + "grad_norm": 14.051888465881348, + "learning_rate": 5.821906446093839e-06, + "loss": 4.5938, + "step": 88025 + }, + { + "epoch": 1.7909749348958335, + "grad_norm": 17.132219314575195, + "learning_rate": 5.8215122028813955e-06, + "loss": 4.8367, + "step": 88030 + }, + { + "epoch": 1.79107666015625, + "grad_norm": 16.10393714904785, + "learning_rate": 5.821117954419749e-06, + "loss": 4.9912, + "step": 88035 + }, + { + "epoch": 1.7911783854166665, + "grad_norm": 19.640117645263672, + "learning_rate": 5.820723700711417e-06, + "loss": 4.7464, + "step": 88040 + }, + { + "epoch": 1.7912801106770835, + "grad_norm": 16.569578170776367, + "learning_rate": 5.820329441758924e-06, + "loss": 4.8643, + "step": 88045 + }, + { + "epoch": 1.7913818359375, + "grad_norm": 19.52069664001465, + "learning_rate": 5.819935177564784e-06, + "loss": 5.2104, + "step": 88050 + }, + { + "epoch": 1.7914835611979165, + "grad_norm": 15.419842720031738, + "learning_rate": 5.81954090813152e-06, + "loss": 4.853, + "step": 88055 + }, + { + "epoch": 1.7915852864583335, + "grad_norm": 21.034725189208984, + "learning_rate": 5.81914663346165e-06, + "loss": 4.9993, + "step": 88060 + }, + { + "epoch": 1.79168701171875, + "grad_norm": 18.809412002563477, + "learning_rate": 5.818752353557692e-06, + "loss": 4.9353, + "step": 88065 + }, + { + "epoch": 1.7917887369791665, + "grad_norm": 17.36994171142578, + "learning_rate": 5.818358068422167e-06, + "loss": 4.9218, + "step": 88070 + }, + { + "epoch": 1.7918904622395835, + "grad_norm": 22.09026336669922, + "learning_rate": 5.817963778057594e-06, + "loss": 5.0351, + "step": 88075 + }, + { + "epoch": 1.7919921875, + "grad_norm": 18.84889030456543, + "learning_rate": 5.817569482466491e-06, + "loss": 4.9745, + "step": 88080 + }, + { + "epoch": 1.7920939127604165, + "grad_norm": 18.948436737060547, + "learning_rate": 5.817175181651379e-06, + "loss": 4.8396, + "step": 88085 + }, + { + "epoch": 1.7921956380208335, + "grad_norm": 29.03215980529785, + "learning_rate": 5.816780875614776e-06, + "loss": 4.6434, + "step": 88090 + }, + { + "epoch": 1.79229736328125, + "grad_norm": 19.42930030822754, + "learning_rate": 5.8163865643592045e-06, + "loss": 4.7101, + "step": 88095 + }, + { + "epoch": 1.7923990885416665, + "grad_norm": 22.060054779052734, + "learning_rate": 5.8159922478871825e-06, + "loss": 4.9521, + "step": 88100 + }, + { + "epoch": 1.7925008138020835, + "grad_norm": 17.950237274169922, + "learning_rate": 5.815597926201226e-06, + "loss": 4.9204, + "step": 88105 + }, + { + "epoch": 1.7926025390625, + "grad_norm": 27.189172744750977, + "learning_rate": 5.81520359930386e-06, + "loss": 4.9694, + "step": 88110 + }, + { + "epoch": 1.7927042643229165, + "grad_norm": 13.59628963470459, + "learning_rate": 5.8148092671976006e-06, + "loss": 4.821, + "step": 88115 + }, + { + "epoch": 1.7928059895833335, + "grad_norm": 21.38927459716797, + "learning_rate": 5.81441492988497e-06, + "loss": 4.8995, + "step": 88120 + }, + { + "epoch": 1.79290771484375, + "grad_norm": 21.618480682373047, + "learning_rate": 5.814020587368486e-06, + "loss": 5.0077, + "step": 88125 + }, + { + "epoch": 1.7930094401041665, + "grad_norm": 21.039417266845703, + "learning_rate": 5.813626239650668e-06, + "loss": 4.7664, + "step": 88130 + }, + { + "epoch": 1.7931111653645835, + "grad_norm": 16.519176483154297, + "learning_rate": 5.813231886734037e-06, + "loss": 4.7744, + "step": 88135 + }, + { + "epoch": 1.793212890625, + "grad_norm": 16.31978416442871, + "learning_rate": 5.812837528621113e-06, + "loss": 4.8315, + "step": 88140 + }, + { + "epoch": 1.7933146158854165, + "grad_norm": 15.28887939453125, + "learning_rate": 5.8124431653144155e-06, + "loss": 4.8705, + "step": 88145 + }, + { + "epoch": 1.7934163411458335, + "grad_norm": 21.681955337524414, + "learning_rate": 5.812048796816463e-06, + "loss": 4.8252, + "step": 88150 + }, + { + "epoch": 1.79351806640625, + "grad_norm": 20.242258071899414, + "learning_rate": 5.811654423129777e-06, + "loss": 5.0339, + "step": 88155 + }, + { + "epoch": 1.7936197916666665, + "grad_norm": 14.347769737243652, + "learning_rate": 5.811260044256876e-06, + "loss": 4.9586, + "step": 88160 + }, + { + "epoch": 1.7937215169270835, + "grad_norm": 20.36470603942871, + "learning_rate": 5.8108656602002836e-06, + "loss": 4.7153, + "step": 88165 + }, + { + "epoch": 1.7938232421875, + "grad_norm": 17.175378799438477, + "learning_rate": 5.810471270962513e-06, + "loss": 4.9467, + "step": 88170 + }, + { + "epoch": 1.7939249674479165, + "grad_norm": 14.91232681274414, + "learning_rate": 5.8100768765460915e-06, + "loss": 4.8726, + "step": 88175 + }, + { + "epoch": 1.7940266927083335, + "grad_norm": 18.744903564453125, + "learning_rate": 5.809682476953535e-06, + "loss": 4.7378, + "step": 88180 + }, + { + "epoch": 1.79412841796875, + "grad_norm": 14.848149299621582, + "learning_rate": 5.809288072187363e-06, + "loss": 4.8008, + "step": 88185 + }, + { + "epoch": 1.7942301432291665, + "grad_norm": 16.96190643310547, + "learning_rate": 5.808893662250099e-06, + "loss": 4.6361, + "step": 88190 + }, + { + "epoch": 1.7943318684895835, + "grad_norm": 17.38351821899414, + "learning_rate": 5.808499247144261e-06, + "loss": 5.0172, + "step": 88195 + }, + { + "epoch": 1.79443359375, + "grad_norm": 16.48365020751953, + "learning_rate": 5.808104826872369e-06, + "loss": 4.7097, + "step": 88200 + }, + { + "epoch": 1.7945353190104165, + "grad_norm": 16.174381256103516, + "learning_rate": 5.807710401436943e-06, + "loss": 4.6201, + "step": 88205 + }, + { + "epoch": 1.7946370442708335, + "grad_norm": 17.29930877685547, + "learning_rate": 5.807315970840506e-06, + "loss": 4.9628, + "step": 88210 + }, + { + "epoch": 1.79473876953125, + "grad_norm": 16.325759887695312, + "learning_rate": 5.806921535085573e-06, + "loss": 4.9227, + "step": 88215 + }, + { + "epoch": 1.7948404947916665, + "grad_norm": 19.502046585083008, + "learning_rate": 5.80652709417467e-06, + "loss": 4.782, + "step": 88220 + }, + { + "epoch": 1.7949422200520835, + "grad_norm": 17.992725372314453, + "learning_rate": 5.806132648110314e-06, + "loss": 4.8827, + "step": 88225 + }, + { + "epoch": 1.7950439453125, + "grad_norm": 15.970784187316895, + "learning_rate": 5.805738196895028e-06, + "loss": 4.8782, + "step": 88230 + }, + { + "epoch": 1.7951456705729165, + "grad_norm": 20.71463394165039, + "learning_rate": 5.8053437405313285e-06, + "loss": 4.6387, + "step": 88235 + }, + { + "epoch": 1.7952473958333335, + "grad_norm": 20.047161102294922, + "learning_rate": 5.804949279021739e-06, + "loss": 4.8972, + "step": 88240 + }, + { + "epoch": 1.79534912109375, + "grad_norm": 18.385475158691406, + "learning_rate": 5.804554812368779e-06, + "loss": 5.151, + "step": 88245 + }, + { + "epoch": 1.7954508463541665, + "grad_norm": 21.6641845703125, + "learning_rate": 5.804160340574969e-06, + "loss": 4.9718, + "step": 88250 + }, + { + "epoch": 1.7955525716145835, + "grad_norm": 12.148396492004395, + "learning_rate": 5.803765863642831e-06, + "loss": 4.9684, + "step": 88255 + }, + { + "epoch": 1.795654296875, + "grad_norm": 17.324207305908203, + "learning_rate": 5.803371381574883e-06, + "loss": 4.76, + "step": 88260 + }, + { + "epoch": 1.7957560221354165, + "grad_norm": 22.056964874267578, + "learning_rate": 5.802976894373646e-06, + "loss": 4.6743, + "step": 88265 + }, + { + "epoch": 1.7958577473958335, + "grad_norm": 16.340045928955078, + "learning_rate": 5.802582402041643e-06, + "loss": 4.8598, + "step": 88270 + }, + { + "epoch": 1.79595947265625, + "grad_norm": 17.85617446899414, + "learning_rate": 5.8021879045813935e-06, + "loss": 4.8668, + "step": 88275 + }, + { + "epoch": 1.7960611979166665, + "grad_norm": 20.53058433532715, + "learning_rate": 5.801793401995417e-06, + "loss": 4.9707, + "step": 88280 + }, + { + "epoch": 1.7961629231770835, + "grad_norm": 21.83555030822754, + "learning_rate": 5.801398894286236e-06, + "loss": 5.189, + "step": 88285 + }, + { + "epoch": 1.7962646484375, + "grad_norm": 15.942612648010254, + "learning_rate": 5.80100438145637e-06, + "loss": 4.9821, + "step": 88290 + }, + { + "epoch": 1.7963663736979165, + "grad_norm": 19.328168869018555, + "learning_rate": 5.800609863508341e-06, + "loss": 4.7793, + "step": 88295 + }, + { + "epoch": 1.7964680989583335, + "grad_norm": 17.22724151611328, + "learning_rate": 5.800215340444667e-06, + "loss": 4.8726, + "step": 88300 + }, + { + "epoch": 1.79656982421875, + "grad_norm": 22.39527702331543, + "learning_rate": 5.799820812267874e-06, + "loss": 4.7166, + "step": 88305 + }, + { + "epoch": 1.7966715494791665, + "grad_norm": 17.387617111206055, + "learning_rate": 5.799426278980477e-06, + "loss": 4.929, + "step": 88310 + }, + { + "epoch": 1.7967732747395835, + "grad_norm": 15.582245826721191, + "learning_rate": 5.799031740585002e-06, + "loss": 4.7226, + "step": 88315 + }, + { + "epoch": 1.796875, + "grad_norm": 15.046652793884277, + "learning_rate": 5.798637197083967e-06, + "loss": 5.0525, + "step": 88320 + }, + { + "epoch": 1.7969767252604165, + "grad_norm": 17.335386276245117, + "learning_rate": 5.798242648479893e-06, + "loss": 4.842, + "step": 88325 + }, + { + "epoch": 1.7970784505208335, + "grad_norm": 23.84859848022461, + "learning_rate": 5.797848094775304e-06, + "loss": 4.9174, + "step": 88330 + }, + { + "epoch": 1.79718017578125, + "grad_norm": 20.123197555541992, + "learning_rate": 5.797453535972715e-06, + "loss": 4.8485, + "step": 88335 + }, + { + "epoch": 1.7972819010416665, + "grad_norm": 23.140493392944336, + "learning_rate": 5.797058972074655e-06, + "loss": 4.8704, + "step": 88340 + }, + { + "epoch": 1.7973836263020835, + "grad_norm": 18.707529067993164, + "learning_rate": 5.796664403083639e-06, + "loss": 5.0032, + "step": 88345 + }, + { + "epoch": 1.7974853515625, + "grad_norm": 17.460344314575195, + "learning_rate": 5.7962698290021926e-06, + "loss": 4.9176, + "step": 88350 + }, + { + "epoch": 1.7975870768229165, + "grad_norm": 22.143062591552734, + "learning_rate": 5.795875249832832e-06, + "loss": 5.252, + "step": 88355 + }, + { + "epoch": 1.7976888020833335, + "grad_norm": 16.731687545776367, + "learning_rate": 5.795480665578083e-06, + "loss": 4.9039, + "step": 88360 + }, + { + "epoch": 1.79779052734375, + "grad_norm": 16.442947387695312, + "learning_rate": 5.795086076240465e-06, + "loss": 4.9529, + "step": 88365 + }, + { + "epoch": 1.7978922526041665, + "grad_norm": 15.278543472290039, + "learning_rate": 5.794691481822498e-06, + "loss": 5.2099, + "step": 88370 + }, + { + "epoch": 1.7979939778645835, + "grad_norm": 12.35828971862793, + "learning_rate": 5.794296882326707e-06, + "loss": 4.8732, + "step": 88375 + }, + { + "epoch": 1.798095703125, + "grad_norm": 18.76160430908203, + "learning_rate": 5.7939022777556095e-06, + "loss": 4.7316, + "step": 88380 + }, + { + "epoch": 1.7981974283854165, + "grad_norm": 16.38729476928711, + "learning_rate": 5.793507668111729e-06, + "loss": 5.1839, + "step": 88385 + }, + { + "epoch": 1.7982991536458335, + "grad_norm": 17.131010055541992, + "learning_rate": 5.793113053397586e-06, + "loss": 4.7768, + "step": 88390 + }, + { + "epoch": 1.79840087890625, + "grad_norm": 17.344350814819336, + "learning_rate": 5.792718433615703e-06, + "loss": 5.0204, + "step": 88395 + }, + { + "epoch": 1.7985026041666665, + "grad_norm": 19.046228408813477, + "learning_rate": 5.792323808768598e-06, + "loss": 4.9589, + "step": 88400 + }, + { + "epoch": 1.7986043294270835, + "grad_norm": 15.94335651397705, + "learning_rate": 5.791929178858799e-06, + "loss": 4.839, + "step": 88405 + }, + { + "epoch": 1.7987060546875, + "grad_norm": 16.619951248168945, + "learning_rate": 5.791534543888824e-06, + "loss": 4.8092, + "step": 88410 + }, + { + "epoch": 1.7988077799479165, + "grad_norm": 17.07763671875, + "learning_rate": 5.791139903861192e-06, + "loss": 4.9418, + "step": 88415 + }, + { + "epoch": 1.7989095052083335, + "grad_norm": 17.815322875976562, + "learning_rate": 5.790745258778428e-06, + "loss": 4.707, + "step": 88420 + }, + { + "epoch": 1.79901123046875, + "grad_norm": 18.145071029663086, + "learning_rate": 5.790350608643053e-06, + "loss": 5.08, + "step": 88425 + }, + { + "epoch": 1.7991129557291665, + "grad_norm": 15.412796020507812, + "learning_rate": 5.789955953457589e-06, + "loss": 4.8747, + "step": 88430 + }, + { + "epoch": 1.7992146809895835, + "grad_norm": 16.34760093688965, + "learning_rate": 5.789561293224558e-06, + "loss": 4.9006, + "step": 88435 + }, + { + "epoch": 1.79931640625, + "grad_norm": 15.385222434997559, + "learning_rate": 5.789166627946479e-06, + "loss": 4.8193, + "step": 88440 + }, + { + "epoch": 1.7994181315104165, + "grad_norm": 15.027897834777832, + "learning_rate": 5.788771957625877e-06, + "loss": 4.7684, + "step": 88445 + }, + { + "epoch": 1.7995198567708335, + "grad_norm": 17.3519344329834, + "learning_rate": 5.788377282265273e-06, + "loss": 4.9311, + "step": 88450 + }, + { + "epoch": 1.79962158203125, + "grad_norm": 16.70494270324707, + "learning_rate": 5.787982601867187e-06, + "loss": 4.7653, + "step": 88455 + }, + { + "epoch": 1.7997233072916665, + "grad_norm": 18.184017181396484, + "learning_rate": 5.7875879164341434e-06, + "loss": 4.7617, + "step": 88460 + }, + { + "epoch": 1.7998250325520835, + "grad_norm": 16.51224136352539, + "learning_rate": 5.7871932259686615e-06, + "loss": 4.989, + "step": 88465 + }, + { + "epoch": 1.7999267578125, + "grad_norm": 21.264606475830078, + "learning_rate": 5.786798530473267e-06, + "loss": 5.0075, + "step": 88470 + }, + { + "epoch": 1.8000284830729165, + "grad_norm": 28.733661651611328, + "learning_rate": 5.786403829950477e-06, + "loss": 4.799, + "step": 88475 + }, + { + "epoch": 1.8001302083333335, + "grad_norm": 18.91158676147461, + "learning_rate": 5.786009124402818e-06, + "loss": 4.8294, + "step": 88480 + }, + { + "epoch": 1.80023193359375, + "grad_norm": 25.384078979492188, + "learning_rate": 5.785614413832809e-06, + "loss": 4.8982, + "step": 88485 + }, + { + "epoch": 1.8003336588541665, + "grad_norm": 18.242311477661133, + "learning_rate": 5.785219698242975e-06, + "loss": 5.0114, + "step": 88490 + }, + { + "epoch": 1.8004353841145835, + "grad_norm": 24.76007652282715, + "learning_rate": 5.784824977635836e-06, + "loss": 5.0384, + "step": 88495 + }, + { + "epoch": 1.800537109375, + "grad_norm": 16.23733901977539, + "learning_rate": 5.784430252013914e-06, + "loss": 5.1326, + "step": 88500 + }, + { + "epoch": 1.8006388346354165, + "grad_norm": 17.793832778930664, + "learning_rate": 5.78403552137973e-06, + "loss": 4.6734, + "step": 88505 + }, + { + "epoch": 1.8007405598958335, + "grad_norm": 14.769939422607422, + "learning_rate": 5.783640785735812e-06, + "loss": 4.8556, + "step": 88510 + }, + { + "epoch": 1.80084228515625, + "grad_norm": 20.013790130615234, + "learning_rate": 5.783246045084675e-06, + "loss": 4.9377, + "step": 88515 + }, + { + "epoch": 1.8009440104166665, + "grad_norm": 16.81150245666504, + "learning_rate": 5.782851299428844e-06, + "loss": 4.8716, + "step": 88520 + }, + { + "epoch": 1.8010457356770835, + "grad_norm": 15.153989791870117, + "learning_rate": 5.782456548770844e-06, + "loss": 4.6781, + "step": 88525 + }, + { + "epoch": 1.8011474609375, + "grad_norm": 19.247785568237305, + "learning_rate": 5.782061793113194e-06, + "loss": 5.1584, + "step": 88530 + }, + { + "epoch": 1.8012491861979165, + "grad_norm": 14.73239803314209, + "learning_rate": 5.7816670324584175e-06, + "loss": 4.9268, + "step": 88535 + }, + { + "epoch": 1.8013509114583335, + "grad_norm": 20.3370361328125, + "learning_rate": 5.781272266809037e-06, + "loss": 4.7091, + "step": 88540 + }, + { + "epoch": 1.80145263671875, + "grad_norm": 17.776952743530273, + "learning_rate": 5.780877496167574e-06, + "loss": 4.6713, + "step": 88545 + }, + { + "epoch": 1.8015543619791665, + "grad_norm": 15.850624084472656, + "learning_rate": 5.780482720536552e-06, + "loss": 4.9256, + "step": 88550 + }, + { + "epoch": 1.8016560872395835, + "grad_norm": 19.978567123413086, + "learning_rate": 5.780087939918493e-06, + "loss": 4.6702, + "step": 88555 + }, + { + "epoch": 1.8017578125, + "grad_norm": 24.514074325561523, + "learning_rate": 5.779693154315921e-06, + "loss": 4.7537, + "step": 88560 + }, + { + "epoch": 1.8018595377604165, + "grad_norm": 22.173452377319336, + "learning_rate": 5.779298363731356e-06, + "loss": 4.8548, + "step": 88565 + }, + { + "epoch": 1.8019612630208335, + "grad_norm": 19.220684051513672, + "learning_rate": 5.778903568167322e-06, + "loss": 4.8024, + "step": 88570 + }, + { + "epoch": 1.80206298828125, + "grad_norm": 16.125118255615234, + "learning_rate": 5.778508767626342e-06, + "loss": 5.0153, + "step": 88575 + }, + { + "epoch": 1.8021647135416665, + "grad_norm": 15.80095386505127, + "learning_rate": 5.778113962110937e-06, + "loss": 5.0378, + "step": 88580 + }, + { + "epoch": 1.8022664388020835, + "grad_norm": 24.15378189086914, + "learning_rate": 5.777719151623631e-06, + "loss": 4.9017, + "step": 88585 + }, + { + "epoch": 1.8023681640625, + "grad_norm": 13.89053726196289, + "learning_rate": 5.7773243361669476e-06, + "loss": 4.8442, + "step": 88590 + }, + { + "epoch": 1.8024698893229165, + "grad_norm": 20.560932159423828, + "learning_rate": 5.776929515743407e-06, + "loss": 5.0689, + "step": 88595 + }, + { + "epoch": 1.8025716145833335, + "grad_norm": 17.54327964782715, + "learning_rate": 5.776534690355535e-06, + "loss": 5.0619, + "step": 88600 + }, + { + "epoch": 1.80267333984375, + "grad_norm": 17.401344299316406, + "learning_rate": 5.776139860005852e-06, + "loss": 4.9942, + "step": 88605 + }, + { + "epoch": 1.8027750651041665, + "grad_norm": 26.075899124145508, + "learning_rate": 5.775745024696881e-06, + "loss": 4.8783, + "step": 88610 + }, + { + "epoch": 1.8028767903645835, + "grad_norm": 17.779579162597656, + "learning_rate": 5.775350184431145e-06, + "loss": 4.8957, + "step": 88615 + }, + { + "epoch": 1.802978515625, + "grad_norm": 18.310882568359375, + "learning_rate": 5.77495533921117e-06, + "loss": 5.1409, + "step": 88620 + }, + { + "epoch": 1.8030802408854165, + "grad_norm": 16.8492374420166, + "learning_rate": 5.774560489039475e-06, + "loss": 4.9737, + "step": 88625 + }, + { + "epoch": 1.8031819661458335, + "grad_norm": 18.293428421020508, + "learning_rate": 5.774165633918585e-06, + "loss": 4.8608, + "step": 88630 + }, + { + "epoch": 1.80328369140625, + "grad_norm": 15.641308784484863, + "learning_rate": 5.773770773851021e-06, + "loss": 5.1247, + "step": 88635 + }, + { + "epoch": 1.8033854166666665, + "grad_norm": 14.60080337524414, + "learning_rate": 5.773375908839308e-06, + "loss": 4.8612, + "step": 88640 + }, + { + "epoch": 1.8034871419270835, + "grad_norm": 20.49100112915039, + "learning_rate": 5.772981038885969e-06, + "loss": 4.6885, + "step": 88645 + }, + { + "epoch": 1.8035888671875, + "grad_norm": 15.508686065673828, + "learning_rate": 5.772586163993525e-06, + "loss": 4.7608, + "step": 88650 + }, + { + "epoch": 1.8036905924479165, + "grad_norm": 15.036402702331543, + "learning_rate": 5.772191284164504e-06, + "loss": 4.7763, + "step": 88655 + }, + { + "epoch": 1.8037923177083335, + "grad_norm": 19.589601516723633, + "learning_rate": 5.771796399401422e-06, + "loss": 4.7348, + "step": 88660 + }, + { + "epoch": 1.80389404296875, + "grad_norm": 14.147035598754883, + "learning_rate": 5.771401509706809e-06, + "loss": 5.0739, + "step": 88665 + }, + { + "epoch": 1.8039957682291665, + "grad_norm": 14.869315147399902, + "learning_rate": 5.771006615083183e-06, + "loss": 4.9888, + "step": 88670 + }, + { + "epoch": 1.8040974934895835, + "grad_norm": 18.373363494873047, + "learning_rate": 5.77061171553307e-06, + "loss": 4.9015, + "step": 88675 + }, + { + "epoch": 1.80419921875, + "grad_norm": 21.01815414428711, + "learning_rate": 5.770216811058994e-06, + "loss": 4.7458, + "step": 88680 + }, + { + "epoch": 1.8043009440104165, + "grad_norm": 16.52678108215332, + "learning_rate": 5.769821901663475e-06, + "loss": 4.6669, + "step": 88685 + }, + { + "epoch": 1.8044026692708335, + "grad_norm": 17.95452308654785, + "learning_rate": 5.76942698734904e-06, + "loss": 4.5855, + "step": 88690 + }, + { + "epoch": 1.80450439453125, + "grad_norm": 19.28365707397461, + "learning_rate": 5.769032068118211e-06, + "loss": 5.1656, + "step": 88695 + }, + { + "epoch": 1.8046061197916665, + "grad_norm": 18.12550926208496, + "learning_rate": 5.76863714397351e-06, + "loss": 4.7759, + "step": 88700 + }, + { + "epoch": 1.8047078450520835, + "grad_norm": 14.832032203674316, + "learning_rate": 5.768242214917463e-06, + "loss": 4.9932, + "step": 88705 + }, + { + "epoch": 1.8048095703125, + "grad_norm": 23.794872283935547, + "learning_rate": 5.767847280952591e-06, + "loss": 4.8702, + "step": 88710 + }, + { + "epoch": 1.8049112955729165, + "grad_norm": 20.021860122680664, + "learning_rate": 5.767452342081419e-06, + "loss": 5.1585, + "step": 88715 + }, + { + "epoch": 1.8050130208333335, + "grad_norm": 16.569828033447266, + "learning_rate": 5.76705739830647e-06, + "loss": 4.851, + "step": 88720 + }, + { + "epoch": 1.80511474609375, + "grad_norm": 14.957700729370117, + "learning_rate": 5.766662449630268e-06, + "loss": 4.968, + "step": 88725 + }, + { + "epoch": 1.8052164713541665, + "grad_norm": 15.31236743927002, + "learning_rate": 5.766267496055337e-06, + "loss": 4.6499, + "step": 88730 + }, + { + "epoch": 1.8053181966145835, + "grad_norm": 17.209041595458984, + "learning_rate": 5.7658725375841986e-06, + "loss": 4.8892, + "step": 88735 + }, + { + "epoch": 1.805419921875, + "grad_norm": 21.233726501464844, + "learning_rate": 5.7654775742193784e-06, + "loss": 4.884, + "step": 88740 + }, + { + "epoch": 1.8055216471354165, + "grad_norm": 13.799934387207031, + "learning_rate": 5.765082605963399e-06, + "loss": 4.9269, + "step": 88745 + }, + { + "epoch": 1.8056233723958335, + "grad_norm": 17.09919548034668, + "learning_rate": 5.764687632818785e-06, + "loss": 4.9824, + "step": 88750 + }, + { + "epoch": 1.80572509765625, + "grad_norm": 15.975651741027832, + "learning_rate": 5.76429265478806e-06, + "loss": 5.0002, + "step": 88755 + }, + { + "epoch": 1.8058268229166665, + "grad_norm": 19.91948890686035, + "learning_rate": 5.763897671873747e-06, + "loss": 4.8478, + "step": 88760 + }, + { + "epoch": 1.8059285481770835, + "grad_norm": 18.738515853881836, + "learning_rate": 5.76350268407837e-06, + "loss": 4.958, + "step": 88765 + }, + { + "epoch": 1.8060302734375, + "grad_norm": 14.358658790588379, + "learning_rate": 5.763107691404455e-06, + "loss": 4.9102, + "step": 88770 + }, + { + "epoch": 1.8061319986979165, + "grad_norm": 22.414390563964844, + "learning_rate": 5.762712693854522e-06, + "loss": 4.9663, + "step": 88775 + }, + { + "epoch": 1.8062337239583335, + "grad_norm": 77.80580139160156, + "learning_rate": 5.762317691431099e-06, + "loss": 4.8577, + "step": 88780 + }, + { + "epoch": 1.80633544921875, + "grad_norm": 17.27120018005371, + "learning_rate": 5.761922684136707e-06, + "loss": 4.9243, + "step": 88785 + }, + { + "epoch": 1.8064371744791665, + "grad_norm": 18.136743545532227, + "learning_rate": 5.761527671973871e-06, + "loss": 4.8275, + "step": 88790 + }, + { + "epoch": 1.8065388997395835, + "grad_norm": 17.836820602416992, + "learning_rate": 5.761132654945115e-06, + "loss": 4.7897, + "step": 88795 + }, + { + "epoch": 1.806640625, + "grad_norm": 18.07931137084961, + "learning_rate": 5.760737633052963e-06, + "loss": 4.7864, + "step": 88800 + }, + { + "epoch": 1.8067423502604165, + "grad_norm": 16.979381561279297, + "learning_rate": 5.760342606299938e-06, + "loss": 4.8833, + "step": 88805 + }, + { + "epoch": 1.8068440755208335, + "grad_norm": 20.358675003051758, + "learning_rate": 5.759947574688567e-06, + "loss": 4.7677, + "step": 88810 + }, + { + "epoch": 1.80694580078125, + "grad_norm": 15.929529190063477, + "learning_rate": 5.75955253822137e-06, + "loss": 4.6992, + "step": 88815 + }, + { + "epoch": 1.8070475260416665, + "grad_norm": 20.48660659790039, + "learning_rate": 5.759157496900875e-06, + "loss": 5.3426, + "step": 88820 + }, + { + "epoch": 1.8071492513020835, + "grad_norm": 18.11032485961914, + "learning_rate": 5.758762450729604e-06, + "loss": 5.0167, + "step": 88825 + }, + { + "epoch": 1.8072509765625, + "grad_norm": 15.032636642456055, + "learning_rate": 5.758367399710082e-06, + "loss": 4.9901, + "step": 88830 + }, + { + "epoch": 1.8073527018229165, + "grad_norm": 15.934508323669434, + "learning_rate": 5.757972343844834e-06, + "loss": 4.7782, + "step": 88835 + }, + { + "epoch": 1.8074544270833335, + "grad_norm": 14.460432052612305, + "learning_rate": 5.757577283136382e-06, + "loss": 4.8859, + "step": 88840 + }, + { + "epoch": 1.80755615234375, + "grad_norm": 19.856300354003906, + "learning_rate": 5.757182217587251e-06, + "loss": 5.0374, + "step": 88845 + }, + { + "epoch": 1.8076578776041665, + "grad_norm": 16.988605499267578, + "learning_rate": 5.756787147199968e-06, + "loss": 4.9325, + "step": 88850 + }, + { + "epoch": 1.8077596028645835, + "grad_norm": 15.001616477966309, + "learning_rate": 5.7563920719770536e-06, + "loss": 4.9849, + "step": 88855 + }, + { + "epoch": 1.807861328125, + "grad_norm": 16.570653915405273, + "learning_rate": 5.755996991921036e-06, + "loss": 4.7393, + "step": 88860 + }, + { + "epoch": 1.8079630533854165, + "grad_norm": 15.771570205688477, + "learning_rate": 5.755601907034436e-06, + "loss": 4.7134, + "step": 88865 + }, + { + "epoch": 1.8080647786458335, + "grad_norm": 18.413360595703125, + "learning_rate": 5.7552068173197795e-06, + "loss": 5.05, + "step": 88870 + }, + { + "epoch": 1.80816650390625, + "grad_norm": 18.578702926635742, + "learning_rate": 5.754811722779592e-06, + "loss": 4.7678, + "step": 88875 + }, + { + "epoch": 1.8082682291666665, + "grad_norm": 16.4433650970459, + "learning_rate": 5.7544166234163965e-06, + "loss": 5.0214, + "step": 88880 + }, + { + "epoch": 1.8083699544270835, + "grad_norm": 17.541288375854492, + "learning_rate": 5.754021519232719e-06, + "loss": 4.9107, + "step": 88885 + }, + { + "epoch": 1.8084716796875, + "grad_norm": 14.242688179016113, + "learning_rate": 5.753626410231083e-06, + "loss": 4.6673, + "step": 88890 + }, + { + "epoch": 1.8085734049479165, + "grad_norm": 17.389801025390625, + "learning_rate": 5.753231296414012e-06, + "loss": 5.0519, + "step": 88895 + }, + { + "epoch": 1.8086751302083335, + "grad_norm": 15.779364585876465, + "learning_rate": 5.7528361777840345e-06, + "loss": 4.7095, + "step": 88900 + }, + { + "epoch": 1.80877685546875, + "grad_norm": 17.438081741333008, + "learning_rate": 5.752441054343671e-06, + "loss": 4.7487, + "step": 88905 + }, + { + "epoch": 1.8088785807291665, + "grad_norm": 19.675535202026367, + "learning_rate": 5.75204592609545e-06, + "loss": 4.7012, + "step": 88910 + }, + { + "epoch": 1.8089803059895835, + "grad_norm": 17.76442527770996, + "learning_rate": 5.751650793041893e-06, + "loss": 5.0914, + "step": 88915 + }, + { + "epoch": 1.80908203125, + "grad_norm": 43.79153060913086, + "learning_rate": 5.751255655185526e-06, + "loss": 5.1836, + "step": 88920 + }, + { + "epoch": 1.8091837565104165, + "grad_norm": 20.395479202270508, + "learning_rate": 5.750860512528874e-06, + "loss": 4.7793, + "step": 88925 + }, + { + "epoch": 1.8092854817708335, + "grad_norm": 21.53154945373535, + "learning_rate": 5.750465365074462e-06, + "loss": 4.8938, + "step": 88930 + }, + { + "epoch": 1.80938720703125, + "grad_norm": 18.632776260375977, + "learning_rate": 5.750070212824815e-06, + "loss": 4.8493, + "step": 88935 + }, + { + "epoch": 1.8094889322916665, + "grad_norm": 32.99130630493164, + "learning_rate": 5.749675055782457e-06, + "loss": 4.8918, + "step": 88940 + }, + { + "epoch": 1.8095906575520835, + "grad_norm": 19.40833854675293, + "learning_rate": 5.749279893949912e-06, + "loss": 5.2087, + "step": 88945 + }, + { + "epoch": 1.8096923828125, + "grad_norm": 14.321781158447266, + "learning_rate": 5.748884727329708e-06, + "loss": 4.6655, + "step": 88950 + }, + { + "epoch": 1.8097941080729165, + "grad_norm": 18.70476722717285, + "learning_rate": 5.748489555924368e-06, + "loss": 4.882, + "step": 88955 + }, + { + "epoch": 1.8098958333333335, + "grad_norm": 25.34173583984375, + "learning_rate": 5.7480943797364165e-06, + "loss": 4.7697, + "step": 88960 + }, + { + "epoch": 1.80999755859375, + "grad_norm": 13.268380165100098, + "learning_rate": 5.747699198768381e-06, + "loss": 5.0285, + "step": 88965 + }, + { + "epoch": 1.8100992838541665, + "grad_norm": 17.722585678100586, + "learning_rate": 5.747304013022783e-06, + "loss": 5.0953, + "step": 88970 + }, + { + "epoch": 1.8102010091145835, + "grad_norm": 17.905498504638672, + "learning_rate": 5.746908822502151e-06, + "loss": 4.7257, + "step": 88975 + }, + { + "epoch": 1.810302734375, + "grad_norm": 17.81098175048828, + "learning_rate": 5.746513627209008e-06, + "loss": 4.88, + "step": 88980 + }, + { + "epoch": 1.8104044596354165, + "grad_norm": 16.632234573364258, + "learning_rate": 5.74611842714588e-06, + "loss": 5.1121, + "step": 88985 + }, + { + "epoch": 1.8105061848958335, + "grad_norm": 15.619433403015137, + "learning_rate": 5.745723222315292e-06, + "loss": 4.6544, + "step": 88990 + }, + { + "epoch": 1.81060791015625, + "grad_norm": 15.325960159301758, + "learning_rate": 5.74532801271977e-06, + "loss": 4.962, + "step": 88995 + }, + { + "epoch": 1.8107096354166665, + "grad_norm": 23.86271858215332, + "learning_rate": 5.7449327983618385e-06, + "loss": 5.0872, + "step": 89000 + }, + { + "epoch": 1.8108113606770835, + "grad_norm": 15.09633731842041, + "learning_rate": 5.7445375792440216e-06, + "loss": 4.7338, + "step": 89005 + }, + { + "epoch": 1.8109130859375, + "grad_norm": 18.150325775146484, + "learning_rate": 5.744142355368847e-06, + "loss": 4.9297, + "step": 89010 + }, + { + "epoch": 1.8110148111979165, + "grad_norm": 13.40632438659668, + "learning_rate": 5.743747126738839e-06, + "loss": 5.094, + "step": 89015 + }, + { + "epoch": 1.8111165364583335, + "grad_norm": 16.401039123535156, + "learning_rate": 5.743351893356523e-06, + "loss": 4.8396, + "step": 89020 + }, + { + "epoch": 1.81121826171875, + "grad_norm": 19.453269958496094, + "learning_rate": 5.742956655224423e-06, + "loss": 4.668, + "step": 89025 + }, + { + "epoch": 1.8113199869791665, + "grad_norm": 18.191272735595703, + "learning_rate": 5.742561412345068e-06, + "loss": 4.7788, + "step": 89030 + }, + { + "epoch": 1.8114217122395835, + "grad_norm": 18.61887550354004, + "learning_rate": 5.7421661647209795e-06, + "loss": 4.741, + "step": 89035 + }, + { + "epoch": 1.8115234375, + "grad_norm": 12.413783073425293, + "learning_rate": 5.741770912354686e-06, + "loss": 4.9375, + "step": 89040 + }, + { + "epoch": 1.8116251627604165, + "grad_norm": 13.55652141571045, + "learning_rate": 5.741375655248712e-06, + "loss": 5.1768, + "step": 89045 + }, + { + "epoch": 1.8117268880208335, + "grad_norm": 16.77082061767578, + "learning_rate": 5.7409803934055815e-06, + "loss": 5.2043, + "step": 89050 + }, + { + "epoch": 1.81182861328125, + "grad_norm": 17.736879348754883, + "learning_rate": 5.740585126827823e-06, + "loss": 4.9522, + "step": 89055 + }, + { + "epoch": 1.8119303385416665, + "grad_norm": 16.162307739257812, + "learning_rate": 5.740189855517959e-06, + "loss": 4.6361, + "step": 89060 + }, + { + "epoch": 1.8120320638020835, + "grad_norm": 18.86148452758789, + "learning_rate": 5.739794579478519e-06, + "loss": 4.9994, + "step": 89065 + }, + { + "epoch": 1.8121337890625, + "grad_norm": 26.67274284362793, + "learning_rate": 5.7393992987120255e-06, + "loss": 4.6751, + "step": 89070 + }, + { + "epoch": 1.8122355143229165, + "grad_norm": 20.11577606201172, + "learning_rate": 5.739004013221005e-06, + "loss": 4.9714, + "step": 89075 + }, + { + "epoch": 1.8123372395833335, + "grad_norm": 18.954273223876953, + "learning_rate": 5.738608723007983e-06, + "loss": 4.9413, + "step": 89080 + }, + { + "epoch": 1.81243896484375, + "grad_norm": 19.514192581176758, + "learning_rate": 5.7382134280754854e-06, + "loss": 5.0015, + "step": 89085 + }, + { + "epoch": 1.8125406901041665, + "grad_norm": 30.458465576171875, + "learning_rate": 5.737818128426041e-06, + "loss": 5.1033, + "step": 89090 + }, + { + "epoch": 1.8126424153645835, + "grad_norm": 31.440277099609375, + "learning_rate": 5.73742282406217e-06, + "loss": 5.3712, + "step": 89095 + }, + { + "epoch": 1.812744140625, + "grad_norm": 24.47775650024414, + "learning_rate": 5.737027514986402e-06, + "loss": 5.0112, + "step": 89100 + }, + { + "epoch": 1.8128458658854165, + "grad_norm": 18.042505264282227, + "learning_rate": 5.736632201201263e-06, + "loss": 4.9183, + "step": 89105 + }, + { + "epoch": 1.8129475911458335, + "grad_norm": 20.12124252319336, + "learning_rate": 5.736236882709277e-06, + "loss": 4.8766, + "step": 89110 + }, + { + "epoch": 1.81304931640625, + "grad_norm": 15.444498062133789, + "learning_rate": 5.7358415595129715e-06, + "loss": 4.799, + "step": 89115 + }, + { + "epoch": 1.8131510416666665, + "grad_norm": 18.780988693237305, + "learning_rate": 5.735446231614873e-06, + "loss": 4.6323, + "step": 89120 + }, + { + "epoch": 1.8132527669270835, + "grad_norm": 16.526615142822266, + "learning_rate": 5.735050899017505e-06, + "loss": 4.9283, + "step": 89125 + }, + { + "epoch": 1.8133544921875, + "grad_norm": 26.639848709106445, + "learning_rate": 5.734655561723395e-06, + "loss": 5.1058, + "step": 89130 + }, + { + "epoch": 1.8134562174479165, + "grad_norm": 23.849422454833984, + "learning_rate": 5.734260219735071e-06, + "loss": 4.7975, + "step": 89135 + }, + { + "epoch": 1.8135579427083335, + "grad_norm": 24.880041122436523, + "learning_rate": 5.7338648730550565e-06, + "loss": 4.7858, + "step": 89140 + }, + { + "epoch": 1.81365966796875, + "grad_norm": 16.98501205444336, + "learning_rate": 5.733469521685877e-06, + "loss": 4.8874, + "step": 89145 + }, + { + "epoch": 1.8137613932291665, + "grad_norm": 21.0853328704834, + "learning_rate": 5.733074165630062e-06, + "loss": 4.8423, + "step": 89150 + }, + { + "epoch": 1.8138631184895835, + "grad_norm": 18.0428409576416, + "learning_rate": 5.732678804890134e-06, + "loss": 4.9044, + "step": 89155 + }, + { + "epoch": 1.81396484375, + "grad_norm": 20.66382598876953, + "learning_rate": 5.7322834394686226e-06, + "loss": 4.9883, + "step": 89160 + }, + { + "epoch": 1.8140665690104165, + "grad_norm": 16.594188690185547, + "learning_rate": 5.73188806936805e-06, + "loss": 4.7907, + "step": 89165 + }, + { + "epoch": 1.8141682942708335, + "grad_norm": 13.636271476745605, + "learning_rate": 5.731492694590948e-06, + "loss": 5.0231, + "step": 89170 + }, + { + "epoch": 1.81427001953125, + "grad_norm": 15.960770606994629, + "learning_rate": 5.731097315139839e-06, + "loss": 4.9453, + "step": 89175 + }, + { + "epoch": 1.8143717447916665, + "grad_norm": 14.955391883850098, + "learning_rate": 5.730701931017249e-06, + "loss": 4.877, + "step": 89180 + }, + { + "epoch": 1.8144734700520835, + "grad_norm": 12.911561012268066, + "learning_rate": 5.730306542225707e-06, + "loss": 4.8559, + "step": 89185 + }, + { + "epoch": 1.8145751953125, + "grad_norm": 12.679840087890625, + "learning_rate": 5.729911148767737e-06, + "loss": 5.0443, + "step": 89190 + }, + { + "epoch": 1.8146769205729165, + "grad_norm": 21.27196502685547, + "learning_rate": 5.729515750645867e-06, + "loss": 4.7822, + "step": 89195 + }, + { + "epoch": 1.8147786458333335, + "grad_norm": 17.008695602416992, + "learning_rate": 5.729120347862624e-06, + "loss": 5.0084, + "step": 89200 + }, + { + "epoch": 1.81488037109375, + "grad_norm": 13.559440612792969, + "learning_rate": 5.7287249404205325e-06, + "loss": 5.0489, + "step": 89205 + }, + { + "epoch": 1.8149820963541665, + "grad_norm": 18.92639923095703, + "learning_rate": 5.728329528322119e-06, + "loss": 4.8374, + "step": 89210 + }, + { + "epoch": 1.8150838216145835, + "grad_norm": 31.676416397094727, + "learning_rate": 5.727934111569913e-06, + "loss": 4.8298, + "step": 89215 + }, + { + "epoch": 1.815185546875, + "grad_norm": 25.643089294433594, + "learning_rate": 5.727538690166438e-06, + "loss": 4.7831, + "step": 89220 + }, + { + "epoch": 1.8152872721354165, + "grad_norm": 17.649457931518555, + "learning_rate": 5.727143264114222e-06, + "loss": 4.7665, + "step": 89225 + }, + { + "epoch": 1.8153889973958335, + "grad_norm": 17.652984619140625, + "learning_rate": 5.72674783341579e-06, + "loss": 4.8107, + "step": 89230 + }, + { + "epoch": 1.81549072265625, + "grad_norm": 23.006067276000977, + "learning_rate": 5.726352398073671e-06, + "loss": 4.8322, + "step": 89235 + }, + { + "epoch": 1.8155924479166665, + "grad_norm": 16.93791389465332, + "learning_rate": 5.725956958090392e-06, + "loss": 4.8727, + "step": 89240 + }, + { + "epoch": 1.8156941731770835, + "grad_norm": 23.052997589111328, + "learning_rate": 5.725561513468477e-06, + "loss": 4.6416, + "step": 89245 + }, + { + "epoch": 1.8157958984375, + "grad_norm": 16.530702590942383, + "learning_rate": 5.725166064210454e-06, + "loss": 5.1082, + "step": 89250 + }, + { + "epoch": 1.8158976236979165, + "grad_norm": 19.518779754638672, + "learning_rate": 5.724770610318851e-06, + "loss": 4.7639, + "step": 89255 + }, + { + "epoch": 1.8159993489583335, + "grad_norm": 21.578176498413086, + "learning_rate": 5.724375151796194e-06, + "loss": 4.9846, + "step": 89260 + }, + { + "epoch": 1.81610107421875, + "grad_norm": 13.26927375793457, + "learning_rate": 5.723979688645009e-06, + "loss": 4.7012, + "step": 89265 + }, + { + "epoch": 1.8162027994791665, + "grad_norm": 18.190311431884766, + "learning_rate": 5.723584220867824e-06, + "loss": 4.7894, + "step": 89270 + }, + { + "epoch": 1.8163045247395835, + "grad_norm": 17.06834602355957, + "learning_rate": 5.723188748467166e-06, + "loss": 4.9051, + "step": 89275 + }, + { + "epoch": 1.81640625, + "grad_norm": 18.463943481445312, + "learning_rate": 5.72279327144556e-06, + "loss": 4.6707, + "step": 89280 + }, + { + "epoch": 1.8165079752604165, + "grad_norm": 13.4570894241333, + "learning_rate": 5.722397789805535e-06, + "loss": 4.7613, + "step": 89285 + }, + { + "epoch": 1.8166097005208335, + "grad_norm": 14.457996368408203, + "learning_rate": 5.7220023035496185e-06, + "loss": 4.7595, + "step": 89290 + }, + { + "epoch": 1.81671142578125, + "grad_norm": 20.26447868347168, + "learning_rate": 5.7216068126803345e-06, + "loss": 4.6242, + "step": 89295 + }, + { + "epoch": 1.8168131510416665, + "grad_norm": 16.647232055664062, + "learning_rate": 5.721211317200214e-06, + "loss": 4.9936, + "step": 89300 + }, + { + "epoch": 1.8169148763020835, + "grad_norm": 16.39545249938965, + "learning_rate": 5.720815817111781e-06, + "loss": 5.0471, + "step": 89305 + }, + { + "epoch": 1.8170166015625, + "grad_norm": 19.52714729309082, + "learning_rate": 5.720420312417564e-06, + "loss": 4.9169, + "step": 89310 + }, + { + "epoch": 1.8171183268229165, + "grad_norm": 21.33568572998047, + "learning_rate": 5.720024803120091e-06, + "loss": 5.0106, + "step": 89315 + }, + { + "epoch": 1.8172200520833335, + "grad_norm": 16.464584350585938, + "learning_rate": 5.719629289221886e-06, + "loss": 4.9879, + "step": 89320 + }, + { + "epoch": 1.81732177734375, + "grad_norm": 16.247438430786133, + "learning_rate": 5.71923377072548e-06, + "loss": 4.7583, + "step": 89325 + }, + { + "epoch": 1.8174235026041665, + "grad_norm": 15.778167724609375, + "learning_rate": 5.718838247633398e-06, + "loss": 4.8313, + "step": 89330 + }, + { + "epoch": 1.8175252278645835, + "grad_norm": 24.160579681396484, + "learning_rate": 5.718442719948168e-06, + "loss": 4.8266, + "step": 89335 + }, + { + "epoch": 1.817626953125, + "grad_norm": 18.439937591552734, + "learning_rate": 5.718047187672317e-06, + "loss": 5.2198, + "step": 89340 + }, + { + "epoch": 1.8177286783854165, + "grad_norm": 23.04435920715332, + "learning_rate": 5.717651650808371e-06, + "loss": 4.9123, + "step": 89345 + }, + { + "epoch": 1.8178304036458335, + "grad_norm": 22.427398681640625, + "learning_rate": 5.717256109358861e-06, + "loss": 4.7941, + "step": 89350 + }, + { + "epoch": 1.81793212890625, + "grad_norm": 17.316619873046875, + "learning_rate": 5.716860563326312e-06, + "loss": 4.8965, + "step": 89355 + }, + { + "epoch": 1.8180338541666665, + "grad_norm": 18.198505401611328, + "learning_rate": 5.716465012713249e-06, + "loss": 4.8346, + "step": 89360 + }, + { + "epoch": 1.8181355794270835, + "grad_norm": 23.77219009399414, + "learning_rate": 5.716069457522204e-06, + "loss": 4.8067, + "step": 89365 + }, + { + "epoch": 1.8182373046875, + "grad_norm": 16.941604614257812, + "learning_rate": 5.715673897755702e-06, + "loss": 5.2026, + "step": 89370 + }, + { + "epoch": 1.8183390299479165, + "grad_norm": 14.697875022888184, + "learning_rate": 5.715278333416272e-06, + "loss": 4.8448, + "step": 89375 + }, + { + "epoch": 1.8184407552083335, + "grad_norm": 16.41830825805664, + "learning_rate": 5.71488276450644e-06, + "loss": 4.8381, + "step": 89380 + }, + { + "epoch": 1.81854248046875, + "grad_norm": 17.319730758666992, + "learning_rate": 5.714487191028732e-06, + "loss": 4.7579, + "step": 89385 + }, + { + "epoch": 1.8186442057291665, + "grad_norm": 18.161497116088867, + "learning_rate": 5.71409161298568e-06, + "loss": 4.8979, + "step": 89390 + }, + { + "epoch": 1.8187459309895835, + "grad_norm": 18.23771858215332, + "learning_rate": 5.71369603037981e-06, + "loss": 4.9133, + "step": 89395 + }, + { + "epoch": 1.81884765625, + "grad_norm": 19.211650848388672, + "learning_rate": 5.713300443213647e-06, + "loss": 4.9319, + "step": 89400 + }, + { + "epoch": 1.8189493815104165, + "grad_norm": 30.361377716064453, + "learning_rate": 5.712904851489721e-06, + "loss": 5.0107, + "step": 89405 + }, + { + "epoch": 1.8190511067708335, + "grad_norm": 19.496706008911133, + "learning_rate": 5.712509255210559e-06, + "loss": 4.9773, + "step": 89410 + }, + { + "epoch": 1.81915283203125, + "grad_norm": 15.464609146118164, + "learning_rate": 5.71211365437869e-06, + "loss": 4.9941, + "step": 89415 + }, + { + "epoch": 1.8192545572916665, + "grad_norm": 18.841585159301758, + "learning_rate": 5.711718048996639e-06, + "loss": 4.7653, + "step": 89420 + }, + { + "epoch": 1.8193562825520835, + "grad_norm": 15.403261184692383, + "learning_rate": 5.711322439066937e-06, + "loss": 5.0835, + "step": 89425 + }, + { + "epoch": 1.8194580078125, + "grad_norm": 18.7391300201416, + "learning_rate": 5.710926824592111e-06, + "loss": 4.8213, + "step": 89430 + }, + { + "epoch": 1.8195597330729165, + "grad_norm": 17.17861557006836, + "learning_rate": 5.7105312055746875e-06, + "loss": 4.945, + "step": 89435 + }, + { + "epoch": 1.8196614583333335, + "grad_norm": 20.752500534057617, + "learning_rate": 5.710135582017196e-06, + "loss": 5.0557, + "step": 89440 + }, + { + "epoch": 1.81976318359375, + "grad_norm": 25.08605194091797, + "learning_rate": 5.709739953922162e-06, + "loss": 4.6423, + "step": 89445 + }, + { + "epoch": 1.8198649088541665, + "grad_norm": 40.82236099243164, + "learning_rate": 5.709344321292116e-06, + "loss": 4.9725, + "step": 89450 + }, + { + "epoch": 1.8199666341145835, + "grad_norm": 14.338937759399414, + "learning_rate": 5.708948684129585e-06, + "loss": 5.0886, + "step": 89455 + }, + { + "epoch": 1.820068359375, + "grad_norm": 22.8561954498291, + "learning_rate": 5.708553042437097e-06, + "loss": 5.1187, + "step": 89460 + }, + { + "epoch": 1.8201700846354165, + "grad_norm": 20.988510131835938, + "learning_rate": 5.708157396217179e-06, + "loss": 4.9362, + "step": 89465 + }, + { + "epoch": 1.8202718098958335, + "grad_norm": 18.71856117248535, + "learning_rate": 5.707761745472361e-06, + "loss": 4.8676, + "step": 89470 + }, + { + "epoch": 1.82037353515625, + "grad_norm": 16.207801818847656, + "learning_rate": 5.707366090205169e-06, + "loss": 4.902, + "step": 89475 + }, + { + "epoch": 1.8204752604166665, + "grad_norm": 20.425683975219727, + "learning_rate": 5.706970430418134e-06, + "loss": 4.8645, + "step": 89480 + }, + { + "epoch": 1.8205769856770835, + "grad_norm": 17.879634857177734, + "learning_rate": 5.706574766113782e-06, + "loss": 4.9212, + "step": 89485 + }, + { + "epoch": 1.8206787109375, + "grad_norm": 13.776819229125977, + "learning_rate": 5.706179097294641e-06, + "loss": 4.9426, + "step": 89490 + }, + { + "epoch": 1.8207804361979165, + "grad_norm": 14.367216110229492, + "learning_rate": 5.70578342396324e-06, + "loss": 5.0324, + "step": 89495 + }, + { + "epoch": 1.8208821614583335, + "grad_norm": 18.93411636352539, + "learning_rate": 5.7053877461221064e-06, + "loss": 4.7915, + "step": 89500 + }, + { + "epoch": 1.82098388671875, + "grad_norm": 17.186325073242188, + "learning_rate": 5.704992063773769e-06, + "loss": 4.8115, + "step": 89505 + }, + { + "epoch": 1.8210856119791665, + "grad_norm": 17.794240951538086, + "learning_rate": 5.704596376920757e-06, + "loss": 4.8164, + "step": 89510 + }, + { + "epoch": 1.8211873372395835, + "grad_norm": 20.337360382080078, + "learning_rate": 5.704200685565597e-06, + "loss": 4.846, + "step": 89515 + }, + { + "epoch": 1.8212890625, + "grad_norm": 19.220701217651367, + "learning_rate": 5.703804989710818e-06, + "loss": 4.7571, + "step": 89520 + }, + { + "epoch": 1.8213907877604165, + "grad_norm": 15.331913948059082, + "learning_rate": 5.7034092893589484e-06, + "loss": 5.0459, + "step": 89525 + }, + { + "epoch": 1.8214925130208335, + "grad_norm": 16.625688552856445, + "learning_rate": 5.703013584512518e-06, + "loss": 4.6223, + "step": 89530 + }, + { + "epoch": 1.82159423828125, + "grad_norm": 24.88091278076172, + "learning_rate": 5.702617875174054e-06, + "loss": 4.9624, + "step": 89535 + }, + { + "epoch": 1.8216959635416665, + "grad_norm": 18.042490005493164, + "learning_rate": 5.702222161346084e-06, + "loss": 4.86, + "step": 89540 + }, + { + "epoch": 1.8217976888020835, + "grad_norm": 20.82582664489746, + "learning_rate": 5.7018264430311374e-06, + "loss": 4.8312, + "step": 89545 + }, + { + "epoch": 1.8218994140625, + "grad_norm": 15.771503448486328, + "learning_rate": 5.701430720231742e-06, + "loss": 4.7828, + "step": 89550 + }, + { + "epoch": 1.8220011393229165, + "grad_norm": 16.20650863647461, + "learning_rate": 5.701034992950427e-06, + "loss": 4.992, + "step": 89555 + }, + { + "epoch": 1.8221028645833335, + "grad_norm": 13.601994514465332, + "learning_rate": 5.700639261189722e-06, + "loss": 4.8922, + "step": 89560 + }, + { + "epoch": 1.82220458984375, + "grad_norm": 20.766399383544922, + "learning_rate": 5.700243524952154e-06, + "loss": 4.7835, + "step": 89565 + }, + { + "epoch": 1.8223063151041665, + "grad_norm": 17.66379737854004, + "learning_rate": 5.699847784240252e-06, + "loss": 4.6907, + "step": 89570 + }, + { + "epoch": 1.8224080403645835, + "grad_norm": 18.273284912109375, + "learning_rate": 5.699452039056546e-06, + "loss": 4.9694, + "step": 89575 + }, + { + "epoch": 1.822509765625, + "grad_norm": 21.81241226196289, + "learning_rate": 5.699056289403562e-06, + "loss": 4.7503, + "step": 89580 + }, + { + "epoch": 1.8226114908854165, + "grad_norm": 21.16026496887207, + "learning_rate": 5.698660535283831e-06, + "loss": 4.9974, + "step": 89585 + }, + { + "epoch": 1.8227132161458335, + "grad_norm": 18.057294845581055, + "learning_rate": 5.69826477669988e-06, + "loss": 4.8106, + "step": 89590 + }, + { + "epoch": 1.82281494140625, + "grad_norm": 17.020124435424805, + "learning_rate": 5.697869013654239e-06, + "loss": 4.6912, + "step": 89595 + }, + { + "epoch": 1.8229166666666665, + "grad_norm": 17.5379695892334, + "learning_rate": 5.6974732461494384e-06, + "loss": 4.8322, + "step": 89600 + }, + { + "epoch": 1.8230183919270835, + "grad_norm": 20.10344696044922, + "learning_rate": 5.697077474188003e-06, + "loss": 4.7734, + "step": 89605 + }, + { + "epoch": 1.8231201171875, + "grad_norm": 22.665346145629883, + "learning_rate": 5.696681697772464e-06, + "loss": 4.9928, + "step": 89610 + }, + { + "epoch": 1.8232218424479165, + "grad_norm": 19.165002822875977, + "learning_rate": 5.696285916905352e-06, + "loss": 5.0605, + "step": 89615 + }, + { + "epoch": 1.8233235677083335, + "grad_norm": 17.607507705688477, + "learning_rate": 5.6958901315891926e-06, + "loss": 5.0466, + "step": 89620 + }, + { + "epoch": 1.82342529296875, + "grad_norm": 16.270185470581055, + "learning_rate": 5.695494341826517e-06, + "loss": 5.1431, + "step": 89625 + }, + { + "epoch": 1.8235270182291665, + "grad_norm": 15.239884376525879, + "learning_rate": 5.695098547619851e-06, + "loss": 4.919, + "step": 89630 + }, + { + "epoch": 1.8236287434895835, + "grad_norm": 21.720914840698242, + "learning_rate": 5.694702748971727e-06, + "loss": 4.9402, + "step": 89635 + }, + { + "epoch": 1.82373046875, + "grad_norm": 13.009247779846191, + "learning_rate": 5.694306945884676e-06, + "loss": 5.0539, + "step": 89640 + }, + { + "epoch": 1.8238321940104165, + "grad_norm": 15.103789329528809, + "learning_rate": 5.69391113836122e-06, + "loss": 5.059, + "step": 89645 + }, + { + "epoch": 1.8239339192708335, + "grad_norm": 16.784730911254883, + "learning_rate": 5.693515326403894e-06, + "loss": 4.9427, + "step": 89650 + }, + { + "epoch": 1.82403564453125, + "grad_norm": 16.836694717407227, + "learning_rate": 5.693119510015223e-06, + "loss": 5.1351, + "step": 89655 + }, + { + "epoch": 1.8241373697916665, + "grad_norm": 29.729663848876953, + "learning_rate": 5.692723689197742e-06, + "loss": 5.1262, + "step": 89660 + }, + { + "epoch": 1.8242390950520835, + "grad_norm": 19.392831802368164, + "learning_rate": 5.692327863953975e-06, + "loss": 4.7544, + "step": 89665 + }, + { + "epoch": 1.8243408203125, + "grad_norm": 21.010021209716797, + "learning_rate": 5.691932034286452e-06, + "loss": 4.8303, + "step": 89670 + }, + { + "epoch": 1.8244425455729165, + "grad_norm": 15.550243377685547, + "learning_rate": 5.691536200197704e-06, + "loss": 5.0178, + "step": 89675 + }, + { + "epoch": 1.8245442708333335, + "grad_norm": 22.927589416503906, + "learning_rate": 5.691140361690257e-06, + "loss": 4.7162, + "step": 89680 + }, + { + "epoch": 1.82464599609375, + "grad_norm": 15.834770202636719, + "learning_rate": 5.690744518766646e-06, + "loss": 4.8763, + "step": 89685 + }, + { + "epoch": 1.8247477213541665, + "grad_norm": 18.781309127807617, + "learning_rate": 5.690348671429393e-06, + "loss": 4.6078, + "step": 89690 + }, + { + "epoch": 1.8248494466145835, + "grad_norm": 16.30809783935547, + "learning_rate": 5.689952819681033e-06, + "loss": 4.8904, + "step": 89695 + }, + { + "epoch": 1.824951171875, + "grad_norm": 13.675626754760742, + "learning_rate": 5.689556963524094e-06, + "loss": 5.0559, + "step": 89700 + }, + { + "epoch": 1.8250528971354165, + "grad_norm": 14.811482429504395, + "learning_rate": 5.689161102961105e-06, + "loss": 4.6343, + "step": 89705 + }, + { + "epoch": 1.8251546223958335, + "grad_norm": 15.500221252441406, + "learning_rate": 5.688765237994593e-06, + "loss": 5.0525, + "step": 89710 + }, + { + "epoch": 1.82525634765625, + "grad_norm": 15.734070777893066, + "learning_rate": 5.688369368627093e-06, + "loss": 4.784, + "step": 89715 + }, + { + "epoch": 1.8253580729166665, + "grad_norm": 18.576833724975586, + "learning_rate": 5.687973494861129e-06, + "loss": 4.884, + "step": 89720 + }, + { + "epoch": 1.8254597981770835, + "grad_norm": 16.986665725708008, + "learning_rate": 5.687577616699234e-06, + "loss": 4.6966, + "step": 89725 + }, + { + "epoch": 1.8255615234375, + "grad_norm": 17.467803955078125, + "learning_rate": 5.687181734143936e-06, + "loss": 4.8621, + "step": 89730 + }, + { + "epoch": 1.8256632486979165, + "grad_norm": 19.42022132873535, + "learning_rate": 5.686785847197765e-06, + "loss": 4.8982, + "step": 89735 + }, + { + "epoch": 1.8257649739583335, + "grad_norm": 14.89742374420166, + "learning_rate": 5.68638995586325e-06, + "loss": 4.7494, + "step": 89740 + }, + { + "epoch": 1.82586669921875, + "grad_norm": 12.403730392456055, + "learning_rate": 5.685994060142921e-06, + "loss": 4.7432, + "step": 89745 + }, + { + "epoch": 1.8259684244791665, + "grad_norm": 19.58540916442871, + "learning_rate": 5.685598160039307e-06, + "loss": 4.8783, + "step": 89750 + }, + { + "epoch": 1.8260701497395835, + "grad_norm": 23.522319793701172, + "learning_rate": 5.68520225555494e-06, + "loss": 5.2437, + "step": 89755 + }, + { + "epoch": 1.826171875, + "grad_norm": 13.970019340515137, + "learning_rate": 5.684806346692348e-06, + "loss": 5.0504, + "step": 89760 + }, + { + "epoch": 1.8262736002604165, + "grad_norm": 22.535564422607422, + "learning_rate": 5.68441043345406e-06, + "loss": 5.0693, + "step": 89765 + }, + { + "epoch": 1.8263753255208335, + "grad_norm": 16.749500274658203, + "learning_rate": 5.6840145158426076e-06, + "loss": 4.7336, + "step": 89770 + }, + { + "epoch": 1.82647705078125, + "grad_norm": 16.62549591064453, + "learning_rate": 5.683618593860518e-06, + "loss": 4.8175, + "step": 89775 + }, + { + "epoch": 1.8265787760416665, + "grad_norm": 20.572946548461914, + "learning_rate": 5.683222667510324e-06, + "loss": 4.9905, + "step": 89780 + }, + { + "epoch": 1.8266805013020835, + "grad_norm": 18.608678817749023, + "learning_rate": 5.682826736794552e-06, + "loss": 4.8325, + "step": 89785 + }, + { + "epoch": 1.8267822265625, + "grad_norm": 16.650957107543945, + "learning_rate": 5.682430801715736e-06, + "loss": 4.7551, + "step": 89790 + }, + { + "epoch": 1.8268839518229165, + "grad_norm": 21.829959869384766, + "learning_rate": 5.682034862276404e-06, + "loss": 5.0607, + "step": 89795 + }, + { + "epoch": 1.8269856770833335, + "grad_norm": 19.380203247070312, + "learning_rate": 5.681638918479084e-06, + "loss": 4.8082, + "step": 89800 + }, + { + "epoch": 1.82708740234375, + "grad_norm": 13.97436809539795, + "learning_rate": 5.68124297032631e-06, + "loss": 4.9045, + "step": 89805 + }, + { + "epoch": 1.8271891276041665, + "grad_norm": 23.416112899780273, + "learning_rate": 5.680847017820607e-06, + "loss": 4.9395, + "step": 89810 + }, + { + "epoch": 1.8272908528645835, + "grad_norm": 14.856220245361328, + "learning_rate": 5.680451060964509e-06, + "loss": 4.8039, + "step": 89815 + }, + { + "epoch": 1.827392578125, + "grad_norm": 13.917762756347656, + "learning_rate": 5.680055099760544e-06, + "loss": 4.6805, + "step": 89820 + }, + { + "epoch": 1.8274943033854165, + "grad_norm": 18.314464569091797, + "learning_rate": 5.679659134211244e-06, + "loss": 4.898, + "step": 89825 + }, + { + "epoch": 1.8275960286458335, + "grad_norm": 19.0725154876709, + "learning_rate": 5.679263164319137e-06, + "loss": 5.0697, + "step": 89830 + }, + { + "epoch": 1.82769775390625, + "grad_norm": 20.556730270385742, + "learning_rate": 5.6788671900867545e-06, + "loss": 4.8772, + "step": 89835 + }, + { + "epoch": 1.8277994791666665, + "grad_norm": 18.402179718017578, + "learning_rate": 5.6784712115166255e-06, + "loss": 4.763, + "step": 89840 + }, + { + "epoch": 1.8279012044270835, + "grad_norm": 18.983430862426758, + "learning_rate": 5.678075228611282e-06, + "loss": 5.0892, + "step": 89845 + }, + { + "epoch": 1.8280029296875, + "grad_norm": 17.619604110717773, + "learning_rate": 5.677679241373252e-06, + "loss": 4.6306, + "step": 89850 + }, + { + "epoch": 1.8281046549479165, + "grad_norm": 21.16720962524414, + "learning_rate": 5.677283249805068e-06, + "loss": 4.8501, + "step": 89855 + }, + { + "epoch": 1.8282063802083335, + "grad_norm": 14.793646812438965, + "learning_rate": 5.676887253909258e-06, + "loss": 4.807, + "step": 89860 + }, + { + "epoch": 1.82830810546875, + "grad_norm": 16.887189865112305, + "learning_rate": 5.676491253688353e-06, + "loss": 4.8821, + "step": 89865 + }, + { + "epoch": 1.8284098307291665, + "grad_norm": 23.765457153320312, + "learning_rate": 5.676095249144885e-06, + "loss": 5.1029, + "step": 89870 + }, + { + "epoch": 1.8285115559895835, + "grad_norm": 18.970748901367188, + "learning_rate": 5.675699240281381e-06, + "loss": 4.6719, + "step": 89875 + }, + { + "epoch": 1.82861328125, + "grad_norm": 13.684013366699219, + "learning_rate": 5.675303227100375e-06, + "loss": 4.8639, + "step": 89880 + }, + { + "epoch": 1.8287150065104165, + "grad_norm": 18.703433990478516, + "learning_rate": 5.674907209604395e-06, + "loss": 5.0636, + "step": 89885 + }, + { + "epoch": 1.8288167317708335, + "grad_norm": 20.751094818115234, + "learning_rate": 5.674511187795973e-06, + "loss": 4.9846, + "step": 89890 + }, + { + "epoch": 1.82891845703125, + "grad_norm": 16.555383682250977, + "learning_rate": 5.674115161677638e-06, + "loss": 5.1351, + "step": 89895 + }, + { + "epoch": 1.8290201822916665, + "grad_norm": 22.885574340820312, + "learning_rate": 5.673719131251922e-06, + "loss": 4.8797, + "step": 89900 + }, + { + "epoch": 1.8291219075520835, + "grad_norm": 17.87958526611328, + "learning_rate": 5.6733230965213525e-06, + "loss": 4.8544, + "step": 89905 + }, + { + "epoch": 1.8292236328125, + "grad_norm": 16.82744598388672, + "learning_rate": 5.672927057488464e-06, + "loss": 4.8458, + "step": 89910 + }, + { + "epoch": 1.8293253580729165, + "grad_norm": 18.967723846435547, + "learning_rate": 5.672531014155784e-06, + "loss": 4.8007, + "step": 89915 + }, + { + "epoch": 1.8294270833333335, + "grad_norm": 14.218534469604492, + "learning_rate": 5.672134966525845e-06, + "loss": 4.7247, + "step": 89920 + }, + { + "epoch": 1.82952880859375, + "grad_norm": 20.332399368286133, + "learning_rate": 5.671738914601178e-06, + "loss": 4.9363, + "step": 89925 + }, + { + "epoch": 1.8296305338541665, + "grad_norm": 17.774452209472656, + "learning_rate": 5.671342858384311e-06, + "loss": 4.8994, + "step": 89930 + }, + { + "epoch": 1.8297322591145835, + "grad_norm": 18.13800048828125, + "learning_rate": 5.6709467978777765e-06, + "loss": 4.7952, + "step": 89935 + }, + { + "epoch": 1.829833984375, + "grad_norm": 21.10915756225586, + "learning_rate": 5.670550733084105e-06, + "loss": 4.8415, + "step": 89940 + }, + { + "epoch": 1.8299357096354165, + "grad_norm": 21.698511123657227, + "learning_rate": 5.670154664005828e-06, + "loss": 4.696, + "step": 89945 + }, + { + "epoch": 1.8300374348958335, + "grad_norm": 20.176830291748047, + "learning_rate": 5.669758590645473e-06, + "loss": 4.7996, + "step": 89950 + }, + { + "epoch": 1.83013916015625, + "grad_norm": 18.985536575317383, + "learning_rate": 5.669362513005576e-06, + "loss": 4.8097, + "step": 89955 + }, + { + "epoch": 1.8302408854166665, + "grad_norm": 18.877744674682617, + "learning_rate": 5.668966431088664e-06, + "loss": 4.973, + "step": 89960 + }, + { + "epoch": 1.8303426106770835, + "grad_norm": 17.553142547607422, + "learning_rate": 5.668570344897268e-06, + "loss": 4.8731, + "step": 89965 + }, + { + "epoch": 1.8304443359375, + "grad_norm": 22.10095977783203, + "learning_rate": 5.668174254433922e-06, + "loss": 4.9705, + "step": 89970 + }, + { + "epoch": 1.8305460611979165, + "grad_norm": 18.987831115722656, + "learning_rate": 5.667778159701153e-06, + "loss": 4.8937, + "step": 89975 + }, + { + "epoch": 1.8306477864583335, + "grad_norm": 19.45558738708496, + "learning_rate": 5.667382060701493e-06, + "loss": 4.9023, + "step": 89980 + }, + { + "epoch": 1.83074951171875, + "grad_norm": 14.682842254638672, + "learning_rate": 5.666985957437474e-06, + "loss": 4.9403, + "step": 89985 + }, + { + "epoch": 1.8308512369791665, + "grad_norm": 21.853580474853516, + "learning_rate": 5.666589849911628e-06, + "loss": 4.7222, + "step": 89990 + }, + { + "epoch": 1.8309529622395835, + "grad_norm": 13.546091079711914, + "learning_rate": 5.666193738126482e-06, + "loss": 4.8986, + "step": 89995 + }, + { + "epoch": 1.8310546875, + "grad_norm": 18.714696884155273, + "learning_rate": 5.665797622084571e-06, + "loss": 4.8041, + "step": 90000 + }, + { + "epoch": 1.8311564127604165, + "grad_norm": 16.423208236694336, + "learning_rate": 5.665401501788424e-06, + "loss": 5.0053, + "step": 90005 + }, + { + "epoch": 1.8312581380208335, + "grad_norm": 17.390499114990234, + "learning_rate": 5.665005377240573e-06, + "loss": 4.961, + "step": 90010 + }, + { + "epoch": 1.83135986328125, + "grad_norm": 14.319787979125977, + "learning_rate": 5.664609248443549e-06, + "loss": 5.0409, + "step": 90015 + }, + { + "epoch": 1.8314615885416665, + "grad_norm": 18.821548461914062, + "learning_rate": 5.664213115399881e-06, + "loss": 4.789, + "step": 90020 + }, + { + "epoch": 1.8315633138020835, + "grad_norm": 18.504009246826172, + "learning_rate": 5.663816978112104e-06, + "loss": 4.5416, + "step": 90025 + }, + { + "epoch": 1.8316650390625, + "grad_norm": 16.66693878173828, + "learning_rate": 5.663420836582747e-06, + "loss": 4.8024, + "step": 90030 + }, + { + "epoch": 1.8317667643229165, + "grad_norm": 17.089826583862305, + "learning_rate": 5.663024690814342e-06, + "loss": 5.0014, + "step": 90035 + }, + { + "epoch": 1.8318684895833335, + "grad_norm": 23.67496681213379, + "learning_rate": 5.662628540809419e-06, + "loss": 4.9928, + "step": 90040 + }, + { + "epoch": 1.83197021484375, + "grad_norm": 13.944512367248535, + "learning_rate": 5.66223238657051e-06, + "loss": 4.763, + "step": 90045 + }, + { + "epoch": 1.8320719401041665, + "grad_norm": 15.364465713500977, + "learning_rate": 5.661836228100146e-06, + "loss": 4.8566, + "step": 90050 + }, + { + "epoch": 1.8321736653645835, + "grad_norm": 19.583860397338867, + "learning_rate": 5.66144006540086e-06, + "loss": 4.861, + "step": 90055 + }, + { + "epoch": 1.832275390625, + "grad_norm": 20.619752883911133, + "learning_rate": 5.661043898475179e-06, + "loss": 4.6658, + "step": 90060 + }, + { + "epoch": 1.8323771158854165, + "grad_norm": 13.753101348876953, + "learning_rate": 5.66064772732564e-06, + "loss": 4.9136, + "step": 90065 + }, + { + "epoch": 1.8324788411458335, + "grad_norm": 18.855737686157227, + "learning_rate": 5.6602515519547696e-06, + "loss": 4.9444, + "step": 90070 + }, + { + "epoch": 1.83258056640625, + "grad_norm": 20.6806697845459, + "learning_rate": 5.659855372365103e-06, + "loss": 4.7949, + "step": 90075 + }, + { + "epoch": 1.8326822916666665, + "grad_norm": 23.453245162963867, + "learning_rate": 5.659459188559169e-06, + "loss": 5.028, + "step": 90080 + }, + { + "epoch": 1.8327840169270835, + "grad_norm": 18.452425003051758, + "learning_rate": 5.659063000539501e-06, + "loss": 4.8649, + "step": 90085 + }, + { + "epoch": 1.8328857421875, + "grad_norm": 15.607820510864258, + "learning_rate": 5.658666808308628e-06, + "loss": 4.7868, + "step": 90090 + }, + { + "epoch": 1.8329874674479165, + "grad_norm": 13.494847297668457, + "learning_rate": 5.658270611869085e-06, + "loss": 4.8176, + "step": 90095 + }, + { + "epoch": 1.8330891927083335, + "grad_norm": 17.91583251953125, + "learning_rate": 5.657874411223401e-06, + "loss": 5.0303, + "step": 90100 + }, + { + "epoch": 1.83319091796875, + "grad_norm": 16.10259246826172, + "learning_rate": 5.6574782063741094e-06, + "loss": 4.9966, + "step": 90105 + }, + { + "epoch": 1.8332926432291665, + "grad_norm": 17.768165588378906, + "learning_rate": 5.657081997323739e-06, + "loss": 5.0713, + "step": 90110 + }, + { + "epoch": 1.8333943684895835, + "grad_norm": 17.014331817626953, + "learning_rate": 5.656685784074825e-06, + "loss": 4.9641, + "step": 90115 + }, + { + "epoch": 1.83349609375, + "grad_norm": 18.152996063232422, + "learning_rate": 5.656289566629897e-06, + "loss": 4.9482, + "step": 90120 + }, + { + "epoch": 1.8335978190104165, + "grad_norm": 20.4405574798584, + "learning_rate": 5.655893344991486e-06, + "loss": 5.0148, + "step": 90125 + }, + { + "epoch": 1.8336995442708335, + "grad_norm": 14.787163734436035, + "learning_rate": 5.655497119162125e-06, + "loss": 4.8873, + "step": 90130 + }, + { + "epoch": 1.83380126953125, + "grad_norm": 16.057636260986328, + "learning_rate": 5.655100889144346e-06, + "loss": 5.0229, + "step": 90135 + }, + { + "epoch": 1.8339029947916665, + "grad_norm": 23.181730270385742, + "learning_rate": 5.6547046549406805e-06, + "loss": 4.42, + "step": 90140 + }, + { + "epoch": 1.8340047200520835, + "grad_norm": 20.541210174560547, + "learning_rate": 5.65430841655366e-06, + "loss": 4.8707, + "step": 90145 + }, + { + "epoch": 1.8341064453125, + "grad_norm": 18.001068115234375, + "learning_rate": 5.653912173985815e-06, + "loss": 4.9869, + "step": 90150 + }, + { + "epoch": 1.8342081705729165, + "grad_norm": 23.484085083007812, + "learning_rate": 5.65351592723968e-06, + "loss": 5.0166, + "step": 90155 + }, + { + "epoch": 1.8343098958333335, + "grad_norm": 29.04630470275879, + "learning_rate": 5.6531196763177855e-06, + "loss": 4.9152, + "step": 90160 + }, + { + "epoch": 1.83441162109375, + "grad_norm": 16.626575469970703, + "learning_rate": 5.652723421222663e-06, + "loss": 4.8421, + "step": 90165 + }, + { + "epoch": 1.8345133463541665, + "grad_norm": 18.33174705505371, + "learning_rate": 5.652327161956846e-06, + "loss": 4.8705, + "step": 90170 + }, + { + "epoch": 1.8346150716145835, + "grad_norm": 23.42844581604004, + "learning_rate": 5.651930898522865e-06, + "loss": 4.988, + "step": 90175 + }, + { + "epoch": 1.834716796875, + "grad_norm": 18.08989143371582, + "learning_rate": 5.651534630923254e-06, + "loss": 4.9762, + "step": 90180 + }, + { + "epoch": 1.8348185221354165, + "grad_norm": 16.39999771118164, + "learning_rate": 5.651138359160541e-06, + "loss": 4.6427, + "step": 90185 + }, + { + "epoch": 1.8349202473958335, + "grad_norm": 22.05010223388672, + "learning_rate": 5.650742083237262e-06, + "loss": 4.9813, + "step": 90190 + }, + { + "epoch": 1.83502197265625, + "grad_norm": 14.051259994506836, + "learning_rate": 5.650345803155947e-06, + "loss": 4.7726, + "step": 90195 + }, + { + "epoch": 1.8351236979166665, + "grad_norm": 24.492998123168945, + "learning_rate": 5.64994951891913e-06, + "loss": 5.0912, + "step": 90200 + }, + { + "epoch": 1.8352254231770835, + "grad_norm": 26.812898635864258, + "learning_rate": 5.6495532305293414e-06, + "loss": 4.7967, + "step": 90205 + }, + { + "epoch": 1.8353271484375, + "grad_norm": 17.041109085083008, + "learning_rate": 5.649156937989113e-06, + "loss": 4.9385, + "step": 90210 + }, + { + "epoch": 1.8354288736979165, + "grad_norm": 15.886242866516113, + "learning_rate": 5.648760641300979e-06, + "loss": 4.8491, + "step": 90215 + }, + { + "epoch": 1.8355305989583335, + "grad_norm": 19.800296783447266, + "learning_rate": 5.64836434046747e-06, + "loss": 4.5472, + "step": 90220 + }, + { + "epoch": 1.83563232421875, + "grad_norm": 20.13425064086914, + "learning_rate": 5.647968035491118e-06, + "loss": 4.9716, + "step": 90225 + }, + { + "epoch": 1.8357340494791665, + "grad_norm": 18.379247665405273, + "learning_rate": 5.6475717263744556e-06, + "loss": 4.8423, + "step": 90230 + }, + { + "epoch": 1.8358357747395835, + "grad_norm": 15.477333068847656, + "learning_rate": 5.6471754131200165e-06, + "loss": 4.7881, + "step": 90235 + }, + { + "epoch": 1.8359375, + "grad_norm": 17.09868049621582, + "learning_rate": 5.646779095730331e-06, + "loss": 5.0954, + "step": 90240 + }, + { + "epoch": 1.8360392252604165, + "grad_norm": 14.890953063964844, + "learning_rate": 5.646382774207933e-06, + "loss": 5.0954, + "step": 90245 + }, + { + "epoch": 1.8361409505208335, + "grad_norm": 18.67015838623047, + "learning_rate": 5.6459864485553535e-06, + "loss": 4.7026, + "step": 90250 + }, + { + "epoch": 1.83624267578125, + "grad_norm": 17.665470123291016, + "learning_rate": 5.645590118775128e-06, + "loss": 4.8705, + "step": 90255 + }, + { + "epoch": 1.8363444010416665, + "grad_norm": 19.701364517211914, + "learning_rate": 5.645193784869786e-06, + "loss": 4.8705, + "step": 90260 + }, + { + "epoch": 1.8364461263020835, + "grad_norm": 19.463878631591797, + "learning_rate": 5.644797446841858e-06, + "loss": 4.821, + "step": 90265 + }, + { + "epoch": 1.8365478515625, + "grad_norm": 18.02056312561035, + "learning_rate": 5.644401104693882e-06, + "loss": 5.0, + "step": 90270 + }, + { + "epoch": 1.8366495768229165, + "grad_norm": 15.025665283203125, + "learning_rate": 5.644004758428386e-06, + "loss": 4.8782, + "step": 90275 + }, + { + "epoch": 1.8367513020833335, + "grad_norm": 16.840599060058594, + "learning_rate": 5.643608408047904e-06, + "loss": 4.7533, + "step": 90280 + }, + { + "epoch": 1.83685302734375, + "grad_norm": 13.974286079406738, + "learning_rate": 5.643212053554969e-06, + "loss": 5.0798, + "step": 90285 + }, + { + "epoch": 1.8369547526041665, + "grad_norm": 14.039122581481934, + "learning_rate": 5.642815694952113e-06, + "loss": 4.8621, + "step": 90290 + }, + { + "epoch": 1.8370564778645835, + "grad_norm": 24.842784881591797, + "learning_rate": 5.642419332241868e-06, + "loss": 4.6754, + "step": 90295 + }, + { + "epoch": 1.837158203125, + "grad_norm": 15.4658784866333, + "learning_rate": 5.642022965426769e-06, + "loss": 4.8903, + "step": 90300 + }, + { + "epoch": 1.8372599283854165, + "grad_norm": 19.661163330078125, + "learning_rate": 5.641626594509346e-06, + "loss": 4.662, + "step": 90305 + }, + { + "epoch": 1.8373616536458335, + "grad_norm": 17.00649642944336, + "learning_rate": 5.641230219492133e-06, + "loss": 4.7773, + "step": 90310 + }, + { + "epoch": 1.83746337890625, + "grad_norm": 23.486942291259766, + "learning_rate": 5.640833840377662e-06, + "loss": 4.9185, + "step": 90315 + }, + { + "epoch": 1.8375651041666665, + "grad_norm": 13.00577163696289, + "learning_rate": 5.6404374571684664e-06, + "loss": 5.059, + "step": 90320 + }, + { + "epoch": 1.8376668294270835, + "grad_norm": 19.837339401245117, + "learning_rate": 5.64004106986708e-06, + "loss": 5.136, + "step": 90325 + }, + { + "epoch": 1.8377685546875, + "grad_norm": 17.5087890625, + "learning_rate": 5.639644678476033e-06, + "loss": 5.1213, + "step": 90330 + }, + { + "epoch": 1.8378702799479165, + "grad_norm": 19.353342056274414, + "learning_rate": 5.6392482829978605e-06, + "loss": 4.706, + "step": 90335 + }, + { + "epoch": 1.8379720052083335, + "grad_norm": 14.451083183288574, + "learning_rate": 5.638851883435092e-06, + "loss": 5.2265, + "step": 90340 + }, + { + "epoch": 1.83807373046875, + "grad_norm": 20.988759994506836, + "learning_rate": 5.638455479790264e-06, + "loss": 4.7875, + "step": 90345 + }, + { + "epoch": 1.8381754557291665, + "grad_norm": 15.55047607421875, + "learning_rate": 5.638059072065909e-06, + "loss": 4.7667, + "step": 90350 + }, + { + "epoch": 1.8382771809895835, + "grad_norm": 15.20980453491211, + "learning_rate": 5.637662660264559e-06, + "loss": 4.9036, + "step": 90355 + }, + { + "epoch": 1.83837890625, + "grad_norm": 20.026264190673828, + "learning_rate": 5.637266244388746e-06, + "loss": 4.8458, + "step": 90360 + }, + { + "epoch": 1.8384806315104165, + "grad_norm": 16.893962860107422, + "learning_rate": 5.636869824441004e-06, + "loss": 5.0993, + "step": 90365 + }, + { + "epoch": 1.8385823567708335, + "grad_norm": 17.77842903137207, + "learning_rate": 5.636473400423867e-06, + "loss": 4.9613, + "step": 90370 + }, + { + "epoch": 1.83868408203125, + "grad_norm": 18.541988372802734, + "learning_rate": 5.636076972339868e-06, + "loss": 4.9513, + "step": 90375 + }, + { + "epoch": 1.8387858072916665, + "grad_norm": 20.719873428344727, + "learning_rate": 5.635680540191537e-06, + "loss": 5.312, + "step": 90380 + }, + { + "epoch": 1.8388875325520835, + "grad_norm": 23.85433006286621, + "learning_rate": 5.63528410398141e-06, + "loss": 4.7466, + "step": 90385 + }, + { + "epoch": 1.8389892578125, + "grad_norm": 13.762463569641113, + "learning_rate": 5.63488766371202e-06, + "loss": 4.9125, + "step": 90390 + }, + { + "epoch": 1.8390909830729165, + "grad_norm": 18.24472427368164, + "learning_rate": 5.634491219385897e-06, + "loss": 4.7163, + "step": 90395 + }, + { + "epoch": 1.8391927083333335, + "grad_norm": 19.53291893005371, + "learning_rate": 5.634094771005578e-06, + "loss": 5.1341, + "step": 90400 + }, + { + "epoch": 1.83929443359375, + "grad_norm": 18.913379669189453, + "learning_rate": 5.633698318573595e-06, + "loss": 4.8743, + "step": 90405 + }, + { + "epoch": 1.8393961588541665, + "grad_norm": 18.627944946289062, + "learning_rate": 5.633301862092481e-06, + "loss": 4.8089, + "step": 90410 + }, + { + "epoch": 1.8394978841145835, + "grad_norm": 19.8207950592041, + "learning_rate": 5.632905401564769e-06, + "loss": 4.7338, + "step": 90415 + }, + { + "epoch": 1.839599609375, + "grad_norm": 14.893911361694336, + "learning_rate": 5.632508936992992e-06, + "loss": 4.7237, + "step": 90420 + }, + { + "epoch": 1.8397013346354165, + "grad_norm": 19.901033401489258, + "learning_rate": 5.632112468379685e-06, + "loss": 5.0387, + "step": 90425 + }, + { + "epoch": 1.8398030598958335, + "grad_norm": 19.98657989501953, + "learning_rate": 5.631715995727378e-06, + "loss": 4.9686, + "step": 90430 + }, + { + "epoch": 1.83990478515625, + "grad_norm": 20.391557693481445, + "learning_rate": 5.631319519038607e-06, + "loss": 5.0874, + "step": 90435 + }, + { + "epoch": 1.8400065104166665, + "grad_norm": 23.5108699798584, + "learning_rate": 5.630923038315905e-06, + "loss": 4.9509, + "step": 90440 + }, + { + "epoch": 1.8401082356770835, + "grad_norm": 15.591397285461426, + "learning_rate": 5.6305265535618046e-06, + "loss": 4.769, + "step": 90445 + }, + { + "epoch": 1.8402099609375, + "grad_norm": 18.499963760375977, + "learning_rate": 5.63013006477884e-06, + "loss": 4.9579, + "step": 90450 + }, + { + "epoch": 1.8403116861979165, + "grad_norm": 13.321513175964355, + "learning_rate": 5.629733571969545e-06, + "loss": 4.7447, + "step": 90455 + }, + { + "epoch": 1.8404134114583335, + "grad_norm": 20.3758602142334, + "learning_rate": 5.629337075136451e-06, + "loss": 4.961, + "step": 90460 + }, + { + "epoch": 1.84051513671875, + "grad_norm": 16.32838249206543, + "learning_rate": 5.628940574282094e-06, + "loss": 4.9463, + "step": 90465 + }, + { + "epoch": 1.8406168619791665, + "grad_norm": 15.166962623596191, + "learning_rate": 5.628544069409005e-06, + "loss": 4.9447, + "step": 90470 + }, + { + "epoch": 1.8407185872395835, + "grad_norm": 18.490528106689453, + "learning_rate": 5.628147560519721e-06, + "loss": 5.0154, + "step": 90475 + }, + { + "epoch": 1.8408203125, + "grad_norm": 21.453649520874023, + "learning_rate": 5.627751047616772e-06, + "loss": 4.8439, + "step": 90480 + }, + { + "epoch": 1.8409220377604165, + "grad_norm": 18.27083969116211, + "learning_rate": 5.627354530702693e-06, + "loss": 5.0327, + "step": 90485 + }, + { + "epoch": 1.8410237630208335, + "grad_norm": 14.393431663513184, + "learning_rate": 5.626958009780018e-06, + "loss": 4.8057, + "step": 90490 + }, + { + "epoch": 1.84112548828125, + "grad_norm": 15.850385665893555, + "learning_rate": 5.626561484851281e-06, + "loss": 4.9983, + "step": 90495 + }, + { + "epoch": 1.8412272135416665, + "grad_norm": 16.720823287963867, + "learning_rate": 5.6261649559190134e-06, + "loss": 4.7445, + "step": 90500 + }, + { + "epoch": 1.8413289388020835, + "grad_norm": 24.680349349975586, + "learning_rate": 5.625768422985752e-06, + "loss": 5.2249, + "step": 90505 + }, + { + "epoch": 1.8414306640625, + "grad_norm": 26.449724197387695, + "learning_rate": 5.625371886054028e-06, + "loss": 4.8083, + "step": 90510 + }, + { + "epoch": 1.8415323893229165, + "grad_norm": 15.94723892211914, + "learning_rate": 5.6249753451263775e-06, + "loss": 4.8283, + "step": 90515 + }, + { + "epoch": 1.8416341145833335, + "grad_norm": 16.120180130004883, + "learning_rate": 5.624578800205332e-06, + "loss": 4.7196, + "step": 90520 + }, + { + "epoch": 1.84173583984375, + "grad_norm": 13.997225761413574, + "learning_rate": 5.624182251293426e-06, + "loss": 4.9901, + "step": 90525 + }, + { + "epoch": 1.8418375651041665, + "grad_norm": 20.53710174560547, + "learning_rate": 5.623785698393195e-06, + "loss": 4.8023, + "step": 90530 + }, + { + "epoch": 1.8419392903645835, + "grad_norm": 16.444122314453125, + "learning_rate": 5.623389141507169e-06, + "loss": 4.7738, + "step": 90535 + }, + { + "epoch": 1.842041015625, + "grad_norm": 19.893253326416016, + "learning_rate": 5.622992580637887e-06, + "loss": 4.9504, + "step": 90540 + }, + { + "epoch": 1.8421427408854165, + "grad_norm": 18.04999351501465, + "learning_rate": 5.622596015787879e-06, + "loss": 4.7483, + "step": 90545 + }, + { + "epoch": 1.8422444661458335, + "grad_norm": 20.596145629882812, + "learning_rate": 5.62219944695968e-06, + "loss": 4.8951, + "step": 90550 + }, + { + "epoch": 1.84234619140625, + "grad_norm": 20.291934967041016, + "learning_rate": 5.621802874155824e-06, + "loss": 4.7976, + "step": 90555 + }, + { + "epoch": 1.8424479166666665, + "grad_norm": 17.55929183959961, + "learning_rate": 5.621406297378844e-06, + "loss": 4.863, + "step": 90560 + }, + { + "epoch": 1.8425496419270835, + "grad_norm": 20.107446670532227, + "learning_rate": 5.621009716631277e-06, + "loss": 4.824, + "step": 90565 + }, + { + "epoch": 1.8426513671875, + "grad_norm": 13.974650382995605, + "learning_rate": 5.620613131915654e-06, + "loss": 4.8774, + "step": 90570 + }, + { + "epoch": 1.8427530924479165, + "grad_norm": 19.1873836517334, + "learning_rate": 5.62021654323451e-06, + "loss": 5.0476, + "step": 90575 + }, + { + "epoch": 1.8428548177083335, + "grad_norm": 18.26919174194336, + "learning_rate": 5.619819950590379e-06, + "loss": 4.7425, + "step": 90580 + }, + { + "epoch": 1.84295654296875, + "grad_norm": 15.515401840209961, + "learning_rate": 5.619423353985797e-06, + "loss": 4.9904, + "step": 90585 + }, + { + "epoch": 1.8430582682291665, + "grad_norm": 22.309749603271484, + "learning_rate": 5.619026753423294e-06, + "loss": 4.9278, + "step": 90590 + }, + { + "epoch": 1.8431599934895835, + "grad_norm": 17.079404830932617, + "learning_rate": 5.618630148905408e-06, + "loss": 5.0596, + "step": 90595 + }, + { + "epoch": 1.84326171875, + "grad_norm": 18.292755126953125, + "learning_rate": 5.618233540434671e-06, + "loss": 5.0611, + "step": 90600 + }, + { + "epoch": 1.8433634440104165, + "grad_norm": 14.054459571838379, + "learning_rate": 5.617836928013618e-06, + "loss": 4.8144, + "step": 90605 + }, + { + "epoch": 1.8434651692708335, + "grad_norm": 16.657188415527344, + "learning_rate": 5.617440311644784e-06, + "loss": 4.954, + "step": 90610 + }, + { + "epoch": 1.84356689453125, + "grad_norm": 18.09839630126953, + "learning_rate": 5.617043691330701e-06, + "loss": 5.0898, + "step": 90615 + }, + { + "epoch": 1.8436686197916665, + "grad_norm": 18.216205596923828, + "learning_rate": 5.616647067073905e-06, + "loss": 4.9778, + "step": 90620 + }, + { + "epoch": 1.8437703450520835, + "grad_norm": 16.383962631225586, + "learning_rate": 5.6162504388769314e-06, + "loss": 4.8235, + "step": 90625 + }, + { + "epoch": 1.8438720703125, + "grad_norm": 17.825551986694336, + "learning_rate": 5.615853806742312e-06, + "loss": 4.9567, + "step": 90630 + }, + { + "epoch": 1.8439737955729165, + "grad_norm": 21.801013946533203, + "learning_rate": 5.615457170672582e-06, + "loss": 4.6286, + "step": 90635 + }, + { + "epoch": 1.8440755208333335, + "grad_norm": 14.708855628967285, + "learning_rate": 5.615060530670276e-06, + "loss": 4.7188, + "step": 90640 + }, + { + "epoch": 1.84417724609375, + "grad_norm": 18.400585174560547, + "learning_rate": 5.61466388673793e-06, + "loss": 4.841, + "step": 90645 + }, + { + "epoch": 1.8442789713541665, + "grad_norm": 18.7945556640625, + "learning_rate": 5.614267238878076e-06, + "loss": 4.7245, + "step": 90650 + }, + { + "epoch": 1.8443806966145835, + "grad_norm": 19.722335815429688, + "learning_rate": 5.613870587093249e-06, + "loss": 4.8025, + "step": 90655 + }, + { + "epoch": 1.844482421875, + "grad_norm": 15.633505821228027, + "learning_rate": 5.6134739313859846e-06, + "loss": 5.0411, + "step": 90660 + }, + { + "epoch": 1.8445841471354165, + "grad_norm": 19.402881622314453, + "learning_rate": 5.613077271758814e-06, + "loss": 4.9233, + "step": 90665 + }, + { + "epoch": 1.8446858723958335, + "grad_norm": 24.7783145904541, + "learning_rate": 5.612680608214277e-06, + "loss": 4.7875, + "step": 90670 + }, + { + "epoch": 1.84478759765625, + "grad_norm": 21.30197525024414, + "learning_rate": 5.612283940754906e-06, + "loss": 4.8733, + "step": 90675 + }, + { + "epoch": 1.8448893229166665, + "grad_norm": 20.724584579467773, + "learning_rate": 5.611887269383232e-06, + "loss": 4.8937, + "step": 90680 + }, + { + "epoch": 1.8449910481770835, + "grad_norm": 16.57370948791504, + "learning_rate": 5.611490594101795e-06, + "loss": 4.9241, + "step": 90685 + }, + { + "epoch": 1.8450927734375, + "grad_norm": 17.260427474975586, + "learning_rate": 5.611093914913126e-06, + "loss": 4.7905, + "step": 90690 + }, + { + "epoch": 1.8451944986979165, + "grad_norm": 18.885007858276367, + "learning_rate": 5.610697231819762e-06, + "loss": 4.9367, + "step": 90695 + }, + { + "epoch": 1.8452962239583335, + "grad_norm": 18.830324172973633, + "learning_rate": 5.610300544824236e-06, + "loss": 4.7285, + "step": 90700 + }, + { + "epoch": 1.84539794921875, + "grad_norm": 17.995105743408203, + "learning_rate": 5.609903853929081e-06, + "loss": 4.6469, + "step": 90705 + }, + { + "epoch": 1.8454996744791665, + "grad_norm": 22.486072540283203, + "learning_rate": 5.609507159136838e-06, + "loss": 4.8375, + "step": 90710 + }, + { + "epoch": 1.8456013997395835, + "grad_norm": 24.43042755126953, + "learning_rate": 5.609110460450036e-06, + "loss": 4.6747, + "step": 90715 + }, + { + "epoch": 1.845703125, + "grad_norm": 18.40371322631836, + "learning_rate": 5.608713757871211e-06, + "loss": 4.8637, + "step": 90720 + }, + { + "epoch": 1.8458048502604165, + "grad_norm": 20.416303634643555, + "learning_rate": 5.608317051402898e-06, + "loss": 5.1387, + "step": 90725 + }, + { + "epoch": 1.8459065755208335, + "grad_norm": 13.138680458068848, + "learning_rate": 5.607920341047632e-06, + "loss": 5.0418, + "step": 90730 + }, + { + "epoch": 1.84600830078125, + "grad_norm": 18.72737693786621, + "learning_rate": 5.60752362680795e-06, + "loss": 5.1742, + "step": 90735 + }, + { + "epoch": 1.8461100260416665, + "grad_norm": 17.593727111816406, + "learning_rate": 5.6071269086863845e-06, + "loss": 4.8296, + "step": 90740 + }, + { + "epoch": 1.8462117513020835, + "grad_norm": 16.382980346679688, + "learning_rate": 5.606730186685468e-06, + "loss": 5.1473, + "step": 90745 + }, + { + "epoch": 1.8463134765625, + "grad_norm": 19.821638107299805, + "learning_rate": 5.6063334608077415e-06, + "loss": 4.7468, + "step": 90750 + }, + { + "epoch": 1.8464152018229165, + "grad_norm": 19.928482055664062, + "learning_rate": 5.605936731055734e-06, + "loss": 4.7666, + "step": 90755 + }, + { + "epoch": 1.8465169270833335, + "grad_norm": 17.145971298217773, + "learning_rate": 5.605539997431984e-06, + "loss": 4.7791, + "step": 90760 + }, + { + "epoch": 1.84661865234375, + "grad_norm": 19.555734634399414, + "learning_rate": 5.605143259939027e-06, + "loss": 4.8861, + "step": 90765 + }, + { + "epoch": 1.8467203776041665, + "grad_norm": 13.794074058532715, + "learning_rate": 5.6047465185793945e-06, + "loss": 4.7494, + "step": 90770 + }, + { + "epoch": 1.8468221028645835, + "grad_norm": 23.972820281982422, + "learning_rate": 5.604349773355626e-06, + "loss": 5.0477, + "step": 90775 + }, + { + "epoch": 1.846923828125, + "grad_norm": 18.516504287719727, + "learning_rate": 5.6039530242702525e-06, + "loss": 5.057, + "step": 90780 + }, + { + "epoch": 1.8470255533854165, + "grad_norm": 15.695572853088379, + "learning_rate": 5.6035562713258115e-06, + "loss": 4.7508, + "step": 90785 + }, + { + "epoch": 1.8471272786458335, + "grad_norm": 17.218673706054688, + "learning_rate": 5.603159514524839e-06, + "loss": 4.7747, + "step": 90790 + }, + { + "epoch": 1.84722900390625, + "grad_norm": 18.38174057006836, + "learning_rate": 5.602762753869866e-06, + "loss": 4.8618, + "step": 90795 + }, + { + "epoch": 1.8473307291666665, + "grad_norm": 13.673547744750977, + "learning_rate": 5.602365989363433e-06, + "loss": 4.8061, + "step": 90800 + }, + { + "epoch": 1.8474324544270835, + "grad_norm": 17.122703552246094, + "learning_rate": 5.601969221008071e-06, + "loss": 5.1225, + "step": 90805 + }, + { + "epoch": 1.8475341796875, + "grad_norm": 19.55327796936035, + "learning_rate": 5.601572448806316e-06, + "loss": 4.7805, + "step": 90810 + }, + { + "epoch": 1.8476359049479165, + "grad_norm": 21.63962745666504, + "learning_rate": 5.6011756727607056e-06, + "loss": 4.7897, + "step": 90815 + }, + { + "epoch": 1.8477376302083335, + "grad_norm": 16.38515281677246, + "learning_rate": 5.600778892873773e-06, + "loss": 5.0703, + "step": 90820 + }, + { + "epoch": 1.84783935546875, + "grad_norm": 22.251272201538086, + "learning_rate": 5.600382109148054e-06, + "loss": 5.1934, + "step": 90825 + }, + { + "epoch": 1.8479410807291665, + "grad_norm": 13.443397521972656, + "learning_rate": 5.599985321586084e-06, + "loss": 5.1473, + "step": 90830 + }, + { + "epoch": 1.8480428059895835, + "grad_norm": 14.43365478515625, + "learning_rate": 5.599588530190397e-06, + "loss": 4.8411, + "step": 90835 + }, + { + "epoch": 1.84814453125, + "grad_norm": 16.282638549804688, + "learning_rate": 5.599191734963532e-06, + "loss": 5.0874, + "step": 90840 + }, + { + "epoch": 1.8482462565104165, + "grad_norm": 25.660560607910156, + "learning_rate": 5.59879493590802e-06, + "loss": 4.6704, + "step": 90845 + }, + { + "epoch": 1.8483479817708335, + "grad_norm": 19.125944137573242, + "learning_rate": 5.598398133026399e-06, + "loss": 4.8078, + "step": 90850 + }, + { + "epoch": 1.84844970703125, + "grad_norm": 18.974472045898438, + "learning_rate": 5.5980013263212054e-06, + "loss": 4.7592, + "step": 90855 + }, + { + "epoch": 1.8485514322916665, + "grad_norm": 17.75747299194336, + "learning_rate": 5.597604515794971e-06, + "loss": 5.1082, + "step": 90860 + }, + { + "epoch": 1.8486531575520835, + "grad_norm": 17.63633155822754, + "learning_rate": 5.597207701450235e-06, + "loss": 4.8858, + "step": 90865 + }, + { + "epoch": 1.8487548828125, + "grad_norm": 18.797216415405273, + "learning_rate": 5.59681088328953e-06, + "loss": 4.8629, + "step": 90870 + }, + { + "epoch": 1.8488566080729165, + "grad_norm": 19.093875885009766, + "learning_rate": 5.596414061315394e-06, + "loss": 4.5794, + "step": 90875 + }, + { + "epoch": 1.8489583333333335, + "grad_norm": 14.973759651184082, + "learning_rate": 5.5960172355303614e-06, + "loss": 4.923, + "step": 90880 + }, + { + "epoch": 1.84906005859375, + "grad_norm": 21.745676040649414, + "learning_rate": 5.595620405936967e-06, + "loss": 4.7429, + "step": 90885 + }, + { + "epoch": 1.8491617838541665, + "grad_norm": 24.53646469116211, + "learning_rate": 5.595223572537747e-06, + "loss": 5.0458, + "step": 90890 + }, + { + "epoch": 1.8492635091145835, + "grad_norm": 14.022856712341309, + "learning_rate": 5.594826735335238e-06, + "loss": 4.7893, + "step": 90895 + }, + { + "epoch": 1.849365234375, + "grad_norm": 19.83128547668457, + "learning_rate": 5.594429894331974e-06, + "loss": 4.603, + "step": 90900 + }, + { + "epoch": 1.8494669596354165, + "grad_norm": 18.07124137878418, + "learning_rate": 5.594033049530494e-06, + "loss": 4.5663, + "step": 90905 + }, + { + "epoch": 1.8495686848958335, + "grad_norm": 18.72995376586914, + "learning_rate": 5.593636200933329e-06, + "loss": 4.6941, + "step": 90910 + }, + { + "epoch": 1.84967041015625, + "grad_norm": 25.112348556518555, + "learning_rate": 5.593239348543018e-06, + "loss": 4.9054, + "step": 90915 + }, + { + "epoch": 1.8497721354166665, + "grad_norm": 18.55019760131836, + "learning_rate": 5.592842492362097e-06, + "loss": 4.7903, + "step": 90920 + }, + { + "epoch": 1.8498738606770835, + "grad_norm": 22.92816162109375, + "learning_rate": 5.5924456323931e-06, + "loss": 4.8968, + "step": 90925 + }, + { + "epoch": 1.8499755859375, + "grad_norm": 20.519336700439453, + "learning_rate": 5.592048768638563e-06, + "loss": 5.0702, + "step": 90930 + }, + { + "epoch": 1.8500773111979165, + "grad_norm": 24.160404205322266, + "learning_rate": 5.591651901101023e-06, + "loss": 4.6351, + "step": 90935 + }, + { + "epoch": 1.8501790364583335, + "grad_norm": 19.350542068481445, + "learning_rate": 5.591255029783014e-06, + "loss": 5.0675, + "step": 90940 + }, + { + "epoch": 1.85028076171875, + "grad_norm": 14.466266632080078, + "learning_rate": 5.590858154687075e-06, + "loss": 4.9296, + "step": 90945 + }, + { + "epoch": 1.8503824869791665, + "grad_norm": 18.89902114868164, + "learning_rate": 5.590461275815737e-06, + "loss": 4.5706, + "step": 90950 + }, + { + "epoch": 1.8504842122395835, + "grad_norm": 16.415206909179688, + "learning_rate": 5.590064393171541e-06, + "loss": 5.035, + "step": 90955 + }, + { + "epoch": 1.8505859375, + "grad_norm": 15.717108726501465, + "learning_rate": 5.589667506757022e-06, + "loss": 5.0806, + "step": 90960 + }, + { + "epoch": 1.8506876627604165, + "grad_norm": 20.641767501831055, + "learning_rate": 5.589270616574712e-06, + "loss": 4.8652, + "step": 90965 + }, + { + "epoch": 1.8507893880208335, + "grad_norm": 17.90348243713379, + "learning_rate": 5.588873722627152e-06, + "loss": 4.8436, + "step": 90970 + }, + { + "epoch": 1.85089111328125, + "grad_norm": 17.591232299804688, + "learning_rate": 5.588476824916874e-06, + "loss": 4.8449, + "step": 90975 + }, + { + "epoch": 1.8509928385416665, + "grad_norm": 22.825265884399414, + "learning_rate": 5.588079923446417e-06, + "loss": 4.907, + "step": 90980 + }, + { + "epoch": 1.8510945638020835, + "grad_norm": 24.450244903564453, + "learning_rate": 5.587683018218316e-06, + "loss": 4.6951, + "step": 90985 + }, + { + "epoch": 1.8511962890625, + "grad_norm": 21.537643432617188, + "learning_rate": 5.587286109235107e-06, + "loss": 4.8878, + "step": 90990 + }, + { + "epoch": 1.8512980143229165, + "grad_norm": 13.849050521850586, + "learning_rate": 5.586889196499327e-06, + "loss": 4.9195, + "step": 90995 + }, + { + "epoch": 1.8513997395833335, + "grad_norm": 16.665058135986328, + "learning_rate": 5.586492280013509e-06, + "loss": 4.7385, + "step": 91000 + }, + { + "epoch": 1.85150146484375, + "grad_norm": 19.600448608398438, + "learning_rate": 5.586095359780194e-06, + "loss": 4.8893, + "step": 91005 + }, + { + "epoch": 1.8516031901041665, + "grad_norm": 18.43243408203125, + "learning_rate": 5.585698435801914e-06, + "loss": 4.7637, + "step": 91010 + }, + { + "epoch": 1.8517049153645835, + "grad_norm": 25.237810134887695, + "learning_rate": 5.585301508081208e-06, + "loss": 4.9918, + "step": 91015 + }, + { + "epoch": 1.851806640625, + "grad_norm": 15.249534606933594, + "learning_rate": 5.584904576620611e-06, + "loss": 4.6286, + "step": 91020 + }, + { + "epoch": 1.8519083658854165, + "grad_norm": 20.554702758789062, + "learning_rate": 5.584507641422658e-06, + "loss": 4.8892, + "step": 91025 + }, + { + "epoch": 1.8520100911458335, + "grad_norm": 17.687366485595703, + "learning_rate": 5.584110702489888e-06, + "loss": 4.7665, + "step": 91030 + }, + { + "epoch": 1.85211181640625, + "grad_norm": 15.805188179016113, + "learning_rate": 5.583713759824836e-06, + "loss": 4.8328, + "step": 91035 + }, + { + "epoch": 1.8522135416666665, + "grad_norm": 14.277975082397461, + "learning_rate": 5.583316813430037e-06, + "loss": 5.0968, + "step": 91040 + }, + { + "epoch": 1.8523152669270835, + "grad_norm": 15.86174201965332, + "learning_rate": 5.58291986330803e-06, + "loss": 4.9424, + "step": 91045 + }, + { + "epoch": 1.8524169921875, + "grad_norm": 23.12969207763672, + "learning_rate": 5.582522909461351e-06, + "loss": 5.1608, + "step": 91050 + }, + { + "epoch": 1.8525187174479165, + "grad_norm": 18.2645206451416, + "learning_rate": 5.582125951892534e-06, + "loss": 4.6048, + "step": 91055 + }, + { + "epoch": 1.8526204427083335, + "grad_norm": 16.778486251831055, + "learning_rate": 5.581728990604118e-06, + "loss": 4.8689, + "step": 91060 + }, + { + "epoch": 1.85272216796875, + "grad_norm": 16.46207618713379, + "learning_rate": 5.581332025598638e-06, + "loss": 4.7648, + "step": 91065 + }, + { + "epoch": 1.8528238932291665, + "grad_norm": 17.558006286621094, + "learning_rate": 5.580935056878631e-06, + "loss": 5.0993, + "step": 91070 + }, + { + "epoch": 1.8529256184895835, + "grad_norm": 19.161046981811523, + "learning_rate": 5.5805380844466335e-06, + "loss": 4.8601, + "step": 91075 + }, + { + "epoch": 1.85302734375, + "grad_norm": 20.220905303955078, + "learning_rate": 5.5801411083051815e-06, + "loss": 4.7684, + "step": 91080 + }, + { + "epoch": 1.8531290690104165, + "grad_norm": 16.486318588256836, + "learning_rate": 5.579744128456812e-06, + "loss": 4.7791, + "step": 91085 + }, + { + "epoch": 1.8532307942708335, + "grad_norm": 21.06505012512207, + "learning_rate": 5.579347144904063e-06, + "loss": 5.0099, + "step": 91090 + }, + { + "epoch": 1.85333251953125, + "grad_norm": 18.055173873901367, + "learning_rate": 5.578950157649467e-06, + "loss": 4.9906, + "step": 91095 + }, + { + "epoch": 1.8534342447916665, + "grad_norm": 16.042631149291992, + "learning_rate": 5.5785531666955665e-06, + "loss": 4.7484, + "step": 91100 + }, + { + "epoch": 1.8535359700520835, + "grad_norm": 18.01394271850586, + "learning_rate": 5.578156172044892e-06, + "loss": 4.6487, + "step": 91105 + }, + { + "epoch": 1.8536376953125, + "grad_norm": 13.27407169342041, + "learning_rate": 5.577759173699985e-06, + "loss": 4.9432, + "step": 91110 + }, + { + "epoch": 1.8537394205729165, + "grad_norm": 23.46183204650879, + "learning_rate": 5.5773621716633795e-06, + "loss": 4.8652, + "step": 91115 + }, + { + "epoch": 1.8538411458333335, + "grad_norm": 22.093698501586914, + "learning_rate": 5.576965165937612e-06, + "loss": 4.7386, + "step": 91120 + }, + { + "epoch": 1.85394287109375, + "grad_norm": 15.88077163696289, + "learning_rate": 5.576568156525222e-06, + "loss": 4.9731, + "step": 91125 + }, + { + "epoch": 1.8540445963541665, + "grad_norm": 18.381925582885742, + "learning_rate": 5.576171143428743e-06, + "loss": 4.8859, + "step": 91130 + }, + { + "epoch": 1.8541463216145835, + "grad_norm": 17.955026626586914, + "learning_rate": 5.575774126650715e-06, + "loss": 4.7011, + "step": 91135 + }, + { + "epoch": 1.854248046875, + "grad_norm": 13.562639236450195, + "learning_rate": 5.575377106193672e-06, + "loss": 4.7425, + "step": 91140 + }, + { + "epoch": 1.8543497721354165, + "grad_norm": 18.737934112548828, + "learning_rate": 5.5749800820601516e-06, + "loss": 4.7054, + "step": 91145 + }, + { + "epoch": 1.8544514973958335, + "grad_norm": 17.644634246826172, + "learning_rate": 5.574583054252692e-06, + "loss": 4.9279, + "step": 91150 + }, + { + "epoch": 1.85455322265625, + "grad_norm": 13.732115745544434, + "learning_rate": 5.5741860227738275e-06, + "loss": 4.8874, + "step": 91155 + }, + { + "epoch": 1.8546549479166665, + "grad_norm": 19.0394344329834, + "learning_rate": 5.573788987626099e-06, + "loss": 5.2078, + "step": 91160 + }, + { + "epoch": 1.8547566731770835, + "grad_norm": 18.53523063659668, + "learning_rate": 5.573391948812038e-06, + "loss": 4.7725, + "step": 91165 + }, + { + "epoch": 1.8548583984375, + "grad_norm": 17.030778884887695, + "learning_rate": 5.572994906334186e-06, + "loss": 4.9576, + "step": 91170 + }, + { + "epoch": 1.8549601236979165, + "grad_norm": 34.44041442871094, + "learning_rate": 5.572597860195079e-06, + "loss": 4.9358, + "step": 91175 + }, + { + "epoch": 1.8550618489583335, + "grad_norm": 12.792932510375977, + "learning_rate": 5.572200810397253e-06, + "loss": 4.7954, + "step": 91180 + }, + { + "epoch": 1.85516357421875, + "grad_norm": 21.66972541809082, + "learning_rate": 5.571803756943245e-06, + "loss": 5.1909, + "step": 91185 + }, + { + "epoch": 1.8552652994791665, + "grad_norm": 22.90046501159668, + "learning_rate": 5.571406699835593e-06, + "loss": 4.8783, + "step": 91190 + }, + { + "epoch": 1.8553670247395835, + "grad_norm": 21.106693267822266, + "learning_rate": 5.571009639076833e-06, + "loss": 4.955, + "step": 91195 + }, + { + "epoch": 1.85546875, + "grad_norm": 14.904635429382324, + "learning_rate": 5.570612574669504e-06, + "loss": 4.6394, + "step": 91200 + }, + { + "epoch": 1.8555704752604165, + "grad_norm": 18.632814407348633, + "learning_rate": 5.570215506616141e-06, + "loss": 5.1176, + "step": 91205 + }, + { + "epoch": 1.8556722005208335, + "grad_norm": 23.894350051879883, + "learning_rate": 5.569818434919282e-06, + "loss": 4.6554, + "step": 91210 + }, + { + "epoch": 1.85577392578125, + "grad_norm": 17.842430114746094, + "learning_rate": 5.569421359581464e-06, + "loss": 5.1741, + "step": 91215 + }, + { + "epoch": 1.8558756510416665, + "grad_norm": 15.962475776672363, + "learning_rate": 5.569024280605224e-06, + "loss": 4.8873, + "step": 91220 + }, + { + "epoch": 1.8559773763020835, + "grad_norm": 19.035476684570312, + "learning_rate": 5.5686271979931e-06, + "loss": 4.9614, + "step": 91225 + }, + { + "epoch": 1.8560791015625, + "grad_norm": 19.868717193603516, + "learning_rate": 5.568230111747629e-06, + "loss": 5.3428, + "step": 91230 + }, + { + "epoch": 1.8561808268229165, + "grad_norm": 18.077672958374023, + "learning_rate": 5.5678330218713476e-06, + "loss": 4.8748, + "step": 91235 + }, + { + "epoch": 1.8562825520833335, + "grad_norm": 18.435962677001953, + "learning_rate": 5.567435928366793e-06, + "loss": 4.7853, + "step": 91240 + }, + { + "epoch": 1.85638427734375, + "grad_norm": 24.46708106994629, + "learning_rate": 5.5670388312365045e-06, + "loss": 5.0111, + "step": 91245 + }, + { + "epoch": 1.8564860026041665, + "grad_norm": 23.633407592773438, + "learning_rate": 5.566641730483015e-06, + "loss": 4.7524, + "step": 91250 + }, + { + "epoch": 1.8565877278645835, + "grad_norm": 17.74486541748047, + "learning_rate": 5.5662446261088675e-06, + "loss": 4.8178, + "step": 91255 + }, + { + "epoch": 1.856689453125, + "grad_norm": 19.22121810913086, + "learning_rate": 5.565847518116594e-06, + "loss": 5.057, + "step": 91260 + }, + { + "epoch": 1.8567911783854165, + "grad_norm": 17.036958694458008, + "learning_rate": 5.565450406508737e-06, + "loss": 4.6969, + "step": 91265 + }, + { + "epoch": 1.8568929036458335, + "grad_norm": 16.0377254486084, + "learning_rate": 5.565053291287831e-06, + "loss": 4.9521, + "step": 91270 + }, + { + "epoch": 1.85699462890625, + "grad_norm": 16.041013717651367, + "learning_rate": 5.564656172456412e-06, + "loss": 5.1029, + "step": 91275 + }, + { + "epoch": 1.8570963541666665, + "grad_norm": 12.62061882019043, + "learning_rate": 5.564259050017022e-06, + "loss": 4.8883, + "step": 91280 + }, + { + "epoch": 1.8571980794270835, + "grad_norm": 20.922788619995117, + "learning_rate": 5.5638619239721925e-06, + "loss": 4.7729, + "step": 91285 + }, + { + "epoch": 1.8572998046875, + "grad_norm": 19.993242263793945, + "learning_rate": 5.563464794324467e-06, + "loss": 4.8601, + "step": 91290 + }, + { + "epoch": 1.8574015299479165, + "grad_norm": 19.733840942382812, + "learning_rate": 5.5630676610763774e-06, + "loss": 4.8755, + "step": 91295 + }, + { + "epoch": 1.8575032552083335, + "grad_norm": 13.420479774475098, + "learning_rate": 5.562670524230468e-06, + "loss": 4.8351, + "step": 91300 + }, + { + "epoch": 1.85760498046875, + "grad_norm": 16.167234420776367, + "learning_rate": 5.562273383789269e-06, + "loss": 5.1797, + "step": 91305 + }, + { + "epoch": 1.8577067057291665, + "grad_norm": 19.43939781188965, + "learning_rate": 5.561876239755324e-06, + "loss": 4.9049, + "step": 91310 + }, + { + "epoch": 1.8578084309895835, + "grad_norm": 18.23601531982422, + "learning_rate": 5.5614790921311664e-06, + "loss": 4.8541, + "step": 91315 + }, + { + "epoch": 1.85791015625, + "grad_norm": 17.885467529296875, + "learning_rate": 5.561081940919338e-06, + "loss": 4.8933, + "step": 91320 + }, + { + "epoch": 1.8580118815104165, + "grad_norm": 20.249404907226562, + "learning_rate": 5.560684786122372e-06, + "loss": 4.9081, + "step": 91325 + }, + { + "epoch": 1.8581136067708335, + "grad_norm": 18.786304473876953, + "learning_rate": 5.56028762774281e-06, + "loss": 5.102, + "step": 91330 + }, + { + "epoch": 1.85821533203125, + "grad_norm": 21.8228816986084, + "learning_rate": 5.5598904657831865e-06, + "loss": 4.7068, + "step": 91335 + }, + { + "epoch": 1.8583170572916665, + "grad_norm": 16.708776473999023, + "learning_rate": 5.559493300246041e-06, + "loss": 4.5856, + "step": 91340 + }, + { + "epoch": 1.8584187825520835, + "grad_norm": 17.111309051513672, + "learning_rate": 5.559096131133912e-06, + "loss": 5.1703, + "step": 91345 + }, + { + "epoch": 1.8585205078125, + "grad_norm": 12.987927436828613, + "learning_rate": 5.5586989584493345e-06, + "loss": 4.8712, + "step": 91350 + }, + { + "epoch": 1.8586222330729165, + "grad_norm": 21.080158233642578, + "learning_rate": 5.5583017821948495e-06, + "loss": 4.8998, + "step": 91355 + }, + { + "epoch": 1.8587239583333335, + "grad_norm": 21.19326400756836, + "learning_rate": 5.557904602372992e-06, + "loss": 4.7984, + "step": 91360 + }, + { + "epoch": 1.85882568359375, + "grad_norm": 18.19942855834961, + "learning_rate": 5.557507418986301e-06, + "loss": 4.8475, + "step": 91365 + }, + { + "epoch": 1.8589274088541665, + "grad_norm": 18.917991638183594, + "learning_rate": 5.5571102320373164e-06, + "loss": 4.9796, + "step": 91370 + }, + { + "epoch": 1.8590291341145835, + "grad_norm": 17.333837509155273, + "learning_rate": 5.556713041528573e-06, + "loss": 5.0288, + "step": 91375 + }, + { + "epoch": 1.859130859375, + "grad_norm": 20.716812133789062, + "learning_rate": 5.55631584746261e-06, + "loss": 4.5534, + "step": 91380 + }, + { + "epoch": 1.8592325846354165, + "grad_norm": 20.97273063659668, + "learning_rate": 5.555918649841966e-06, + "loss": 4.9241, + "step": 91385 + }, + { + "epoch": 1.8593343098958335, + "grad_norm": 15.132340431213379, + "learning_rate": 5.5555214486691775e-06, + "loss": 4.9583, + "step": 91390 + }, + { + "epoch": 1.85943603515625, + "grad_norm": 17.843046188354492, + "learning_rate": 5.555124243946783e-06, + "loss": 4.6591, + "step": 91395 + }, + { + "epoch": 1.8595377604166665, + "grad_norm": 17.410114288330078, + "learning_rate": 5.554727035677323e-06, + "loss": 4.898, + "step": 91400 + }, + { + "epoch": 1.8596394856770835, + "grad_norm": 16.86821937561035, + "learning_rate": 5.5543298238633305e-06, + "loss": 4.9411, + "step": 91405 + }, + { + "epoch": 1.8597412109375, + "grad_norm": 17.363245010375977, + "learning_rate": 5.553932608507349e-06, + "loss": 4.9097, + "step": 91410 + }, + { + "epoch": 1.8598429361979165, + "grad_norm": 18.953033447265625, + "learning_rate": 5.553535389611912e-06, + "loss": 4.971, + "step": 91415 + }, + { + "epoch": 1.8599446614583335, + "grad_norm": 16.476146697998047, + "learning_rate": 5.55313816717956e-06, + "loss": 5.0275, + "step": 91420 + }, + { + "epoch": 1.86004638671875, + "grad_norm": 17.3944149017334, + "learning_rate": 5.552740941212831e-06, + "loss": 4.6984, + "step": 91425 + }, + { + "epoch": 1.8601481119791665, + "grad_norm": 20.183101654052734, + "learning_rate": 5.552343711714263e-06, + "loss": 5.0469, + "step": 91430 + }, + { + "epoch": 1.8602498372395835, + "grad_norm": 16.805654525756836, + "learning_rate": 5.551946478686393e-06, + "loss": 5.0684, + "step": 91435 + }, + { + "epoch": 1.8603515625, + "grad_norm": 24.94893455505371, + "learning_rate": 5.551549242131761e-06, + "loss": 4.9284, + "step": 91440 + }, + { + "epoch": 1.8604532877604165, + "grad_norm": 22.89864730834961, + "learning_rate": 5.5511520020529045e-06, + "loss": 4.7637, + "step": 91445 + }, + { + "epoch": 1.8605550130208335, + "grad_norm": 18.556772232055664, + "learning_rate": 5.550754758452361e-06, + "loss": 4.9009, + "step": 91450 + }, + { + "epoch": 1.86065673828125, + "grad_norm": 16.87031364440918, + "learning_rate": 5.5503575113326705e-06, + "loss": 5.2524, + "step": 91455 + }, + { + "epoch": 1.8607584635416665, + "grad_norm": 18.209123611450195, + "learning_rate": 5.54996026069637e-06, + "loss": 4.87, + "step": 91460 + }, + { + "epoch": 1.8608601888020835, + "grad_norm": 19.05719757080078, + "learning_rate": 5.5495630065459975e-06, + "loss": 5.2318, + "step": 91465 + }, + { + "epoch": 1.8609619140625, + "grad_norm": 15.439244270324707, + "learning_rate": 5.5491657488840915e-06, + "loss": 4.8863, + "step": 91470 + }, + { + "epoch": 1.8610636393229165, + "grad_norm": 21.83665657043457, + "learning_rate": 5.548768487713192e-06, + "loss": 4.6295, + "step": 91475 + }, + { + "epoch": 1.8611653645833335, + "grad_norm": 28.294492721557617, + "learning_rate": 5.548371223035834e-06, + "loss": 5.007, + "step": 91480 + }, + { + "epoch": 1.86126708984375, + "grad_norm": 18.214763641357422, + "learning_rate": 5.54797395485456e-06, + "loss": 4.8791, + "step": 91485 + }, + { + "epoch": 1.8613688151041665, + "grad_norm": 19.2043514251709, + "learning_rate": 5.547576683171907e-06, + "loss": 4.8755, + "step": 91490 + }, + { + "epoch": 1.8614705403645835, + "grad_norm": 19.203332901000977, + "learning_rate": 5.547179407990411e-06, + "loss": 4.9164, + "step": 91495 + }, + { + "epoch": 1.861572265625, + "grad_norm": 20.529340744018555, + "learning_rate": 5.546782129312613e-06, + "loss": 5.1883, + "step": 91500 + }, + { + "epoch": 1.8616739908854165, + "grad_norm": 17.389068603515625, + "learning_rate": 5.546384847141049e-06, + "loss": 4.9802, + "step": 91505 + }, + { + "epoch": 1.8617757161458335, + "grad_norm": 16.675228118896484, + "learning_rate": 5.545987561478261e-06, + "loss": 4.8567, + "step": 91510 + }, + { + "epoch": 1.86187744140625, + "grad_norm": 19.627023696899414, + "learning_rate": 5.545590272326786e-06, + "loss": 4.9677, + "step": 91515 + }, + { + "epoch": 1.8619791666666665, + "grad_norm": 15.62098503112793, + "learning_rate": 5.545192979689162e-06, + "loss": 5.0085, + "step": 91520 + }, + { + "epoch": 1.8620808919270835, + "grad_norm": 15.030251502990723, + "learning_rate": 5.544795683567928e-06, + "loss": 4.8076, + "step": 91525 + }, + { + "epoch": 1.8621826171875, + "grad_norm": 23.789628982543945, + "learning_rate": 5.544398383965624e-06, + "loss": 4.9987, + "step": 91530 + }, + { + "epoch": 1.8622843424479165, + "grad_norm": 19.90715980529785, + "learning_rate": 5.544001080884785e-06, + "loss": 4.7972, + "step": 91535 + }, + { + "epoch": 1.8623860677083335, + "grad_norm": 18.65886116027832, + "learning_rate": 5.543603774327953e-06, + "loss": 4.9962, + "step": 91540 + }, + { + "epoch": 1.86248779296875, + "grad_norm": 18.07675552368164, + "learning_rate": 5.543206464297664e-06, + "loss": 4.8524, + "step": 91545 + }, + { + "epoch": 1.8625895182291665, + "grad_norm": 18.06456184387207, + "learning_rate": 5.542809150796458e-06, + "loss": 4.9236, + "step": 91550 + }, + { + "epoch": 1.8626912434895835, + "grad_norm": 15.496565818786621, + "learning_rate": 5.542411833826875e-06, + "loss": 4.8299, + "step": 91555 + }, + { + "epoch": 1.86279296875, + "grad_norm": 17.648040771484375, + "learning_rate": 5.542014513391453e-06, + "loss": 5.3529, + "step": 91560 + }, + { + "epoch": 1.8628946940104165, + "grad_norm": 17.51048469543457, + "learning_rate": 5.541617189492728e-06, + "loss": 4.8222, + "step": 91565 + }, + { + "epoch": 1.8629964192708335, + "grad_norm": 14.223166465759277, + "learning_rate": 5.541219862133243e-06, + "loss": 4.5421, + "step": 91570 + }, + { + "epoch": 1.86309814453125, + "grad_norm": 15.545169830322266, + "learning_rate": 5.540822531315535e-06, + "loss": 4.6482, + "step": 91575 + }, + { + "epoch": 1.8631998697916665, + "grad_norm": 15.419975280761719, + "learning_rate": 5.540425197042142e-06, + "loss": 4.9216, + "step": 91580 + }, + { + "epoch": 1.8633015950520835, + "grad_norm": 18.500701904296875, + "learning_rate": 5.5400278593156035e-06, + "loss": 4.8278, + "step": 91585 + }, + { + "epoch": 1.8634033203125, + "grad_norm": 16.120376586914062, + "learning_rate": 5.539630518138459e-06, + "loss": 4.8113, + "step": 91590 + }, + { + "epoch": 1.8635050455729165, + "grad_norm": 13.130975723266602, + "learning_rate": 5.539233173513248e-06, + "loss": 5.181, + "step": 91595 + }, + { + "epoch": 1.8636067708333335, + "grad_norm": 20.609304428100586, + "learning_rate": 5.538835825442505e-06, + "loss": 4.9179, + "step": 91600 + }, + { + "epoch": 1.86370849609375, + "grad_norm": 19.078323364257812, + "learning_rate": 5.5384384739287745e-06, + "loss": 5.1414, + "step": 91605 + }, + { + "epoch": 1.8638102213541665, + "grad_norm": 26.264602661132812, + "learning_rate": 5.538041118974592e-06, + "loss": 5.0243, + "step": 91610 + }, + { + "epoch": 1.8639119466145835, + "grad_norm": 14.137322425842285, + "learning_rate": 5.537643760582498e-06, + "loss": 5.0514, + "step": 91615 + }, + { + "epoch": 1.864013671875, + "grad_norm": 17.518760681152344, + "learning_rate": 5.537246398755032e-06, + "loss": 4.7923, + "step": 91620 + }, + { + "epoch": 1.8641153971354165, + "grad_norm": 14.3350830078125, + "learning_rate": 5.53684903349473e-06, + "loss": 5.0121, + "step": 91625 + }, + { + "epoch": 1.8642171223958335, + "grad_norm": 13.208763122558594, + "learning_rate": 5.536451664804135e-06, + "loss": 5.0165, + "step": 91630 + }, + { + "epoch": 1.86431884765625, + "grad_norm": 16.36831283569336, + "learning_rate": 5.536054292685783e-06, + "loss": 4.849, + "step": 91635 + }, + { + "epoch": 1.8644205729166665, + "grad_norm": 18.603275299072266, + "learning_rate": 5.535656917142215e-06, + "loss": 4.9912, + "step": 91640 + }, + { + "epoch": 1.8645222981770835, + "grad_norm": 16.562116622924805, + "learning_rate": 5.535259538175969e-06, + "loss": 4.7937, + "step": 91645 + }, + { + "epoch": 1.8646240234375, + "grad_norm": 19.15859031677246, + "learning_rate": 5.534862155789584e-06, + "loss": 4.8173, + "step": 91650 + }, + { + "epoch": 1.8647257486979165, + "grad_norm": 19.83119773864746, + "learning_rate": 5.5344647699856005e-06, + "loss": 4.9007, + "step": 91655 + }, + { + "epoch": 1.8648274739583335, + "grad_norm": 15.427751541137695, + "learning_rate": 5.534067380766555e-06, + "loss": 4.6065, + "step": 91660 + }, + { + "epoch": 1.86492919921875, + "grad_norm": 14.216679573059082, + "learning_rate": 5.533669988134991e-06, + "loss": 4.9487, + "step": 91665 + }, + { + "epoch": 1.8650309244791665, + "grad_norm": 17.647735595703125, + "learning_rate": 5.5332725920934435e-06, + "loss": 4.9106, + "step": 91670 + }, + { + "epoch": 1.8651326497395835, + "grad_norm": 20.065603256225586, + "learning_rate": 5.532875192644454e-06, + "loss": 5.1214, + "step": 91675 + }, + { + "epoch": 1.865234375, + "grad_norm": 17.45903778076172, + "learning_rate": 5.532477789790561e-06, + "loss": 4.8088, + "step": 91680 + }, + { + "epoch": 1.8653361002604165, + "grad_norm": 16.103837966918945, + "learning_rate": 5.532080383534304e-06, + "loss": 4.6723, + "step": 91685 + }, + { + "epoch": 1.8654378255208335, + "grad_norm": 15.144491195678711, + "learning_rate": 5.531682973878223e-06, + "loss": 4.9436, + "step": 91690 + }, + { + "epoch": 1.86553955078125, + "grad_norm": 19.71076202392578, + "learning_rate": 5.531285560824855e-06, + "loss": 4.9958, + "step": 91695 + }, + { + "epoch": 1.8656412760416665, + "grad_norm": 14.363546371459961, + "learning_rate": 5.530888144376742e-06, + "loss": 4.97, + "step": 91700 + }, + { + "epoch": 1.8657430013020835, + "grad_norm": 19.23790168762207, + "learning_rate": 5.530490724536421e-06, + "loss": 4.9705, + "step": 91705 + }, + { + "epoch": 1.8658447265625, + "grad_norm": 23.054039001464844, + "learning_rate": 5.530093301306434e-06, + "loss": 4.9546, + "step": 91710 + }, + { + "epoch": 1.8659464518229165, + "grad_norm": 16.648202896118164, + "learning_rate": 5.529695874689318e-06, + "loss": 4.7992, + "step": 91715 + }, + { + "epoch": 1.8660481770833335, + "grad_norm": 15.727651596069336, + "learning_rate": 5.529298444687615e-06, + "loss": 4.7932, + "step": 91720 + }, + { + "epoch": 1.86614990234375, + "grad_norm": 17.74065399169922, + "learning_rate": 5.528901011303862e-06, + "loss": 5.1059, + "step": 91725 + }, + { + "epoch": 1.8662516276041665, + "grad_norm": 21.089628219604492, + "learning_rate": 5.5285035745406e-06, + "loss": 4.7324, + "step": 91730 + }, + { + "epoch": 1.8663533528645835, + "grad_norm": 15.349169731140137, + "learning_rate": 5.528106134400368e-06, + "loss": 5.0759, + "step": 91735 + }, + { + "epoch": 1.866455078125, + "grad_norm": 18.0784854888916, + "learning_rate": 5.527708690885705e-06, + "loss": 4.9182, + "step": 91740 + }, + { + "epoch": 1.8665568033854165, + "grad_norm": 14.707550048828125, + "learning_rate": 5.5273112439991506e-06, + "loss": 4.9375, + "step": 91745 + }, + { + "epoch": 1.8666585286458335, + "grad_norm": 23.102127075195312, + "learning_rate": 5.526913793743246e-06, + "loss": 4.7292, + "step": 91750 + }, + { + "epoch": 1.86676025390625, + "grad_norm": 21.007184982299805, + "learning_rate": 5.526516340120528e-06, + "loss": 4.9871, + "step": 91755 + }, + { + "epoch": 1.8668619791666665, + "grad_norm": 18.409610748291016, + "learning_rate": 5.5261188831335385e-06, + "loss": 5.0759, + "step": 91760 + }, + { + "epoch": 1.8669637044270835, + "grad_norm": 17.322866439819336, + "learning_rate": 5.525721422784817e-06, + "loss": 5.1099, + "step": 91765 + }, + { + "epoch": 1.8670654296875, + "grad_norm": 18.718273162841797, + "learning_rate": 5.525323959076902e-06, + "loss": 4.7911, + "step": 91770 + }, + { + "epoch": 1.8671671549479165, + "grad_norm": 20.097272872924805, + "learning_rate": 5.524926492012334e-06, + "loss": 4.8528, + "step": 91775 + }, + { + "epoch": 1.8672688802083335, + "grad_norm": 17.92220687866211, + "learning_rate": 5.524529021593651e-06, + "loss": 4.8518, + "step": 91780 + }, + { + "epoch": 1.86737060546875, + "grad_norm": 23.187454223632812, + "learning_rate": 5.524131547823395e-06, + "loss": 5.0477, + "step": 91785 + }, + { + "epoch": 1.8674723307291665, + "grad_norm": 18.42906379699707, + "learning_rate": 5.523734070704103e-06, + "loss": 4.7376, + "step": 91790 + }, + { + "epoch": 1.8675740559895835, + "grad_norm": 28.667909622192383, + "learning_rate": 5.52333659023832e-06, + "loss": 4.9121, + "step": 91795 + }, + { + "epoch": 1.86767578125, + "grad_norm": 17.129667282104492, + "learning_rate": 5.5229391064285795e-06, + "loss": 4.8873, + "step": 91800 + }, + { + "epoch": 1.8677775065104165, + "grad_norm": 15.700472831726074, + "learning_rate": 5.522541619277425e-06, + "loss": 4.8889, + "step": 91805 + }, + { + "epoch": 1.8678792317708335, + "grad_norm": 15.47894287109375, + "learning_rate": 5.522144128787397e-06, + "loss": 4.893, + "step": 91810 + }, + { + "epoch": 1.86798095703125, + "grad_norm": 16.546770095825195, + "learning_rate": 5.5217466349610304e-06, + "loss": 4.6822, + "step": 91815 + }, + { + "epoch": 1.8680826822916665, + "grad_norm": 21.296052932739258, + "learning_rate": 5.521349137800871e-06, + "loss": 4.9107, + "step": 91820 + }, + { + "epoch": 1.8681844075520835, + "grad_norm": 16.18364906311035, + "learning_rate": 5.520951637309455e-06, + "loss": 4.905, + "step": 91825 + }, + { + "epoch": 1.8682861328125, + "grad_norm": 19.04183578491211, + "learning_rate": 5.520554133489324e-06, + "loss": 4.7961, + "step": 91830 + }, + { + "epoch": 1.8683878580729165, + "grad_norm": 18.316696166992188, + "learning_rate": 5.520156626343017e-06, + "loss": 5.0127, + "step": 91835 + }, + { + "epoch": 1.8684895833333335, + "grad_norm": 16.889265060424805, + "learning_rate": 5.519759115873074e-06, + "loss": 4.6725, + "step": 91840 + }, + { + "epoch": 1.86859130859375, + "grad_norm": 15.830732345581055, + "learning_rate": 5.519361602082036e-06, + "loss": 4.7063, + "step": 91845 + }, + { + "epoch": 1.8686930338541665, + "grad_norm": 18.755971908569336, + "learning_rate": 5.518964084972443e-06, + "loss": 5.0789, + "step": 91850 + }, + { + "epoch": 1.8687947591145835, + "grad_norm": 18.355846405029297, + "learning_rate": 5.518566564546832e-06, + "loss": 4.6748, + "step": 91855 + }, + { + "epoch": 1.868896484375, + "grad_norm": 24.7823429107666, + "learning_rate": 5.518169040807747e-06, + "loss": 4.8752, + "step": 91860 + }, + { + "epoch": 1.8689982096354165, + "grad_norm": 20.032991409301758, + "learning_rate": 5.517771513757726e-06, + "loss": 4.8465, + "step": 91865 + }, + { + "epoch": 1.8690999348958335, + "grad_norm": 20.303245544433594, + "learning_rate": 5.517373983399309e-06, + "loss": 4.7957, + "step": 91870 + }, + { + "epoch": 1.86920166015625, + "grad_norm": 18.095335006713867, + "learning_rate": 5.516976449735036e-06, + "loss": 4.764, + "step": 91875 + }, + { + "epoch": 1.8693033854166665, + "grad_norm": 21.080251693725586, + "learning_rate": 5.516578912767449e-06, + "loss": 4.9925, + "step": 91880 + }, + { + "epoch": 1.8694051106770835, + "grad_norm": 18.31336784362793, + "learning_rate": 5.516181372499086e-06, + "loss": 4.5514, + "step": 91885 + }, + { + "epoch": 1.8695068359375, + "grad_norm": 22.163061141967773, + "learning_rate": 5.515783828932488e-06, + "loss": 4.9118, + "step": 91890 + }, + { + "epoch": 1.8696085611979165, + "grad_norm": 17.095794677734375, + "learning_rate": 5.515386282070194e-06, + "loss": 5.1087, + "step": 91895 + }, + { + "epoch": 1.8697102864583335, + "grad_norm": 22.071372985839844, + "learning_rate": 5.514988731914747e-06, + "loss": 4.6992, + "step": 91900 + }, + { + "epoch": 1.86981201171875, + "grad_norm": 18.052600860595703, + "learning_rate": 5.514591178468684e-06, + "loss": 4.961, + "step": 91905 + }, + { + "epoch": 1.8699137369791665, + "grad_norm": 15.977408409118652, + "learning_rate": 5.514193621734548e-06, + "loss": 5.2834, + "step": 91910 + }, + { + "epoch": 1.8700154622395835, + "grad_norm": 20.67996597290039, + "learning_rate": 5.513796061714878e-06, + "loss": 4.9374, + "step": 91915 + }, + { + "epoch": 1.8701171875, + "grad_norm": 21.44523048400879, + "learning_rate": 5.513398498412213e-06, + "loss": 5.0994, + "step": 91920 + }, + { + "epoch": 1.8702189127604165, + "grad_norm": 27.799137115478516, + "learning_rate": 5.513000931829096e-06, + "loss": 4.8525, + "step": 91925 + }, + { + "epoch": 1.8703206380208335, + "grad_norm": 22.7968692779541, + "learning_rate": 5.512603361968065e-06, + "loss": 4.7301, + "step": 91930 + }, + { + "epoch": 1.87042236328125, + "grad_norm": 21.436784744262695, + "learning_rate": 5.512205788831661e-06, + "loss": 4.8889, + "step": 91935 + }, + { + "epoch": 1.8705240885416665, + "grad_norm": 22.59742546081543, + "learning_rate": 5.511808212422425e-06, + "loss": 4.7972, + "step": 91940 + }, + { + "epoch": 1.8706258138020835, + "grad_norm": 20.574995040893555, + "learning_rate": 5.511410632742896e-06, + "loss": 4.9185, + "step": 91945 + }, + { + "epoch": 1.8707275390625, + "grad_norm": 15.957026481628418, + "learning_rate": 5.511013049795617e-06, + "loss": 4.849, + "step": 91950 + }, + { + "epoch": 1.8708292643229165, + "grad_norm": 15.015252113342285, + "learning_rate": 5.5106154635831255e-06, + "loss": 4.8228, + "step": 91955 + }, + { + "epoch": 1.8709309895833335, + "grad_norm": 20.319198608398438, + "learning_rate": 5.510217874107965e-06, + "loss": 5.1289, + "step": 91960 + }, + { + "epoch": 1.87103271484375, + "grad_norm": 17.704484939575195, + "learning_rate": 5.509820281372672e-06, + "loss": 4.9802, + "step": 91965 + }, + { + "epoch": 1.8711344401041665, + "grad_norm": 14.853597640991211, + "learning_rate": 5.50942268537979e-06, + "loss": 4.8752, + "step": 91970 + }, + { + "epoch": 1.8712361653645835, + "grad_norm": 17.529659271240234, + "learning_rate": 5.509025086131859e-06, + "loss": 5.0118, + "step": 91975 + }, + { + "epoch": 1.871337890625, + "grad_norm": 16.452098846435547, + "learning_rate": 5.50862748363142e-06, + "loss": 4.9118, + "step": 91980 + }, + { + "epoch": 1.8714396158854165, + "grad_norm": 16.58048439025879, + "learning_rate": 5.5082298778810116e-06, + "loss": 4.6788, + "step": 91985 + }, + { + "epoch": 1.8715413411458335, + "grad_norm": 29.330041885375977, + "learning_rate": 5.507832268883177e-06, + "loss": 4.7853, + "step": 91990 + }, + { + "epoch": 1.87164306640625, + "grad_norm": 19.033950805664062, + "learning_rate": 5.5074346566404555e-06, + "loss": 5.3195, + "step": 91995 + }, + { + "epoch": 1.8717447916666665, + "grad_norm": 16.960351943969727, + "learning_rate": 5.5070370411553865e-06, + "loss": 4.8863, + "step": 92000 + }, + { + "epoch": 1.8718465169270835, + "grad_norm": 22.654468536376953, + "learning_rate": 5.506639422430512e-06, + "loss": 4.7788, + "step": 92005 + }, + { + "epoch": 1.8719482421875, + "grad_norm": 19.741079330444336, + "learning_rate": 5.506241800468372e-06, + "loss": 4.8988, + "step": 92010 + }, + { + "epoch": 1.8720499674479165, + "grad_norm": 18.70145034790039, + "learning_rate": 5.5058441752715085e-06, + "loss": 5.0602, + "step": 92015 + }, + { + "epoch": 1.8721516927083335, + "grad_norm": 19.51987648010254, + "learning_rate": 5.5054465468424614e-06, + "loss": 4.6912, + "step": 92020 + }, + { + "epoch": 1.87225341796875, + "grad_norm": 21.737022399902344, + "learning_rate": 5.505048915183771e-06, + "loss": 5.1373, + "step": 92025 + }, + { + "epoch": 1.8723551432291665, + "grad_norm": 13.488371849060059, + "learning_rate": 5.504651280297978e-06, + "loss": 4.7599, + "step": 92030 + }, + { + "epoch": 1.8724568684895835, + "grad_norm": 15.93488597869873, + "learning_rate": 5.504253642187625e-06, + "loss": 4.9692, + "step": 92035 + }, + { + "epoch": 1.87255859375, + "grad_norm": 19.885467529296875, + "learning_rate": 5.503856000855249e-06, + "loss": 5.184, + "step": 92040 + }, + { + "epoch": 1.8726603190104165, + "grad_norm": 16.306528091430664, + "learning_rate": 5.503458356303394e-06, + "loss": 4.9891, + "step": 92045 + }, + { + "epoch": 1.8727620442708335, + "grad_norm": 17.902788162231445, + "learning_rate": 5.5030607085346e-06, + "loss": 4.7903, + "step": 92050 + }, + { + "epoch": 1.87286376953125, + "grad_norm": 17.01175308227539, + "learning_rate": 5.5026630575514074e-06, + "loss": 4.9939, + "step": 92055 + }, + { + "epoch": 1.8729654947916665, + "grad_norm": 20.578330993652344, + "learning_rate": 5.502265403356358e-06, + "loss": 4.8048, + "step": 92060 + }, + { + "epoch": 1.8730672200520835, + "grad_norm": 14.498869895935059, + "learning_rate": 5.501867745951992e-06, + "loss": 4.966, + "step": 92065 + }, + { + "epoch": 1.8731689453125, + "grad_norm": 18.47866439819336, + "learning_rate": 5.501470085340851e-06, + "loss": 5.0468, + "step": 92070 + }, + { + "epoch": 1.8732706705729165, + "grad_norm": 21.11434555053711, + "learning_rate": 5.501072421525474e-06, + "loss": 5.0906, + "step": 92075 + }, + { + "epoch": 1.8733723958333335, + "grad_norm": 16.83036231994629, + "learning_rate": 5.500674754508404e-06, + "loss": 4.8063, + "step": 92080 + }, + { + "epoch": 1.87347412109375, + "grad_norm": 20.471904754638672, + "learning_rate": 5.5002770842921814e-06, + "loss": 4.8094, + "step": 92085 + }, + { + "epoch": 1.8735758463541665, + "grad_norm": 19.937580108642578, + "learning_rate": 5.499879410879347e-06, + "loss": 4.9136, + "step": 92090 + }, + { + "epoch": 1.8736775716145835, + "grad_norm": 15.473755836486816, + "learning_rate": 5.49948173427244e-06, + "loss": 4.6377, + "step": 92095 + }, + { + "epoch": 1.873779296875, + "grad_norm": 15.361837387084961, + "learning_rate": 5.499084054474007e-06, + "loss": 4.6964, + "step": 92100 + }, + { + "epoch": 1.8738810221354165, + "grad_norm": 22.763376235961914, + "learning_rate": 5.4986863714865825e-06, + "loss": 5.1131, + "step": 92105 + }, + { + "epoch": 1.8739827473958335, + "grad_norm": 18.827159881591797, + "learning_rate": 5.498288685312712e-06, + "loss": 4.644, + "step": 92110 + }, + { + "epoch": 1.87408447265625, + "grad_norm": 16.68635368347168, + "learning_rate": 5.497890995954933e-06, + "loss": 5.0558, + "step": 92115 + }, + { + "epoch": 1.8741861979166665, + "grad_norm": 15.402429580688477, + "learning_rate": 5.497493303415791e-06, + "loss": 4.6568, + "step": 92120 + }, + { + "epoch": 1.8742879231770835, + "grad_norm": 20.17255210876465, + "learning_rate": 5.497095607697823e-06, + "loss": 5.0198, + "step": 92125 + }, + { + "epoch": 1.8743896484375, + "grad_norm": 17.710878372192383, + "learning_rate": 5.496697908803572e-06, + "loss": 4.8351, + "step": 92130 + }, + { + "epoch": 1.8744913736979165, + "grad_norm": 18.254512786865234, + "learning_rate": 5.49630020673558e-06, + "loss": 4.9545, + "step": 92135 + }, + { + "epoch": 1.8745930989583335, + "grad_norm": 19.23293113708496, + "learning_rate": 5.4959025014963866e-06, + "loss": 5.0248, + "step": 92140 + }, + { + "epoch": 1.87469482421875, + "grad_norm": 21.27627944946289, + "learning_rate": 5.495504793088533e-06, + "loss": 4.9898, + "step": 92145 + }, + { + "epoch": 1.8747965494791665, + "grad_norm": 19.38776397705078, + "learning_rate": 5.495107081514562e-06, + "loss": 4.8536, + "step": 92150 + }, + { + "epoch": 1.8748982747395835, + "grad_norm": 27.82377815246582, + "learning_rate": 5.494709366777013e-06, + "loss": 4.8184, + "step": 92155 + }, + { + "epoch": 1.875, + "grad_norm": 21.430879592895508, + "learning_rate": 5.49431164887843e-06, + "loss": 5.0788, + "step": 92160 + }, + { + "epoch": 1.8751017252604165, + "grad_norm": 15.692948341369629, + "learning_rate": 5.49391392782135e-06, + "loss": 5.0418, + "step": 92165 + }, + { + "epoch": 1.8752034505208335, + "grad_norm": 19.535987854003906, + "learning_rate": 5.493516203608319e-06, + "loss": 4.9478, + "step": 92170 + }, + { + "epoch": 1.87530517578125, + "grad_norm": 15.423843383789062, + "learning_rate": 5.493118476241876e-06, + "loss": 4.8082, + "step": 92175 + }, + { + "epoch": 1.8754069010416665, + "grad_norm": 19.475635528564453, + "learning_rate": 5.4927207457245614e-06, + "loss": 5.0253, + "step": 92180 + }, + { + "epoch": 1.8755086263020835, + "grad_norm": 30.327482223510742, + "learning_rate": 5.492323012058918e-06, + "loss": 4.8394, + "step": 92185 + }, + { + "epoch": 1.8756103515625, + "grad_norm": 19.12451171875, + "learning_rate": 5.491925275247487e-06, + "loss": 4.906, + "step": 92190 + }, + { + "epoch": 1.8757120768229165, + "grad_norm": 17.012739181518555, + "learning_rate": 5.491527535292809e-06, + "loss": 4.943, + "step": 92195 + }, + { + "epoch": 1.8758138020833335, + "grad_norm": 29.156543731689453, + "learning_rate": 5.491129792197427e-06, + "loss": 5.1714, + "step": 92200 + }, + { + "epoch": 1.87591552734375, + "grad_norm": 17.0872859954834, + "learning_rate": 5.49073204596388e-06, + "loss": 4.9024, + "step": 92205 + }, + { + "epoch": 1.8760172526041665, + "grad_norm": 17.52520751953125, + "learning_rate": 5.4903342965947125e-06, + "loss": 4.7178, + "step": 92210 + }, + { + "epoch": 1.8761189778645835, + "grad_norm": 24.160362243652344, + "learning_rate": 5.489936544092465e-06, + "loss": 5.1979, + "step": 92215 + }, + { + "epoch": 1.876220703125, + "grad_norm": 14.534836769104004, + "learning_rate": 5.4895387884596765e-06, + "loss": 4.7879, + "step": 92220 + }, + { + "epoch": 1.8763224283854165, + "grad_norm": 17.546306610107422, + "learning_rate": 5.489141029698892e-06, + "loss": 5.0393, + "step": 92225 + }, + { + "epoch": 1.8764241536458335, + "grad_norm": 15.64705753326416, + "learning_rate": 5.488743267812651e-06, + "loss": 4.9803, + "step": 92230 + }, + { + "epoch": 1.87652587890625, + "grad_norm": 19.67482566833496, + "learning_rate": 5.488345502803495e-06, + "loss": 4.6048, + "step": 92235 + }, + { + "epoch": 1.8766276041666665, + "grad_norm": 15.073782920837402, + "learning_rate": 5.487947734673967e-06, + "loss": 4.9038, + "step": 92240 + }, + { + "epoch": 1.8767293294270835, + "grad_norm": 20.289236068725586, + "learning_rate": 5.487549963426609e-06, + "loss": 4.7889, + "step": 92245 + }, + { + "epoch": 1.8768310546875, + "grad_norm": 22.048892974853516, + "learning_rate": 5.48715218906396e-06, + "loss": 5.0187, + "step": 92250 + }, + { + "epoch": 1.8769327799479165, + "grad_norm": 21.13364601135254, + "learning_rate": 5.486754411588564e-06, + "loss": 4.753, + "step": 92255 + }, + { + "epoch": 1.8770345052083335, + "grad_norm": 14.972295761108398, + "learning_rate": 5.486356631002962e-06, + "loss": 4.8819, + "step": 92260 + }, + { + "epoch": 1.87713623046875, + "grad_norm": 17.584304809570312, + "learning_rate": 5.485958847309695e-06, + "loss": 5.0497, + "step": 92265 + }, + { + "epoch": 1.8772379557291665, + "grad_norm": 19.703882217407227, + "learning_rate": 5.485561060511304e-06, + "loss": 4.8945, + "step": 92270 + }, + { + "epoch": 1.8773396809895835, + "grad_norm": 22.092981338500977, + "learning_rate": 5.485163270610334e-06, + "loss": 5.2204, + "step": 92275 + }, + { + "epoch": 1.87744140625, + "grad_norm": 15.983402252197266, + "learning_rate": 5.484765477609324e-06, + "loss": 4.6897, + "step": 92280 + }, + { + "epoch": 1.8775431315104165, + "grad_norm": 21.201045989990234, + "learning_rate": 5.484367681510816e-06, + "loss": 5.0148, + "step": 92285 + }, + { + "epoch": 1.8776448567708335, + "grad_norm": 22.580297470092773, + "learning_rate": 5.483969882317353e-06, + "loss": 5.0288, + "step": 92290 + }, + { + "epoch": 1.87774658203125, + "grad_norm": 15.681875228881836, + "learning_rate": 5.483572080031475e-06, + "loss": 4.9611, + "step": 92295 + }, + { + "epoch": 1.8778483072916665, + "grad_norm": 23.612884521484375, + "learning_rate": 5.483174274655726e-06, + "loss": 4.9186, + "step": 92300 + }, + { + "epoch": 1.8779500325520835, + "grad_norm": 16.803565979003906, + "learning_rate": 5.482776466192648e-06, + "loss": 4.9747, + "step": 92305 + }, + { + "epoch": 1.8780517578125, + "grad_norm": 18.447437286376953, + "learning_rate": 5.4823786546447785e-06, + "loss": 4.5009, + "step": 92310 + }, + { + "epoch": 1.8781534830729165, + "grad_norm": 17.150978088378906, + "learning_rate": 5.481980840014665e-06, + "loss": 4.6135, + "step": 92315 + }, + { + "epoch": 1.8782552083333335, + "grad_norm": 17.521507263183594, + "learning_rate": 5.481583022304845e-06, + "loss": 4.811, + "step": 92320 + }, + { + "epoch": 1.87835693359375, + "grad_norm": 20.087142944335938, + "learning_rate": 5.481185201517863e-06, + "loss": 4.7902, + "step": 92325 + }, + { + "epoch": 1.8784586588541665, + "grad_norm": 19.05034637451172, + "learning_rate": 5.480787377656263e-06, + "loss": 4.7717, + "step": 92330 + }, + { + "epoch": 1.8785603841145835, + "grad_norm": 15.647688865661621, + "learning_rate": 5.48038955072258e-06, + "loss": 4.7193, + "step": 92335 + }, + { + "epoch": 1.878662109375, + "grad_norm": 16.47617530822754, + "learning_rate": 5.479991720719363e-06, + "loss": 5.2261, + "step": 92340 + }, + { + "epoch": 1.8787638346354165, + "grad_norm": 17.381450653076172, + "learning_rate": 5.47959388764915e-06, + "loss": 4.7442, + "step": 92345 + }, + { + "epoch": 1.8788655598958335, + "grad_norm": 15.194785118103027, + "learning_rate": 5.479196051514485e-06, + "loss": 4.814, + "step": 92350 + }, + { + "epoch": 1.87896728515625, + "grad_norm": 17.47024154663086, + "learning_rate": 5.47879821231791e-06, + "loss": 4.7697, + "step": 92355 + }, + { + "epoch": 1.8790690104166665, + "grad_norm": 16.197864532470703, + "learning_rate": 5.478400370061965e-06, + "loss": 4.9757, + "step": 92360 + }, + { + "epoch": 1.8791707356770835, + "grad_norm": 18.37029457092285, + "learning_rate": 5.478002524749193e-06, + "loss": 4.8625, + "step": 92365 + }, + { + "epoch": 1.8792724609375, + "grad_norm": 21.727643966674805, + "learning_rate": 5.477604676382139e-06, + "loss": 5.015, + "step": 92370 + }, + { + "epoch": 1.8793741861979165, + "grad_norm": 17.592674255371094, + "learning_rate": 5.47720682496334e-06, + "loss": 4.862, + "step": 92375 + }, + { + "epoch": 1.8794759114583335, + "grad_norm": 15.754483222961426, + "learning_rate": 5.476808970495343e-06, + "loss": 4.8922, + "step": 92380 + }, + { + "epoch": 1.87957763671875, + "grad_norm": 24.382877349853516, + "learning_rate": 5.4764111129806866e-06, + "loss": 5.281, + "step": 92385 + }, + { + "epoch": 1.8796793619791665, + "grad_norm": 22.979982376098633, + "learning_rate": 5.4760132524219155e-06, + "loss": 4.7106, + "step": 92390 + }, + { + "epoch": 1.8797810872395835, + "grad_norm": 22.93172836303711, + "learning_rate": 5.475615388821571e-06, + "loss": 5.047, + "step": 92395 + }, + { + "epoch": 1.8798828125, + "grad_norm": 23.578176498413086, + "learning_rate": 5.475217522182195e-06, + "loss": 4.9365, + "step": 92400 + }, + { + "epoch": 1.8799845377604165, + "grad_norm": 24.86281394958496, + "learning_rate": 5.474819652506329e-06, + "loss": 4.7184, + "step": 92405 + }, + { + "epoch": 1.8800862630208335, + "grad_norm": 18.287109375, + "learning_rate": 5.474421779796517e-06, + "loss": 4.7942, + "step": 92410 + }, + { + "epoch": 1.88018798828125, + "grad_norm": 27.712879180908203, + "learning_rate": 5.4740239040553e-06, + "loss": 5.1904, + "step": 92415 + }, + { + "epoch": 1.8802897135416665, + "grad_norm": 16.004600524902344, + "learning_rate": 5.473626025285222e-06, + "loss": 4.9834, + "step": 92420 + }, + { + "epoch": 1.8803914388020835, + "grad_norm": 16.70769691467285, + "learning_rate": 5.4732281434888215e-06, + "loss": 4.9236, + "step": 92425 + }, + { + "epoch": 1.8804931640625, + "grad_norm": 18.473880767822266, + "learning_rate": 5.472830258668644e-06, + "loss": 4.8519, + "step": 92430 + }, + { + "epoch": 1.8805948893229165, + "grad_norm": 15.260566711425781, + "learning_rate": 5.472432370827233e-06, + "loss": 4.845, + "step": 92435 + }, + { + "epoch": 1.8806966145833335, + "grad_norm": 19.290712356567383, + "learning_rate": 5.4720344799671275e-06, + "loss": 4.7846, + "step": 92440 + }, + { + "epoch": 1.88079833984375, + "grad_norm": 19.771385192871094, + "learning_rate": 5.471636586090873e-06, + "loss": 4.854, + "step": 92445 + }, + { + "epoch": 1.8809000651041665, + "grad_norm": 17.235206604003906, + "learning_rate": 5.471238689201008e-06, + "loss": 4.8673, + "step": 92450 + }, + { + "epoch": 1.8810017903645835, + "grad_norm": 17.7247314453125, + "learning_rate": 5.470840789300081e-06, + "loss": 4.9079, + "step": 92455 + }, + { + "epoch": 1.881103515625, + "grad_norm": 15.295035362243652, + "learning_rate": 5.470442886390629e-06, + "loss": 4.9466, + "step": 92460 + }, + { + "epoch": 1.8812052408854165, + "grad_norm": 12.550505638122559, + "learning_rate": 5.470044980475195e-06, + "loss": 4.8991, + "step": 92465 + }, + { + "epoch": 1.8813069661458335, + "grad_norm": 11.375434875488281, + "learning_rate": 5.469647071556325e-06, + "loss": 5.0691, + "step": 92470 + }, + { + "epoch": 1.88140869140625, + "grad_norm": 17.047998428344727, + "learning_rate": 5.469249159636558e-06, + "loss": 4.8843, + "step": 92475 + }, + { + "epoch": 1.8815104166666665, + "grad_norm": 20.708873748779297, + "learning_rate": 5.468851244718437e-06, + "loss": 5.0401, + "step": 92480 + }, + { + "epoch": 1.8816121419270835, + "grad_norm": 16.015716552734375, + "learning_rate": 5.468453326804507e-06, + "loss": 4.9277, + "step": 92485 + }, + { + "epoch": 1.8817138671875, + "grad_norm": 24.64996910095215, + "learning_rate": 5.468055405897308e-06, + "loss": 4.5569, + "step": 92490 + }, + { + "epoch": 1.8818155924479165, + "grad_norm": 16.893451690673828, + "learning_rate": 5.467657481999384e-06, + "loss": 4.8984, + "step": 92495 + }, + { + "epoch": 1.8819173177083335, + "grad_norm": 16.220441818237305, + "learning_rate": 5.4672595551132755e-06, + "loss": 4.9664, + "step": 92500 + }, + { + "epoch": 1.88201904296875, + "grad_norm": 28.614341735839844, + "learning_rate": 5.466861625241528e-06, + "loss": 5.2619, + "step": 92505 + }, + { + "epoch": 1.8821207682291665, + "grad_norm": 16.38055419921875, + "learning_rate": 5.466463692386684e-06, + "loss": 4.949, + "step": 92510 + }, + { + "epoch": 1.8822224934895835, + "grad_norm": 15.709368705749512, + "learning_rate": 5.466065756551283e-06, + "loss": 4.52, + "step": 92515 + }, + { + "epoch": 1.88232421875, + "grad_norm": 18.37717056274414, + "learning_rate": 5.465667817737872e-06, + "loss": 4.9329, + "step": 92520 + }, + { + "epoch": 1.8824259440104165, + "grad_norm": 16.99716567993164, + "learning_rate": 5.465269875948991e-06, + "loss": 5.0496, + "step": 92525 + }, + { + "epoch": 1.8825276692708335, + "grad_norm": 15.939767837524414, + "learning_rate": 5.464871931187182e-06, + "loss": 4.8195, + "step": 92530 + }, + { + "epoch": 1.88262939453125, + "grad_norm": 14.290179252624512, + "learning_rate": 5.46447398345499e-06, + "loss": 4.8196, + "step": 92535 + }, + { + "epoch": 1.8827311197916665, + "grad_norm": 22.48125648498535, + "learning_rate": 5.464076032754956e-06, + "loss": 4.9458, + "step": 92540 + }, + { + "epoch": 1.8828328450520835, + "grad_norm": 14.178638458251953, + "learning_rate": 5.463678079089623e-06, + "loss": 4.7571, + "step": 92545 + }, + { + "epoch": 1.8829345703125, + "grad_norm": 16.101913452148438, + "learning_rate": 5.463280122461536e-06, + "loss": 4.9323, + "step": 92550 + }, + { + "epoch": 1.8830362955729165, + "grad_norm": 18.076229095458984, + "learning_rate": 5.462882162873234e-06, + "loss": 4.7497, + "step": 92555 + }, + { + "epoch": 1.8831380208333335, + "grad_norm": 21.281070709228516, + "learning_rate": 5.462484200327264e-06, + "loss": 4.7726, + "step": 92560 + }, + { + "epoch": 1.88323974609375, + "grad_norm": 16.144929885864258, + "learning_rate": 5.462086234826166e-06, + "loss": 5.0397, + "step": 92565 + }, + { + "epoch": 1.8833414713541665, + "grad_norm": 17.381505966186523, + "learning_rate": 5.461688266372484e-06, + "loss": 4.9973, + "step": 92570 + }, + { + "epoch": 1.8834431966145835, + "grad_norm": 18.518463134765625, + "learning_rate": 5.4612902949687595e-06, + "loss": 4.7459, + "step": 92575 + }, + { + "epoch": 1.883544921875, + "grad_norm": 23.752302169799805, + "learning_rate": 5.460892320617537e-06, + "loss": 4.9733, + "step": 92580 + }, + { + "epoch": 1.8836466471354165, + "grad_norm": 18.557191848754883, + "learning_rate": 5.460494343321358e-06, + "loss": 4.7372, + "step": 92585 + }, + { + "epoch": 1.8837483723958335, + "grad_norm": 26.405303955078125, + "learning_rate": 5.460096363082769e-06, + "loss": 5.0322, + "step": 92590 + }, + { + "epoch": 1.88385009765625, + "grad_norm": 18.813072204589844, + "learning_rate": 5.4596983799043065e-06, + "loss": 4.8847, + "step": 92595 + }, + { + "epoch": 1.8839518229166665, + "grad_norm": 16.491580963134766, + "learning_rate": 5.45930039378852e-06, + "loss": 4.6673, + "step": 92600 + }, + { + "epoch": 1.8840535481770835, + "grad_norm": 19.06393051147461, + "learning_rate": 5.458902404737949e-06, + "loss": 5.1055, + "step": 92605 + }, + { + "epoch": 1.8841552734375, + "grad_norm": 14.117443084716797, + "learning_rate": 5.4585044127551365e-06, + "loss": 4.9607, + "step": 92610 + }, + { + "epoch": 1.8842569986979165, + "grad_norm": 15.121976852416992, + "learning_rate": 5.458106417842628e-06, + "loss": 5.1438, + "step": 92615 + }, + { + "epoch": 1.8843587239583335, + "grad_norm": 14.49691104888916, + "learning_rate": 5.457708420002964e-06, + "loss": 4.6846, + "step": 92620 + }, + { + "epoch": 1.88446044921875, + "grad_norm": 19.492660522460938, + "learning_rate": 5.457310419238688e-06, + "loss": 4.8056, + "step": 92625 + }, + { + "epoch": 1.8845621744791665, + "grad_norm": 22.894412994384766, + "learning_rate": 5.456912415552344e-06, + "loss": 5.0227, + "step": 92630 + }, + { + "epoch": 1.8846638997395835, + "grad_norm": 23.063325881958008, + "learning_rate": 5.456514408946476e-06, + "loss": 5.2618, + "step": 92635 + }, + { + "epoch": 1.884765625, + "grad_norm": 17.966392517089844, + "learning_rate": 5.456116399423623e-06, + "loss": 4.9612, + "step": 92640 + }, + { + "epoch": 1.8848673502604165, + "grad_norm": 23.482717514038086, + "learning_rate": 5.455718386986333e-06, + "loss": 5.2559, + "step": 92645 + }, + { + "epoch": 1.8849690755208335, + "grad_norm": 15.700551986694336, + "learning_rate": 5.455320371637147e-06, + "loss": 5.07, + "step": 92650 + }, + { + "epoch": 1.88507080078125, + "grad_norm": 12.368987083435059, + "learning_rate": 5.454922353378609e-06, + "loss": 4.7827, + "step": 92655 + }, + { + "epoch": 1.8851725260416665, + "grad_norm": 22.309005737304688, + "learning_rate": 5.45452433221326e-06, + "loss": 4.9854, + "step": 92660 + }, + { + "epoch": 1.8852742513020835, + "grad_norm": 17.096044540405273, + "learning_rate": 5.454126308143646e-06, + "loss": 4.962, + "step": 92665 + }, + { + "epoch": 1.8853759765625, + "grad_norm": 16.073041915893555, + "learning_rate": 5.453728281172308e-06, + "loss": 4.8915, + "step": 92670 + }, + { + "epoch": 1.8854777018229165, + "grad_norm": 20.63597869873047, + "learning_rate": 5.453330251301792e-06, + "loss": 4.7607, + "step": 92675 + }, + { + "epoch": 1.8855794270833335, + "grad_norm": 19.564863204956055, + "learning_rate": 5.452932218534639e-06, + "loss": 4.7967, + "step": 92680 + }, + { + "epoch": 1.88568115234375, + "grad_norm": 15.166086196899414, + "learning_rate": 5.452534182873391e-06, + "loss": 4.5821, + "step": 92685 + }, + { + "epoch": 1.8857828776041665, + "grad_norm": 14.6532621383667, + "learning_rate": 5.452136144320596e-06, + "loss": 5.0014, + "step": 92690 + }, + { + "epoch": 1.8858846028645835, + "grad_norm": 16.91562271118164, + "learning_rate": 5.451738102878793e-06, + "loss": 4.8666, + "step": 92695 + }, + { + "epoch": 1.885986328125, + "grad_norm": 19.883926391601562, + "learning_rate": 5.4513400585505274e-06, + "loss": 5.0074, + "step": 92700 + }, + { + "epoch": 1.8860880533854165, + "grad_norm": 16.63773536682129, + "learning_rate": 5.450942011338341e-06, + "loss": 4.8972, + "step": 92705 + }, + { + "epoch": 1.8861897786458335, + "grad_norm": 23.612621307373047, + "learning_rate": 5.45054396124478e-06, + "loss": 4.7, + "step": 92710 + }, + { + "epoch": 1.88629150390625, + "grad_norm": 19.154672622680664, + "learning_rate": 5.450145908272385e-06, + "loss": 4.9875, + "step": 92715 + }, + { + "epoch": 1.8863932291666665, + "grad_norm": 32.0186882019043, + "learning_rate": 5.449747852423701e-06, + "loss": 4.9796, + "step": 92720 + }, + { + "epoch": 1.8864949544270835, + "grad_norm": 15.921988487243652, + "learning_rate": 5.449349793701271e-06, + "loss": 4.8561, + "step": 92725 + }, + { + "epoch": 1.8865966796875, + "grad_norm": 15.353067398071289, + "learning_rate": 5.448951732107638e-06, + "loss": 4.918, + "step": 92730 + }, + { + "epoch": 1.8866984049479165, + "grad_norm": 19.26955795288086, + "learning_rate": 5.4485536676453455e-06, + "loss": 4.9767, + "step": 92735 + }, + { + "epoch": 1.8868001302083335, + "grad_norm": 23.969636917114258, + "learning_rate": 5.448155600316939e-06, + "loss": 5.0087, + "step": 92740 + }, + { + "epoch": 1.88690185546875, + "grad_norm": 19.082157135009766, + "learning_rate": 5.44775753012496e-06, + "loss": 4.9038, + "step": 92745 + }, + { + "epoch": 1.8870035807291665, + "grad_norm": 22.53911781311035, + "learning_rate": 5.4473594570719516e-06, + "loss": 4.7915, + "step": 92750 + }, + { + "epoch": 1.8871053059895835, + "grad_norm": 24.463029861450195, + "learning_rate": 5.4469613811604595e-06, + "loss": 4.9966, + "step": 92755 + }, + { + "epoch": 1.88720703125, + "grad_norm": 21.81464385986328, + "learning_rate": 5.446563302393024e-06, + "loss": 4.8067, + "step": 92760 + }, + { + "epoch": 1.8873087565104165, + "grad_norm": 23.138761520385742, + "learning_rate": 5.446165220772193e-06, + "loss": 4.9215, + "step": 92765 + }, + { + "epoch": 1.8874104817708335, + "grad_norm": 19.232345581054688, + "learning_rate": 5.445767136300507e-06, + "loss": 4.8097, + "step": 92770 + }, + { + "epoch": 1.88751220703125, + "grad_norm": 16.193143844604492, + "learning_rate": 5.445369048980511e-06, + "loss": 5.0036, + "step": 92775 + }, + { + "epoch": 1.8876139322916665, + "grad_norm": 13.825996398925781, + "learning_rate": 5.444970958814747e-06, + "loss": 4.7981, + "step": 92780 + }, + { + "epoch": 1.8877156575520835, + "grad_norm": 16.071088790893555, + "learning_rate": 5.444572865805761e-06, + "loss": 4.7695, + "step": 92785 + }, + { + "epoch": 1.8878173828125, + "grad_norm": 21.41321563720703, + "learning_rate": 5.444174769956094e-06, + "loss": 4.6602, + "step": 92790 + }, + { + "epoch": 1.8879191080729165, + "grad_norm": 23.460336685180664, + "learning_rate": 5.443776671268293e-06, + "loss": 4.8704, + "step": 92795 + }, + { + "epoch": 1.8880208333333335, + "grad_norm": 15.899625778198242, + "learning_rate": 5.4433785697448985e-06, + "loss": 4.9698, + "step": 92800 + }, + { + "epoch": 1.88812255859375, + "grad_norm": 17.85251235961914, + "learning_rate": 5.442980465388457e-06, + "loss": 4.7366, + "step": 92805 + }, + { + "epoch": 1.8882242838541665, + "grad_norm": 17.95648956298828, + "learning_rate": 5.44258235820151e-06, + "loss": 4.7972, + "step": 92810 + }, + { + "epoch": 1.8883260091145835, + "grad_norm": 16.250732421875, + "learning_rate": 5.442184248186603e-06, + "loss": 4.9069, + "step": 92815 + }, + { + "epoch": 1.888427734375, + "grad_norm": 16.074832916259766, + "learning_rate": 5.4417861353462784e-06, + "loss": 4.7973, + "step": 92820 + }, + { + "epoch": 1.8885294596354165, + "grad_norm": 20.83652687072754, + "learning_rate": 5.44138801968308e-06, + "loss": 4.961, + "step": 92825 + }, + { + "epoch": 1.8886311848958335, + "grad_norm": 15.880293846130371, + "learning_rate": 5.440989901199554e-06, + "loss": 5.0235, + "step": 92830 + }, + { + "epoch": 1.88873291015625, + "grad_norm": 13.09022045135498, + "learning_rate": 5.440591779898242e-06, + "loss": 4.9861, + "step": 92835 + }, + { + "epoch": 1.8888346354166665, + "grad_norm": 26.219467163085938, + "learning_rate": 5.440193655781687e-06, + "loss": 5.3559, + "step": 92840 + }, + { + "epoch": 1.8889363606770835, + "grad_norm": 16.024328231811523, + "learning_rate": 5.439795528852436e-06, + "loss": 4.6865, + "step": 92845 + }, + { + "epoch": 1.8890380859375, + "grad_norm": 41.21711349487305, + "learning_rate": 5.439397399113031e-06, + "loss": 5.0717, + "step": 92850 + }, + { + "epoch": 1.8891398111979165, + "grad_norm": 21.81226348876953, + "learning_rate": 5.438999266566015e-06, + "loss": 4.8482, + "step": 92855 + }, + { + "epoch": 1.8892415364583335, + "grad_norm": 13.24376106262207, + "learning_rate": 5.438601131213934e-06, + "loss": 4.7973, + "step": 92860 + }, + { + "epoch": 1.88934326171875, + "grad_norm": 14.634038925170898, + "learning_rate": 5.4382029930593305e-06, + "loss": 5.093, + "step": 92865 + }, + { + "epoch": 1.8894449869791665, + "grad_norm": 17.691936492919922, + "learning_rate": 5.437804852104749e-06, + "loss": 4.9109, + "step": 92870 + }, + { + "epoch": 1.8895467122395835, + "grad_norm": 18.622703552246094, + "learning_rate": 5.437406708352734e-06, + "loss": 4.7516, + "step": 92875 + }, + { + "epoch": 1.8896484375, + "grad_norm": 13.357197761535645, + "learning_rate": 5.437008561805828e-06, + "loss": 4.9523, + "step": 92880 + }, + { + "epoch": 1.8897501627604165, + "grad_norm": 21.406478881835938, + "learning_rate": 5.436610412466578e-06, + "loss": 4.8885, + "step": 92885 + }, + { + "epoch": 1.8898518880208335, + "grad_norm": 20.5092830657959, + "learning_rate": 5.436212260337524e-06, + "loss": 4.9741, + "step": 92890 + }, + { + "epoch": 1.88995361328125, + "grad_norm": 16.67957878112793, + "learning_rate": 5.435814105421212e-06, + "loss": 4.6493, + "step": 92895 + }, + { + "epoch": 1.8900553385416665, + "grad_norm": 16.0117244720459, + "learning_rate": 5.435415947720186e-06, + "loss": 4.7875, + "step": 92900 + }, + { + "epoch": 1.8901570638020835, + "grad_norm": 17.175209045410156, + "learning_rate": 5.435017787236992e-06, + "loss": 5.1057, + "step": 92905 + }, + { + "epoch": 1.8902587890625, + "grad_norm": 17.88678741455078, + "learning_rate": 5.434619623974171e-06, + "loss": 4.7133, + "step": 92910 + }, + { + "epoch": 1.8903605143229165, + "grad_norm": 18.090478897094727, + "learning_rate": 5.43422145793427e-06, + "loss": 4.6775, + "step": 92915 + }, + { + "epoch": 1.8904622395833335, + "grad_norm": 17.942705154418945, + "learning_rate": 5.43382328911983e-06, + "loss": 5.071, + "step": 92920 + }, + { + "epoch": 1.89056396484375, + "grad_norm": 16.577220916748047, + "learning_rate": 5.433425117533399e-06, + "loss": 4.9882, + "step": 92925 + }, + { + "epoch": 1.8906656901041665, + "grad_norm": 18.173856735229492, + "learning_rate": 5.433026943177517e-06, + "loss": 4.9318, + "step": 92930 + }, + { + "epoch": 1.8907674153645835, + "grad_norm": 16.965795516967773, + "learning_rate": 5.432628766054732e-06, + "loss": 4.9321, + "step": 92935 + }, + { + "epoch": 1.890869140625, + "grad_norm": 19.13996124267578, + "learning_rate": 5.432230586167586e-06, + "loss": 4.845, + "step": 92940 + }, + { + "epoch": 1.8909708658854165, + "grad_norm": 16.35723114013672, + "learning_rate": 5.431832403518623e-06, + "loss": 5.1096, + "step": 92945 + }, + { + "epoch": 1.8910725911458335, + "grad_norm": 20.666444778442383, + "learning_rate": 5.43143421811039e-06, + "loss": 4.7196, + "step": 92950 + }, + { + "epoch": 1.89117431640625, + "grad_norm": 14.823539733886719, + "learning_rate": 5.431036029945427e-06, + "loss": 4.9815, + "step": 92955 + }, + { + "epoch": 1.8912760416666665, + "grad_norm": 16.747394561767578, + "learning_rate": 5.430637839026282e-06, + "loss": 4.6128, + "step": 92960 + }, + { + "epoch": 1.8913777669270835, + "grad_norm": 14.567018508911133, + "learning_rate": 5.430239645355497e-06, + "loss": 4.6691, + "step": 92965 + }, + { + "epoch": 1.8914794921875, + "grad_norm": 17.537960052490234, + "learning_rate": 5.429841448935618e-06, + "loss": 5.0177, + "step": 92970 + }, + { + "epoch": 1.8915812174479165, + "grad_norm": 19.309688568115234, + "learning_rate": 5.429443249769189e-06, + "loss": 4.9491, + "step": 92975 + }, + { + "epoch": 1.8916829427083335, + "grad_norm": 16.008337020874023, + "learning_rate": 5.429045047858752e-06, + "loss": 4.7307, + "step": 92980 + }, + { + "epoch": 1.89178466796875, + "grad_norm": 16.94574737548828, + "learning_rate": 5.428646843206855e-06, + "loss": 5.3155, + "step": 92985 + }, + { + "epoch": 1.8918863932291665, + "grad_norm": 18.793928146362305, + "learning_rate": 5.42824863581604e-06, + "loss": 4.9212, + "step": 92990 + }, + { + "epoch": 1.8919881184895835, + "grad_norm": 20.394346237182617, + "learning_rate": 5.427850425688852e-06, + "loss": 5.0598, + "step": 92995 + }, + { + "epoch": 1.89208984375, + "grad_norm": 16.543094635009766, + "learning_rate": 5.4274522128278375e-06, + "loss": 4.9599, + "step": 93000 + }, + { + "epoch": 1.8921915690104165, + "grad_norm": 16.26837921142578, + "learning_rate": 5.427053997235537e-06, + "loss": 4.8531, + "step": 93005 + }, + { + "epoch": 1.8922932942708335, + "grad_norm": 21.217243194580078, + "learning_rate": 5.426655778914497e-06, + "loss": 5.0368, + "step": 93010 + }, + { + "epoch": 1.89239501953125, + "grad_norm": 16.369319915771484, + "learning_rate": 5.426257557867264e-06, + "loss": 4.9473, + "step": 93015 + }, + { + "epoch": 1.8924967447916665, + "grad_norm": 16.338886260986328, + "learning_rate": 5.425859334096378e-06, + "loss": 5.1081, + "step": 93020 + }, + { + "epoch": 1.8925984700520835, + "grad_norm": 17.844717025756836, + "learning_rate": 5.425461107604387e-06, + "loss": 4.9275, + "step": 93025 + }, + { + "epoch": 1.8927001953125, + "grad_norm": 18.341079711914062, + "learning_rate": 5.425062878393835e-06, + "loss": 4.7966, + "step": 93030 + }, + { + "epoch": 1.8928019205729165, + "grad_norm": 24.624765396118164, + "learning_rate": 5.4246646464672655e-06, + "loss": 4.8123, + "step": 93035 + }, + { + "epoch": 1.8929036458333335, + "grad_norm": 20.539344787597656, + "learning_rate": 5.4242664118272235e-06, + "loss": 5.1002, + "step": 93040 + }, + { + "epoch": 1.89300537109375, + "grad_norm": 18.990964889526367, + "learning_rate": 5.423868174476256e-06, + "loss": 4.8695, + "step": 93045 + }, + { + "epoch": 1.8931070963541665, + "grad_norm": 19.391958236694336, + "learning_rate": 5.423469934416903e-06, + "loss": 4.7873, + "step": 93050 + }, + { + "epoch": 1.8932088216145835, + "grad_norm": 19.669126510620117, + "learning_rate": 5.423071691651712e-06, + "loss": 4.9039, + "step": 93055 + }, + { + "epoch": 1.893310546875, + "grad_norm": 15.663667678833008, + "learning_rate": 5.422673446183228e-06, + "loss": 4.8389, + "step": 93060 + }, + { + "epoch": 1.8934122721354165, + "grad_norm": 14.047404289245605, + "learning_rate": 5.4222751980139944e-06, + "loss": 4.7207, + "step": 93065 + }, + { + "epoch": 1.8935139973958335, + "grad_norm": 16.77634620666504, + "learning_rate": 5.421876947146557e-06, + "loss": 4.8345, + "step": 93070 + }, + { + "epoch": 1.89361572265625, + "grad_norm": 17.207504272460938, + "learning_rate": 5.4214786935834585e-06, + "loss": 4.7148, + "step": 93075 + }, + { + "epoch": 1.8937174479166665, + "grad_norm": 16.103609085083008, + "learning_rate": 5.421080437327247e-06, + "loss": 4.7755, + "step": 93080 + }, + { + "epoch": 1.8938191731770835, + "grad_norm": 26.688377380371094, + "learning_rate": 5.420682178380462e-06, + "loss": 5.0967, + "step": 93085 + }, + { + "epoch": 1.8939208984375, + "grad_norm": 16.754596710205078, + "learning_rate": 5.420283916745654e-06, + "loss": 4.9276, + "step": 93090 + }, + { + "epoch": 1.8940226236979165, + "grad_norm": 10.581146240234375, + "learning_rate": 5.419885652425365e-06, + "loss": 4.7292, + "step": 93095 + }, + { + "epoch": 1.8941243489583335, + "grad_norm": 15.564288139343262, + "learning_rate": 5.419487385422138e-06, + "loss": 4.7444, + "step": 93100 + }, + { + "epoch": 1.89422607421875, + "grad_norm": 21.31163787841797, + "learning_rate": 5.419089115738523e-06, + "loss": 4.7936, + "step": 93105 + }, + { + "epoch": 1.8943277994791665, + "grad_norm": 15.529501914978027, + "learning_rate": 5.418690843377058e-06, + "loss": 4.8437, + "step": 93110 + }, + { + "epoch": 1.8944295247395835, + "grad_norm": 17.770597457885742, + "learning_rate": 5.418292568340294e-06, + "loss": 4.7239, + "step": 93115 + }, + { + "epoch": 1.89453125, + "grad_norm": 16.196887969970703, + "learning_rate": 5.417894290630773e-06, + "loss": 4.7343, + "step": 93120 + }, + { + "epoch": 1.8946329752604165, + "grad_norm": 19.7736759185791, + "learning_rate": 5.4174960102510385e-06, + "loss": 4.9825, + "step": 93125 + }, + { + "epoch": 1.8947347005208335, + "grad_norm": 18.960018157958984, + "learning_rate": 5.4170977272036395e-06, + "loss": 4.9223, + "step": 93130 + }, + { + "epoch": 1.89483642578125, + "grad_norm": 14.238478660583496, + "learning_rate": 5.416699441491117e-06, + "loss": 4.9354, + "step": 93135 + }, + { + "epoch": 1.8949381510416665, + "grad_norm": 17.591262817382812, + "learning_rate": 5.416301153116017e-06, + "loss": 5.0873, + "step": 93140 + }, + { + "epoch": 1.8950398763020835, + "grad_norm": 15.01322078704834, + "learning_rate": 5.415902862080885e-06, + "loss": 4.7937, + "step": 93145 + }, + { + "epoch": 1.8951416015625, + "grad_norm": 17.745269775390625, + "learning_rate": 5.415504568388266e-06, + "loss": 4.911, + "step": 93150 + }, + { + "epoch": 1.8952433268229165, + "grad_norm": 18.46556282043457, + "learning_rate": 5.415106272040704e-06, + "loss": 4.7958, + "step": 93155 + }, + { + "epoch": 1.8953450520833335, + "grad_norm": 16.633512496948242, + "learning_rate": 5.414707973040746e-06, + "loss": 4.9251, + "step": 93160 + }, + { + "epoch": 1.89544677734375, + "grad_norm": 16.393068313598633, + "learning_rate": 5.4143096713909345e-06, + "loss": 5.1033, + "step": 93165 + }, + { + "epoch": 1.8955485026041665, + "grad_norm": 17.72931480407715, + "learning_rate": 5.413911367093816e-06, + "loss": 4.897, + "step": 93170 + }, + { + "epoch": 1.8956502278645835, + "grad_norm": 19.924291610717773, + "learning_rate": 5.413513060151936e-06, + "loss": 4.9372, + "step": 93175 + }, + { + "epoch": 1.895751953125, + "grad_norm": 19.439464569091797, + "learning_rate": 5.4131147505678384e-06, + "loss": 4.7848, + "step": 93180 + }, + { + "epoch": 1.8958536783854165, + "grad_norm": 16.046979904174805, + "learning_rate": 5.412716438344068e-06, + "loss": 4.8107, + "step": 93185 + }, + { + "epoch": 1.8959554036458335, + "grad_norm": 13.02415943145752, + "learning_rate": 5.412318123483171e-06, + "loss": 5.0796, + "step": 93190 + }, + { + "epoch": 1.89605712890625, + "grad_norm": 14.873236656188965, + "learning_rate": 5.411919805987693e-06, + "loss": 4.8159, + "step": 93195 + }, + { + "epoch": 1.8961588541666665, + "grad_norm": 13.751240730285645, + "learning_rate": 5.411521485860176e-06, + "loss": 5.0954, + "step": 93200 + }, + { + "epoch": 1.8962605794270835, + "grad_norm": 21.189510345458984, + "learning_rate": 5.4111231631031695e-06, + "loss": 4.8465, + "step": 93205 + }, + { + "epoch": 1.8963623046875, + "grad_norm": 22.41837501525879, + "learning_rate": 5.4107248377192164e-06, + "loss": 4.7008, + "step": 93210 + }, + { + "epoch": 1.8964640299479165, + "grad_norm": 17.99888038635254, + "learning_rate": 5.4103265097108606e-06, + "loss": 4.9375, + "step": 93215 + }, + { + "epoch": 1.8965657552083335, + "grad_norm": 22.561220169067383, + "learning_rate": 5.4099281790806506e-06, + "loss": 4.657, + "step": 93220 + }, + { + "epoch": 1.89666748046875, + "grad_norm": 14.287466049194336, + "learning_rate": 5.409529845831128e-06, + "loss": 4.9869, + "step": 93225 + }, + { + "epoch": 1.8967692057291665, + "grad_norm": 16.172588348388672, + "learning_rate": 5.40913150996484e-06, + "loss": 4.8233, + "step": 93230 + }, + { + "epoch": 1.8968709309895835, + "grad_norm": 20.6529483795166, + "learning_rate": 5.4087331714843315e-06, + "loss": 4.9451, + "step": 93235 + }, + { + "epoch": 1.89697265625, + "grad_norm": 18.464466094970703, + "learning_rate": 5.408334830392148e-06, + "loss": 4.8834, + "step": 93240 + }, + { + "epoch": 1.8970743815104165, + "grad_norm": 21.52555274963379, + "learning_rate": 5.407936486690835e-06, + "loss": 4.7948, + "step": 93245 + }, + { + "epoch": 1.8971761067708335, + "grad_norm": 17.54633903503418, + "learning_rate": 5.407538140382938e-06, + "loss": 4.8008, + "step": 93250 + }, + { + "epoch": 1.89727783203125, + "grad_norm": 28.178863525390625, + "learning_rate": 5.407139791470999e-06, + "loss": 5.0825, + "step": 93255 + }, + { + "epoch": 1.8973795572916665, + "grad_norm": 20.95922088623047, + "learning_rate": 5.406741439957568e-06, + "loss": 5.1951, + "step": 93260 + }, + { + "epoch": 1.8974812825520835, + "grad_norm": 20.220909118652344, + "learning_rate": 5.406343085845187e-06, + "loss": 4.7875, + "step": 93265 + }, + { + "epoch": 1.8975830078125, + "grad_norm": 19.705724716186523, + "learning_rate": 5.405944729136405e-06, + "loss": 4.9501, + "step": 93270 + }, + { + "epoch": 1.8976847330729165, + "grad_norm": 19.59212875366211, + "learning_rate": 5.405546369833763e-06, + "loss": 5.0385, + "step": 93275 + }, + { + "epoch": 1.8977864583333335, + "grad_norm": 15.197775840759277, + "learning_rate": 5.405148007939809e-06, + "loss": 5.0254, + "step": 93280 + }, + { + "epoch": 1.89788818359375, + "grad_norm": 18.28522491455078, + "learning_rate": 5.404749643457088e-06, + "loss": 5.0208, + "step": 93285 + }, + { + "epoch": 1.8979899088541665, + "grad_norm": 16.788360595703125, + "learning_rate": 5.4043512763881445e-06, + "loss": 4.6467, + "step": 93290 + }, + { + "epoch": 1.8980916341145835, + "grad_norm": 13.915765762329102, + "learning_rate": 5.403952906735526e-06, + "loss": 4.7047, + "step": 93295 + }, + { + "epoch": 1.898193359375, + "grad_norm": 18.031208038330078, + "learning_rate": 5.4035545345017745e-06, + "loss": 4.8729, + "step": 93300 + }, + { + "epoch": 1.8982950846354165, + "grad_norm": 13.690703392028809, + "learning_rate": 5.40315615968944e-06, + "loss": 5.0366, + "step": 93305 + }, + { + "epoch": 1.8983968098958335, + "grad_norm": 16.32570457458496, + "learning_rate": 5.402757782301063e-06, + "loss": 4.8062, + "step": 93310 + }, + { + "epoch": 1.89849853515625, + "grad_norm": 17.19338035583496, + "learning_rate": 5.402359402339194e-06, + "loss": 4.8542, + "step": 93315 + }, + { + "epoch": 1.8986002604166665, + "grad_norm": 16.08804702758789, + "learning_rate": 5.401961019806376e-06, + "loss": 4.9576, + "step": 93320 + }, + { + "epoch": 1.8987019856770835, + "grad_norm": 19.49900245666504, + "learning_rate": 5.401562634705154e-06, + "loss": 4.8404, + "step": 93325 + }, + { + "epoch": 1.8988037109375, + "grad_norm": 15.89953327178955, + "learning_rate": 5.401164247038073e-06, + "loss": 4.9637, + "step": 93330 + }, + { + "epoch": 1.8989054361979165, + "grad_norm": 17.45810317993164, + "learning_rate": 5.400765856807683e-06, + "loss": 4.9797, + "step": 93335 + }, + { + "epoch": 1.8990071614583335, + "grad_norm": 21.86698341369629, + "learning_rate": 5.400367464016524e-06, + "loss": 5.1214, + "step": 93340 + }, + { + "epoch": 1.89910888671875, + "grad_norm": 13.254096984863281, + "learning_rate": 5.399969068667144e-06, + "loss": 5.0495, + "step": 93345 + }, + { + "epoch": 1.8992106119791665, + "grad_norm": 17.58172035217285, + "learning_rate": 5.3995706707620896e-06, + "loss": 4.7902, + "step": 93350 + }, + { + "epoch": 1.8993123372395835, + "grad_norm": 13.885978698730469, + "learning_rate": 5.3991722703039064e-06, + "loss": 4.971, + "step": 93355 + }, + { + "epoch": 1.8994140625, + "grad_norm": 21.00851821899414, + "learning_rate": 5.398773867295136e-06, + "loss": 4.8318, + "step": 93360 + }, + { + "epoch": 1.8995157877604165, + "grad_norm": 22.632638931274414, + "learning_rate": 5.39837546173833e-06, + "loss": 4.9264, + "step": 93365 + }, + { + "epoch": 1.8996175130208335, + "grad_norm": 23.522384643554688, + "learning_rate": 5.39797705363603e-06, + "loss": 4.854, + "step": 93370 + }, + { + "epoch": 1.89971923828125, + "grad_norm": 20.558671951293945, + "learning_rate": 5.397578642990783e-06, + "loss": 4.9572, + "step": 93375 + }, + { + "epoch": 1.8998209635416665, + "grad_norm": 18.865009307861328, + "learning_rate": 5.397180229805136e-06, + "loss": 4.8404, + "step": 93380 + }, + { + "epoch": 1.8999226888020835, + "grad_norm": 16.399656295776367, + "learning_rate": 5.396781814081631e-06, + "loss": 4.5224, + "step": 93385 + }, + { + "epoch": 1.9000244140625, + "grad_norm": 20.68812370300293, + "learning_rate": 5.396383395822818e-06, + "loss": 5.0505, + "step": 93390 + }, + { + "epoch": 1.9001261393229165, + "grad_norm": 16.767370223999023, + "learning_rate": 5.39598497503124e-06, + "loss": 4.812, + "step": 93395 + }, + { + "epoch": 1.9002278645833335, + "grad_norm": 14.734038352966309, + "learning_rate": 5.395586551709444e-06, + "loss": 5.0717, + "step": 93400 + }, + { + "epoch": 1.90032958984375, + "grad_norm": 13.306705474853516, + "learning_rate": 5.395188125859976e-06, + "loss": 4.8857, + "step": 93405 + }, + { + "epoch": 1.9004313151041665, + "grad_norm": 14.54050350189209, + "learning_rate": 5.394789697485379e-06, + "loss": 4.7168, + "step": 93410 + }, + { + "epoch": 1.9005330403645835, + "grad_norm": 18.89588737487793, + "learning_rate": 5.394391266588205e-06, + "loss": 5.0223, + "step": 93415 + }, + { + "epoch": 1.900634765625, + "grad_norm": 19.75089454650879, + "learning_rate": 5.393992833170993e-06, + "loss": 4.8497, + "step": 93420 + }, + { + "epoch": 1.9007364908854165, + "grad_norm": 21.222253799438477, + "learning_rate": 5.393594397236294e-06, + "loss": 4.9114, + "step": 93425 + }, + { + "epoch": 1.9008382161458335, + "grad_norm": 18.748563766479492, + "learning_rate": 5.39319595878665e-06, + "loss": 4.871, + "step": 93430 + }, + { + "epoch": 1.90093994140625, + "grad_norm": 20.74974250793457, + "learning_rate": 5.392797517824609e-06, + "loss": 4.5411, + "step": 93435 + }, + { + "epoch": 1.9010416666666665, + "grad_norm": 23.806360244750977, + "learning_rate": 5.3923990743527164e-06, + "loss": 5.0991, + "step": 93440 + }, + { + "epoch": 1.9011433919270835, + "grad_norm": 22.748546600341797, + "learning_rate": 5.392000628373519e-06, + "loss": 5.1101, + "step": 93445 + }, + { + "epoch": 1.9012451171875, + "grad_norm": 13.092248916625977, + "learning_rate": 5.391602179889561e-06, + "loss": 4.7324, + "step": 93450 + }, + { + "epoch": 1.9013468424479165, + "grad_norm": 14.632614135742188, + "learning_rate": 5.391203728903391e-06, + "loss": 4.8281, + "step": 93455 + }, + { + "epoch": 1.9014485677083335, + "grad_norm": 19.22917366027832, + "learning_rate": 5.390805275417552e-06, + "loss": 5.0919, + "step": 93460 + }, + { + "epoch": 1.90155029296875, + "grad_norm": 19.849994659423828, + "learning_rate": 5.390406819434593e-06, + "loss": 4.7824, + "step": 93465 + }, + { + "epoch": 1.9016520182291665, + "grad_norm": 19.584856033325195, + "learning_rate": 5.3900083609570565e-06, + "loss": 4.7393, + "step": 93470 + }, + { + "epoch": 1.9017537434895835, + "grad_norm": 15.77415657043457, + "learning_rate": 5.38960989998749e-06, + "loss": 5.0724, + "step": 93475 + }, + { + "epoch": 1.90185546875, + "grad_norm": 23.97318458557129, + "learning_rate": 5.389211436528442e-06, + "loss": 5.0947, + "step": 93480 + }, + { + "epoch": 1.9019571940104165, + "grad_norm": 18.91376304626465, + "learning_rate": 5.388812970582456e-06, + "loss": 4.7812, + "step": 93485 + }, + { + "epoch": 1.9020589192708335, + "grad_norm": 15.618501663208008, + "learning_rate": 5.388414502152077e-06, + "loss": 4.8018, + "step": 93490 + }, + { + "epoch": 1.90216064453125, + "grad_norm": 28.961305618286133, + "learning_rate": 5.3880160312398545e-06, + "loss": 5.1354, + "step": 93495 + }, + { + "epoch": 1.9022623697916665, + "grad_norm": 21.4664306640625, + "learning_rate": 5.387617557848331e-06, + "loss": 5.2144, + "step": 93500 + }, + { + "epoch": 1.9023640950520835, + "grad_norm": 27.159542083740234, + "learning_rate": 5.387219081980056e-06, + "loss": 4.9367, + "step": 93505 + }, + { + "epoch": 1.9024658203125, + "grad_norm": 13.20370864868164, + "learning_rate": 5.386820603637574e-06, + "loss": 4.9092, + "step": 93510 + }, + { + "epoch": 1.9025675455729165, + "grad_norm": 20.33343505859375, + "learning_rate": 5.386422122823429e-06, + "loss": 5.2621, + "step": 93515 + }, + { + "epoch": 1.9026692708333335, + "grad_norm": 23.964086532592773, + "learning_rate": 5.386023639540171e-06, + "loss": 4.8807, + "step": 93520 + }, + { + "epoch": 1.90277099609375, + "grad_norm": 20.15902328491211, + "learning_rate": 5.385625153790343e-06, + "loss": 4.9495, + "step": 93525 + }, + { + "epoch": 1.9028727213541665, + "grad_norm": 19.57082176208496, + "learning_rate": 5.385226665576495e-06, + "loss": 4.9075, + "step": 93530 + }, + { + "epoch": 1.9029744466145835, + "grad_norm": 17.859079360961914, + "learning_rate": 5.38482817490117e-06, + "loss": 5.0167, + "step": 93535 + }, + { + "epoch": 1.903076171875, + "grad_norm": 14.251797676086426, + "learning_rate": 5.384429681766914e-06, + "loss": 4.7008, + "step": 93540 + }, + { + "epoch": 1.9031778971354165, + "grad_norm": 15.035703659057617, + "learning_rate": 5.384031186176276e-06, + "loss": 4.7211, + "step": 93545 + }, + { + "epoch": 1.9032796223958335, + "grad_norm": 16.5462589263916, + "learning_rate": 5.383632688131799e-06, + "loss": 4.9974, + "step": 93550 + }, + { + "epoch": 1.90338134765625, + "grad_norm": 16.424331665039062, + "learning_rate": 5.383234187636033e-06, + "loss": 5.135, + "step": 93555 + }, + { + "epoch": 1.9034830729166665, + "grad_norm": 15.023905754089355, + "learning_rate": 5.382835684691521e-06, + "loss": 5.3705, + "step": 93560 + }, + { + "epoch": 1.9035847981770835, + "grad_norm": 19.194528579711914, + "learning_rate": 5.38243717930081e-06, + "loss": 4.7295, + "step": 93565 + }, + { + "epoch": 1.9036865234375, + "grad_norm": 16.68707847595215, + "learning_rate": 5.382038671466446e-06, + "loss": 4.8779, + "step": 93570 + }, + { + "epoch": 1.9037882486979165, + "grad_norm": 17.05358123779297, + "learning_rate": 5.3816401611909786e-06, + "loss": 4.8247, + "step": 93575 + }, + { + "epoch": 1.9038899739583335, + "grad_norm": 15.228278160095215, + "learning_rate": 5.3812416484769496e-06, + "loss": 5.0433, + "step": 93580 + }, + { + "epoch": 1.90399169921875, + "grad_norm": 13.996668815612793, + "learning_rate": 5.380843133326909e-06, + "loss": 4.9417, + "step": 93585 + }, + { + "epoch": 1.9040934244791665, + "grad_norm": 11.507285118103027, + "learning_rate": 5.3804446157434e-06, + "loss": 5.0536, + "step": 93590 + }, + { + "epoch": 1.9041951497395835, + "grad_norm": 17.44495391845703, + "learning_rate": 5.380046095728972e-06, + "loss": 4.788, + "step": 93595 + }, + { + "epoch": 1.904296875, + "grad_norm": 20.948665618896484, + "learning_rate": 5.379647573286171e-06, + "loss": 5.0042, + "step": 93600 + }, + { + "epoch": 1.9043986002604165, + "grad_norm": 19.0510196685791, + "learning_rate": 5.379249048417541e-06, + "loss": 4.7954, + "step": 93605 + }, + { + "epoch": 1.9045003255208335, + "grad_norm": 17.111425399780273, + "learning_rate": 5.37885052112563e-06, + "loss": 4.7618, + "step": 93610 + }, + { + "epoch": 1.90460205078125, + "grad_norm": 18.834617614746094, + "learning_rate": 5.378451991412984e-06, + "loss": 5.004, + "step": 93615 + }, + { + "epoch": 1.9047037760416665, + "grad_norm": 14.12595272064209, + "learning_rate": 5.378053459282152e-06, + "loss": 5.0491, + "step": 93620 + }, + { + "epoch": 1.9048055013020835, + "grad_norm": 17.891881942749023, + "learning_rate": 5.377654924735677e-06, + "loss": 4.9216, + "step": 93625 + }, + { + "epoch": 1.9049072265625, + "grad_norm": 19.278085708618164, + "learning_rate": 5.377256387776106e-06, + "loss": 4.9709, + "step": 93630 + }, + { + "epoch": 1.9050089518229165, + "grad_norm": 15.357819557189941, + "learning_rate": 5.3768578484059885e-06, + "loss": 4.8389, + "step": 93635 + }, + { + "epoch": 1.9051106770833335, + "grad_norm": 25.422657012939453, + "learning_rate": 5.376459306627867e-06, + "loss": 4.9286, + "step": 93640 + }, + { + "epoch": 1.90521240234375, + "grad_norm": 20.252487182617188, + "learning_rate": 5.3760607624442915e-06, + "loss": 4.8694, + "step": 93645 + }, + { + "epoch": 1.9053141276041665, + "grad_norm": 19.77918815612793, + "learning_rate": 5.375662215857808e-06, + "loss": 4.9471, + "step": 93650 + }, + { + "epoch": 1.9054158528645835, + "grad_norm": 15.37529468536377, + "learning_rate": 5.375263666870959e-06, + "loss": 4.7608, + "step": 93655 + }, + { + "epoch": 1.905517578125, + "grad_norm": 20.09423065185547, + "learning_rate": 5.3748651154862975e-06, + "loss": 4.9723, + "step": 93660 + }, + { + "epoch": 1.9056193033854165, + "grad_norm": 20.657381057739258, + "learning_rate": 5.3744665617063654e-06, + "loss": 5.0193, + "step": 93665 + }, + { + "epoch": 1.9057210286458335, + "grad_norm": 17.523479461669922, + "learning_rate": 5.37406800553371e-06, + "loss": 4.9013, + "step": 93670 + }, + { + "epoch": 1.90582275390625, + "grad_norm": 15.159598350524902, + "learning_rate": 5.37366944697088e-06, + "loss": 4.7695, + "step": 93675 + }, + { + "epoch": 1.9059244791666665, + "grad_norm": 19.237857818603516, + "learning_rate": 5.373270886020421e-06, + "loss": 4.9486, + "step": 93680 + }, + { + "epoch": 1.9060262044270835, + "grad_norm": 28.138635635375977, + "learning_rate": 5.372872322684879e-06, + "loss": 4.837, + "step": 93685 + }, + { + "epoch": 1.9061279296875, + "grad_norm": 14.398906707763672, + "learning_rate": 5.372473756966802e-06, + "loss": 5.2091, + "step": 93690 + }, + { + "epoch": 1.9062296549479165, + "grad_norm": 16.136035919189453, + "learning_rate": 5.372075188868734e-06, + "loss": 4.7958, + "step": 93695 + }, + { + "epoch": 1.9063313802083335, + "grad_norm": 19.283912658691406, + "learning_rate": 5.371676618393224e-06, + "loss": 4.8726, + "step": 93700 + }, + { + "epoch": 1.90643310546875, + "grad_norm": 16.163570404052734, + "learning_rate": 5.371278045542821e-06, + "loss": 4.9292, + "step": 93705 + }, + { + "epoch": 1.9065348307291665, + "grad_norm": 15.459976196289062, + "learning_rate": 5.370879470320066e-06, + "loss": 4.7892, + "step": 93710 + }, + { + "epoch": 1.9066365559895835, + "grad_norm": 18.235103607177734, + "learning_rate": 5.370480892727511e-06, + "loss": 4.7823, + "step": 93715 + }, + { + "epoch": 1.90673828125, + "grad_norm": 15.068882942199707, + "learning_rate": 5.3700823127677e-06, + "loss": 4.7127, + "step": 93720 + }, + { + "epoch": 1.9068400065104165, + "grad_norm": 16.96445083618164, + "learning_rate": 5.36968373044318e-06, + "loss": 4.6379, + "step": 93725 + }, + { + "epoch": 1.9069417317708335, + "grad_norm": 17.873689651489258, + "learning_rate": 5.3692851457565e-06, + "loss": 4.9919, + "step": 93730 + }, + { + "epoch": 1.90704345703125, + "grad_norm": 19.39373779296875, + "learning_rate": 5.368886558710203e-06, + "loss": 5.3191, + "step": 93735 + }, + { + "epoch": 1.9071451822916665, + "grad_norm": 18.754535675048828, + "learning_rate": 5.36848796930684e-06, + "loss": 5.1111, + "step": 93740 + }, + { + "epoch": 1.9072469075520835, + "grad_norm": 16.950828552246094, + "learning_rate": 5.368089377548954e-06, + "loss": 4.9681, + "step": 93745 + }, + { + "epoch": 1.9073486328125, + "grad_norm": 17.264087677001953, + "learning_rate": 5.367690783439095e-06, + "loss": 4.9039, + "step": 93750 + }, + { + "epoch": 1.9074503580729165, + "grad_norm": 29.43407440185547, + "learning_rate": 5.367292186979809e-06, + "loss": 4.9819, + "step": 93755 + }, + { + "epoch": 1.9075520833333335, + "grad_norm": 18.44590950012207, + "learning_rate": 5.36689358817364e-06, + "loss": 5.0756, + "step": 93760 + }, + { + "epoch": 1.90765380859375, + "grad_norm": 20.07798194885254, + "learning_rate": 5.366494987023141e-06, + "loss": 4.5823, + "step": 93765 + }, + { + "epoch": 1.9077555338541665, + "grad_norm": 16.47015380859375, + "learning_rate": 5.366096383530852e-06, + "loss": 4.8624, + "step": 93770 + }, + { + "epoch": 1.9078572591145835, + "grad_norm": 20.333026885986328, + "learning_rate": 5.365697777699326e-06, + "loss": 4.9279, + "step": 93775 + }, + { + "epoch": 1.907958984375, + "grad_norm": 17.06838035583496, + "learning_rate": 5.3652991695311066e-06, + "loss": 5.1684, + "step": 93780 + }, + { + "epoch": 1.9080607096354165, + "grad_norm": 20.690685272216797, + "learning_rate": 5.36490055902874e-06, + "loss": 4.8196, + "step": 93785 + }, + { + "epoch": 1.9081624348958335, + "grad_norm": 19.520401000976562, + "learning_rate": 5.364501946194775e-06, + "loss": 4.7736, + "step": 93790 + }, + { + "epoch": 1.90826416015625, + "grad_norm": 25.440988540649414, + "learning_rate": 5.364103331031761e-06, + "loss": 4.8635, + "step": 93795 + }, + { + "epoch": 1.9083658854166665, + "grad_norm": 17.567991256713867, + "learning_rate": 5.36370471354224e-06, + "loss": 4.8159, + "step": 93800 + }, + { + "epoch": 1.9084676106770835, + "grad_norm": 18.900548934936523, + "learning_rate": 5.363306093728762e-06, + "loss": 5.244, + "step": 93805 + }, + { + "epoch": 1.9085693359375, + "grad_norm": 21.24300193786621, + "learning_rate": 5.362907471593871e-06, + "loss": 4.7607, + "step": 93810 + }, + { + "epoch": 1.9086710611979165, + "grad_norm": 22.40591049194336, + "learning_rate": 5.36250884714012e-06, + "loss": 4.7952, + "step": 93815 + }, + { + "epoch": 1.9087727864583335, + "grad_norm": 16.846134185791016, + "learning_rate": 5.362110220370052e-06, + "loss": 4.7396, + "step": 93820 + }, + { + "epoch": 1.90887451171875, + "grad_norm": 19.23440933227539, + "learning_rate": 5.361711591286212e-06, + "loss": 5.2389, + "step": 93825 + }, + { + "epoch": 1.9089762369791665, + "grad_norm": 21.825458526611328, + "learning_rate": 5.361312959891151e-06, + "loss": 4.9491, + "step": 93830 + }, + { + "epoch": 1.9090779622395835, + "grad_norm": 15.685689926147461, + "learning_rate": 5.360914326187416e-06, + "loss": 4.7775, + "step": 93835 + }, + { + "epoch": 1.9091796875, + "grad_norm": 22.737648010253906, + "learning_rate": 5.360515690177551e-06, + "loss": 4.9582, + "step": 93840 + }, + { + "epoch": 1.9092814127604165, + "grad_norm": 20.882606506347656, + "learning_rate": 5.3601170518641064e-06, + "loss": 4.9786, + "step": 93845 + }, + { + "epoch": 1.9093831380208335, + "grad_norm": 17.566102981567383, + "learning_rate": 5.359718411249628e-06, + "loss": 4.8224, + "step": 93850 + }, + { + "epoch": 1.90948486328125, + "grad_norm": 16.502573013305664, + "learning_rate": 5.359319768336664e-06, + "loss": 4.9721, + "step": 93855 + }, + { + "epoch": 1.9095865885416665, + "grad_norm": 19.235637664794922, + "learning_rate": 5.358921123127759e-06, + "loss": 4.8198, + "step": 93860 + }, + { + "epoch": 1.9096883138020835, + "grad_norm": 17.533708572387695, + "learning_rate": 5.358522475625464e-06, + "loss": 4.9687, + "step": 93865 + }, + { + "epoch": 1.9097900390625, + "grad_norm": 20.427934646606445, + "learning_rate": 5.358123825832323e-06, + "loss": 5.0619, + "step": 93870 + }, + { + "epoch": 1.9098917643229165, + "grad_norm": 18.087318420410156, + "learning_rate": 5.357725173750885e-06, + "loss": 4.9031, + "step": 93875 + }, + { + "epoch": 1.9099934895833335, + "grad_norm": 15.826823234558105, + "learning_rate": 5.357326519383696e-06, + "loss": 5.0008, + "step": 93880 + }, + { + "epoch": 1.91009521484375, + "grad_norm": 22.23493194580078, + "learning_rate": 5.356927862733305e-06, + "loss": 5.3141, + "step": 93885 + }, + { + "epoch": 1.9101969401041665, + "grad_norm": 19.003690719604492, + "learning_rate": 5.3565292038022576e-06, + "loss": 5.0143, + "step": 93890 + }, + { + "epoch": 1.9102986653645835, + "grad_norm": 19.72605323791504, + "learning_rate": 5.356130542593102e-06, + "loss": 4.8781, + "step": 93895 + }, + { + "epoch": 1.910400390625, + "grad_norm": 14.024299621582031, + "learning_rate": 5.355731879108384e-06, + "loss": 4.9584, + "step": 93900 + }, + { + "epoch": 1.9105021158854165, + "grad_norm": 17.544973373413086, + "learning_rate": 5.355333213350654e-06, + "loss": 4.9395, + "step": 93905 + }, + { + "epoch": 1.9106038411458335, + "grad_norm": 19.233375549316406, + "learning_rate": 5.354934545322457e-06, + "loss": 4.7837, + "step": 93910 + }, + { + "epoch": 1.91070556640625, + "grad_norm": 17.112390518188477, + "learning_rate": 5.354535875026341e-06, + "loss": 4.8925, + "step": 93915 + }, + { + "epoch": 1.9108072916666665, + "grad_norm": 15.6532564163208, + "learning_rate": 5.354137202464853e-06, + "loss": 4.9893, + "step": 93920 + }, + { + "epoch": 1.9109090169270835, + "grad_norm": 17.753398895263672, + "learning_rate": 5.3537385276405405e-06, + "loss": 4.9652, + "step": 93925 + }, + { + "epoch": 1.9110107421875, + "grad_norm": 19.08652114868164, + "learning_rate": 5.353339850555952e-06, + "loss": 5.0079, + "step": 93930 + }, + { + "epoch": 1.9111124674479165, + "grad_norm": 19.317068099975586, + "learning_rate": 5.352941171213634e-06, + "loss": 4.7661, + "step": 93935 + }, + { + "epoch": 1.9112141927083335, + "grad_norm": 17.75146484375, + "learning_rate": 5.352542489616133e-06, + "loss": 4.7307, + "step": 93940 + }, + { + "epoch": 1.91131591796875, + "grad_norm": 19.913270950317383, + "learning_rate": 5.352143805765997e-06, + "loss": 4.918, + "step": 93945 + }, + { + "epoch": 1.9114176432291665, + "grad_norm": 21.46738624572754, + "learning_rate": 5.351745119665775e-06, + "loss": 5.116, + "step": 93950 + }, + { + "epoch": 1.9115193684895835, + "grad_norm": 21.47504234313965, + "learning_rate": 5.351346431318013e-06, + "loss": 4.9289, + "step": 93955 + }, + { + "epoch": 1.91162109375, + "grad_norm": 12.976683616638184, + "learning_rate": 5.350947740725258e-06, + "loss": 5.1367, + "step": 93960 + }, + { + "epoch": 1.9117228190104165, + "grad_norm": 16.870695114135742, + "learning_rate": 5.35054904789006e-06, + "loss": 4.7785, + "step": 93965 + }, + { + "epoch": 1.9118245442708335, + "grad_norm": 24.602148056030273, + "learning_rate": 5.350150352814965e-06, + "loss": 4.8587, + "step": 93970 + }, + { + "epoch": 1.91192626953125, + "grad_norm": 18.73423957824707, + "learning_rate": 5.349751655502517e-06, + "loss": 4.9771, + "step": 93975 + }, + { + "epoch": 1.9120279947916665, + "grad_norm": 20.202024459838867, + "learning_rate": 5.349352955955269e-06, + "loss": 4.7969, + "step": 93980 + }, + { + "epoch": 1.9121297200520835, + "grad_norm": 21.78118324279785, + "learning_rate": 5.348954254175768e-06, + "loss": 4.5693, + "step": 93985 + }, + { + "epoch": 1.9122314453125, + "grad_norm": 28.001060485839844, + "learning_rate": 5.348555550166559e-06, + "loss": 4.9006, + "step": 93990 + }, + { + "epoch": 1.9123331705729165, + "grad_norm": 19.377662658691406, + "learning_rate": 5.34815684393019e-06, + "loss": 4.9701, + "step": 93995 + }, + { + "epoch": 1.9124348958333335, + "grad_norm": 17.799758911132812, + "learning_rate": 5.347758135469211e-06, + "loss": 4.9232, + "step": 94000 + }, + { + "epoch": 1.91253662109375, + "grad_norm": 15.856388092041016, + "learning_rate": 5.347359424786166e-06, + "loss": 4.8535, + "step": 94005 + }, + { + "epoch": 1.9126383463541665, + "grad_norm": 16.151897430419922, + "learning_rate": 5.346960711883605e-06, + "loss": 4.6838, + "step": 94010 + }, + { + "epoch": 1.9127400716145835, + "grad_norm": 12.528669357299805, + "learning_rate": 5.346561996764077e-06, + "loss": 5.0803, + "step": 94015 + }, + { + "epoch": 1.912841796875, + "grad_norm": 14.17497730255127, + "learning_rate": 5.346163279430125e-06, + "loss": 4.8925, + "step": 94020 + }, + { + "epoch": 1.9129435221354165, + "grad_norm": 15.559307098388672, + "learning_rate": 5.345764559884302e-06, + "loss": 4.8177, + "step": 94025 + }, + { + "epoch": 1.9130452473958335, + "grad_norm": 16.363306045532227, + "learning_rate": 5.345365838129152e-06, + "loss": 5.0811, + "step": 94030 + }, + { + "epoch": 1.91314697265625, + "grad_norm": 21.4947452545166, + "learning_rate": 5.344967114167224e-06, + "loss": 4.7538, + "step": 94035 + }, + { + "epoch": 1.9132486979166665, + "grad_norm": 26.62862205505371, + "learning_rate": 5.344568388001067e-06, + "loss": 4.7809, + "step": 94040 + }, + { + "epoch": 1.9133504231770835, + "grad_norm": 18.77497673034668, + "learning_rate": 5.344169659633225e-06, + "loss": 4.7175, + "step": 94045 + }, + { + "epoch": 1.9134521484375, + "grad_norm": 15.040022850036621, + "learning_rate": 5.34377092906625e-06, + "loss": 5.2281, + "step": 94050 + }, + { + "epoch": 1.9135538736979165, + "grad_norm": 32.3785400390625, + "learning_rate": 5.343372196302686e-06, + "loss": 4.8318, + "step": 94055 + }, + { + "epoch": 1.9136555989583335, + "grad_norm": 16.070852279663086, + "learning_rate": 5.342973461345084e-06, + "loss": 5.2356, + "step": 94060 + }, + { + "epoch": 1.91375732421875, + "grad_norm": 17.500730514526367, + "learning_rate": 5.342574724195991e-06, + "loss": 4.9209, + "step": 94065 + }, + { + "epoch": 1.9138590494791665, + "grad_norm": 19.29617691040039, + "learning_rate": 5.342175984857953e-06, + "loss": 5.2406, + "step": 94070 + }, + { + "epoch": 1.9139607747395835, + "grad_norm": 17.775127410888672, + "learning_rate": 5.34177724333352e-06, + "loss": 4.8322, + "step": 94075 + }, + { + "epoch": 1.9140625, + "grad_norm": 21.74266242980957, + "learning_rate": 5.3413784996252385e-06, + "loss": 4.9132, + "step": 94080 + }, + { + "epoch": 1.9141642252604165, + "grad_norm": 18.15887451171875, + "learning_rate": 5.3409797537356565e-06, + "loss": 5.0446, + "step": 94085 + }, + { + "epoch": 1.9142659505208335, + "grad_norm": 17.58346176147461, + "learning_rate": 5.340581005667324e-06, + "loss": 5.0722, + "step": 94090 + }, + { + "epoch": 1.91436767578125, + "grad_norm": 16.356449127197266, + "learning_rate": 5.340182255422784e-06, + "loss": 4.9371, + "step": 94095 + }, + { + "epoch": 1.9144694010416665, + "grad_norm": 23.2813720703125, + "learning_rate": 5.3397835030045885e-06, + "loss": 4.9806, + "step": 94100 + }, + { + "epoch": 1.9145711263020835, + "grad_norm": 16.45219612121582, + "learning_rate": 5.3393847484152835e-06, + "loss": 5.1492, + "step": 94105 + }, + { + "epoch": 1.9146728515625, + "grad_norm": 18.072551727294922, + "learning_rate": 5.33898599165742e-06, + "loss": 4.6768, + "step": 94110 + }, + { + "epoch": 1.9147745768229165, + "grad_norm": 23.48348045349121, + "learning_rate": 5.338587232733542e-06, + "loss": 4.8754, + "step": 94115 + }, + { + "epoch": 1.9148763020833335, + "grad_norm": 19.80550765991211, + "learning_rate": 5.338188471646198e-06, + "loss": 4.7044, + "step": 94120 + }, + { + "epoch": 1.91497802734375, + "grad_norm": 20.11233901977539, + "learning_rate": 5.337789708397939e-06, + "loss": 4.9838, + "step": 94125 + }, + { + "epoch": 1.9150797526041665, + "grad_norm": 15.627205848693848, + "learning_rate": 5.337390942991312e-06, + "loss": 5.3847, + "step": 94130 + }, + { + "epoch": 1.9151814778645835, + "grad_norm": 16.42889976501465, + "learning_rate": 5.336992175428861e-06, + "loss": 5.0191, + "step": 94135 + }, + { + "epoch": 1.915283203125, + "grad_norm": 23.66620635986328, + "learning_rate": 5.336593405713139e-06, + "loss": 4.3426, + "step": 94140 + }, + { + "epoch": 1.9153849283854165, + "grad_norm": 20.937973022460938, + "learning_rate": 5.336194633846691e-06, + "loss": 4.9255, + "step": 94145 + }, + { + "epoch": 1.9154866536458335, + "grad_norm": 17.69077491760254, + "learning_rate": 5.335795859832067e-06, + "loss": 4.9214, + "step": 94150 + }, + { + "epoch": 1.91558837890625, + "grad_norm": 20.09020233154297, + "learning_rate": 5.3353970836718135e-06, + "loss": 4.8678, + "step": 94155 + }, + { + "epoch": 1.9156901041666665, + "grad_norm": 13.999191284179688, + "learning_rate": 5.334998305368479e-06, + "loss": 4.9142, + "step": 94160 + }, + { + "epoch": 1.9157918294270835, + "grad_norm": 23.642126083374023, + "learning_rate": 5.334599524924612e-06, + "loss": 4.7026, + "step": 94165 + }, + { + "epoch": 1.9158935546875, + "grad_norm": 17.92940330505371, + "learning_rate": 5.33420074234276e-06, + "loss": 4.94, + "step": 94170 + }, + { + "epoch": 1.9159952799479165, + "grad_norm": 20.07135581970215, + "learning_rate": 5.333801957625471e-06, + "loss": 5.0121, + "step": 94175 + }, + { + "epoch": 1.9160970052083335, + "grad_norm": 15.668661117553711, + "learning_rate": 5.333403170775294e-06, + "loss": 4.809, + "step": 94180 + }, + { + "epoch": 1.91619873046875, + "grad_norm": 22.627546310424805, + "learning_rate": 5.333004381794775e-06, + "loss": 4.8618, + "step": 94185 + }, + { + "epoch": 1.9163004557291665, + "grad_norm": 15.464651107788086, + "learning_rate": 5.332605590686466e-06, + "loss": 4.9386, + "step": 94190 + }, + { + "epoch": 1.9164021809895835, + "grad_norm": 18.76863670349121, + "learning_rate": 5.332206797452912e-06, + "loss": 4.8155, + "step": 94195 + }, + { + "epoch": 1.91650390625, + "grad_norm": 19.861074447631836, + "learning_rate": 5.331808002096661e-06, + "loss": 4.9057, + "step": 94200 + }, + { + "epoch": 1.9166056315104165, + "grad_norm": 15.922276496887207, + "learning_rate": 5.331409204620264e-06, + "loss": 4.8274, + "step": 94205 + }, + { + "epoch": 1.9167073567708335, + "grad_norm": 17.206039428710938, + "learning_rate": 5.3310104050262655e-06, + "loss": 4.9033, + "step": 94210 + }, + { + "epoch": 1.91680908203125, + "grad_norm": 19.360197067260742, + "learning_rate": 5.330611603317217e-06, + "loss": 5.0607, + "step": 94215 + }, + { + "epoch": 1.9169108072916665, + "grad_norm": 16.412567138671875, + "learning_rate": 5.3302127994956665e-06, + "loss": 4.959, + "step": 94220 + }, + { + "epoch": 1.9170125325520835, + "grad_norm": 15.281131744384766, + "learning_rate": 5.329813993564158e-06, + "loss": 5.0059, + "step": 94225 + }, + { + "epoch": 1.9171142578125, + "grad_norm": 18.115781784057617, + "learning_rate": 5.329415185525245e-06, + "loss": 4.8986, + "step": 94230 + }, + { + "epoch": 1.9172159830729165, + "grad_norm": 16.54869842529297, + "learning_rate": 5.329016375381471e-06, + "loss": 4.8265, + "step": 94235 + }, + { + "epoch": 1.9173177083333335, + "grad_norm": 19.43939208984375, + "learning_rate": 5.32861756313539e-06, + "loss": 4.9725, + "step": 94240 + }, + { + "epoch": 1.91741943359375, + "grad_norm": 17.72829246520996, + "learning_rate": 5.3282187487895456e-06, + "loss": 4.7676, + "step": 94245 + }, + { + "epoch": 1.9175211588541665, + "grad_norm": 17.66028594970703, + "learning_rate": 5.327819932346486e-06, + "loss": 4.8846, + "step": 94250 + }, + { + "epoch": 1.9176228841145835, + "grad_norm": 17.946306228637695, + "learning_rate": 5.327421113808764e-06, + "loss": 4.8133, + "step": 94255 + }, + { + "epoch": 1.917724609375, + "grad_norm": 20.11041831970215, + "learning_rate": 5.327022293178924e-06, + "loss": 4.4691, + "step": 94260 + }, + { + "epoch": 1.9178263346354165, + "grad_norm": 20.007205963134766, + "learning_rate": 5.326623470459514e-06, + "loss": 5.0493, + "step": 94265 + }, + { + "epoch": 1.9179280598958335, + "grad_norm": 26.0732479095459, + "learning_rate": 5.3262246456530856e-06, + "loss": 4.8581, + "step": 94270 + }, + { + "epoch": 1.91802978515625, + "grad_norm": 25.24924087524414, + "learning_rate": 5.325825818762183e-06, + "loss": 4.7078, + "step": 94275 + }, + { + "epoch": 1.9181315104166665, + "grad_norm": 15.691827774047852, + "learning_rate": 5.32542698978936e-06, + "loss": 4.7055, + "step": 94280 + }, + { + "epoch": 1.9182332356770835, + "grad_norm": 20.35869789123535, + "learning_rate": 5.32502815873716e-06, + "loss": 4.8252, + "step": 94285 + }, + { + "epoch": 1.9183349609375, + "grad_norm": 17.164546966552734, + "learning_rate": 5.324629325608133e-06, + "loss": 4.9827, + "step": 94290 + }, + { + "epoch": 1.9184366861979165, + "grad_norm": 13.637750625610352, + "learning_rate": 5.3242304904048284e-06, + "loss": 4.9536, + "step": 94295 + }, + { + "epoch": 1.9185384114583335, + "grad_norm": 14.078155517578125, + "learning_rate": 5.323831653129793e-06, + "loss": 4.9967, + "step": 94300 + }, + { + "epoch": 1.91864013671875, + "grad_norm": 21.02069854736328, + "learning_rate": 5.323432813785577e-06, + "loss": 4.8629, + "step": 94305 + }, + { + "epoch": 1.9187418619791665, + "grad_norm": 18.19855308532715, + "learning_rate": 5.323033972374728e-06, + "loss": 4.9769, + "step": 94310 + }, + { + "epoch": 1.9188435872395835, + "grad_norm": 17.306848526000977, + "learning_rate": 5.322635128899794e-06, + "loss": 4.9023, + "step": 94315 + }, + { + "epoch": 1.9189453125, + "grad_norm": 15.549432754516602, + "learning_rate": 5.322236283363325e-06, + "loss": 4.7756, + "step": 94320 + }, + { + "epoch": 1.9190470377604165, + "grad_norm": 19.28544044494629, + "learning_rate": 5.321837435767867e-06, + "loss": 4.8673, + "step": 94325 + }, + { + "epoch": 1.9191487630208335, + "grad_norm": 16.52393913269043, + "learning_rate": 5.32143858611597e-06, + "loss": 5.0593, + "step": 94330 + }, + { + "epoch": 1.91925048828125, + "grad_norm": 17.17713737487793, + "learning_rate": 5.321039734410184e-06, + "loss": 4.7842, + "step": 94335 + }, + { + "epoch": 1.9193522135416665, + "grad_norm": 16.384408950805664, + "learning_rate": 5.320640880653054e-06, + "loss": 4.7069, + "step": 94340 + }, + { + "epoch": 1.9194539388020835, + "grad_norm": 23.437711715698242, + "learning_rate": 5.320242024847132e-06, + "loss": 4.795, + "step": 94345 + }, + { + "epoch": 1.9195556640625, + "grad_norm": 18.380207061767578, + "learning_rate": 5.319843166994964e-06, + "loss": 5.0235, + "step": 94350 + }, + { + "epoch": 1.9196573893229165, + "grad_norm": 17.127220153808594, + "learning_rate": 5.3194443070991005e-06, + "loss": 4.9696, + "step": 94355 + }, + { + "epoch": 1.9197591145833335, + "grad_norm": 18.374732971191406, + "learning_rate": 5.31904544516209e-06, + "loss": 4.7588, + "step": 94360 + }, + { + "epoch": 1.91986083984375, + "grad_norm": 18.8726806640625, + "learning_rate": 5.318646581186478e-06, + "loss": 4.9951, + "step": 94365 + }, + { + "epoch": 1.9199625651041665, + "grad_norm": 21.023269653320312, + "learning_rate": 5.318247715174818e-06, + "loss": 4.8661, + "step": 94370 + }, + { + "epoch": 1.9200642903645835, + "grad_norm": 18.493074417114258, + "learning_rate": 5.317848847129655e-06, + "loss": 4.9314, + "step": 94375 + }, + { + "epoch": 1.920166015625, + "grad_norm": 38.91717529296875, + "learning_rate": 5.317449977053539e-06, + "loss": 4.8552, + "step": 94380 + }, + { + "epoch": 1.9202677408854165, + "grad_norm": 15.10824203491211, + "learning_rate": 5.317051104949018e-06, + "loss": 4.6263, + "step": 94385 + }, + { + "epoch": 1.9203694661458335, + "grad_norm": 13.553107261657715, + "learning_rate": 5.316652230818642e-06, + "loss": 4.9156, + "step": 94390 + }, + { + "epoch": 1.92047119140625, + "grad_norm": 19.451406478881836, + "learning_rate": 5.3162533546649575e-06, + "loss": 4.7186, + "step": 94395 + }, + { + "epoch": 1.9205729166666665, + "grad_norm": 20.701221466064453, + "learning_rate": 5.315854476490515e-06, + "loss": 4.7561, + "step": 94400 + }, + { + "epoch": 1.9206746419270835, + "grad_norm": 14.511199951171875, + "learning_rate": 5.315455596297863e-06, + "loss": 4.7178, + "step": 94405 + }, + { + "epoch": 1.9207763671875, + "grad_norm": 26.358407974243164, + "learning_rate": 5.31505671408955e-06, + "loss": 4.8793, + "step": 94410 + }, + { + "epoch": 1.9208780924479165, + "grad_norm": 17.552738189697266, + "learning_rate": 5.3146578298681236e-06, + "loss": 4.9505, + "step": 94415 + }, + { + "epoch": 1.9209798177083335, + "grad_norm": 15.777359962463379, + "learning_rate": 5.314258943636134e-06, + "loss": 5.0256, + "step": 94420 + }, + { + "epoch": 1.92108154296875, + "grad_norm": 19.814319610595703, + "learning_rate": 5.313860055396131e-06, + "loss": 4.9304, + "step": 94425 + }, + { + "epoch": 1.9211832682291665, + "grad_norm": 18.954721450805664, + "learning_rate": 5.31346116515066e-06, + "loss": 4.7248, + "step": 94430 + }, + { + "epoch": 1.9212849934895835, + "grad_norm": 16.738229751586914, + "learning_rate": 5.313062272902272e-06, + "loss": 4.8651, + "step": 94435 + }, + { + "epoch": 1.92138671875, + "grad_norm": 16.258459091186523, + "learning_rate": 5.312663378653516e-06, + "loss": 4.7614, + "step": 94440 + }, + { + "epoch": 1.9214884440104165, + "grad_norm": 20.536848068237305, + "learning_rate": 5.312264482406939e-06, + "loss": 4.8599, + "step": 94445 + }, + { + "epoch": 1.9215901692708335, + "grad_norm": 18.92507553100586, + "learning_rate": 5.311865584165093e-06, + "loss": 4.9309, + "step": 94450 + }, + { + "epoch": 1.92169189453125, + "grad_norm": 26.56709098815918, + "learning_rate": 5.311466683930525e-06, + "loss": 5.0613, + "step": 94455 + }, + { + "epoch": 1.9217936197916665, + "grad_norm": 13.7566499710083, + "learning_rate": 5.311067781705782e-06, + "loss": 4.7914, + "step": 94460 + }, + { + "epoch": 1.9218953450520835, + "grad_norm": 15.483705520629883, + "learning_rate": 5.310668877493416e-06, + "loss": 4.7396, + "step": 94465 + }, + { + "epoch": 1.9219970703125, + "grad_norm": 20.140195846557617, + "learning_rate": 5.310269971295973e-06, + "loss": 5.0036, + "step": 94470 + }, + { + "epoch": 1.9220987955729165, + "grad_norm": 22.21231460571289, + "learning_rate": 5.309871063116004e-06, + "loss": 4.8652, + "step": 94475 + }, + { + "epoch": 1.9222005208333335, + "grad_norm": 15.927326202392578, + "learning_rate": 5.309472152956059e-06, + "loss": 4.9073, + "step": 94480 + }, + { + "epoch": 1.92230224609375, + "grad_norm": 17.173765182495117, + "learning_rate": 5.309073240818683e-06, + "loss": 5.0927, + "step": 94485 + }, + { + "epoch": 1.9224039713541665, + "grad_norm": 14.478537559509277, + "learning_rate": 5.308674326706428e-06, + "loss": 4.7794, + "step": 94490 + }, + { + "epoch": 1.9225056966145835, + "grad_norm": 18.704303741455078, + "learning_rate": 5.308275410621842e-06, + "loss": 4.6305, + "step": 94495 + }, + { + "epoch": 1.922607421875, + "grad_norm": 15.061763763427734, + "learning_rate": 5.307876492567473e-06, + "loss": 4.9812, + "step": 94500 + }, + { + "epoch": 1.9227091471354165, + "grad_norm": 17.91948890686035, + "learning_rate": 5.307477572545871e-06, + "loss": 5.2088, + "step": 94505 + }, + { + "epoch": 1.9228108723958335, + "grad_norm": 23.377695083618164, + "learning_rate": 5.307078650559586e-06, + "loss": 4.8279, + "step": 94510 + }, + { + "epoch": 1.92291259765625, + "grad_norm": 17.318113327026367, + "learning_rate": 5.3066797266111655e-06, + "loss": 4.7165, + "step": 94515 + }, + { + "epoch": 1.9230143229166665, + "grad_norm": 14.30473804473877, + "learning_rate": 5.3062808007031595e-06, + "loss": 4.749, + "step": 94520 + }, + { + "epoch": 1.9231160481770835, + "grad_norm": 18.175853729248047, + "learning_rate": 5.305881872838116e-06, + "loss": 4.7986, + "step": 94525 + }, + { + "epoch": 1.9232177734375, + "grad_norm": 18.555330276489258, + "learning_rate": 5.305482943018584e-06, + "loss": 4.6883, + "step": 94530 + }, + { + "epoch": 1.9233194986979165, + "grad_norm": 13.754481315612793, + "learning_rate": 5.3050840112471125e-06, + "loss": 4.9833, + "step": 94535 + }, + { + "epoch": 1.9234212239583335, + "grad_norm": 15.810068130493164, + "learning_rate": 5.3046850775262524e-06, + "loss": 4.9737, + "step": 94540 + }, + { + "epoch": 1.92352294921875, + "grad_norm": 17.26059913635254, + "learning_rate": 5.3042861418585515e-06, + "loss": 5.0156, + "step": 94545 + }, + { + "epoch": 1.9236246744791665, + "grad_norm": 13.13129997253418, + "learning_rate": 5.303887204246558e-06, + "loss": 5.0493, + "step": 94550 + }, + { + "epoch": 1.9237263997395835, + "grad_norm": 21.197887420654297, + "learning_rate": 5.303488264692822e-06, + "loss": 4.6846, + "step": 94555 + }, + { + "epoch": 1.923828125, + "grad_norm": 19.307809829711914, + "learning_rate": 5.303089323199892e-06, + "loss": 4.8264, + "step": 94560 + }, + { + "epoch": 1.9239298502604165, + "grad_norm": 19.28775405883789, + "learning_rate": 5.302690379770316e-06, + "loss": 4.8959, + "step": 94565 + }, + { + "epoch": 1.9240315755208335, + "grad_norm": 15.197155952453613, + "learning_rate": 5.302291434406648e-06, + "loss": 5.1424, + "step": 94570 + }, + { + "epoch": 1.92413330078125, + "grad_norm": 20.507299423217773, + "learning_rate": 5.301892487111431e-06, + "loss": 4.9239, + "step": 94575 + }, + { + "epoch": 1.9242350260416665, + "grad_norm": 17.971832275390625, + "learning_rate": 5.301493537887218e-06, + "loss": 4.891, + "step": 94580 + }, + { + "epoch": 1.9243367513020835, + "grad_norm": 24.044954299926758, + "learning_rate": 5.301094586736556e-06, + "loss": 4.9319, + "step": 94585 + }, + { + "epoch": 1.9244384765625, + "grad_norm": 21.536794662475586, + "learning_rate": 5.3006956336619965e-06, + "loss": 4.9077, + "step": 94590 + }, + { + "epoch": 1.9245402018229165, + "grad_norm": 16.592863082885742, + "learning_rate": 5.300296678666087e-06, + "loss": 4.8008, + "step": 94595 + }, + { + "epoch": 1.9246419270833335, + "grad_norm": 19.110578536987305, + "learning_rate": 5.299897721751376e-06, + "loss": 4.7559, + "step": 94600 + }, + { + "epoch": 1.92474365234375, + "grad_norm": 21.52811050415039, + "learning_rate": 5.299498762920414e-06, + "loss": 5.0624, + "step": 94605 + }, + { + "epoch": 1.9248453776041665, + "grad_norm": 16.227710723876953, + "learning_rate": 5.2990998021757514e-06, + "loss": 4.9743, + "step": 94610 + }, + { + "epoch": 1.9249471028645835, + "grad_norm": 16.83169174194336, + "learning_rate": 5.298700839519935e-06, + "loss": 4.9961, + "step": 94615 + }, + { + "epoch": 1.925048828125, + "grad_norm": 17.058427810668945, + "learning_rate": 5.2983018749555145e-06, + "loss": 4.6849, + "step": 94620 + }, + { + "epoch": 1.9251505533854165, + "grad_norm": 16.521196365356445, + "learning_rate": 5.297902908485041e-06, + "loss": 5.0629, + "step": 94625 + }, + { + "epoch": 1.9252522786458335, + "grad_norm": 19.282411575317383, + "learning_rate": 5.297503940111062e-06, + "loss": 5.0255, + "step": 94630 + }, + { + "epoch": 1.92535400390625, + "grad_norm": 19.0622615814209, + "learning_rate": 5.297104969836127e-06, + "loss": 4.732, + "step": 94635 + }, + { + "epoch": 1.9254557291666665, + "grad_norm": 14.832290649414062, + "learning_rate": 5.296705997662786e-06, + "loss": 4.9059, + "step": 94640 + }, + { + "epoch": 1.9255574544270835, + "grad_norm": 22.782291412353516, + "learning_rate": 5.2963070235935865e-06, + "loss": 4.938, + "step": 94645 + }, + { + "epoch": 1.9256591796875, + "grad_norm": 15.658479690551758, + "learning_rate": 5.29590804763108e-06, + "loss": 4.8316, + "step": 94650 + }, + { + "epoch": 1.9257609049479165, + "grad_norm": 20.857044219970703, + "learning_rate": 5.295509069777814e-06, + "loss": 4.9223, + "step": 94655 + }, + { + "epoch": 1.9258626302083335, + "grad_norm": 21.70479965209961, + "learning_rate": 5.295110090036341e-06, + "loss": 4.8675, + "step": 94660 + }, + { + "epoch": 1.92596435546875, + "grad_norm": 17.73834991455078, + "learning_rate": 5.294711108409207e-06, + "loss": 4.7042, + "step": 94665 + }, + { + "epoch": 1.9260660807291665, + "grad_norm": 33.806053161621094, + "learning_rate": 5.294312124898963e-06, + "loss": 4.7933, + "step": 94670 + }, + { + "epoch": 1.9261678059895835, + "grad_norm": 16.350418090820312, + "learning_rate": 5.293913139508158e-06, + "loss": 4.8121, + "step": 94675 + }, + { + "epoch": 1.92626953125, + "grad_norm": 16.420303344726562, + "learning_rate": 5.293514152239341e-06, + "loss": 4.9397, + "step": 94680 + }, + { + "epoch": 1.9263712565104165, + "grad_norm": 17.228424072265625, + "learning_rate": 5.293115163095062e-06, + "loss": 4.8397, + "step": 94685 + }, + { + "epoch": 1.9264729817708335, + "grad_norm": 13.621172904968262, + "learning_rate": 5.292716172077869e-06, + "loss": 4.8535, + "step": 94690 + }, + { + "epoch": 1.92657470703125, + "grad_norm": 17.267112731933594, + "learning_rate": 5.292317179190315e-06, + "loss": 4.766, + "step": 94695 + }, + { + "epoch": 1.9266764322916665, + "grad_norm": 43.510677337646484, + "learning_rate": 5.2919181844349455e-06, + "loss": 5.2059, + "step": 94700 + }, + { + "epoch": 1.9267781575520835, + "grad_norm": 20.68406867980957, + "learning_rate": 5.2915191878143115e-06, + "loss": 4.7779, + "step": 94705 + }, + { + "epoch": 1.9268798828125, + "grad_norm": 20.4705810546875, + "learning_rate": 5.291120189330962e-06, + "loss": 4.9466, + "step": 94710 + }, + { + "epoch": 1.9269816080729165, + "grad_norm": 16.033987045288086, + "learning_rate": 5.290721188987447e-06, + "loss": 5.3559, + "step": 94715 + }, + { + "epoch": 1.9270833333333335, + "grad_norm": 15.165325164794922, + "learning_rate": 5.2903221867863166e-06, + "loss": 4.7135, + "step": 94720 + }, + { + "epoch": 1.92718505859375, + "grad_norm": 17.340618133544922, + "learning_rate": 5.28992318273012e-06, + "loss": 4.9065, + "step": 94725 + }, + { + "epoch": 1.9272867838541665, + "grad_norm": 16.0295467376709, + "learning_rate": 5.289524176821405e-06, + "loss": 4.9314, + "step": 94730 + }, + { + "epoch": 1.9273885091145835, + "grad_norm": 18.400053024291992, + "learning_rate": 5.289125169062723e-06, + "loss": 4.8885, + "step": 94735 + }, + { + "epoch": 1.927490234375, + "grad_norm": 17.4907169342041, + "learning_rate": 5.288726159456623e-06, + "loss": 4.925, + "step": 94740 + }, + { + "epoch": 1.9275919596354165, + "grad_norm": 18.893064498901367, + "learning_rate": 5.2883271480056544e-06, + "loss": 5.2257, + "step": 94745 + }, + { + "epoch": 1.9276936848958335, + "grad_norm": 16.773006439208984, + "learning_rate": 5.287928134712368e-06, + "loss": 4.8197, + "step": 94750 + }, + { + "epoch": 1.92779541015625, + "grad_norm": 15.573996543884277, + "learning_rate": 5.28752911957931e-06, + "loss": 4.7953, + "step": 94755 + }, + { + "epoch": 1.9278971354166665, + "grad_norm": 16.24224090576172, + "learning_rate": 5.2871301026090335e-06, + "loss": 4.9572, + "step": 94760 + }, + { + "epoch": 1.9279988606770835, + "grad_norm": 16.83684730529785, + "learning_rate": 5.286731083804087e-06, + "loss": 5.1857, + "step": 94765 + }, + { + "epoch": 1.9281005859375, + "grad_norm": 15.80679988861084, + "learning_rate": 5.286332063167019e-06, + "loss": 4.8445, + "step": 94770 + }, + { + "epoch": 1.9282023111979165, + "grad_norm": 24.56260108947754, + "learning_rate": 5.2859330407003794e-06, + "loss": 4.8331, + "step": 94775 + }, + { + "epoch": 1.9283040364583335, + "grad_norm": 22.728410720825195, + "learning_rate": 5.28553401640672e-06, + "loss": 4.8697, + "step": 94780 + }, + { + "epoch": 1.92840576171875, + "grad_norm": 17.2808837890625, + "learning_rate": 5.285134990288588e-06, + "loss": 4.6945, + "step": 94785 + }, + { + "epoch": 1.9285074869791665, + "grad_norm": 19.90939712524414, + "learning_rate": 5.284735962348535e-06, + "loss": 4.7525, + "step": 94790 + }, + { + "epoch": 1.9286092122395835, + "grad_norm": 19.501243591308594, + "learning_rate": 5.284336932589108e-06, + "loss": 4.8053, + "step": 94795 + }, + { + "epoch": 1.9287109375, + "grad_norm": 15.378896713256836, + "learning_rate": 5.28393790101286e-06, + "loss": 4.8961, + "step": 94800 + }, + { + "epoch": 1.9288126627604165, + "grad_norm": 17.583284378051758, + "learning_rate": 5.283538867622338e-06, + "loss": 5.0112, + "step": 94805 + }, + { + "epoch": 1.9289143880208335, + "grad_norm": 19.327415466308594, + "learning_rate": 5.2831398324200936e-06, + "loss": 4.83, + "step": 94810 + }, + { + "epoch": 1.92901611328125, + "grad_norm": 15.679769515991211, + "learning_rate": 5.282740795408674e-06, + "loss": 5.0653, + "step": 94815 + }, + { + "epoch": 1.9291178385416665, + "grad_norm": 16.528444290161133, + "learning_rate": 5.282341756590631e-06, + "loss": 4.8137, + "step": 94820 + }, + { + "epoch": 1.9292195638020835, + "grad_norm": 19.65157699584961, + "learning_rate": 5.281942715968515e-06, + "loss": 5.2638, + "step": 94825 + }, + { + "epoch": 1.9293212890625, + "grad_norm": 17.93821907043457, + "learning_rate": 5.281543673544873e-06, + "loss": 4.98, + "step": 94830 + }, + { + "epoch": 1.9294230143229165, + "grad_norm": 19.43341827392578, + "learning_rate": 5.281144629322257e-06, + "loss": 4.6446, + "step": 94835 + }, + { + "epoch": 1.9295247395833335, + "grad_norm": 15.713711738586426, + "learning_rate": 5.280745583303217e-06, + "loss": 4.9666, + "step": 94840 + }, + { + "epoch": 1.92962646484375, + "grad_norm": 20.84488868713379, + "learning_rate": 5.280346535490299e-06, + "loss": 4.9102, + "step": 94845 + }, + { + "epoch": 1.9297281901041665, + "grad_norm": 20.07949447631836, + "learning_rate": 5.279947485886058e-06, + "loss": 4.5624, + "step": 94850 + }, + { + "epoch": 1.9298299153645835, + "grad_norm": 26.737239837646484, + "learning_rate": 5.279548434493041e-06, + "loss": 4.8846, + "step": 94855 + }, + { + "epoch": 1.929931640625, + "grad_norm": 33.60853576660156, + "learning_rate": 5.279149381313797e-06, + "loss": 4.6343, + "step": 94860 + }, + { + "epoch": 1.9300333658854165, + "grad_norm": 18.029800415039062, + "learning_rate": 5.278750326350879e-06, + "loss": 4.9154, + "step": 94865 + }, + { + "epoch": 1.9301350911458335, + "grad_norm": 26.897071838378906, + "learning_rate": 5.278351269606834e-06, + "loss": 4.8478, + "step": 94870 + }, + { + "epoch": 1.93023681640625, + "grad_norm": 16.547582626342773, + "learning_rate": 5.277952211084213e-06, + "loss": 4.8184, + "step": 94875 + }, + { + "epoch": 1.9303385416666665, + "grad_norm": 21.0667667388916, + "learning_rate": 5.277553150785566e-06, + "loss": 4.7302, + "step": 94880 + }, + { + "epoch": 1.9304402669270835, + "grad_norm": 19.257661819458008, + "learning_rate": 5.277154088713441e-06, + "loss": 4.7524, + "step": 94885 + }, + { + "epoch": 1.9305419921875, + "grad_norm": 18.082487106323242, + "learning_rate": 5.27675502487039e-06, + "loss": 4.9565, + "step": 94890 + }, + { + "epoch": 1.9306437174479165, + "grad_norm": 18.555274963378906, + "learning_rate": 5.276355959258964e-06, + "loss": 5.0002, + "step": 94895 + }, + { + "epoch": 1.9307454427083335, + "grad_norm": 17.418739318847656, + "learning_rate": 5.275956891881708e-06, + "loss": 4.8235, + "step": 94900 + }, + { + "epoch": 1.93084716796875, + "grad_norm": 16.884729385375977, + "learning_rate": 5.275557822741176e-06, + "loss": 4.8021, + "step": 94905 + }, + { + "epoch": 1.9309488932291665, + "grad_norm": 13.995229721069336, + "learning_rate": 5.275158751839919e-06, + "loss": 4.6712, + "step": 94910 + }, + { + "epoch": 1.9310506184895835, + "grad_norm": 18.88153648376465, + "learning_rate": 5.274759679180482e-06, + "loss": 4.7223, + "step": 94915 + }, + { + "epoch": 1.93115234375, + "grad_norm": 17.276046752929688, + "learning_rate": 5.27436060476542e-06, + "loss": 5.0647, + "step": 94920 + }, + { + "epoch": 1.9312540690104165, + "grad_norm": 19.688051223754883, + "learning_rate": 5.27396152859728e-06, + "loss": 4.9293, + "step": 94925 + }, + { + "epoch": 1.9313557942708335, + "grad_norm": 24.654544830322266, + "learning_rate": 5.273562450678615e-06, + "loss": 5.0621, + "step": 94930 + }, + { + "epoch": 1.93145751953125, + "grad_norm": 16.544483184814453, + "learning_rate": 5.2731633710119686e-06, + "loss": 4.768, + "step": 94935 + }, + { + "epoch": 1.9315592447916665, + "grad_norm": 21.32186508178711, + "learning_rate": 5.272764289599898e-06, + "loss": 4.7727, + "step": 94940 + }, + { + "epoch": 1.9316609700520835, + "grad_norm": 19.061298370361328, + "learning_rate": 5.2723652064449505e-06, + "loss": 4.8368, + "step": 94945 + }, + { + "epoch": 1.9317626953125, + "grad_norm": 17.708354949951172, + "learning_rate": 5.2719661215496735e-06, + "loss": 4.9487, + "step": 94950 + }, + { + "epoch": 1.9318644205729165, + "grad_norm": 16.980623245239258, + "learning_rate": 5.271567034916621e-06, + "loss": 5.0482, + "step": 94955 + }, + { + "epoch": 1.9319661458333335, + "grad_norm": 18.882766723632812, + "learning_rate": 5.27116794654834e-06, + "loss": 4.9222, + "step": 94960 + }, + { + "epoch": 1.93206787109375, + "grad_norm": 16.826099395751953, + "learning_rate": 5.2707688564473834e-06, + "loss": 4.9176, + "step": 94965 + }, + { + "epoch": 1.9321695963541665, + "grad_norm": 21.592864990234375, + "learning_rate": 5.2703697646163e-06, + "loss": 4.9066, + "step": 94970 + }, + { + "epoch": 1.9322713216145835, + "grad_norm": 15.437883377075195, + "learning_rate": 5.2699706710576384e-06, + "loss": 4.6326, + "step": 94975 + }, + { + "epoch": 1.932373046875, + "grad_norm": 23.083026885986328, + "learning_rate": 5.2695715757739495e-06, + "loss": 5.0731, + "step": 94980 + }, + { + "epoch": 1.9324747721354165, + "grad_norm": 19.523035049438477, + "learning_rate": 5.269172478767785e-06, + "loss": 5.0167, + "step": 94985 + }, + { + "epoch": 1.9325764973958335, + "grad_norm": 20.828990936279297, + "learning_rate": 5.2687733800416925e-06, + "loss": 4.9168, + "step": 94990 + }, + { + "epoch": 1.93267822265625, + "grad_norm": 15.5794677734375, + "learning_rate": 5.268374279598225e-06, + "loss": 5.0033, + "step": 94995 + }, + { + "epoch": 1.9327799479166665, + "grad_norm": 15.554113388061523, + "learning_rate": 5.2679751774399306e-06, + "loss": 4.7878, + "step": 95000 + }, + { + "epoch": 1.9328816731770835, + "grad_norm": 16.142135620117188, + "learning_rate": 5.26757607356936e-06, + "loss": 5.3038, + "step": 95005 + }, + { + "epoch": 1.9329833984375, + "grad_norm": 17.626819610595703, + "learning_rate": 5.267176967989063e-06, + "loss": 4.7998, + "step": 95010 + }, + { + "epoch": 1.9330851236979165, + "grad_norm": 13.493886947631836, + "learning_rate": 5.26677786070159e-06, + "loss": 4.9103, + "step": 95015 + }, + { + "epoch": 1.9331868489583335, + "grad_norm": 17.43085289001465, + "learning_rate": 5.266378751709491e-06, + "loss": 4.7741, + "step": 95020 + }, + { + "epoch": 1.93328857421875, + "grad_norm": 15.121125221252441, + "learning_rate": 5.2659796410153154e-06, + "loss": 4.9621, + "step": 95025 + }, + { + "epoch": 1.9333902994791665, + "grad_norm": 20.599641799926758, + "learning_rate": 5.265580528621615e-06, + "loss": 4.7837, + "step": 95030 + }, + { + "epoch": 1.9334920247395835, + "grad_norm": 16.717905044555664, + "learning_rate": 5.26518141453094e-06, + "loss": 5.0526, + "step": 95035 + }, + { + "epoch": 1.93359375, + "grad_norm": 15.796806335449219, + "learning_rate": 5.2647822987458395e-06, + "loss": 4.8735, + "step": 95040 + }, + { + "epoch": 1.9336954752604165, + "grad_norm": 19.559410095214844, + "learning_rate": 5.264383181268864e-06, + "loss": 5.1062, + "step": 95045 + }, + { + "epoch": 1.9337972005208335, + "grad_norm": 18.977611541748047, + "learning_rate": 5.2639840621025654e-06, + "loss": 4.9479, + "step": 95050 + }, + { + "epoch": 1.93389892578125, + "grad_norm": 30.156816482543945, + "learning_rate": 5.2635849412494895e-06, + "loss": 5.1389, + "step": 95055 + }, + { + "epoch": 1.9340006510416665, + "grad_norm": 16.72443962097168, + "learning_rate": 5.263185818712192e-06, + "loss": 4.8879, + "step": 95060 + }, + { + "epoch": 1.9341023763020835, + "grad_norm": 16.62199592590332, + "learning_rate": 5.26278669449322e-06, + "loss": 4.5032, + "step": 95065 + }, + { + "epoch": 1.9342041015625, + "grad_norm": 14.507522583007812, + "learning_rate": 5.262387568595124e-06, + "loss": 5.0567, + "step": 95070 + }, + { + "epoch": 1.9343058268229165, + "grad_norm": 18.72462272644043, + "learning_rate": 5.261988441020455e-06, + "loss": 4.8741, + "step": 95075 + }, + { + "epoch": 1.9344075520833335, + "grad_norm": 19.660261154174805, + "learning_rate": 5.261589311771762e-06, + "loss": 4.7404, + "step": 95080 + }, + { + "epoch": 1.93450927734375, + "grad_norm": 19.4941463470459, + "learning_rate": 5.2611901808515985e-06, + "loss": 4.9343, + "step": 95085 + }, + { + "epoch": 1.9346110026041665, + "grad_norm": 19.703510284423828, + "learning_rate": 5.26079104826251e-06, + "loss": 4.7077, + "step": 95090 + }, + { + "epoch": 1.9347127278645835, + "grad_norm": 18.74136734008789, + "learning_rate": 5.260391914007052e-06, + "loss": 4.6897, + "step": 95095 + }, + { + "epoch": 1.934814453125, + "grad_norm": 18.99600601196289, + "learning_rate": 5.259992778087772e-06, + "loss": 4.915, + "step": 95100 + }, + { + "epoch": 1.9349161783854165, + "grad_norm": 16.413673400878906, + "learning_rate": 5.259593640507219e-06, + "loss": 4.9519, + "step": 95105 + }, + { + "epoch": 1.9350179036458335, + "grad_norm": 18.19116973876953, + "learning_rate": 5.259194501267947e-06, + "loss": 4.8717, + "step": 95110 + }, + { + "epoch": 1.93511962890625, + "grad_norm": 17.71071434020996, + "learning_rate": 5.2587953603725035e-06, + "loss": 4.6958, + "step": 95115 + }, + { + "epoch": 1.9352213541666665, + "grad_norm": 20.124509811401367, + "learning_rate": 5.258396217823439e-06, + "loss": 5.0299, + "step": 95120 + }, + { + "epoch": 1.9353230794270835, + "grad_norm": 12.496127128601074, + "learning_rate": 5.257997073623306e-06, + "loss": 4.9631, + "step": 95125 + }, + { + "epoch": 1.9354248046875, + "grad_norm": 19.066957473754883, + "learning_rate": 5.257597927774652e-06, + "loss": 4.7809, + "step": 95130 + }, + { + "epoch": 1.9355265299479165, + "grad_norm": 18.431062698364258, + "learning_rate": 5.257198780280032e-06, + "loss": 4.8641, + "step": 95135 + }, + { + "epoch": 1.9356282552083335, + "grad_norm": 22.403898239135742, + "learning_rate": 5.256799631141991e-06, + "loss": 4.9261, + "step": 95140 + }, + { + "epoch": 1.93572998046875, + "grad_norm": 18.584503173828125, + "learning_rate": 5.256400480363082e-06, + "loss": 4.9672, + "step": 95145 + }, + { + "epoch": 1.9358317057291665, + "grad_norm": 19.666154861450195, + "learning_rate": 5.256001327945857e-06, + "loss": 4.754, + "step": 95150 + }, + { + "epoch": 1.9359334309895835, + "grad_norm": 27.450037002563477, + "learning_rate": 5.255602173892862e-06, + "loss": 5.1249, + "step": 95155 + }, + { + "epoch": 1.93603515625, + "grad_norm": 15.162160873413086, + "learning_rate": 5.255203018206653e-06, + "loss": 4.97, + "step": 95160 + }, + { + "epoch": 1.9361368815104165, + "grad_norm": 18.633718490600586, + "learning_rate": 5.254803860889777e-06, + "loss": 4.8747, + "step": 95165 + }, + { + "epoch": 1.9362386067708335, + "grad_norm": 13.541488647460938, + "learning_rate": 5.254404701944783e-06, + "loss": 4.9268, + "step": 95170 + }, + { + "epoch": 1.93634033203125, + "grad_norm": 14.01423168182373, + "learning_rate": 5.254005541374226e-06, + "loss": 4.5486, + "step": 95175 + }, + { + "epoch": 1.9364420572916665, + "grad_norm": 15.452887535095215, + "learning_rate": 5.253606379180654e-06, + "loss": 5.0725, + "step": 95180 + }, + { + "epoch": 1.9365437825520835, + "grad_norm": 20.9359073638916, + "learning_rate": 5.253207215366616e-06, + "loss": 4.988, + "step": 95185 + }, + { + "epoch": 1.9366455078125, + "grad_norm": 21.32607078552246, + "learning_rate": 5.252808049934666e-06, + "loss": 4.7212, + "step": 95190 + }, + { + "epoch": 1.9367472330729165, + "grad_norm": 19.577396392822266, + "learning_rate": 5.252408882887352e-06, + "loss": 4.9049, + "step": 95195 + }, + { + "epoch": 1.9368489583333335, + "grad_norm": 19.037532806396484, + "learning_rate": 5.252009714227226e-06, + "loss": 4.7438, + "step": 95200 + }, + { + "epoch": 1.93695068359375, + "grad_norm": 19.509384155273438, + "learning_rate": 5.251610543956838e-06, + "loss": 4.7816, + "step": 95205 + }, + { + "epoch": 1.9370524088541665, + "grad_norm": 19.263145446777344, + "learning_rate": 5.251211372078738e-06, + "loss": 5.1531, + "step": 95210 + }, + { + "epoch": 1.9371541341145835, + "grad_norm": 20.089210510253906, + "learning_rate": 5.250812198595477e-06, + "loss": 4.796, + "step": 95215 + }, + { + "epoch": 1.937255859375, + "grad_norm": 15.275016784667969, + "learning_rate": 5.2504130235096064e-06, + "loss": 4.8869, + "step": 95220 + }, + { + "epoch": 1.9373575846354165, + "grad_norm": 20.955707550048828, + "learning_rate": 5.250013846823676e-06, + "loss": 4.9759, + "step": 95225 + }, + { + "epoch": 1.9374593098958335, + "grad_norm": 15.607915878295898, + "learning_rate": 5.249614668540236e-06, + "loss": 5.2798, + "step": 95230 + }, + { + "epoch": 1.93756103515625, + "grad_norm": 16.641483306884766, + "learning_rate": 5.249215488661838e-06, + "loss": 5.0699, + "step": 95235 + }, + { + "epoch": 1.9376627604166665, + "grad_norm": 15.140823364257812, + "learning_rate": 5.248816307191032e-06, + "loss": 4.767, + "step": 95240 + }, + { + "epoch": 1.9377644856770835, + "grad_norm": 18.103940963745117, + "learning_rate": 5.248417124130367e-06, + "loss": 4.8645, + "step": 95245 + }, + { + "epoch": 1.9378662109375, + "grad_norm": 16.859560012817383, + "learning_rate": 5.248017939482398e-06, + "loss": 4.9962, + "step": 95250 + }, + { + "epoch": 1.9379679361979165, + "grad_norm": 17.10369873046875, + "learning_rate": 5.247618753249672e-06, + "loss": 5.4663, + "step": 95255 + }, + { + "epoch": 1.9380696614583335, + "grad_norm": 15.904223442077637, + "learning_rate": 5.2472195654347406e-06, + "loss": 4.7443, + "step": 95260 + }, + { + "epoch": 1.93817138671875, + "grad_norm": 17.710214614868164, + "learning_rate": 5.246820376040155e-06, + "loss": 4.8798, + "step": 95265 + }, + { + "epoch": 1.9382731119791665, + "grad_norm": 16.95535659790039, + "learning_rate": 5.246421185068467e-06, + "loss": 4.6654, + "step": 95270 + }, + { + "epoch": 1.9383748372395835, + "grad_norm": 13.886948585510254, + "learning_rate": 5.246021992522223e-06, + "loss": 4.795, + "step": 95275 + }, + { + "epoch": 1.9384765625, + "grad_norm": 24.6924991607666, + "learning_rate": 5.245622798403979e-06, + "loss": 4.747, + "step": 95280 + }, + { + "epoch": 1.9385782877604165, + "grad_norm": 15.000682830810547, + "learning_rate": 5.245223602716281e-06, + "loss": 4.479, + "step": 95285 + }, + { + "epoch": 1.9386800130208335, + "grad_norm": 14.613137245178223, + "learning_rate": 5.2448244054616834e-06, + "loss": 4.9134, + "step": 95290 + }, + { + "epoch": 1.93878173828125, + "grad_norm": 16.47310447692871, + "learning_rate": 5.244425206642736e-06, + "loss": 4.8896, + "step": 95295 + }, + { + "epoch": 1.9388834635416665, + "grad_norm": 27.70709991455078, + "learning_rate": 5.244026006261987e-06, + "loss": 4.7911, + "step": 95300 + }, + { + "epoch": 1.9389851888020835, + "grad_norm": 12.27614974975586, + "learning_rate": 5.2436268043219914e-06, + "loss": 4.8248, + "step": 95305 + }, + { + "epoch": 1.9390869140625, + "grad_norm": 22.310579299926758, + "learning_rate": 5.243227600825297e-06, + "loss": 4.957, + "step": 95310 + }, + { + "epoch": 1.9391886393229165, + "grad_norm": 17.54094886779785, + "learning_rate": 5.2428283957744556e-06, + "loss": 5.3652, + "step": 95315 + }, + { + "epoch": 1.9392903645833335, + "grad_norm": 16.842742919921875, + "learning_rate": 5.242429189172018e-06, + "loss": 4.847, + "step": 95320 + }, + { + "epoch": 1.93939208984375, + "grad_norm": 18.24949836730957, + "learning_rate": 5.242029981020535e-06, + "loss": 4.6828, + "step": 95325 + }, + { + "epoch": 1.9394938151041665, + "grad_norm": 16.187641143798828, + "learning_rate": 5.241630771322557e-06, + "loss": 4.8058, + "step": 95330 + }, + { + "epoch": 1.9395955403645835, + "grad_norm": 25.42560386657715, + "learning_rate": 5.241231560080635e-06, + "loss": 5.0835, + "step": 95335 + }, + { + "epoch": 1.939697265625, + "grad_norm": 16.792871475219727, + "learning_rate": 5.24083234729732e-06, + "loss": 4.8543, + "step": 95340 + }, + { + "epoch": 1.9397989908854165, + "grad_norm": 20.084827423095703, + "learning_rate": 5.240433132975162e-06, + "loss": 5.0635, + "step": 95345 + }, + { + "epoch": 1.9399007161458335, + "grad_norm": 18.166574478149414, + "learning_rate": 5.2400339171167134e-06, + "loss": 4.8788, + "step": 95350 + }, + { + "epoch": 1.94000244140625, + "grad_norm": 17.030553817749023, + "learning_rate": 5.239634699724525e-06, + "loss": 4.9659, + "step": 95355 + }, + { + "epoch": 1.9401041666666665, + "grad_norm": 18.38336753845215, + "learning_rate": 5.2392354808011455e-06, + "loss": 4.5863, + "step": 95360 + }, + { + "epoch": 1.9402058919270835, + "grad_norm": 19.52803611755371, + "learning_rate": 5.238836260349128e-06, + "loss": 5.0046, + "step": 95365 + }, + { + "epoch": 1.9403076171875, + "grad_norm": 17.261295318603516, + "learning_rate": 5.238437038371023e-06, + "loss": 4.8062, + "step": 95370 + }, + { + "epoch": 1.9404093424479165, + "grad_norm": 17.614166259765625, + "learning_rate": 5.23803781486938e-06, + "loss": 4.5874, + "step": 95375 + }, + { + "epoch": 1.9405110677083335, + "grad_norm": 16.0295352935791, + "learning_rate": 5.2376385898467515e-06, + "loss": 4.6562, + "step": 95380 + }, + { + "epoch": 1.94061279296875, + "grad_norm": 18.103971481323242, + "learning_rate": 5.2372393633056884e-06, + "loss": 4.9843, + "step": 95385 + }, + { + "epoch": 1.9407145182291665, + "grad_norm": 17.956787109375, + "learning_rate": 5.23684013524874e-06, + "loss": 4.6662, + "step": 95390 + }, + { + "epoch": 1.9408162434895835, + "grad_norm": 21.301294326782227, + "learning_rate": 5.236440905678459e-06, + "loss": 4.8671, + "step": 95395 + }, + { + "epoch": 1.94091796875, + "grad_norm": 15.81927490234375, + "learning_rate": 5.236041674597395e-06, + "loss": 4.7855, + "step": 95400 + }, + { + "epoch": 1.9410196940104165, + "grad_norm": 22.08940887451172, + "learning_rate": 5.2356424420081e-06, + "loss": 4.9597, + "step": 95405 + }, + { + "epoch": 1.9411214192708335, + "grad_norm": 21.30187225341797, + "learning_rate": 5.235243207913125e-06, + "loss": 5.0295, + "step": 95410 + }, + { + "epoch": 1.94122314453125, + "grad_norm": 23.400426864624023, + "learning_rate": 5.23484397231502e-06, + "loss": 4.8794, + "step": 95415 + }, + { + "epoch": 1.9413248697916665, + "grad_norm": 16.637104034423828, + "learning_rate": 5.234444735216338e-06, + "loss": 4.6417, + "step": 95420 + }, + { + "epoch": 1.9414265950520835, + "grad_norm": 19.972566604614258, + "learning_rate": 5.2340454966196275e-06, + "loss": 4.7401, + "step": 95425 + }, + { + "epoch": 1.9415283203125, + "grad_norm": 17.214754104614258, + "learning_rate": 5.233646256527439e-06, + "loss": 5.1026, + "step": 95430 + }, + { + "epoch": 1.9416300455729165, + "grad_norm": 18.275577545166016, + "learning_rate": 5.233247014942327e-06, + "loss": 4.8101, + "step": 95435 + }, + { + "epoch": 1.9417317708333335, + "grad_norm": 15.469772338867188, + "learning_rate": 5.23284777186684e-06, + "loss": 4.8026, + "step": 95440 + }, + { + "epoch": 1.94183349609375, + "grad_norm": 19.712125778198242, + "learning_rate": 5.23244852730353e-06, + "loss": 5.0295, + "step": 95445 + }, + { + "epoch": 1.9419352213541665, + "grad_norm": 17.009634017944336, + "learning_rate": 5.232049281254947e-06, + "loss": 4.7527, + "step": 95450 + }, + { + "epoch": 1.9420369466145835, + "grad_norm": 19.236949920654297, + "learning_rate": 5.231650033723642e-06, + "loss": 4.8204, + "step": 95455 + }, + { + "epoch": 1.942138671875, + "grad_norm": 17.20777702331543, + "learning_rate": 5.23125078471217e-06, + "loss": 4.7687, + "step": 95460 + }, + { + "epoch": 1.9422403971354165, + "grad_norm": 17.868812561035156, + "learning_rate": 5.230851534223076e-06, + "loss": 4.9649, + "step": 95465 + }, + { + "epoch": 1.9423421223958335, + "grad_norm": 19.16527557373047, + "learning_rate": 5.230452282258915e-06, + "loss": 5.0045, + "step": 95470 + }, + { + "epoch": 1.94244384765625, + "grad_norm": 19.319250106811523, + "learning_rate": 5.230053028822237e-06, + "loss": 4.7639, + "step": 95475 + }, + { + "epoch": 1.9425455729166665, + "grad_norm": 17.89332389831543, + "learning_rate": 5.229653773915593e-06, + "loss": 5.0111, + "step": 95480 + }, + { + "epoch": 1.9426472981770835, + "grad_norm": 15.443131446838379, + "learning_rate": 5.2292545175415346e-06, + "loss": 5.0731, + "step": 95485 + }, + { + "epoch": 1.9427490234375, + "grad_norm": 20.70400047302246, + "learning_rate": 5.2288552597026135e-06, + "loss": 4.9826, + "step": 95490 + }, + { + "epoch": 1.9428507486979165, + "grad_norm": 18.93942642211914, + "learning_rate": 5.228456000401378e-06, + "loss": 4.6319, + "step": 95495 + }, + { + "epoch": 1.9429524739583335, + "grad_norm": 18.19318199157715, + "learning_rate": 5.228056739640382e-06, + "loss": 4.8983, + "step": 95500 + }, + { + "epoch": 1.94305419921875, + "grad_norm": 21.312480926513672, + "learning_rate": 5.227657477422175e-06, + "loss": 4.9033, + "step": 95505 + }, + { + "epoch": 1.9431559244791665, + "grad_norm": 19.188518524169922, + "learning_rate": 5.2272582137493114e-06, + "loss": 4.9, + "step": 95510 + }, + { + "epoch": 1.9432576497395835, + "grad_norm": 23.889347076416016, + "learning_rate": 5.226858948624338e-06, + "loss": 5.2748, + "step": 95515 + }, + { + "epoch": 1.943359375, + "grad_norm": 14.752084732055664, + "learning_rate": 5.226459682049808e-06, + "loss": 5.0849, + "step": 95520 + }, + { + "epoch": 1.9434611002604165, + "grad_norm": 16.6848201751709, + "learning_rate": 5.2260604140282735e-06, + "loss": 4.7505, + "step": 95525 + }, + { + "epoch": 1.9435628255208335, + "grad_norm": 18.128488540649414, + "learning_rate": 5.225661144562284e-06, + "loss": 4.958, + "step": 95530 + }, + { + "epoch": 1.94366455078125, + "grad_norm": 20.546178817749023, + "learning_rate": 5.225261873654391e-06, + "loss": 4.6644, + "step": 95535 + }, + { + "epoch": 1.9437662760416665, + "grad_norm": 18.440793991088867, + "learning_rate": 5.224862601307148e-06, + "loss": 4.8285, + "step": 95540 + }, + { + "epoch": 1.9438680013020835, + "grad_norm": 19.92308235168457, + "learning_rate": 5.224463327523102e-06, + "loss": 4.8419, + "step": 95545 + }, + { + "epoch": 1.9439697265625, + "grad_norm": 15.230340003967285, + "learning_rate": 5.224064052304808e-06, + "loss": 5.3564, + "step": 95550 + }, + { + "epoch": 1.9440714518229165, + "grad_norm": 18.30097198486328, + "learning_rate": 5.223664775654815e-06, + "loss": 4.8757, + "step": 95555 + }, + { + "epoch": 1.9441731770833335, + "grad_norm": 17.26947021484375, + "learning_rate": 5.223265497575677e-06, + "loss": 4.6348, + "step": 95560 + }, + { + "epoch": 1.94427490234375, + "grad_norm": 18.1900634765625, + "learning_rate": 5.222866218069942e-06, + "loss": 4.935, + "step": 95565 + }, + { + "epoch": 1.9443766276041665, + "grad_norm": 14.81410026550293, + "learning_rate": 5.222466937140161e-06, + "loss": 4.8096, + "step": 95570 + }, + { + "epoch": 1.9444783528645835, + "grad_norm": 23.546916961669922, + "learning_rate": 5.222067654788889e-06, + "loss": 4.7718, + "step": 95575 + }, + { + "epoch": 1.944580078125, + "grad_norm": 20.278043746948242, + "learning_rate": 5.221668371018675e-06, + "loss": 4.8474, + "step": 95580 + }, + { + "epoch": 1.9446818033854165, + "grad_norm": 20.625595092773438, + "learning_rate": 5.22126908583207e-06, + "loss": 4.9356, + "step": 95585 + }, + { + "epoch": 1.9447835286458335, + "grad_norm": 20.02543067932129, + "learning_rate": 5.220869799231625e-06, + "loss": 4.7436, + "step": 95590 + }, + { + "epoch": 1.94488525390625, + "grad_norm": 19.89898109436035, + "learning_rate": 5.220470511219893e-06, + "loss": 4.9219, + "step": 95595 + }, + { + "epoch": 1.9449869791666665, + "grad_norm": 17.568178176879883, + "learning_rate": 5.2200712217994245e-06, + "loss": 4.9093, + "step": 95600 + }, + { + "epoch": 1.9450887044270835, + "grad_norm": 19.949377059936523, + "learning_rate": 5.2196719309727705e-06, + "loss": 4.7678, + "step": 95605 + }, + { + "epoch": 1.9451904296875, + "grad_norm": 17.44237518310547, + "learning_rate": 5.219272638742483e-06, + "loss": 5.1724, + "step": 95610 + }, + { + "epoch": 1.9452921549479165, + "grad_norm": 24.41329002380371, + "learning_rate": 5.218873345111113e-06, + "loss": 4.7254, + "step": 95615 + }, + { + "epoch": 1.9453938802083335, + "grad_norm": 29.356151580810547, + "learning_rate": 5.218474050081211e-06, + "loss": 4.9507, + "step": 95620 + }, + { + "epoch": 1.94549560546875, + "grad_norm": 17.80366325378418, + "learning_rate": 5.21807475365533e-06, + "loss": 4.9784, + "step": 95625 + }, + { + "epoch": 1.9455973307291665, + "grad_norm": 18.141630172729492, + "learning_rate": 5.2176754558360194e-06, + "loss": 4.8922, + "step": 95630 + }, + { + "epoch": 1.9456990559895835, + "grad_norm": 17.088109970092773, + "learning_rate": 5.217276156625832e-06, + "loss": 4.821, + "step": 95635 + }, + { + "epoch": 1.94580078125, + "grad_norm": 17.081134796142578, + "learning_rate": 5.216876856027321e-06, + "loss": 4.8021, + "step": 95640 + }, + { + "epoch": 1.9459025065104165, + "grad_norm": 12.121953964233398, + "learning_rate": 5.216477554043035e-06, + "loss": 5.0444, + "step": 95645 + }, + { + "epoch": 1.9460042317708335, + "grad_norm": 19.74409294128418, + "learning_rate": 5.216078250675524e-06, + "loss": 4.706, + "step": 95650 + }, + { + "epoch": 1.94610595703125, + "grad_norm": 16.646142959594727, + "learning_rate": 5.215678945927344e-06, + "loss": 4.6616, + "step": 95655 + }, + { + "epoch": 1.9462076822916665, + "grad_norm": 17.696819305419922, + "learning_rate": 5.215279639801042e-06, + "loss": 4.867, + "step": 95660 + }, + { + "epoch": 1.9463094075520835, + "grad_norm": 20.884275436401367, + "learning_rate": 5.214880332299173e-06, + "loss": 4.6984, + "step": 95665 + }, + { + "epoch": 1.9464111328125, + "grad_norm": 16.658937454223633, + "learning_rate": 5.214481023424286e-06, + "loss": 4.9101, + "step": 95670 + }, + { + "epoch": 1.9465128580729165, + "grad_norm": 21.465431213378906, + "learning_rate": 5.214081713178934e-06, + "loss": 4.499, + "step": 95675 + }, + { + "epoch": 1.9466145833333335, + "grad_norm": 23.37576675415039, + "learning_rate": 5.213682401565667e-06, + "loss": 4.9677, + "step": 95680 + }, + { + "epoch": 1.94671630859375, + "grad_norm": 13.602344512939453, + "learning_rate": 5.213283088587036e-06, + "loss": 4.7505, + "step": 95685 + }, + { + "epoch": 1.9468180338541665, + "grad_norm": 17.378999710083008, + "learning_rate": 5.2128837742455975e-06, + "loss": 4.5877, + "step": 95690 + }, + { + "epoch": 1.9469197591145835, + "grad_norm": 16.412445068359375, + "learning_rate": 5.2124844585438974e-06, + "loss": 4.8872, + "step": 95695 + }, + { + "epoch": 1.947021484375, + "grad_norm": 20.397302627563477, + "learning_rate": 5.212085141484488e-06, + "loss": 5.2174, + "step": 95700 + }, + { + "epoch": 1.9471232096354165, + "grad_norm": 15.33025074005127, + "learning_rate": 5.211685823069923e-06, + "loss": 4.8672, + "step": 95705 + }, + { + "epoch": 1.9472249348958335, + "grad_norm": 18.368501663208008, + "learning_rate": 5.211286503302753e-06, + "loss": 5.0437, + "step": 95710 + }, + { + "epoch": 1.94732666015625, + "grad_norm": 23.557327270507812, + "learning_rate": 5.210887182185529e-06, + "loss": 4.7636, + "step": 95715 + }, + { + "epoch": 1.9474283854166665, + "grad_norm": 19.12683868408203, + "learning_rate": 5.210487859720802e-06, + "loss": 4.8502, + "step": 95720 + }, + { + "epoch": 1.9475301106770835, + "grad_norm": 20.792579650878906, + "learning_rate": 5.2100885359111245e-06, + "loss": 4.879, + "step": 95725 + }, + { + "epoch": 1.9476318359375, + "grad_norm": 18.3978328704834, + "learning_rate": 5.209689210759049e-06, + "loss": 4.7821, + "step": 95730 + }, + { + "epoch": 1.9477335611979165, + "grad_norm": 18.319110870361328, + "learning_rate": 5.209289884267126e-06, + "loss": 5.0861, + "step": 95735 + }, + { + "epoch": 1.9478352864583335, + "grad_norm": 15.844718933105469, + "learning_rate": 5.208890556437905e-06, + "loss": 4.7528, + "step": 95740 + }, + { + "epoch": 1.94793701171875, + "grad_norm": 15.56650447845459, + "learning_rate": 5.208491227273942e-06, + "loss": 5.0095, + "step": 95745 + }, + { + "epoch": 1.9480387369791665, + "grad_norm": 19.763416290283203, + "learning_rate": 5.208091896777785e-06, + "loss": 4.6947, + "step": 95750 + }, + { + "epoch": 1.9481404622395835, + "grad_norm": 19.560222625732422, + "learning_rate": 5.207692564951989e-06, + "loss": 4.9501, + "step": 95755 + }, + { + "epoch": 1.9482421875, + "grad_norm": 17.830198287963867, + "learning_rate": 5.207293231799101e-06, + "loss": 4.8365, + "step": 95760 + }, + { + "epoch": 1.9483439127604165, + "grad_norm": 21.780864715576172, + "learning_rate": 5.2068938973216754e-06, + "loss": 4.8517, + "step": 95765 + }, + { + "epoch": 1.9484456380208335, + "grad_norm": 16.561429977416992, + "learning_rate": 5.206494561522265e-06, + "loss": 4.6678, + "step": 95770 + }, + { + "epoch": 1.94854736328125, + "grad_norm": 18.167152404785156, + "learning_rate": 5.206095224403418e-06, + "loss": 4.6273, + "step": 95775 + }, + { + "epoch": 1.9486490885416665, + "grad_norm": 21.728485107421875, + "learning_rate": 5.205695885967689e-06, + "loss": 5.16, + "step": 95780 + }, + { + "epoch": 1.9487508138020835, + "grad_norm": 17.073341369628906, + "learning_rate": 5.205296546217629e-06, + "loss": 4.9455, + "step": 95785 + }, + { + "epoch": 1.9488525390625, + "grad_norm": 17.145795822143555, + "learning_rate": 5.204897205155788e-06, + "loss": 5.0492, + "step": 95790 + }, + { + "epoch": 1.9489542643229165, + "grad_norm": 18.440282821655273, + "learning_rate": 5.20449786278472e-06, + "loss": 5.0334, + "step": 95795 + }, + { + "epoch": 1.9490559895833335, + "grad_norm": 13.605146408081055, + "learning_rate": 5.2040985191069744e-06, + "loss": 4.744, + "step": 95800 + }, + { + "epoch": 1.94915771484375, + "grad_norm": 18.36015510559082, + "learning_rate": 5.2036991741251055e-06, + "loss": 4.913, + "step": 95805 + }, + { + "epoch": 1.9492594401041665, + "grad_norm": 18.879613876342773, + "learning_rate": 5.203299827841662e-06, + "loss": 4.8184, + "step": 95810 + }, + { + "epoch": 1.9493611653645835, + "grad_norm": 18.958221435546875, + "learning_rate": 5.202900480259198e-06, + "loss": 5.0425, + "step": 95815 + }, + { + "epoch": 1.949462890625, + "grad_norm": 22.558874130249023, + "learning_rate": 5.202501131380264e-06, + "loss": 5.0126, + "step": 95820 + }, + { + "epoch": 1.9495646158854165, + "grad_norm": 21.682477951049805, + "learning_rate": 5.202101781207414e-06, + "loss": 4.8756, + "step": 95825 + }, + { + "epoch": 1.9496663411458335, + "grad_norm": 15.074703216552734, + "learning_rate": 5.201702429743195e-06, + "loss": 4.7832, + "step": 95830 + }, + { + "epoch": 1.94976806640625, + "grad_norm": 19.480886459350586, + "learning_rate": 5.201303076990163e-06, + "loss": 5.1371, + "step": 95835 + }, + { + "epoch": 1.9498697916666665, + "grad_norm": 13.087913513183594, + "learning_rate": 5.2009037229508675e-06, + "loss": 4.9863, + "step": 95840 + }, + { + "epoch": 1.9499715169270835, + "grad_norm": 23.36522674560547, + "learning_rate": 5.200504367627861e-06, + "loss": 4.9865, + "step": 95845 + }, + { + "epoch": 1.9500732421875, + "grad_norm": 18.247264862060547, + "learning_rate": 5.200105011023695e-06, + "loss": 4.7878, + "step": 95850 + }, + { + "epoch": 1.9501749674479165, + "grad_norm": 16.776451110839844, + "learning_rate": 5.199705653140923e-06, + "loss": 5.0225, + "step": 95855 + }, + { + "epoch": 1.9502766927083335, + "grad_norm": 14.484265327453613, + "learning_rate": 5.199306293982093e-06, + "loss": 4.7666, + "step": 95860 + }, + { + "epoch": 1.95037841796875, + "grad_norm": 16.48238754272461, + "learning_rate": 5.198906933549761e-06, + "loss": 4.8291, + "step": 95865 + }, + { + "epoch": 1.9504801432291665, + "grad_norm": 20.909072875976562, + "learning_rate": 5.198507571846476e-06, + "loss": 5.1471, + "step": 95870 + }, + { + "epoch": 1.9505818684895835, + "grad_norm": 24.616634368896484, + "learning_rate": 5.1981082088747914e-06, + "loss": 4.7415, + "step": 95875 + }, + { + "epoch": 1.95068359375, + "grad_norm": 12.74622917175293, + "learning_rate": 5.197708844637257e-06, + "loss": 4.8245, + "step": 95880 + }, + { + "epoch": 1.9507853190104165, + "grad_norm": 15.62785530090332, + "learning_rate": 5.197309479136426e-06, + "loss": 4.8706, + "step": 95885 + }, + { + "epoch": 1.9508870442708335, + "grad_norm": 18.766441345214844, + "learning_rate": 5.196910112374852e-06, + "loss": 4.6326, + "step": 95890 + }, + { + "epoch": 1.95098876953125, + "grad_norm": 17.19180679321289, + "learning_rate": 5.196510744355082e-06, + "loss": 5.0093, + "step": 95895 + }, + { + "epoch": 1.9510904947916665, + "grad_norm": 20.31884765625, + "learning_rate": 5.196111375079673e-06, + "loss": 4.7901, + "step": 95900 + }, + { + "epoch": 1.9511922200520835, + "grad_norm": 16.06961441040039, + "learning_rate": 5.195712004551174e-06, + "loss": 4.775, + "step": 95905 + }, + { + "epoch": 1.9512939453125, + "grad_norm": 25.51432991027832, + "learning_rate": 5.195312632772137e-06, + "loss": 4.9924, + "step": 95910 + }, + { + "epoch": 1.9513956705729165, + "grad_norm": 21.357276916503906, + "learning_rate": 5.1949132597451145e-06, + "loss": 4.9174, + "step": 95915 + }, + { + "epoch": 1.9514973958333335, + "grad_norm": 21.435543060302734, + "learning_rate": 5.194513885472657e-06, + "loss": 4.7762, + "step": 95920 + }, + { + "epoch": 1.95159912109375, + "grad_norm": 21.42616081237793, + "learning_rate": 5.19411450995732e-06, + "loss": 5.0592, + "step": 95925 + }, + { + "epoch": 1.9517008463541665, + "grad_norm": 17.358158111572266, + "learning_rate": 5.193715133201652e-06, + "loss": 4.9488, + "step": 95930 + }, + { + "epoch": 1.9518025716145835, + "grad_norm": 19.361854553222656, + "learning_rate": 5.1933157552082045e-06, + "loss": 4.7781, + "step": 95935 + }, + { + "epoch": 1.951904296875, + "grad_norm": 19.531394958496094, + "learning_rate": 5.192916375979532e-06, + "loss": 5.0042, + "step": 95940 + }, + { + "epoch": 1.9520060221354165, + "grad_norm": 20.88863754272461, + "learning_rate": 5.192516995518184e-06, + "loss": 5.1102, + "step": 95945 + }, + { + "epoch": 1.9521077473958335, + "grad_norm": 21.443561553955078, + "learning_rate": 5.192117613826715e-06, + "loss": 4.8758, + "step": 95950 + }, + { + "epoch": 1.95220947265625, + "grad_norm": 21.026344299316406, + "learning_rate": 5.191718230907674e-06, + "loss": 5.0823, + "step": 95955 + }, + { + "epoch": 1.9523111979166665, + "grad_norm": 22.655820846557617, + "learning_rate": 5.191318846763614e-06, + "loss": 4.9223, + "step": 95960 + }, + { + "epoch": 1.9524129231770835, + "grad_norm": 18.503202438354492, + "learning_rate": 5.190919461397089e-06, + "loss": 4.6743, + "step": 95965 + }, + { + "epoch": 1.9525146484375, + "grad_norm": 17.497879028320312, + "learning_rate": 5.190520074810648e-06, + "loss": 4.8081, + "step": 95970 + }, + { + "epoch": 1.9526163736979165, + "grad_norm": 21.68988609313965, + "learning_rate": 5.1901206870068456e-06, + "loss": 4.8635, + "step": 95975 + }, + { + "epoch": 1.9527180989583335, + "grad_norm": 16.72725486755371, + "learning_rate": 5.189721297988231e-06, + "loss": 4.9009, + "step": 95980 + }, + { + "epoch": 1.95281982421875, + "grad_norm": 20.945470809936523, + "learning_rate": 5.189321907757358e-06, + "loss": 4.7058, + "step": 95985 + }, + { + "epoch": 1.9529215494791665, + "grad_norm": 22.490060806274414, + "learning_rate": 5.188922516316778e-06, + "loss": 4.9137, + "step": 95990 + }, + { + "epoch": 1.9530232747395835, + "grad_norm": 25.365455627441406, + "learning_rate": 5.188523123669042e-06, + "loss": 4.7317, + "step": 95995 + }, + { + "epoch": 1.953125, + "grad_norm": 13.787766456604004, + "learning_rate": 5.1881237298167044e-06, + "loss": 4.868, + "step": 96000 + }, + { + "epoch": 1.9532267252604165, + "grad_norm": 20.890451431274414, + "learning_rate": 5.187724334762316e-06, + "loss": 4.6721, + "step": 96005 + }, + { + "epoch": 1.9533284505208335, + "grad_norm": 16.56350326538086, + "learning_rate": 5.187324938508428e-06, + "loss": 4.8522, + "step": 96010 + }, + { + "epoch": 1.95343017578125, + "grad_norm": 23.12388801574707, + "learning_rate": 5.186925541057594e-06, + "loss": 4.7856, + "step": 96015 + }, + { + "epoch": 1.9535319010416665, + "grad_norm": 15.453081130981445, + "learning_rate": 5.186526142412365e-06, + "loss": 4.8063, + "step": 96020 + }, + { + "epoch": 1.9536336263020835, + "grad_norm": 19.248165130615234, + "learning_rate": 5.186126742575293e-06, + "loss": 4.7762, + "step": 96025 + }, + { + "epoch": 1.9537353515625, + "grad_norm": 20.93689727783203, + "learning_rate": 5.185727341548929e-06, + "loss": 5.0607, + "step": 96030 + }, + { + "epoch": 1.9538370768229165, + "grad_norm": 25.205852508544922, + "learning_rate": 5.1853279393358265e-06, + "loss": 5.1058, + "step": 96035 + }, + { + "epoch": 1.9539388020833335, + "grad_norm": 18.960559844970703, + "learning_rate": 5.18492853593854e-06, + "loss": 4.8969, + "step": 96040 + }, + { + "epoch": 1.95404052734375, + "grad_norm": 15.87124252319336, + "learning_rate": 5.184529131359617e-06, + "loss": 4.8642, + "step": 96045 + }, + { + "epoch": 1.9541422526041665, + "grad_norm": 44.25074768066406, + "learning_rate": 5.184129725601611e-06, + "loss": 4.7665, + "step": 96050 + }, + { + "epoch": 1.9542439778645835, + "grad_norm": 13.534353256225586, + "learning_rate": 5.183730318667075e-06, + "loss": 4.9248, + "step": 96055 + }, + { + "epoch": 1.954345703125, + "grad_norm": 19.659637451171875, + "learning_rate": 5.1833309105585595e-06, + "loss": 4.8381, + "step": 96060 + }, + { + "epoch": 1.9544474283854165, + "grad_norm": 13.344184875488281, + "learning_rate": 5.182931501278619e-06, + "loss": 4.9015, + "step": 96065 + }, + { + "epoch": 1.9545491536458335, + "grad_norm": 17.584348678588867, + "learning_rate": 5.182532090829805e-06, + "loss": 5.1889, + "step": 96070 + }, + { + "epoch": 1.95465087890625, + "grad_norm": 20.461624145507812, + "learning_rate": 5.1821326792146665e-06, + "loss": 4.9367, + "step": 96075 + }, + { + "epoch": 1.9547526041666665, + "grad_norm": 16.241121292114258, + "learning_rate": 5.18173326643576e-06, + "loss": 4.986, + "step": 96080 + }, + { + "epoch": 1.9548543294270835, + "grad_norm": 14.853734970092773, + "learning_rate": 5.181333852495635e-06, + "loss": 5.077, + "step": 96085 + }, + { + "epoch": 1.9549560546875, + "grad_norm": 13.270036697387695, + "learning_rate": 5.180934437396844e-06, + "loss": 4.8408, + "step": 96090 + }, + { + "epoch": 1.9550577799479165, + "grad_norm": 23.53862762451172, + "learning_rate": 5.1805350211419395e-06, + "loss": 5.0366, + "step": 96095 + }, + { + "epoch": 1.9551595052083335, + "grad_norm": 19.363405227661133, + "learning_rate": 5.180135603733472e-06, + "loss": 4.5914, + "step": 96100 + }, + { + "epoch": 1.95526123046875, + "grad_norm": 17.796159744262695, + "learning_rate": 5.179736185173998e-06, + "loss": 5.0826, + "step": 96105 + }, + { + "epoch": 1.9553629557291665, + "grad_norm": 16.47456169128418, + "learning_rate": 5.1793367654660654e-06, + "loss": 5.0138, + "step": 96110 + }, + { + "epoch": 1.9554646809895835, + "grad_norm": 18.84515380859375, + "learning_rate": 5.178937344612228e-06, + "loss": 4.7528, + "step": 96115 + }, + { + "epoch": 1.95556640625, + "grad_norm": 14.093595504760742, + "learning_rate": 5.178537922615037e-06, + "loss": 4.8714, + "step": 96120 + }, + { + "epoch": 1.9556681315104165, + "grad_norm": 14.950529098510742, + "learning_rate": 5.178138499477047e-06, + "loss": 5.1089, + "step": 96125 + }, + { + "epoch": 1.9557698567708335, + "grad_norm": 23.947856903076172, + "learning_rate": 5.177739075200807e-06, + "loss": 5.1925, + "step": 96130 + }, + { + "epoch": 1.95587158203125, + "grad_norm": 16.655561447143555, + "learning_rate": 5.177339649788872e-06, + "loss": 5.0903, + "step": 96135 + }, + { + "epoch": 1.9559733072916665, + "grad_norm": 18.036609649658203, + "learning_rate": 5.176940223243792e-06, + "loss": 4.9873, + "step": 96140 + }, + { + "epoch": 1.9560750325520835, + "grad_norm": 22.947877883911133, + "learning_rate": 5.1765407955681204e-06, + "loss": 5.1742, + "step": 96145 + }, + { + "epoch": 1.9561767578125, + "grad_norm": 19.06753158569336, + "learning_rate": 5.176141366764409e-06, + "loss": 4.9116, + "step": 96150 + }, + { + "epoch": 1.9562784830729165, + "grad_norm": 14.667726516723633, + "learning_rate": 5.17574193683521e-06, + "loss": 4.7252, + "step": 96155 + }, + { + "epoch": 1.9563802083333335, + "grad_norm": 20.284852981567383, + "learning_rate": 5.175342505783077e-06, + "loss": 4.9226, + "step": 96160 + }, + { + "epoch": 1.95648193359375, + "grad_norm": 15.456286430358887, + "learning_rate": 5.174943073610559e-06, + "loss": 4.9065, + "step": 96165 + }, + { + "epoch": 1.9565836588541665, + "grad_norm": 21.009923934936523, + "learning_rate": 5.174543640320212e-06, + "loss": 5.0335, + "step": 96170 + }, + { + "epoch": 1.9566853841145835, + "grad_norm": 21.284212112426758, + "learning_rate": 5.174144205914587e-06, + "loss": 4.8149, + "step": 96175 + }, + { + "epoch": 1.956787109375, + "grad_norm": 15.855277061462402, + "learning_rate": 5.173744770396234e-06, + "loss": 5.18, + "step": 96180 + }, + { + "epoch": 1.9568888346354165, + "grad_norm": 18.54408073425293, + "learning_rate": 5.1733453337677076e-06, + "loss": 4.8696, + "step": 96185 + }, + { + "epoch": 1.9569905598958335, + "grad_norm": 16.907485961914062, + "learning_rate": 5.172945896031559e-06, + "loss": 4.8924, + "step": 96190 + }, + { + "epoch": 1.95709228515625, + "grad_norm": 16.059438705444336, + "learning_rate": 5.172546457190343e-06, + "loss": 4.7774, + "step": 96195 + }, + { + "epoch": 1.9571940104166665, + "grad_norm": 16.492305755615234, + "learning_rate": 5.172147017246609e-06, + "loss": 4.8164, + "step": 96200 + }, + { + "epoch": 1.9572957356770835, + "grad_norm": 23.163856506347656, + "learning_rate": 5.17174757620291e-06, + "loss": 4.8631, + "step": 96205 + }, + { + "epoch": 1.9573974609375, + "grad_norm": 16.4371395111084, + "learning_rate": 5.171348134061799e-06, + "loss": 4.9808, + "step": 96210 + }, + { + "epoch": 1.9574991861979165, + "grad_norm": 18.671764373779297, + "learning_rate": 5.170948690825827e-06, + "loss": 5.0187, + "step": 96215 + }, + { + "epoch": 1.9576009114583335, + "grad_norm": 17.52350616455078, + "learning_rate": 5.170549246497547e-06, + "loss": 4.8699, + "step": 96220 + }, + { + "epoch": 1.95770263671875, + "grad_norm": 21.291772842407227, + "learning_rate": 5.170149801079512e-06, + "loss": 4.8718, + "step": 96225 + }, + { + "epoch": 1.9578043619791665, + "grad_norm": 20.803495407104492, + "learning_rate": 5.169750354574273e-06, + "loss": 4.8365, + "step": 96230 + }, + { + "epoch": 1.9579060872395835, + "grad_norm": 15.441683769226074, + "learning_rate": 5.169350906984385e-06, + "loss": 5.1321, + "step": 96235 + }, + { + "epoch": 1.9580078125, + "grad_norm": 19.6894588470459, + "learning_rate": 5.168951458312397e-06, + "loss": 4.7512, + "step": 96240 + }, + { + "epoch": 1.9581095377604165, + "grad_norm": 16.91895294189453, + "learning_rate": 5.168552008560863e-06, + "loss": 5.1019, + "step": 96245 + }, + { + "epoch": 1.9582112630208335, + "grad_norm": 17.862321853637695, + "learning_rate": 5.168152557732334e-06, + "loss": 5.0484, + "step": 96250 + }, + { + "epoch": 1.95831298828125, + "grad_norm": 19.103683471679688, + "learning_rate": 5.167753105829366e-06, + "loss": 5.1194, + "step": 96255 + }, + { + "epoch": 1.9584147135416665, + "grad_norm": 19.054840087890625, + "learning_rate": 5.167353652854508e-06, + "loss": 4.7719, + "step": 96260 + }, + { + "epoch": 1.9585164388020835, + "grad_norm": 28.601673126220703, + "learning_rate": 5.166954198810312e-06, + "loss": 5.0264, + "step": 96265 + }, + { + "epoch": 1.9586181640625, + "grad_norm": 18.81171417236328, + "learning_rate": 5.166554743699332e-06, + "loss": 4.8819, + "step": 96270 + }, + { + "epoch": 1.9587198893229165, + "grad_norm": 19.637475967407227, + "learning_rate": 5.166155287524122e-06, + "loss": 4.7522, + "step": 96275 + }, + { + "epoch": 1.9588216145833335, + "grad_norm": 26.608644485473633, + "learning_rate": 5.16575583028723e-06, + "loss": 4.8762, + "step": 96280 + }, + { + "epoch": 1.95892333984375, + "grad_norm": 21.607593536376953, + "learning_rate": 5.165356371991212e-06, + "loss": 5.063, + "step": 96285 + }, + { + "epoch": 1.9590250651041665, + "grad_norm": 21.950469970703125, + "learning_rate": 5.164956912638621e-06, + "loss": 4.7517, + "step": 96290 + }, + { + "epoch": 1.9591267903645835, + "grad_norm": 19.28152847290039, + "learning_rate": 5.164557452232006e-06, + "loss": 4.8639, + "step": 96295 + }, + { + "epoch": 1.959228515625, + "grad_norm": 16.256397247314453, + "learning_rate": 5.16415799077392e-06, + "loss": 4.849, + "step": 96300 + }, + { + "epoch": 1.9593302408854165, + "grad_norm": 13.867639541625977, + "learning_rate": 5.163758528266919e-06, + "loss": 4.9043, + "step": 96305 + }, + { + "epoch": 1.9594319661458335, + "grad_norm": 20.080411911010742, + "learning_rate": 5.163359064713551e-06, + "loss": 4.9954, + "step": 96310 + }, + { + "epoch": 1.95953369140625, + "grad_norm": 20.4511775970459, + "learning_rate": 5.162959600116372e-06, + "loss": 5.0044, + "step": 96315 + }, + { + "epoch": 1.9596354166666665, + "grad_norm": 17.40553855895996, + "learning_rate": 5.162560134477931e-06, + "loss": 4.9891, + "step": 96320 + }, + { + "epoch": 1.9597371419270835, + "grad_norm": 20.32828712463379, + "learning_rate": 5.162160667800784e-06, + "loss": 4.5413, + "step": 96325 + }, + { + "epoch": 1.9598388671875, + "grad_norm": 15.292747497558594, + "learning_rate": 5.161761200087481e-06, + "loss": 4.8883, + "step": 96330 + }, + { + "epoch": 1.9599405924479165, + "grad_norm": 18.96647071838379, + "learning_rate": 5.1613617313405755e-06, + "loss": 4.9959, + "step": 96335 + }, + { + "epoch": 1.9600423177083335, + "grad_norm": 16.456335067749023, + "learning_rate": 5.16096226156262e-06, + "loss": 4.7785, + "step": 96340 + }, + { + "epoch": 1.96014404296875, + "grad_norm": 17.308517456054688, + "learning_rate": 5.160562790756165e-06, + "loss": 4.6992, + "step": 96345 + }, + { + "epoch": 1.9602457682291665, + "grad_norm": 38.92728805541992, + "learning_rate": 5.160163318923767e-06, + "loss": 4.9887, + "step": 96350 + }, + { + "epoch": 1.9603474934895835, + "grad_norm": 14.4138822555542, + "learning_rate": 5.159763846067976e-06, + "loss": 4.9539, + "step": 96355 + }, + { + "epoch": 1.96044921875, + "grad_norm": 17.99026107788086, + "learning_rate": 5.159364372191343e-06, + "loss": 5.011, + "step": 96360 + }, + { + "epoch": 1.9605509440104165, + "grad_norm": 23.327159881591797, + "learning_rate": 5.158964897296424e-06, + "loss": 4.7553, + "step": 96365 + }, + { + "epoch": 1.9606526692708335, + "grad_norm": 18.929868698120117, + "learning_rate": 5.158565421385769e-06, + "loss": 4.9009, + "step": 96370 + }, + { + "epoch": 1.96075439453125, + "grad_norm": 12.827735900878906, + "learning_rate": 5.158165944461931e-06, + "loss": 4.7496, + "step": 96375 + }, + { + "epoch": 1.9608561197916665, + "grad_norm": 20.667184829711914, + "learning_rate": 5.157766466527463e-06, + "loss": 4.944, + "step": 96380 + }, + { + "epoch": 1.9609578450520835, + "grad_norm": 30.850921630859375, + "learning_rate": 5.157366987584918e-06, + "loss": 4.8435, + "step": 96385 + }, + { + "epoch": 1.9610595703125, + "grad_norm": 15.37994384765625, + "learning_rate": 5.1569675076368475e-06, + "loss": 4.8481, + "step": 96390 + }, + { + "epoch": 1.9611612955729165, + "grad_norm": 20.604597091674805, + "learning_rate": 5.156568026685804e-06, + "loss": 4.792, + "step": 96395 + }, + { + "epoch": 1.9612630208333335, + "grad_norm": 17.77416229248047, + "learning_rate": 5.156168544734341e-06, + "loss": 4.9259, + "step": 96400 + }, + { + "epoch": 1.96136474609375, + "grad_norm": 22.335098266601562, + "learning_rate": 5.155769061785011e-06, + "loss": 4.9862, + "step": 96405 + }, + { + "epoch": 1.9614664713541665, + "grad_norm": 20.11331558227539, + "learning_rate": 5.155369577840366e-06, + "loss": 4.7757, + "step": 96410 + }, + { + "epoch": 1.9615681966145835, + "grad_norm": 20.564111709594727, + "learning_rate": 5.1549700929029576e-06, + "loss": 4.8591, + "step": 96415 + }, + { + "epoch": 1.961669921875, + "grad_norm": 15.096736907958984, + "learning_rate": 5.154570606975341e-06, + "loss": 4.7774, + "step": 96420 + }, + { + "epoch": 1.9617716471354165, + "grad_norm": 16.766603469848633, + "learning_rate": 5.1541711200600665e-06, + "loss": 4.8725, + "step": 96425 + }, + { + "epoch": 1.9618733723958335, + "grad_norm": 15.238566398620605, + "learning_rate": 5.153771632159688e-06, + "loss": 4.7844, + "step": 96430 + }, + { + "epoch": 1.96197509765625, + "grad_norm": 13.401033401489258, + "learning_rate": 5.153372143276756e-06, + "loss": 4.624, + "step": 96435 + }, + { + "epoch": 1.9620768229166665, + "grad_norm": 17.896595001220703, + "learning_rate": 5.152972653413827e-06, + "loss": 4.8325, + "step": 96440 + }, + { + "epoch": 1.9621785481770835, + "grad_norm": 13.698519706726074, + "learning_rate": 5.15257316257345e-06, + "loss": 4.7786, + "step": 96445 + }, + { + "epoch": 1.9622802734375, + "grad_norm": 17.588790893554688, + "learning_rate": 5.152173670758179e-06, + "loss": 4.6522, + "step": 96450 + }, + { + "epoch": 1.9623819986979165, + "grad_norm": 18.434640884399414, + "learning_rate": 5.151774177970567e-06, + "loss": 4.9103, + "step": 96455 + }, + { + "epoch": 1.9624837239583335, + "grad_norm": 21.635986328125, + "learning_rate": 5.1513746842131664e-06, + "loss": 5.087, + "step": 96460 + }, + { + "epoch": 1.96258544921875, + "grad_norm": 20.583051681518555, + "learning_rate": 5.150975189488529e-06, + "loss": 4.7507, + "step": 96465 + }, + { + "epoch": 1.9626871744791665, + "grad_norm": 17.038122177124023, + "learning_rate": 5.150575693799208e-06, + "loss": 4.711, + "step": 96470 + }, + { + "epoch": 1.9627888997395835, + "grad_norm": 20.508739471435547, + "learning_rate": 5.150176197147756e-06, + "loss": 4.8501, + "step": 96475 + }, + { + "epoch": 1.962890625, + "grad_norm": 25.220165252685547, + "learning_rate": 5.149776699536726e-06, + "loss": 4.8864, + "step": 96480 + }, + { + "epoch": 1.9629923502604165, + "grad_norm": 19.35573959350586, + "learning_rate": 5.149377200968671e-06, + "loss": 4.6548, + "step": 96485 + }, + { + "epoch": 1.9630940755208335, + "grad_norm": 16.994352340698242, + "learning_rate": 5.148977701446142e-06, + "loss": 5.1662, + "step": 96490 + }, + { + "epoch": 1.96319580078125, + "grad_norm": 19.90662956237793, + "learning_rate": 5.148578200971693e-06, + "loss": 4.9024, + "step": 96495 + }, + { + "epoch": 1.9632975260416665, + "grad_norm": 16.17217254638672, + "learning_rate": 5.148178699547877e-06, + "loss": 4.7707, + "step": 96500 + }, + { + "epoch": 1.9633992513020835, + "grad_norm": 23.214696884155273, + "learning_rate": 5.147779197177246e-06, + "loss": 4.8619, + "step": 96505 + }, + { + "epoch": 1.9635009765625, + "grad_norm": 22.20883560180664, + "learning_rate": 5.147379693862352e-06, + "loss": 4.732, + "step": 96510 + }, + { + "epoch": 1.9636027018229165, + "grad_norm": 26.62696075439453, + "learning_rate": 5.1469801896057506e-06, + "loss": 5.2136, + "step": 96515 + }, + { + "epoch": 1.9637044270833335, + "grad_norm": 16.251955032348633, + "learning_rate": 5.14658068440999e-06, + "loss": 4.7833, + "step": 96520 + }, + { + "epoch": 1.96380615234375, + "grad_norm": 19.264110565185547, + "learning_rate": 5.146181178277627e-06, + "loss": 4.6807, + "step": 96525 + }, + { + "epoch": 1.9639078776041665, + "grad_norm": 18.84783935546875, + "learning_rate": 5.145781671211212e-06, + "loss": 5.1075, + "step": 96530 + }, + { + "epoch": 1.9640096028645835, + "grad_norm": 16.290626525878906, + "learning_rate": 5.145382163213299e-06, + "loss": 5.0148, + "step": 96535 + }, + { + "epoch": 1.964111328125, + "grad_norm": 17.93648338317871, + "learning_rate": 5.144982654286439e-06, + "loss": 4.621, + "step": 96540 + }, + { + "epoch": 1.9642130533854165, + "grad_norm": 19.567054748535156, + "learning_rate": 5.1445831444331876e-06, + "loss": 4.8038, + "step": 96545 + }, + { + "epoch": 1.9643147786458335, + "grad_norm": 15.32781982421875, + "learning_rate": 5.144183633656095e-06, + "loss": 4.764, + "step": 96550 + }, + { + "epoch": 1.96441650390625, + "grad_norm": 16.21766471862793, + "learning_rate": 5.143784121957713e-06, + "loss": 4.8874, + "step": 96555 + }, + { + "epoch": 1.9645182291666665, + "grad_norm": 23.141983032226562, + "learning_rate": 5.143384609340598e-06, + "loss": 4.7914, + "step": 96560 + }, + { + "epoch": 1.9646199544270835, + "grad_norm": 22.095014572143555, + "learning_rate": 5.1429850958073e-06, + "loss": 4.6901, + "step": 96565 + }, + { + "epoch": 1.9647216796875, + "grad_norm": 22.38969612121582, + "learning_rate": 5.142585581360374e-06, + "loss": 4.7597, + "step": 96570 + }, + { + "epoch": 1.9648234049479165, + "grad_norm": 15.46379280090332, + "learning_rate": 5.142186066002371e-06, + "loss": 5.0083, + "step": 96575 + }, + { + "epoch": 1.9649251302083335, + "grad_norm": 20.597881317138672, + "learning_rate": 5.141786549735844e-06, + "loss": 4.9099, + "step": 96580 + }, + { + "epoch": 1.96502685546875, + "grad_norm": 14.89450740814209, + "learning_rate": 5.1413870325633455e-06, + "loss": 5.1809, + "step": 96585 + }, + { + "epoch": 1.9651285807291665, + "grad_norm": 16.44249153137207, + "learning_rate": 5.140987514487429e-06, + "loss": 5.1421, + "step": 96590 + }, + { + "epoch": 1.9652303059895835, + "grad_norm": 16.528223037719727, + "learning_rate": 5.140587995510647e-06, + "loss": 4.7608, + "step": 96595 + }, + { + "epoch": 1.96533203125, + "grad_norm": 17.442405700683594, + "learning_rate": 5.140188475635553e-06, + "loss": 4.6542, + "step": 96600 + }, + { + "epoch": 1.9654337565104165, + "grad_norm": 26.280229568481445, + "learning_rate": 5.139788954864698e-06, + "loss": 4.8962, + "step": 96605 + }, + { + "epoch": 1.9655354817708335, + "grad_norm": 19.327577590942383, + "learning_rate": 5.139389433200635e-06, + "loss": 4.5719, + "step": 96610 + }, + { + "epoch": 1.96563720703125, + "grad_norm": 17.746021270751953, + "learning_rate": 5.1389899106459205e-06, + "loss": 4.7082, + "step": 96615 + }, + { + "epoch": 1.9657389322916665, + "grad_norm": 15.75279426574707, + "learning_rate": 5.138590387203103e-06, + "loss": 4.7983, + "step": 96620 + }, + { + "epoch": 1.9658406575520835, + "grad_norm": 23.419673919677734, + "learning_rate": 5.138190862874737e-06, + "loss": 5.1405, + "step": 96625 + }, + { + "epoch": 1.9659423828125, + "grad_norm": 18.074209213256836, + "learning_rate": 5.137791337663375e-06, + "loss": 4.9018, + "step": 96630 + }, + { + "epoch": 1.9660441080729165, + "grad_norm": 21.731454849243164, + "learning_rate": 5.137391811571571e-06, + "loss": 5.1765, + "step": 96635 + }, + { + "epoch": 1.9661458333333335, + "grad_norm": 16.369117736816406, + "learning_rate": 5.1369922846018775e-06, + "loss": 5.047, + "step": 96640 + }, + { + "epoch": 1.96624755859375, + "grad_norm": 18.346210479736328, + "learning_rate": 5.136592756756844e-06, + "loss": 4.87, + "step": 96645 + }, + { + "epoch": 1.9663492838541665, + "grad_norm": 18.735797882080078, + "learning_rate": 5.136193228039028e-06, + "loss": 5.0445, + "step": 96650 + }, + { + "epoch": 1.9664510091145835, + "grad_norm": 18.548830032348633, + "learning_rate": 5.1357936984509805e-06, + "loss": 4.9999, + "step": 96655 + }, + { + "epoch": 1.966552734375, + "grad_norm": 26.29869842529297, + "learning_rate": 5.135394167995253e-06, + "loss": 4.8683, + "step": 96660 + }, + { + "epoch": 1.9666544596354165, + "grad_norm": 16.335214614868164, + "learning_rate": 5.134994636674402e-06, + "loss": 4.7108, + "step": 96665 + }, + { + "epoch": 1.9667561848958335, + "grad_norm": 21.92477798461914, + "learning_rate": 5.134595104490977e-06, + "loss": 5.0612, + "step": 96670 + }, + { + "epoch": 1.96685791015625, + "grad_norm": 17.181873321533203, + "learning_rate": 5.1341955714475324e-06, + "loss": 4.8471, + "step": 96675 + }, + { + "epoch": 1.9669596354166665, + "grad_norm": 17.66250228881836, + "learning_rate": 5.1337960375466215e-06, + "loss": 4.9896, + "step": 96680 + }, + { + "epoch": 1.9670613606770835, + "grad_norm": 17.146217346191406, + "learning_rate": 5.133396502790794e-06, + "loss": 4.4154, + "step": 96685 + }, + { + "epoch": 1.9671630859375, + "grad_norm": 18.55962562561035, + "learning_rate": 5.132996967182607e-06, + "loss": 4.9125, + "step": 96690 + }, + { + "epoch": 1.9672648111979165, + "grad_norm": 17.61683464050293, + "learning_rate": 5.13259743072461e-06, + "loss": 4.714, + "step": 96695 + }, + { + "epoch": 1.9673665364583335, + "grad_norm": 18.791181564331055, + "learning_rate": 5.13219789341936e-06, + "loss": 4.6845, + "step": 96700 + }, + { + "epoch": 1.96746826171875, + "grad_norm": 16.55396842956543, + "learning_rate": 5.131798355269406e-06, + "loss": 4.9733, + "step": 96705 + }, + { + "epoch": 1.9675699869791665, + "grad_norm": 17.664072036743164, + "learning_rate": 5.131398816277302e-06, + "loss": 4.8448, + "step": 96710 + }, + { + "epoch": 1.9676717122395835, + "grad_norm": 12.734016418457031, + "learning_rate": 5.130999276445602e-06, + "loss": 4.8834, + "step": 96715 + }, + { + "epoch": 1.9677734375, + "grad_norm": 23.613893508911133, + "learning_rate": 5.130599735776858e-06, + "loss": 4.9453, + "step": 96720 + }, + { + "epoch": 1.9678751627604165, + "grad_norm": 13.281968116760254, + "learning_rate": 5.130200194273622e-06, + "loss": 5.0255, + "step": 96725 + }, + { + "epoch": 1.9679768880208335, + "grad_norm": 25.16193199157715, + "learning_rate": 5.12980065193845e-06, + "loss": 4.6129, + "step": 96730 + }, + { + "epoch": 1.96807861328125, + "grad_norm": 17.32440948486328, + "learning_rate": 5.129401108773891e-06, + "loss": 5.0197, + "step": 96735 + }, + { + "epoch": 1.9681803385416665, + "grad_norm": 14.824801445007324, + "learning_rate": 5.129001564782503e-06, + "loss": 4.6958, + "step": 96740 + }, + { + "epoch": 1.9682820638020835, + "grad_norm": 19.69559097290039, + "learning_rate": 5.128602019966834e-06, + "loss": 4.8068, + "step": 96745 + }, + { + "epoch": 1.9683837890625, + "grad_norm": 16.117820739746094, + "learning_rate": 5.128202474329439e-06, + "loss": 4.7889, + "step": 96750 + }, + { + "epoch": 1.9684855143229165, + "grad_norm": 24.807876586914062, + "learning_rate": 5.127802927872871e-06, + "loss": 4.7229, + "step": 96755 + }, + { + "epoch": 1.9685872395833335, + "grad_norm": 19.08103370666504, + "learning_rate": 5.127403380599683e-06, + "loss": 4.9192, + "step": 96760 + }, + { + "epoch": 1.96868896484375, + "grad_norm": 19.059648513793945, + "learning_rate": 5.1270038325124275e-06, + "loss": 4.809, + "step": 96765 + }, + { + "epoch": 1.9687906901041665, + "grad_norm": 16.527572631835938, + "learning_rate": 5.126604283613659e-06, + "loss": 5.1143, + "step": 96770 + }, + { + "epoch": 1.9688924153645835, + "grad_norm": 13.945560455322266, + "learning_rate": 5.126204733905927e-06, + "loss": 4.7981, + "step": 96775 + }, + { + "epoch": 1.968994140625, + "grad_norm": 20.24319839477539, + "learning_rate": 5.125805183391789e-06, + "loss": 5.0012, + "step": 96780 + }, + { + "epoch": 1.9690958658854165, + "grad_norm": 12.954198837280273, + "learning_rate": 5.125405632073793e-06, + "loss": 5.0335, + "step": 96785 + }, + { + "epoch": 1.9691975911458335, + "grad_norm": 19.015918731689453, + "learning_rate": 5.125006079954497e-06, + "loss": 4.8859, + "step": 96790 + }, + { + "epoch": 1.96929931640625, + "grad_norm": 17.391780853271484, + "learning_rate": 5.124606527036452e-06, + "loss": 4.7098, + "step": 96795 + }, + { + "epoch": 1.9694010416666665, + "grad_norm": 13.901869773864746, + "learning_rate": 5.12420697332221e-06, + "loss": 5.0979, + "step": 96800 + }, + { + "epoch": 1.9695027669270835, + "grad_norm": 14.324894905090332, + "learning_rate": 5.123807418814325e-06, + "loss": 4.6828, + "step": 96805 + }, + { + "epoch": 1.9696044921875, + "grad_norm": 19.15869903564453, + "learning_rate": 5.12340786351535e-06, + "loss": 4.8198, + "step": 96810 + }, + { + "epoch": 1.9697062174479165, + "grad_norm": 13.737939834594727, + "learning_rate": 5.123008307427838e-06, + "loss": 4.8526, + "step": 96815 + }, + { + "epoch": 1.9698079427083335, + "grad_norm": 20.746660232543945, + "learning_rate": 5.122608750554343e-06, + "loss": 4.9527, + "step": 96820 + }, + { + "epoch": 1.96990966796875, + "grad_norm": 18.299617767333984, + "learning_rate": 5.122209192897415e-06, + "loss": 4.7104, + "step": 96825 + }, + { + "epoch": 1.9700113932291665, + "grad_norm": 16.858722686767578, + "learning_rate": 5.12180963445961e-06, + "loss": 4.8908, + "step": 96830 + }, + { + "epoch": 1.9701131184895835, + "grad_norm": 15.979304313659668, + "learning_rate": 5.121410075243481e-06, + "loss": 5.0007, + "step": 96835 + }, + { + "epoch": 1.97021484375, + "grad_norm": 18.667457580566406, + "learning_rate": 5.121010515251579e-06, + "loss": 4.8795, + "step": 96840 + }, + { + "epoch": 1.9703165690104165, + "grad_norm": 24.027830123901367, + "learning_rate": 5.120610954486459e-06, + "loss": 4.8406, + "step": 96845 + }, + { + "epoch": 1.9704182942708335, + "grad_norm": 14.118107795715332, + "learning_rate": 5.120211392950673e-06, + "loss": 4.8416, + "step": 96850 + }, + { + "epoch": 1.97052001953125, + "grad_norm": 19.025100708007812, + "learning_rate": 5.119811830646774e-06, + "loss": 4.8662, + "step": 96855 + }, + { + "epoch": 1.9706217447916665, + "grad_norm": 14.999319076538086, + "learning_rate": 5.119412267577316e-06, + "loss": 4.9583, + "step": 96860 + }, + { + "epoch": 1.9707234700520835, + "grad_norm": 22.754514694213867, + "learning_rate": 5.119012703744851e-06, + "loss": 4.7982, + "step": 96865 + }, + { + "epoch": 1.9708251953125, + "grad_norm": 21.103513717651367, + "learning_rate": 5.118613139151933e-06, + "loss": 4.9312, + "step": 96870 + }, + { + "epoch": 1.9709269205729165, + "grad_norm": 20.85550880432129, + "learning_rate": 5.118213573801114e-06, + "loss": 5.2222, + "step": 96875 + }, + { + "epoch": 1.9710286458333335, + "grad_norm": 14.73151969909668, + "learning_rate": 5.117814007694949e-06, + "loss": 4.824, + "step": 96880 + }, + { + "epoch": 1.97113037109375, + "grad_norm": 17.877464294433594, + "learning_rate": 5.1174144408359895e-06, + "loss": 4.8882, + "step": 96885 + }, + { + "epoch": 1.9712320963541665, + "grad_norm": 13.322412490844727, + "learning_rate": 5.1170148732267875e-06, + "loss": 4.9829, + "step": 96890 + }, + { + "epoch": 1.9713338216145835, + "grad_norm": 18.240642547607422, + "learning_rate": 5.116615304869899e-06, + "loss": 4.7814, + "step": 96895 + }, + { + "epoch": 1.971435546875, + "grad_norm": 17.591651916503906, + "learning_rate": 5.116215735767876e-06, + "loss": 4.8471, + "step": 96900 + }, + { + "epoch": 1.9715372721354165, + "grad_norm": 22.58173179626465, + "learning_rate": 5.1158161659232705e-06, + "loss": 4.8946, + "step": 96905 + }, + { + "epoch": 1.9716389973958335, + "grad_norm": 25.241477966308594, + "learning_rate": 5.115416595338637e-06, + "loss": 4.7525, + "step": 96910 + }, + { + "epoch": 1.97174072265625, + "grad_norm": 19.07442855834961, + "learning_rate": 5.115017024016527e-06, + "loss": 4.852, + "step": 96915 + }, + { + "epoch": 1.9718424479166665, + "grad_norm": 22.640701293945312, + "learning_rate": 5.114617451959497e-06, + "loss": 5.1591, + "step": 96920 + }, + { + "epoch": 1.9719441731770835, + "grad_norm": 14.465642929077148, + "learning_rate": 5.1142178791700955e-06, + "loss": 4.9396, + "step": 96925 + }, + { + "epoch": 1.9720458984375, + "grad_norm": 18.040428161621094, + "learning_rate": 5.113818305650879e-06, + "loss": 4.9376, + "step": 96930 + }, + { + "epoch": 1.9721476236979165, + "grad_norm": 21.536842346191406, + "learning_rate": 5.113418731404399e-06, + "loss": 4.6946, + "step": 96935 + }, + { + "epoch": 1.9722493489583335, + "grad_norm": 21.512710571289062, + "learning_rate": 5.1130191564332095e-06, + "loss": 4.7409, + "step": 96940 + }, + { + "epoch": 1.97235107421875, + "grad_norm": 17.672121047973633, + "learning_rate": 5.112619580739865e-06, + "loss": 4.8533, + "step": 96945 + }, + { + "epoch": 1.9724527994791665, + "grad_norm": 17.138973236083984, + "learning_rate": 5.112220004326917e-06, + "loss": 4.9675, + "step": 96950 + }, + { + "epoch": 1.9725545247395835, + "grad_norm": 21.165355682373047, + "learning_rate": 5.111820427196916e-06, + "loss": 4.7493, + "step": 96955 + }, + { + "epoch": 1.97265625, + "grad_norm": 23.94266700744629, + "learning_rate": 5.111420849352421e-06, + "loss": 4.9052, + "step": 96960 + }, + { + "epoch": 1.9727579752604165, + "grad_norm": 22.558561325073242, + "learning_rate": 5.1110212707959816e-06, + "loss": 4.7582, + "step": 96965 + }, + { + "epoch": 1.9728597005208335, + "grad_norm": 23.671232223510742, + "learning_rate": 5.11062169153015e-06, + "loss": 4.8602, + "step": 96970 + }, + { + "epoch": 1.97296142578125, + "grad_norm": 20.473896026611328, + "learning_rate": 5.110222111557482e-06, + "loss": 4.7297, + "step": 96975 + }, + { + "epoch": 1.9730631510416665, + "grad_norm": 13.250249862670898, + "learning_rate": 5.10982253088053e-06, + "loss": 5.3123, + "step": 96980 + }, + { + "epoch": 1.9731648763020835, + "grad_norm": 17.68552017211914, + "learning_rate": 5.109422949501846e-06, + "loss": 4.7139, + "step": 96985 + }, + { + "epoch": 1.9732666015625, + "grad_norm": 13.82294750213623, + "learning_rate": 5.109023367423984e-06, + "loss": 5.0927, + "step": 96990 + }, + { + "epoch": 1.9733683268229165, + "grad_norm": 12.427075386047363, + "learning_rate": 5.108623784649498e-06, + "loss": 5.0061, + "step": 96995 + }, + { + "epoch": 1.9734700520833335, + "grad_norm": 31.139774322509766, + "learning_rate": 5.10822420118094e-06, + "loss": 4.8053, + "step": 97000 + }, + { + "epoch": 1.97357177734375, + "grad_norm": 15.405336380004883, + "learning_rate": 5.1078246170208636e-06, + "loss": 4.9828, + "step": 97005 + }, + { + "epoch": 1.9736735026041665, + "grad_norm": 17.881332397460938, + "learning_rate": 5.107425032171823e-06, + "loss": 5.0571, + "step": 97010 + }, + { + "epoch": 1.9737752278645835, + "grad_norm": 20.536516189575195, + "learning_rate": 5.107025446636371e-06, + "loss": 4.9756, + "step": 97015 + }, + { + "epoch": 1.973876953125, + "grad_norm": 22.33333396911621, + "learning_rate": 5.1066258604170584e-06, + "loss": 4.8232, + "step": 97020 + }, + { + "epoch": 1.9739786783854165, + "grad_norm": 19.48113441467285, + "learning_rate": 5.106226273516441e-06, + "loss": 5.0208, + "step": 97025 + }, + { + "epoch": 1.9740804036458335, + "grad_norm": 18.664262771606445, + "learning_rate": 5.105826685937073e-06, + "loss": 4.9655, + "step": 97030 + }, + { + "epoch": 1.97418212890625, + "grad_norm": 20.76081085205078, + "learning_rate": 5.105427097681505e-06, + "loss": 4.9938, + "step": 97035 + }, + { + "epoch": 1.9742838541666665, + "grad_norm": 13.925430297851562, + "learning_rate": 5.105027508752292e-06, + "loss": 4.7021, + "step": 97040 + }, + { + "epoch": 1.9743855794270835, + "grad_norm": 18.66517448425293, + "learning_rate": 5.104627919151984e-06, + "loss": 4.8447, + "step": 97045 + }, + { + "epoch": 1.9744873046875, + "grad_norm": 19.022008895874023, + "learning_rate": 5.10422832888314e-06, + "loss": 4.6924, + "step": 97050 + }, + { + "epoch": 1.9745890299479165, + "grad_norm": 27.957849502563477, + "learning_rate": 5.103828737948307e-06, + "loss": 5.1631, + "step": 97055 + }, + { + "epoch": 1.9746907552083335, + "grad_norm": 15.465423583984375, + "learning_rate": 5.103429146350043e-06, + "loss": 5.1173, + "step": 97060 + }, + { + "epoch": 1.97479248046875, + "grad_norm": 18.42177963256836, + "learning_rate": 5.1030295540909006e-06, + "loss": 4.827, + "step": 97065 + }, + { + "epoch": 1.9748942057291665, + "grad_norm": 20.461410522460938, + "learning_rate": 5.10262996117343e-06, + "loss": 5.0628, + "step": 97070 + }, + { + "epoch": 1.9749959309895835, + "grad_norm": 20.34478187561035, + "learning_rate": 5.102230367600187e-06, + "loss": 5.0026, + "step": 97075 + }, + { + "epoch": 1.97509765625, + "grad_norm": 21.145599365234375, + "learning_rate": 5.101830773373726e-06, + "loss": 4.7522, + "step": 97080 + }, + { + "epoch": 1.9751993815104165, + "grad_norm": 18.538148880004883, + "learning_rate": 5.101431178496596e-06, + "loss": 4.868, + "step": 97085 + }, + { + "epoch": 1.9753011067708335, + "grad_norm": 17.5394229888916, + "learning_rate": 5.101031582971357e-06, + "loss": 4.923, + "step": 97090 + }, + { + "epoch": 1.97540283203125, + "grad_norm": 16.377803802490234, + "learning_rate": 5.100631986800553e-06, + "loss": 4.8598, + "step": 97095 + }, + { + "epoch": 1.9755045572916665, + "grad_norm": 18.995302200317383, + "learning_rate": 5.100232389986746e-06, + "loss": 4.6459, + "step": 97100 + }, + { + "epoch": 1.9756062825520835, + "grad_norm": 19.4814395904541, + "learning_rate": 5.099832792532487e-06, + "loss": 4.9588, + "step": 97105 + }, + { + "epoch": 1.9757080078125, + "grad_norm": 17.76546859741211, + "learning_rate": 5.099433194440326e-06, + "loss": 5.2179, + "step": 97110 + }, + { + "epoch": 1.9758097330729165, + "grad_norm": 16.401681900024414, + "learning_rate": 5.099033595712819e-06, + "loss": 4.9361, + "step": 97115 + }, + { + "epoch": 1.9759114583333335, + "grad_norm": 21.553712844848633, + "learning_rate": 5.098633996352519e-06, + "loss": 5.1236, + "step": 97120 + }, + { + "epoch": 1.97601318359375, + "grad_norm": 13.204289436340332, + "learning_rate": 5.098234396361978e-06, + "loss": 5.0986, + "step": 97125 + }, + { + "epoch": 1.9761149088541665, + "grad_norm": 16.76371192932129, + "learning_rate": 5.0978347957437515e-06, + "loss": 4.7361, + "step": 97130 + }, + { + "epoch": 1.9762166341145835, + "grad_norm": 27.885976791381836, + "learning_rate": 5.097435194500391e-06, + "loss": 4.7861, + "step": 97135 + }, + { + "epoch": 1.976318359375, + "grad_norm": 17.403894424438477, + "learning_rate": 5.0970355926344506e-06, + "loss": 4.7273, + "step": 97140 + }, + { + "epoch": 1.9764200846354165, + "grad_norm": 19.82938575744629, + "learning_rate": 5.096635990148485e-06, + "loss": 4.6623, + "step": 97145 + }, + { + "epoch": 1.9765218098958335, + "grad_norm": 17.2271671295166, + "learning_rate": 5.096236387045044e-06, + "loss": 5.1259, + "step": 97150 + }, + { + "epoch": 1.97662353515625, + "grad_norm": 15.397109985351562, + "learning_rate": 5.095836783326684e-06, + "loss": 4.8496, + "step": 97155 + }, + { + "epoch": 1.9767252604166665, + "grad_norm": 17.04263687133789, + "learning_rate": 5.095437178995957e-06, + "loss": 4.9527, + "step": 97160 + }, + { + "epoch": 1.9768269856770835, + "grad_norm": 16.811452865600586, + "learning_rate": 5.095037574055417e-06, + "loss": 5.0491, + "step": 97165 + }, + { + "epoch": 1.9769287109375, + "grad_norm": 17.663877487182617, + "learning_rate": 5.094637968507618e-06, + "loss": 4.7764, + "step": 97170 + }, + { + "epoch": 1.9770304361979165, + "grad_norm": 16.97701644897461, + "learning_rate": 5.09423836235511e-06, + "loss": 4.9191, + "step": 97175 + }, + { + "epoch": 1.9771321614583335, + "grad_norm": 18.214160919189453, + "learning_rate": 5.093838755600452e-06, + "loss": 5.2569, + "step": 97180 + }, + { + "epoch": 1.97723388671875, + "grad_norm": 15.616052627563477, + "learning_rate": 5.093439148246192e-06, + "loss": 4.9086, + "step": 97185 + }, + { + "epoch": 1.9773356119791665, + "grad_norm": 20.17173957824707, + "learning_rate": 5.093039540294886e-06, + "loss": 4.8008, + "step": 97190 + }, + { + "epoch": 1.9774373372395835, + "grad_norm": 20.224721908569336, + "learning_rate": 5.092639931749086e-06, + "loss": 4.8672, + "step": 97195 + }, + { + "epoch": 1.9775390625, + "grad_norm": 20.10212516784668, + "learning_rate": 5.092240322611347e-06, + "loss": 4.9714, + "step": 97200 + }, + { + "epoch": 1.9776407877604165, + "grad_norm": 19.07859230041504, + "learning_rate": 5.0918407128842215e-06, + "loss": 5.0954, + "step": 97205 + }, + { + "epoch": 1.9777425130208335, + "grad_norm": 36.287025451660156, + "learning_rate": 5.0914411025702635e-06, + "loss": 4.469, + "step": 97210 + }, + { + "epoch": 1.97784423828125, + "grad_norm": 20.069005966186523, + "learning_rate": 5.091041491672025e-06, + "loss": 4.764, + "step": 97215 + }, + { + "epoch": 1.9779459635416665, + "grad_norm": 19.567401885986328, + "learning_rate": 5.090641880192062e-06, + "loss": 4.7634, + "step": 97220 + }, + { + "epoch": 1.9780476888020835, + "grad_norm": 15.851766586303711, + "learning_rate": 5.0902422681329245e-06, + "loss": 4.9367, + "step": 97225 + }, + { + "epoch": 1.9781494140625, + "grad_norm": 15.821646690368652, + "learning_rate": 5.089842655497168e-06, + "loss": 4.7996, + "step": 97230 + }, + { + "epoch": 1.9782511393229165, + "grad_norm": 14.35529899597168, + "learning_rate": 5.089443042287347e-06, + "loss": 4.9561, + "step": 97235 + }, + { + "epoch": 1.9783528645833335, + "grad_norm": 16.69510841369629, + "learning_rate": 5.08904342850601e-06, + "loss": 4.751, + "step": 97240 + }, + { + "epoch": 1.97845458984375, + "grad_norm": 19.86890983581543, + "learning_rate": 5.088643814155716e-06, + "loss": 4.8961, + "step": 97245 + }, + { + "epoch": 1.9785563151041665, + "grad_norm": 20.675310134887695, + "learning_rate": 5.088244199239016e-06, + "loss": 4.79, + "step": 97250 + }, + { + "epoch": 1.9786580403645835, + "grad_norm": 17.669025421142578, + "learning_rate": 5.087844583758463e-06, + "loss": 4.9786, + "step": 97255 + }, + { + "epoch": 1.978759765625, + "grad_norm": 18.595943450927734, + "learning_rate": 5.087444967716612e-06, + "loss": 4.8834, + "step": 97260 + }, + { + "epoch": 1.9788614908854165, + "grad_norm": 15.640979766845703, + "learning_rate": 5.087045351116014e-06, + "loss": 4.8506, + "step": 97265 + }, + { + "epoch": 1.9789632161458335, + "grad_norm": 21.655353546142578, + "learning_rate": 5.086645733959225e-06, + "loss": 4.9289, + "step": 97270 + }, + { + "epoch": 1.97906494140625, + "grad_norm": 19.476520538330078, + "learning_rate": 5.086246116248798e-06, + "loss": 4.863, + "step": 97275 + }, + { + "epoch": 1.9791666666666665, + "grad_norm": 19.962446212768555, + "learning_rate": 5.085846497987283e-06, + "loss": 4.8582, + "step": 97280 + }, + { + "epoch": 1.9792683919270835, + "grad_norm": 28.368486404418945, + "learning_rate": 5.085446879177238e-06, + "loss": 4.8991, + "step": 97285 + }, + { + "epoch": 1.9793701171875, + "grad_norm": 18.69070053100586, + "learning_rate": 5.0850472598212146e-06, + "loss": 4.8615, + "step": 97290 + }, + { + "epoch": 1.9794718424479165, + "grad_norm": 57.69158935546875, + "learning_rate": 5.084647639921766e-06, + "loss": 4.5817, + "step": 97295 + }, + { + "epoch": 1.9795735677083335, + "grad_norm": 19.854475021362305, + "learning_rate": 5.084248019481446e-06, + "loss": 5.0265, + "step": 97300 + }, + { + "epoch": 1.97967529296875, + "grad_norm": 26.731109619140625, + "learning_rate": 5.083848398502807e-06, + "loss": 4.9648, + "step": 97305 + }, + { + "epoch": 1.9797770182291665, + "grad_norm": 14.801239967346191, + "learning_rate": 5.083448776988405e-06, + "loss": 4.6964, + "step": 97310 + }, + { + "epoch": 1.9798787434895835, + "grad_norm": 16.69476318359375, + "learning_rate": 5.08304915494079e-06, + "loss": 4.8749, + "step": 97315 + }, + { + "epoch": 1.97998046875, + "grad_norm": 16.18915557861328, + "learning_rate": 5.082649532362518e-06, + "loss": 4.7674, + "step": 97320 + }, + { + "epoch": 1.9800821940104165, + "grad_norm": 15.488699913024902, + "learning_rate": 5.082249909256142e-06, + "loss": 4.5996, + "step": 97325 + }, + { + "epoch": 1.9801839192708335, + "grad_norm": 17.93171501159668, + "learning_rate": 5.0818502856242145e-06, + "loss": 4.7684, + "step": 97330 + }, + { + "epoch": 1.98028564453125, + "grad_norm": 17.513423919677734, + "learning_rate": 5.081450661469289e-06, + "loss": 4.8104, + "step": 97335 + }, + { + "epoch": 1.9803873697916665, + "grad_norm": 15.527776718139648, + "learning_rate": 5.081051036793922e-06, + "loss": 5.1467, + "step": 97340 + }, + { + "epoch": 1.9804890950520835, + "grad_norm": 18.559864044189453, + "learning_rate": 5.080651411600662e-06, + "loss": 4.8641, + "step": 97345 + }, + { + "epoch": 1.9805908203125, + "grad_norm": 23.811748504638672, + "learning_rate": 5.080251785892067e-06, + "loss": 5.2408, + "step": 97350 + }, + { + "epoch": 1.9806925455729165, + "grad_norm": 17.641023635864258, + "learning_rate": 5.0798521596706875e-06, + "loss": 4.6691, + "step": 97355 + }, + { + "epoch": 1.9807942708333335, + "grad_norm": 15.917953491210938, + "learning_rate": 5.07945253293908e-06, + "loss": 4.9833, + "step": 97360 + }, + { + "epoch": 1.98089599609375, + "grad_norm": 22.608211517333984, + "learning_rate": 5.079052905699794e-06, + "loss": 4.9194, + "step": 97365 + }, + { + "epoch": 1.9809977213541665, + "grad_norm": 15.774147987365723, + "learning_rate": 5.078653277955385e-06, + "loss": 5.0075, + "step": 97370 + }, + { + "epoch": 1.9810994466145835, + "grad_norm": 12.732622146606445, + "learning_rate": 5.078253649708408e-06, + "loss": 4.8427, + "step": 97375 + }, + { + "epoch": 1.981201171875, + "grad_norm": 15.782121658325195, + "learning_rate": 5.0778540209614135e-06, + "loss": 4.8823, + "step": 97380 + }, + { + "epoch": 1.9813028971354165, + "grad_norm": 15.101648330688477, + "learning_rate": 5.077454391716957e-06, + "loss": 4.8583, + "step": 97385 + }, + { + "epoch": 1.9814046223958335, + "grad_norm": 16.628969192504883, + "learning_rate": 5.077054761977593e-06, + "loss": 4.8608, + "step": 97390 + }, + { + "epoch": 1.98150634765625, + "grad_norm": 14.100303649902344, + "learning_rate": 5.076655131745871e-06, + "loss": 4.9331, + "step": 97395 + }, + { + "epoch": 1.9816080729166665, + "grad_norm": 17.344812393188477, + "learning_rate": 5.076255501024348e-06, + "loss": 4.7198, + "step": 97400 + }, + { + "epoch": 1.9817097981770835, + "grad_norm": 19.660383224487305, + "learning_rate": 5.075855869815578e-06, + "loss": 5.0116, + "step": 97405 + }, + { + "epoch": 1.9818115234375, + "grad_norm": 22.168119430541992, + "learning_rate": 5.0754562381221115e-06, + "loss": 5.0808, + "step": 97410 + }, + { + "epoch": 1.9819132486979165, + "grad_norm": 21.093217849731445, + "learning_rate": 5.075056605946504e-06, + "loss": 5.0351, + "step": 97415 + }, + { + "epoch": 1.9820149739583335, + "grad_norm": 19.39472770690918, + "learning_rate": 5.074656973291309e-06, + "loss": 4.5628, + "step": 97420 + }, + { + "epoch": 1.98211669921875, + "grad_norm": 15.785223007202148, + "learning_rate": 5.074257340159079e-06, + "loss": 4.9866, + "step": 97425 + }, + { + "epoch": 1.9822184244791665, + "grad_norm": 16.89406394958496, + "learning_rate": 5.073857706552368e-06, + "loss": 4.938, + "step": 97430 + }, + { + "epoch": 1.9823201497395835, + "grad_norm": 20.8645076751709, + "learning_rate": 5.0734580724737304e-06, + "loss": 4.7767, + "step": 97435 + }, + { + "epoch": 1.982421875, + "grad_norm": 21.15587615966797, + "learning_rate": 5.073058437925719e-06, + "loss": 4.7456, + "step": 97440 + }, + { + "epoch": 1.9825236002604165, + "grad_norm": 25.78146743774414, + "learning_rate": 5.072658802910887e-06, + "loss": 4.9185, + "step": 97445 + }, + { + "epoch": 1.9826253255208335, + "grad_norm": 16.758771896362305, + "learning_rate": 5.072259167431789e-06, + "loss": 4.6819, + "step": 97450 + }, + { + "epoch": 1.98272705078125, + "grad_norm": 13.516225814819336, + "learning_rate": 5.071859531490975e-06, + "loss": 4.856, + "step": 97455 + }, + { + "epoch": 1.9828287760416665, + "grad_norm": 17.161447525024414, + "learning_rate": 5.071459895091006e-06, + "loss": 5.0165, + "step": 97460 + }, + { + "epoch": 1.9829305013020835, + "grad_norm": 22.312833786010742, + "learning_rate": 5.0710602582344275e-06, + "loss": 4.9718, + "step": 97465 + }, + { + "epoch": 1.9830322265625, + "grad_norm": 19.59856414794922, + "learning_rate": 5.070660620923797e-06, + "loss": 4.7915, + "step": 97470 + }, + { + "epoch": 1.9831339518229165, + "grad_norm": 20.54915428161621, + "learning_rate": 5.070260983161669e-06, + "loss": 4.9175, + "step": 97475 + }, + { + "epoch": 1.9832356770833335, + "grad_norm": 25.829851150512695, + "learning_rate": 5.069861344950595e-06, + "loss": 5.0219, + "step": 97480 + }, + { + "epoch": 1.98333740234375, + "grad_norm": 21.23556137084961, + "learning_rate": 5.069461706293129e-06, + "loss": 4.7697, + "step": 97485 + }, + { + "epoch": 1.9834391276041665, + "grad_norm": 18.344989776611328, + "learning_rate": 5.069062067191824e-06, + "loss": 4.6285, + "step": 97490 + }, + { + "epoch": 1.9835408528645835, + "grad_norm": 19.677265167236328, + "learning_rate": 5.068662427649236e-06, + "loss": 4.629, + "step": 97495 + }, + { + "epoch": 1.983642578125, + "grad_norm": 14.150156021118164, + "learning_rate": 5.068262787667916e-06, + "loss": 5.076, + "step": 97500 + }, + { + "epoch": 1.9837443033854165, + "grad_norm": 21.640750885009766, + "learning_rate": 5.067863147250418e-06, + "loss": 4.6874, + "step": 97505 + }, + { + "epoch": 1.9838460286458335, + "grad_norm": 25.468414306640625, + "learning_rate": 5.067463506399296e-06, + "loss": 5.221, + "step": 97510 + }, + { + "epoch": 1.98394775390625, + "grad_norm": 17.675716400146484, + "learning_rate": 5.067063865117105e-06, + "loss": 4.6891, + "step": 97515 + }, + { + "epoch": 1.9840494791666665, + "grad_norm": 31.031280517578125, + "learning_rate": 5.066664223406397e-06, + "loss": 5.0527, + "step": 97520 + }, + { + "epoch": 1.9841512044270835, + "grad_norm": 21.060302734375, + "learning_rate": 5.0662645812697235e-06, + "loss": 4.9099, + "step": 97525 + }, + { + "epoch": 1.9842529296875, + "grad_norm": 21.586936950683594, + "learning_rate": 5.065864938709642e-06, + "loss": 4.9173, + "step": 97530 + }, + { + "epoch": 1.9843546549479165, + "grad_norm": 23.14491844177246, + "learning_rate": 5.065465295728704e-06, + "loss": 5.0131, + "step": 97535 + }, + { + "epoch": 1.9844563802083335, + "grad_norm": 22.19069480895996, + "learning_rate": 5.065065652329465e-06, + "loss": 4.6416, + "step": 97540 + }, + { + "epoch": 1.98455810546875, + "grad_norm": 16.686201095581055, + "learning_rate": 5.064666008514476e-06, + "loss": 5.0419, + "step": 97545 + }, + { + "epoch": 1.9846598307291665, + "grad_norm": 24.237892150878906, + "learning_rate": 5.064266364286291e-06, + "loss": 5.078, + "step": 97550 + }, + { + "epoch": 1.9847615559895835, + "grad_norm": 16.544025421142578, + "learning_rate": 5.063866719647465e-06, + "loss": 4.8402, + "step": 97555 + }, + { + "epoch": 1.98486328125, + "grad_norm": 17.134763717651367, + "learning_rate": 5.063467074600552e-06, + "loss": 4.8522, + "step": 97560 + }, + { + "epoch": 1.9849650065104165, + "grad_norm": 15.345488548278809, + "learning_rate": 5.063067429148103e-06, + "loss": 4.9057, + "step": 97565 + }, + { + "epoch": 1.9850667317708335, + "grad_norm": 16.19447135925293, + "learning_rate": 5.062667783292674e-06, + "loss": 4.8068, + "step": 97570 + }, + { + "epoch": 1.98516845703125, + "grad_norm": 16.524795532226562, + "learning_rate": 5.062268137036817e-06, + "loss": 4.4662, + "step": 97575 + }, + { + "epoch": 1.9852701822916665, + "grad_norm": 20.748779296875, + "learning_rate": 5.061868490383087e-06, + "loss": 4.8994, + "step": 97580 + }, + { + "epoch": 1.9853719075520835, + "grad_norm": 22.687334060668945, + "learning_rate": 5.061468843334036e-06, + "loss": 5.0301, + "step": 97585 + }, + { + "epoch": 1.9854736328125, + "grad_norm": 15.398686408996582, + "learning_rate": 5.061069195892221e-06, + "loss": 5.1983, + "step": 97590 + }, + { + "epoch": 1.9855753580729165, + "grad_norm": 17.710235595703125, + "learning_rate": 5.060669548060191e-06, + "loss": 4.8814, + "step": 97595 + }, + { + "epoch": 1.9856770833333335, + "grad_norm": 22.56058120727539, + "learning_rate": 5.060269899840502e-06, + "loss": 4.8678, + "step": 97600 + }, + { + "epoch": 1.98577880859375, + "grad_norm": 14.708807945251465, + "learning_rate": 5.0598702512357075e-06, + "loss": 4.9585, + "step": 97605 + }, + { + "epoch": 1.9858805338541665, + "grad_norm": 14.393012046813965, + "learning_rate": 5.059470602248362e-06, + "loss": 4.8875, + "step": 97610 + }, + { + "epoch": 1.9859822591145835, + "grad_norm": 23.99932098388672, + "learning_rate": 5.059070952881018e-06, + "loss": 4.823, + "step": 97615 + }, + { + "epoch": 1.986083984375, + "grad_norm": 24.411815643310547, + "learning_rate": 5.058671303136229e-06, + "loss": 4.9518, + "step": 97620 + }, + { + "epoch": 1.9861857096354165, + "grad_norm": 17.420589447021484, + "learning_rate": 5.05827165301655e-06, + "loss": 4.9752, + "step": 97625 + }, + { + "epoch": 1.9862874348958335, + "grad_norm": 16.83917236328125, + "learning_rate": 5.057872002524532e-06, + "loss": 5.0089, + "step": 97630 + }, + { + "epoch": 1.98638916015625, + "grad_norm": 22.084228515625, + "learning_rate": 5.057472351662731e-06, + "loss": 5.0935, + "step": 97635 + }, + { + "epoch": 1.9864908854166665, + "grad_norm": 16.33029556274414, + "learning_rate": 5.057072700433699e-06, + "loss": 4.745, + "step": 97640 + }, + { + "epoch": 1.9865926106770835, + "grad_norm": 16.652849197387695, + "learning_rate": 5.056673048839993e-06, + "loss": 4.9453, + "step": 97645 + }, + { + "epoch": 1.9866943359375, + "grad_norm": 17.698455810546875, + "learning_rate": 5.0562733968841625e-06, + "loss": 4.9834, + "step": 97650 + }, + { + "epoch": 1.9867960611979165, + "grad_norm": 24.655344009399414, + "learning_rate": 5.055873744568763e-06, + "loss": 5.0588, + "step": 97655 + }, + { + "epoch": 1.9868977864583335, + "grad_norm": 17.756319046020508, + "learning_rate": 5.055474091896348e-06, + "loss": 4.9501, + "step": 97660 + }, + { + "epoch": 1.98699951171875, + "grad_norm": 17.718034744262695, + "learning_rate": 5.055074438869471e-06, + "loss": 4.5436, + "step": 97665 + }, + { + "epoch": 1.9871012369791665, + "grad_norm": 16.33915138244629, + "learning_rate": 5.054674785490687e-06, + "loss": 4.6878, + "step": 97670 + }, + { + "epoch": 1.9872029622395835, + "grad_norm": 20.092735290527344, + "learning_rate": 5.054275131762547e-06, + "loss": 5.2157, + "step": 97675 + }, + { + "epoch": 1.9873046875, + "grad_norm": 18.39425277709961, + "learning_rate": 5.053875477687607e-06, + "loss": 5.0302, + "step": 97680 + }, + { + "epoch": 1.9874064127604165, + "grad_norm": 23.879451751708984, + "learning_rate": 5.05347582326842e-06, + "loss": 5.1715, + "step": 97685 + }, + { + "epoch": 1.9875081380208335, + "grad_norm": 16.20479965209961, + "learning_rate": 5.0530761685075395e-06, + "loss": 4.9203, + "step": 97690 + }, + { + "epoch": 1.98760986328125, + "grad_norm": 19.884963989257812, + "learning_rate": 5.0526765134075175e-06, + "loss": 4.8173, + "step": 97695 + }, + { + "epoch": 1.9877115885416665, + "grad_norm": 14.388839721679688, + "learning_rate": 5.052276857970912e-06, + "loss": 4.8363, + "step": 97700 + }, + { + "epoch": 1.9878133138020835, + "grad_norm": 19.422779083251953, + "learning_rate": 5.051877202200271e-06, + "loss": 4.7313, + "step": 97705 + }, + { + "epoch": 1.9879150390625, + "grad_norm": 15.754204750061035, + "learning_rate": 5.051477546098153e-06, + "loss": 4.536, + "step": 97710 + }, + { + "epoch": 1.9880167643229165, + "grad_norm": 15.974518775939941, + "learning_rate": 5.051077889667108e-06, + "loss": 4.7145, + "step": 97715 + }, + { + "epoch": 1.9881184895833335, + "grad_norm": 21.941200256347656, + "learning_rate": 5.0506782329096935e-06, + "loss": 5.1201, + "step": 97720 + }, + { + "epoch": 1.98822021484375, + "grad_norm": 17.170490264892578, + "learning_rate": 5.05027857582846e-06, + "loss": 4.8783, + "step": 97725 + }, + { + "epoch": 1.9883219401041665, + "grad_norm": 17.05990982055664, + "learning_rate": 5.049878918425963e-06, + "loss": 4.8872, + "step": 97730 + }, + { + "epoch": 1.9884236653645835, + "grad_norm": 18.5360164642334, + "learning_rate": 5.0494792607047545e-06, + "loss": 4.7491, + "step": 97735 + }, + { + "epoch": 1.988525390625, + "grad_norm": 19.44252586364746, + "learning_rate": 5.049079602667392e-06, + "loss": 4.9189, + "step": 97740 + }, + { + "epoch": 1.9886271158854165, + "grad_norm": 22.27772331237793, + "learning_rate": 5.048679944316423e-06, + "loss": 5.0141, + "step": 97745 + }, + { + "epoch": 1.9887288411458335, + "grad_norm": 16.1196346282959, + "learning_rate": 5.048280285654407e-06, + "loss": 4.8963, + "step": 97750 + }, + { + "epoch": 1.98883056640625, + "grad_norm": 18.50126838684082, + "learning_rate": 5.047880626683895e-06, + "loss": 4.7606, + "step": 97755 + }, + { + "epoch": 1.9889322916666665, + "grad_norm": 20.307905197143555, + "learning_rate": 5.04748096740744e-06, + "loss": 4.7979, + "step": 97760 + }, + { + "epoch": 1.9890340169270835, + "grad_norm": 17.93222427368164, + "learning_rate": 5.047081307827597e-06, + "loss": 4.743, + "step": 97765 + }, + { + "epoch": 1.9891357421875, + "grad_norm": 15.119527816772461, + "learning_rate": 5.04668164794692e-06, + "loss": 5.0193, + "step": 97770 + }, + { + "epoch": 1.9892374674479165, + "grad_norm": 17.292558670043945, + "learning_rate": 5.046281987767961e-06, + "loss": 4.9116, + "step": 97775 + }, + { + "epoch": 1.9893391927083335, + "grad_norm": 20.50922203063965, + "learning_rate": 5.045882327293277e-06, + "loss": 5.2097, + "step": 97780 + }, + { + "epoch": 1.98944091796875, + "grad_norm": 19.123613357543945, + "learning_rate": 5.045482666525416e-06, + "loss": 5.2033, + "step": 97785 + }, + { + "epoch": 1.9895426432291665, + "grad_norm": 20.643125534057617, + "learning_rate": 5.045083005466939e-06, + "loss": 4.96, + "step": 97790 + }, + { + "epoch": 1.9896443684895835, + "grad_norm": 16.84971046447754, + "learning_rate": 5.044683344120392e-06, + "loss": 5.0052, + "step": 97795 + }, + { + "epoch": 1.98974609375, + "grad_norm": 14.641027450561523, + "learning_rate": 5.044283682488336e-06, + "loss": 4.7912, + "step": 97800 + }, + { + "epoch": 1.9898478190104165, + "grad_norm": 14.885258674621582, + "learning_rate": 5.043884020573321e-06, + "loss": 5.0616, + "step": 97805 + }, + { + "epoch": 1.9899495442708335, + "grad_norm": 14.960801124572754, + "learning_rate": 5.043484358377899e-06, + "loss": 4.7032, + "step": 97810 + }, + { + "epoch": 1.99005126953125, + "grad_norm": 22.995134353637695, + "learning_rate": 5.043084695904628e-06, + "loss": 4.662, + "step": 97815 + }, + { + "epoch": 1.9901529947916665, + "grad_norm": 18.516498565673828, + "learning_rate": 5.042685033156056e-06, + "loss": 5.0566, + "step": 97820 + }, + { + "epoch": 1.9902547200520835, + "grad_norm": 20.043447494506836, + "learning_rate": 5.042285370134744e-06, + "loss": 4.833, + "step": 97825 + }, + { + "epoch": 1.9903564453125, + "grad_norm": 20.673145294189453, + "learning_rate": 5.04188570684324e-06, + "loss": 4.979, + "step": 97830 + }, + { + "epoch": 1.9904581705729165, + "grad_norm": 25.792360305786133, + "learning_rate": 5.041486043284099e-06, + "loss": 4.7411, + "step": 97835 + }, + { + "epoch": 1.9905598958333335, + "grad_norm": 24.1407470703125, + "learning_rate": 5.041086379459878e-06, + "loss": 4.6819, + "step": 97840 + }, + { + "epoch": 1.99066162109375, + "grad_norm": 16.13352394104004, + "learning_rate": 5.040686715373126e-06, + "loss": 4.9681, + "step": 97845 + }, + { + "epoch": 1.9907633463541665, + "grad_norm": 26.732044219970703, + "learning_rate": 5.040287051026399e-06, + "loss": 4.5173, + "step": 97850 + }, + { + "epoch": 1.9908650716145835, + "grad_norm": 14.835238456726074, + "learning_rate": 5.03988738642225e-06, + "loss": 4.9128, + "step": 97855 + }, + { + "epoch": 1.990966796875, + "grad_norm": 14.3163480758667, + "learning_rate": 5.039487721563234e-06, + "loss": 4.8689, + "step": 97860 + }, + { + "epoch": 1.9910685221354165, + "grad_norm": 25.666030883789062, + "learning_rate": 5.039088056451904e-06, + "loss": 5.1646, + "step": 97865 + }, + { + "epoch": 1.9911702473958335, + "grad_norm": 17.768918991088867, + "learning_rate": 5.0386883910908134e-06, + "loss": 4.8861, + "step": 97870 + }, + { + "epoch": 1.99127197265625, + "grad_norm": 20.814393997192383, + "learning_rate": 5.038288725482517e-06, + "loss": 4.9079, + "step": 97875 + }, + { + "epoch": 1.9913736979166665, + "grad_norm": 12.725703239440918, + "learning_rate": 5.037889059629567e-06, + "loss": 4.9656, + "step": 97880 + }, + { + "epoch": 1.9914754231770835, + "grad_norm": 18.871217727661133, + "learning_rate": 5.037489393534518e-06, + "loss": 4.7446, + "step": 97885 + }, + { + "epoch": 1.9915771484375, + "grad_norm": 13.562034606933594, + "learning_rate": 5.0370897271999245e-06, + "loss": 5.1414, + "step": 97890 + }, + { + "epoch": 1.9916788736979165, + "grad_norm": 27.637441635131836, + "learning_rate": 5.0366900606283396e-06, + "loss": 4.7441, + "step": 97895 + }, + { + "epoch": 1.9917805989583335, + "grad_norm": 18.451805114746094, + "learning_rate": 5.036290393822315e-06, + "loss": 4.9855, + "step": 97900 + }, + { + "epoch": 1.99188232421875, + "grad_norm": 18.79793930053711, + "learning_rate": 5.035890726784408e-06, + "loss": 4.7895, + "step": 97905 + }, + { + "epoch": 1.9919840494791665, + "grad_norm": 26.245603561401367, + "learning_rate": 5.035491059517171e-06, + "loss": 5.2091, + "step": 97910 + }, + { + "epoch": 1.9920857747395835, + "grad_norm": 18.494348526000977, + "learning_rate": 5.035091392023156e-06, + "loss": 4.8996, + "step": 97915 + }, + { + "epoch": 1.9921875, + "grad_norm": 15.313919067382812, + "learning_rate": 5.034691724304919e-06, + "loss": 4.8698, + "step": 97920 + }, + { + "epoch": 1.9922892252604165, + "grad_norm": 15.603528022766113, + "learning_rate": 5.034292056365012e-06, + "loss": 4.927, + "step": 97925 + }, + { + "epoch": 1.9923909505208335, + "grad_norm": 19.54099464416504, + "learning_rate": 5.033892388205991e-06, + "loss": 4.6669, + "step": 97930 + }, + { + "epoch": 1.99249267578125, + "grad_norm": 14.009140014648438, + "learning_rate": 5.033492719830407e-06, + "loss": 4.86, + "step": 97935 + }, + { + "epoch": 1.9925944010416665, + "grad_norm": 22.550893783569336, + "learning_rate": 5.033093051240816e-06, + "loss": 4.9788, + "step": 97940 + }, + { + "epoch": 1.9926961263020835, + "grad_norm": 18.96280860900879, + "learning_rate": 5.03269338243977e-06, + "loss": 4.7815, + "step": 97945 + }, + { + "epoch": 1.9927978515625, + "grad_norm": 20.780656814575195, + "learning_rate": 5.032293713429825e-06, + "loss": 4.643, + "step": 97950 + }, + { + "epoch": 1.9928995768229165, + "grad_norm": 19.377647399902344, + "learning_rate": 5.031894044213534e-06, + "loss": 4.9, + "step": 97955 + }, + { + "epoch": 1.9930013020833335, + "grad_norm": 25.21828269958496, + "learning_rate": 5.031494374793449e-06, + "loss": 4.9094, + "step": 97960 + }, + { + "epoch": 1.99310302734375, + "grad_norm": 22.70151710510254, + "learning_rate": 5.031094705172124e-06, + "loss": 4.8155, + "step": 97965 + }, + { + "epoch": 1.9932047526041665, + "grad_norm": 16.458229064941406, + "learning_rate": 5.030695035352115e-06, + "loss": 4.8895, + "step": 97970 + }, + { + "epoch": 1.9933064778645835, + "grad_norm": 20.615015029907227, + "learning_rate": 5.030295365335973e-06, + "loss": 4.7657, + "step": 97975 + }, + { + "epoch": 1.993408203125, + "grad_norm": 15.373326301574707, + "learning_rate": 5.0298956951262546e-06, + "loss": 5.0, + "step": 97980 + }, + { + "epoch": 1.9935099283854165, + "grad_norm": 23.33922004699707, + "learning_rate": 5.029496024725512e-06, + "loss": 4.9118, + "step": 97985 + }, + { + "epoch": 1.9936116536458335, + "grad_norm": 16.850866317749023, + "learning_rate": 5.029096354136299e-06, + "loss": 5.0992, + "step": 97990 + }, + { + "epoch": 1.99371337890625, + "grad_norm": 21.968994140625, + "learning_rate": 5.0286966833611695e-06, + "loss": 4.9635, + "step": 97995 + }, + { + "epoch": 1.9938151041666665, + "grad_norm": 21.484580993652344, + "learning_rate": 5.028297012402677e-06, + "loss": 4.6524, + "step": 98000 + }, + { + "epoch": 1.9939168294270835, + "grad_norm": 27.1357364654541, + "learning_rate": 5.027897341263376e-06, + "loss": 4.9595, + "step": 98005 + }, + { + "epoch": 1.9940185546875, + "grad_norm": 18.506515502929688, + "learning_rate": 5.02749766994582e-06, + "loss": 4.8823, + "step": 98010 + }, + { + "epoch": 1.9941202799479165, + "grad_norm": 18.576580047607422, + "learning_rate": 5.0270979984525615e-06, + "loss": 4.9204, + "step": 98015 + }, + { + "epoch": 1.9942220052083335, + "grad_norm": 15.947397232055664, + "learning_rate": 5.026698326786157e-06, + "loss": 4.9314, + "step": 98020 + }, + { + "epoch": 1.99432373046875, + "grad_norm": 23.42961883544922, + "learning_rate": 5.026298654949159e-06, + "loss": 4.8613, + "step": 98025 + }, + { + "epoch": 1.9944254557291665, + "grad_norm": 16.08198356628418, + "learning_rate": 5.0258989829441185e-06, + "loss": 5.007, + "step": 98030 + }, + { + "epoch": 1.9945271809895835, + "grad_norm": 18.26337242126465, + "learning_rate": 5.025499310773594e-06, + "loss": 4.8299, + "step": 98035 + }, + { + "epoch": 1.99462890625, + "grad_norm": 14.883487701416016, + "learning_rate": 5.025099638440135e-06, + "loss": 4.8031, + "step": 98040 + }, + { + "epoch": 1.9947306315104165, + "grad_norm": 21.34291648864746, + "learning_rate": 5.024699965946299e-06, + "loss": 5.0092, + "step": 98045 + }, + { + "epoch": 1.9948323567708335, + "grad_norm": 18.131103515625, + "learning_rate": 5.024300293294638e-06, + "loss": 5.0072, + "step": 98050 + }, + { + "epoch": 1.99493408203125, + "grad_norm": 16.015655517578125, + "learning_rate": 5.023900620487705e-06, + "loss": 4.8339, + "step": 98055 + }, + { + "epoch": 1.9950358072916665, + "grad_norm": 15.477343559265137, + "learning_rate": 5.023500947528055e-06, + "loss": 4.8825, + "step": 98060 + }, + { + "epoch": 1.9951375325520835, + "grad_norm": 15.599652290344238, + "learning_rate": 5.0231012744182416e-06, + "loss": 4.9848, + "step": 98065 + }, + { + "epoch": 1.9952392578125, + "grad_norm": 21.963680267333984, + "learning_rate": 5.022701601160817e-06, + "loss": 4.8768, + "step": 98070 + }, + { + "epoch": 1.9953409830729165, + "grad_norm": 18.13705062866211, + "learning_rate": 5.0223019277583385e-06, + "loss": 4.7322, + "step": 98075 + }, + { + "epoch": 1.9954427083333335, + "grad_norm": 18.454673767089844, + "learning_rate": 5.021902254213357e-06, + "loss": 4.887, + "step": 98080 + }, + { + "epoch": 1.99554443359375, + "grad_norm": 17.32974624633789, + "learning_rate": 5.021502580528427e-06, + "loss": 4.7422, + "step": 98085 + }, + { + "epoch": 1.9956461588541665, + "grad_norm": 22.689125061035156, + "learning_rate": 5.021102906706102e-06, + "loss": 4.8269, + "step": 98090 + }, + { + "epoch": 1.9957478841145835, + "grad_norm": 13.468666076660156, + "learning_rate": 5.0207032327489354e-06, + "loss": 5.0184, + "step": 98095 + }, + { + "epoch": 1.995849609375, + "grad_norm": 22.04236602783203, + "learning_rate": 5.020303558659484e-06, + "loss": 4.6857, + "step": 98100 + }, + { + "epoch": 1.9959513346354165, + "grad_norm": 14.267518997192383, + "learning_rate": 5.019903884440297e-06, + "loss": 4.9913, + "step": 98105 + }, + { + "epoch": 1.9960530598958335, + "grad_norm": 22.619291305541992, + "learning_rate": 5.0195042100939315e-06, + "loss": 4.9072, + "step": 98110 + }, + { + "epoch": 1.99615478515625, + "grad_norm": 18.982973098754883, + "learning_rate": 5.019104535622941e-06, + "loss": 4.8675, + "step": 98115 + }, + { + "epoch": 1.9962565104166665, + "grad_norm": 16.957195281982422, + "learning_rate": 5.018704861029877e-06, + "loss": 4.8749, + "step": 98120 + }, + { + "epoch": 1.9963582356770835, + "grad_norm": 17.46567153930664, + "learning_rate": 5.018305186317295e-06, + "loss": 4.8271, + "step": 98125 + }, + { + "epoch": 1.9964599609375, + "grad_norm": 14.984724044799805, + "learning_rate": 5.0179055114877496e-06, + "loss": 4.5597, + "step": 98130 + }, + { + "epoch": 1.9965616861979165, + "grad_norm": 24.027835845947266, + "learning_rate": 5.017505836543794e-06, + "loss": 4.9223, + "step": 98135 + }, + { + "epoch": 1.9966634114583335, + "grad_norm": 19.757869720458984, + "learning_rate": 5.017106161487981e-06, + "loss": 5.0229, + "step": 98140 + }, + { + "epoch": 1.99676513671875, + "grad_norm": 27.822683334350586, + "learning_rate": 5.016706486322864e-06, + "loss": 5.1659, + "step": 98145 + }, + { + "epoch": 1.9968668619791665, + "grad_norm": 20.091157913208008, + "learning_rate": 5.016306811050999e-06, + "loss": 4.9111, + "step": 98150 + }, + { + "epoch": 1.9969685872395835, + "grad_norm": 16.60602378845215, + "learning_rate": 5.01590713567494e-06, + "loss": 4.8808, + "step": 98155 + }, + { + "epoch": 1.9970703125, + "grad_norm": 18.16621208190918, + "learning_rate": 5.015507460197238e-06, + "loss": 4.6662, + "step": 98160 + }, + { + "epoch": 1.9971720377604165, + "grad_norm": 21.604167938232422, + "learning_rate": 5.015107784620447e-06, + "loss": 4.7974, + "step": 98165 + }, + { + "epoch": 1.9972737630208335, + "grad_norm": 13.338696479797363, + "learning_rate": 5.014708108947124e-06, + "loss": 5.0485, + "step": 98170 + }, + { + "epoch": 1.99737548828125, + "grad_norm": 16.06934928894043, + "learning_rate": 5.014308433179821e-06, + "loss": 5.1654, + "step": 98175 + }, + { + "epoch": 1.9974772135416665, + "grad_norm": 19.142379760742188, + "learning_rate": 5.0139087573210906e-06, + "loss": 4.6632, + "step": 98180 + }, + { + "epoch": 1.9975789388020835, + "grad_norm": 17.143817901611328, + "learning_rate": 5.013509081373488e-06, + "loss": 4.9266, + "step": 98185 + }, + { + "epoch": 1.9976806640625, + "grad_norm": 17.647777557373047, + "learning_rate": 5.013109405339567e-06, + "loss": 5.1296, + "step": 98190 + }, + { + "epoch": 1.9977823893229165, + "grad_norm": 14.751250267028809, + "learning_rate": 5.01270972922188e-06, + "loss": 4.8979, + "step": 98195 + }, + { + "epoch": 1.9978841145833335, + "grad_norm": 15.886700630187988, + "learning_rate": 5.012310053022983e-06, + "loss": 4.9169, + "step": 98200 + }, + { + "epoch": 1.99798583984375, + "grad_norm": 18.014860153198242, + "learning_rate": 5.011910376745428e-06, + "loss": 4.7845, + "step": 98205 + }, + { + "epoch": 1.9980875651041665, + "grad_norm": 16.410654067993164, + "learning_rate": 5.011510700391769e-06, + "loss": 4.9607, + "step": 98210 + }, + { + "epoch": 1.9981892903645835, + "grad_norm": 14.066431999206543, + "learning_rate": 5.011111023964561e-06, + "loss": 4.742, + "step": 98215 + }, + { + "epoch": 1.998291015625, + "grad_norm": 20.137714385986328, + "learning_rate": 5.0107113474663585e-06, + "loss": 5.2097, + "step": 98220 + }, + { + "epoch": 1.9983927408854165, + "grad_norm": 26.378929138183594, + "learning_rate": 5.010311670899711e-06, + "loss": 4.8879, + "step": 98225 + }, + { + "epoch": 1.9984944661458335, + "grad_norm": 18.360305786132812, + "learning_rate": 5.009911994267177e-06, + "loss": 4.9734, + "step": 98230 + }, + { + "epoch": 1.99859619140625, + "grad_norm": 17.991952896118164, + "learning_rate": 5.009512317571308e-06, + "loss": 4.8458, + "step": 98235 + }, + { + "epoch": 1.9986979166666665, + "grad_norm": 20.470409393310547, + "learning_rate": 5.009112640814659e-06, + "loss": 5.0146, + "step": 98240 + }, + { + "epoch": 1.9987996419270835, + "grad_norm": 18.57363510131836, + "learning_rate": 5.008712963999782e-06, + "loss": 4.8536, + "step": 98245 + }, + { + "epoch": 1.9989013671875, + "grad_norm": 15.87736988067627, + "learning_rate": 5.008313287129233e-06, + "loss": 4.8305, + "step": 98250 + }, + { + "epoch": 1.9990030924479165, + "grad_norm": 17.7689151763916, + "learning_rate": 5.007913610205565e-06, + "loss": 4.8896, + "step": 98255 + }, + { + "epoch": 1.9991048177083335, + "grad_norm": 15.8595552444458, + "learning_rate": 5.007513933231329e-06, + "loss": 4.9191, + "step": 98260 + }, + { + "epoch": 1.99920654296875, + "grad_norm": 17.02433967590332, + "learning_rate": 5.007114256209082e-06, + "loss": 4.7819, + "step": 98265 + }, + { + "epoch": 1.9993082682291665, + "grad_norm": 13.888455390930176, + "learning_rate": 5.0067145791413794e-06, + "loss": 4.7668, + "step": 98270 + }, + { + "epoch": 1.9994099934895835, + "grad_norm": 14.434595108032227, + "learning_rate": 5.006314902030771e-06, + "loss": 5.0287, + "step": 98275 + }, + { + "epoch": 1.99951171875, + "grad_norm": 22.09933853149414, + "learning_rate": 5.005915224879814e-06, + "loss": 4.9537, + "step": 98280 + }, + { + "epoch": 1.9996134440104165, + "grad_norm": 18.06040382385254, + "learning_rate": 5.00551554769106e-06, + "loss": 5.3243, + "step": 98285 + }, + { + "epoch": 1.9997151692708335, + "grad_norm": 18.011817932128906, + "learning_rate": 5.005115870467062e-06, + "loss": 4.9501, + "step": 98290 + }, + { + "epoch": 1.99981689453125, + "grad_norm": 17.80963897705078, + "learning_rate": 5.004716193210377e-06, + "loss": 4.969, + "step": 98295 + }, + { + "epoch": 1.9999186197916665, + "grad_norm": 17.48598861694336, + "learning_rate": 5.004316515923555e-06, + "loss": 4.6513, + "step": 98300 + }, + { + "epoch": 2.0, + "eval_loss": 4.924122333526611, + "eval_runtime": 106.9585, + "eval_samples_per_second": 18.764, + "eval_steps_per_second": 9.387, + "step": 98304 + }, + { + "epoch": 2.0000203450520835, + "grad_norm": 17.679851531982422, + "learning_rate": 5.003916838609154e-06, + "loss": 4.7238, + "step": 98305 + }, + { + "epoch": 2.0001220703125, + "grad_norm": 13.430336952209473, + "learning_rate": 5.003517161269725e-06, + "loss": 4.8008, + "step": 98310 + }, + { + "epoch": 2.0002237955729165, + "grad_norm": 16.66043472290039, + "learning_rate": 5.0031174839078215e-06, + "loss": 4.8921, + "step": 98315 + }, + { + "epoch": 2.0003255208333335, + "grad_norm": 19.53813934326172, + "learning_rate": 5.002717806526e-06, + "loss": 4.8973, + "step": 98320 + }, + { + "epoch": 2.00042724609375, + "grad_norm": 14.663487434387207, + "learning_rate": 5.0023181291268105e-06, + "loss": 4.6374, + "step": 98325 + }, + { + "epoch": 2.0005289713541665, + "grad_norm": 18.16403579711914, + "learning_rate": 5.001918451712811e-06, + "loss": 4.6374, + "step": 98330 + }, + { + "epoch": 2.0006306966145835, + "grad_norm": 20.019926071166992, + "learning_rate": 5.001518774286552e-06, + "loss": 5.3531, + "step": 98335 + }, + { + "epoch": 2.000732421875, + "grad_norm": 14.984407424926758, + "learning_rate": 5.001119096850587e-06, + "loss": 4.8674, + "step": 98340 + }, + { + "epoch": 2.0008341471354165, + "grad_norm": 14.78381061553955, + "learning_rate": 5.000719419407474e-06, + "loss": 4.8127, + "step": 98345 + }, + { + "epoch": 2.0009358723958335, + "grad_norm": 16.782155990600586, + "learning_rate": 5.0003197419597634e-06, + "loss": 4.6921, + "step": 98350 + }, + { + "epoch": 2.00103759765625, + "grad_norm": 16.3575496673584, + "learning_rate": 4.9999200645100105e-06, + "loss": 4.9778, + "step": 98355 + }, + { + "epoch": 2.0011393229166665, + "grad_norm": 14.416019439697266, + "learning_rate": 4.999520387060766e-06, + "loss": 4.7698, + "step": 98360 + }, + { + "epoch": 2.0012410481770835, + "grad_norm": 17.579505920410156, + "learning_rate": 4.999120709614587e-06, + "loss": 5.1354, + "step": 98365 + }, + { + "epoch": 2.0013427734375, + "grad_norm": 20.05704689025879, + "learning_rate": 4.998721032174027e-06, + "loss": 4.7777, + "step": 98370 + }, + { + "epoch": 2.0014444986979165, + "grad_norm": 23.46247673034668, + "learning_rate": 4.998321354741639e-06, + "loss": 5.2923, + "step": 98375 + }, + { + "epoch": 2.0015462239583335, + "grad_norm": 17.76083755493164, + "learning_rate": 4.9979216773199775e-06, + "loss": 4.9317, + "step": 98380 + }, + { + "epoch": 2.00164794921875, + "grad_norm": 16.93878936767578, + "learning_rate": 4.9975219999115945e-06, + "loss": 4.6746, + "step": 98385 + }, + { + "epoch": 2.0017496744791665, + "grad_norm": 19.87816619873047, + "learning_rate": 4.997122322519046e-06, + "loss": 5.0899, + "step": 98390 + }, + { + "epoch": 2.0018513997395835, + "grad_norm": 16.13589096069336, + "learning_rate": 4.996722645144886e-06, + "loss": 4.924, + "step": 98395 + }, + { + "epoch": 2.001953125, + "grad_norm": 18.180030822753906, + "learning_rate": 4.996322967791665e-06, + "loss": 4.8806, + "step": 98400 + }, + { + "epoch": 2.0020548502604165, + "grad_norm": 15.828471183776855, + "learning_rate": 4.995923290461941e-06, + "loss": 5.1617, + "step": 98405 + }, + { + "epoch": 2.0021565755208335, + "grad_norm": 25.506126403808594, + "learning_rate": 4.995523613158264e-06, + "loss": 4.8654, + "step": 98410 + }, + { + "epoch": 2.00225830078125, + "grad_norm": 22.762184143066406, + "learning_rate": 4.99512393588319e-06, + "loss": 4.7641, + "step": 98415 + }, + { + "epoch": 2.0023600260416665, + "grad_norm": 19.60887908935547, + "learning_rate": 4.994724258639274e-06, + "loss": 4.8806, + "step": 98420 + }, + { + "epoch": 2.0024617513020835, + "grad_norm": 20.303943634033203, + "learning_rate": 4.994324581429068e-06, + "loss": 4.8181, + "step": 98425 + }, + { + "epoch": 2.0025634765625, + "grad_norm": 21.047107696533203, + "learning_rate": 4.993924904255127e-06, + "loss": 4.9471, + "step": 98430 + }, + { + "epoch": 2.0026652018229165, + "grad_norm": 18.886077880859375, + "learning_rate": 4.99352522712e-06, + "loss": 5.1539, + "step": 98435 + }, + { + "epoch": 2.0027669270833335, + "grad_norm": 18.583974838256836, + "learning_rate": 4.993125550026248e-06, + "loss": 4.6774, + "step": 98440 + }, + { + "epoch": 2.00286865234375, + "grad_norm": 19.455551147460938, + "learning_rate": 4.992725872976422e-06, + "loss": 5.0747, + "step": 98445 + }, + { + "epoch": 2.0029703776041665, + "grad_norm": 19.199907302856445, + "learning_rate": 4.992326195973074e-06, + "loss": 5.0049, + "step": 98450 + }, + { + "epoch": 2.0030721028645835, + "grad_norm": 15.257678031921387, + "learning_rate": 4.99192651901876e-06, + "loss": 4.7808, + "step": 98455 + }, + { + "epoch": 2.003173828125, + "grad_norm": 14.488493919372559, + "learning_rate": 4.991526842116033e-06, + "loss": 4.7791, + "step": 98460 + }, + { + "epoch": 2.0032755533854165, + "grad_norm": 29.45501708984375, + "learning_rate": 4.9911271652674444e-06, + "loss": 5.08, + "step": 98465 + }, + { + "epoch": 2.0033772786458335, + "grad_norm": 16.756189346313477, + "learning_rate": 4.990727488475553e-06, + "loss": 4.6874, + "step": 98470 + }, + { + "epoch": 2.00347900390625, + "grad_norm": 16.057559967041016, + "learning_rate": 4.990327811742909e-06, + "loss": 4.9756, + "step": 98475 + }, + { + "epoch": 2.0035807291666665, + "grad_norm": 14.89709758758545, + "learning_rate": 4.9899281350720685e-06, + "loss": 4.7018, + "step": 98480 + }, + { + "epoch": 2.0036824544270835, + "grad_norm": 15.05294418334961, + "learning_rate": 4.9895284584655835e-06, + "loss": 4.9859, + "step": 98485 + }, + { + "epoch": 2.0037841796875, + "grad_norm": 26.292816162109375, + "learning_rate": 4.9891287819260056e-06, + "loss": 4.9487, + "step": 98490 + }, + { + "epoch": 2.0038859049479165, + "grad_norm": 19.62627601623535, + "learning_rate": 4.9887291054558945e-06, + "loss": 4.7383, + "step": 98495 + }, + { + "epoch": 2.0039876302083335, + "grad_norm": 12.689364433288574, + "learning_rate": 4.9883294290578e-06, + "loss": 4.7743, + "step": 98500 + }, + { + "epoch": 2.00408935546875, + "grad_norm": 30.95269012451172, + "learning_rate": 4.987929752734277e-06, + "loss": 4.9932, + "step": 98505 + }, + { + "epoch": 2.0041910807291665, + "grad_norm": 13.04053783416748, + "learning_rate": 4.987530076487878e-06, + "loss": 4.857, + "step": 98510 + }, + { + "epoch": 2.0042928059895835, + "grad_norm": 20.899269104003906, + "learning_rate": 4.987130400321158e-06, + "loss": 4.8557, + "step": 98515 + }, + { + "epoch": 2.00439453125, + "grad_norm": 19.487693786621094, + "learning_rate": 4.986730724236672e-06, + "loss": 4.8811, + "step": 98520 + }, + { + "epoch": 2.0044962565104165, + "grad_norm": 20.97127342224121, + "learning_rate": 4.986331048236972e-06, + "loss": 4.7055, + "step": 98525 + }, + { + "epoch": 2.0045979817708335, + "grad_norm": 19.734460830688477, + "learning_rate": 4.985931372324611e-06, + "loss": 4.8318, + "step": 98530 + }, + { + "epoch": 2.00469970703125, + "grad_norm": 19.22087860107422, + "learning_rate": 4.9855316965021454e-06, + "loss": 4.824, + "step": 98535 + }, + { + "epoch": 2.0048014322916665, + "grad_norm": 16.418819427490234, + "learning_rate": 4.985132020772127e-06, + "loss": 4.8491, + "step": 98540 + }, + { + "epoch": 2.0049031575520835, + "grad_norm": 16.434091567993164, + "learning_rate": 4.984732345137109e-06, + "loss": 4.657, + "step": 98545 + }, + { + "epoch": 2.0050048828125, + "grad_norm": 15.657124519348145, + "learning_rate": 4.984332669599649e-06, + "loss": 4.5751, + "step": 98550 + }, + { + "epoch": 2.0051066080729165, + "grad_norm": 15.500308990478516, + "learning_rate": 4.983932994162297e-06, + "loss": 4.8765, + "step": 98555 + }, + { + "epoch": 2.0052083333333335, + "grad_norm": 15.724743843078613, + "learning_rate": 4.983533318827609e-06, + "loss": 4.8853, + "step": 98560 + }, + { + "epoch": 2.00531005859375, + "grad_norm": 14.869078636169434, + "learning_rate": 4.983133643598136e-06, + "loss": 4.6538, + "step": 98565 + }, + { + "epoch": 2.0054117838541665, + "grad_norm": 22.423349380493164, + "learning_rate": 4.982733968476435e-06, + "loss": 5.1445, + "step": 98570 + }, + { + "epoch": 2.0055135091145835, + "grad_norm": 16.84715461730957, + "learning_rate": 4.9823342934650595e-06, + "loss": 4.7914, + "step": 98575 + }, + { + "epoch": 2.005615234375, + "grad_norm": 18.22833824157715, + "learning_rate": 4.981934618566561e-06, + "loss": 4.6678, + "step": 98580 + }, + { + "epoch": 2.0057169596354165, + "grad_norm": 21.786283493041992, + "learning_rate": 4.981534943783495e-06, + "loss": 4.7565, + "step": 98585 + }, + { + "epoch": 2.0058186848958335, + "grad_norm": 20.581575393676758, + "learning_rate": 4.981135269118413e-06, + "loss": 4.9031, + "step": 98590 + }, + { + "epoch": 2.00592041015625, + "grad_norm": 14.29866886138916, + "learning_rate": 4.980735594573872e-06, + "loss": 4.9325, + "step": 98595 + }, + { + "epoch": 2.0060221354166665, + "grad_norm": 15.494755744934082, + "learning_rate": 4.9803359201524255e-06, + "loss": 4.886, + "step": 98600 + }, + { + "epoch": 2.0061238606770835, + "grad_norm": 18.487825393676758, + "learning_rate": 4.979936245856625e-06, + "loss": 4.8855, + "step": 98605 + }, + { + "epoch": 2.0062255859375, + "grad_norm": 19.22628402709961, + "learning_rate": 4.979536571689027e-06, + "loss": 4.7383, + "step": 98610 + }, + { + "epoch": 2.0063273111979165, + "grad_norm": 16.728286743164062, + "learning_rate": 4.979136897652182e-06, + "loss": 4.8663, + "step": 98615 + }, + { + "epoch": 2.0064290364583335, + "grad_norm": 12.964844703674316, + "learning_rate": 4.978737223748645e-06, + "loss": 4.5922, + "step": 98620 + }, + { + "epoch": 2.00653076171875, + "grad_norm": 15.20476245880127, + "learning_rate": 4.9783375499809726e-06, + "loss": 5.0653, + "step": 98625 + }, + { + "epoch": 2.0066324869791665, + "grad_norm": 16.50149917602539, + "learning_rate": 4.977937876351715e-06, + "loss": 4.7611, + "step": 98630 + }, + { + "epoch": 2.0067342122395835, + "grad_norm": 17.17848777770996, + "learning_rate": 4.977538202863429e-06, + "loss": 4.7484, + "step": 98635 + }, + { + "epoch": 2.0068359375, + "grad_norm": 19.690040588378906, + "learning_rate": 4.977138529518665e-06, + "loss": 4.802, + "step": 98640 + }, + { + "epoch": 2.0069376627604165, + "grad_norm": 17.326641082763672, + "learning_rate": 4.976738856319978e-06, + "loss": 4.6038, + "step": 98645 + }, + { + "epoch": 2.0070393880208335, + "grad_norm": 22.746366500854492, + "learning_rate": 4.976339183269924e-06, + "loss": 5.0463, + "step": 98650 + }, + { + "epoch": 2.00714111328125, + "grad_norm": 16.649354934692383, + "learning_rate": 4.975939510371054e-06, + "loss": 4.7789, + "step": 98655 + }, + { + "epoch": 2.0072428385416665, + "grad_norm": 15.796504974365234, + "learning_rate": 4.975539837625923e-06, + "loss": 4.7043, + "step": 98660 + }, + { + "epoch": 2.0073445638020835, + "grad_norm": 16.914379119873047, + "learning_rate": 4.9751401650370856e-06, + "loss": 4.8867, + "step": 98665 + }, + { + "epoch": 2.0074462890625, + "grad_norm": 24.346403121948242, + "learning_rate": 4.974740492607094e-06, + "loss": 4.7278, + "step": 98670 + }, + { + "epoch": 2.0075480143229165, + "grad_norm": 14.43387508392334, + "learning_rate": 4.974340820338502e-06, + "loss": 4.8739, + "step": 98675 + }, + { + "epoch": 2.0076497395833335, + "grad_norm": 18.044221878051758, + "learning_rate": 4.9739411482338655e-06, + "loss": 5.0657, + "step": 98680 + }, + { + "epoch": 2.00775146484375, + "grad_norm": 19.774871826171875, + "learning_rate": 4.973541476295736e-06, + "loss": 4.7678, + "step": 98685 + }, + { + "epoch": 2.0078531901041665, + "grad_norm": 15.930593490600586, + "learning_rate": 4.9731418045266685e-06, + "loss": 4.7923, + "step": 98690 + }, + { + "epoch": 2.0079549153645835, + "grad_norm": 18.449771881103516, + "learning_rate": 4.972742132929214e-06, + "loss": 4.8691, + "step": 98695 + }, + { + "epoch": 2.008056640625, + "grad_norm": 19.864883422851562, + "learning_rate": 4.972342461505931e-06, + "loss": 4.9887, + "step": 98700 + }, + { + "epoch": 2.0081583658854165, + "grad_norm": 15.432377815246582, + "learning_rate": 4.971942790259371e-06, + "loss": 4.634, + "step": 98705 + }, + { + "epoch": 2.0082600911458335, + "grad_norm": 18.153261184692383, + "learning_rate": 4.971543119192088e-06, + "loss": 5.0764, + "step": 98710 + }, + { + "epoch": 2.00836181640625, + "grad_norm": 21.37388038635254, + "learning_rate": 4.971143448306634e-06, + "loss": 4.9292, + "step": 98715 + }, + { + "epoch": 2.0084635416666665, + "grad_norm": 15.681013107299805, + "learning_rate": 4.970743777605564e-06, + "loss": 5.0256, + "step": 98720 + }, + { + "epoch": 2.0085652669270835, + "grad_norm": 18.62518882751465, + "learning_rate": 4.970344107091433e-06, + "loss": 4.6775, + "step": 98725 + }, + { + "epoch": 2.0086669921875, + "grad_norm": 13.43993854522705, + "learning_rate": 4.969944436766793e-06, + "loss": 4.8988, + "step": 98730 + }, + { + "epoch": 2.0087687174479165, + "grad_norm": 17.404767990112305, + "learning_rate": 4.9695447666342e-06, + "loss": 4.7346, + "step": 98735 + }, + { + "epoch": 2.0088704427083335, + "grad_norm": 17.668649673461914, + "learning_rate": 4.969145096696206e-06, + "loss": 4.8438, + "step": 98740 + }, + { + "epoch": 2.00897216796875, + "grad_norm": 17.932897567749023, + "learning_rate": 4.968745426955362e-06, + "loss": 4.6312, + "step": 98745 + }, + { + "epoch": 2.0090738932291665, + "grad_norm": 18.895729064941406, + "learning_rate": 4.968345757414227e-06, + "loss": 4.7487, + "step": 98750 + }, + { + "epoch": 2.0091756184895835, + "grad_norm": 20.250276565551758, + "learning_rate": 4.9679460880753535e-06, + "loss": 4.856, + "step": 98755 + }, + { + "epoch": 2.00927734375, + "grad_norm": 18.455232620239258, + "learning_rate": 4.967546418941294e-06, + "loss": 4.7899, + "step": 98760 + }, + { + "epoch": 2.0093790690104165, + "grad_norm": 18.4439640045166, + "learning_rate": 4.967146750014603e-06, + "loss": 4.7891, + "step": 98765 + }, + { + "epoch": 2.0094807942708335, + "grad_norm": 19.817771911621094, + "learning_rate": 4.966747081297832e-06, + "loss": 5.0482, + "step": 98770 + }, + { + "epoch": 2.00958251953125, + "grad_norm": 20.01764488220215, + "learning_rate": 4.966347412793537e-06, + "loss": 4.7856, + "step": 98775 + }, + { + "epoch": 2.0096842447916665, + "grad_norm": 17.937633514404297, + "learning_rate": 4.965947744504272e-06, + "loss": 4.8704, + "step": 98780 + }, + { + "epoch": 2.0097859700520835, + "grad_norm": 18.81884002685547, + "learning_rate": 4.96554807643259e-06, + "loss": 4.8408, + "step": 98785 + }, + { + "epoch": 2.0098876953125, + "grad_norm": 17.577674865722656, + "learning_rate": 4.965148408581044e-06, + "loss": 5.085, + "step": 98790 + }, + { + "epoch": 2.0099894205729165, + "grad_norm": 18.52178382873535, + "learning_rate": 4.96474874095219e-06, + "loss": 4.8922, + "step": 98795 + }, + { + "epoch": 2.0100911458333335, + "grad_norm": 21.821813583374023, + "learning_rate": 4.96434907354858e-06, + "loss": 5.0303, + "step": 98800 + }, + { + "epoch": 2.01019287109375, + "grad_norm": 21.715232849121094, + "learning_rate": 4.963949406372768e-06, + "loss": 4.7588, + "step": 98805 + }, + { + "epoch": 2.0102945963541665, + "grad_norm": 14.559025764465332, + "learning_rate": 4.963549739427306e-06, + "loss": 4.7058, + "step": 98810 + }, + { + "epoch": 2.0103963216145835, + "grad_norm": 21.477014541625977, + "learning_rate": 4.963150072714752e-06, + "loss": 4.7677, + "step": 98815 + }, + { + "epoch": 2.010498046875, + "grad_norm": 22.006454467773438, + "learning_rate": 4.962750406237657e-06, + "loss": 4.9018, + "step": 98820 + }, + { + "epoch": 2.0105997721354165, + "grad_norm": 17.572721481323242, + "learning_rate": 4.962350739998573e-06, + "loss": 4.8621, + "step": 98825 + }, + { + "epoch": 2.0107014973958335, + "grad_norm": 21.09674072265625, + "learning_rate": 4.961951074000058e-06, + "loss": 5.1469, + "step": 98830 + }, + { + "epoch": 2.01080322265625, + "grad_norm": 20.287805557250977, + "learning_rate": 4.961551408244664e-06, + "loss": 4.8028, + "step": 98835 + }, + { + "epoch": 2.0109049479166665, + "grad_norm": 13.821070671081543, + "learning_rate": 4.961151742734944e-06, + "loss": 4.6725, + "step": 98840 + }, + { + "epoch": 2.0110066731770835, + "grad_norm": 17.949113845825195, + "learning_rate": 4.960752077473451e-06, + "loss": 4.6375, + "step": 98845 + }, + { + "epoch": 2.0111083984375, + "grad_norm": 19.299039840698242, + "learning_rate": 4.960352412462739e-06, + "loss": 4.8004, + "step": 98850 + }, + { + "epoch": 2.0112101236979165, + "grad_norm": 25.73446273803711, + "learning_rate": 4.959952747705364e-06, + "loss": 4.7734, + "step": 98855 + }, + { + "epoch": 2.0113118489583335, + "grad_norm": 20.833675384521484, + "learning_rate": 4.959553083203878e-06, + "loss": 4.8901, + "step": 98860 + }, + { + "epoch": 2.01141357421875, + "grad_norm": 15.283685684204102, + "learning_rate": 4.959153418960836e-06, + "loss": 4.6899, + "step": 98865 + }, + { + "epoch": 2.0115152994791665, + "grad_norm": 16.52994155883789, + "learning_rate": 4.958753754978789e-06, + "loss": 4.8188, + "step": 98870 + }, + { + "epoch": 2.0116170247395835, + "grad_norm": 19.923980712890625, + "learning_rate": 4.958354091260292e-06, + "loss": 4.8419, + "step": 98875 + }, + { + "epoch": 2.01171875, + "grad_norm": 21.45782470703125, + "learning_rate": 4.957954427807901e-06, + "loss": 4.6753, + "step": 98880 + }, + { + "epoch": 2.0118204752604165, + "grad_norm": 16.09947395324707, + "learning_rate": 4.957554764624167e-06, + "loss": 4.7408, + "step": 98885 + }, + { + "epoch": 2.0119222005208335, + "grad_norm": 15.245113372802734, + "learning_rate": 4.957155101711645e-06, + "loss": 4.9217, + "step": 98890 + }, + { + "epoch": 2.01202392578125, + "grad_norm": 20.03757667541504, + "learning_rate": 4.956755439072888e-06, + "loss": 4.6984, + "step": 98895 + }, + { + "epoch": 2.0121256510416665, + "grad_norm": 17.70685577392578, + "learning_rate": 4.95635577671045e-06, + "loss": 4.6533, + "step": 98900 + }, + { + "epoch": 2.0122273763020835, + "grad_norm": 16.184602737426758, + "learning_rate": 4.9559561146268835e-06, + "loss": 4.643, + "step": 98905 + }, + { + "epoch": 2.0123291015625, + "grad_norm": 18.941810607910156, + "learning_rate": 4.955556452824745e-06, + "loss": 4.8856, + "step": 98910 + }, + { + "epoch": 2.0124308268229165, + "grad_norm": 17.592609405517578, + "learning_rate": 4.955156791306586e-06, + "loss": 4.7648, + "step": 98915 + }, + { + "epoch": 2.0125325520833335, + "grad_norm": 17.401702880859375, + "learning_rate": 4.954757130074961e-06, + "loss": 4.8067, + "step": 98920 + }, + { + "epoch": 2.01263427734375, + "grad_norm": 24.057621002197266, + "learning_rate": 4.954357469132424e-06, + "loss": 4.8627, + "step": 98925 + }, + { + "epoch": 2.0127360026041665, + "grad_norm": 15.264971733093262, + "learning_rate": 4.953957808481527e-06, + "loss": 4.6955, + "step": 98930 + }, + { + "epoch": 2.0128377278645835, + "grad_norm": 20.563020706176758, + "learning_rate": 4.953558148124826e-06, + "loss": 5.1363, + "step": 98935 + }, + { + "epoch": 2.012939453125, + "grad_norm": 16.919021606445312, + "learning_rate": 4.953158488064874e-06, + "loss": 4.7948, + "step": 98940 + }, + { + "epoch": 2.0130411783854165, + "grad_norm": 23.716266632080078, + "learning_rate": 4.952758828304224e-06, + "loss": 4.7819, + "step": 98945 + }, + { + "epoch": 2.0131429036458335, + "grad_norm": 29.986011505126953, + "learning_rate": 4.952359168845431e-06, + "loss": 4.8223, + "step": 98950 + }, + { + "epoch": 2.01324462890625, + "grad_norm": 17.606338500976562, + "learning_rate": 4.951959509691045e-06, + "loss": 4.8827, + "step": 98955 + }, + { + "epoch": 2.0133463541666665, + "grad_norm": 21.946531295776367, + "learning_rate": 4.9515598508436245e-06, + "loss": 4.7715, + "step": 98960 + }, + { + "epoch": 2.0134480794270835, + "grad_norm": 13.687047004699707, + "learning_rate": 4.951160192305721e-06, + "loss": 4.5823, + "step": 98965 + }, + { + "epoch": 2.0135498046875, + "grad_norm": 18.328367233276367, + "learning_rate": 4.950760534079889e-06, + "loss": 4.7554, + "step": 98970 + }, + { + "epoch": 2.0136515299479165, + "grad_norm": 16.067279815673828, + "learning_rate": 4.9503608761686816e-06, + "loss": 4.8128, + "step": 98975 + }, + { + "epoch": 2.0137532552083335, + "grad_norm": 19.73259735107422, + "learning_rate": 4.94996121857465e-06, + "loss": 4.8161, + "step": 98980 + }, + { + "epoch": 2.01385498046875, + "grad_norm": 18.96399688720703, + "learning_rate": 4.949561561300353e-06, + "loss": 4.9149, + "step": 98985 + }, + { + "epoch": 2.0139567057291665, + "grad_norm": 14.475127220153809, + "learning_rate": 4.9491619043483405e-06, + "loss": 4.6312, + "step": 98990 + }, + { + "epoch": 2.0140584309895835, + "grad_norm": 18.57363510131836, + "learning_rate": 4.948762247721168e-06, + "loss": 4.835, + "step": 98995 + }, + { + "epoch": 2.01416015625, + "grad_norm": 18.025667190551758, + "learning_rate": 4.948362591421388e-06, + "loss": 4.8934, + "step": 99000 + }, + { + "epoch": 2.0142618815104165, + "grad_norm": 17.337820053100586, + "learning_rate": 4.947962935451553e-06, + "loss": 4.532, + "step": 99005 + }, + { + "epoch": 2.0143636067708335, + "grad_norm": 19.41027069091797, + "learning_rate": 4.947563279814221e-06, + "loss": 4.8356, + "step": 99010 + }, + { + "epoch": 2.01446533203125, + "grad_norm": 16.400148391723633, + "learning_rate": 4.9471636245119425e-06, + "loss": 4.8779, + "step": 99015 + }, + { + "epoch": 2.0145670572916665, + "grad_norm": 19.531452178955078, + "learning_rate": 4.946763969547271e-06, + "loss": 4.7047, + "step": 99020 + }, + { + "epoch": 2.0146687825520835, + "grad_norm": 16.872953414916992, + "learning_rate": 4.946364314922762e-06, + "loss": 4.7392, + "step": 99025 + }, + { + "epoch": 2.0147705078125, + "grad_norm": 17.619070053100586, + "learning_rate": 4.945964660640965e-06, + "loss": 4.8497, + "step": 99030 + }, + { + "epoch": 2.0148722330729165, + "grad_norm": 17.97203254699707, + "learning_rate": 4.945565006704439e-06, + "loss": 4.7249, + "step": 99035 + }, + { + "epoch": 2.0149739583333335, + "grad_norm": 14.690658569335938, + "learning_rate": 4.945165353115736e-06, + "loss": 4.9543, + "step": 99040 + }, + { + "epoch": 2.01507568359375, + "grad_norm": 17.9864501953125, + "learning_rate": 4.944765699877408e-06, + "loss": 4.8946, + "step": 99045 + }, + { + "epoch": 2.0151774088541665, + "grad_norm": 23.725088119506836, + "learning_rate": 4.944366046992009e-06, + "loss": 4.9225, + "step": 99050 + }, + { + "epoch": 2.0152791341145835, + "grad_norm": 16.680580139160156, + "learning_rate": 4.943966394462094e-06, + "loss": 4.9404, + "step": 99055 + }, + { + "epoch": 2.015380859375, + "grad_norm": 19.80683135986328, + "learning_rate": 4.943566742290216e-06, + "loss": 4.8131, + "step": 99060 + }, + { + "epoch": 2.0154825846354165, + "grad_norm": 14.404853820800781, + "learning_rate": 4.943167090478928e-06, + "loss": 4.7379, + "step": 99065 + }, + { + "epoch": 2.0155843098958335, + "grad_norm": 19.77933692932129, + "learning_rate": 4.942767439030784e-06, + "loss": 4.7418, + "step": 99070 + }, + { + "epoch": 2.01568603515625, + "grad_norm": 19.28270149230957, + "learning_rate": 4.942367787948339e-06, + "loss": 4.7311, + "step": 99075 + }, + { + "epoch": 2.0157877604166665, + "grad_norm": 23.26753044128418, + "learning_rate": 4.941968137234146e-06, + "loss": 4.9299, + "step": 99080 + }, + { + "epoch": 2.0158894856770835, + "grad_norm": 17.388635635375977, + "learning_rate": 4.941568486890756e-06, + "loss": 5.2088, + "step": 99085 + }, + { + "epoch": 2.0159912109375, + "grad_norm": 17.63842010498047, + "learning_rate": 4.941168836920726e-06, + "loss": 4.6839, + "step": 99090 + }, + { + "epoch": 2.0160929361979165, + "grad_norm": 22.965585708618164, + "learning_rate": 4.940769187326609e-06, + "loss": 4.681, + "step": 99095 + }, + { + "epoch": 2.0161946614583335, + "grad_norm": 14.454386711120605, + "learning_rate": 4.940369538110958e-06, + "loss": 4.8726, + "step": 99100 + }, + { + "epoch": 2.01629638671875, + "grad_norm": 16.916696548461914, + "learning_rate": 4.939969889276327e-06, + "loss": 4.8328, + "step": 99105 + }, + { + "epoch": 2.0163981119791665, + "grad_norm": 22.75889778137207, + "learning_rate": 4.939570240825267e-06, + "loss": 4.8006, + "step": 99110 + }, + { + "epoch": 2.0164998372395835, + "grad_norm": 20.66259765625, + "learning_rate": 4.939170592760336e-06, + "loss": 4.8048, + "step": 99115 + }, + { + "epoch": 2.0166015625, + "grad_norm": 23.59454917907715, + "learning_rate": 4.938770945084086e-06, + "loss": 4.8996, + "step": 99120 + }, + { + "epoch": 2.0167032877604165, + "grad_norm": 17.608699798583984, + "learning_rate": 4.93837129779907e-06, + "loss": 4.8161, + "step": 99125 + }, + { + "epoch": 2.0168050130208335, + "grad_norm": 13.713726997375488, + "learning_rate": 4.937971650907842e-06, + "loss": 4.7605, + "step": 99130 + }, + { + "epoch": 2.01690673828125, + "grad_norm": 15.470385551452637, + "learning_rate": 4.937572004412954e-06, + "loss": 4.8737, + "step": 99135 + }, + { + "epoch": 2.0170084635416665, + "grad_norm": 17.92073631286621, + "learning_rate": 4.937172358316961e-06, + "loss": 4.7511, + "step": 99140 + }, + { + "epoch": 2.0171101888020835, + "grad_norm": 17.23728370666504, + "learning_rate": 4.9367727126224185e-06, + "loss": 4.8374, + "step": 99145 + }, + { + "epoch": 2.0172119140625, + "grad_norm": 16.431640625, + "learning_rate": 4.936373067331877e-06, + "loss": 4.7754, + "step": 99150 + }, + { + "epoch": 2.0173136393229165, + "grad_norm": 15.35675048828125, + "learning_rate": 4.9359734224478934e-06, + "loss": 4.7136, + "step": 99155 + }, + { + "epoch": 2.0174153645833335, + "grad_norm": 19.98898696899414, + "learning_rate": 4.935573777973015e-06, + "loss": 4.9457, + "step": 99160 + }, + { + "epoch": 2.01751708984375, + "grad_norm": 16.963855743408203, + "learning_rate": 4.935174133909804e-06, + "loss": 4.7003, + "step": 99165 + }, + { + "epoch": 2.0176188151041665, + "grad_norm": 18.802106857299805, + "learning_rate": 4.934774490260808e-06, + "loss": 4.7437, + "step": 99170 + }, + { + "epoch": 2.0177205403645835, + "grad_norm": 16.459253311157227, + "learning_rate": 4.934374847028583e-06, + "loss": 4.8468, + "step": 99175 + }, + { + "epoch": 2.017822265625, + "grad_norm": 18.261362075805664, + "learning_rate": 4.933975204215682e-06, + "loss": 4.8689, + "step": 99180 + }, + { + "epoch": 2.0179239908854165, + "grad_norm": 24.920705795288086, + "learning_rate": 4.933575561824656e-06, + "loss": 4.7679, + "step": 99185 + }, + { + "epoch": 2.0180257161458335, + "grad_norm": 16.63192367553711, + "learning_rate": 4.933175919858063e-06, + "loss": 4.6853, + "step": 99190 + }, + { + "epoch": 2.01812744140625, + "grad_norm": 13.20835018157959, + "learning_rate": 4.932776278318455e-06, + "loss": 4.869, + "step": 99195 + }, + { + "epoch": 2.0182291666666665, + "grad_norm": 19.59725570678711, + "learning_rate": 4.932376637208384e-06, + "loss": 4.565, + "step": 99200 + }, + { + "epoch": 2.0183308919270835, + "grad_norm": 24.631635665893555, + "learning_rate": 4.9319769965304065e-06, + "loss": 4.6037, + "step": 99205 + }, + { + "epoch": 2.0184326171875, + "grad_norm": 18.253660202026367, + "learning_rate": 4.931577356287074e-06, + "loss": 4.4314, + "step": 99210 + }, + { + "epoch": 2.0185343424479165, + "grad_norm": 18.017196655273438, + "learning_rate": 4.93117771648094e-06, + "loss": 4.7388, + "step": 99215 + }, + { + "epoch": 2.0186360677083335, + "grad_norm": 18.058095932006836, + "learning_rate": 4.930778077114557e-06, + "loss": 4.6712, + "step": 99220 + }, + { + "epoch": 2.01873779296875, + "grad_norm": 14.978167533874512, + "learning_rate": 4.930378438190483e-06, + "loss": 4.621, + "step": 99225 + }, + { + "epoch": 2.0188395182291665, + "grad_norm": 17.336257934570312, + "learning_rate": 4.929978799711267e-06, + "loss": 4.9907, + "step": 99230 + }, + { + "epoch": 2.0189412434895835, + "grad_norm": 19.353160858154297, + "learning_rate": 4.929579161679466e-06, + "loss": 4.7996, + "step": 99235 + }, + { + "epoch": 2.01904296875, + "grad_norm": 21.57246208190918, + "learning_rate": 4.929179524097627e-06, + "loss": 4.8731, + "step": 99240 + }, + { + "epoch": 2.0191446940104165, + "grad_norm": 15.807506561279297, + "learning_rate": 4.928779886968313e-06, + "loss": 4.6557, + "step": 99245 + }, + { + "epoch": 2.0192464192708335, + "grad_norm": 16.331642150878906, + "learning_rate": 4.928380250294072e-06, + "loss": 4.6814, + "step": 99250 + }, + { + "epoch": 2.01934814453125, + "grad_norm": 14.667966842651367, + "learning_rate": 4.927980614077459e-06, + "loss": 4.7425, + "step": 99255 + }, + { + "epoch": 2.0194498697916665, + "grad_norm": 18.328449249267578, + "learning_rate": 4.927580978321026e-06, + "loss": 4.7194, + "step": 99260 + }, + { + "epoch": 2.0195515950520835, + "grad_norm": 20.576913833618164, + "learning_rate": 4.927181343027326e-06, + "loss": 4.9462, + "step": 99265 + }, + { + "epoch": 2.0196533203125, + "grad_norm": 15.362520217895508, + "learning_rate": 4.9267817081989165e-06, + "loss": 4.8065, + "step": 99270 + }, + { + "epoch": 2.0197550455729165, + "grad_norm": 18.341543197631836, + "learning_rate": 4.926382073838348e-06, + "loss": 4.7754, + "step": 99275 + }, + { + "epoch": 2.0198567708333335, + "grad_norm": 18.155229568481445, + "learning_rate": 4.925982439948176e-06, + "loss": 4.6825, + "step": 99280 + }, + { + "epoch": 2.01995849609375, + "grad_norm": 20.44822120666504, + "learning_rate": 4.925582806530951e-06, + "loss": 4.7124, + "step": 99285 + }, + { + "epoch": 2.0200602213541665, + "grad_norm": 17.74825668334961, + "learning_rate": 4.925183173589227e-06, + "loss": 4.7973, + "step": 99290 + }, + { + "epoch": 2.0201619466145835, + "grad_norm": 25.16090965270996, + "learning_rate": 4.924783541125561e-06, + "loss": 4.7576, + "step": 99295 + }, + { + "epoch": 2.020263671875, + "grad_norm": 18.37641716003418, + "learning_rate": 4.924383909142504e-06, + "loss": 4.9086, + "step": 99300 + }, + { + "epoch": 2.0203653971354165, + "grad_norm": 15.179401397705078, + "learning_rate": 4.923984277642609e-06, + "loss": 4.5305, + "step": 99305 + }, + { + "epoch": 2.0204671223958335, + "grad_norm": 14.476790428161621, + "learning_rate": 4.923584646628432e-06, + "loss": 4.5683, + "step": 99310 + }, + { + "epoch": 2.02056884765625, + "grad_norm": 17.654359817504883, + "learning_rate": 4.923185016102522e-06, + "loss": 4.7475, + "step": 99315 + }, + { + "epoch": 2.0206705729166665, + "grad_norm": 21.178813934326172, + "learning_rate": 4.922785386067437e-06, + "loss": 4.4281, + "step": 99320 + }, + { + "epoch": 2.0207722981770835, + "grad_norm": 17.423749923706055, + "learning_rate": 4.92238575652573e-06, + "loss": 4.686, + "step": 99325 + }, + { + "epoch": 2.0208740234375, + "grad_norm": 22.220788955688477, + "learning_rate": 4.9219861274799515e-06, + "loss": 4.8841, + "step": 99330 + }, + { + "epoch": 2.0209757486979165, + "grad_norm": 21.470354080200195, + "learning_rate": 4.921586498932658e-06, + "loss": 4.6664, + "step": 99335 + }, + { + "epoch": 2.0210774739583335, + "grad_norm": 17.90019416809082, + "learning_rate": 4.921186870886402e-06, + "loss": 4.8505, + "step": 99340 + }, + { + "epoch": 2.02117919921875, + "grad_norm": 13.658323287963867, + "learning_rate": 4.920787243343735e-06, + "loss": 4.7836, + "step": 99345 + }, + { + "epoch": 2.0212809244791665, + "grad_norm": 14.903620719909668, + "learning_rate": 4.920387616307213e-06, + "loss": 4.798, + "step": 99350 + }, + { + "epoch": 2.0213826497395835, + "grad_norm": 16.519081115722656, + "learning_rate": 4.9199879897793904e-06, + "loss": 4.817, + "step": 99355 + }, + { + "epoch": 2.021484375, + "grad_norm": 16.092924118041992, + "learning_rate": 4.919588363762819e-06, + "loss": 4.7338, + "step": 99360 + }, + { + "epoch": 2.0215861002604165, + "grad_norm": 17.927888870239258, + "learning_rate": 4.9191887382600525e-06, + "loss": 4.8434, + "step": 99365 + }, + { + "epoch": 2.0216878255208335, + "grad_norm": 15.023361206054688, + "learning_rate": 4.918789113273642e-06, + "loss": 4.6694, + "step": 99370 + }, + { + "epoch": 2.02178955078125, + "grad_norm": 19.722057342529297, + "learning_rate": 4.918389488806145e-06, + "loss": 4.6195, + "step": 99375 + }, + { + "epoch": 2.0218912760416665, + "grad_norm": 18.007091522216797, + "learning_rate": 4.9179898648601135e-06, + "loss": 4.9507, + "step": 99380 + }, + { + "epoch": 2.0219930013020835, + "grad_norm": 16.763242721557617, + "learning_rate": 4.9175902414381005e-06, + "loss": 4.711, + "step": 99385 + }, + { + "epoch": 2.0220947265625, + "grad_norm": 13.949551582336426, + "learning_rate": 4.917190618542661e-06, + "loss": 4.5527, + "step": 99390 + }, + { + "epoch": 2.0221964518229165, + "grad_norm": 22.04863929748535, + "learning_rate": 4.916790996176343e-06, + "loss": 4.7211, + "step": 99395 + }, + { + "epoch": 2.0222981770833335, + "grad_norm": 16.524765014648438, + "learning_rate": 4.916391374341708e-06, + "loss": 4.8691, + "step": 99400 + }, + { + "epoch": 2.02239990234375, + "grad_norm": 19.56622886657715, + "learning_rate": 4.915991753041304e-06, + "loss": 4.9005, + "step": 99405 + }, + { + "epoch": 2.0225016276041665, + "grad_norm": 19.64276885986328, + "learning_rate": 4.915592132277687e-06, + "loss": 4.8647, + "step": 99410 + }, + { + "epoch": 2.0226033528645835, + "grad_norm": 16.244741439819336, + "learning_rate": 4.9151925120534096e-06, + "loss": 4.8978, + "step": 99415 + }, + { + "epoch": 2.022705078125, + "grad_norm": 21.181690216064453, + "learning_rate": 4.914792892371023e-06, + "loss": 4.7652, + "step": 99420 + }, + { + "epoch": 2.0228068033854165, + "grad_norm": 14.452652931213379, + "learning_rate": 4.914393273233084e-06, + "loss": 4.6731, + "step": 99425 + }, + { + "epoch": 2.0229085286458335, + "grad_norm": 15.679312705993652, + "learning_rate": 4.913993654642147e-06, + "loss": 4.6629, + "step": 99430 + }, + { + "epoch": 2.02301025390625, + "grad_norm": 17.33782196044922, + "learning_rate": 4.9135940366007605e-06, + "loss": 4.8501, + "step": 99435 + }, + { + "epoch": 2.0231119791666665, + "grad_norm": 17.118959426879883, + "learning_rate": 4.913194419111482e-06, + "loss": 4.8423, + "step": 99440 + }, + { + "epoch": 2.0232137044270835, + "grad_norm": 16.75983238220215, + "learning_rate": 4.9127948021768615e-06, + "loss": 4.7358, + "step": 99445 + }, + { + "epoch": 2.0233154296875, + "grad_norm": 19.218908309936523, + "learning_rate": 4.912395185799456e-06, + "loss": 5.1254, + "step": 99450 + }, + { + "epoch": 2.0234171549479165, + "grad_norm": 19.550745010375977, + "learning_rate": 4.911995569981818e-06, + "loss": 4.7738, + "step": 99455 + }, + { + "epoch": 2.0235188802083335, + "grad_norm": 16.01423454284668, + "learning_rate": 4.911595954726499e-06, + "loss": 4.9641, + "step": 99460 + }, + { + "epoch": 2.02362060546875, + "grad_norm": 17.790626525878906, + "learning_rate": 4.911196340036056e-06, + "loss": 4.8262, + "step": 99465 + }, + { + "epoch": 2.0237223307291665, + "grad_norm": 18.654865264892578, + "learning_rate": 4.910796725913038e-06, + "loss": 4.8355, + "step": 99470 + }, + { + "epoch": 2.0238240559895835, + "grad_norm": 18.60529899597168, + "learning_rate": 4.910397112360002e-06, + "loss": 4.7166, + "step": 99475 + }, + { + "epoch": 2.02392578125, + "grad_norm": 17.725378036499023, + "learning_rate": 4.9099974993794995e-06, + "loss": 4.9153, + "step": 99480 + }, + { + "epoch": 2.0240275065104165, + "grad_norm": 19.8565673828125, + "learning_rate": 4.909597886974084e-06, + "loss": 4.7307, + "step": 99485 + }, + { + "epoch": 2.0241292317708335, + "grad_norm": 21.76200294494629, + "learning_rate": 4.909198275146311e-06, + "loss": 4.9677, + "step": 99490 + }, + { + "epoch": 2.02423095703125, + "grad_norm": 18.997812271118164, + "learning_rate": 4.908798663898731e-06, + "loss": 4.7149, + "step": 99495 + }, + { + "epoch": 2.0243326822916665, + "grad_norm": 17.939239501953125, + "learning_rate": 4.9083990532338964e-06, + "loss": 4.7766, + "step": 99500 + }, + { + "epoch": 2.0244344075520835, + "grad_norm": 18.032302856445312, + "learning_rate": 4.907999443154366e-06, + "loss": 4.7419, + "step": 99505 + }, + { + "epoch": 2.0245361328125, + "grad_norm": 16.42527961730957, + "learning_rate": 4.907599833662689e-06, + "loss": 4.6185, + "step": 99510 + }, + { + "epoch": 2.0246378580729165, + "grad_norm": 19.35553550720215, + "learning_rate": 4.90720022476142e-06, + "loss": 4.9635, + "step": 99515 + }, + { + "epoch": 2.0247395833333335, + "grad_norm": 24.197532653808594, + "learning_rate": 4.9068006164531125e-06, + "loss": 4.7653, + "step": 99520 + }, + { + "epoch": 2.02484130859375, + "grad_norm": 23.891448974609375, + "learning_rate": 4.9064010087403165e-06, + "loss": 4.7021, + "step": 99525 + }, + { + "epoch": 2.0249430338541665, + "grad_norm": 22.52150535583496, + "learning_rate": 4.906001401625591e-06, + "loss": 4.9651, + "step": 99530 + }, + { + "epoch": 2.0250447591145835, + "grad_norm": 17.465707778930664, + "learning_rate": 4.905601795111486e-06, + "loss": 4.9093, + "step": 99535 + }, + { + "epoch": 2.025146484375, + "grad_norm": 17.478233337402344, + "learning_rate": 4.9052021892005565e-06, + "loss": 4.9109, + "step": 99540 + }, + { + "epoch": 2.0252482096354165, + "grad_norm": 17.519691467285156, + "learning_rate": 4.904802583895355e-06, + "loss": 5.2322, + "step": 99545 + }, + { + "epoch": 2.0253499348958335, + "grad_norm": 13.800225257873535, + "learning_rate": 4.904402979198432e-06, + "loss": 4.7297, + "step": 99550 + }, + { + "epoch": 2.02545166015625, + "grad_norm": 16.816905975341797, + "learning_rate": 4.904003375112346e-06, + "loss": 4.8713, + "step": 99555 + }, + { + "epoch": 2.0255533854166665, + "grad_norm": 16.79359245300293, + "learning_rate": 4.903603771639648e-06, + "loss": 4.695, + "step": 99560 + }, + { + "epoch": 2.0256551106770835, + "grad_norm": 21.173553466796875, + "learning_rate": 4.903204168782891e-06, + "loss": 4.9362, + "step": 99565 + }, + { + "epoch": 2.0257568359375, + "grad_norm": 17.66111946105957, + "learning_rate": 4.902804566544629e-06, + "loss": 5.1512, + "step": 99570 + }, + { + "epoch": 2.0258585611979165, + "grad_norm": 15.992958068847656, + "learning_rate": 4.902404964927413e-06, + "loss": 5.03, + "step": 99575 + }, + { + "epoch": 2.0259602864583335, + "grad_norm": 19.979198455810547, + "learning_rate": 4.9020053639338e-06, + "loss": 4.9451, + "step": 99580 + }, + { + "epoch": 2.02606201171875, + "grad_norm": 14.179130554199219, + "learning_rate": 4.901605763566342e-06, + "loss": 4.648, + "step": 99585 + }, + { + "epoch": 2.0261637369791665, + "grad_norm": 18.185461044311523, + "learning_rate": 4.901206163827592e-06, + "loss": 4.5045, + "step": 99590 + }, + { + "epoch": 2.0262654622395835, + "grad_norm": 22.244321823120117, + "learning_rate": 4.900806564720101e-06, + "loss": 4.7071, + "step": 99595 + }, + { + "epoch": 2.0263671875, + "grad_norm": 17.622909545898438, + "learning_rate": 4.900406966246426e-06, + "loss": 4.8982, + "step": 99600 + }, + { + "epoch": 2.0264689127604165, + "grad_norm": 20.801586151123047, + "learning_rate": 4.90000736840912e-06, + "loss": 4.7759, + "step": 99605 + }, + { + "epoch": 2.0265706380208335, + "grad_norm": 16.027151107788086, + "learning_rate": 4.899607771210733e-06, + "loss": 4.8147, + "step": 99610 + }, + { + "epoch": 2.02667236328125, + "grad_norm": 16.70591926574707, + "learning_rate": 4.899208174653822e-06, + "loss": 4.9048, + "step": 99615 + }, + { + "epoch": 2.0267740885416665, + "grad_norm": 25.984996795654297, + "learning_rate": 4.898808578740939e-06, + "loss": 4.5857, + "step": 99620 + }, + { + "epoch": 2.0268758138020835, + "grad_norm": 14.5411376953125, + "learning_rate": 4.898408983474635e-06, + "loss": 5.0275, + "step": 99625 + }, + { + "epoch": 2.0269775390625, + "grad_norm": 15.910808563232422, + "learning_rate": 4.898009388857467e-06, + "loss": 4.9712, + "step": 99630 + }, + { + "epoch": 2.0270792643229165, + "grad_norm": 17.048656463623047, + "learning_rate": 4.897609794891987e-06, + "loss": 4.7644, + "step": 99635 + }, + { + "epoch": 2.0271809895833335, + "grad_norm": 13.916526794433594, + "learning_rate": 4.897210201580747e-06, + "loss": 4.5783, + "step": 99640 + }, + { + "epoch": 2.02728271484375, + "grad_norm": 21.41795539855957, + "learning_rate": 4.896810608926302e-06, + "loss": 4.7229, + "step": 99645 + }, + { + "epoch": 2.0273844401041665, + "grad_norm": 21.620189666748047, + "learning_rate": 4.896411016931205e-06, + "loss": 4.865, + "step": 99650 + }, + { + "epoch": 2.0274861653645835, + "grad_norm": 20.076433181762695, + "learning_rate": 4.896011425598006e-06, + "loss": 4.7728, + "step": 99655 + }, + { + "epoch": 2.027587890625, + "grad_norm": 25.42906379699707, + "learning_rate": 4.895611834929263e-06, + "loss": 4.9284, + "step": 99660 + }, + { + "epoch": 2.0276896158854165, + "grad_norm": 20.652875900268555, + "learning_rate": 4.895212244927527e-06, + "loss": 4.656, + "step": 99665 + }, + { + "epoch": 2.0277913411458335, + "grad_norm": 17.52806854248047, + "learning_rate": 4.8948126555953515e-06, + "loss": 4.6499, + "step": 99670 + }, + { + "epoch": 2.02789306640625, + "grad_norm": 13.694550514221191, + "learning_rate": 4.894413066935291e-06, + "loss": 4.9046, + "step": 99675 + }, + { + "epoch": 2.0279947916666665, + "grad_norm": 17.59151268005371, + "learning_rate": 4.894013478949894e-06, + "loss": 4.6027, + "step": 99680 + }, + { + "epoch": 2.0280965169270835, + "grad_norm": 19.28911018371582, + "learning_rate": 4.8936138916417195e-06, + "loss": 5.065, + "step": 99685 + }, + { + "epoch": 2.0281982421875, + "grad_norm": 15.23709487915039, + "learning_rate": 4.893214305013319e-06, + "loss": 4.9193, + "step": 99690 + }, + { + "epoch": 2.0282999674479165, + "grad_norm": 13.814961433410645, + "learning_rate": 4.892814719067245e-06, + "loss": 4.7086, + "step": 99695 + }, + { + "epoch": 2.0284016927083335, + "grad_norm": 14.775712013244629, + "learning_rate": 4.892415133806051e-06, + "loss": 4.8905, + "step": 99700 + }, + { + "epoch": 2.02850341796875, + "grad_norm": 17.60704803466797, + "learning_rate": 4.892015549232287e-06, + "loss": 5.0768, + "step": 99705 + }, + { + "epoch": 2.0286051432291665, + "grad_norm": 15.118952751159668, + "learning_rate": 4.891615965348513e-06, + "loss": 4.7267, + "step": 99710 + }, + { + "epoch": 2.0287068684895835, + "grad_norm": 14.505485534667969, + "learning_rate": 4.891216382157278e-06, + "loss": 4.8306, + "step": 99715 + }, + { + "epoch": 2.02880859375, + "grad_norm": 20.928504943847656, + "learning_rate": 4.890816799661136e-06, + "loss": 5.1024, + "step": 99720 + }, + { + "epoch": 2.0289103190104165, + "grad_norm": 18.01095962524414, + "learning_rate": 4.890417217862638e-06, + "loss": 4.6372, + "step": 99725 + }, + { + "epoch": 2.0290120442708335, + "grad_norm": 16.11154556274414, + "learning_rate": 4.8900176367643395e-06, + "loss": 4.7352, + "step": 99730 + }, + { + "epoch": 2.02911376953125, + "grad_norm": 13.547258377075195, + "learning_rate": 4.889618056368795e-06, + "loss": 4.6341, + "step": 99735 + }, + { + "epoch": 2.0292154947916665, + "grad_norm": 15.811355590820312, + "learning_rate": 4.889218476678555e-06, + "loss": 4.8239, + "step": 99740 + }, + { + "epoch": 2.0293172200520835, + "grad_norm": 16.263193130493164, + "learning_rate": 4.888818897696175e-06, + "loss": 4.8076, + "step": 99745 + }, + { + "epoch": 2.0294189453125, + "grad_norm": 21.101423263549805, + "learning_rate": 4.888419319424206e-06, + "loss": 4.7546, + "step": 99750 + }, + { + "epoch": 2.0295206705729165, + "grad_norm": 19.413677215576172, + "learning_rate": 4.888019741865202e-06, + "loss": 4.856, + "step": 99755 + }, + { + "epoch": 2.0296223958333335, + "grad_norm": 16.166542053222656, + "learning_rate": 4.887620165021716e-06, + "loss": 4.5031, + "step": 99760 + }, + { + "epoch": 2.02972412109375, + "grad_norm": 14.679145812988281, + "learning_rate": 4.8872205888963024e-06, + "loss": 4.7501, + "step": 99765 + }, + { + "epoch": 2.0298258463541665, + "grad_norm": 14.88194751739502, + "learning_rate": 4.886821013491513e-06, + "loss": 4.7436, + "step": 99770 + }, + { + "epoch": 2.0299275716145835, + "grad_norm": 19.3585147857666, + "learning_rate": 4.886421438809903e-06, + "loss": 5.1091, + "step": 99775 + }, + { + "epoch": 2.030029296875, + "grad_norm": 20.471282958984375, + "learning_rate": 4.886021864854022e-06, + "loss": 4.9807, + "step": 99780 + }, + { + "epoch": 2.0301310221354165, + "grad_norm": 15.735625267028809, + "learning_rate": 4.885622291626424e-06, + "loss": 4.7615, + "step": 99785 + }, + { + "epoch": 2.0302327473958335, + "grad_norm": 16.4522762298584, + "learning_rate": 4.885222719129666e-06, + "loss": 4.9907, + "step": 99790 + }, + { + "epoch": 2.03033447265625, + "grad_norm": 16.586301803588867, + "learning_rate": 4.8848231473662964e-06, + "loss": 4.8281, + "step": 99795 + }, + { + "epoch": 2.0304361979166665, + "grad_norm": 14.479970932006836, + "learning_rate": 4.884423576338872e-06, + "loss": 4.7961, + "step": 99800 + }, + { + "epoch": 2.0305379231770835, + "grad_norm": 16.962617874145508, + "learning_rate": 4.884024006049943e-06, + "loss": 4.7461, + "step": 99805 + }, + { + "epoch": 2.0306396484375, + "grad_norm": 17.066682815551758, + "learning_rate": 4.883624436502063e-06, + "loss": 5.1684, + "step": 99810 + }, + { + "epoch": 2.0307413736979165, + "grad_norm": 18.417810440063477, + "learning_rate": 4.883224867697787e-06, + "loss": 4.5486, + "step": 99815 + }, + { + "epoch": 2.0308430989583335, + "grad_norm": 19.725481033325195, + "learning_rate": 4.882825299639668e-06, + "loss": 4.9537, + "step": 99820 + }, + { + "epoch": 2.03094482421875, + "grad_norm": 18.392471313476562, + "learning_rate": 4.882425732330258e-06, + "loss": 4.606, + "step": 99825 + }, + { + "epoch": 2.0310465494791665, + "grad_norm": 19.457042694091797, + "learning_rate": 4.8820261657721094e-06, + "loss": 5.0444, + "step": 99830 + }, + { + "epoch": 2.0311482747395835, + "grad_norm": 16.293123245239258, + "learning_rate": 4.8816265999677745e-06, + "loss": 4.7978, + "step": 99835 + }, + { + "epoch": 2.03125, + "grad_norm": 17.019933700561523, + "learning_rate": 4.88122703491981e-06, + "loss": 4.7536, + "step": 99840 + }, + { + "epoch": 2.0313517252604165, + "grad_norm": 16.10517120361328, + "learning_rate": 4.8808274706307675e-06, + "loss": 4.9107, + "step": 99845 + }, + { + "epoch": 2.0314534505208335, + "grad_norm": 20.154130935668945, + "learning_rate": 4.8804279071031995e-06, + "loss": 4.8279, + "step": 99850 + }, + { + "epoch": 2.03155517578125, + "grad_norm": 16.184324264526367, + "learning_rate": 4.880028344339658e-06, + "loss": 5.1274, + "step": 99855 + }, + { + "epoch": 2.0316569010416665, + "grad_norm": 21.28937339782715, + "learning_rate": 4.879628782342697e-06, + "loss": 4.8598, + "step": 99860 + }, + { + "epoch": 2.0317586263020835, + "grad_norm": 14.758081436157227, + "learning_rate": 4.879229221114871e-06, + "loss": 4.8053, + "step": 99865 + }, + { + "epoch": 2.0318603515625, + "grad_norm": 14.455550193786621, + "learning_rate": 4.878829660658732e-06, + "loss": 4.8758, + "step": 99870 + }, + { + "epoch": 2.0319620768229165, + "grad_norm": 19.891321182250977, + "learning_rate": 4.878430100976832e-06, + "loss": 4.8756, + "step": 99875 + }, + { + "epoch": 2.0320638020833335, + "grad_norm": 20.11174201965332, + "learning_rate": 4.878030542071727e-06, + "loss": 4.8581, + "step": 99880 + }, + { + "epoch": 2.03216552734375, + "grad_norm": 21.011131286621094, + "learning_rate": 4.877630983945966e-06, + "loss": 4.7576, + "step": 99885 + }, + { + "epoch": 2.0322672526041665, + "grad_norm": 19.842998504638672, + "learning_rate": 4.877231426602105e-06, + "loss": 4.7334, + "step": 99890 + }, + { + "epoch": 2.0323689778645835, + "grad_norm": 21.522809982299805, + "learning_rate": 4.876831870042697e-06, + "loss": 4.7962, + "step": 99895 + }, + { + "epoch": 2.032470703125, + "grad_norm": 19.243358612060547, + "learning_rate": 4.876432314270294e-06, + "loss": 5.1822, + "step": 99900 + }, + { + "epoch": 2.0325724283854165, + "grad_norm": 18.89229393005371, + "learning_rate": 4.876032759287449e-06, + "loss": 4.8514, + "step": 99905 + }, + { + "epoch": 2.0326741536458335, + "grad_norm": 26.63064956665039, + "learning_rate": 4.875633205096713e-06, + "loss": 4.5992, + "step": 99910 + }, + { + "epoch": 2.03277587890625, + "grad_norm": 18.399703979492188, + "learning_rate": 4.875233651700644e-06, + "loss": 4.7654, + "step": 99915 + }, + { + "epoch": 2.0328776041666665, + "grad_norm": 16.470090866088867, + "learning_rate": 4.874834099101793e-06, + "loss": 4.8759, + "step": 99920 + }, + { + "epoch": 2.0329793294270835, + "grad_norm": 18.169570922851562, + "learning_rate": 4.874434547302711e-06, + "loss": 4.7009, + "step": 99925 + }, + { + "epoch": 2.0330810546875, + "grad_norm": 17.424354553222656, + "learning_rate": 4.874034996305953e-06, + "loss": 4.9333, + "step": 99930 + }, + { + "epoch": 2.0331827799479165, + "grad_norm": 23.076120376586914, + "learning_rate": 4.873635446114072e-06, + "loss": 5.0416, + "step": 99935 + }, + { + "epoch": 2.0332845052083335, + "grad_norm": 19.867568969726562, + "learning_rate": 4.873235896729617e-06, + "loss": 4.7132, + "step": 99940 + }, + { + "epoch": 2.03338623046875, + "grad_norm": 12.406368255615234, + "learning_rate": 4.872836348155148e-06, + "loss": 4.7671, + "step": 99945 + }, + { + "epoch": 2.0334879557291665, + "grad_norm": 14.827316284179688, + "learning_rate": 4.872436800393213e-06, + "loss": 4.9131, + "step": 99950 + }, + { + "epoch": 2.0335896809895835, + "grad_norm": 20.03080940246582, + "learning_rate": 4.872037253446367e-06, + "loss": 4.7037, + "step": 99955 + }, + { + "epoch": 2.03369140625, + "grad_norm": 18.959613800048828, + "learning_rate": 4.871637707317162e-06, + "loss": 5.1905, + "step": 99960 + }, + { + "epoch": 2.0337931315104165, + "grad_norm": 18.696117401123047, + "learning_rate": 4.8712381620081495e-06, + "loss": 4.64, + "step": 99965 + }, + { + "epoch": 2.0338948567708335, + "grad_norm": 14.918585777282715, + "learning_rate": 4.8708386175218865e-06, + "loss": 4.6386, + "step": 99970 + }, + { + "epoch": 2.03399658203125, + "grad_norm": 39.512123107910156, + "learning_rate": 4.870439073860924e-06, + "loss": 5.4099, + "step": 99975 + }, + { + "epoch": 2.0340983072916665, + "grad_norm": 19.728017807006836, + "learning_rate": 4.8700395310278135e-06, + "loss": 4.8512, + "step": 99980 + }, + { + "epoch": 2.0342000325520835, + "grad_norm": 19.55813217163086, + "learning_rate": 4.8696399890251095e-06, + "loss": 4.8853, + "step": 99985 + }, + { + "epoch": 2.0343017578125, + "grad_norm": 14.586308479309082, + "learning_rate": 4.869240447855363e-06, + "loss": 4.7129, + "step": 99990 + }, + { + "epoch": 2.0344034830729165, + "grad_norm": 20.254364013671875, + "learning_rate": 4.868840907521131e-06, + "loss": 5.0441, + "step": 99995 + }, + { + "epoch": 2.0345052083333335, + "grad_norm": 15.401457786560059, + "learning_rate": 4.868441368024964e-06, + "loss": 5.1461, + "step": 100000 + }, + { + "epoch": 2.03460693359375, + "grad_norm": 13.923895835876465, + "learning_rate": 4.868041829369413e-06, + "loss": 4.9908, + "step": 100005 + }, + { + "epoch": 2.0347086588541665, + "grad_norm": 20.113868713378906, + "learning_rate": 4.867642291557034e-06, + "loss": 4.8561, + "step": 100010 + }, + { + "epoch": 2.0348103841145835, + "grad_norm": 16.33217430114746, + "learning_rate": 4.867242754590377e-06, + "loss": 4.558, + "step": 100015 + }, + { + "epoch": 2.034912109375, + "grad_norm": 17.06500244140625, + "learning_rate": 4.8668432184719985e-06, + "loss": 5.0582, + "step": 100020 + }, + { + "epoch": 2.0350138346354165, + "grad_norm": 29.95802879333496, + "learning_rate": 4.86644368320445e-06, + "loss": 4.7455, + "step": 100025 + }, + { + "epoch": 2.0351155598958335, + "grad_norm": 17.776935577392578, + "learning_rate": 4.8660441487902836e-06, + "loss": 4.9999, + "step": 100030 + }, + { + "epoch": 2.03521728515625, + "grad_norm": 16.13401985168457, + "learning_rate": 4.8656446152320525e-06, + "loss": 4.8218, + "step": 100035 + }, + { + "epoch": 2.0353190104166665, + "grad_norm": 15.375394821166992, + "learning_rate": 4.865245082532307e-06, + "loss": 4.8925, + "step": 100040 + }, + { + "epoch": 2.0354207356770835, + "grad_norm": 17.916749954223633, + "learning_rate": 4.864845550693606e-06, + "loss": 4.7148, + "step": 100045 + }, + { + "epoch": 2.0355224609375, + "grad_norm": 17.21319007873535, + "learning_rate": 4.864446019718499e-06, + "loss": 4.9599, + "step": 100050 + }, + { + "epoch": 2.0356241861979165, + "grad_norm": 23.143768310546875, + "learning_rate": 4.8640464896095375e-06, + "loss": 4.6851, + "step": 100055 + }, + { + "epoch": 2.0357259114583335, + "grad_norm": 21.471912384033203, + "learning_rate": 4.863646960369276e-06, + "loss": 4.9584, + "step": 100060 + }, + { + "epoch": 2.03582763671875, + "grad_norm": 20.36737060546875, + "learning_rate": 4.8632474320002655e-06, + "loss": 5.0578, + "step": 100065 + }, + { + "epoch": 2.0359293619791665, + "grad_norm": 19.500457763671875, + "learning_rate": 4.862847904505063e-06, + "loss": 5.0317, + "step": 100070 + }, + { + "epoch": 2.0360310872395835, + "grad_norm": 19.37256622314453, + "learning_rate": 4.862448377886219e-06, + "loss": 4.8829, + "step": 100075 + }, + { + "epoch": 2.0361328125, + "grad_norm": 17.115732192993164, + "learning_rate": 4.862048852146286e-06, + "loss": 4.6455, + "step": 100080 + }, + { + "epoch": 2.0362345377604165, + "grad_norm": 15.92082691192627, + "learning_rate": 4.861649327287817e-06, + "loss": 4.9117, + "step": 100085 + }, + { + "epoch": 2.0363362630208335, + "grad_norm": 11.563328742980957, + "learning_rate": 4.8612498033133646e-06, + "loss": 4.7815, + "step": 100090 + }, + { + "epoch": 2.03643798828125, + "grad_norm": 17.141551971435547, + "learning_rate": 4.86085028022548e-06, + "loss": 4.8024, + "step": 100095 + }, + { + "epoch": 2.0365397135416665, + "grad_norm": 21.417156219482422, + "learning_rate": 4.860450758026719e-06, + "loss": 4.7432, + "step": 100100 + }, + { + "epoch": 2.0366414388020835, + "grad_norm": 20.004884719848633, + "learning_rate": 4.860051236719635e-06, + "loss": 5.1251, + "step": 100105 + }, + { + "epoch": 2.0367431640625, + "grad_norm": 20.69284439086914, + "learning_rate": 4.859651716306778e-06, + "loss": 4.6608, + "step": 100110 + }, + { + "epoch": 2.0368448893229165, + "grad_norm": 18.42059326171875, + "learning_rate": 4.859252196790702e-06, + "loss": 4.7366, + "step": 100115 + }, + { + "epoch": 2.0369466145833335, + "grad_norm": 14.343164443969727, + "learning_rate": 4.858852678173959e-06, + "loss": 5.1134, + "step": 100120 + }, + { + "epoch": 2.03704833984375, + "grad_norm": 22.374549865722656, + "learning_rate": 4.858453160459104e-06, + "loss": 4.754, + "step": 100125 + }, + { + "epoch": 2.0371500651041665, + "grad_norm": 14.2816162109375, + "learning_rate": 4.858053643648688e-06, + "loss": 4.7845, + "step": 100130 + }, + { + "epoch": 2.0372517903645835, + "grad_norm": 17.135719299316406, + "learning_rate": 4.857654127745263e-06, + "loss": 5.0452, + "step": 100135 + }, + { + "epoch": 2.037353515625, + "grad_norm": 18.836185455322266, + "learning_rate": 4.8572546127513845e-06, + "loss": 4.8019, + "step": 100140 + }, + { + "epoch": 2.0374552408854165, + "grad_norm": 21.75788688659668, + "learning_rate": 4.856855098669603e-06, + "loss": 5.0199, + "step": 100145 + }, + { + "epoch": 2.0375569661458335, + "grad_norm": 26.645586013793945, + "learning_rate": 4.856455585502471e-06, + "loss": 4.9267, + "step": 100150 + }, + { + "epoch": 2.03765869140625, + "grad_norm": 21.504100799560547, + "learning_rate": 4.856056073252544e-06, + "loss": 4.6751, + "step": 100155 + }, + { + "epoch": 2.0377604166666665, + "grad_norm": 16.865964889526367, + "learning_rate": 4.8556565619223725e-06, + "loss": 4.7702, + "step": 100160 + }, + { + "epoch": 2.0378621419270835, + "grad_norm": 14.295774459838867, + "learning_rate": 4.85525705151451e-06, + "loss": 4.8594, + "step": 100165 + }, + { + "epoch": 2.0379638671875, + "grad_norm": 22.138967514038086, + "learning_rate": 4.854857542031507e-06, + "loss": 4.724, + "step": 100170 + }, + { + "epoch": 2.0380655924479165, + "grad_norm": 18.043434143066406, + "learning_rate": 4.85445803347592e-06, + "loss": 4.9497, + "step": 100175 + }, + { + "epoch": 2.0381673177083335, + "grad_norm": 19.85261344909668, + "learning_rate": 4.8540585258503e-06, + "loss": 4.9286, + "step": 100180 + }, + { + "epoch": 2.03826904296875, + "grad_norm": 22.292387008666992, + "learning_rate": 4.8536590191572e-06, + "loss": 4.9227, + "step": 100185 + }, + { + "epoch": 2.0383707682291665, + "grad_norm": 16.307857513427734, + "learning_rate": 4.8532595133991725e-06, + "loss": 4.575, + "step": 100190 + }, + { + "epoch": 2.0384724934895835, + "grad_norm": 18.151504516601562, + "learning_rate": 4.852860008578768e-06, + "loss": 4.903, + "step": 100195 + }, + { + "epoch": 2.03857421875, + "grad_norm": 28.071287155151367, + "learning_rate": 4.8524605046985426e-06, + "loss": 5.1234, + "step": 100200 + }, + { + "epoch": 2.0386759440104165, + "grad_norm": 16.98441505432129, + "learning_rate": 4.852061001761049e-06, + "loss": 4.9143, + "step": 100205 + }, + { + "epoch": 2.0387776692708335, + "grad_norm": 20.293498992919922, + "learning_rate": 4.851661499768838e-06, + "loss": 4.8507, + "step": 100210 + }, + { + "epoch": 2.03887939453125, + "grad_norm": 18.798992156982422, + "learning_rate": 4.851261998724462e-06, + "loss": 4.6854, + "step": 100215 + }, + { + "epoch": 2.0389811197916665, + "grad_norm": 20.580612182617188, + "learning_rate": 4.850862498630477e-06, + "loss": 4.7783, + "step": 100220 + }, + { + "epoch": 2.0390828450520835, + "grad_norm": 17.794950485229492, + "learning_rate": 4.850462999489429e-06, + "loss": 4.7403, + "step": 100225 + }, + { + "epoch": 2.0391845703125, + "grad_norm": 13.601666450500488, + "learning_rate": 4.850063501303877e-06, + "loss": 4.6766, + "step": 100230 + }, + { + "epoch": 2.0392862955729165, + "grad_norm": 24.911584854125977, + "learning_rate": 4.849664004076373e-06, + "loss": 5.0713, + "step": 100235 + }, + { + "epoch": 2.0393880208333335, + "grad_norm": 23.282812118530273, + "learning_rate": 4.849264507809468e-06, + "loss": 4.7673, + "step": 100240 + }, + { + "epoch": 2.03948974609375, + "grad_norm": 20.689918518066406, + "learning_rate": 4.848865012505713e-06, + "loss": 4.9012, + "step": 100245 + }, + { + "epoch": 2.0395914713541665, + "grad_norm": 16.537721633911133, + "learning_rate": 4.848465518167664e-06, + "loss": 4.9091, + "step": 100250 + }, + { + "epoch": 2.0396931966145835, + "grad_norm": 21.050153732299805, + "learning_rate": 4.848066024797873e-06, + "loss": 4.8, + "step": 100255 + }, + { + "epoch": 2.039794921875, + "grad_norm": 17.360597610473633, + "learning_rate": 4.84766653239889e-06, + "loss": 4.5661, + "step": 100260 + }, + { + "epoch": 2.0398966471354165, + "grad_norm": 18.822656631469727, + "learning_rate": 4.8472670409732705e-06, + "loss": 4.7205, + "step": 100265 + }, + { + "epoch": 2.0399983723958335, + "grad_norm": 15.833327293395996, + "learning_rate": 4.846867550523567e-06, + "loss": 4.9002, + "step": 100270 + }, + { + "epoch": 2.04010009765625, + "grad_norm": 18.85454750061035, + "learning_rate": 4.846468061052329e-06, + "loss": 4.8032, + "step": 100275 + }, + { + "epoch": 2.0402018229166665, + "grad_norm": 16.646305084228516, + "learning_rate": 4.846068572562113e-06, + "loss": 5.0657, + "step": 100280 + }, + { + "epoch": 2.0403035481770835, + "grad_norm": 17.27202796936035, + "learning_rate": 4.845669085055469e-06, + "loss": 5.4066, + "step": 100285 + }, + { + "epoch": 2.0404052734375, + "grad_norm": 20.882902145385742, + "learning_rate": 4.845269598534952e-06, + "loss": 4.7472, + "step": 100290 + }, + { + "epoch": 2.0405069986979165, + "grad_norm": 17.332530975341797, + "learning_rate": 4.844870113003112e-06, + "loss": 5.0161, + "step": 100295 + }, + { + "epoch": 2.0406087239583335, + "grad_norm": 21.29181480407715, + "learning_rate": 4.844470628462502e-06, + "loss": 4.6916, + "step": 100300 + }, + { + "epoch": 2.04071044921875, + "grad_norm": 18.26431655883789, + "learning_rate": 4.844071144915676e-06, + "loss": 5.0849, + "step": 100305 + }, + { + "epoch": 2.0408121744791665, + "grad_norm": 22.45418357849121, + "learning_rate": 4.843671662365188e-06, + "loss": 4.5736, + "step": 100310 + }, + { + "epoch": 2.0409138997395835, + "grad_norm": 20.616928100585938, + "learning_rate": 4.843272180813586e-06, + "loss": 5.1757, + "step": 100315 + }, + { + "epoch": 2.041015625, + "grad_norm": 20.034564971923828, + "learning_rate": 4.842872700263427e-06, + "loss": 4.8469, + "step": 100320 + }, + { + "epoch": 2.0411173502604165, + "grad_norm": 17.783279418945312, + "learning_rate": 4.842473220717258e-06, + "loss": 4.9172, + "step": 100325 + }, + { + "epoch": 2.0412190755208335, + "grad_norm": 20.97488784790039, + "learning_rate": 4.8420737421776385e-06, + "loss": 4.8937, + "step": 100330 + }, + { + "epoch": 2.04132080078125, + "grad_norm": 17.956626892089844, + "learning_rate": 4.841674264647117e-06, + "loss": 4.9623, + "step": 100335 + }, + { + "epoch": 2.0414225260416665, + "grad_norm": 18.052486419677734, + "learning_rate": 4.841274788128246e-06, + "loss": 4.8802, + "step": 100340 + }, + { + "epoch": 2.0415242513020835, + "grad_norm": 19.743572235107422, + "learning_rate": 4.84087531262358e-06, + "loss": 4.7812, + "step": 100345 + }, + { + "epoch": 2.0416259765625, + "grad_norm": 14.949989318847656, + "learning_rate": 4.840475838135667e-06, + "loss": 4.9178, + "step": 100350 + }, + { + "epoch": 2.0417277018229165, + "grad_norm": 19.6732234954834, + "learning_rate": 4.840076364667065e-06, + "loss": 4.8467, + "step": 100355 + }, + { + "epoch": 2.0418294270833335, + "grad_norm": 21.483701705932617, + "learning_rate": 4.839676892220325e-06, + "loss": 4.8161, + "step": 100360 + }, + { + "epoch": 2.04193115234375, + "grad_norm": 21.911609649658203, + "learning_rate": 4.8392774207979984e-06, + "loss": 5.2347, + "step": 100365 + }, + { + "epoch": 2.0420328776041665, + "grad_norm": 18.19731330871582, + "learning_rate": 4.838877950402638e-06, + "loss": 5.0521, + "step": 100370 + }, + { + "epoch": 2.0421346028645835, + "grad_norm": 22.01627540588379, + "learning_rate": 4.8384784810367965e-06, + "loss": 4.9298, + "step": 100375 + }, + { + "epoch": 2.042236328125, + "grad_norm": 15.209760665893555, + "learning_rate": 4.838079012703024e-06, + "loss": 4.8241, + "step": 100380 + }, + { + "epoch": 2.0423380533854165, + "grad_norm": 16.43498420715332, + "learning_rate": 4.837679545403877e-06, + "loss": 4.6613, + "step": 100385 + }, + { + "epoch": 2.0424397786458335, + "grad_norm": 17.614662170410156, + "learning_rate": 4.837280079141907e-06, + "loss": 4.6508, + "step": 100390 + }, + { + "epoch": 2.04254150390625, + "grad_norm": 19.482999801635742, + "learning_rate": 4.836880613919664e-06, + "loss": 4.7221, + "step": 100395 + }, + { + "epoch": 2.0426432291666665, + "grad_norm": 22.79219627380371, + "learning_rate": 4.836481149739703e-06, + "loss": 4.8813, + "step": 100400 + }, + { + "epoch": 2.0427449544270835, + "grad_norm": 15.852668762207031, + "learning_rate": 4.836081686604575e-06, + "loss": 5.0885, + "step": 100405 + }, + { + "epoch": 2.0428466796875, + "grad_norm": 30.534685134887695, + "learning_rate": 4.835682224516834e-06, + "loss": 4.6773, + "step": 100410 + }, + { + "epoch": 2.0429484049479165, + "grad_norm": 18.77507972717285, + "learning_rate": 4.83528276347903e-06, + "loss": 4.8407, + "step": 100415 + }, + { + "epoch": 2.0430501302083335, + "grad_norm": 18.475528717041016, + "learning_rate": 4.834883303493718e-06, + "loss": 4.9581, + "step": 100420 + }, + { + "epoch": 2.04315185546875, + "grad_norm": 20.19733428955078, + "learning_rate": 4.834483844563449e-06, + "loss": 4.7177, + "step": 100425 + }, + { + "epoch": 2.0432535807291665, + "grad_norm": 28.52387237548828, + "learning_rate": 4.834084386690775e-06, + "loss": 4.9683, + "step": 100430 + }, + { + "epoch": 2.0433553059895835, + "grad_norm": 16.440244674682617, + "learning_rate": 4.833684929878249e-06, + "loss": 4.6869, + "step": 100435 + }, + { + "epoch": 2.04345703125, + "grad_norm": 22.867544174194336, + "learning_rate": 4.833285474128425e-06, + "loss": 4.9283, + "step": 100440 + }, + { + "epoch": 2.0435587565104165, + "grad_norm": 14.17672061920166, + "learning_rate": 4.832886019443854e-06, + "loss": 4.8614, + "step": 100445 + }, + { + "epoch": 2.0436604817708335, + "grad_norm": 18.269947052001953, + "learning_rate": 4.832486565827087e-06, + "loss": 4.9745, + "step": 100450 + }, + { + "epoch": 2.04376220703125, + "grad_norm": 15.89300537109375, + "learning_rate": 4.832087113280677e-06, + "loss": 4.8055, + "step": 100455 + }, + { + "epoch": 2.0438639322916665, + "grad_norm": 17.40559196472168, + "learning_rate": 4.831687661807178e-06, + "loss": 4.9565, + "step": 100460 + }, + { + "epoch": 2.0439656575520835, + "grad_norm": 22.043994903564453, + "learning_rate": 4.831288211409143e-06, + "loss": 4.6369, + "step": 100465 + }, + { + "epoch": 2.0440673828125, + "grad_norm": 35.90878677368164, + "learning_rate": 4.830888762089122e-06, + "loss": 4.7222, + "step": 100470 + }, + { + "epoch": 2.0441691080729165, + "grad_norm": 15.176666259765625, + "learning_rate": 4.8304893138496675e-06, + "loss": 4.904, + "step": 100475 + }, + { + "epoch": 2.0442708333333335, + "grad_norm": 17.188600540161133, + "learning_rate": 4.83008986669333e-06, + "loss": 4.6913, + "step": 100480 + }, + { + "epoch": 2.04437255859375, + "grad_norm": 20.65865135192871, + "learning_rate": 4.8296904206226676e-06, + "loss": 4.6234, + "step": 100485 + }, + { + "epoch": 2.0444742838541665, + "grad_norm": 18.524059295654297, + "learning_rate": 4.829290975640229e-06, + "loss": 4.8858, + "step": 100490 + }, + { + "epoch": 2.0445760091145835, + "grad_norm": 21.396896362304688, + "learning_rate": 4.8288915317485666e-06, + "loss": 4.5559, + "step": 100495 + }, + { + "epoch": 2.044677734375, + "grad_norm": 16.881092071533203, + "learning_rate": 4.8284920889502334e-06, + "loss": 4.8606, + "step": 100500 + }, + { + "epoch": 2.0447794596354165, + "grad_norm": 16.552091598510742, + "learning_rate": 4.828092647247779e-06, + "loss": 4.6671, + "step": 100505 + }, + { + "epoch": 2.0448811848958335, + "grad_norm": 18.109901428222656, + "learning_rate": 4.82769320664376e-06, + "loss": 4.9544, + "step": 100510 + }, + { + "epoch": 2.04498291015625, + "grad_norm": 19.716360092163086, + "learning_rate": 4.827293767140726e-06, + "loss": 4.7536, + "step": 100515 + }, + { + "epoch": 2.0450846354166665, + "grad_norm": 19.87786865234375, + "learning_rate": 4.826894328741232e-06, + "loss": 4.8495, + "step": 100520 + }, + { + "epoch": 2.0451863606770835, + "grad_norm": 15.85126781463623, + "learning_rate": 4.826494891447825e-06, + "loss": 4.8987, + "step": 100525 + }, + { + "epoch": 2.0452880859375, + "grad_norm": 17.16317367553711, + "learning_rate": 4.826095455263063e-06, + "loss": 4.9002, + "step": 100530 + }, + { + "epoch": 2.0453898111979165, + "grad_norm": 16.967992782592773, + "learning_rate": 4.825696020189495e-06, + "loss": 4.8629, + "step": 100535 + }, + { + "epoch": 2.0454915364583335, + "grad_norm": 14.001582145690918, + "learning_rate": 4.825296586229674e-06, + "loss": 4.9292, + "step": 100540 + }, + { + "epoch": 2.04559326171875, + "grad_norm": 20.05876350402832, + "learning_rate": 4.8248971533861525e-06, + "loss": 4.9479, + "step": 100545 + }, + { + "epoch": 2.0456949869791665, + "grad_norm": 17.999385833740234, + "learning_rate": 4.824497721661483e-06, + "loss": 4.8257, + "step": 100550 + }, + { + "epoch": 2.0457967122395835, + "grad_norm": 19.058002471923828, + "learning_rate": 4.824098291058217e-06, + "loss": 4.7368, + "step": 100555 + }, + { + "epoch": 2.0458984375, + "grad_norm": 16.869237899780273, + "learning_rate": 4.823698861578906e-06, + "loss": 4.6608, + "step": 100560 + }, + { + "epoch": 2.0460001627604165, + "grad_norm": 21.200315475463867, + "learning_rate": 4.8232994332261044e-06, + "loss": 4.8466, + "step": 100565 + }, + { + "epoch": 2.0461018880208335, + "grad_norm": 24.603843688964844, + "learning_rate": 4.8229000060023635e-06, + "loss": 4.7076, + "step": 100570 + }, + { + "epoch": 2.04620361328125, + "grad_norm": 18.267597198486328, + "learning_rate": 4.822500579910236e-06, + "loss": 4.7235, + "step": 100575 + }, + { + "epoch": 2.0463053385416665, + "grad_norm": 21.602638244628906, + "learning_rate": 4.822101154952273e-06, + "loss": 4.8667, + "step": 100580 + }, + { + "epoch": 2.0464070638020835, + "grad_norm": 15.564812660217285, + "learning_rate": 4.8217017311310245e-06, + "loss": 4.7496, + "step": 100585 + }, + { + "epoch": 2.0465087890625, + "grad_norm": 28.46035385131836, + "learning_rate": 4.821302308449049e-06, + "loss": 4.8617, + "step": 100590 + }, + { + "epoch": 2.0466105143229165, + "grad_norm": 23.800016403198242, + "learning_rate": 4.820902886908893e-06, + "loss": 4.6911, + "step": 100595 + }, + { + "epoch": 2.0467122395833335, + "grad_norm": 20.42782974243164, + "learning_rate": 4.8205034665131126e-06, + "loss": 4.8554, + "step": 100600 + }, + { + "epoch": 2.04681396484375, + "grad_norm": 14.64866828918457, + "learning_rate": 4.820104047264257e-06, + "loss": 4.8835, + "step": 100605 + }, + { + "epoch": 2.0469156901041665, + "grad_norm": 16.388290405273438, + "learning_rate": 4.819704629164877e-06, + "loss": 4.767, + "step": 100610 + }, + { + "epoch": 2.0470174153645835, + "grad_norm": 15.843594551086426, + "learning_rate": 4.8193052122175295e-06, + "loss": 4.7691, + "step": 100615 + }, + { + "epoch": 2.047119140625, + "grad_norm": 24.919551849365234, + "learning_rate": 4.818905796424765e-06, + "loss": 4.6488, + "step": 100620 + }, + { + "epoch": 2.0472208658854165, + "grad_norm": 16.45829200744629, + "learning_rate": 4.818506381789135e-06, + "loss": 4.8282, + "step": 100625 + }, + { + "epoch": 2.0473225911458335, + "grad_norm": 22.892059326171875, + "learning_rate": 4.818106968313191e-06, + "loss": 4.6905, + "step": 100630 + }, + { + "epoch": 2.04742431640625, + "grad_norm": 21.445951461791992, + "learning_rate": 4.817707555999484e-06, + "loss": 5.1322, + "step": 100635 + }, + { + "epoch": 2.0475260416666665, + "grad_norm": 27.201135635375977, + "learning_rate": 4.81730814485057e-06, + "loss": 4.7555, + "step": 100640 + }, + { + "epoch": 2.0476277669270835, + "grad_norm": 18.914886474609375, + "learning_rate": 4.816908734868998e-06, + "loss": 4.5488, + "step": 100645 + }, + { + "epoch": 2.0477294921875, + "grad_norm": 19.918779373168945, + "learning_rate": 4.816509326057322e-06, + "loss": 4.9085, + "step": 100650 + }, + { + "epoch": 2.0478312174479165, + "grad_norm": 19.165489196777344, + "learning_rate": 4.816109918418092e-06, + "loss": 4.8362, + "step": 100655 + }, + { + "epoch": 2.0479329427083335, + "grad_norm": 19.095630645751953, + "learning_rate": 4.815710511953861e-06, + "loss": 4.8057, + "step": 100660 + }, + { + "epoch": 2.04803466796875, + "grad_norm": 14.145696640014648, + "learning_rate": 4.815311106667183e-06, + "loss": 4.6063, + "step": 100665 + }, + { + "epoch": 2.0481363932291665, + "grad_norm": 16.4950008392334, + "learning_rate": 4.814911702560608e-06, + "loss": 4.5544, + "step": 100670 + }, + { + "epoch": 2.0482381184895835, + "grad_norm": 15.540918350219727, + "learning_rate": 4.814512299636688e-06, + "loss": 4.8322, + "step": 100675 + }, + { + "epoch": 2.04833984375, + "grad_norm": 18.365827560424805, + "learning_rate": 4.814112897897975e-06, + "loss": 4.6468, + "step": 100680 + }, + { + "epoch": 2.0484415690104165, + "grad_norm": 26.752714157104492, + "learning_rate": 4.813713497347023e-06, + "loss": 4.6747, + "step": 100685 + }, + { + "epoch": 2.0485432942708335, + "grad_norm": 22.186399459838867, + "learning_rate": 4.813314097986381e-06, + "loss": 4.8543, + "step": 100690 + }, + { + "epoch": 2.04864501953125, + "grad_norm": 16.94130516052246, + "learning_rate": 4.8129146998186025e-06, + "loss": 4.6646, + "step": 100695 + }, + { + "epoch": 2.0487467447916665, + "grad_norm": 16.290552139282227, + "learning_rate": 4.812515302846241e-06, + "loss": 5.0645, + "step": 100700 + }, + { + "epoch": 2.0488484700520835, + "grad_norm": 28.572816848754883, + "learning_rate": 4.812115907071847e-06, + "loss": 4.622, + "step": 100705 + }, + { + "epoch": 2.0489501953125, + "grad_norm": 16.008825302124023, + "learning_rate": 4.811716512497973e-06, + "loss": 4.853, + "step": 100710 + }, + { + "epoch": 2.0490519205729165, + "grad_norm": 18.947954177856445, + "learning_rate": 4.811317119127168e-06, + "loss": 4.474, + "step": 100715 + }, + { + "epoch": 2.0491536458333335, + "grad_norm": 14.217338562011719, + "learning_rate": 4.81091772696199e-06, + "loss": 4.9202, + "step": 100720 + }, + { + "epoch": 2.04925537109375, + "grad_norm": 21.09691619873047, + "learning_rate": 4.810518336004987e-06, + "loss": 4.6355, + "step": 100725 + }, + { + "epoch": 2.0493570963541665, + "grad_norm": 16.271141052246094, + "learning_rate": 4.810118946258712e-06, + "loss": 4.994, + "step": 100730 + }, + { + "epoch": 2.0494588216145835, + "grad_norm": 17.58016586303711, + "learning_rate": 4.809719557725716e-06, + "loss": 4.9634, + "step": 100735 + }, + { + "epoch": 2.049560546875, + "grad_norm": 15.637653350830078, + "learning_rate": 4.809320170408549e-06, + "loss": 5.1791, + "step": 100740 + }, + { + "epoch": 2.0496622721354165, + "grad_norm": 16.82666015625, + "learning_rate": 4.808920784309769e-06, + "loss": 4.842, + "step": 100745 + }, + { + "epoch": 2.0497639973958335, + "grad_norm": 12.83143424987793, + "learning_rate": 4.808521399431925e-06, + "loss": 4.5747, + "step": 100750 + }, + { + "epoch": 2.04986572265625, + "grad_norm": 17.02246856689453, + "learning_rate": 4.8081220157775675e-06, + "loss": 4.6908, + "step": 100755 + }, + { + "epoch": 2.0499674479166665, + "grad_norm": 16.41691017150879, + "learning_rate": 4.807722633349249e-06, + "loss": 4.7492, + "step": 100760 + }, + { + "epoch": 2.0500691731770835, + "grad_norm": 14.746407508850098, + "learning_rate": 4.80732325214952e-06, + "loss": 4.9586, + "step": 100765 + }, + { + "epoch": 2.0501708984375, + "grad_norm": 16.387741088867188, + "learning_rate": 4.806923872180936e-06, + "loss": 4.7123, + "step": 100770 + }, + { + "epoch": 2.0502726236979165, + "grad_norm": 15.501778602600098, + "learning_rate": 4.806524493446049e-06, + "loss": 4.5888, + "step": 100775 + }, + { + "epoch": 2.0503743489583335, + "grad_norm": 22.922731399536133, + "learning_rate": 4.806125115947407e-06, + "loss": 4.6008, + "step": 100780 + }, + { + "epoch": 2.05047607421875, + "grad_norm": 15.39433765411377, + "learning_rate": 4.805725739687565e-06, + "loss": 4.7178, + "step": 100785 + }, + { + "epoch": 2.0505777994791665, + "grad_norm": 15.152400016784668, + "learning_rate": 4.805326364669071e-06, + "loss": 4.902, + "step": 100790 + }, + { + "epoch": 2.0506795247395835, + "grad_norm": 22.76097869873047, + "learning_rate": 4.804926990894483e-06, + "loss": 5.2179, + "step": 100795 + }, + { + "epoch": 2.05078125, + "grad_norm": 15.41157054901123, + "learning_rate": 4.804527618366349e-06, + "loss": 5.0742, + "step": 100800 + }, + { + "epoch": 2.0508829752604165, + "grad_norm": 21.983444213867188, + "learning_rate": 4.804128247087221e-06, + "loss": 4.7155, + "step": 100805 + }, + { + "epoch": 2.0509847005208335, + "grad_norm": 21.053146362304688, + "learning_rate": 4.8037288770596505e-06, + "loss": 4.8437, + "step": 100810 + }, + { + "epoch": 2.05108642578125, + "grad_norm": 18.346540451049805, + "learning_rate": 4.803329508286192e-06, + "loss": 4.8781, + "step": 100815 + }, + { + "epoch": 2.0511881510416665, + "grad_norm": 28.81781005859375, + "learning_rate": 4.802930140769394e-06, + "loss": 4.9825, + "step": 100820 + }, + { + "epoch": 2.0512898763020835, + "grad_norm": 17.842424392700195, + "learning_rate": 4.80253077451181e-06, + "loss": 4.8267, + "step": 100825 + }, + { + "epoch": 2.0513916015625, + "grad_norm": 18.424232482910156, + "learning_rate": 4.802131409515991e-06, + "loss": 4.7851, + "step": 100830 + }, + { + "epoch": 2.0514933268229165, + "grad_norm": 15.495230674743652, + "learning_rate": 4.801732045784492e-06, + "loss": 4.7513, + "step": 100835 + }, + { + "epoch": 2.0515950520833335, + "grad_norm": 19.290788650512695, + "learning_rate": 4.801332683319861e-06, + "loss": 4.9446, + "step": 100840 + }, + { + "epoch": 2.05169677734375, + "grad_norm": 22.57986068725586, + "learning_rate": 4.800933322124649e-06, + "loss": 4.8664, + "step": 100845 + }, + { + "epoch": 2.0517985026041665, + "grad_norm": 18.65718650817871, + "learning_rate": 4.800533962201413e-06, + "loss": 4.7339, + "step": 100850 + }, + { + "epoch": 2.0519002278645835, + "grad_norm": 21.9931583404541, + "learning_rate": 4.8001346035527e-06, + "loss": 5.0993, + "step": 100855 + }, + { + "epoch": 2.052001953125, + "grad_norm": 17.336891174316406, + "learning_rate": 4.7997352461810644e-06, + "loss": 4.8476, + "step": 100860 + }, + { + "epoch": 2.0521036783854165, + "grad_norm": 17.69095230102539, + "learning_rate": 4.7993358900890565e-06, + "loss": 4.8762, + "step": 100865 + }, + { + "epoch": 2.0522054036458335, + "grad_norm": 15.186376571655273, + "learning_rate": 4.798936535279227e-06, + "loss": 4.9305, + "step": 100870 + }, + { + "epoch": 2.05230712890625, + "grad_norm": 17.803287506103516, + "learning_rate": 4.798537181754131e-06, + "loss": 5.0558, + "step": 100875 + }, + { + "epoch": 2.0524088541666665, + "grad_norm": 18.51963233947754, + "learning_rate": 4.798137829516319e-06, + "loss": 4.7668, + "step": 100880 + }, + { + "epoch": 2.0525105794270835, + "grad_norm": 19.2238826751709, + "learning_rate": 4.797738478568342e-06, + "loss": 4.7845, + "step": 100885 + }, + { + "epoch": 2.0526123046875, + "grad_norm": 22.028793334960938, + "learning_rate": 4.797339128912751e-06, + "loss": 4.8498, + "step": 100890 + }, + { + "epoch": 2.0527140299479165, + "grad_norm": 12.772798538208008, + "learning_rate": 4.796939780552098e-06, + "loss": 4.8044, + "step": 100895 + }, + { + "epoch": 2.0528157552083335, + "grad_norm": 19.92797088623047, + "learning_rate": 4.796540433488936e-06, + "loss": 4.8872, + "step": 100900 + }, + { + "epoch": 2.05291748046875, + "grad_norm": 20.227270126342773, + "learning_rate": 4.796141087725817e-06, + "loss": 4.7778, + "step": 100905 + }, + { + "epoch": 2.0530192057291665, + "grad_norm": 18.529415130615234, + "learning_rate": 4.7957417432652916e-06, + "loss": 4.739, + "step": 100910 + }, + { + "epoch": 2.0531209309895835, + "grad_norm": 25.245609283447266, + "learning_rate": 4.795342400109911e-06, + "loss": 4.7083, + "step": 100915 + }, + { + "epoch": 2.05322265625, + "grad_norm": 26.650554656982422, + "learning_rate": 4.794943058262227e-06, + "loss": 4.7728, + "step": 100920 + }, + { + "epoch": 2.0533243815104165, + "grad_norm": 19.215431213378906, + "learning_rate": 4.794543717724792e-06, + "loss": 4.9932, + "step": 100925 + }, + { + "epoch": 2.0534261067708335, + "grad_norm": 18.18852996826172, + "learning_rate": 4.794144378500159e-06, + "loss": 4.9866, + "step": 100930 + }, + { + "epoch": 2.05352783203125, + "grad_norm": 18.013553619384766, + "learning_rate": 4.793745040590875e-06, + "loss": 4.9073, + "step": 100935 + }, + { + "epoch": 2.0536295572916665, + "grad_norm": 23.532442092895508, + "learning_rate": 4.793345703999497e-06, + "loss": 5.0227, + "step": 100940 + }, + { + "epoch": 2.0537312825520835, + "grad_norm": 17.426801681518555, + "learning_rate": 4.792946368728574e-06, + "loss": 4.8981, + "step": 100945 + }, + { + "epoch": 2.0538330078125, + "grad_norm": 19.992942810058594, + "learning_rate": 4.792547034780658e-06, + "loss": 4.9303, + "step": 100950 + }, + { + "epoch": 2.0539347330729165, + "grad_norm": 23.9036865234375, + "learning_rate": 4.792147702158299e-06, + "loss": 4.8572, + "step": 100955 + }, + { + "epoch": 2.0540364583333335, + "grad_norm": 17.995622634887695, + "learning_rate": 4.7917483708640525e-06, + "loss": 4.7835, + "step": 100960 + }, + { + "epoch": 2.05413818359375, + "grad_norm": 18.364238739013672, + "learning_rate": 4.791349040900467e-06, + "loss": 4.8815, + "step": 100965 + }, + { + "epoch": 2.0542399088541665, + "grad_norm": 14.871469497680664, + "learning_rate": 4.790949712270096e-06, + "loss": 4.942, + "step": 100970 + }, + { + "epoch": 2.0543416341145835, + "grad_norm": 17.234086990356445, + "learning_rate": 4.790550384975487e-06, + "loss": 4.7708, + "step": 100975 + }, + { + "epoch": 2.054443359375, + "grad_norm": 19.248943328857422, + "learning_rate": 4.790151059019197e-06, + "loss": 4.7527, + "step": 100980 + }, + { + "epoch": 2.0545450846354165, + "grad_norm": 14.097336769104004, + "learning_rate": 4.789751734403775e-06, + "loss": 4.8514, + "step": 100985 + }, + { + "epoch": 2.0546468098958335, + "grad_norm": 15.590710639953613, + "learning_rate": 4.789352411131773e-06, + "loss": 4.7975, + "step": 100990 + }, + { + "epoch": 2.05474853515625, + "grad_norm": 16.082029342651367, + "learning_rate": 4.7889530892057415e-06, + "loss": 4.7195, + "step": 100995 + }, + { + "epoch": 2.0548502604166665, + "grad_norm": 18.33749008178711, + "learning_rate": 4.788553768628231e-06, + "loss": 4.6601, + "step": 101000 + }, + { + "epoch": 2.0549519856770835, + "grad_norm": 15.548284530639648, + "learning_rate": 4.7881544494017964e-06, + "loss": 4.6716, + "step": 101005 + }, + { + "epoch": 2.0550537109375, + "grad_norm": 20.16717529296875, + "learning_rate": 4.787755131528988e-06, + "loss": 4.8518, + "step": 101010 + }, + { + "epoch": 2.0551554361979165, + "grad_norm": 17.511167526245117, + "learning_rate": 4.787355815012357e-06, + "loss": 4.7475, + "step": 101015 + }, + { + "epoch": 2.0552571614583335, + "grad_norm": 23.161405563354492, + "learning_rate": 4.7869564998544554e-06, + "loss": 4.5714, + "step": 101020 + }, + { + "epoch": 2.05535888671875, + "grad_norm": 20.3461856842041, + "learning_rate": 4.7865571860578316e-06, + "loss": 5.1607, + "step": 101025 + }, + { + "epoch": 2.0554606119791665, + "grad_norm": 24.182058334350586, + "learning_rate": 4.786157873625042e-06, + "loss": 4.8712, + "step": 101030 + }, + { + "epoch": 2.0555623372395835, + "grad_norm": 17.377592086791992, + "learning_rate": 4.785758562558635e-06, + "loss": 4.7453, + "step": 101035 + }, + { + "epoch": 2.0556640625, + "grad_norm": 16.699491500854492, + "learning_rate": 4.785359252861163e-06, + "loss": 4.7378, + "step": 101040 + }, + { + "epoch": 2.0557657877604165, + "grad_norm": 16.977685928344727, + "learning_rate": 4.784959944535178e-06, + "loss": 4.7773, + "step": 101045 + }, + { + "epoch": 2.0558675130208335, + "grad_norm": 15.833562850952148, + "learning_rate": 4.784560637583227e-06, + "loss": 5.0409, + "step": 101050 + }, + { + "epoch": 2.05596923828125, + "grad_norm": 14.689312934875488, + "learning_rate": 4.784161332007869e-06, + "loss": 4.657, + "step": 101055 + }, + { + "epoch": 2.0560709635416665, + "grad_norm": 32.557498931884766, + "learning_rate": 4.78376202781165e-06, + "loss": 4.77, + "step": 101060 + }, + { + "epoch": 2.0561726888020835, + "grad_norm": 20.350910186767578, + "learning_rate": 4.7833627249971235e-06, + "loss": 4.6028, + "step": 101065 + }, + { + "epoch": 2.0562744140625, + "grad_norm": 16.356975555419922, + "learning_rate": 4.78296342356684e-06, + "loss": 4.6927, + "step": 101070 + }, + { + "epoch": 2.0563761393229165, + "grad_norm": 34.03138732910156, + "learning_rate": 4.782564123523351e-06, + "loss": 4.7759, + "step": 101075 + }, + { + "epoch": 2.0564778645833335, + "grad_norm": 19.138885498046875, + "learning_rate": 4.782164824869209e-06, + "loss": 4.9254, + "step": 101080 + }, + { + "epoch": 2.05657958984375, + "grad_norm": 25.68146324157715, + "learning_rate": 4.781765527606964e-06, + "loss": 4.8588, + "step": 101085 + }, + { + "epoch": 2.0566813151041665, + "grad_norm": 19.17670249938965, + "learning_rate": 4.781366231739169e-06, + "loss": 4.8296, + "step": 101090 + }, + { + "epoch": 2.0567830403645835, + "grad_norm": 17.85704231262207, + "learning_rate": 4.780966937268374e-06, + "loss": 4.7422, + "step": 101095 + }, + { + "epoch": 2.056884765625, + "grad_norm": 17.043254852294922, + "learning_rate": 4.7805676441971295e-06, + "loss": 4.9092, + "step": 101100 + }, + { + "epoch": 2.0569864908854165, + "grad_norm": 14.930903434753418, + "learning_rate": 4.7801683525279895e-06, + "loss": 4.8928, + "step": 101105 + }, + { + "epoch": 2.0570882161458335, + "grad_norm": 17.705272674560547, + "learning_rate": 4.779769062263504e-06, + "loss": 4.8483, + "step": 101110 + }, + { + "epoch": 2.05718994140625, + "grad_norm": 16.891109466552734, + "learning_rate": 4.779369773406224e-06, + "loss": 4.6187, + "step": 101115 + }, + { + "epoch": 2.0572916666666665, + "grad_norm": 19.288963317871094, + "learning_rate": 4.778970485958701e-06, + "loss": 4.7192, + "step": 101120 + }, + { + "epoch": 2.0573933919270835, + "grad_norm": 15.326212882995605, + "learning_rate": 4.778571199923487e-06, + "loss": 4.7377, + "step": 101125 + }, + { + "epoch": 2.0574951171875, + "grad_norm": 17.44032096862793, + "learning_rate": 4.77817191530313e-06, + "loss": 4.5146, + "step": 101130 + }, + { + "epoch": 2.0575968424479165, + "grad_norm": 17.89404296875, + "learning_rate": 4.777772632100187e-06, + "loss": 5.0723, + "step": 101135 + }, + { + "epoch": 2.0576985677083335, + "grad_norm": 18.5718936920166, + "learning_rate": 4.777373350317206e-06, + "loss": 4.665, + "step": 101140 + }, + { + "epoch": 2.05780029296875, + "grad_norm": 20.481884002685547, + "learning_rate": 4.776974069956738e-06, + "loss": 4.915, + "step": 101145 + }, + { + "epoch": 2.0579020182291665, + "grad_norm": 16.07745361328125, + "learning_rate": 4.776574791021336e-06, + "loss": 5.0138, + "step": 101150 + }, + { + "epoch": 2.0580037434895835, + "grad_norm": 16.351285934448242, + "learning_rate": 4.776175513513547e-06, + "loss": 4.9361, + "step": 101155 + }, + { + "epoch": 2.05810546875, + "grad_norm": 15.682068824768066, + "learning_rate": 4.775776237435928e-06, + "loss": 4.8312, + "step": 101160 + }, + { + "epoch": 2.0582071940104165, + "grad_norm": 19.819475173950195, + "learning_rate": 4.775376962791028e-06, + "loss": 4.9643, + "step": 101165 + }, + { + "epoch": 2.0583089192708335, + "grad_norm": 16.756547927856445, + "learning_rate": 4.774977689581398e-06, + "loss": 4.9908, + "step": 101170 + }, + { + "epoch": 2.05841064453125, + "grad_norm": 17.47171401977539, + "learning_rate": 4.774578417809589e-06, + "loss": 4.4891, + "step": 101175 + }, + { + "epoch": 2.0585123697916665, + "grad_norm": 17.102481842041016, + "learning_rate": 4.77417914747815e-06, + "loss": 4.542, + "step": 101180 + }, + { + "epoch": 2.0586140950520835, + "grad_norm": 25.460107803344727, + "learning_rate": 4.773779878589636e-06, + "loss": 4.8263, + "step": 101185 + }, + { + "epoch": 2.0587158203125, + "grad_norm": 15.667125701904297, + "learning_rate": 4.773380611146598e-06, + "loss": 4.8527, + "step": 101190 + }, + { + "epoch": 2.0588175455729165, + "grad_norm": 22.27829360961914, + "learning_rate": 4.772981345151585e-06, + "loss": 4.5473, + "step": 101195 + }, + { + "epoch": 2.0589192708333335, + "grad_norm": 17.628192901611328, + "learning_rate": 4.772582080607149e-06, + "loss": 4.878, + "step": 101200 + }, + { + "epoch": 2.05902099609375, + "grad_norm": 19.776504516601562, + "learning_rate": 4.77218281751584e-06, + "loss": 4.787, + "step": 101205 + }, + { + "epoch": 2.0591227213541665, + "grad_norm": 18.546045303344727, + "learning_rate": 4.7717835558802124e-06, + "loss": 4.8722, + "step": 101210 + }, + { + "epoch": 2.0592244466145835, + "grad_norm": 31.58452033996582, + "learning_rate": 4.771384295702815e-06, + "loss": 4.9921, + "step": 101215 + }, + { + "epoch": 2.059326171875, + "grad_norm": 15.536352157592773, + "learning_rate": 4.7709850369862e-06, + "loss": 4.9642, + "step": 101220 + }, + { + "epoch": 2.0594278971354165, + "grad_norm": 20.181352615356445, + "learning_rate": 4.770585779732917e-06, + "loss": 4.6567, + "step": 101225 + }, + { + "epoch": 2.0595296223958335, + "grad_norm": 15.237805366516113, + "learning_rate": 4.7701865239455175e-06, + "loss": 4.8814, + "step": 101230 + }, + { + "epoch": 2.05963134765625, + "grad_norm": 22.843421936035156, + "learning_rate": 4.769787269626554e-06, + "loss": 4.6741, + "step": 101235 + }, + { + "epoch": 2.0597330729166665, + "grad_norm": 17.288055419921875, + "learning_rate": 4.769388016778577e-06, + "loss": 4.869, + "step": 101240 + }, + { + "epoch": 2.0598347981770835, + "grad_norm": 19.7629451751709, + "learning_rate": 4.7689887654041366e-06, + "loss": 4.614, + "step": 101245 + }, + { + "epoch": 2.0599365234375, + "grad_norm": 18.93234634399414, + "learning_rate": 4.768589515505786e-06, + "loss": 5.0303, + "step": 101250 + }, + { + "epoch": 2.0600382486979165, + "grad_norm": 14.705766677856445, + "learning_rate": 4.7681902670860745e-06, + "loss": 4.7029, + "step": 101255 + }, + { + "epoch": 2.0601399739583335, + "grad_norm": 17.629425048828125, + "learning_rate": 4.767791020147551e-06, + "loss": 4.6536, + "step": 101260 + }, + { + "epoch": 2.06024169921875, + "grad_norm": 15.908663749694824, + "learning_rate": 4.767391774692772e-06, + "loss": 5.0363, + "step": 101265 + }, + { + "epoch": 2.0603434244791665, + "grad_norm": 15.90963077545166, + "learning_rate": 4.766992530724285e-06, + "loss": 4.6788, + "step": 101270 + }, + { + "epoch": 2.0604451497395835, + "grad_norm": 18.788942337036133, + "learning_rate": 4.766593288244642e-06, + "loss": 4.655, + "step": 101275 + }, + { + "epoch": 2.060546875, + "grad_norm": 24.295637130737305, + "learning_rate": 4.766194047256395e-06, + "loss": 4.7375, + "step": 101280 + }, + { + "epoch": 2.0606486002604165, + "grad_norm": 22.022802352905273, + "learning_rate": 4.7657948077620904e-06, + "loss": 4.7748, + "step": 101285 + }, + { + "epoch": 2.0607503255208335, + "grad_norm": 16.807695388793945, + "learning_rate": 4.765395569764285e-06, + "loss": 4.7713, + "step": 101290 + }, + { + "epoch": 2.06085205078125, + "grad_norm": 16.568193435668945, + "learning_rate": 4.764996333265527e-06, + "loss": 4.7995, + "step": 101295 + }, + { + "epoch": 2.0609537760416665, + "grad_norm": 22.689926147460938, + "learning_rate": 4.764597098268368e-06, + "loss": 4.8596, + "step": 101300 + }, + { + "epoch": 2.0610555013020835, + "grad_norm": 16.222702026367188, + "learning_rate": 4.76419786477536e-06, + "loss": 4.8667, + "step": 101305 + }, + { + "epoch": 2.0611572265625, + "grad_norm": 15.322646141052246, + "learning_rate": 4.7637986327890495e-06, + "loss": 4.9137, + "step": 101310 + }, + { + "epoch": 2.0612589518229165, + "grad_norm": 14.971114158630371, + "learning_rate": 4.763399402311994e-06, + "loss": 4.6532, + "step": 101315 + }, + { + "epoch": 2.0613606770833335, + "grad_norm": 17.22572898864746, + "learning_rate": 4.76300017334674e-06, + "loss": 5.2364, + "step": 101320 + }, + { + "epoch": 2.06146240234375, + "grad_norm": 18.157194137573242, + "learning_rate": 4.76260094589584e-06, + "loss": 4.9261, + "step": 101325 + }, + { + "epoch": 2.0615641276041665, + "grad_norm": 22.091161727905273, + "learning_rate": 4.762201719961844e-06, + "loss": 4.7256, + "step": 101330 + }, + { + "epoch": 2.0616658528645835, + "grad_norm": 18.715559005737305, + "learning_rate": 4.761802495547303e-06, + "loss": 4.649, + "step": 101335 + }, + { + "epoch": 2.061767578125, + "grad_norm": 16.179445266723633, + "learning_rate": 4.761403272654769e-06, + "loss": 4.8079, + "step": 101340 + }, + { + "epoch": 2.0618693033854165, + "grad_norm": 23.3615779876709, + "learning_rate": 4.761004051286792e-06, + "loss": 4.9214, + "step": 101345 + }, + { + "epoch": 2.0619710286458335, + "grad_norm": 19.241317749023438, + "learning_rate": 4.760604831445924e-06, + "loss": 4.6561, + "step": 101350 + }, + { + "epoch": 2.06207275390625, + "grad_norm": 14.274453163146973, + "learning_rate": 4.760205613134716e-06, + "loss": 5.0795, + "step": 101355 + }, + { + "epoch": 2.0621744791666665, + "grad_norm": 17.143840789794922, + "learning_rate": 4.759806396355716e-06, + "loss": 4.7463, + "step": 101360 + }, + { + "epoch": 2.0622762044270835, + "grad_norm": 15.104227066040039, + "learning_rate": 4.759407181111477e-06, + "loss": 4.7364, + "step": 101365 + }, + { + "epoch": 2.0623779296875, + "grad_norm": 21.716514587402344, + "learning_rate": 4.759007967404552e-06, + "loss": 5.0529, + "step": 101370 + }, + { + "epoch": 2.0624796549479165, + "grad_norm": 26.39582633972168, + "learning_rate": 4.758608755237488e-06, + "loss": 4.7688, + "step": 101375 + }, + { + "epoch": 2.0625813802083335, + "grad_norm": 16.636247634887695, + "learning_rate": 4.758209544612838e-06, + "loss": 4.7606, + "step": 101380 + }, + { + "epoch": 2.06268310546875, + "grad_norm": 20.805360794067383, + "learning_rate": 4.757810335533149e-06, + "loss": 4.87, + "step": 101385 + }, + { + "epoch": 2.0627848307291665, + "grad_norm": 17.414892196655273, + "learning_rate": 4.757411128000979e-06, + "loss": 4.8935, + "step": 101390 + }, + { + "epoch": 2.0628865559895835, + "grad_norm": 16.70155143737793, + "learning_rate": 4.757011922018875e-06, + "loss": 4.5756, + "step": 101395 + }, + { + "epoch": 2.06298828125, + "grad_norm": 22.99529266357422, + "learning_rate": 4.7566127175893865e-06, + "loss": 4.8213, + "step": 101400 + }, + { + "epoch": 2.0630900065104165, + "grad_norm": 14.921157836914062, + "learning_rate": 4.756213514715066e-06, + "loss": 4.9925, + "step": 101405 + }, + { + "epoch": 2.0631917317708335, + "grad_norm": 20.876155853271484, + "learning_rate": 4.755814313398464e-06, + "loss": 4.782, + "step": 101410 + }, + { + "epoch": 2.06329345703125, + "grad_norm": 20.329557418823242, + "learning_rate": 4.7554151136421275e-06, + "loss": 4.7122, + "step": 101415 + }, + { + "epoch": 2.0633951822916665, + "grad_norm": 21.415903091430664, + "learning_rate": 4.7550159154486145e-06, + "loss": 5.0093, + "step": 101420 + }, + { + "epoch": 2.0634969075520835, + "grad_norm": 13.020194053649902, + "learning_rate": 4.754616718820471e-06, + "loss": 4.6595, + "step": 101425 + }, + { + "epoch": 2.0635986328125, + "grad_norm": 16.49935531616211, + "learning_rate": 4.75421752376025e-06, + "loss": 5.1751, + "step": 101430 + }, + { + "epoch": 2.0637003580729165, + "grad_norm": 30.838319778442383, + "learning_rate": 4.7538183302704995e-06, + "loss": 4.7671, + "step": 101435 + }, + { + "epoch": 2.0638020833333335, + "grad_norm": 18.569730758666992, + "learning_rate": 4.75341913835377e-06, + "loss": 4.7335, + "step": 101440 + }, + { + "epoch": 2.06390380859375, + "grad_norm": 22.12628936767578, + "learning_rate": 4.7530199480126166e-06, + "loss": 4.6398, + "step": 101445 + }, + { + "epoch": 2.0640055338541665, + "grad_norm": 18.558828353881836, + "learning_rate": 4.752620759249587e-06, + "loss": 4.7842, + "step": 101450 + }, + { + "epoch": 2.0641072591145835, + "grad_norm": 21.934890747070312, + "learning_rate": 4.752221572067232e-06, + "loss": 4.9568, + "step": 101455 + } + ], + "logging_steps": 5, + "max_steps": 196608, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 197, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.902358190974566e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}