diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6704 @@ +{ + "best_metric": 0.5016890168190002, + "best_model_checkpoint": "data/hansken_human_hql_v3/checkpoint-2345", + "epoch": 10.0, + "eval_steps": 500, + "global_step": 4690, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0021321961620469083, + "grad_norm": 1.0516366958618164, + "learning_rate": 1.4214641080312722e-07, + "loss": 1.9389, + "step": 1 + }, + { + "epoch": 0.010660980810234541, + "grad_norm": 0.9856139421463013, + "learning_rate": 7.107320540156362e-07, + "loss": 2.0398, + "step": 5 + }, + { + "epoch": 0.021321961620469083, + "grad_norm": 1.0568891763687134, + "learning_rate": 1.4214641080312723e-06, + "loss": 2.0618, + "step": 10 + }, + { + "epoch": 0.031982942430703626, + "grad_norm": 0.9998515844345093, + "learning_rate": 2.132196162046908e-06, + "loss": 2.0543, + "step": 15 + }, + { + "epoch": 0.042643923240938165, + "grad_norm": 1.004911184310913, + "learning_rate": 2.8429282160625447e-06, + "loss": 1.9997, + "step": 20 + }, + { + "epoch": 0.053304904051172705, + "grad_norm": 0.9931671619415283, + "learning_rate": 3.553660270078181e-06, + "loss": 1.9913, + "step": 25 + }, + { + "epoch": 0.06396588486140725, + "grad_norm": 0.9859012365341187, + "learning_rate": 4.264392324093816e-06, + "loss": 1.9729, + "step": 30 + }, + { + "epoch": 0.07462686567164178, + "grad_norm": 1.0391347408294678, + "learning_rate": 4.975124378109453e-06, + "loss": 1.9434, + "step": 35 + }, + { + "epoch": 0.08528784648187633, + "grad_norm": 0.8275197744369507, + "learning_rate": 5.685856432125089e-06, + "loss": 1.9092, + "step": 40 + }, + { + "epoch": 0.09594882729211088, + "grad_norm": 0.7102633714675903, + "learning_rate": 6.396588486140726e-06, + "loss": 1.8488, + "step": 45 + }, + { + "epoch": 0.10660980810234541, + "grad_norm": 0.6521381735801697, + "learning_rate": 7.107320540156362e-06, + "loss": 1.8673, + "step": 50 + }, + { + "epoch": 0.11727078891257996, + "grad_norm": 0.5477872490882874, + "learning_rate": 7.818052594171997e-06, + "loss": 1.7758, + "step": 55 + }, + { + "epoch": 0.1279317697228145, + "grad_norm": 0.49889788031578064, + "learning_rate": 8.528784648187633e-06, + "loss": 1.7453, + "step": 60 + }, + { + "epoch": 0.13859275053304904, + "grad_norm": 0.5726047158241272, + "learning_rate": 9.23951670220327e-06, + "loss": 1.7635, + "step": 65 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.4760012924671173, + "learning_rate": 9.950248756218906e-06, + "loss": 1.7027, + "step": 70 + }, + { + "epoch": 0.15991471215351813, + "grad_norm": 0.4642033278942108, + "learning_rate": 1.0660980810234541e-05, + "loss": 1.7086, + "step": 75 + }, + { + "epoch": 0.17057569296375266, + "grad_norm": 0.42560943961143494, + "learning_rate": 1.1371712864250179e-05, + "loss": 1.638, + "step": 80 + }, + { + "epoch": 0.1812366737739872, + "grad_norm": 0.4680778384208679, + "learning_rate": 1.2082444918265814e-05, + "loss": 1.6029, + "step": 85 + }, + { + "epoch": 0.19189765458422176, + "grad_norm": 0.4264519214630127, + "learning_rate": 1.2793176972281452e-05, + "loss": 1.4899, + "step": 90 + }, + { + "epoch": 0.2025586353944563, + "grad_norm": 0.41101664304733276, + "learning_rate": 1.3503909026297087e-05, + "loss": 1.4997, + "step": 95 + }, + { + "epoch": 0.21321961620469082, + "grad_norm": 0.34257784485816956, + "learning_rate": 1.4214641080312725e-05, + "loss": 1.4734, + "step": 100 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.34164702892303467, + "learning_rate": 1.4925373134328357e-05, + "loss": 1.4341, + "step": 105 + }, + { + "epoch": 0.2345415778251599, + "grad_norm": 0.3285938501358032, + "learning_rate": 1.5636105188343994e-05, + "loss": 1.4293, + "step": 110 + }, + { + "epoch": 0.24520255863539445, + "grad_norm": 0.33409905433654785, + "learning_rate": 1.634683724235963e-05, + "loss": 1.3792, + "step": 115 + }, + { + "epoch": 0.255863539445629, + "grad_norm": 0.3385579288005829, + "learning_rate": 1.7057569296375266e-05, + "loss": 1.3811, + "step": 120 + }, + { + "epoch": 0.26652452025586354, + "grad_norm": 0.35849225521087646, + "learning_rate": 1.7768301350390903e-05, + "loss": 1.3217, + "step": 125 + }, + { + "epoch": 0.2771855010660981, + "grad_norm": 0.3905642330646515, + "learning_rate": 1.847903340440654e-05, + "loss": 1.2792, + "step": 130 + }, + { + "epoch": 0.2878464818763326, + "grad_norm": 0.45816823840141296, + "learning_rate": 1.9189765458422178e-05, + "loss": 1.268, + "step": 135 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.42841047048568726, + "learning_rate": 1.990049751243781e-05, + "loss": 1.1999, + "step": 140 + }, + { + "epoch": 0.3091684434968017, + "grad_norm": 0.42461100220680237, + "learning_rate": 2.061122956645345e-05, + "loss": 1.1908, + "step": 145 + }, + { + "epoch": 0.31982942430703626, + "grad_norm": 0.3846851885318756, + "learning_rate": 2.1321961620469083e-05, + "loss": 1.0417, + "step": 150 + }, + { + "epoch": 0.3304904051172708, + "grad_norm": 0.35793304443359375, + "learning_rate": 2.203269367448472e-05, + "loss": 1.0804, + "step": 155 + }, + { + "epoch": 0.3411513859275053, + "grad_norm": 0.3422033488750458, + "learning_rate": 2.2743425728500358e-05, + "loss": 1.0433, + "step": 160 + }, + { + "epoch": 0.35181236673773986, + "grad_norm": 0.34404265880584717, + "learning_rate": 2.345415778251599e-05, + "loss": 1.0823, + "step": 165 + }, + { + "epoch": 0.3624733475479744, + "grad_norm": 0.31916388869285583, + "learning_rate": 2.416488983653163e-05, + "loss": 1.001, + "step": 170 + }, + { + "epoch": 0.373134328358209, + "grad_norm": 0.33065563440322876, + "learning_rate": 2.4875621890547266e-05, + "loss": 0.9698, + "step": 175 + }, + { + "epoch": 0.3837953091684435, + "grad_norm": 0.34518882632255554, + "learning_rate": 2.5586353944562904e-05, + "loss": 0.9731, + "step": 180 + }, + { + "epoch": 0.39445628997867804, + "grad_norm": 0.31844091415405273, + "learning_rate": 2.6297085998578534e-05, + "loss": 0.9293, + "step": 185 + }, + { + "epoch": 0.4051172707889126, + "grad_norm": 0.32537004351615906, + "learning_rate": 2.7007818052594175e-05, + "loss": 0.9306, + "step": 190 + }, + { + "epoch": 0.4157782515991471, + "grad_norm": 0.38439956307411194, + "learning_rate": 2.771855010660981e-05, + "loss": 0.8915, + "step": 195 + }, + { + "epoch": 0.42643923240938164, + "grad_norm": 0.3455168306827545, + "learning_rate": 2.842928216062545e-05, + "loss": 0.903, + "step": 200 + }, + { + "epoch": 0.43710021321961623, + "grad_norm": 0.36652979254722595, + "learning_rate": 2.914001421464108e-05, + "loss": 0.8468, + "step": 205 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.35580819845199585, + "learning_rate": 2.9850746268656714e-05, + "loss": 0.8467, + "step": 210 + }, + { + "epoch": 0.4584221748400853, + "grad_norm": 0.3748577833175659, + "learning_rate": 3.056147832267235e-05, + "loss": 0.8037, + "step": 215 + }, + { + "epoch": 0.4690831556503198, + "grad_norm": 0.3399907052516937, + "learning_rate": 3.127221037668799e-05, + "loss": 0.8525, + "step": 220 + }, + { + "epoch": 0.47974413646055436, + "grad_norm": 0.39041897654533386, + "learning_rate": 3.1982942430703626e-05, + "loss": 0.8672, + "step": 225 + }, + { + "epoch": 0.4904051172707889, + "grad_norm": 0.37930938601493835, + "learning_rate": 3.269367448471926e-05, + "loss": 0.7967, + "step": 230 + }, + { + "epoch": 0.5010660980810234, + "grad_norm": 0.4009639024734497, + "learning_rate": 3.34044065387349e-05, + "loss": 0.8134, + "step": 235 + }, + { + "epoch": 0.511727078891258, + "grad_norm": 0.4189032018184662, + "learning_rate": 3.411513859275053e-05, + "loss": 0.791, + "step": 240 + }, + { + "epoch": 0.5223880597014925, + "grad_norm": 0.3848344385623932, + "learning_rate": 3.4825870646766175e-05, + "loss": 0.8183, + "step": 245 + }, + { + "epoch": 0.5330490405117271, + "grad_norm": 0.41223597526550293, + "learning_rate": 3.5536602700781806e-05, + "loss": 0.7668, + "step": 250 + }, + { + "epoch": 0.5437100213219617, + "grad_norm": 0.4024832844734192, + "learning_rate": 3.624733475479744e-05, + "loss": 0.7819, + "step": 255 + }, + { + "epoch": 0.5543710021321961, + "grad_norm": 0.3832787871360779, + "learning_rate": 3.695806680881308e-05, + "loss": 0.7693, + "step": 260 + }, + { + "epoch": 0.5650319829424307, + "grad_norm": 0.4266470670700073, + "learning_rate": 3.766879886282871e-05, + "loss": 0.795, + "step": 265 + }, + { + "epoch": 0.5756929637526652, + "grad_norm": 0.47055262327194214, + "learning_rate": 3.8379530916844355e-05, + "loss": 0.7752, + "step": 270 + }, + { + "epoch": 0.5863539445628998, + "grad_norm": 0.420669823884964, + "learning_rate": 3.9090262970859986e-05, + "loss": 0.7691, + "step": 275 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.4140627384185791, + "learning_rate": 3.980099502487562e-05, + "loss": 0.7385, + "step": 280 + }, + { + "epoch": 0.6076759061833689, + "grad_norm": 0.4674805998802185, + "learning_rate": 4.051172707889126e-05, + "loss": 0.7668, + "step": 285 + }, + { + "epoch": 0.6183368869936035, + "grad_norm": 0.45881038904190063, + "learning_rate": 4.12224591329069e-05, + "loss": 0.7777, + "step": 290 + }, + { + "epoch": 0.6289978678038379, + "grad_norm": 0.4218686819076538, + "learning_rate": 4.1933191186922535e-05, + "loss": 0.7106, + "step": 295 + }, + { + "epoch": 0.6396588486140725, + "grad_norm": 0.43359580636024475, + "learning_rate": 4.2643923240938166e-05, + "loss": 0.7076, + "step": 300 + }, + { + "epoch": 0.650319829424307, + "grad_norm": 0.42106226086616516, + "learning_rate": 4.33546552949538e-05, + "loss": 0.7353, + "step": 305 + }, + { + "epoch": 0.6609808102345416, + "grad_norm": 0.4189695715904236, + "learning_rate": 4.406538734896944e-05, + "loss": 0.698, + "step": 310 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.45314905047416687, + "learning_rate": 4.477611940298508e-05, + "loss": 0.7356, + "step": 315 + }, + { + "epoch": 0.6823027718550106, + "grad_norm": 0.46034571528434753, + "learning_rate": 4.5486851457000715e-05, + "loss": 0.7397, + "step": 320 + }, + { + "epoch": 0.6929637526652452, + "grad_norm": 0.44907087087631226, + "learning_rate": 4.619758351101635e-05, + "loss": 0.7326, + "step": 325 + }, + { + "epoch": 0.7036247334754797, + "grad_norm": 0.46258679032325745, + "learning_rate": 4.690831556503198e-05, + "loss": 0.6663, + "step": 330 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.446308434009552, + "learning_rate": 4.761904761904762e-05, + "loss": 0.6941, + "step": 335 + }, + { + "epoch": 0.7249466950959488, + "grad_norm": 0.40378594398498535, + "learning_rate": 4.832977967306326e-05, + "loss": 0.7174, + "step": 340 + }, + { + "epoch": 0.7356076759061834, + "grad_norm": 0.39717379212379456, + "learning_rate": 4.904051172707889e-05, + "loss": 0.659, + "step": 345 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.4855833053588867, + "learning_rate": 4.975124378109453e-05, + "loss": 0.6762, + "step": 350 + }, + { + "epoch": 0.7569296375266524, + "grad_norm": 0.47973328828811646, + "learning_rate": 5.046197583511016e-05, + "loss": 0.6782, + "step": 355 + }, + { + "epoch": 0.767590618336887, + "grad_norm": 0.4429256319999695, + "learning_rate": 5.117270788912581e-05, + "loss": 0.6634, + "step": 360 + }, + { + "epoch": 0.7782515991471215, + "grad_norm": 0.44692516326904297, + "learning_rate": 5.1883439943141444e-05, + "loss": 0.6792, + "step": 365 + }, + { + "epoch": 0.7889125799573561, + "grad_norm": 0.4430787265300751, + "learning_rate": 5.259417199715707e-05, + "loss": 0.6416, + "step": 370 + }, + { + "epoch": 0.7995735607675906, + "grad_norm": 0.4461454451084137, + "learning_rate": 5.330490405117271e-05, + "loss": 0.7013, + "step": 375 + }, + { + "epoch": 0.8102345415778252, + "grad_norm": 0.526995837688446, + "learning_rate": 5.401563610518835e-05, + "loss": 0.6396, + "step": 380 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 0.4485580623149872, + "learning_rate": 5.472636815920398e-05, + "loss": 0.6307, + "step": 385 + }, + { + "epoch": 0.8315565031982942, + "grad_norm": 0.45416155457496643, + "learning_rate": 5.543710021321962e-05, + "loss": 0.6361, + "step": 390 + }, + { + "epoch": 0.8422174840085288, + "grad_norm": 0.4746207296848297, + "learning_rate": 5.6147832267235255e-05, + "loss": 0.641, + "step": 395 + }, + { + "epoch": 0.8528784648187633, + "grad_norm": 0.4466172456741333, + "learning_rate": 5.68585643212509e-05, + "loss": 0.643, + "step": 400 + }, + { + "epoch": 0.8635394456289979, + "grad_norm": 0.46807265281677246, + "learning_rate": 5.756929637526652e-05, + "loss": 0.6258, + "step": 405 + }, + { + "epoch": 0.8742004264392325, + "grad_norm": 0.46169164776802063, + "learning_rate": 5.828002842928216e-05, + "loss": 0.6212, + "step": 410 + }, + { + "epoch": 0.8848614072494669, + "grad_norm": 0.47564077377319336, + "learning_rate": 5.8990760483297804e-05, + "loss": 0.6369, + "step": 415 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.4582447409629822, + "learning_rate": 5.970149253731343e-05, + "loss": 0.6086, + "step": 420 + }, + { + "epoch": 0.906183368869936, + "grad_norm": 0.5161389708518982, + "learning_rate": 6.041222459132907e-05, + "loss": 0.6529, + "step": 425 + }, + { + "epoch": 0.9168443496801706, + "grad_norm": 0.47045719623565674, + "learning_rate": 6.11229566453447e-05, + "loss": 0.6119, + "step": 430 + }, + { + "epoch": 0.9275053304904051, + "grad_norm": 0.5950572490692139, + "learning_rate": 6.183368869936035e-05, + "loss": 0.6259, + "step": 435 + }, + { + "epoch": 0.9381663113006397, + "grad_norm": 0.5470284223556519, + "learning_rate": 6.254442075337598e-05, + "loss": 0.6282, + "step": 440 + }, + { + "epoch": 0.9488272921108742, + "grad_norm": 0.5164011716842651, + "learning_rate": 6.325515280739162e-05, + "loss": 0.6399, + "step": 445 + }, + { + "epoch": 0.9594882729211087, + "grad_norm": 0.4264001250267029, + "learning_rate": 6.396588486140725e-05, + "loss": 0.6405, + "step": 450 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 0.4878412187099457, + "learning_rate": 6.46766169154229e-05, + "loss": 0.6548, + "step": 455 + }, + { + "epoch": 0.9808102345415778, + "grad_norm": 0.47677186131477356, + "learning_rate": 6.538734896943853e-05, + "loss": 0.6506, + "step": 460 + }, + { + "epoch": 0.9914712153518124, + "grad_norm": 0.4687974452972412, + "learning_rate": 6.609808102345416e-05, + "loss": 0.6267, + "step": 465 + }, + { + "epoch": 1.0, + "eval_loss": 0.6078405976295471, + "eval_runtime": 377.5565, + "eval_samples_per_second": 1.091, + "eval_steps_per_second": 1.091, + "step": 469 + }, + { + "epoch": 1.0021321961620469, + "grad_norm": 0.4401796758174896, + "learning_rate": 6.68088130774698e-05, + "loss": 0.5968, + "step": 470 + }, + { + "epoch": 1.0127931769722816, + "grad_norm": 0.8371634483337402, + "learning_rate": 6.751954513148543e-05, + "loss": 0.5923, + "step": 475 + }, + { + "epoch": 1.023454157782516, + "grad_norm": 0.49846479296684265, + "learning_rate": 6.823027718550106e-05, + "loss": 0.6835, + "step": 480 + }, + { + "epoch": 1.0341151385927505, + "grad_norm": 0.5845323801040649, + "learning_rate": 6.89410092395167e-05, + "loss": 0.5906, + "step": 485 + }, + { + "epoch": 1.044776119402985, + "grad_norm": 0.5639384984970093, + "learning_rate": 6.965174129353235e-05, + "loss": 0.5881, + "step": 490 + }, + { + "epoch": 1.0554371002132197, + "grad_norm": 0.5082396268844604, + "learning_rate": 7.036247334754798e-05, + "loss": 0.6224, + "step": 495 + }, + { + "epoch": 1.0660980810234542, + "grad_norm": 0.5611528158187866, + "learning_rate": 7.107320540156361e-05, + "loss": 0.5643, + "step": 500 + }, + { + "epoch": 1.0767590618336886, + "grad_norm": 0.7102047801017761, + "learning_rate": 7.178393745557926e-05, + "loss": 0.5814, + "step": 505 + }, + { + "epoch": 1.0874200426439233, + "grad_norm": 0.46847936511039734, + "learning_rate": 7.249466950959489e-05, + "loss": 0.5642, + "step": 510 + }, + { + "epoch": 1.0980810234541578, + "grad_norm": 0.47119173407554626, + "learning_rate": 7.320540156361052e-05, + "loss": 0.5674, + "step": 515 + }, + { + "epoch": 1.1087420042643923, + "grad_norm": 1.0005890130996704, + "learning_rate": 7.391613361762616e-05, + "loss": 0.5949, + "step": 520 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.7785916924476624, + "learning_rate": 7.46268656716418e-05, + "loss": 0.5643, + "step": 525 + }, + { + "epoch": 1.1300639658848615, + "grad_norm": 0.6393773555755615, + "learning_rate": 7.533759772565742e-05, + "loss": 0.5886, + "step": 530 + }, + { + "epoch": 1.140724946695096, + "grad_norm": 0.6369247436523438, + "learning_rate": 7.604832977967307e-05, + "loss": 0.58, + "step": 535 + }, + { + "epoch": 1.1513859275053304, + "grad_norm": 0.48704272508621216, + "learning_rate": 7.675906183368871e-05, + "loss": 0.6125, + "step": 540 + }, + { + "epoch": 1.1620469083155651, + "grad_norm": 0.5542349219322205, + "learning_rate": 7.746979388770433e-05, + "loss": 0.5688, + "step": 545 + }, + { + "epoch": 1.1727078891257996, + "grad_norm": 0.4632197618484497, + "learning_rate": 7.818052594171997e-05, + "loss": 0.5727, + "step": 550 + }, + { + "epoch": 1.183368869936034, + "grad_norm": 0.40735307335853577, + "learning_rate": 7.889125799573562e-05, + "loss": 0.5704, + "step": 555 + }, + { + "epoch": 1.1940298507462686, + "grad_norm": 0.45803022384643555, + "learning_rate": 7.960199004975125e-05, + "loss": 0.6041, + "step": 560 + }, + { + "epoch": 1.2046908315565032, + "grad_norm": 0.47275593876838684, + "learning_rate": 8.031272210376688e-05, + "loss": 0.5476, + "step": 565 + }, + { + "epoch": 1.2153518123667377, + "grad_norm": 0.4402256906032562, + "learning_rate": 8.102345415778252e-05, + "loss": 0.6101, + "step": 570 + }, + { + "epoch": 1.2260127931769722, + "grad_norm": 0.4577506184577942, + "learning_rate": 8.173418621179815e-05, + "loss": 0.6021, + "step": 575 + }, + { + "epoch": 1.236673773987207, + "grad_norm": 0.4695811867713928, + "learning_rate": 8.24449182658138e-05, + "loss": 0.5843, + "step": 580 + }, + { + "epoch": 1.2473347547974414, + "grad_norm": 0.5012730360031128, + "learning_rate": 8.315565031982943e-05, + "loss": 0.5963, + "step": 585 + }, + { + "epoch": 1.2579957356076759, + "grad_norm": 0.4261506199836731, + "learning_rate": 8.386638237384507e-05, + "loss": 0.5608, + "step": 590 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.48886266350746155, + "learning_rate": 8.45771144278607e-05, + "loss": 0.5768, + "step": 595 + }, + { + "epoch": 1.279317697228145, + "grad_norm": 0.4756333529949188, + "learning_rate": 8.528784648187633e-05, + "loss": 0.5581, + "step": 600 + }, + { + "epoch": 1.2899786780383795, + "grad_norm": 0.4242517054080963, + "learning_rate": 8.599857853589198e-05, + "loss": 0.5436, + "step": 605 + }, + { + "epoch": 1.3006396588486142, + "grad_norm": 0.44590556621551514, + "learning_rate": 8.67093105899076e-05, + "loss": 0.5821, + "step": 610 + }, + { + "epoch": 1.3113006396588487, + "grad_norm": 0.4373833239078522, + "learning_rate": 8.742004264392325e-05, + "loss": 0.544, + "step": 615 + }, + { + "epoch": 1.3219616204690832, + "grad_norm": 0.42627617716789246, + "learning_rate": 8.813077469793888e-05, + "loss": 0.5417, + "step": 620 + }, + { + "epoch": 1.3326226012793176, + "grad_norm": 0.516544759273529, + "learning_rate": 8.884150675195451e-05, + "loss": 0.573, + "step": 625 + }, + { + "epoch": 1.3432835820895521, + "grad_norm": 0.4419044256210327, + "learning_rate": 8.955223880597016e-05, + "loss": 0.5523, + "step": 630 + }, + { + "epoch": 1.3539445628997868, + "grad_norm": 0.4533810019493103, + "learning_rate": 9.026297085998579e-05, + "loss": 0.5372, + "step": 635 + }, + { + "epoch": 1.3646055437100213, + "grad_norm": 0.4296520948410034, + "learning_rate": 9.097370291400143e-05, + "loss": 0.5742, + "step": 640 + }, + { + "epoch": 1.375266524520256, + "grad_norm": 0.4285917282104492, + "learning_rate": 9.168443496801706e-05, + "loss": 0.5577, + "step": 645 + }, + { + "epoch": 1.3859275053304905, + "grad_norm": 0.41438210010528564, + "learning_rate": 9.23951670220327e-05, + "loss": 0.5659, + "step": 650 + }, + { + "epoch": 1.396588486140725, + "grad_norm": 0.43702948093414307, + "learning_rate": 9.310589907604834e-05, + "loss": 0.5425, + "step": 655 + }, + { + "epoch": 1.4072494669509594, + "grad_norm": 0.520577609539032, + "learning_rate": 9.381663113006397e-05, + "loss": 0.5624, + "step": 660 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.451948881149292, + "learning_rate": 9.452736318407961e-05, + "loss": 0.5598, + "step": 665 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.4748338460922241, + "learning_rate": 9.523809523809524e-05, + "loss": 0.6579, + "step": 670 + }, + { + "epoch": 1.439232409381663, + "grad_norm": 0.4351726472377777, + "learning_rate": 9.594882729211087e-05, + "loss": 0.541, + "step": 675 + }, + { + "epoch": 1.4498933901918978, + "grad_norm": 0.4322686493396759, + "learning_rate": 9.665955934612652e-05, + "loss": 0.5941, + "step": 680 + }, + { + "epoch": 1.4605543710021323, + "grad_norm": 0.43369051814079285, + "learning_rate": 9.737029140014216e-05, + "loss": 0.5862, + "step": 685 + }, + { + "epoch": 1.4712153518123667, + "grad_norm": 0.5028679966926575, + "learning_rate": 9.808102345415778e-05, + "loss": 0.5444, + "step": 690 + }, + { + "epoch": 1.4818763326226012, + "grad_norm": 0.4060784578323364, + "learning_rate": 9.879175550817342e-05, + "loss": 0.549, + "step": 695 + }, + { + "epoch": 1.4925373134328357, + "grad_norm": 0.4283974766731262, + "learning_rate": 9.950248756218906e-05, + "loss": 0.5474, + "step": 700 + }, + { + "epoch": 1.5031982942430704, + "grad_norm": 0.3743923008441925, + "learning_rate": 0.0001002132196162047, + "loss": 0.5394, + "step": 705 + }, + { + "epoch": 1.5138592750533049, + "grad_norm": 0.44469088315963745, + "learning_rate": 0.00010092395167022033, + "loss": 0.5563, + "step": 710 + }, + { + "epoch": 1.5245202558635396, + "grad_norm": 0.43209415674209595, + "learning_rate": 0.00010163468372423597, + "loss": 0.5803, + "step": 715 + }, + { + "epoch": 1.535181236673774, + "grad_norm": 0.4075677990913391, + "learning_rate": 0.00010234541577825161, + "loss": 0.5369, + "step": 720 + }, + { + "epoch": 1.5458422174840085, + "grad_norm": 0.4084095358848572, + "learning_rate": 0.00010305614783226724, + "loss": 0.5687, + "step": 725 + }, + { + "epoch": 1.556503198294243, + "grad_norm": 0.4053703248500824, + "learning_rate": 0.00010376687988628289, + "loss": 0.5301, + "step": 730 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.46452564001083374, + "learning_rate": 0.0001044776119402985, + "loss": 0.5823, + "step": 735 + }, + { + "epoch": 1.5778251599147122, + "grad_norm": 0.4020977020263672, + "learning_rate": 0.00010518834399431414, + "loss": 0.5463, + "step": 740 + }, + { + "epoch": 1.5884861407249466, + "grad_norm": 0.3993551433086395, + "learning_rate": 0.00010589907604832978, + "loss": 0.5551, + "step": 745 + }, + { + "epoch": 1.5991471215351813, + "grad_norm": 0.4211786985397339, + "learning_rate": 0.00010660980810234542, + "loss": 0.5607, + "step": 750 + }, + { + "epoch": 1.6098081023454158, + "grad_norm": 0.4241097867488861, + "learning_rate": 0.00010732054015636106, + "loss": 0.5402, + "step": 755 + }, + { + "epoch": 1.6204690831556503, + "grad_norm": 0.3934391736984253, + "learning_rate": 0.0001080312722103767, + "loss": 0.5618, + "step": 760 + }, + { + "epoch": 1.6311300639658848, + "grad_norm": 0.37157073616981506, + "learning_rate": 0.00010874200426439234, + "loss": 0.5232, + "step": 765 + }, + { + "epoch": 1.6417910447761193, + "grad_norm": 0.4151962399482727, + "learning_rate": 0.00010945273631840796, + "loss": 0.563, + "step": 770 + }, + { + "epoch": 1.652452025586354, + "grad_norm": 0.42233771085739136, + "learning_rate": 0.00011016346837242359, + "loss": 0.5667, + "step": 775 + }, + { + "epoch": 1.6631130063965884, + "grad_norm": 0.3891717493534088, + "learning_rate": 0.00011087420042643924, + "loss": 0.582, + "step": 780 + }, + { + "epoch": 1.6737739872068231, + "grad_norm": 0.4017283618450165, + "learning_rate": 0.00011158493248045488, + "loss": 0.5386, + "step": 785 + }, + { + "epoch": 1.6844349680170576, + "grad_norm": 0.4058316648006439, + "learning_rate": 0.00011229566453447051, + "loss": 0.5357, + "step": 790 + }, + { + "epoch": 1.695095948827292, + "grad_norm": 0.38968625664711, + "learning_rate": 0.00011300639658848615, + "loss": 0.527, + "step": 795 + }, + { + "epoch": 1.7057569296375266, + "grad_norm": 0.4108840525150299, + "learning_rate": 0.0001137171286425018, + "loss": 0.5347, + "step": 800 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.37222376465797424, + "learning_rate": 0.00011442786069651741, + "loss": 0.524, + "step": 805 + }, + { + "epoch": 1.7270788912579957, + "grad_norm": 0.4046708047389984, + "learning_rate": 0.00011513859275053305, + "loss": 0.5096, + "step": 810 + }, + { + "epoch": 1.7377398720682304, + "grad_norm": 0.37089455127716064, + "learning_rate": 0.00011584932480454869, + "loss": 0.5316, + "step": 815 + }, + { + "epoch": 1.748400852878465, + "grad_norm": 0.3895399272441864, + "learning_rate": 0.00011656005685856432, + "loss": 0.5274, + "step": 820 + }, + { + "epoch": 1.7590618336886994, + "grad_norm": 0.3956606984138489, + "learning_rate": 0.00011727078891257996, + "loss": 0.5395, + "step": 825 + }, + { + "epoch": 1.7697228144989339, + "grad_norm": 0.4023361802101135, + "learning_rate": 0.00011798152096659561, + "loss": 0.53, + "step": 830 + }, + { + "epoch": 1.7803837953091683, + "grad_norm": 0.39323511719703674, + "learning_rate": 0.00011869225302061124, + "loss": 0.5341, + "step": 835 + }, + { + "epoch": 1.7910447761194028, + "grad_norm": 0.3870689868927002, + "learning_rate": 0.00011940298507462686, + "loss": 0.5268, + "step": 840 + }, + { + "epoch": 1.8017057569296375, + "grad_norm": 0.39864471554756165, + "learning_rate": 0.0001201137171286425, + "loss": 0.5754, + "step": 845 + }, + { + "epoch": 1.8123667377398722, + "grad_norm": 0.413980633020401, + "learning_rate": 0.00012082444918265814, + "loss": 0.5274, + "step": 850 + }, + { + "epoch": 1.8230277185501067, + "grad_norm": 0.3994651138782501, + "learning_rate": 0.00012153518123667377, + "loss": 0.5313, + "step": 855 + }, + { + "epoch": 1.8336886993603412, + "grad_norm": 0.4106079041957855, + "learning_rate": 0.0001222459132906894, + "loss": 0.5293, + "step": 860 + }, + { + "epoch": 1.8443496801705757, + "grad_norm": 0.38014471530914307, + "learning_rate": 0.00012295664534470505, + "loss": 0.5313, + "step": 865 + }, + { + "epoch": 1.8550106609808101, + "grad_norm": 0.3477731943130493, + "learning_rate": 0.0001236673773987207, + "loss": 0.5499, + "step": 870 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.3609556555747986, + "learning_rate": 0.0001243781094527363, + "loss": 0.5195, + "step": 875 + }, + { + "epoch": 1.8763326226012793, + "grad_norm": 0.3532927334308624, + "learning_rate": 0.00012508884150675195, + "loss": 0.5233, + "step": 880 + }, + { + "epoch": 1.886993603411514, + "grad_norm": 0.3663487434387207, + "learning_rate": 0.0001257995735607676, + "loss": 0.5129, + "step": 885 + }, + { + "epoch": 1.8976545842217485, + "grad_norm": 0.35837364196777344, + "learning_rate": 0.00012651030561478324, + "loss": 0.5106, + "step": 890 + }, + { + "epoch": 1.908315565031983, + "grad_norm": 0.38498660922050476, + "learning_rate": 0.00012722103766879886, + "loss": 0.5216, + "step": 895 + }, + { + "epoch": 1.9189765458422174, + "grad_norm": 0.3501322269439697, + "learning_rate": 0.0001279317697228145, + "loss": 0.54, + "step": 900 + }, + { + "epoch": 1.929637526652452, + "grad_norm": 0.34796684980392456, + "learning_rate": 0.00012864250177683015, + "loss": 0.5165, + "step": 905 + }, + { + "epoch": 1.9402985074626866, + "grad_norm": 0.46670106053352356, + "learning_rate": 0.0001293532338308458, + "loss": 0.5437, + "step": 910 + }, + { + "epoch": 1.950959488272921, + "grad_norm": 0.3535880148410797, + "learning_rate": 0.0001300639658848614, + "loss": 0.5561, + "step": 915 + }, + { + "epoch": 1.9616204690831558, + "grad_norm": 0.3591325283050537, + "learning_rate": 0.00013077469793887705, + "loss": 0.5193, + "step": 920 + }, + { + "epoch": 1.9722814498933903, + "grad_norm": 0.4969016909599304, + "learning_rate": 0.00013148542999289267, + "loss": 0.526, + "step": 925 + }, + { + "epoch": 1.9829424307036247, + "grad_norm": 0.3567504584789276, + "learning_rate": 0.00013219616204690831, + "loss": 0.5063, + "step": 930 + }, + { + "epoch": 1.9936034115138592, + "grad_norm": 0.3647787272930145, + "learning_rate": 0.00013290689410092396, + "loss": 0.5094, + "step": 935 + }, + { + "epoch": 2.0, + "eval_loss": 0.5335173606872559, + "eval_runtime": 377.8765, + "eval_samples_per_second": 1.09, + "eval_steps_per_second": 1.09, + "step": 938 + }, + { + "epoch": 2.0042643923240937, + "grad_norm": 0.34923797845840454, + "learning_rate": 0.0001336176261549396, + "loss": 0.5126, + "step": 940 + }, + { + "epoch": 2.014925373134328, + "grad_norm": 0.4439273476600647, + "learning_rate": 0.00013432835820895525, + "loss": 0.5349, + "step": 945 + }, + { + "epoch": 2.025586353944563, + "grad_norm": 0.35956764221191406, + "learning_rate": 0.00013503909026297086, + "loss": 0.493, + "step": 950 + }, + { + "epoch": 2.0362473347547976, + "grad_norm": 0.3677864074707031, + "learning_rate": 0.0001357498223169865, + "loss": 0.523, + "step": 955 + }, + { + "epoch": 2.046908315565032, + "grad_norm": 0.3486590087413788, + "learning_rate": 0.00013646055437100213, + "loss": 0.5322, + "step": 960 + }, + { + "epoch": 2.0575692963752665, + "grad_norm": 0.3785991072654724, + "learning_rate": 0.00013717128642501777, + "loss": 0.4903, + "step": 965 + }, + { + "epoch": 2.068230277185501, + "grad_norm": 0.3422692120075226, + "learning_rate": 0.0001378820184790334, + "loss": 0.5356, + "step": 970 + }, + { + "epoch": 2.0788912579957355, + "grad_norm": 0.41184964776039124, + "learning_rate": 0.00013859275053304906, + "loss": 0.4969, + "step": 975 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.34267646074295044, + "learning_rate": 0.0001393034825870647, + "loss": 0.5113, + "step": 980 + }, + { + "epoch": 2.100213219616205, + "grad_norm": 0.38112279772758484, + "learning_rate": 0.00014001421464108032, + "loss": 0.4793, + "step": 985 + }, + { + "epoch": 2.1108742004264394, + "grad_norm": 0.33497291803359985, + "learning_rate": 0.00014072494669509596, + "loss": 0.5185, + "step": 990 + }, + { + "epoch": 2.121535181236674, + "grad_norm": 0.37100210785865784, + "learning_rate": 0.00014143567874911158, + "loss": 0.5024, + "step": 995 + }, + { + "epoch": 2.1321961620469083, + "grad_norm": 0.3079771101474762, + "learning_rate": 0.00014214641080312722, + "loss": 0.5066, + "step": 1000 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.3615591824054718, + "learning_rate": 0.00014285714285714287, + "loss": 0.5157, + "step": 1005 + }, + { + "epoch": 2.1535181236673773, + "grad_norm": 0.3394719958305359, + "learning_rate": 0.0001435678749111585, + "loss": 0.4906, + "step": 1010 + }, + { + "epoch": 2.1641791044776117, + "grad_norm": 0.4234224557876587, + "learning_rate": 0.00014427860696517416, + "loss": 0.5015, + "step": 1015 + }, + { + "epoch": 2.1748400852878467, + "grad_norm": 0.3535841107368469, + "learning_rate": 0.00014498933901918977, + "loss": 0.5107, + "step": 1020 + }, + { + "epoch": 2.185501066098081, + "grad_norm": 0.41673514246940613, + "learning_rate": 0.0001457000710732054, + "loss": 0.505, + "step": 1025 + }, + { + "epoch": 2.1961620469083156, + "grad_norm": 0.3521960973739624, + "learning_rate": 0.00014641080312722103, + "loss": 0.5339, + "step": 1030 + }, + { + "epoch": 2.20682302771855, + "grad_norm": 0.341727614402771, + "learning_rate": 0.00014712153518123668, + "loss": 0.4897, + "step": 1035 + }, + { + "epoch": 2.2174840085287846, + "grad_norm": 0.32079800963401794, + "learning_rate": 0.00014783226723525232, + "loss": 0.5049, + "step": 1040 + }, + { + "epoch": 2.228144989339019, + "grad_norm": 0.34027552604675293, + "learning_rate": 0.00014854299928926797, + "loss": 0.4993, + "step": 1045 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.34183624386787415, + "learning_rate": 0.0001492537313432836, + "loss": 0.51, + "step": 1050 + }, + { + "epoch": 2.2494669509594885, + "grad_norm": 0.31983354687690735, + "learning_rate": 0.00014996446339729923, + "loss": 0.5084, + "step": 1055 + }, + { + "epoch": 2.260127931769723, + "grad_norm": 0.3631596565246582, + "learning_rate": 0.00015067519545131484, + "loss": 0.4986, + "step": 1060 + }, + { + "epoch": 2.2707889125799574, + "grad_norm": 0.32126784324645996, + "learning_rate": 0.0001513859275053305, + "loss": 0.4832, + "step": 1065 + }, + { + "epoch": 2.281449893390192, + "grad_norm": 0.3390761911869049, + "learning_rate": 0.00015209665955934613, + "loss": 0.4972, + "step": 1070 + }, + { + "epoch": 2.2921108742004264, + "grad_norm": 0.3330533504486084, + "learning_rate": 0.00015280739161336178, + "loss": 0.4772, + "step": 1075 + }, + { + "epoch": 2.302771855010661, + "grad_norm": 0.3619351089000702, + "learning_rate": 0.00015351812366737742, + "loss": 0.5141, + "step": 1080 + }, + { + "epoch": 2.3134328358208958, + "grad_norm": 0.3252182602882385, + "learning_rate": 0.00015422885572139304, + "loss": 0.5056, + "step": 1085 + }, + { + "epoch": 2.3240938166311302, + "grad_norm": 0.3745068311691284, + "learning_rate": 0.00015493958777540866, + "loss": 0.5395, + "step": 1090 + }, + { + "epoch": 2.3347547974413647, + "grad_norm": 0.38191962242126465, + "learning_rate": 0.0001556503198294243, + "loss": 0.4865, + "step": 1095 + }, + { + "epoch": 2.345415778251599, + "grad_norm": 0.32218611240386963, + "learning_rate": 0.00015636105188343994, + "loss": 0.4955, + "step": 1100 + }, + { + "epoch": 2.3560767590618337, + "grad_norm": 0.32240140438079834, + "learning_rate": 0.0001570717839374556, + "loss": 0.4972, + "step": 1105 + }, + { + "epoch": 2.366737739872068, + "grad_norm": 0.37284377217292786, + "learning_rate": 0.00015778251599147123, + "loss": 0.4874, + "step": 1110 + }, + { + "epoch": 2.3773987206823026, + "grad_norm": 0.350769579410553, + "learning_rate": 0.00015849324804548688, + "loss": 0.4931, + "step": 1115 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.3309812843799591, + "learning_rate": 0.0001592039800995025, + "loss": 0.5103, + "step": 1120 + }, + { + "epoch": 2.398720682302772, + "grad_norm": 0.3497963547706604, + "learning_rate": 0.0001599147121535181, + "loss": 0.4864, + "step": 1125 + }, + { + "epoch": 2.4093816631130065, + "grad_norm": 0.3567025661468506, + "learning_rate": 0.00016062544420753375, + "loss": 0.5461, + "step": 1130 + }, + { + "epoch": 2.420042643923241, + "grad_norm": 0.5213941931724548, + "learning_rate": 0.0001613361762615494, + "loss": 0.5138, + "step": 1135 + }, + { + "epoch": 2.4307036247334755, + "grad_norm": 0.32027000188827515, + "learning_rate": 0.00016204690831556504, + "loss": 0.5078, + "step": 1140 + }, + { + "epoch": 2.44136460554371, + "grad_norm": 0.37092500925064087, + "learning_rate": 0.00016275764036958069, + "loss": 0.4903, + "step": 1145 + }, + { + "epoch": 2.4520255863539444, + "grad_norm": 0.35545867681503296, + "learning_rate": 0.0001634683724235963, + "loss": 0.5131, + "step": 1150 + }, + { + "epoch": 2.4626865671641793, + "grad_norm": 0.3277740776538849, + "learning_rate": 0.00016417910447761195, + "loss": 0.4814, + "step": 1155 + }, + { + "epoch": 2.473347547974414, + "grad_norm": 0.3226880133152008, + "learning_rate": 0.0001648898365316276, + "loss": 0.4944, + "step": 1160 + }, + { + "epoch": 2.4840085287846483, + "grad_norm": 0.3283137381076813, + "learning_rate": 0.0001656005685856432, + "loss": 0.5058, + "step": 1165 + }, + { + "epoch": 2.4946695095948828, + "grad_norm": 0.38707828521728516, + "learning_rate": 0.00016631130063965885, + "loss": 0.5108, + "step": 1170 + }, + { + "epoch": 2.5053304904051172, + "grad_norm": 0.3053881824016571, + "learning_rate": 0.0001670220326936745, + "loss": 0.4751, + "step": 1175 + }, + { + "epoch": 2.5159914712153517, + "grad_norm": 0.29871490597724915, + "learning_rate": 0.00016773276474769014, + "loss": 0.4848, + "step": 1180 + }, + { + "epoch": 2.526652452025586, + "grad_norm": 0.3135201930999756, + "learning_rate": 0.00016844349680170576, + "loss": 0.4852, + "step": 1185 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.31287622451782227, + "learning_rate": 0.0001691542288557214, + "loss": 0.4804, + "step": 1190 + }, + { + "epoch": 2.5479744136460556, + "grad_norm": 0.30184197425842285, + "learning_rate": 0.00016986496090973705, + "loss": 0.5006, + "step": 1195 + }, + { + "epoch": 2.55863539445629, + "grad_norm": 0.29948562383651733, + "learning_rate": 0.00017057569296375266, + "loss": 0.4934, + "step": 1200 + }, + { + "epoch": 2.5692963752665245, + "grad_norm": 0.29258280992507935, + "learning_rate": 0.0001712864250177683, + "loss": 0.4887, + "step": 1205 + }, + { + "epoch": 2.579957356076759, + "grad_norm": 0.29767826199531555, + "learning_rate": 0.00017199715707178395, + "loss": 0.4958, + "step": 1210 + }, + { + "epoch": 2.5906183368869935, + "grad_norm": 0.29649823904037476, + "learning_rate": 0.0001727078891257996, + "loss": 0.51, + "step": 1215 + }, + { + "epoch": 2.6012793176972284, + "grad_norm": 0.30332130193710327, + "learning_rate": 0.0001734186211798152, + "loss": 0.4954, + "step": 1220 + }, + { + "epoch": 2.611940298507463, + "grad_norm": 0.3551209270954132, + "learning_rate": 0.00017412935323383086, + "loss": 0.5088, + "step": 1225 + }, + { + "epoch": 2.6226012793176974, + "grad_norm": 0.33677777647972107, + "learning_rate": 0.0001748400852878465, + "loss": 0.5248, + "step": 1230 + }, + { + "epoch": 2.633262260127932, + "grad_norm": 0.29216548800468445, + "learning_rate": 0.00017555081734186212, + "loss": 0.4954, + "step": 1235 + }, + { + "epoch": 2.6439232409381663, + "grad_norm": 0.32732442021369934, + "learning_rate": 0.00017626154939587776, + "loss": 0.5048, + "step": 1240 + }, + { + "epoch": 2.654584221748401, + "grad_norm": 0.29788029193878174, + "learning_rate": 0.0001769722814498934, + "loss": 0.5056, + "step": 1245 + }, + { + "epoch": 2.6652452025586353, + "grad_norm": 0.3407440185546875, + "learning_rate": 0.00017768301350390902, + "loss": 0.5385, + "step": 1250 + }, + { + "epoch": 2.6759061833688698, + "grad_norm": 0.2790848910808563, + "learning_rate": 0.00017839374555792467, + "loss": 0.5014, + "step": 1255 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.30173078179359436, + "learning_rate": 0.0001791044776119403, + "loss": 0.5118, + "step": 1260 + }, + { + "epoch": 2.697228144989339, + "grad_norm": 0.2736753821372986, + "learning_rate": 0.00017981520966595596, + "loss": 0.5018, + "step": 1265 + }, + { + "epoch": 2.7078891257995736, + "grad_norm": 0.2970294952392578, + "learning_rate": 0.00018052594171997157, + "loss": 0.4966, + "step": 1270 + }, + { + "epoch": 2.718550106609808, + "grad_norm": 0.2721494138240814, + "learning_rate": 0.00018123667377398722, + "loss": 0.4746, + "step": 1275 + }, + { + "epoch": 2.7292110874200426, + "grad_norm": 0.29144713282585144, + "learning_rate": 0.00018194740582800286, + "loss": 0.4739, + "step": 1280 + }, + { + "epoch": 2.739872068230277, + "grad_norm": 0.3217550814151764, + "learning_rate": 0.00018265813788201848, + "loss": 0.4868, + "step": 1285 + }, + { + "epoch": 2.750533049040512, + "grad_norm": 0.25847169756889343, + "learning_rate": 0.00018336886993603412, + "loss": 0.4664, + "step": 1290 + }, + { + "epoch": 2.7611940298507465, + "grad_norm": 0.2917424142360687, + "learning_rate": 0.00018407960199004977, + "loss": 0.4659, + "step": 1295 + }, + { + "epoch": 2.771855010660981, + "grad_norm": 0.29807865619659424, + "learning_rate": 0.0001847903340440654, + "loss": 0.4838, + "step": 1300 + }, + { + "epoch": 2.7825159914712154, + "grad_norm": 0.28630420565605164, + "learning_rate": 0.00018550106609808103, + "loss": 0.4658, + "step": 1305 + }, + { + "epoch": 2.79317697228145, + "grad_norm": 0.2946392595767975, + "learning_rate": 0.00018621179815209667, + "loss": 0.5037, + "step": 1310 + }, + { + "epoch": 2.8038379530916844, + "grad_norm": 0.38894176483154297, + "learning_rate": 0.0001869225302061123, + "loss": 0.525, + "step": 1315 + }, + { + "epoch": 2.814498933901919, + "grad_norm": 0.28793737292289734, + "learning_rate": 0.00018763326226012793, + "loss": 0.5238, + "step": 1320 + }, + { + "epoch": 2.8251599147121533, + "grad_norm": 0.3103950023651123, + "learning_rate": 0.00018834399431414358, + "loss": 0.4932, + "step": 1325 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.2969878017902374, + "learning_rate": 0.00018905472636815922, + "loss": 0.4807, + "step": 1330 + }, + { + "epoch": 2.8464818763326227, + "grad_norm": 0.2937600612640381, + "learning_rate": 0.00018976545842217486, + "loss": 0.4862, + "step": 1335 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.2892070710659027, + "learning_rate": 0.00019047619047619048, + "loss": 0.526, + "step": 1340 + }, + { + "epoch": 2.8678038379530917, + "grad_norm": 0.28446847200393677, + "learning_rate": 0.00019118692253020613, + "loss": 0.4846, + "step": 1345 + }, + { + "epoch": 2.878464818763326, + "grad_norm": 0.2877322733402252, + "learning_rate": 0.00019189765458422174, + "loss": 0.4759, + "step": 1350 + }, + { + "epoch": 2.8891257995735606, + "grad_norm": 0.2837788462638855, + "learning_rate": 0.0001926083866382374, + "loss": 0.4894, + "step": 1355 + }, + { + "epoch": 2.8997867803837956, + "grad_norm": 0.3020360469818115, + "learning_rate": 0.00019331911869225303, + "loss": 0.4936, + "step": 1360 + }, + { + "epoch": 2.91044776119403, + "grad_norm": 0.28344911336898804, + "learning_rate": 0.00019402985074626867, + "loss": 0.4881, + "step": 1365 + }, + { + "epoch": 2.9211087420042645, + "grad_norm": 0.2753186821937561, + "learning_rate": 0.00019474058280028432, + "loss": 0.4826, + "step": 1370 + }, + { + "epoch": 2.931769722814499, + "grad_norm": 0.2922317385673523, + "learning_rate": 0.00019545131485429994, + "loss": 0.4759, + "step": 1375 + }, + { + "epoch": 2.9424307036247335, + "grad_norm": 0.3179524540901184, + "learning_rate": 0.00019616204690831555, + "loss": 0.4883, + "step": 1380 + }, + { + "epoch": 2.953091684434968, + "grad_norm": 0.2944222688674927, + "learning_rate": 0.0001968727789623312, + "loss": 0.4804, + "step": 1385 + }, + { + "epoch": 2.9637526652452024, + "grad_norm": 0.2687291204929352, + "learning_rate": 0.00019758351101634684, + "loss": 0.4891, + "step": 1390 + }, + { + "epoch": 2.974413646055437, + "grad_norm": 0.25935596227645874, + "learning_rate": 0.00019829424307036249, + "loss": 0.4902, + "step": 1395 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.30086612701416016, + "learning_rate": 0.00019900497512437813, + "loss": 0.4942, + "step": 1400 + }, + { + "epoch": 2.9957356076759063, + "grad_norm": 0.2930257022380829, + "learning_rate": 0.00019971570717839377, + "loss": 0.513, + "step": 1405 + }, + { + "epoch": 3.0, + "eval_loss": 0.5142309069633484, + "eval_runtime": 377.5199, + "eval_samples_per_second": 1.091, + "eval_steps_per_second": 1.091, + "step": 1407 + }, + { + "epoch": 3.0063965884861408, + "grad_norm": 0.28208208084106445, + "learning_rate": 0.00019999997230259856, + "loss": 0.467, + "step": 1410 + }, + { + "epoch": 3.0170575692963753, + "grad_norm": 0.290385365486145, + "learning_rate": 0.00019999980304075655, + "loss": 0.44, + "step": 1415 + }, + { + "epoch": 3.0277185501066097, + "grad_norm": 0.27436771988868713, + "learning_rate": 0.00019999947990477788, + "loss": 0.4876, + "step": 1420 + }, + { + "epoch": 3.038379530916844, + "grad_norm": 0.2883841395378113, + "learning_rate": 0.00019999900289515975, + "loss": 0.4509, + "step": 1425 + }, + { + "epoch": 3.0490405117270787, + "grad_norm": 0.279857337474823, + "learning_rate": 0.00019999837201263622, + "loss": 0.4431, + "step": 1430 + }, + { + "epoch": 3.0597014925373136, + "grad_norm": 0.31563228368759155, + "learning_rate": 0.000199997587258178, + "loss": 0.4789, + "step": 1435 + }, + { + "epoch": 3.070362473347548, + "grad_norm": 0.302135169506073, + "learning_rate": 0.00019999664863299267, + "loss": 0.4685, + "step": 1440 + }, + { + "epoch": 3.0810234541577826, + "grad_norm": 0.2668147385120392, + "learning_rate": 0.00019999555613852449, + "loss": 0.4361, + "step": 1445 + }, + { + "epoch": 3.091684434968017, + "grad_norm": 0.28701773285865784, + "learning_rate": 0.00019999430977645457, + "loss": 0.4417, + "step": 1450 + }, + { + "epoch": 3.1023454157782515, + "grad_norm": 0.2622893154621124, + "learning_rate": 0.00019999290954870073, + "loss": 0.4524, + "step": 1455 + }, + { + "epoch": 3.113006396588486, + "grad_norm": 0.2776693105697632, + "learning_rate": 0.00019999135545741755, + "loss": 0.463, + "step": 1460 + }, + { + "epoch": 3.1236673773987205, + "grad_norm": 0.26774516701698303, + "learning_rate": 0.00019998964750499637, + "loss": 0.4732, + "step": 1465 + }, + { + "epoch": 3.1343283582089554, + "grad_norm": 0.26958051323890686, + "learning_rate": 0.0001999877856940653, + "loss": 0.4517, + "step": 1470 + }, + { + "epoch": 3.14498933901919, + "grad_norm": 0.2604299485683441, + "learning_rate": 0.00019998577002748924, + "loss": 0.4476, + "step": 1475 + }, + { + "epoch": 3.1556503198294243, + "grad_norm": 1.0628249645233154, + "learning_rate": 0.00019998360050836974, + "loss": 0.4542, + "step": 1480 + }, + { + "epoch": 3.166311300639659, + "grad_norm": 0.26215219497680664, + "learning_rate": 0.0001999812771400451, + "loss": 0.4608, + "step": 1485 + }, + { + "epoch": 3.1769722814498933, + "grad_norm": 0.2745310068130493, + "learning_rate": 0.00019997879992609047, + "loss": 0.4532, + "step": 1490 + }, + { + "epoch": 3.1876332622601278, + "grad_norm": 0.3186289072036743, + "learning_rate": 0.0001999761688703176, + "loss": 0.4854, + "step": 1495 + }, + { + "epoch": 3.1982942430703627, + "grad_norm": 0.2697219252586365, + "learning_rate": 0.000199973383976775, + "loss": 0.4759, + "step": 1500 + }, + { + "epoch": 3.208955223880597, + "grad_norm": 0.32173436880111694, + "learning_rate": 0.00019997044524974799, + "loss": 0.47, + "step": 1505 + }, + { + "epoch": 3.2196162046908317, + "grad_norm": 0.28551211953163147, + "learning_rate": 0.00019996735269375843, + "loss": 0.4537, + "step": 1510 + }, + { + "epoch": 3.230277185501066, + "grad_norm": 0.2618770897388458, + "learning_rate": 0.00019996410631356498, + "loss": 0.455, + "step": 1515 + }, + { + "epoch": 3.2409381663113006, + "grad_norm": 0.3189204931259155, + "learning_rate": 0.00019996070611416305, + "loss": 0.4869, + "step": 1520 + }, + { + "epoch": 3.251599147121535, + "grad_norm": 0.2555652856826782, + "learning_rate": 0.00019995715210078464, + "loss": 0.4582, + "step": 1525 + }, + { + "epoch": 3.2622601279317696, + "grad_norm": 0.45129457116127014, + "learning_rate": 0.00019995344427889845, + "loss": 0.5055, + "step": 1530 + }, + { + "epoch": 3.272921108742004, + "grad_norm": 0.2851119637489319, + "learning_rate": 0.0001999495826542099, + "loss": 0.4495, + "step": 1535 + }, + { + "epoch": 3.283582089552239, + "grad_norm": 0.4647831916809082, + "learning_rate": 0.00019994556723266103, + "loss": 0.4442, + "step": 1540 + }, + { + "epoch": 3.2942430703624734, + "grad_norm": 0.28650426864624023, + "learning_rate": 0.00019994139802043055, + "loss": 0.488, + "step": 1545 + }, + { + "epoch": 3.304904051172708, + "grad_norm": 0.2804616093635559, + "learning_rate": 0.0001999370750239338, + "loss": 0.4538, + "step": 1550 + }, + { + "epoch": 3.3155650319829424, + "grad_norm": 0.2778622508049011, + "learning_rate": 0.0001999325982498228, + "loss": 0.4468, + "step": 1555 + }, + { + "epoch": 3.326226012793177, + "grad_norm": 0.26577600836753845, + "learning_rate": 0.00019992796770498616, + "loss": 0.4805, + "step": 1560 + }, + { + "epoch": 3.3368869936034113, + "grad_norm": 0.25679486989974976, + "learning_rate": 0.00019992318339654905, + "loss": 0.4648, + "step": 1565 + }, + { + "epoch": 3.3475479744136463, + "grad_norm": 0.263921856880188, + "learning_rate": 0.00019991824533187335, + "loss": 0.4638, + "step": 1570 + }, + { + "epoch": 3.3582089552238807, + "grad_norm": 0.25445836782455444, + "learning_rate": 0.00019991315351855748, + "loss": 0.4395, + "step": 1575 + }, + { + "epoch": 3.368869936034115, + "grad_norm": 0.2354278415441513, + "learning_rate": 0.0001999079079644364, + "loss": 0.487, + "step": 1580 + }, + { + "epoch": 3.3795309168443497, + "grad_norm": 0.2561117708683014, + "learning_rate": 0.0001999025086775817, + "loss": 0.4562, + "step": 1585 + }, + { + "epoch": 3.390191897654584, + "grad_norm": 0.3330647349357605, + "learning_rate": 0.00019989695566630152, + "loss": 0.4445, + "step": 1590 + }, + { + "epoch": 3.4008528784648187, + "grad_norm": 0.26299235224723816, + "learning_rate": 0.00019989124893914046, + "loss": 0.4488, + "step": 1595 + }, + { + "epoch": 3.411513859275053, + "grad_norm": 0.299434095621109, + "learning_rate": 0.0001998853885048798, + "loss": 0.4563, + "step": 1600 + }, + { + "epoch": 3.4221748400852876, + "grad_norm": 0.23711760342121124, + "learning_rate": 0.0001998793743725372, + "loss": 0.4473, + "step": 1605 + }, + { + "epoch": 3.4328358208955225, + "grad_norm": 0.24863874912261963, + "learning_rate": 0.00019987320655136693, + "loss": 0.4574, + "step": 1610 + }, + { + "epoch": 3.443496801705757, + "grad_norm": 0.24471955001354218, + "learning_rate": 0.00019986688505085957, + "loss": 0.4665, + "step": 1615 + }, + { + "epoch": 3.4541577825159915, + "grad_norm": 0.2540249526500702, + "learning_rate": 0.00019986040988074238, + "loss": 0.4689, + "step": 1620 + }, + { + "epoch": 3.464818763326226, + "grad_norm": 0.2666712701320648, + "learning_rate": 0.00019985378105097902, + "loss": 0.4477, + "step": 1625 + }, + { + "epoch": 3.4754797441364604, + "grad_norm": 0.27709081768989563, + "learning_rate": 0.0001998469985717695, + "loss": 0.4403, + "step": 1630 + }, + { + "epoch": 3.486140724946695, + "grad_norm": 0.27587834000587463, + "learning_rate": 0.00019984006245355037, + "loss": 0.4565, + "step": 1635 + }, + { + "epoch": 3.49680170575693, + "grad_norm": 0.22859402000904083, + "learning_rate": 0.00019983297270699448, + "loss": 0.4514, + "step": 1640 + }, + { + "epoch": 3.5074626865671643, + "grad_norm": 0.3489368259906769, + "learning_rate": 0.00019982572934301122, + "loss": 0.4727, + "step": 1645 + }, + { + "epoch": 3.518123667377399, + "grad_norm": 0.2632017135620117, + "learning_rate": 0.00019981833237274618, + "loss": 0.4415, + "step": 1650 + }, + { + "epoch": 3.5287846481876333, + "grad_norm": 0.27099326252937317, + "learning_rate": 0.00019981078180758154, + "loss": 0.4489, + "step": 1655 + }, + { + "epoch": 3.5394456289978677, + "grad_norm": 0.2415977120399475, + "learning_rate": 0.00019980307765913552, + "loss": 0.4764, + "step": 1660 + }, + { + "epoch": 3.550106609808102, + "grad_norm": 0.23986046016216278, + "learning_rate": 0.000199795219939263, + "loss": 0.4458, + "step": 1665 + }, + { + "epoch": 3.5607675906183367, + "grad_norm": 0.28455114364624023, + "learning_rate": 0.00019978720866005488, + "loss": 0.4846, + "step": 1670 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 0.2913159430027008, + "learning_rate": 0.0001997790438338385, + "loss": 0.4547, + "step": 1675 + }, + { + "epoch": 3.582089552238806, + "grad_norm": 0.25150275230407715, + "learning_rate": 0.0001997707254731775, + "loss": 0.4599, + "step": 1680 + }, + { + "epoch": 3.5927505330490406, + "grad_norm": 0.23482745885849, + "learning_rate": 0.00019976225359087164, + "loss": 0.4315, + "step": 1685 + }, + { + "epoch": 3.603411513859275, + "grad_norm": 0.23308737576007843, + "learning_rate": 0.00019975362819995703, + "loss": 0.449, + "step": 1690 + }, + { + "epoch": 3.6140724946695095, + "grad_norm": 0.2528814375400543, + "learning_rate": 0.00019974484931370592, + "loss": 0.4392, + "step": 1695 + }, + { + "epoch": 3.624733475479744, + "grad_norm": 0.25079530477523804, + "learning_rate": 0.00019973591694562678, + "loss": 0.4536, + "step": 1700 + }, + { + "epoch": 3.635394456289979, + "grad_norm": 0.2929099202156067, + "learning_rate": 0.00019972683110946421, + "loss": 0.4426, + "step": 1705 + }, + { + "epoch": 3.6460554371002134, + "grad_norm": 0.23356157541275024, + "learning_rate": 0.00019971759181919903, + "loss": 0.4602, + "step": 1710 + }, + { + "epoch": 3.656716417910448, + "grad_norm": 0.3128319978713989, + "learning_rate": 0.00019970819908904814, + "loss": 0.4629, + "step": 1715 + }, + { + "epoch": 3.6673773987206824, + "grad_norm": 0.23164990544319153, + "learning_rate": 0.00019969865293346454, + "loss": 0.4662, + "step": 1720 + }, + { + "epoch": 3.678038379530917, + "grad_norm": 0.43762582540512085, + "learning_rate": 0.00019968895336713733, + "loss": 0.4685, + "step": 1725 + }, + { + "epoch": 3.6886993603411513, + "grad_norm": 0.34830760955810547, + "learning_rate": 0.00019967910040499164, + "loss": 0.4504, + "step": 1730 + }, + { + "epoch": 3.699360341151386, + "grad_norm": 0.2538786828517914, + "learning_rate": 0.00019966909406218868, + "loss": 0.4967, + "step": 1735 + }, + { + "epoch": 3.7100213219616203, + "grad_norm": 0.23103195428848267, + "learning_rate": 0.0001996589343541257, + "loss": 0.4556, + "step": 1740 + }, + { + "epoch": 3.7206823027718547, + "grad_norm": 0.2618430554866791, + "learning_rate": 0.0001996486212964358, + "loss": 0.4453, + "step": 1745 + }, + { + "epoch": 3.7313432835820897, + "grad_norm": 0.23393474519252777, + "learning_rate": 0.00019963815490498817, + "loss": 0.4613, + "step": 1750 + }, + { + "epoch": 3.742004264392324, + "grad_norm": 0.2798391282558441, + "learning_rate": 0.00019962753519588798, + "loss": 0.4668, + "step": 1755 + }, + { + "epoch": 3.7526652452025586, + "grad_norm": 0.24927425384521484, + "learning_rate": 0.00019961676218547617, + "loss": 0.4424, + "step": 1760 + }, + { + "epoch": 3.763326226012793, + "grad_norm": 0.2537556290626526, + "learning_rate": 0.00019960583589032966, + "loss": 0.4413, + "step": 1765 + }, + { + "epoch": 3.7739872068230276, + "grad_norm": 0.2401181310415268, + "learning_rate": 0.00019959475632726128, + "loss": 0.4365, + "step": 1770 + }, + { + "epoch": 3.7846481876332625, + "grad_norm": 0.22927629947662354, + "learning_rate": 0.00019958352351331956, + "loss": 0.4455, + "step": 1775 + }, + { + "epoch": 3.795309168443497, + "grad_norm": 0.21933622658252716, + "learning_rate": 0.00019957213746578902, + "loss": 0.4661, + "step": 1780 + }, + { + "epoch": 3.8059701492537314, + "grad_norm": 0.28884589672088623, + "learning_rate": 0.00019956059820218982, + "loss": 0.4931, + "step": 1785 + }, + { + "epoch": 3.816631130063966, + "grad_norm": 0.2619436979293823, + "learning_rate": 0.00019954890574027797, + "loss": 0.4446, + "step": 1790 + }, + { + "epoch": 3.8272921108742004, + "grad_norm": 0.22175399959087372, + "learning_rate": 0.00019953706009804512, + "loss": 0.4482, + "step": 1795 + }, + { + "epoch": 3.837953091684435, + "grad_norm": 0.23060369491577148, + "learning_rate": 0.00019952506129371873, + "loss": 0.451, + "step": 1800 + }, + { + "epoch": 3.8486140724946694, + "grad_norm": 0.2313724309206009, + "learning_rate": 0.0001995129093457619, + "loss": 0.4496, + "step": 1805 + }, + { + "epoch": 3.859275053304904, + "grad_norm": 0.23518264293670654, + "learning_rate": 0.00019950060427287335, + "loss": 0.4581, + "step": 1810 + }, + { + "epoch": 3.8699360341151388, + "grad_norm": 0.22398614883422852, + "learning_rate": 0.00019948814609398746, + "loss": 0.4382, + "step": 1815 + }, + { + "epoch": 3.8805970149253732, + "grad_norm": 0.21408702433109283, + "learning_rate": 0.00019947553482827418, + "loss": 0.4517, + "step": 1820 + }, + { + "epoch": 3.8912579957356077, + "grad_norm": 0.26791512966156006, + "learning_rate": 0.00019946277049513904, + "loss": 0.4671, + "step": 1825 + }, + { + "epoch": 3.901918976545842, + "grad_norm": 0.37972912192344666, + "learning_rate": 0.00019944985311422304, + "loss": 0.4665, + "step": 1830 + }, + { + "epoch": 3.9125799573560767, + "grad_norm": 0.2744680941104889, + "learning_rate": 0.00019943678270540276, + "loss": 0.4627, + "step": 1835 + }, + { + "epoch": 3.923240938166311, + "grad_norm": 0.3253777325153351, + "learning_rate": 0.00019942355928879023, + "loss": 0.468, + "step": 1840 + }, + { + "epoch": 3.933901918976546, + "grad_norm": 0.32431936264038086, + "learning_rate": 0.00019941018288473285, + "loss": 0.4497, + "step": 1845 + }, + { + "epoch": 3.9445628997867805, + "grad_norm": 0.2247323989868164, + "learning_rate": 0.00019939665351381355, + "loss": 0.4444, + "step": 1850 + }, + { + "epoch": 3.955223880597015, + "grad_norm": 0.35610342025756836, + "learning_rate": 0.00019938297119685054, + "loss": 0.4563, + "step": 1855 + }, + { + "epoch": 3.9658848614072495, + "grad_norm": 0.2513818144798279, + "learning_rate": 0.00019936913595489743, + "loss": 0.442, + "step": 1860 + }, + { + "epoch": 3.976545842217484, + "grad_norm": 0.3135777711868286, + "learning_rate": 0.0001993551478092431, + "loss": 0.4377, + "step": 1865 + }, + { + "epoch": 3.9872068230277184, + "grad_norm": 0.24127310514450073, + "learning_rate": 0.0001993410067814118, + "loss": 0.4478, + "step": 1870 + }, + { + "epoch": 3.997867803837953, + "grad_norm": 0.23388491570949554, + "learning_rate": 0.00019932671289316282, + "loss": 0.4306, + "step": 1875 + }, + { + "epoch": 4.0, + "eval_loss": 0.5043795108795166, + "eval_runtime": 377.5601, + "eval_samples_per_second": 1.091, + "eval_steps_per_second": 1.091, + "step": 1876 + }, + { + "epoch": 4.008528784648187, + "grad_norm": 0.3674967288970947, + "learning_rate": 0.0001993122661664909, + "loss": 0.4371, + "step": 1880 + }, + { + "epoch": 4.019189765458422, + "grad_norm": 0.2773316204547882, + "learning_rate": 0.00019929766662362585, + "loss": 0.4043, + "step": 1885 + }, + { + "epoch": 4.029850746268656, + "grad_norm": 0.2394101619720459, + "learning_rate": 0.00019928291428703262, + "loss": 0.413, + "step": 1890 + }, + { + "epoch": 4.040511727078891, + "grad_norm": 0.23238113522529602, + "learning_rate": 0.00019926800917941128, + "loss": 0.4021, + "step": 1895 + }, + { + "epoch": 4.051172707889126, + "grad_norm": 0.22244401276111603, + "learning_rate": 0.000199252951323697, + "loss": 0.4101, + "step": 1900 + }, + { + "epoch": 4.061833688699361, + "grad_norm": 0.24964463710784912, + "learning_rate": 0.00019923774074306, + "loss": 0.4123, + "step": 1905 + }, + { + "epoch": 4.072494669509595, + "grad_norm": 0.23066940903663635, + "learning_rate": 0.00019922237746090537, + "loss": 0.4267, + "step": 1910 + }, + { + "epoch": 4.08315565031983, + "grad_norm": 0.23452460765838623, + "learning_rate": 0.00019920686150087336, + "loss": 0.4223, + "step": 1915 + }, + { + "epoch": 4.093816631130064, + "grad_norm": 0.3032955527305603, + "learning_rate": 0.00019919119288683908, + "loss": 0.432, + "step": 1920 + }, + { + "epoch": 4.104477611940299, + "grad_norm": 0.3310707211494446, + "learning_rate": 0.00019917537164291244, + "loss": 0.42, + "step": 1925 + }, + { + "epoch": 4.115138592750533, + "grad_norm": 0.24135416746139526, + "learning_rate": 0.00019915939779343838, + "loss": 0.4289, + "step": 1930 + }, + { + "epoch": 4.1257995735607675, + "grad_norm": 0.23443254828453064, + "learning_rate": 0.00019914327136299651, + "loss": 0.4216, + "step": 1935 + }, + { + "epoch": 4.136460554371002, + "grad_norm": 0.3196619749069214, + "learning_rate": 0.0001991269923764013, + "loss": 0.4387, + "step": 1940 + }, + { + "epoch": 4.1471215351812365, + "grad_norm": 0.2881762981414795, + "learning_rate": 0.00019911056085870197, + "loss": 0.4176, + "step": 1945 + }, + { + "epoch": 4.157782515991471, + "grad_norm": 0.25249961018562317, + "learning_rate": 0.00019909397683518242, + "loss": 0.4221, + "step": 1950 + }, + { + "epoch": 4.1684434968017055, + "grad_norm": 0.22756356000900269, + "learning_rate": 0.00019907724033136118, + "loss": 0.413, + "step": 1955 + }, + { + "epoch": 4.17910447761194, + "grad_norm": 0.24332334101200104, + "learning_rate": 0.0001990603513729915, + "loss": 0.4218, + "step": 1960 + }, + { + "epoch": 4.189765458422174, + "grad_norm": 0.23593220114707947, + "learning_rate": 0.00019904330998606116, + "loss": 0.4114, + "step": 1965 + }, + { + "epoch": 4.20042643923241, + "grad_norm": 0.266313374042511, + "learning_rate": 0.00019902611619679252, + "loss": 0.4309, + "step": 1970 + }, + { + "epoch": 4.211087420042644, + "grad_norm": 0.3359983563423157, + "learning_rate": 0.00019900877003164235, + "loss": 0.4339, + "step": 1975 + }, + { + "epoch": 4.221748400852879, + "grad_norm": 0.22711415588855743, + "learning_rate": 0.00019899127151730206, + "loss": 0.4165, + "step": 1980 + }, + { + "epoch": 4.232409381663113, + "grad_norm": 0.2225334793329239, + "learning_rate": 0.00019897362068069732, + "loss": 0.4094, + "step": 1985 + }, + { + "epoch": 4.243070362473348, + "grad_norm": 0.2701500356197357, + "learning_rate": 0.0001989558175489883, + "loss": 0.4239, + "step": 1990 + }, + { + "epoch": 4.253731343283582, + "grad_norm": 0.2480495721101761, + "learning_rate": 0.00019893786214956945, + "loss": 0.4137, + "step": 1995 + }, + { + "epoch": 4.264392324093817, + "grad_norm": 0.22299885749816895, + "learning_rate": 0.00019891975451006953, + "loss": 0.4273, + "step": 2000 + }, + { + "epoch": 4.275053304904051, + "grad_norm": 0.2259630262851715, + "learning_rate": 0.0001989014946583516, + "loss": 0.4223, + "step": 2005 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.3351574242115021, + "learning_rate": 0.00019888308262251285, + "loss": 0.4483, + "step": 2010 + }, + { + "epoch": 4.29637526652452, + "grad_norm": 0.21363438665866852, + "learning_rate": 0.0001988645184308848, + "loss": 0.4138, + "step": 2015 + }, + { + "epoch": 4.3070362473347545, + "grad_norm": 0.2409023493528366, + "learning_rate": 0.00019884580211203287, + "loss": 0.4166, + "step": 2020 + }, + { + "epoch": 4.317697228144989, + "grad_norm": 0.24684803187847137, + "learning_rate": 0.00019882693369475675, + "loss": 0.4089, + "step": 2025 + }, + { + "epoch": 4.3283582089552235, + "grad_norm": 0.24175861477851868, + "learning_rate": 0.0001988079132080901, + "loss": 0.4169, + "step": 2030 + }, + { + "epoch": 4.339019189765459, + "grad_norm": 0.3582640290260315, + "learning_rate": 0.00019878874068130062, + "loss": 0.4207, + "step": 2035 + }, + { + "epoch": 4.349680170575693, + "grad_norm": 0.23563334345817566, + "learning_rate": 0.00019876941614388992, + "loss": 0.4056, + "step": 2040 + }, + { + "epoch": 4.360341151385928, + "grad_norm": 0.24959246814250946, + "learning_rate": 0.0001987499396255935, + "loss": 0.4152, + "step": 2045 + }, + { + "epoch": 4.371002132196162, + "grad_norm": 0.2378864586353302, + "learning_rate": 0.00019873031115638073, + "loss": 0.428, + "step": 2050 + }, + { + "epoch": 4.381663113006397, + "grad_norm": 0.25769662857055664, + "learning_rate": 0.00019871053076645488, + "loss": 0.4273, + "step": 2055 + }, + { + "epoch": 4.392324093816631, + "grad_norm": 0.2148350328207016, + "learning_rate": 0.0001986905984862528, + "loss": 0.4341, + "step": 2060 + }, + { + "epoch": 4.402985074626866, + "grad_norm": 0.22630667686462402, + "learning_rate": 0.0001986705143464453, + "loss": 0.43, + "step": 2065 + }, + { + "epoch": 4.4136460554371, + "grad_norm": 0.23718136548995972, + "learning_rate": 0.00019865027837793665, + "loss": 0.4193, + "step": 2070 + }, + { + "epoch": 4.424307036247335, + "grad_norm": 0.26240232586860657, + "learning_rate": 0.00019862989061186483, + "loss": 0.4327, + "step": 2075 + }, + { + "epoch": 4.434968017057569, + "grad_norm": 0.21503274142742157, + "learning_rate": 0.0001986093510796015, + "loss": 0.4208, + "step": 2080 + }, + { + "epoch": 4.445628997867804, + "grad_norm": 0.31747710704803467, + "learning_rate": 0.0001985886598127516, + "loss": 0.4348, + "step": 2085 + }, + { + "epoch": 4.456289978678038, + "grad_norm": 0.24618090689182281, + "learning_rate": 0.00019856781684315382, + "loss": 0.4247, + "step": 2090 + }, + { + "epoch": 4.466950959488273, + "grad_norm": 0.33112359046936035, + "learning_rate": 0.00019854682220288013, + "loss": 0.4175, + "step": 2095 + }, + { + "epoch": 4.477611940298507, + "grad_norm": 0.23943935334682465, + "learning_rate": 0.0001985256759242359, + "loss": 0.4271, + "step": 2100 + }, + { + "epoch": 4.4882729211087415, + "grad_norm": 0.24192848801612854, + "learning_rate": 0.00019850437803975988, + "loss": 0.4221, + "step": 2105 + }, + { + "epoch": 4.498933901918977, + "grad_norm": 0.22631579637527466, + "learning_rate": 0.00019848292858222401, + "loss": 0.4233, + "step": 2110 + }, + { + "epoch": 4.509594882729211, + "grad_norm": 0.23344965279102325, + "learning_rate": 0.00019846132758463356, + "loss": 0.4161, + "step": 2115 + }, + { + "epoch": 4.520255863539446, + "grad_norm": 0.22698044776916504, + "learning_rate": 0.000198439575080227, + "loss": 0.4112, + "step": 2120 + }, + { + "epoch": 4.53091684434968, + "grad_norm": 0.3037104308605194, + "learning_rate": 0.00019841767110247575, + "loss": 0.4362, + "step": 2125 + }, + { + "epoch": 4.541577825159915, + "grad_norm": 0.24173210561275482, + "learning_rate": 0.00019839561568508454, + "loss": 0.4223, + "step": 2130 + }, + { + "epoch": 4.552238805970149, + "grad_norm": 0.2352645844221115, + "learning_rate": 0.00019837340886199096, + "loss": 0.4274, + "step": 2135 + }, + { + "epoch": 4.562899786780384, + "grad_norm": 0.2779860496520996, + "learning_rate": 0.0001983510506673657, + "loss": 0.4316, + "step": 2140 + }, + { + "epoch": 4.573560767590618, + "grad_norm": 0.24002455174922943, + "learning_rate": 0.0001983285411356122, + "loss": 0.4159, + "step": 2145 + }, + { + "epoch": 4.584221748400853, + "grad_norm": 0.22028042376041412, + "learning_rate": 0.00019830588030136698, + "loss": 0.4296, + "step": 2150 + }, + { + "epoch": 4.594882729211087, + "grad_norm": 0.3180830776691437, + "learning_rate": 0.0001982830681994992, + "loss": 0.4339, + "step": 2155 + }, + { + "epoch": 4.605543710021322, + "grad_norm": 0.2228025496006012, + "learning_rate": 0.00019826010486511091, + "loss": 0.4149, + "step": 2160 + }, + { + "epoch": 4.616204690831556, + "grad_norm": 0.2128361463546753, + "learning_rate": 0.00019823699033353677, + "loss": 0.4126, + "step": 2165 + }, + { + "epoch": 4.6268656716417915, + "grad_norm": 0.2322179228067398, + "learning_rate": 0.00019821372464034416, + "loss": 0.4128, + "step": 2170 + }, + { + "epoch": 4.637526652452026, + "grad_norm": 0.30600860714912415, + "learning_rate": 0.00019819030782133304, + "loss": 0.414, + "step": 2175 + }, + { + "epoch": 4.6481876332622605, + "grad_norm": 0.22045232355594635, + "learning_rate": 0.00019816673991253586, + "loss": 0.409, + "step": 2180 + }, + { + "epoch": 4.658848614072495, + "grad_norm": 0.2302045375108719, + "learning_rate": 0.00019814302095021768, + "loss": 0.4199, + "step": 2185 + }, + { + "epoch": 4.669509594882729, + "grad_norm": 0.22577248513698578, + "learning_rate": 0.00019811915097087587, + "loss": 0.4058, + "step": 2190 + }, + { + "epoch": 4.680170575692964, + "grad_norm": 0.6790816187858582, + "learning_rate": 0.00019809513001124024, + "loss": 0.4356, + "step": 2195 + }, + { + "epoch": 4.690831556503198, + "grad_norm": 0.2510231137275696, + "learning_rate": 0.00019807095810827293, + "loss": 0.4062, + "step": 2200 + }, + { + "epoch": 4.701492537313433, + "grad_norm": 0.24071648716926575, + "learning_rate": 0.00019804663529916826, + "loss": 0.4282, + "step": 2205 + }, + { + "epoch": 4.712153518123667, + "grad_norm": 0.2886710464954376, + "learning_rate": 0.00019802216162135287, + "loss": 0.4254, + "step": 2210 + }, + { + "epoch": 4.722814498933902, + "grad_norm": 0.2941761910915375, + "learning_rate": 0.0001979975371124855, + "loss": 0.4343, + "step": 2215 + }, + { + "epoch": 4.733475479744136, + "grad_norm": 0.2591281533241272, + "learning_rate": 0.00019797276181045693, + "loss": 0.4165, + "step": 2220 + }, + { + "epoch": 4.744136460554371, + "grad_norm": 0.2245703637599945, + "learning_rate": 0.00019794783575339004, + "loss": 0.4112, + "step": 2225 + }, + { + "epoch": 4.754797441364605, + "grad_norm": 0.48405957221984863, + "learning_rate": 0.00019792275897963967, + "loss": 0.4279, + "step": 2230 + }, + { + "epoch": 4.76545842217484, + "grad_norm": 0.22091209888458252, + "learning_rate": 0.00019789753152779258, + "loss": 0.4371, + "step": 2235 + }, + { + "epoch": 4.776119402985074, + "grad_norm": 0.23672465980052948, + "learning_rate": 0.00019787215343666732, + "loss": 0.4166, + "step": 2240 + }, + { + "epoch": 4.786780383795309, + "grad_norm": 0.43999361991882324, + "learning_rate": 0.0001978466247453143, + "loss": 0.4167, + "step": 2245 + }, + { + "epoch": 4.797441364605544, + "grad_norm": 0.2732659578323364, + "learning_rate": 0.0001978209454930157, + "loss": 0.4326, + "step": 2250 + }, + { + "epoch": 4.8081023454157785, + "grad_norm": 0.27667996287345886, + "learning_rate": 0.00019779511571928527, + "loss": 0.4192, + "step": 2255 + }, + { + "epoch": 4.818763326226013, + "grad_norm": 0.24479329586029053, + "learning_rate": 0.00019776913546386843, + "loss": 0.4158, + "step": 2260 + }, + { + "epoch": 4.8294243070362475, + "grad_norm": 0.21344681084156036, + "learning_rate": 0.0001977430047667422, + "loss": 0.4112, + "step": 2265 + }, + { + "epoch": 4.840085287846482, + "grad_norm": 0.24819132685661316, + "learning_rate": 0.00019771672366811503, + "loss": 0.414, + "step": 2270 + }, + { + "epoch": 4.850746268656716, + "grad_norm": 0.2435145080089569, + "learning_rate": 0.00019769029220842677, + "loss": 0.4172, + "step": 2275 + }, + { + "epoch": 4.861407249466951, + "grad_norm": 0.21831800043582916, + "learning_rate": 0.0001976637104283487, + "loss": 0.4168, + "step": 2280 + }, + { + "epoch": 4.872068230277185, + "grad_norm": 0.3001014292240143, + "learning_rate": 0.00019763697836878343, + "loss": 0.4271, + "step": 2285 + }, + { + "epoch": 4.88272921108742, + "grad_norm": 0.3473288118839264, + "learning_rate": 0.00019761009607086472, + "loss": 0.4256, + "step": 2290 + }, + { + "epoch": 4.893390191897654, + "grad_norm": 0.2094939649105072, + "learning_rate": 0.00019758306357595755, + "loss": 0.4207, + "step": 2295 + }, + { + "epoch": 4.904051172707889, + "grad_norm": 0.224636048078537, + "learning_rate": 0.00019755588092565805, + "loss": 0.4214, + "step": 2300 + }, + { + "epoch": 4.914712153518123, + "grad_norm": 0.22260229289531708, + "learning_rate": 0.00019752854816179336, + "loss": 0.4226, + "step": 2305 + }, + { + "epoch": 4.925373134328359, + "grad_norm": 0.21004381775856018, + "learning_rate": 0.0001975010653264216, + "loss": 0.414, + "step": 2310 + }, + { + "epoch": 4.936034115138593, + "grad_norm": 0.2120514214038849, + "learning_rate": 0.00019747343246183185, + "loss": 0.4152, + "step": 2315 + }, + { + "epoch": 4.946695095948828, + "grad_norm": 0.2152203619480133, + "learning_rate": 0.00019744564961054402, + "loss": 0.4159, + "step": 2320 + }, + { + "epoch": 4.957356076759062, + "grad_norm": 0.22371242940425873, + "learning_rate": 0.0001974177168153088, + "loss": 0.4095, + "step": 2325 + }, + { + "epoch": 4.968017057569297, + "grad_norm": 0.21865862607955933, + "learning_rate": 0.00019738963411910766, + "loss": 0.4261, + "step": 2330 + }, + { + "epoch": 4.978678038379531, + "grad_norm": 0.3230665326118469, + "learning_rate": 0.0001973614015651527, + "loss": 0.4116, + "step": 2335 + }, + { + "epoch": 4.9893390191897655, + "grad_norm": 0.21557492017745972, + "learning_rate": 0.00019733301919688651, + "loss": 0.4161, + "step": 2340 + }, + { + "epoch": 5.0, + "grad_norm": 0.21153585612773895, + "learning_rate": 0.00019730448705798239, + "loss": 0.4128, + "step": 2345 + }, + { + "epoch": 5.0, + "eval_loss": 0.5016890168190002, + "eval_runtime": 377.5434, + "eval_samples_per_second": 1.091, + "eval_steps_per_second": 1.091, + "step": 2345 + }, + { + "epoch": 5.0106609808102345, + "grad_norm": 0.20196357369422913, + "learning_rate": 0.000197275805192344, + "loss": 0.3909, + "step": 2350 + }, + { + "epoch": 5.021321961620469, + "grad_norm": 0.2446993738412857, + "learning_rate": 0.00019724697364410535, + "loss": 0.3876, + "step": 2355 + }, + { + "epoch": 5.031982942430703, + "grad_norm": 0.22501204907894135, + "learning_rate": 0.00019721799245763088, + "loss": 0.3882, + "step": 2360 + }, + { + "epoch": 5.042643923240938, + "grad_norm": 0.23419953882694244, + "learning_rate": 0.0001971888616775152, + "loss": 0.3786, + "step": 2365 + }, + { + "epoch": 5.053304904051172, + "grad_norm": 0.23151536285877228, + "learning_rate": 0.00019715958134858315, + "loss": 0.3925, + "step": 2370 + }, + { + "epoch": 5.063965884861407, + "grad_norm": 0.23873166739940643, + "learning_rate": 0.00019713015151588966, + "loss": 0.3927, + "step": 2375 + }, + { + "epoch": 5.074626865671641, + "grad_norm": 0.23083342611789703, + "learning_rate": 0.00019710057222471967, + "loss": 0.3836, + "step": 2380 + }, + { + "epoch": 5.085287846481877, + "grad_norm": 0.22406326234340668, + "learning_rate": 0.00019707084352058827, + "loss": 0.389, + "step": 2385 + }, + { + "epoch": 5.095948827292111, + "grad_norm": 0.37570300698280334, + "learning_rate": 0.00019704096544924022, + "loss": 0.3999, + "step": 2390 + }, + { + "epoch": 5.106609808102346, + "grad_norm": 0.21594493091106415, + "learning_rate": 0.0001970109380566503, + "loss": 0.38, + "step": 2395 + }, + { + "epoch": 5.11727078891258, + "grad_norm": 0.2725168466567993, + "learning_rate": 0.00019698076138902298, + "loss": 0.3848, + "step": 2400 + }, + { + "epoch": 5.127931769722815, + "grad_norm": 0.2510855495929718, + "learning_rate": 0.00019695043549279243, + "loss": 0.3859, + "step": 2405 + }, + { + "epoch": 5.138592750533049, + "grad_norm": 0.23722735047340393, + "learning_rate": 0.00019691996041462244, + "loss": 0.3876, + "step": 2410 + }, + { + "epoch": 5.149253731343284, + "grad_norm": 0.35469353199005127, + "learning_rate": 0.00019688933620140637, + "loss": 0.3863, + "step": 2415 + }, + { + "epoch": 5.159914712153518, + "grad_norm": 0.23087090253829956, + "learning_rate": 0.0001968585629002671, + "loss": 0.3898, + "step": 2420 + }, + { + "epoch": 5.1705756929637525, + "grad_norm": 0.21194830536842346, + "learning_rate": 0.00019682764055855683, + "loss": 0.3832, + "step": 2425 + }, + { + "epoch": 5.181236673773987, + "grad_norm": 0.23261596262454987, + "learning_rate": 0.00019679656922385715, + "loss": 0.3895, + "step": 2430 + }, + { + "epoch": 5.1918976545842215, + "grad_norm": 0.24160555005073547, + "learning_rate": 0.0001967653489439789, + "loss": 0.391, + "step": 2435 + }, + { + "epoch": 5.202558635394456, + "grad_norm": 0.23709999024868011, + "learning_rate": 0.00019673397976696216, + "loss": 0.3904, + "step": 2440 + }, + { + "epoch": 5.21321961620469, + "grad_norm": 0.2529030740261078, + "learning_rate": 0.00019670246174107597, + "loss": 0.3853, + "step": 2445 + }, + { + "epoch": 5.223880597014926, + "grad_norm": 0.22068992257118225, + "learning_rate": 0.0001966707949148186, + "loss": 0.3791, + "step": 2450 + }, + { + "epoch": 5.23454157782516, + "grad_norm": 0.23219233751296997, + "learning_rate": 0.00019663897933691718, + "loss": 0.3904, + "step": 2455 + }, + { + "epoch": 5.245202558635395, + "grad_norm": 0.25079360604286194, + "learning_rate": 0.00019660701505632772, + "loss": 0.3995, + "step": 2460 + }, + { + "epoch": 5.255863539445629, + "grad_norm": 0.2510697841644287, + "learning_rate": 0.00019657490212223515, + "loss": 0.3861, + "step": 2465 + }, + { + "epoch": 5.266524520255864, + "grad_norm": 0.25218454003334045, + "learning_rate": 0.000196542640584053, + "loss": 0.3878, + "step": 2470 + }, + { + "epoch": 5.277185501066098, + "grad_norm": 0.21124300360679626, + "learning_rate": 0.00019651023049142356, + "loss": 0.3881, + "step": 2475 + }, + { + "epoch": 5.287846481876333, + "grad_norm": 0.23286496102809906, + "learning_rate": 0.0001964776718942177, + "loss": 0.3893, + "step": 2480 + }, + { + "epoch": 5.298507462686567, + "grad_norm": 0.2385607361793518, + "learning_rate": 0.00019644496484253474, + "loss": 0.381, + "step": 2485 + }, + { + "epoch": 5.309168443496802, + "grad_norm": 0.22742030024528503, + "learning_rate": 0.00019641210938670247, + "loss": 0.393, + "step": 2490 + }, + { + "epoch": 5.319829424307036, + "grad_norm": 0.22051115334033966, + "learning_rate": 0.00019637910557727706, + "loss": 0.3933, + "step": 2495 + }, + { + "epoch": 5.330490405117271, + "grad_norm": 0.23317855596542358, + "learning_rate": 0.00019634595346504293, + "loss": 0.3877, + "step": 2500 + }, + { + "epoch": 5.341151385927505, + "grad_norm": 0.23425228893756866, + "learning_rate": 0.00019631265310101272, + "loss": 0.4158, + "step": 2505 + }, + { + "epoch": 5.3518123667377395, + "grad_norm": 0.25701725482940674, + "learning_rate": 0.00019627920453642715, + "loss": 0.3835, + "step": 2510 + }, + { + "epoch": 5.362473347547974, + "grad_norm": 0.23093344271183014, + "learning_rate": 0.00019624560782275505, + "loss": 0.3846, + "step": 2515 + }, + { + "epoch": 5.373134328358209, + "grad_norm": 0.2600732147693634, + "learning_rate": 0.00019621186301169315, + "loss": 0.3917, + "step": 2520 + }, + { + "epoch": 5.383795309168444, + "grad_norm": 0.2647717595100403, + "learning_rate": 0.00019617797015516607, + "loss": 0.3938, + "step": 2525 + }, + { + "epoch": 5.394456289978678, + "grad_norm": 0.24304771423339844, + "learning_rate": 0.0001961439293053263, + "loss": 0.3925, + "step": 2530 + }, + { + "epoch": 5.405117270788913, + "grad_norm": 0.2271909862756729, + "learning_rate": 0.00019610974051455398, + "loss": 0.3878, + "step": 2535 + }, + { + "epoch": 5.415778251599147, + "grad_norm": 0.22085613012313843, + "learning_rate": 0.00019607540383545692, + "loss": 0.4025, + "step": 2540 + }, + { + "epoch": 5.426439232409382, + "grad_norm": 0.2830078899860382, + "learning_rate": 0.0001960409193208705, + "loss": 0.3935, + "step": 2545 + }, + { + "epoch": 5.437100213219616, + "grad_norm": 0.37187430262565613, + "learning_rate": 0.00019600628702385751, + "loss": 0.3896, + "step": 2550 + }, + { + "epoch": 5.447761194029851, + "grad_norm": 0.23631027340888977, + "learning_rate": 0.00019597150699770835, + "loss": 0.3911, + "step": 2555 + }, + { + "epoch": 5.458422174840085, + "grad_norm": 0.224113330245018, + "learning_rate": 0.00019593657929594044, + "loss": 0.3876, + "step": 2560 + }, + { + "epoch": 5.46908315565032, + "grad_norm": 0.29911914467811584, + "learning_rate": 0.00019590150397229866, + "loss": 0.3966, + "step": 2565 + }, + { + "epoch": 5.479744136460554, + "grad_norm": 0.22963348031044006, + "learning_rate": 0.000195866281080755, + "loss": 0.3931, + "step": 2570 + }, + { + "epoch": 5.490405117270789, + "grad_norm": 0.24756336212158203, + "learning_rate": 0.0001958309106755084, + "loss": 0.3827, + "step": 2575 + }, + { + "epoch": 5.501066098081023, + "grad_norm": 0.22494661808013916, + "learning_rate": 0.00019579539281098493, + "loss": 0.3884, + "step": 2580 + }, + { + "epoch": 5.5117270788912585, + "grad_norm": 0.2217581868171692, + "learning_rate": 0.00019575972754183748, + "loss": 0.3954, + "step": 2585 + }, + { + "epoch": 5.522388059701493, + "grad_norm": 0.22264057397842407, + "learning_rate": 0.0001957239149229458, + "loss": 0.3925, + "step": 2590 + }, + { + "epoch": 5.533049040511727, + "grad_norm": 0.24900676310062408, + "learning_rate": 0.00019568795500941635, + "loss": 0.3938, + "step": 2595 + }, + { + "epoch": 5.543710021321962, + "grad_norm": 0.22802846133708954, + "learning_rate": 0.00019565184785658223, + "loss": 0.3903, + "step": 2600 + }, + { + "epoch": 5.554371002132196, + "grad_norm": 0.2182716578245163, + "learning_rate": 0.00019561559352000317, + "loss": 0.3929, + "step": 2605 + }, + { + "epoch": 5.565031982942431, + "grad_norm": 0.23668424785137177, + "learning_rate": 0.00019557919205546526, + "loss": 0.3815, + "step": 2610 + }, + { + "epoch": 5.575692963752665, + "grad_norm": 0.22820915281772614, + "learning_rate": 0.0001955426435189811, + "loss": 0.3937, + "step": 2615 + }, + { + "epoch": 5.5863539445629, + "grad_norm": 0.21698084473609924, + "learning_rate": 0.00019550594796678952, + "loss": 0.3925, + "step": 2620 + }, + { + "epoch": 5.597014925373134, + "grad_norm": 0.22192837297916412, + "learning_rate": 0.00019546910545535558, + "loss": 0.3858, + "step": 2625 + }, + { + "epoch": 5.607675906183369, + "grad_norm": 0.22095522284507751, + "learning_rate": 0.00019543211604137052, + "loss": 0.3863, + "step": 2630 + }, + { + "epoch": 5.618336886993603, + "grad_norm": 0.22427357733249664, + "learning_rate": 0.0001953949797817516, + "loss": 0.3836, + "step": 2635 + }, + { + "epoch": 5.628997867803838, + "grad_norm": 0.23269647359848022, + "learning_rate": 0.00019535769673364203, + "loss": 0.3913, + "step": 2640 + }, + { + "epoch": 5.639658848614072, + "grad_norm": 0.21933898329734802, + "learning_rate": 0.00019532026695441083, + "loss": 0.3948, + "step": 2645 + }, + { + "epoch": 5.650319829424307, + "grad_norm": 0.227766752243042, + "learning_rate": 0.00019528269050165297, + "loss": 0.3861, + "step": 2650 + }, + { + "epoch": 5.660980810234541, + "grad_norm": 0.22262893617153168, + "learning_rate": 0.00019524496743318891, + "loss": 0.3921, + "step": 2655 + }, + { + "epoch": 5.6716417910447765, + "grad_norm": 0.28188657760620117, + "learning_rate": 0.00019520709780706486, + "loss": 0.3802, + "step": 2660 + }, + { + "epoch": 5.682302771855011, + "grad_norm": 0.22414395213127136, + "learning_rate": 0.00019516908168155245, + "loss": 0.3858, + "step": 2665 + }, + { + "epoch": 5.6929637526652455, + "grad_norm": 0.222300723195076, + "learning_rate": 0.00019513091911514885, + "loss": 0.3886, + "step": 2670 + }, + { + "epoch": 5.70362473347548, + "grad_norm": 0.2155119776725769, + "learning_rate": 0.00019509261016657643, + "loss": 0.3948, + "step": 2675 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.23029391467571259, + "learning_rate": 0.0001950541548947829, + "loss": 0.3915, + "step": 2680 + }, + { + "epoch": 5.724946695095949, + "grad_norm": 0.23538485169410706, + "learning_rate": 0.0001950155533589411, + "loss": 0.4005, + "step": 2685 + }, + { + "epoch": 5.735607675906183, + "grad_norm": 0.249455988407135, + "learning_rate": 0.00019497680561844893, + "loss": 0.386, + "step": 2690 + }, + { + "epoch": 5.746268656716418, + "grad_norm": 0.21184088289737701, + "learning_rate": 0.00019493791173292923, + "loss": 0.3931, + "step": 2695 + }, + { + "epoch": 5.756929637526652, + "grad_norm": 0.21931645274162292, + "learning_rate": 0.00019489887176222975, + "loss": 0.3981, + "step": 2700 + }, + { + "epoch": 5.767590618336887, + "grad_norm": 0.2259492725133896, + "learning_rate": 0.00019485968576642308, + "loss": 0.3848, + "step": 2705 + }, + { + "epoch": 5.778251599147121, + "grad_norm": 0.23413480818271637, + "learning_rate": 0.00019482035380580638, + "loss": 0.3875, + "step": 2710 + }, + { + "epoch": 5.788912579957356, + "grad_norm": 0.22880232334136963, + "learning_rate": 0.00019478087594090155, + "loss": 0.3838, + "step": 2715 + }, + { + "epoch": 5.79957356076759, + "grad_norm": 0.22865185141563416, + "learning_rate": 0.00019474125223245488, + "loss": 0.3855, + "step": 2720 + }, + { + "epoch": 5.810234541577826, + "grad_norm": 0.24277456104755402, + "learning_rate": 0.00019470148274143713, + "loss": 0.3938, + "step": 2725 + }, + { + "epoch": 5.82089552238806, + "grad_norm": 0.2189398854970932, + "learning_rate": 0.00019466156752904343, + "loss": 0.4008, + "step": 2730 + }, + { + "epoch": 5.8315565031982945, + "grad_norm": 0.21893605589866638, + "learning_rate": 0.00019462150665669302, + "loss": 0.3874, + "step": 2735 + }, + { + "epoch": 5.842217484008529, + "grad_norm": 0.23077057301998138, + "learning_rate": 0.00019458130018602945, + "loss": 0.3929, + "step": 2740 + }, + { + "epoch": 5.8528784648187635, + "grad_norm": 0.2599683701992035, + "learning_rate": 0.00019454094817892008, + "loss": 0.3892, + "step": 2745 + }, + { + "epoch": 5.863539445628998, + "grad_norm": 0.22645121812820435, + "learning_rate": 0.00019450045069745642, + "loss": 0.3913, + "step": 2750 + }, + { + "epoch": 5.8742004264392325, + "grad_norm": 0.22834275662899017, + "learning_rate": 0.00019445980780395368, + "loss": 0.3958, + "step": 2755 + }, + { + "epoch": 5.884861407249467, + "grad_norm": 0.24456727504730225, + "learning_rate": 0.00019441901956095093, + "loss": 0.3939, + "step": 2760 + }, + { + "epoch": 5.895522388059701, + "grad_norm": 0.21773149073123932, + "learning_rate": 0.00019437808603121087, + "loss": 0.3988, + "step": 2765 + }, + { + "epoch": 5.906183368869936, + "grad_norm": 0.21768063306808472, + "learning_rate": 0.00019433700727771965, + "loss": 0.3894, + "step": 2770 + }, + { + "epoch": 5.91684434968017, + "grad_norm": 0.2415178418159485, + "learning_rate": 0.00019429578336368708, + "loss": 0.3931, + "step": 2775 + }, + { + "epoch": 5.927505330490405, + "grad_norm": 0.21271879971027374, + "learning_rate": 0.00019425441435254616, + "loss": 0.3957, + "step": 2780 + }, + { + "epoch": 5.938166311300639, + "grad_norm": 0.21745960414409637, + "learning_rate": 0.00019421290030795322, + "loss": 0.3948, + "step": 2785 + }, + { + "epoch": 5.948827292110874, + "grad_norm": 0.22035416960716248, + "learning_rate": 0.0001941712412937878, + "loss": 0.3922, + "step": 2790 + }, + { + "epoch": 5.959488272921108, + "grad_norm": 0.20828816294670105, + "learning_rate": 0.00019412943737415246, + "loss": 0.3976, + "step": 2795 + }, + { + "epoch": 5.970149253731344, + "grad_norm": 0.19749729335308075, + "learning_rate": 0.00019408748861337273, + "loss": 0.3994, + "step": 2800 + }, + { + "epoch": 5.980810234541578, + "grad_norm": 0.20768584311008453, + "learning_rate": 0.00019404539507599707, + "loss": 0.3869, + "step": 2805 + }, + { + "epoch": 5.991471215351813, + "grad_norm": 0.2182578146457672, + "learning_rate": 0.00019400315682679663, + "loss": 0.3924, + "step": 2810 + }, + { + "epoch": 6.0, + "eval_loss": 0.5093127489089966, + "eval_runtime": 377.4947, + "eval_samples_per_second": 1.091, + "eval_steps_per_second": 1.091, + "step": 2814 + }, + { + "epoch": 6.002132196162047, + "grad_norm": 0.21125191450119019, + "learning_rate": 0.0001939607739307653, + "loss": 0.3874, + "step": 2815 + }, + { + "epoch": 6.0127931769722816, + "grad_norm": 0.31068113446235657, + "learning_rate": 0.0001939182464531195, + "loss": 0.3704, + "step": 2820 + }, + { + "epoch": 6.023454157782516, + "grad_norm": 0.23276059329509735, + "learning_rate": 0.00019387557445929823, + "loss": 0.353, + "step": 2825 + }, + { + "epoch": 6.0341151385927505, + "grad_norm": 0.25309714674949646, + "learning_rate": 0.00019383275801496268, + "loss": 0.3494, + "step": 2830 + }, + { + "epoch": 6.044776119402985, + "grad_norm": 0.2310338020324707, + "learning_rate": 0.00019378979718599645, + "loss": 0.3534, + "step": 2835 + }, + { + "epoch": 6.0554371002132195, + "grad_norm": 0.23623259365558624, + "learning_rate": 0.00019374669203850532, + "loss": 0.3513, + "step": 2840 + }, + { + "epoch": 6.066098081023454, + "grad_norm": 0.2299884408712387, + "learning_rate": 0.00019370344263881702, + "loss": 0.3534, + "step": 2845 + }, + { + "epoch": 6.076759061833688, + "grad_norm": 0.5613902807235718, + "learning_rate": 0.0001936600490534814, + "loss": 0.3615, + "step": 2850 + }, + { + "epoch": 6.087420042643923, + "grad_norm": 0.22940614819526672, + "learning_rate": 0.00019361651134927003, + "loss": 0.3522, + "step": 2855 + }, + { + "epoch": 6.098081023454157, + "grad_norm": 0.22831672430038452, + "learning_rate": 0.0001935728295931763, + "loss": 0.3523, + "step": 2860 + }, + { + "epoch": 6.108742004264393, + "grad_norm": 0.23445968329906464, + "learning_rate": 0.00019352900385241536, + "loss": 0.369, + "step": 2865 + }, + { + "epoch": 6.119402985074627, + "grad_norm": 0.2444639503955841, + "learning_rate": 0.0001934850341944237, + "loss": 0.355, + "step": 2870 + }, + { + "epoch": 6.130063965884862, + "grad_norm": 0.2400490790605545, + "learning_rate": 0.00019344092068685948, + "loss": 0.3625, + "step": 2875 + }, + { + "epoch": 6.140724946695096, + "grad_norm": 0.2361455261707306, + "learning_rate": 0.00019339666339760207, + "loss": 0.3649, + "step": 2880 + }, + { + "epoch": 6.151385927505331, + "grad_norm": 0.26625874638557434, + "learning_rate": 0.00019335226239475215, + "loss": 0.3572, + "step": 2885 + }, + { + "epoch": 6.162046908315565, + "grad_norm": 0.2775781750679016, + "learning_rate": 0.0001933077177466315, + "loss": 0.3446, + "step": 2890 + }, + { + "epoch": 6.1727078891258, + "grad_norm": 0.25833654403686523, + "learning_rate": 0.00019326302952178294, + "loss": 0.3624, + "step": 2895 + }, + { + "epoch": 6.183368869936034, + "grad_norm": 0.2403610199689865, + "learning_rate": 0.00019321819778897023, + "loss": 0.3578, + "step": 2900 + }, + { + "epoch": 6.1940298507462686, + "grad_norm": 0.2580753266811371, + "learning_rate": 0.00019317322261717794, + "loss": 0.3536, + "step": 2905 + }, + { + "epoch": 6.204690831556503, + "grad_norm": 0.2725096046924591, + "learning_rate": 0.0001931281040756114, + "loss": 0.3689, + "step": 2910 + }, + { + "epoch": 6.2153518123667375, + "grad_norm": 0.27059614658355713, + "learning_rate": 0.00019308284223369646, + "loss": 0.3656, + "step": 2915 + }, + { + "epoch": 6.226012793176972, + "grad_norm": 0.24707560241222382, + "learning_rate": 0.00019303743716107957, + "loss": 0.3682, + "step": 2920 + }, + { + "epoch": 6.2366737739872065, + "grad_norm": 0.23825524747371674, + "learning_rate": 0.00019299188892762752, + "loss": 0.3578, + "step": 2925 + }, + { + "epoch": 6.247334754797441, + "grad_norm": 0.24557247757911682, + "learning_rate": 0.00019294619760342737, + "loss": 0.3624, + "step": 2930 + }, + { + "epoch": 6.257995735607676, + "grad_norm": 0.2559678256511688, + "learning_rate": 0.00019290036325878644, + "loss": 0.3693, + "step": 2935 + }, + { + "epoch": 6.268656716417911, + "grad_norm": 0.25294074416160583, + "learning_rate": 0.00019285438596423204, + "loss": 0.3651, + "step": 2940 + }, + { + "epoch": 6.279317697228145, + "grad_norm": 0.24387520551681519, + "learning_rate": 0.00019280826579051147, + "loss": 0.3589, + "step": 2945 + }, + { + "epoch": 6.28997867803838, + "grad_norm": 0.22580432891845703, + "learning_rate": 0.0001927620028085919, + "loss": 0.3703, + "step": 2950 + }, + { + "epoch": 6.300639658848614, + "grad_norm": 0.24953973293304443, + "learning_rate": 0.00019271559708966023, + "loss": 0.3606, + "step": 2955 + }, + { + "epoch": 6.311300639658849, + "grad_norm": 0.2454618364572525, + "learning_rate": 0.000192669048705123, + "loss": 0.362, + "step": 2960 + }, + { + "epoch": 6.321961620469083, + "grad_norm": 0.2393016368150711, + "learning_rate": 0.00019262235772660627, + "loss": 0.3695, + "step": 2965 + }, + { + "epoch": 6.332622601279318, + "grad_norm": 0.2463667392730713, + "learning_rate": 0.00019257552422595554, + "loss": 0.3658, + "step": 2970 + }, + { + "epoch": 6.343283582089552, + "grad_norm": 0.24116967618465424, + "learning_rate": 0.00019252854827523557, + "loss": 0.3671, + "step": 2975 + }, + { + "epoch": 6.353944562899787, + "grad_norm": 0.2345789670944214, + "learning_rate": 0.00019248142994673036, + "loss": 0.368, + "step": 2980 + }, + { + "epoch": 6.364605543710021, + "grad_norm": 0.26505357027053833, + "learning_rate": 0.000192434169312943, + "loss": 0.3695, + "step": 2985 + }, + { + "epoch": 6.3752665245202556, + "grad_norm": 0.2504933476448059, + "learning_rate": 0.00019238676644659546, + "loss": 0.3605, + "step": 2990 + }, + { + "epoch": 6.38592750533049, + "grad_norm": 0.24889980256557465, + "learning_rate": 0.0001923392214206287, + "loss": 0.3684, + "step": 2995 + }, + { + "epoch": 6.396588486140725, + "grad_norm": 0.2319326400756836, + "learning_rate": 0.00019229153430820232, + "loss": 0.3621, + "step": 3000 + }, + { + "epoch": 6.40724946695096, + "grad_norm": 0.2329808622598648, + "learning_rate": 0.00019224370518269458, + "loss": 0.3649, + "step": 3005 + }, + { + "epoch": 6.417910447761194, + "grad_norm": 0.2565195560455322, + "learning_rate": 0.00019219573411770235, + "loss": 0.3602, + "step": 3010 + }, + { + "epoch": 6.428571428571429, + "grad_norm": 0.24189329147338867, + "learning_rate": 0.00019214762118704076, + "loss": 0.3691, + "step": 3015 + }, + { + "epoch": 6.439232409381663, + "grad_norm": 0.2512595057487488, + "learning_rate": 0.0001920993664647434, + "loss": 0.364, + "step": 3020 + }, + { + "epoch": 6.449893390191898, + "grad_norm": 0.24277447164058685, + "learning_rate": 0.00019205097002506185, + "loss": 0.3732, + "step": 3025 + }, + { + "epoch": 6.460554371002132, + "grad_norm": 0.242990642786026, + "learning_rate": 0.00019200243194246594, + "loss": 0.3674, + "step": 3030 + }, + { + "epoch": 6.471215351812367, + "grad_norm": 0.23621074855327606, + "learning_rate": 0.00019195375229164334, + "loss": 0.3599, + "step": 3035 + }, + { + "epoch": 6.481876332622601, + "grad_norm": 0.26253125071525574, + "learning_rate": 0.0001919049311474996, + "loss": 0.3708, + "step": 3040 + }, + { + "epoch": 6.492537313432836, + "grad_norm": 0.2214423567056656, + "learning_rate": 0.000191855968585158, + "loss": 0.3612, + "step": 3045 + }, + { + "epoch": 6.50319829424307, + "grad_norm": 0.24866749346256256, + "learning_rate": 0.00019180686467995935, + "loss": 0.3682, + "step": 3050 + }, + { + "epoch": 6.513859275053305, + "grad_norm": 0.2474697232246399, + "learning_rate": 0.00019175761950746204, + "loss": 0.354, + "step": 3055 + }, + { + "epoch": 6.524520255863539, + "grad_norm": 0.26961109042167664, + "learning_rate": 0.00019170823314344185, + "loss": 0.3708, + "step": 3060 + }, + { + "epoch": 6.535181236673774, + "grad_norm": 0.2510351538658142, + "learning_rate": 0.0001916587056638917, + "loss": 0.3667, + "step": 3065 + }, + { + "epoch": 6.545842217484008, + "grad_norm": 0.24457301199436188, + "learning_rate": 0.00019160903714502173, + "loss": 0.3679, + "step": 3070 + }, + { + "epoch": 6.556503198294243, + "grad_norm": 0.23988381028175354, + "learning_rate": 0.00019155922766325918, + "loss": 0.3608, + "step": 3075 + }, + { + "epoch": 6.567164179104478, + "grad_norm": 0.2317483127117157, + "learning_rate": 0.000191509277295248, + "loss": 0.3761, + "step": 3080 + }, + { + "epoch": 6.577825159914712, + "grad_norm": 0.2614232301712036, + "learning_rate": 0.0001914591861178491, + "loss": 0.3606, + "step": 3085 + }, + { + "epoch": 6.588486140724947, + "grad_norm": 0.24253317713737488, + "learning_rate": 0.00019140895420813997, + "loss": 0.362, + "step": 3090 + }, + { + "epoch": 6.599147121535181, + "grad_norm": 0.2507173418998718, + "learning_rate": 0.00019135858164341473, + "loss": 0.3594, + "step": 3095 + }, + { + "epoch": 6.609808102345416, + "grad_norm": 0.23574085533618927, + "learning_rate": 0.0001913080685011838, + "loss": 0.3661, + "step": 3100 + }, + { + "epoch": 6.62046908315565, + "grad_norm": 0.2325553447008133, + "learning_rate": 0.00019125741485917405, + "loss": 0.3756, + "step": 3105 + }, + { + "epoch": 6.631130063965885, + "grad_norm": 0.2191423624753952, + "learning_rate": 0.00019120662079532853, + "loss": 0.354, + "step": 3110 + }, + { + "epoch": 6.641791044776119, + "grad_norm": 0.21787339448928833, + "learning_rate": 0.00019115568638780622, + "loss": 0.3657, + "step": 3115 + }, + { + "epoch": 6.652452025586354, + "grad_norm": 0.21904399991035461, + "learning_rate": 0.0001911046117149822, + "loss": 0.367, + "step": 3120 + }, + { + "epoch": 6.663113006396588, + "grad_norm": 0.23119735717773438, + "learning_rate": 0.00019105339685544735, + "loss": 0.3646, + "step": 3125 + }, + { + "epoch": 6.673773987206823, + "grad_norm": 0.24613478779792786, + "learning_rate": 0.00019100204188800827, + "loss": 0.3682, + "step": 3130 + }, + { + "epoch": 6.684434968017058, + "grad_norm": 0.2366684079170227, + "learning_rate": 0.00019095054689168705, + "loss": 0.3714, + "step": 3135 + }, + { + "epoch": 6.6950959488272925, + "grad_norm": 0.2413744032382965, + "learning_rate": 0.0001908989119457214, + "loss": 0.3682, + "step": 3140 + }, + { + "epoch": 6.705756929637527, + "grad_norm": 0.23421700298786163, + "learning_rate": 0.00019084713712956428, + "loss": 0.3639, + "step": 3145 + }, + { + "epoch": 6.7164179104477615, + "grad_norm": 0.23423875868320465, + "learning_rate": 0.00019079522252288386, + "loss": 0.3655, + "step": 3150 + }, + { + "epoch": 6.727078891257996, + "grad_norm": 0.23802149295806885, + "learning_rate": 0.00019074316820556352, + "loss": 0.3708, + "step": 3155 + }, + { + "epoch": 6.73773987206823, + "grad_norm": 0.25665974617004395, + "learning_rate": 0.00019069097425770154, + "loss": 0.3762, + "step": 3160 + }, + { + "epoch": 6.748400852878465, + "grad_norm": 0.23551535606384277, + "learning_rate": 0.00019063864075961098, + "loss": 0.3687, + "step": 3165 + }, + { + "epoch": 6.759061833688699, + "grad_norm": 0.24098068475723267, + "learning_rate": 0.00019058616779181982, + "loss": 0.3659, + "step": 3170 + }, + { + "epoch": 6.769722814498934, + "grad_norm": 0.22562439739704132, + "learning_rate": 0.0001905335554350705, + "loss": 0.3724, + "step": 3175 + }, + { + "epoch": 6.780383795309168, + "grad_norm": 0.224997878074646, + "learning_rate": 0.00019048080377031995, + "loss": 0.3705, + "step": 3180 + }, + { + "epoch": 6.791044776119403, + "grad_norm": 0.2575388252735138, + "learning_rate": 0.00019042791287873957, + "loss": 0.3611, + "step": 3185 + }, + { + "epoch": 6.801705756929637, + "grad_norm": 0.231009379029274, + "learning_rate": 0.0001903748828417149, + "loss": 0.3653, + "step": 3190 + }, + { + "epoch": 6.812366737739872, + "grad_norm": 0.23769618570804596, + "learning_rate": 0.0001903217137408456, + "loss": 0.3615, + "step": 3195 + }, + { + "epoch": 6.823027718550106, + "grad_norm": 0.23301640152931213, + "learning_rate": 0.00019026840565794536, + "loss": 0.366, + "step": 3200 + }, + { + "epoch": 6.833688699360341, + "grad_norm": 0.2212369292974472, + "learning_rate": 0.00019021495867504163, + "loss": 0.3632, + "step": 3205 + }, + { + "epoch": 6.844349680170575, + "grad_norm": 0.23795363306999207, + "learning_rate": 0.0001901613728743757, + "loss": 0.3681, + "step": 3210 + }, + { + "epoch": 6.855010660980811, + "grad_norm": 0.24354343116283417, + "learning_rate": 0.00019010764833840243, + "loss": 0.3695, + "step": 3215 + }, + { + "epoch": 6.865671641791045, + "grad_norm": 0.24145299196243286, + "learning_rate": 0.00019005378514979008, + "loss": 0.3667, + "step": 3220 + }, + { + "epoch": 6.8763326226012795, + "grad_norm": 0.24070268869400024, + "learning_rate": 0.0001899997833914204, + "loss": 0.3693, + "step": 3225 + }, + { + "epoch": 6.886993603411514, + "grad_norm": 0.22578920423984528, + "learning_rate": 0.00018994564314638832, + "loss": 0.3692, + "step": 3230 + }, + { + "epoch": 6.8976545842217485, + "grad_norm": 0.22691179811954498, + "learning_rate": 0.00018989136449800174, + "loss": 0.3766, + "step": 3235 + }, + { + "epoch": 6.908315565031983, + "grad_norm": 0.2194678634405136, + "learning_rate": 0.0001898369475297817, + "loss": 0.3668, + "step": 3240 + }, + { + "epoch": 6.918976545842217, + "grad_norm": 0.22618421912193298, + "learning_rate": 0.000189782392325462, + "loss": 0.3592, + "step": 3245 + }, + { + "epoch": 6.929637526652452, + "grad_norm": 0.2549285292625427, + "learning_rate": 0.0001897276989689891, + "loss": 0.3653, + "step": 3250 + }, + { + "epoch": 6.940298507462686, + "grad_norm": 0.23101598024368286, + "learning_rate": 0.00018967286754452214, + "loss": 0.3569, + "step": 3255 + }, + { + "epoch": 6.950959488272921, + "grad_norm": 0.2506960332393646, + "learning_rate": 0.00018961789813643268, + "loss": 0.3633, + "step": 3260 + }, + { + "epoch": 6.961620469083155, + "grad_norm": 0.2284671515226364, + "learning_rate": 0.00018956279082930455, + "loss": 0.3624, + "step": 3265 + }, + { + "epoch": 6.97228144989339, + "grad_norm": 0.22146272659301758, + "learning_rate": 0.00018950754570793384, + "loss": 0.37, + "step": 3270 + }, + { + "epoch": 6.982942430703625, + "grad_norm": 0.2425510585308075, + "learning_rate": 0.00018945216285732864, + "loss": 0.366, + "step": 3275 + }, + { + "epoch": 6.99360341151386, + "grad_norm": 0.2304454892873764, + "learning_rate": 0.00018939664236270907, + "loss": 0.3684, + "step": 3280 + }, + { + "epoch": 7.0, + "eval_loss": 0.5168320536613464, + "eval_runtime": 377.6098, + "eval_samples_per_second": 1.091, + "eval_steps_per_second": 1.091, + "step": 3283 + }, + { + "epoch": 7.004264392324094, + "grad_norm": 0.2056385576725006, + "learning_rate": 0.00018934098430950692, + "loss": 0.3479, + "step": 3285 + }, + { + "epoch": 7.014925373134329, + "grad_norm": 0.2757323086261749, + "learning_rate": 0.0001892851887833657, + "loss": 0.333, + "step": 3290 + }, + { + "epoch": 7.025586353944563, + "grad_norm": 0.25871726870536804, + "learning_rate": 0.00018922925587014046, + "loss": 0.3212, + "step": 3295 + }, + { + "epoch": 7.036247334754798, + "grad_norm": 0.2494359016418457, + "learning_rate": 0.00018917318565589772, + "loss": 0.3248, + "step": 3300 + }, + { + "epoch": 7.046908315565032, + "grad_norm": 0.2385275512933731, + "learning_rate": 0.00018911697822691516, + "loss": 0.3189, + "step": 3305 + }, + { + "epoch": 7.0575692963752665, + "grad_norm": 0.2520158588886261, + "learning_rate": 0.00018906063366968165, + "loss": 0.3268, + "step": 3310 + }, + { + "epoch": 7.068230277185501, + "grad_norm": 0.25822409987449646, + "learning_rate": 0.00018900415207089708, + "loss": 0.3169, + "step": 3315 + }, + { + "epoch": 7.0788912579957355, + "grad_norm": 0.2619076669216156, + "learning_rate": 0.00018894753351747214, + "loss": 0.3279, + "step": 3320 + }, + { + "epoch": 7.08955223880597, + "grad_norm": 0.30978551506996155, + "learning_rate": 0.0001888907780965284, + "loss": 0.327, + "step": 3325 + }, + { + "epoch": 7.100213219616204, + "grad_norm": 0.25372347235679626, + "learning_rate": 0.00018883388589539785, + "loss": 0.3254, + "step": 3330 + }, + { + "epoch": 7.110874200426439, + "grad_norm": 0.27630311250686646, + "learning_rate": 0.0001887768570016231, + "loss": 0.3291, + "step": 3335 + }, + { + "epoch": 7.121535181236673, + "grad_norm": 0.2716643810272217, + "learning_rate": 0.00018871969150295706, + "loss": 0.3241, + "step": 3340 + }, + { + "epoch": 7.132196162046908, + "grad_norm": 0.2678888440132141, + "learning_rate": 0.00018866238948736278, + "loss": 0.3304, + "step": 3345 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.2532709240913391, + "learning_rate": 0.00018860495104301345, + "loss": 0.3331, + "step": 3350 + }, + { + "epoch": 7.153518123667378, + "grad_norm": 0.2671636939048767, + "learning_rate": 0.0001885473762582921, + "loss": 0.3315, + "step": 3355 + }, + { + "epoch": 7.164179104477612, + "grad_norm": 0.2550068497657776, + "learning_rate": 0.00018848966522179168, + "loss": 0.3306, + "step": 3360 + }, + { + "epoch": 7.174840085287847, + "grad_norm": 0.2700331211090088, + "learning_rate": 0.00018843181802231465, + "loss": 0.329, + "step": 3365 + }, + { + "epoch": 7.185501066098081, + "grad_norm": 0.26168689131736755, + "learning_rate": 0.00018837383474887314, + "loss": 0.3327, + "step": 3370 + }, + { + "epoch": 7.196162046908316, + "grad_norm": 0.24964787065982819, + "learning_rate": 0.00018831571549068852, + "loss": 0.3353, + "step": 3375 + }, + { + "epoch": 7.20682302771855, + "grad_norm": 0.2676330804824829, + "learning_rate": 0.00018825746033719149, + "loss": 0.3316, + "step": 3380 + }, + { + "epoch": 7.217484008528785, + "grad_norm": 0.25253960490226746, + "learning_rate": 0.0001881990693780219, + "loss": 0.3316, + "step": 3385 + }, + { + "epoch": 7.228144989339019, + "grad_norm": 0.257114440202713, + "learning_rate": 0.0001881405427030284, + "loss": 0.3307, + "step": 3390 + }, + { + "epoch": 7.2388059701492535, + "grad_norm": 0.25102248787879944, + "learning_rate": 0.00018808188040226868, + "loss": 0.3348, + "step": 3395 + }, + { + "epoch": 7.249466950959488, + "grad_norm": 0.25489816069602966, + "learning_rate": 0.000188023082566009, + "loss": 0.3342, + "step": 3400 + }, + { + "epoch": 7.2601279317697225, + "grad_norm": 0.27044063806533813, + "learning_rate": 0.00018796414928472417, + "loss": 0.3391, + "step": 3405 + }, + { + "epoch": 7.270788912579957, + "grad_norm": 0.26209956407546997, + "learning_rate": 0.00018790508064909746, + "loss": 0.3318, + "step": 3410 + }, + { + "epoch": 7.281449893390192, + "grad_norm": 0.25549113750457764, + "learning_rate": 0.00018784587675002045, + "loss": 0.3322, + "step": 3415 + }, + { + "epoch": 7.292110874200427, + "grad_norm": 0.26465660333633423, + "learning_rate": 0.00018778653767859274, + "loss": 0.3319, + "step": 3420 + }, + { + "epoch": 7.302771855010661, + "grad_norm": 0.2753106951713562, + "learning_rate": 0.00018772706352612203, + "loss": 0.3329, + "step": 3425 + }, + { + "epoch": 7.313432835820896, + "grad_norm": 0.2526467740535736, + "learning_rate": 0.00018766745438412384, + "loss": 0.3311, + "step": 3430 + }, + { + "epoch": 7.32409381663113, + "grad_norm": 0.2626464068889618, + "learning_rate": 0.00018760771034432138, + "loss": 0.3318, + "step": 3435 + }, + { + "epoch": 7.334754797441365, + "grad_norm": 0.2631151080131531, + "learning_rate": 0.0001875478314986455, + "loss": 0.3453, + "step": 3440 + }, + { + "epoch": 7.345415778251599, + "grad_norm": 0.25757527351379395, + "learning_rate": 0.0001874878179392344, + "loss": 0.3373, + "step": 3445 + }, + { + "epoch": 7.356076759061834, + "grad_norm": 0.2395113706588745, + "learning_rate": 0.0001874276697584336, + "loss": 0.331, + "step": 3450 + }, + { + "epoch": 7.366737739872068, + "grad_norm": 0.2804111838340759, + "learning_rate": 0.0001873673870487958, + "loss": 0.3378, + "step": 3455 + }, + { + "epoch": 7.377398720682303, + "grad_norm": 0.24439595639705658, + "learning_rate": 0.00018730696990308069, + "loss": 0.3381, + "step": 3460 + }, + { + "epoch": 7.388059701492537, + "grad_norm": 0.270958811044693, + "learning_rate": 0.00018724641841425478, + "loss": 0.3418, + "step": 3465 + }, + { + "epoch": 7.398720682302772, + "grad_norm": 0.2635878324508667, + "learning_rate": 0.0001871857326754914, + "loss": 0.3433, + "step": 3470 + }, + { + "epoch": 7.409381663113006, + "grad_norm": 0.24128612875938416, + "learning_rate": 0.00018712491278017032, + "loss": 0.3395, + "step": 3475 + }, + { + "epoch": 7.4200426439232405, + "grad_norm": 0.2588317096233368, + "learning_rate": 0.00018706395882187783, + "loss": 0.3415, + "step": 3480 + }, + { + "epoch": 7.430703624733475, + "grad_norm": 0.2590773105621338, + "learning_rate": 0.0001870028708944065, + "loss": 0.3392, + "step": 3485 + }, + { + "epoch": 7.44136460554371, + "grad_norm": 0.25688695907592773, + "learning_rate": 0.00018694164909175505, + "loss": 0.3385, + "step": 3490 + }, + { + "epoch": 7.452025586353945, + "grad_norm": 0.23704120516777039, + "learning_rate": 0.00018688029350812817, + "loss": 0.3356, + "step": 3495 + }, + { + "epoch": 7.462686567164179, + "grad_norm": 0.2817398011684418, + "learning_rate": 0.00018681880423793642, + "loss": 0.3368, + "step": 3500 + }, + { + "epoch": 7.473347547974414, + "grad_norm": 0.2590171694755554, + "learning_rate": 0.00018675718137579607, + "loss": 0.3382, + "step": 3505 + }, + { + "epoch": 7.484008528784648, + "grad_norm": 0.2843134105205536, + "learning_rate": 0.00018669542501652896, + "loss": 0.3304, + "step": 3510 + }, + { + "epoch": 7.494669509594883, + "grad_norm": 0.25284621119499207, + "learning_rate": 0.00018663353525516234, + "loss": 0.3337, + "step": 3515 + }, + { + "epoch": 7.505330490405117, + "grad_norm": 0.24715737998485565, + "learning_rate": 0.00018657151218692873, + "loss": 0.3373, + "step": 3520 + }, + { + "epoch": 7.515991471215352, + "grad_norm": 0.28074926137924194, + "learning_rate": 0.0001865093559072658, + "loss": 0.3376, + "step": 3525 + }, + { + "epoch": 7.526652452025586, + "grad_norm": 0.2531152367591858, + "learning_rate": 0.00018644706651181614, + "loss": 0.3329, + "step": 3530 + }, + { + "epoch": 7.537313432835821, + "grad_norm": 0.27217596769332886, + "learning_rate": 0.00018638464409642723, + "loss": 0.3486, + "step": 3535 + }, + { + "epoch": 7.547974413646055, + "grad_norm": 0.2517159581184387, + "learning_rate": 0.0001863220887571512, + "loss": 0.343, + "step": 3540 + }, + { + "epoch": 7.55863539445629, + "grad_norm": 0.2538190484046936, + "learning_rate": 0.00018625940059024477, + "loss": 0.3343, + "step": 3545 + }, + { + "epoch": 7.569296375266525, + "grad_norm": 0.26679527759552, + "learning_rate": 0.00018619657969216893, + "loss": 0.348, + "step": 3550 + }, + { + "epoch": 7.5799573560767595, + "grad_norm": 0.24433985352516174, + "learning_rate": 0.00018613362615958905, + "loss": 0.3455, + "step": 3555 + }, + { + "epoch": 7.590618336886994, + "grad_norm": 0.2719508111476898, + "learning_rate": 0.0001860705400893745, + "loss": 0.3414, + "step": 3560 + }, + { + "epoch": 7.601279317697228, + "grad_norm": 0.2666242718696594, + "learning_rate": 0.00018600732157859863, + "loss": 0.3384, + "step": 3565 + }, + { + "epoch": 7.611940298507463, + "grad_norm": 0.24249517917633057, + "learning_rate": 0.00018594397072453856, + "loss": 0.339, + "step": 3570 + }, + { + "epoch": 7.622601279317697, + "grad_norm": 0.2475687712430954, + "learning_rate": 0.00018588048762467502, + "loss": 0.3428, + "step": 3575 + }, + { + "epoch": 7.633262260127932, + "grad_norm": 0.2500527799129486, + "learning_rate": 0.00018581687237669234, + "loss": 0.3332, + "step": 3580 + }, + { + "epoch": 7.643923240938166, + "grad_norm": 0.2528587281703949, + "learning_rate": 0.0001857531250784781, + "loss": 0.3429, + "step": 3585 + }, + { + "epoch": 7.654584221748401, + "grad_norm": 0.2627830505371094, + "learning_rate": 0.0001856892458281231, + "loss": 0.3396, + "step": 3590 + }, + { + "epoch": 7.665245202558635, + "grad_norm": 0.2573624849319458, + "learning_rate": 0.00018562523472392118, + "loss": 0.3391, + "step": 3595 + }, + { + "epoch": 7.67590618336887, + "grad_norm": 0.2411065399646759, + "learning_rate": 0.0001855610918643691, + "loss": 0.3384, + "step": 3600 + }, + { + "epoch": 7.686567164179104, + "grad_norm": 0.2589527666568756, + "learning_rate": 0.00018549681734816623, + "loss": 0.3429, + "step": 3605 + }, + { + "epoch": 7.697228144989339, + "grad_norm": 0.2436107099056244, + "learning_rate": 0.00018543241127421474, + "loss": 0.3435, + "step": 3610 + }, + { + "epoch": 7.707889125799573, + "grad_norm": 0.272020161151886, + "learning_rate": 0.00018536787374161902, + "loss": 0.3418, + "step": 3615 + }, + { + "epoch": 7.718550106609808, + "grad_norm": 0.26080530881881714, + "learning_rate": 0.00018530320484968588, + "loss": 0.3367, + "step": 3620 + }, + { + "epoch": 7.729211087420042, + "grad_norm": 0.2503691613674164, + "learning_rate": 0.0001852384046979242, + "loss": 0.3367, + "step": 3625 + }, + { + "epoch": 7.7398720682302775, + "grad_norm": 0.26822352409362793, + "learning_rate": 0.0001851734733860449, + "loss": 0.3498, + "step": 3630 + }, + { + "epoch": 7.750533049040512, + "grad_norm": 0.28552523255348206, + "learning_rate": 0.00018510841101396062, + "loss": 0.3406, + "step": 3635 + }, + { + "epoch": 7.7611940298507465, + "grad_norm": 0.2446276843547821, + "learning_rate": 0.0001850432176817857, + "loss": 0.3465, + "step": 3640 + }, + { + "epoch": 7.771855010660981, + "grad_norm": 0.24052871763706207, + "learning_rate": 0.00018497789348983606, + "loss": 0.3434, + "step": 3645 + }, + { + "epoch": 7.782515991471215, + "grad_norm": 0.23899152874946594, + "learning_rate": 0.00018491243853862893, + "loss": 0.3365, + "step": 3650 + }, + { + "epoch": 7.79317697228145, + "grad_norm": 0.24732346832752228, + "learning_rate": 0.00018484685292888278, + "loss": 0.3382, + "step": 3655 + }, + { + "epoch": 7.803837953091684, + "grad_norm": 0.2519215941429138, + "learning_rate": 0.00018478113676151703, + "loss": 0.3463, + "step": 3660 + }, + { + "epoch": 7.814498933901919, + "grad_norm": 0.24091705679893494, + "learning_rate": 0.00018471529013765209, + "loss": 0.3404, + "step": 3665 + }, + { + "epoch": 7.825159914712153, + "grad_norm": 0.2794884443283081, + "learning_rate": 0.0001846493131586091, + "loss": 0.3469, + "step": 3670 + }, + { + "epoch": 7.835820895522388, + "grad_norm": 0.24296560883522034, + "learning_rate": 0.00018458320592590975, + "loss": 0.3434, + "step": 3675 + }, + { + "epoch": 7.846481876332622, + "grad_norm": 0.24800756573677063, + "learning_rate": 0.00018451696854127617, + "loss": 0.3384, + "step": 3680 + }, + { + "epoch": 7.857142857142857, + "grad_norm": 0.2350349873304367, + "learning_rate": 0.0001844506011066308, + "loss": 0.3428, + "step": 3685 + }, + { + "epoch": 7.867803837953092, + "grad_norm": 0.2573322355747223, + "learning_rate": 0.0001843841037240961, + "loss": 0.3463, + "step": 3690 + }, + { + "epoch": 7.878464818763327, + "grad_norm": 0.256381630897522, + "learning_rate": 0.00018431747649599463, + "loss": 0.3397, + "step": 3695 + }, + { + "epoch": 7.889125799573561, + "grad_norm": 0.23707297444343567, + "learning_rate": 0.0001842507195248486, + "loss": 0.3437, + "step": 3700 + }, + { + "epoch": 7.899786780383796, + "grad_norm": 0.24699944257736206, + "learning_rate": 0.00018418383291337988, + "loss": 0.3398, + "step": 3705 + }, + { + "epoch": 7.91044776119403, + "grad_norm": 0.25237977504730225, + "learning_rate": 0.00018411681676450999, + "loss": 0.3409, + "step": 3710 + }, + { + "epoch": 7.9211087420042645, + "grad_norm": 0.2656485438346863, + "learning_rate": 0.00018404967118135955, + "loss": 0.3487, + "step": 3715 + }, + { + "epoch": 7.931769722814499, + "grad_norm": 0.23709309101104736, + "learning_rate": 0.0001839823962672485, + "loss": 0.3398, + "step": 3720 + }, + { + "epoch": 7.9424307036247335, + "grad_norm": 0.24946698546409607, + "learning_rate": 0.00018391499212569573, + "loss": 0.3459, + "step": 3725 + }, + { + "epoch": 7.953091684434968, + "grad_norm": 0.2608436346054077, + "learning_rate": 0.00018384745886041898, + "loss": 0.3394, + "step": 3730 + }, + { + "epoch": 7.963752665245202, + "grad_norm": 0.2503463625907898, + "learning_rate": 0.00018377979657533468, + "loss": 0.3436, + "step": 3735 + }, + { + "epoch": 7.974413646055437, + "grad_norm": 0.2556673586368561, + "learning_rate": 0.0001837120053745578, + "loss": 0.3519, + "step": 3740 + }, + { + "epoch": 7.985074626865671, + "grad_norm": 0.24612018465995789, + "learning_rate": 0.0001836440853624017, + "loss": 0.3388, + "step": 3745 + }, + { + "epoch": 7.995735607675906, + "grad_norm": 0.26963427662849426, + "learning_rate": 0.00018357603664337786, + "loss": 0.3403, + "step": 3750 + }, + { + "epoch": 8.0, + "eval_loss": 0.5337910056114197, + "eval_runtime": 377.6371, + "eval_samples_per_second": 1.091, + "eval_steps_per_second": 1.091, + "step": 3752 + }, + { + "epoch": 8.00639658848614, + "grad_norm": 0.2208224982023239, + "learning_rate": 0.00018350785932219588, + "loss": 0.3081, + "step": 3755 + }, + { + "epoch": 8.017057569296375, + "grad_norm": 0.30632683634757996, + "learning_rate": 0.00018343955350376325, + "loss": 0.2978, + "step": 3760 + }, + { + "epoch": 8.02771855010661, + "grad_norm": 0.25390052795410156, + "learning_rate": 0.00018337111929318516, + "loss": 0.2948, + "step": 3765 + }, + { + "epoch": 8.038379530916844, + "grad_norm": 0.296369731426239, + "learning_rate": 0.00018330255679576438, + "loss": 0.2963, + "step": 3770 + }, + { + "epoch": 8.049040511727078, + "grad_norm": 0.2958175837993622, + "learning_rate": 0.00018323386611700105, + "loss": 0.2905, + "step": 3775 + }, + { + "epoch": 8.059701492537313, + "grad_norm": 0.2595365047454834, + "learning_rate": 0.00018316504736259255, + "loss": 0.2918, + "step": 3780 + }, + { + "epoch": 8.070362473347547, + "grad_norm": 0.2825353145599365, + "learning_rate": 0.00018309610063843337, + "loss": 0.3, + "step": 3785 + }, + { + "epoch": 8.081023454157782, + "grad_norm": 0.2677433490753174, + "learning_rate": 0.00018302702605061492, + "loss": 0.2964, + "step": 3790 + }, + { + "epoch": 8.091684434968018, + "grad_norm": 0.28075000643730164, + "learning_rate": 0.00018295782370542532, + "loss": 0.2979, + "step": 3795 + }, + { + "epoch": 8.102345415778252, + "grad_norm": 0.2629709243774414, + "learning_rate": 0.00018288849370934926, + "loss": 0.3005, + "step": 3800 + }, + { + "epoch": 8.113006396588487, + "grad_norm": 0.2850215435028076, + "learning_rate": 0.00018281903616906796, + "loss": 0.2976, + "step": 3805 + }, + { + "epoch": 8.123667377398721, + "grad_norm": 0.29631924629211426, + "learning_rate": 0.0001827494511914587, + "loss": 0.2938, + "step": 3810 + }, + { + "epoch": 8.134328358208956, + "grad_norm": 0.26315709948539734, + "learning_rate": 0.00018267973888359509, + "loss": 0.3021, + "step": 3815 + }, + { + "epoch": 8.14498933901919, + "grad_norm": 0.30577051639556885, + "learning_rate": 0.0001826098993527465, + "loss": 0.2996, + "step": 3820 + }, + { + "epoch": 8.155650319829425, + "grad_norm": 0.2897678315639496, + "learning_rate": 0.0001825399327063781, + "loss": 0.3048, + "step": 3825 + }, + { + "epoch": 8.16631130063966, + "grad_norm": 0.3003354072570801, + "learning_rate": 0.00018246983905215075, + "loss": 0.3075, + "step": 3830 + }, + { + "epoch": 8.176972281449894, + "grad_norm": 0.28864815831184387, + "learning_rate": 0.00018239961849792055, + "loss": 0.3091, + "step": 3835 + }, + { + "epoch": 8.187633262260128, + "grad_norm": 0.28102535009384155, + "learning_rate": 0.0001823292711517391, + "loss": 0.2969, + "step": 3840 + }, + { + "epoch": 8.198294243070363, + "grad_norm": 0.2669455409049988, + "learning_rate": 0.00018225879712185293, + "loss": 0.3061, + "step": 3845 + }, + { + "epoch": 8.208955223880597, + "grad_norm": 0.2893795669078827, + "learning_rate": 0.00018218819651670356, + "loss": 0.3003, + "step": 3850 + }, + { + "epoch": 8.219616204690832, + "grad_norm": 0.31041857600212097, + "learning_rate": 0.00018211746944492727, + "loss": 0.3069, + "step": 3855 + }, + { + "epoch": 8.230277185501066, + "grad_norm": 0.2678110599517822, + "learning_rate": 0.000182046616015355, + "loss": 0.3023, + "step": 3860 + }, + { + "epoch": 8.2409381663113, + "grad_norm": 0.3051944375038147, + "learning_rate": 0.00018197563633701196, + "loss": 0.3095, + "step": 3865 + }, + { + "epoch": 8.251599147121535, + "grad_norm": 0.267646461725235, + "learning_rate": 0.00018190453051911782, + "loss": 0.3047, + "step": 3870 + }, + { + "epoch": 8.26226012793177, + "grad_norm": 0.27988821268081665, + "learning_rate": 0.00018183329867108624, + "loss": 0.3132, + "step": 3875 + }, + { + "epoch": 8.272921108742004, + "grad_norm": 0.293363094329834, + "learning_rate": 0.0001817619409025248, + "loss": 0.3054, + "step": 3880 + }, + { + "epoch": 8.283582089552239, + "grad_norm": 0.28679507970809937, + "learning_rate": 0.00018169045732323492, + "loss": 0.3049, + "step": 3885 + }, + { + "epoch": 8.294243070362473, + "grad_norm": 0.28792116045951843, + "learning_rate": 0.0001816188480432115, + "loss": 0.3112, + "step": 3890 + }, + { + "epoch": 8.304904051172707, + "grad_norm": 0.2938394844532013, + "learning_rate": 0.00018154711317264297, + "loss": 0.3101, + "step": 3895 + }, + { + "epoch": 8.315565031982942, + "grad_norm": 0.2776646316051483, + "learning_rate": 0.00018147525282191093, + "loss": 0.3046, + "step": 3900 + }, + { + "epoch": 8.326226012793176, + "grad_norm": 0.2619486153125763, + "learning_rate": 0.00018140326710159007, + "loss": 0.3066, + "step": 3905 + }, + { + "epoch": 8.336886993603411, + "grad_norm": 0.2895703911781311, + "learning_rate": 0.00018133115612244807, + "loss": 0.3122, + "step": 3910 + }, + { + "epoch": 8.347547974413645, + "grad_norm": 0.2928364872932434, + "learning_rate": 0.00018125891999544525, + "loss": 0.303, + "step": 3915 + }, + { + "epoch": 8.35820895522388, + "grad_norm": 0.27352485060691833, + "learning_rate": 0.00018118655883173456, + "loss": 0.301, + "step": 3920 + }, + { + "epoch": 8.368869936034114, + "grad_norm": 0.3004440665245056, + "learning_rate": 0.00018111407274266136, + "loss": 0.3084, + "step": 3925 + }, + { + "epoch": 8.379530916844349, + "grad_norm": 0.26515400409698486, + "learning_rate": 0.00018104146183976316, + "loss": 0.3052, + "step": 3930 + }, + { + "epoch": 8.390191897654585, + "grad_norm": 0.29159972071647644, + "learning_rate": 0.00018096872623476963, + "loss": 0.3018, + "step": 3935 + }, + { + "epoch": 8.40085287846482, + "grad_norm": 0.31077924370765686, + "learning_rate": 0.00018089586603960224, + "loss": 0.3139, + "step": 3940 + }, + { + "epoch": 8.411513859275054, + "grad_norm": 0.2826644480228424, + "learning_rate": 0.00018082288136637422, + "loss": 0.2955, + "step": 3945 + }, + { + "epoch": 8.422174840085288, + "grad_norm": 0.2825087308883667, + "learning_rate": 0.00018074977232739031, + "loss": 0.3127, + "step": 3950 + }, + { + "epoch": 8.432835820895523, + "grad_norm": 0.2901898920536041, + "learning_rate": 0.0001806765390351467, + "loss": 0.3099, + "step": 3955 + }, + { + "epoch": 8.443496801705757, + "grad_norm": 0.28308314085006714, + "learning_rate": 0.00018060318160233063, + "loss": 0.3122, + "step": 3960 + }, + { + "epoch": 8.454157782515992, + "grad_norm": 0.26890453696250916, + "learning_rate": 0.00018052970014182046, + "loss": 0.3156, + "step": 3965 + }, + { + "epoch": 8.464818763326226, + "grad_norm": 0.2962822914123535, + "learning_rate": 0.00018045609476668545, + "loss": 0.3184, + "step": 3970 + }, + { + "epoch": 8.47547974413646, + "grad_norm": 0.2848854959011078, + "learning_rate": 0.00018038236559018533, + "loss": 0.309, + "step": 3975 + }, + { + "epoch": 8.486140724946695, + "grad_norm": 0.3047114312648773, + "learning_rate": 0.00018030851272577051, + "loss": 0.3118, + "step": 3980 + }, + { + "epoch": 8.49680170575693, + "grad_norm": 0.28175976872444153, + "learning_rate": 0.00018023453628708173, + "loss": 0.3074, + "step": 3985 + }, + { + "epoch": 8.507462686567164, + "grad_norm": 0.27742594480514526, + "learning_rate": 0.00018016043638794974, + "loss": 0.3127, + "step": 3990 + }, + { + "epoch": 8.518123667377399, + "grad_norm": 0.28773581981658936, + "learning_rate": 0.0001800862131423954, + "loss": 0.3057, + "step": 3995 + }, + { + "epoch": 8.528784648187633, + "grad_norm": 0.2765009105205536, + "learning_rate": 0.00018001186666462927, + "loss": 0.3128, + "step": 4000 + }, + { + "epoch": 8.539445628997868, + "grad_norm": 0.2800111174583435, + "learning_rate": 0.00017993739706905162, + "loss": 0.3096, + "step": 4005 + }, + { + "epoch": 8.550106609808102, + "grad_norm": 0.30302369594573975, + "learning_rate": 0.00017986280447025209, + "loss": 0.3016, + "step": 4010 + }, + { + "epoch": 8.560767590618337, + "grad_norm": 0.2798007130622864, + "learning_rate": 0.0001797880889830096, + "loss": 0.3061, + "step": 4015 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 0.29015523195266724, + "learning_rate": 0.00017971325072229226, + "loss": 0.3134, + "step": 4020 + }, + { + "epoch": 8.582089552238806, + "grad_norm": 0.3815457820892334, + "learning_rate": 0.00017963828980325697, + "loss": 0.3131, + "step": 4025 + }, + { + "epoch": 8.59275053304904, + "grad_norm": 0.2907319664955139, + "learning_rate": 0.00017956320634124944, + "loss": 0.314, + "step": 4030 + }, + { + "epoch": 8.603411513859275, + "grad_norm": 0.29612481594085693, + "learning_rate": 0.00017948800045180393, + "loss": 0.3168, + "step": 4035 + }, + { + "epoch": 8.614072494669509, + "grad_norm": 0.2797704339027405, + "learning_rate": 0.00017941267225064306, + "loss": 0.3144, + "step": 4040 + }, + { + "epoch": 8.624733475479744, + "grad_norm": 0.27811723947525024, + "learning_rate": 0.00017933722185367774, + "loss": 0.303, + "step": 4045 + }, + { + "epoch": 8.635394456289978, + "grad_norm": 0.2933618724346161, + "learning_rate": 0.00017926164937700676, + "loss": 0.3097, + "step": 4050 + }, + { + "epoch": 8.646055437100213, + "grad_norm": 0.282921701669693, + "learning_rate": 0.0001791859549369169, + "loss": 0.3104, + "step": 4055 + }, + { + "epoch": 8.656716417910447, + "grad_norm": 0.2758900225162506, + "learning_rate": 0.00017911013864988252, + "loss": 0.3108, + "step": 4060 + }, + { + "epoch": 8.667377398720681, + "grad_norm": 0.2904449999332428, + "learning_rate": 0.00017903420063256555, + "loss": 0.3209, + "step": 4065 + }, + { + "epoch": 8.678038379530918, + "grad_norm": 0.28849634528160095, + "learning_rate": 0.00017895814100181515, + "loss": 0.3055, + "step": 4070 + }, + { + "epoch": 8.688699360341152, + "grad_norm": 0.2709294259548187, + "learning_rate": 0.0001788819598746677, + "loss": 0.3167, + "step": 4075 + }, + { + "epoch": 8.699360341151387, + "grad_norm": 0.28200262784957886, + "learning_rate": 0.0001788056573683464, + "loss": 0.307, + "step": 4080 + }, + { + "epoch": 8.710021321961621, + "grad_norm": 0.27431854605674744, + "learning_rate": 0.00017872923360026137, + "loss": 0.3163, + "step": 4085 + }, + { + "epoch": 8.720682302771856, + "grad_norm": 0.28479164838790894, + "learning_rate": 0.00017865268868800925, + "loss": 0.3257, + "step": 4090 + }, + { + "epoch": 8.73134328358209, + "grad_norm": 0.2959545850753784, + "learning_rate": 0.00017857602274937308, + "loss": 0.3138, + "step": 4095 + }, + { + "epoch": 8.742004264392325, + "grad_norm": 0.270533949136734, + "learning_rate": 0.00017849923590232213, + "loss": 0.3182, + "step": 4100 + }, + { + "epoch": 8.752665245202559, + "grad_norm": 0.26438501477241516, + "learning_rate": 0.0001784223282650118, + "loss": 0.3084, + "step": 4105 + }, + { + "epoch": 8.763326226012794, + "grad_norm": 0.2890710234642029, + "learning_rate": 0.00017834529995578317, + "loss": 0.3093, + "step": 4110 + }, + { + "epoch": 8.773987206823028, + "grad_norm": 0.2725368142127991, + "learning_rate": 0.0001782681510931632, + "loss": 0.3185, + "step": 4115 + }, + { + "epoch": 8.784648187633262, + "grad_norm": 0.2648097276687622, + "learning_rate": 0.00017819088179586427, + "loss": 0.3126, + "step": 4120 + }, + { + "epoch": 8.795309168443497, + "grad_norm": 0.27868813276290894, + "learning_rate": 0.00017811349218278407, + "loss": 0.3157, + "step": 4125 + }, + { + "epoch": 8.805970149253731, + "grad_norm": 0.3133993446826935, + "learning_rate": 0.00017803598237300537, + "loss": 0.3128, + "step": 4130 + }, + { + "epoch": 8.816631130063966, + "grad_norm": 0.270416796207428, + "learning_rate": 0.00017795835248579606, + "loss": 0.3087, + "step": 4135 + }, + { + "epoch": 8.8272921108742, + "grad_norm": 0.299452543258667, + "learning_rate": 0.00017788060264060864, + "loss": 0.3126, + "step": 4140 + }, + { + "epoch": 8.837953091684435, + "grad_norm": 0.2789115607738495, + "learning_rate": 0.00017780273295708025, + "loss": 0.3149, + "step": 4145 + }, + { + "epoch": 8.84861407249467, + "grad_norm": 0.2616700828075409, + "learning_rate": 0.0001777247435550324, + "loss": 0.3151, + "step": 4150 + }, + { + "epoch": 8.859275053304904, + "grad_norm": 0.2998231053352356, + "learning_rate": 0.0001776466345544709, + "loss": 0.3143, + "step": 4155 + }, + { + "epoch": 8.869936034115138, + "grad_norm": 0.2851693034172058, + "learning_rate": 0.00017756840607558553, + "loss": 0.3153, + "step": 4160 + }, + { + "epoch": 8.880597014925373, + "grad_norm": 0.2862933874130249, + "learning_rate": 0.00017749005823874988, + "loss": 0.3124, + "step": 4165 + }, + { + "epoch": 8.891257995735607, + "grad_norm": 0.29242345690727234, + "learning_rate": 0.00017741159116452132, + "loss": 0.3137, + "step": 4170 + }, + { + "epoch": 8.901918976545842, + "grad_norm": 0.3226570188999176, + "learning_rate": 0.00017733300497364054, + "loss": 0.3168, + "step": 4175 + }, + { + "epoch": 8.912579957356076, + "grad_norm": 0.31018882989883423, + "learning_rate": 0.00017725429978703163, + "loss": 0.3162, + "step": 4180 + }, + { + "epoch": 8.92324093816631, + "grad_norm": 0.30581411719322205, + "learning_rate": 0.00017717547572580178, + "loss": 0.3166, + "step": 4185 + }, + { + "epoch": 8.933901918976545, + "grad_norm": 0.27954214811325073, + "learning_rate": 0.00017709653291124103, + "loss": 0.3175, + "step": 4190 + }, + { + "epoch": 8.94456289978678, + "grad_norm": 0.2803252041339874, + "learning_rate": 0.00017701747146482222, + "loss": 0.3228, + "step": 4195 + }, + { + "epoch": 8.955223880597014, + "grad_norm": 0.27694806456565857, + "learning_rate": 0.00017693829150820068, + "loss": 0.3152, + "step": 4200 + }, + { + "epoch": 8.96588486140725, + "grad_norm": 0.2755722403526306, + "learning_rate": 0.00017685899316321422, + "loss": 0.3105, + "step": 4205 + }, + { + "epoch": 8.976545842217483, + "grad_norm": 0.26287201046943665, + "learning_rate": 0.00017677957655188258, + "loss": 0.3146, + "step": 4210 + }, + { + "epoch": 8.98720682302772, + "grad_norm": 0.2679538428783417, + "learning_rate": 0.00017670004179640774, + "loss": 0.3196, + "step": 4215 + }, + { + "epoch": 8.997867803837954, + "grad_norm": 0.2998240292072296, + "learning_rate": 0.0001766203890191733, + "loss": 0.311, + "step": 4220 + }, + { + "epoch": 9.0, + "eval_loss": 0.556614875793457, + "eval_runtime": 377.56, + "eval_samples_per_second": 1.091, + "eval_steps_per_second": 1.091, + "step": 4221 + }, + { + "epoch": 9.008528784648188, + "grad_norm": 0.2680657207965851, + "learning_rate": 0.00017654061834274453, + "loss": 0.2787, + "step": 4225 + }, + { + "epoch": 9.019189765458423, + "grad_norm": 0.28186333179473877, + "learning_rate": 0.00017646072988986816, + "loss": 0.2668, + "step": 4230 + }, + { + "epoch": 9.029850746268657, + "grad_norm": 0.3159712255001068, + "learning_rate": 0.00017638072378347203, + "loss": 0.2681, + "step": 4235 + }, + { + "epoch": 9.040511727078892, + "grad_norm": 0.29439476132392883, + "learning_rate": 0.00017630060014666514, + "loss": 0.2644, + "step": 4240 + }, + { + "epoch": 9.051172707889126, + "grad_norm": 0.27110064029693604, + "learning_rate": 0.00017622035910273726, + "loss": 0.2645, + "step": 4245 + }, + { + "epoch": 9.06183368869936, + "grad_norm": 0.3253141939640045, + "learning_rate": 0.00017614000077515886, + "loss": 0.2668, + "step": 4250 + }, + { + "epoch": 9.072494669509595, + "grad_norm": 0.27271440625190735, + "learning_rate": 0.00017605952528758085, + "loss": 0.2636, + "step": 4255 + }, + { + "epoch": 9.08315565031983, + "grad_norm": 0.3024181127548218, + "learning_rate": 0.00017597893276383446, + "loss": 0.2651, + "step": 4260 + }, + { + "epoch": 9.093816631130064, + "grad_norm": 0.29704058170318604, + "learning_rate": 0.00017589822332793098, + "loss": 0.2705, + "step": 4265 + }, + { + "epoch": 9.104477611940299, + "grad_norm": 0.3102332055568695, + "learning_rate": 0.0001758173971040616, + "loss": 0.2645, + "step": 4270 + }, + { + "epoch": 9.115138592750533, + "grad_norm": 0.28398755192756653, + "learning_rate": 0.00017573645421659715, + "loss": 0.2695, + "step": 4275 + }, + { + "epoch": 9.125799573560768, + "grad_norm": 0.3188519775867462, + "learning_rate": 0.00017565539479008814, + "loss": 0.272, + "step": 4280 + }, + { + "epoch": 9.136460554371002, + "grad_norm": 0.30803632736206055, + "learning_rate": 0.0001755742189492643, + "loss": 0.268, + "step": 4285 + }, + { + "epoch": 9.147121535181236, + "grad_norm": 0.3042227327823639, + "learning_rate": 0.00017549292681903444, + "loss": 0.2659, + "step": 4290 + }, + { + "epoch": 9.157782515991471, + "grad_norm": 0.3055075407028198, + "learning_rate": 0.00017541151852448644, + "loss": 0.2705, + "step": 4295 + }, + { + "epoch": 9.168443496801705, + "grad_norm": 0.3084838092327118, + "learning_rate": 0.00017532999419088682, + "loss": 0.2711, + "step": 4300 + }, + { + "epoch": 9.17910447761194, + "grad_norm": 0.3110904395580292, + "learning_rate": 0.00017524835394368065, + "loss": 0.2678, + "step": 4305 + }, + { + "epoch": 9.189765458422174, + "grad_norm": 0.3138080835342407, + "learning_rate": 0.0001751665979084915, + "loss": 0.2715, + "step": 4310 + }, + { + "epoch": 9.200426439232409, + "grad_norm": 0.2787773609161377, + "learning_rate": 0.00017508472621112093, + "loss": 0.2764, + "step": 4315 + }, + { + "epoch": 9.211087420042643, + "grad_norm": 0.31073546409606934, + "learning_rate": 0.0001750027389775486, + "loss": 0.2745, + "step": 4320 + }, + { + "epoch": 9.221748400852878, + "grad_norm": 0.3100415766239166, + "learning_rate": 0.00017492063633393188, + "loss": 0.2731, + "step": 4325 + }, + { + "epoch": 9.232409381663112, + "grad_norm": 0.300081342458725, + "learning_rate": 0.00017483841840660577, + "loss": 0.2711, + "step": 4330 + }, + { + "epoch": 9.243070362473347, + "grad_norm": 0.31163203716278076, + "learning_rate": 0.0001747560853220826, + "loss": 0.2786, + "step": 4335 + }, + { + "epoch": 9.253731343283581, + "grad_norm": 0.33607375621795654, + "learning_rate": 0.00017467363720705204, + "loss": 0.2728, + "step": 4340 + }, + { + "epoch": 9.264392324093816, + "grad_norm": 0.300729900598526, + "learning_rate": 0.0001745910741883806, + "loss": 0.2749, + "step": 4345 + }, + { + "epoch": 9.275053304904052, + "grad_norm": 0.3036794364452362, + "learning_rate": 0.00017450839639311162, + "loss": 0.2726, + "step": 4350 + }, + { + "epoch": 9.285714285714286, + "grad_norm": 0.32798221707344055, + "learning_rate": 0.00017442560394846516, + "loss": 0.2752, + "step": 4355 + }, + { + "epoch": 9.296375266524521, + "grad_norm": 0.2973875105381012, + "learning_rate": 0.00017434269698183763, + "loss": 0.2743, + "step": 4360 + }, + { + "epoch": 9.307036247334755, + "grad_norm": 0.3339863717556, + "learning_rate": 0.00017425967562080167, + "loss": 0.2766, + "step": 4365 + }, + { + "epoch": 9.31769722814499, + "grad_norm": 0.30738508701324463, + "learning_rate": 0.00017417653999310585, + "loss": 0.2728, + "step": 4370 + }, + { + "epoch": 9.328358208955224, + "grad_norm": 0.3430582284927368, + "learning_rate": 0.0001740932902266747, + "loss": 0.2744, + "step": 4375 + }, + { + "epoch": 9.339019189765459, + "grad_norm": 0.2887689769268036, + "learning_rate": 0.00017400992644960842, + "loss": 0.2772, + "step": 4380 + }, + { + "epoch": 9.349680170575693, + "grad_norm": 0.3249075412750244, + "learning_rate": 0.0001739264487901824, + "loss": 0.2757, + "step": 4385 + }, + { + "epoch": 9.360341151385928, + "grad_norm": 0.31958818435668945, + "learning_rate": 0.00017384285737684753, + "loss": 0.2744, + "step": 4390 + }, + { + "epoch": 9.371002132196162, + "grad_norm": 0.31824401021003723, + "learning_rate": 0.0001737591523382296, + "loss": 0.2809, + "step": 4395 + }, + { + "epoch": 9.381663113006397, + "grad_norm": 0.3125913143157959, + "learning_rate": 0.00017367533380312924, + "loss": 0.276, + "step": 4400 + }, + { + "epoch": 9.392324093816631, + "grad_norm": 0.32215094566345215, + "learning_rate": 0.0001735914019005218, + "loss": 0.2746, + "step": 4405 + }, + { + "epoch": 9.402985074626866, + "grad_norm": 0.3145129382610321, + "learning_rate": 0.00017350735675955697, + "loss": 0.2818, + "step": 4410 + }, + { + "epoch": 9.4136460554371, + "grad_norm": 0.3180083930492401, + "learning_rate": 0.0001734231985095588, + "loss": 0.2782, + "step": 4415 + }, + { + "epoch": 9.424307036247335, + "grad_norm": 0.307829350233078, + "learning_rate": 0.00017333892728002527, + "loss": 0.2744, + "step": 4420 + }, + { + "epoch": 9.43496801705757, + "grad_norm": 0.3098660111427307, + "learning_rate": 0.00017325454320062832, + "loss": 0.2794, + "step": 4425 + }, + { + "epoch": 9.445628997867804, + "grad_norm": 0.2991037666797638, + "learning_rate": 0.0001731700464012134, + "loss": 0.2778, + "step": 4430 + }, + { + "epoch": 9.456289978678038, + "grad_norm": 0.3197588622570038, + "learning_rate": 0.0001730854370117996, + "loss": 0.2764, + "step": 4435 + }, + { + "epoch": 9.466950959488273, + "grad_norm": 0.31818678975105286, + "learning_rate": 0.00017300071516257904, + "loss": 0.2754, + "step": 4440 + }, + { + "epoch": 9.477611940298507, + "grad_norm": 0.3030422031879425, + "learning_rate": 0.000172915880983917, + "loss": 0.2795, + "step": 4445 + }, + { + "epoch": 9.488272921108742, + "grad_norm": 0.304565966129303, + "learning_rate": 0.00017283093460635166, + "loss": 0.2837, + "step": 4450 + }, + { + "epoch": 9.498933901918976, + "grad_norm": 0.3034186363220215, + "learning_rate": 0.00017274587616059376, + "loss": 0.2768, + "step": 4455 + }, + { + "epoch": 9.50959488272921, + "grad_norm": 0.30095112323760986, + "learning_rate": 0.00017266070577752647, + "loss": 0.2786, + "step": 4460 + }, + { + "epoch": 9.520255863539445, + "grad_norm": 0.3102254271507263, + "learning_rate": 0.0001725754235882053, + "loss": 0.2776, + "step": 4465 + }, + { + "epoch": 9.53091684434968, + "grad_norm": 0.2985278367996216, + "learning_rate": 0.00017249002972385765, + "loss": 0.2784, + "step": 4470 + }, + { + "epoch": 9.541577825159914, + "grad_norm": 0.32831713557243347, + "learning_rate": 0.00017240452431588294, + "loss": 0.2869, + "step": 4475 + }, + { + "epoch": 9.552238805970148, + "grad_norm": 0.3177868127822876, + "learning_rate": 0.0001723189074958521, + "loss": 0.2784, + "step": 4480 + }, + { + "epoch": 9.562899786780385, + "grad_norm": 0.3071228265762329, + "learning_rate": 0.00017223317939550753, + "loss": 0.2804, + "step": 4485 + }, + { + "epoch": 9.57356076759062, + "grad_norm": 0.3183000981807709, + "learning_rate": 0.00017214734014676288, + "loss": 0.2799, + "step": 4490 + }, + { + "epoch": 9.584221748400854, + "grad_norm": 0.33166825771331787, + "learning_rate": 0.00017206138988170281, + "loss": 0.2828, + "step": 4495 + }, + { + "epoch": 9.594882729211088, + "grad_norm": 0.3132229149341583, + "learning_rate": 0.0001719753287325828, + "loss": 0.279, + "step": 4500 + }, + { + "epoch": 9.605543710021323, + "grad_norm": 0.3281535506248474, + "learning_rate": 0.00017188915683182896, + "loss": 0.2767, + "step": 4505 + }, + { + "epoch": 9.616204690831557, + "grad_norm": 0.31389063596725464, + "learning_rate": 0.00017180287431203781, + "loss": 0.2851, + "step": 4510 + }, + { + "epoch": 9.626865671641792, + "grad_norm": 0.315807580947876, + "learning_rate": 0.00017171648130597612, + "loss": 0.2816, + "step": 4515 + }, + { + "epoch": 9.637526652452026, + "grad_norm": 0.3103027939796448, + "learning_rate": 0.0001716299779465806, + "loss": 0.2797, + "step": 4520 + }, + { + "epoch": 9.64818763326226, + "grad_norm": 0.3018797039985657, + "learning_rate": 0.00017154336436695785, + "loss": 0.2827, + "step": 4525 + }, + { + "epoch": 9.658848614072495, + "grad_norm": 0.3306185007095337, + "learning_rate": 0.00017145664070038406, + "loss": 0.2861, + "step": 4530 + }, + { + "epoch": 9.66950959488273, + "grad_norm": 0.3151242434978485, + "learning_rate": 0.0001713698070803047, + "loss": 0.2855, + "step": 4535 + }, + { + "epoch": 9.680170575692964, + "grad_norm": 0.3073995113372803, + "learning_rate": 0.0001712828636403346, + "loss": 0.2825, + "step": 4540 + }, + { + "epoch": 9.690831556503198, + "grad_norm": 0.31615933775901794, + "learning_rate": 0.00017119581051425742, + "loss": 0.2791, + "step": 4545 + }, + { + "epoch": 9.701492537313433, + "grad_norm": 0.3101312816143036, + "learning_rate": 0.0001711086478360257, + "loss": 0.287, + "step": 4550 + }, + { + "epoch": 9.712153518123667, + "grad_norm": 0.3094468116760254, + "learning_rate": 0.00017102137573976058, + "loss": 0.2804, + "step": 4555 + }, + { + "epoch": 9.722814498933902, + "grad_norm": 0.33349186182022095, + "learning_rate": 0.00017093399435975142, + "loss": 0.2773, + "step": 4560 + }, + { + "epoch": 9.733475479744136, + "grad_norm": 0.2954055368900299, + "learning_rate": 0.00017084650383045587, + "loss": 0.2762, + "step": 4565 + }, + { + "epoch": 9.74413646055437, + "grad_norm": 0.2962237000465393, + "learning_rate": 0.0001707589042864995, + "loss": 0.2861, + "step": 4570 + }, + { + "epoch": 9.754797441364605, + "grad_norm": 0.3323478698730469, + "learning_rate": 0.00017067119586267556, + "loss": 0.2861, + "step": 4575 + }, + { + "epoch": 9.76545842217484, + "grad_norm": 0.2926410138607025, + "learning_rate": 0.000170583378693945, + "loss": 0.2817, + "step": 4580 + }, + { + "epoch": 9.776119402985074, + "grad_norm": 0.3227819502353668, + "learning_rate": 0.0001704954529154359, + "loss": 0.2884, + "step": 4585 + }, + { + "epoch": 9.786780383795309, + "grad_norm": 0.32089999318122864, + "learning_rate": 0.00017040741866244358, + "loss": 0.2881, + "step": 4590 + }, + { + "epoch": 9.797441364605543, + "grad_norm": 0.3188937306404114, + "learning_rate": 0.0001703192760704303, + "loss": 0.2855, + "step": 4595 + }, + { + "epoch": 9.808102345415778, + "grad_norm": 0.3184082508087158, + "learning_rate": 0.00017023102527502496, + "loss": 0.2842, + "step": 4600 + }, + { + "epoch": 9.818763326226012, + "grad_norm": 0.2914822995662689, + "learning_rate": 0.00017014266641202292, + "loss": 0.274, + "step": 4605 + }, + { + "epoch": 9.829424307036247, + "grad_norm": 0.33117881417274475, + "learning_rate": 0.00017005419961738593, + "loss": 0.2888, + "step": 4610 + }, + { + "epoch": 9.840085287846481, + "grad_norm": 0.32017573714256287, + "learning_rate": 0.0001699656250272418, + "loss": 0.2785, + "step": 4615 + }, + { + "epoch": 9.850746268656717, + "grad_norm": 0.29259586334228516, + "learning_rate": 0.00016987694277788417, + "loss": 0.2888, + "step": 4620 + }, + { + "epoch": 9.86140724946695, + "grad_norm": 0.29314401745796204, + "learning_rate": 0.00016978815300577234, + "loss": 0.2826, + "step": 4625 + }, + { + "epoch": 9.872068230277186, + "grad_norm": 0.3312009572982788, + "learning_rate": 0.00016969925584753108, + "loss": 0.2828, + "step": 4630 + }, + { + "epoch": 9.88272921108742, + "grad_norm": 0.31798672676086426, + "learning_rate": 0.00016961025143995037, + "loss": 0.2777, + "step": 4635 + }, + { + "epoch": 9.893390191897655, + "grad_norm": 0.2987801134586334, + "learning_rate": 0.00016952113991998527, + "loss": 0.2818, + "step": 4640 + }, + { + "epoch": 9.90405117270789, + "grad_norm": 0.3148316442966461, + "learning_rate": 0.00016943192142475564, + "loss": 0.2853, + "step": 4645 + }, + { + "epoch": 9.914712153518124, + "grad_norm": 0.3207818269729614, + "learning_rate": 0.00016934259609154592, + "loss": 0.2835, + "step": 4650 + }, + { + "epoch": 9.925373134328359, + "grad_norm": 0.29595887660980225, + "learning_rate": 0.000169253164057805, + "loss": 0.2845, + "step": 4655 + }, + { + "epoch": 9.936034115138593, + "grad_norm": 0.2958875894546509, + "learning_rate": 0.00016916362546114585, + "loss": 0.2793, + "step": 4660 + }, + { + "epoch": 9.946695095948828, + "grad_norm": 0.2999938726425171, + "learning_rate": 0.00016907398043934557, + "loss": 0.2794, + "step": 4665 + }, + { + "epoch": 9.957356076759062, + "grad_norm": 0.29154959321022034, + "learning_rate": 0.00016898422913034486, + "loss": 0.2891, + "step": 4670 + }, + { + "epoch": 9.968017057569297, + "grad_norm": 0.30298835039138794, + "learning_rate": 0.0001688943716722481, + "loss": 0.2859, + "step": 4675 + }, + { + "epoch": 9.978678038379531, + "grad_norm": 0.3251824975013733, + "learning_rate": 0.00016880440820332291, + "loss": 0.283, + "step": 4680 + }, + { + "epoch": 9.989339019189766, + "grad_norm": 0.29153597354888916, + "learning_rate": 0.0001687143388620001, + "loss": 0.2871, + "step": 4685 + }, + { + "epoch": 10.0, + "grad_norm": 0.3233014643192291, + "learning_rate": 0.0001686241637868734, + "loss": 0.2853, + "step": 4690 + }, + { + "epoch": 10.0, + "eval_loss": 0.5920408368110657, + "eval_runtime": 377.5422, + "eval_samples_per_second": 1.091, + "eval_steps_per_second": 1.091, + "step": 4690 + }, + { + "epoch": 10.0, + "step": 4690, + "total_flos": 3.4794514845867704e+18, + "train_loss": 0.46453510172077334, + "train_runtime": 112907.655, + "train_samples_per_second": 0.997, + "train_steps_per_second": 0.125 + } + ], + "logging_steps": 5, + "max_steps": 14070, + "num_input_tokens_seen": 0, + "num_train_epochs": 30, + "save_steps": 50, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.4794514845867704e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}