{
  "best_metric": 0.5016890168190002,
  "best_model_checkpoint": "data/hansken_human_hql_v3/checkpoint-2345",
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 4690,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0021321961620469083,
      "grad_norm": 1.0516366958618164,
      "learning_rate": 1.4214641080312722e-07,
      "loss": 1.9389,
      "step": 1
    },
    {
      "epoch": 0.010660980810234541,
      "grad_norm": 0.9856139421463013,
      "learning_rate": 7.107320540156362e-07,
      "loss": 2.0398,
      "step": 5
    },
    {
      "epoch": 0.021321961620469083,
      "grad_norm": 1.0568891763687134,
      "learning_rate": 1.4214641080312723e-06,
      "loss": 2.0618,
      "step": 10
    },
    {
      "epoch": 0.031982942430703626,
      "grad_norm": 0.9998515844345093,
      "learning_rate": 2.132196162046908e-06,
      "loss": 2.0543,
      "step": 15
    },
    {
      "epoch": 0.042643923240938165,
      "grad_norm": 1.004911184310913,
      "learning_rate": 2.8429282160625447e-06,
      "loss": 1.9997,
      "step": 20
    },
    {
      "epoch": 0.053304904051172705,
      "grad_norm": 0.9931671619415283,
      "learning_rate": 3.553660270078181e-06,
      "loss": 1.9913,
      "step": 25
    },
    {
      "epoch": 0.06396588486140725,
      "grad_norm": 0.9859012365341187,
      "learning_rate": 4.264392324093816e-06,
      "loss": 1.9729,
      "step": 30
    },
    {
      "epoch": 0.07462686567164178,
      "grad_norm": 1.0391347408294678,
      "learning_rate": 4.975124378109453e-06,
      "loss": 1.9434,
      "step": 35
    },
    {
      "epoch": 0.08528784648187633,
      "grad_norm": 0.8275197744369507,
      "learning_rate": 5.685856432125089e-06,
      "loss": 1.9092,
      "step": 40
    },
    {
      "epoch": 0.09594882729211088,
      "grad_norm": 0.7102633714675903,
      "learning_rate": 6.396588486140726e-06,
      "loss": 1.8488,
      "step": 45
    },
    {
      "epoch": 0.10660980810234541,
      "grad_norm": 0.6521381735801697,
      "learning_rate": 7.107320540156362e-06,
      "loss": 1.8673,
      "step": 50
    },
    {
      "epoch": 0.11727078891257996,
      "grad_norm": 0.5477872490882874,
      "learning_rate": 7.818052594171997e-06,
      "loss": 1.7758,
      "step": 55
    },
    {
      "epoch": 0.1279317697228145,
      "grad_norm": 0.49889788031578064,
      "learning_rate": 8.528784648187633e-06,
      "loss": 1.7453,
      "step": 60
    },
    {
      "epoch": 0.13859275053304904,
      "grad_norm": 0.5726047158241272,
      "learning_rate": 9.23951670220327e-06,
      "loss": 1.7635,
      "step": 65
    },
    {
      "epoch": 0.14925373134328357,
      "grad_norm": 0.4760012924671173,
      "learning_rate": 9.950248756218906e-06,
      "loss": 1.7027,
      "step": 70
    },
    {
      "epoch": 0.15991471215351813,
      "grad_norm": 0.4642033278942108,
      "learning_rate": 1.0660980810234541e-05,
      "loss": 1.7086,
      "step": 75
    },
    {
      "epoch": 0.17057569296375266,
      "grad_norm": 0.42560943961143494,
      "learning_rate": 1.1371712864250179e-05,
      "loss": 1.638,
      "step": 80
    },
    {
      "epoch": 0.1812366737739872,
      "grad_norm": 0.4680778384208679,
      "learning_rate": 1.2082444918265814e-05,
      "loss": 1.6029,
      "step": 85
    },
    {
      "epoch": 0.19189765458422176,
      "grad_norm": 0.4264519214630127,
      "learning_rate": 1.2793176972281452e-05,
      "loss": 1.4899,
      "step": 90
    },
    {
      "epoch": 0.2025586353944563,
      "grad_norm": 0.41101664304733276,
      "learning_rate": 1.3503909026297087e-05,
      "loss": 1.4997,
      "step": 95
    },
    {
      "epoch": 0.21321961620469082,
      "grad_norm": 0.34257784485816956,
      "learning_rate": 1.4214641080312725e-05,
      "loss": 1.4734,
      "step": 100
    },
    {
      "epoch": 0.22388059701492538,
      "grad_norm": 0.34164702892303467,
      "learning_rate": 1.4925373134328357e-05,
      "loss": 1.4341,
      "step": 105
    },
    {
      "epoch": 0.2345415778251599,
      "grad_norm": 0.3285938501358032,
      "learning_rate": 1.5636105188343994e-05,
      "loss": 1.4293,
      "step": 110
    },
    {
      "epoch": 0.24520255863539445,
      "grad_norm": 0.33409905433654785,
      "learning_rate": 1.634683724235963e-05,
      "loss": 1.3792,
      "step": 115
    },
    {
      "epoch": 0.255863539445629,
      "grad_norm": 0.3385579288005829,
      "learning_rate": 1.7057569296375266e-05,
      "loss": 1.3811,
      "step": 120
    },
    {
      "epoch": 0.26652452025586354,
      "grad_norm": 0.35849225521087646,
      "learning_rate": 1.7768301350390903e-05,
      "loss": 1.3217,
      "step": 125
    },
    {
      "epoch": 0.2771855010660981,
      "grad_norm": 0.3905642330646515,
      "learning_rate": 1.847903340440654e-05,
      "loss": 1.2792,
      "step": 130
    },
    {
      "epoch": 0.2878464818763326,
      "grad_norm": 0.45816823840141296,
      "learning_rate": 1.9189765458422178e-05,
      "loss": 1.268,
      "step": 135
    },
    {
      "epoch": 0.29850746268656714,
      "grad_norm": 0.42841047048568726,
      "learning_rate": 1.990049751243781e-05,
      "loss": 1.1999,
      "step": 140
    },
    {
      "epoch": 0.3091684434968017,
      "grad_norm": 0.42461100220680237,
      "learning_rate": 2.061122956645345e-05,
      "loss": 1.1908,
      "step": 145
    },
    {
      "epoch": 0.31982942430703626,
      "grad_norm": 0.3846851885318756,
      "learning_rate": 2.1321961620469083e-05,
      "loss": 1.0417,
      "step": 150
    },
    {
      "epoch": 0.3304904051172708,
      "grad_norm": 0.35793304443359375,
      "learning_rate": 2.203269367448472e-05,
      "loss": 1.0804,
      "step": 155
    },
    {
      "epoch": 0.3411513859275053,
      "grad_norm": 0.3422033488750458,
      "learning_rate": 2.2743425728500358e-05,
      "loss": 1.0433,
      "step": 160
    },
    {
      "epoch": 0.35181236673773986,
      "grad_norm": 0.34404265880584717,
      "learning_rate": 2.345415778251599e-05,
      "loss": 1.0823,
      "step": 165
    },
    {
      "epoch": 0.3624733475479744,
      "grad_norm": 0.31916388869285583,
      "learning_rate": 2.416488983653163e-05,
      "loss": 1.001,
      "step": 170
    },
    {
      "epoch": 0.373134328358209,
      "grad_norm": 0.33065563440322876,
      "learning_rate": 2.4875621890547266e-05,
      "loss": 0.9698,
      "step": 175
    },
    {
      "epoch": 0.3837953091684435,
      "grad_norm": 0.34518882632255554,
      "learning_rate": 2.5586353944562904e-05,
      "loss": 0.9731,
      "step": 180
    },
    {
      "epoch": 0.39445628997867804,
      "grad_norm": 0.31844091415405273,
      "learning_rate": 2.6297085998578534e-05,
      "loss": 0.9293,
      "step": 185
    },
    {
      "epoch": 0.4051172707889126,
      "grad_norm": 0.32537004351615906,
      "learning_rate": 2.7007818052594175e-05,
      "loss": 0.9306,
      "step": 190
    },
    {
      "epoch": 0.4157782515991471,
      "grad_norm": 0.38439956307411194,
      "learning_rate": 2.771855010660981e-05,
      "loss": 0.8915,
      "step": 195
    },
    {
      "epoch": 0.42643923240938164,
      "grad_norm": 0.3455168306827545,
      "learning_rate": 2.842928216062545e-05,
      "loss": 0.903,
      "step": 200
    },
    {
      "epoch": 0.43710021321961623,
      "grad_norm": 0.36652979254722595,
      "learning_rate": 2.914001421464108e-05,
      "loss": 0.8468,
      "step": 205
    },
    {
      "epoch": 0.44776119402985076,
      "grad_norm": 0.35580819845199585,
      "learning_rate": 2.9850746268656714e-05,
      "loss": 0.8467,
      "step": 210
    },
    {
      "epoch": 0.4584221748400853,
      "grad_norm": 0.3748577833175659,
      "learning_rate": 3.056147832267235e-05,
      "loss": 0.8037,
      "step": 215
    },
    {
      "epoch": 0.4690831556503198,
      "grad_norm": 0.3399907052516937,
      "learning_rate": 3.127221037668799e-05,
      "loss": 0.8525,
      "step": 220
    },
    {
      "epoch": 0.47974413646055436,
      "grad_norm": 0.39041897654533386,
      "learning_rate": 3.1982942430703626e-05,
      "loss": 0.8672,
      "step": 225
    },
    {
      "epoch": 0.4904051172707889,
      "grad_norm": 0.37930938601493835,
      "learning_rate": 3.269367448471926e-05,
      "loss": 0.7967,
      "step": 230
    },
    {
      "epoch": 0.5010660980810234,
      "grad_norm": 0.4009639024734497,
      "learning_rate": 3.34044065387349e-05,
      "loss": 0.8134,
      "step": 235
    },
    {
      "epoch": 0.511727078891258,
      "grad_norm": 0.4189032018184662,
      "learning_rate": 3.411513859275053e-05,
      "loss": 0.791,
      "step": 240
    },
    {
      "epoch": 0.5223880597014925,
      "grad_norm": 0.3848344385623932,
      "learning_rate": 3.4825870646766175e-05,
      "loss": 0.8183,
      "step": 245
    },
    {
      "epoch": 0.5330490405117271,
      "grad_norm": 0.41223597526550293,
      "learning_rate": 3.5536602700781806e-05,
      "loss": 0.7668,
      "step": 250
    },
    {
      "epoch": 0.5437100213219617,
      "grad_norm": 0.4024832844734192,
      "learning_rate": 3.624733475479744e-05,
      "loss": 0.7819,
      "step": 255
    },
    {
      "epoch": 0.5543710021321961,
      "grad_norm": 0.3832787871360779,
      "learning_rate": 3.695806680881308e-05,
      "loss": 0.7693,
      "step": 260
    },
    {
      "epoch": 0.5650319829424307,
      "grad_norm": 0.4266470670700073,
      "learning_rate": 3.766879886282871e-05,
      "loss": 0.795,
      "step": 265
    },
    {
      "epoch": 0.5756929637526652,
      "grad_norm": 0.47055262327194214,
      "learning_rate": 3.8379530916844355e-05,
      "loss": 0.7752,
      "step": 270
    },
    {
      "epoch": 0.5863539445628998,
      "grad_norm": 0.420669823884964,
      "learning_rate": 3.9090262970859986e-05,
      "loss": 0.7691,
      "step": 275
    },
    {
      "epoch": 0.5970149253731343,
      "grad_norm": 0.4140627384185791,
      "learning_rate": 3.980099502487562e-05,
      "loss": 0.7385,
      "step": 280
    },
    {
      "epoch": 0.6076759061833689,
      "grad_norm": 0.4674805998802185,
      "learning_rate": 4.051172707889126e-05,
      "loss": 0.7668,
      "step": 285
    },
    {
      "epoch": 0.6183368869936035,
      "grad_norm": 0.45881038904190063,
      "learning_rate": 4.12224591329069e-05,
      "loss": 0.7777,
      "step": 290
    },
    {
      "epoch": 0.6289978678038379,
      "grad_norm": 0.4218686819076538,
      "learning_rate": 4.1933191186922535e-05,
      "loss": 0.7106,
      "step": 295
    },
    {
      "epoch": 0.6396588486140725,
      "grad_norm": 0.43359580636024475,
      "learning_rate": 4.2643923240938166e-05,
      "loss": 0.7076,
      "step": 300
    },
    {
      "epoch": 0.650319829424307,
      "grad_norm": 0.42106226086616516,
      "learning_rate": 4.33546552949538e-05,
      "loss": 0.7353,
      "step": 305
    },
    {
      "epoch": 0.6609808102345416,
      "grad_norm": 0.4189695715904236,
      "learning_rate": 4.406538734896944e-05,
      "loss": 0.698,
      "step": 310
    },
    {
      "epoch": 0.6716417910447762,
      "grad_norm": 0.45314905047416687,
      "learning_rate": 4.477611940298508e-05,
      "loss": 0.7356,
      "step": 315
    },
    {
      "epoch": 0.6823027718550106,
      "grad_norm": 0.46034571528434753,
      "learning_rate": 4.5486851457000715e-05,
      "loss": 0.7397,
      "step": 320
    },
    {
      "epoch": 0.6929637526652452,
      "grad_norm": 0.44907087087631226,
      "learning_rate": 4.619758351101635e-05,
      "loss": 0.7326,
      "step": 325
    },
    {
      "epoch": 0.7036247334754797,
      "grad_norm": 0.46258679032325745,
      "learning_rate": 4.690831556503198e-05,
      "loss": 0.6663,
      "step": 330
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.446308434009552,
      "learning_rate": 4.761904761904762e-05,
      "loss": 0.6941,
      "step": 335
    },
    {
      "epoch": 0.7249466950959488,
      "grad_norm": 0.40378594398498535,
      "learning_rate": 4.832977967306326e-05,
      "loss": 0.7174,
      "step": 340
    },
    {
      "epoch": 0.7356076759061834,
      "grad_norm": 0.39717379212379456,
      "learning_rate": 4.904051172707889e-05,
      "loss": 0.659,
      "step": 345
    },
    {
      "epoch": 0.746268656716418,
      "grad_norm": 0.4855833053588867,
      "learning_rate": 4.975124378109453e-05,
      "loss": 0.6762,
      "step": 350
    },
    {
      "epoch": 0.7569296375266524,
      "grad_norm": 0.47973328828811646,
      "learning_rate": 5.046197583511016e-05,
      "loss": 0.6782,
      "step": 355
    },
    {
      "epoch": 0.767590618336887,
      "grad_norm": 0.4429256319999695,
      "learning_rate": 5.117270788912581e-05,
      "loss": 0.6634,
      "step": 360
    },
    {
      "epoch": 0.7782515991471215,
      "grad_norm": 0.44692516326904297,
      "learning_rate": 5.1883439943141444e-05,
      "loss": 0.6792,
      "step": 365
    },
    {
      "epoch": 0.7889125799573561,
      "grad_norm": 0.4430787265300751,
      "learning_rate": 5.259417199715707e-05,
      "loss": 0.6416,
      "step": 370
    },
    {
      "epoch": 0.7995735607675906,
      "grad_norm": 0.4461454451084137,
      "learning_rate": 5.330490405117271e-05,
      "loss": 0.7013,
      "step": 375
    },
    {
      "epoch": 0.8102345415778252,
      "grad_norm": 0.526995837688446,
      "learning_rate": 5.401563610518835e-05,
      "loss": 0.6396,
      "step": 380
    },
    {
      "epoch": 0.8208955223880597,
      "grad_norm": 0.4485580623149872,
      "learning_rate": 5.472636815920398e-05,
      "loss": 0.6307,
      "step": 385
    },
    {
      "epoch": 0.8315565031982942,
      "grad_norm": 0.45416155457496643,
      "learning_rate": 5.543710021321962e-05,
      "loss": 0.6361,
      "step": 390
    },
    {
      "epoch": 0.8422174840085288,
      "grad_norm": 0.4746207296848297,
      "learning_rate": 5.6147832267235255e-05,
      "loss": 0.641,
      "step": 395
    },
    {
      "epoch": 0.8528784648187633,
      "grad_norm": 0.4466172456741333,
      "learning_rate": 5.68585643212509e-05,
      "loss": 0.643,
      "step": 400
    },
    {
      "epoch": 0.8635394456289979,
      "grad_norm": 0.46807265281677246,
      "learning_rate": 5.756929637526652e-05,
      "loss": 0.6258,
      "step": 405
    },
    {
      "epoch": 0.8742004264392325,
      "grad_norm": 0.46169164776802063,
      "learning_rate": 5.828002842928216e-05,
      "loss": 0.6212,
      "step": 410
    },
    {
      "epoch": 0.8848614072494669,
      "grad_norm": 0.47564077377319336,
      "learning_rate": 5.8990760483297804e-05,
      "loss": 0.6369,
      "step": 415
    },
    {
      "epoch": 0.8955223880597015,
      "grad_norm": 0.4582447409629822,
      "learning_rate": 5.970149253731343e-05,
      "loss": 0.6086,
      "step": 420
    },
    {
      "epoch": 0.906183368869936,
      "grad_norm": 0.5161389708518982,
      "learning_rate": 6.041222459132907e-05,
      "loss": 0.6529,
      "step": 425
    },
    {
      "epoch": 0.9168443496801706,
      "grad_norm": 0.47045719623565674,
      "learning_rate": 6.11229566453447e-05,
      "loss": 0.6119,
      "step": 430
    },
    {
      "epoch": 0.9275053304904051,
      "grad_norm": 0.5950572490692139,
      "learning_rate": 6.183368869936035e-05,
      "loss": 0.6259,
      "step": 435
    },
    {
      "epoch": 0.9381663113006397,
      "grad_norm": 0.5470284223556519,
      "learning_rate": 6.254442075337598e-05,
      "loss": 0.6282,
      "step": 440
    },
    {
      "epoch": 0.9488272921108742,
      "grad_norm": 0.5164011716842651,
      "learning_rate": 6.325515280739162e-05,
      "loss": 0.6399,
      "step": 445
    },
    {
      "epoch": 0.9594882729211087,
      "grad_norm": 0.4264001250267029,
      "learning_rate": 6.396588486140725e-05,
      "loss": 0.6405,
      "step": 450
    },
    {
      "epoch": 0.9701492537313433,
      "grad_norm": 0.4878412187099457,
      "learning_rate": 6.46766169154229e-05,
      "loss": 0.6548,
      "step": 455
    },
    {
      "epoch": 0.9808102345415778,
      "grad_norm": 0.47677186131477356,
      "learning_rate": 6.538734896943853e-05,
      "loss": 0.6506,
      "step": 460
    },
    {
      "epoch": 0.9914712153518124,
      "grad_norm": 0.4687974452972412,
      "learning_rate": 6.609808102345416e-05,
      "loss": 0.6267,
      "step": 465
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.6078405976295471,
      "eval_runtime": 377.5565,
      "eval_samples_per_second": 1.091,
      "eval_steps_per_second": 1.091,
      "step": 469
    },
    {
      "epoch": 1.0021321961620469,
      "grad_norm": 0.4401796758174896,
      "learning_rate": 6.68088130774698e-05,
      "loss": 0.5968,
      "step": 470
    },
    {
      "epoch": 1.0127931769722816,
      "grad_norm": 0.8371634483337402,
      "learning_rate": 6.751954513148543e-05,
      "loss": 0.5923,
      "step": 475
    },
    {
      "epoch": 1.023454157782516,
      "grad_norm": 0.49846479296684265,
      "learning_rate": 6.823027718550106e-05,
      "loss": 0.6835,
      "step": 480
    },
    {
      "epoch": 1.0341151385927505,
      "grad_norm": 0.5845323801040649,
      "learning_rate": 6.89410092395167e-05,
      "loss": 0.5906,
      "step": 485
    },
    {
      "epoch": 1.044776119402985,
      "grad_norm": 0.5639384984970093,
      "learning_rate": 6.965174129353235e-05,
      "loss": 0.5881,
      "step": 490
    },
    {
      "epoch": 1.0554371002132197,
      "grad_norm": 0.5082396268844604,
      "learning_rate": 7.036247334754798e-05,
      "loss": 0.6224,
      "step": 495
    },
    {
      "epoch": 1.0660980810234542,
      "grad_norm": 0.5611528158187866,
      "learning_rate": 7.107320540156361e-05,
      "loss": 0.5643,
      "step": 500
    },
    {
      "epoch": 1.0767590618336886,
      "grad_norm": 0.7102047801017761,
      "learning_rate": 7.178393745557926e-05,
      "loss": 0.5814,
      "step": 505
    },
    {
      "epoch": 1.0874200426439233,
      "grad_norm": 0.46847936511039734,
      "learning_rate": 7.249466950959489e-05,
      "loss": 0.5642,
      "step": 510
    },
    {
      "epoch": 1.0980810234541578,
      "grad_norm": 0.47119173407554626,
      "learning_rate": 7.320540156361052e-05,
      "loss": 0.5674,
      "step": 515
    },
    {
      "epoch": 1.1087420042643923,
      "grad_norm": 1.0005890130996704,
      "learning_rate": 7.391613361762616e-05,
      "loss": 0.5949,
      "step": 520
    },
    {
      "epoch": 1.1194029850746268,
      "grad_norm": 0.7785916924476624,
      "learning_rate": 7.46268656716418e-05,
      "loss": 0.5643,
      "step": 525
    },
    {
      "epoch": 1.1300639658848615,
      "grad_norm": 0.6393773555755615,
      "learning_rate": 7.533759772565742e-05,
      "loss": 0.5886,
      "step": 530
    },
    {
      "epoch": 1.140724946695096,
      "grad_norm": 0.6369247436523438,
      "learning_rate": 7.604832977967307e-05,
      "loss": 0.58,
      "step": 535
    },
    {
      "epoch": 1.1513859275053304,
      "grad_norm": 0.48704272508621216,
      "learning_rate": 7.675906183368871e-05,
      "loss": 0.6125,
      "step": 540
    },
    {
      "epoch": 1.1620469083155651,
      "grad_norm": 0.5542349219322205,
      "learning_rate": 7.746979388770433e-05,
      "loss": 0.5688,
      "step": 545
    },
    {
      "epoch": 1.1727078891257996,
      "grad_norm": 0.4632197618484497,
      "learning_rate": 7.818052594171997e-05,
      "loss": 0.5727,
      "step": 550
    },
    {
      "epoch": 1.183368869936034,
      "grad_norm": 0.40735307335853577,
      "learning_rate": 7.889125799573562e-05,
      "loss": 0.5704,
      "step": 555
    },
    {
      "epoch": 1.1940298507462686,
      "grad_norm": 0.45803022384643555,
      "learning_rate": 7.960199004975125e-05,
      "loss": 0.6041,
      "step": 560
    },
    {
      "epoch": 1.2046908315565032,
      "grad_norm": 0.47275593876838684,
      "learning_rate": 8.031272210376688e-05,
      "loss": 0.5476,
      "step": 565
    },
    {
      "epoch": 1.2153518123667377,
      "grad_norm": 0.4402256906032562,
      "learning_rate": 8.102345415778252e-05,
      "loss": 0.6101,
      "step": 570
    },
    {
      "epoch": 1.2260127931769722,
      "grad_norm": 0.4577506184577942,
      "learning_rate": 8.173418621179815e-05,
      "loss": 0.6021,
      "step": 575
    },
    {
      "epoch": 1.236673773987207,
      "grad_norm": 0.4695811867713928,
      "learning_rate": 8.24449182658138e-05,
      "loss": 0.5843,
      "step": 580
    },
    {
      "epoch": 1.2473347547974414,
      "grad_norm": 0.5012730360031128,
      "learning_rate": 8.315565031982943e-05,
      "loss": 0.5963,
      "step": 585
    },
    {
      "epoch": 1.2579957356076759,
      "grad_norm": 0.4261506199836731,
      "learning_rate": 8.386638237384507e-05,
      "loss": 0.5608,
      "step": 590
    },
    {
      "epoch": 1.2686567164179103,
      "grad_norm": 0.48886266350746155,
      "learning_rate": 8.45771144278607e-05,
      "loss": 0.5768,
      "step": 595
    },
    {
      "epoch": 1.279317697228145,
      "grad_norm": 0.4756333529949188,
      "learning_rate": 8.528784648187633e-05,
      "loss": 0.5581,
      "step": 600
    },
    {
      "epoch": 1.2899786780383795,
      "grad_norm": 0.4242517054080963,
      "learning_rate": 8.599857853589198e-05,
      "loss": 0.5436,
      "step": 605
    },
    {
      "epoch": 1.3006396588486142,
      "grad_norm": 0.44590556621551514,
      "learning_rate": 8.67093105899076e-05,
      "loss": 0.5821,
      "step": 610
    },
    {
      "epoch": 1.3113006396588487,
      "grad_norm": 0.4373833239078522,
      "learning_rate": 8.742004264392325e-05,
      "loss": 0.544,
      "step": 615
    },
    {
      "epoch": 1.3219616204690832,
      "grad_norm": 0.42627617716789246,
      "learning_rate": 8.813077469793888e-05,
      "loss": 0.5417,
      "step": 620
    },
    {
      "epoch": 1.3326226012793176,
      "grad_norm": 0.516544759273529,
      "learning_rate": 8.884150675195451e-05,
      "loss": 0.573,
      "step": 625
    },
    {
      "epoch": 1.3432835820895521,
      "grad_norm": 0.4419044256210327,
      "learning_rate": 8.955223880597016e-05,
      "loss": 0.5523,
      "step": 630
    },
    {
      "epoch": 1.3539445628997868,
      "grad_norm": 0.4533810019493103,
      "learning_rate": 9.026297085998579e-05,
      "loss": 0.5372,
      "step": 635
    },
    {
      "epoch": 1.3646055437100213,
      "grad_norm": 0.4296520948410034,
      "learning_rate": 9.097370291400143e-05,
      "loss": 0.5742,
      "step": 640
    },
    {
      "epoch": 1.375266524520256,
      "grad_norm": 0.4285917282104492,
      "learning_rate": 9.168443496801706e-05,
      "loss": 0.5577,
      "step": 645
    },
    {
      "epoch": 1.3859275053304905,
      "grad_norm": 0.41438210010528564,
      "learning_rate": 9.23951670220327e-05,
      "loss": 0.5659,
      "step": 650
    },
    {
      "epoch": 1.396588486140725,
      "grad_norm": 0.43702948093414307,
      "learning_rate": 9.310589907604834e-05,
      "loss": 0.5425,
      "step": 655
    },
    {
      "epoch": 1.4072494669509594,
      "grad_norm": 0.520577609539032,
      "learning_rate": 9.381663113006397e-05,
      "loss": 0.5624,
      "step": 660
    },
    {
      "epoch": 1.417910447761194,
      "grad_norm": 0.451948881149292,
      "learning_rate": 9.452736318407961e-05,
      "loss": 0.5598,
      "step": 665
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.4748338460922241,
      "learning_rate": 9.523809523809524e-05,
      "loss": 0.6579,
      "step": 670
    },
    {
      "epoch": 1.439232409381663,
      "grad_norm": 0.4351726472377777,
      "learning_rate": 9.594882729211087e-05,
      "loss": 0.541,
      "step": 675
    },
    {
      "epoch": 1.4498933901918978,
      "grad_norm": 0.4322686493396759,
      "learning_rate": 9.665955934612652e-05,
      "loss": 0.5941,
      "step": 680
    },
    {
      "epoch": 1.4605543710021323,
      "grad_norm": 0.43369051814079285,
      "learning_rate": 9.737029140014216e-05,
      "loss": 0.5862,
      "step": 685
    },
    {
      "epoch": 1.4712153518123667,
      "grad_norm": 0.5028679966926575,
      "learning_rate": 9.808102345415778e-05,
      "loss": 0.5444,
      "step": 690
    },
    {
      "epoch": 1.4818763326226012,
      "grad_norm": 0.4060784578323364,
      "learning_rate": 9.879175550817342e-05,
      "loss": 0.549,
      "step": 695
    },
    {
      "epoch": 1.4925373134328357,
      "grad_norm": 0.4283974766731262,
      "learning_rate": 9.950248756218906e-05,
      "loss": 0.5474,
      "step": 700
    },
    {
      "epoch": 1.5031982942430704,
      "grad_norm": 0.3743923008441925,
      "learning_rate": 0.0001002132196162047,
      "loss": 0.5394,
      "step": 705
    },
    {
      "epoch": 1.5138592750533049,
      "grad_norm": 0.44469088315963745,
      "learning_rate": 0.00010092395167022033,
      "loss": 0.5563,
      "step": 710
    },
    {
      "epoch": 1.5245202558635396,
      "grad_norm": 0.43209415674209595,
      "learning_rate": 0.00010163468372423597,
      "loss": 0.5803,
      "step": 715
    },
    {
      "epoch": 1.535181236673774,
      "grad_norm": 0.4075677990913391,
      "learning_rate": 0.00010234541577825161,
      "loss": 0.5369,
      "step": 720
    },
    {
      "epoch": 1.5458422174840085,
      "grad_norm": 0.4084095358848572,
      "learning_rate": 0.00010305614783226724,
      "loss": 0.5687,
      "step": 725
    },
    {
      "epoch": 1.556503198294243,
      "grad_norm": 0.4053703248500824,
      "learning_rate": 0.00010376687988628289,
      "loss": 0.5301,
      "step": 730
    },
    {
      "epoch": 1.5671641791044775,
      "grad_norm": 0.46452564001083374,
      "learning_rate": 0.0001044776119402985,
      "loss": 0.5823,
      "step": 735
    },
    {
      "epoch": 1.5778251599147122,
      "grad_norm": 0.4020977020263672,
      "learning_rate": 0.00010518834399431414,
      "loss": 0.5463,
      "step": 740
    },
    {
      "epoch": 1.5884861407249466,
      "grad_norm": 0.3993551433086395,
      "learning_rate": 0.00010589907604832978,
      "loss": 0.5551,
      "step": 745
    },
    {
      "epoch": 1.5991471215351813,
      "grad_norm": 0.4211786985397339,
      "learning_rate": 0.00010660980810234542,
      "loss": 0.5607,
      "step": 750
    },
    {
      "epoch": 1.6098081023454158,
      "grad_norm": 0.4241097867488861,
      "learning_rate": 0.00010732054015636106,
      "loss": 0.5402,
      "step": 755
    },
    {
      "epoch": 1.6204690831556503,
      "grad_norm": 0.3934391736984253,
      "learning_rate": 0.0001080312722103767,
      "loss": 0.5618,
      "step": 760
    },
    {
      "epoch": 1.6311300639658848,
      "grad_norm": 0.37157073616981506,
      "learning_rate": 0.00010874200426439234,
      "loss": 0.5232,
      "step": 765
    },
    {
      "epoch": 1.6417910447761193,
      "grad_norm": 0.4151962399482727,
      "learning_rate": 0.00010945273631840796,
      "loss": 0.563,
      "step": 770
    },
    {
      "epoch": 1.652452025586354,
      "grad_norm": 0.42233771085739136,
      "learning_rate": 0.00011016346837242359,
      "loss": 0.5667,
      "step": 775
    },
    {
      "epoch": 1.6631130063965884,
      "grad_norm": 0.3891717493534088,
      "learning_rate": 0.00011087420042643924,
      "loss": 0.582,
      "step": 780
    },
    {
      "epoch": 1.6737739872068231,
      "grad_norm": 0.4017283618450165,
      "learning_rate": 0.00011158493248045488,
      "loss": 0.5386,
      "step": 785
    },
    {
      "epoch": 1.6844349680170576,
      "grad_norm": 0.4058316648006439,
      "learning_rate": 0.00011229566453447051,
      "loss": 0.5357,
      "step": 790
    },
    {
      "epoch": 1.695095948827292,
      "grad_norm": 0.38968625664711,
      "learning_rate": 0.00011300639658848615,
      "loss": 0.527,
      "step": 795
    },
    {
      "epoch": 1.7057569296375266,
      "grad_norm": 0.4108840525150299,
      "learning_rate": 0.0001137171286425018,
      "loss": 0.5347,
      "step": 800
    },
    {
      "epoch": 1.716417910447761,
      "grad_norm": 0.37222376465797424,
      "learning_rate": 0.00011442786069651741,
      "loss": 0.524,
      "step": 805
    },
    {
      "epoch": 1.7270788912579957,
      "grad_norm": 0.4046708047389984,
      "learning_rate": 0.00011513859275053305,
      "loss": 0.5096,
      "step": 810
    },
    {
      "epoch": 1.7377398720682304,
      "grad_norm": 0.37089455127716064,
      "learning_rate": 0.00011584932480454869,
      "loss": 0.5316,
      "step": 815
    },
    {
      "epoch": 1.748400852878465,
      "grad_norm": 0.3895399272441864,
      "learning_rate": 0.00011656005685856432,
      "loss": 0.5274,
      "step": 820
    },
    {
      "epoch": 1.7590618336886994,
      "grad_norm": 0.3956606984138489,
      "learning_rate": 0.00011727078891257996,
      "loss": 0.5395,
      "step": 825
    },
    {
      "epoch": 1.7697228144989339,
      "grad_norm": 0.4023361802101135,
      "learning_rate": 0.00011798152096659561,
      "loss": 0.53,
      "step": 830
    },
    {
      "epoch": 1.7803837953091683,
      "grad_norm": 0.39323511719703674,
      "learning_rate": 0.00011869225302061124,
      "loss": 0.5341,
      "step": 835
    },
    {
      "epoch": 1.7910447761194028,
      "grad_norm": 0.3870689868927002,
      "learning_rate": 0.00011940298507462686,
      "loss": 0.5268,
      "step": 840
    },
    {
      "epoch": 1.8017057569296375,
      "grad_norm": 0.39864471554756165,
      "learning_rate": 0.0001201137171286425,
      "loss": 0.5754,
      "step": 845
    },
    {
      "epoch": 1.8123667377398722,
      "grad_norm": 0.413980633020401,
      "learning_rate": 0.00012082444918265814,
      "loss": 0.5274,
      "step": 850
    },
    {
      "epoch": 1.8230277185501067,
      "grad_norm": 0.3994651138782501,
      "learning_rate": 0.00012153518123667377,
      "loss": 0.5313,
      "step": 855
    },
    {
      "epoch": 1.8336886993603412,
      "grad_norm": 0.4106079041957855,
      "learning_rate": 0.0001222459132906894,
      "loss": 0.5293,
      "step": 860
    },
    {
      "epoch": 1.8443496801705757,
      "grad_norm": 0.38014471530914307,
      "learning_rate": 0.00012295664534470505,
      "loss": 0.5313,
      "step": 865
    },
    {
      "epoch": 1.8550106609808101,
      "grad_norm": 0.3477731943130493,
      "learning_rate": 0.0001236673773987207,
      "loss": 0.5499,
      "step": 870
    },
    {
      "epoch": 1.8656716417910446,
      "grad_norm": 0.3609556555747986,
      "learning_rate": 0.0001243781094527363,
      "loss": 0.5195,
      "step": 875
    },
    {
      "epoch": 1.8763326226012793,
      "grad_norm": 0.3532927334308624,
      "learning_rate": 0.00012508884150675195,
      "loss": 0.5233,
      "step": 880
    },
    {
      "epoch": 1.886993603411514,
      "grad_norm": 0.3663487434387207,
      "learning_rate": 0.0001257995735607676,
      "loss": 0.5129,
      "step": 885
    },
    {
      "epoch": 1.8976545842217485,
      "grad_norm": 0.35837364196777344,
      "learning_rate": 0.00012651030561478324,
      "loss": 0.5106,
      "step": 890
    },
    {
      "epoch": 1.908315565031983,
      "grad_norm": 0.38498660922050476,
      "learning_rate": 0.00012722103766879886,
      "loss": 0.5216,
      "step": 895
    },
    {
      "epoch": 1.9189765458422174,
      "grad_norm": 0.3501322269439697,
      "learning_rate": 0.0001279317697228145,
      "loss": 0.54,
      "step": 900
    },
    {
      "epoch": 1.929637526652452,
      "grad_norm": 0.34796684980392456,
      "learning_rate": 0.00012864250177683015,
      "loss": 0.5165,
      "step": 905
    },
    {
      "epoch": 1.9402985074626866,
      "grad_norm": 0.46670106053352356,
      "learning_rate": 0.0001293532338308458,
      "loss": 0.5437,
      "step": 910
    },
    {
      "epoch": 1.950959488272921,
      "grad_norm": 0.3535880148410797,
      "learning_rate": 0.0001300639658848614,
      "loss": 0.5561,
      "step": 915
    },
    {
      "epoch": 1.9616204690831558,
      "grad_norm": 0.3591325283050537,
      "learning_rate": 0.00013077469793887705,
      "loss": 0.5193,
      "step": 920
    },
    {
      "epoch": 1.9722814498933903,
      "grad_norm": 0.4969016909599304,
      "learning_rate": 0.00013148542999289267,
      "loss": 0.526,
      "step": 925
    },
    {
      "epoch": 1.9829424307036247,
      "grad_norm": 0.3567504584789276,
      "learning_rate": 0.00013219616204690831,
      "loss": 0.5063,
      "step": 930
    },
    {
      "epoch": 1.9936034115138592,
      "grad_norm": 0.3647787272930145,
      "learning_rate": 0.00013290689410092396,
      "loss": 0.5094,
      "step": 935
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.5335173606872559,
      "eval_runtime": 377.8765,
      "eval_samples_per_second": 1.09,
      "eval_steps_per_second": 1.09,
      "step": 938
    },
    {
      "epoch": 2.0042643923240937,
      "grad_norm": 0.34923797845840454,
      "learning_rate": 0.0001336176261549396,
      "loss": 0.5126,
      "step": 940
    },
    {
      "epoch": 2.014925373134328,
      "grad_norm": 0.4439273476600647,
      "learning_rate": 0.00013432835820895525,
      "loss": 0.5349,
      "step": 945
    },
    {
      "epoch": 2.025586353944563,
      "grad_norm": 0.35956764221191406,
      "learning_rate": 0.00013503909026297086,
      "loss": 0.493,
      "step": 950
    },
    {
      "epoch": 2.0362473347547976,
      "grad_norm": 0.3677864074707031,
      "learning_rate": 0.0001357498223169865,
      "loss": 0.523,
      "step": 955
    },
    {
      "epoch": 2.046908315565032,
      "grad_norm": 0.3486590087413788,
      "learning_rate": 0.00013646055437100213,
      "loss": 0.5322,
      "step": 960
    },
    {
      "epoch": 2.0575692963752665,
      "grad_norm": 0.3785991072654724,
      "learning_rate": 0.00013717128642501777,
      "loss": 0.4903,
      "step": 965
    },
    {
      "epoch": 2.068230277185501,
      "grad_norm": 0.3422692120075226,
      "learning_rate": 0.0001378820184790334,
      "loss": 0.5356,
      "step": 970
    },
    {
      "epoch": 2.0788912579957355,
      "grad_norm": 0.41184964776039124,
      "learning_rate": 0.00013859275053304906,
      "loss": 0.4969,
      "step": 975
    },
    {
      "epoch": 2.08955223880597,
      "grad_norm": 0.34267646074295044,
      "learning_rate": 0.0001393034825870647,
      "loss": 0.5113,
      "step": 980
    },
    {
      "epoch": 2.100213219616205,
      "grad_norm": 0.38112279772758484,
      "learning_rate": 0.00014001421464108032,
      "loss": 0.4793,
      "step": 985
    },
    {
      "epoch": 2.1108742004264394,
      "grad_norm": 0.33497291803359985,
      "learning_rate": 0.00014072494669509596,
      "loss": 0.5185,
      "step": 990
    },
    {
      "epoch": 2.121535181236674,
      "grad_norm": 0.37100210785865784,
      "learning_rate": 0.00014143567874911158,
      "loss": 0.5024,
      "step": 995
    },
    {
      "epoch": 2.1321961620469083,
      "grad_norm": 0.3079771101474762,
      "learning_rate": 0.00014214641080312722,
      "loss": 0.5066,
      "step": 1000
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 0.3615591824054718,
      "learning_rate": 0.00014285714285714287,
      "loss": 0.5157,
      "step": 1005
    },
    {
      "epoch": 2.1535181236673773,
      "grad_norm": 0.3394719958305359,
      "learning_rate": 0.0001435678749111585,
      "loss": 0.4906,
      "step": 1010
    },
    {
      "epoch": 2.1641791044776117,
      "grad_norm": 0.4234224557876587,
      "learning_rate": 0.00014427860696517416,
      "loss": 0.5015,
      "step": 1015
    },
    {
      "epoch": 2.1748400852878467,
      "grad_norm": 0.3535841107368469,
      "learning_rate": 0.00014498933901918977,
      "loss": 0.5107,
      "step": 1020
    },
    {
      "epoch": 2.185501066098081,
      "grad_norm": 0.41673514246940613,
      "learning_rate": 0.0001457000710732054,
      "loss": 0.505,
      "step": 1025
    },
    {
      "epoch": 2.1961620469083156,
      "grad_norm": 0.3521960973739624,
      "learning_rate": 0.00014641080312722103,
      "loss": 0.5339,
      "step": 1030
    },
    {
      "epoch": 2.20682302771855,
      "grad_norm": 0.341727614402771,
      "learning_rate": 0.00014712153518123668,
      "loss": 0.4897,
      "step": 1035
    },
    {
      "epoch": 2.2174840085287846,
      "grad_norm": 0.32079800963401794,
      "learning_rate": 0.00014783226723525232,
      "loss": 0.5049,
      "step": 1040
    },
    {
      "epoch": 2.228144989339019,
      "grad_norm": 0.34027552604675293,
      "learning_rate": 0.00014854299928926797,
      "loss": 0.4993,
      "step": 1045
    },
    {
      "epoch": 2.2388059701492535,
      "grad_norm": 0.34183624386787415,
      "learning_rate": 0.0001492537313432836,
      "loss": 0.51,
      "step": 1050
    },
    {
      "epoch": 2.2494669509594885,
      "grad_norm": 0.31983354687690735,
      "learning_rate": 0.00014996446339729923,
      "loss": 0.5084,
      "step": 1055
    },
    {
      "epoch": 2.260127931769723,
      "grad_norm": 0.3631596565246582,
      "learning_rate": 0.00015067519545131484,
      "loss": 0.4986,
      "step": 1060
    },
    {
      "epoch": 2.2707889125799574,
      "grad_norm": 0.32126784324645996,
      "learning_rate": 0.0001513859275053305,
      "loss": 0.4832,
      "step": 1065
    },
    {
      "epoch": 2.281449893390192,
      "grad_norm": 0.3390761911869049,
      "learning_rate": 0.00015209665955934613,
      "loss": 0.4972,
      "step": 1070
    },
    {
      "epoch": 2.2921108742004264,
      "grad_norm": 0.3330533504486084,
      "learning_rate": 0.00015280739161336178,
      "loss": 0.4772,
      "step": 1075
    },
    {
      "epoch": 2.302771855010661,
      "grad_norm": 0.3619351089000702,
      "learning_rate": 0.00015351812366737742,
      "loss": 0.5141,
      "step": 1080
    },
    {
      "epoch": 2.3134328358208958,
      "grad_norm": 0.3252182602882385,
      "learning_rate": 0.00015422885572139304,
      "loss": 0.5056,
      "step": 1085
    },
    {
      "epoch": 2.3240938166311302,
      "grad_norm": 0.3745068311691284,
      "learning_rate": 0.00015493958777540866,
      "loss": 0.5395,
      "step": 1090
    },
    {
      "epoch": 2.3347547974413647,
      "grad_norm": 0.38191962242126465,
      "learning_rate": 0.0001556503198294243,
      "loss": 0.4865,
      "step": 1095
    },
    {
      "epoch": 2.345415778251599,
      "grad_norm": 0.32218611240386963,
      "learning_rate": 0.00015636105188343994,
      "loss": 0.4955,
      "step": 1100
    },
    {
      "epoch": 2.3560767590618337,
      "grad_norm": 0.32240140438079834,
      "learning_rate": 0.0001570717839374556,
      "loss": 0.4972,
      "step": 1105
    },
    {
      "epoch": 2.366737739872068,
      "grad_norm": 0.37284377217292786,
      "learning_rate": 0.00015778251599147123,
      "loss": 0.4874,
      "step": 1110
    },
    {
      "epoch": 2.3773987206823026,
      "grad_norm": 0.350769579410553,
      "learning_rate": 0.00015849324804548688,
      "loss": 0.4931,
      "step": 1115
    },
    {
      "epoch": 2.388059701492537,
      "grad_norm": 0.3309812843799591,
      "learning_rate": 0.0001592039800995025,
      "loss": 0.5103,
      "step": 1120
    },
    {
      "epoch": 2.398720682302772,
      "grad_norm": 0.3497963547706604,
      "learning_rate": 0.0001599147121535181,
      "loss": 0.4864,
      "step": 1125
    },
    {
      "epoch": 2.4093816631130065,
      "grad_norm": 0.3567025661468506,
      "learning_rate": 0.00016062544420753375,
      "loss": 0.5461,
      "step": 1130
    },
    {
      "epoch": 2.420042643923241,
      "grad_norm": 0.5213941931724548,
      "learning_rate": 0.0001613361762615494,
      "loss": 0.5138,
      "step": 1135
    },
    {
      "epoch": 2.4307036247334755,
      "grad_norm": 0.32027000188827515,
      "learning_rate": 0.00016204690831556504,
      "loss": 0.5078,
      "step": 1140
    },
    {
      "epoch": 2.44136460554371,
      "grad_norm": 0.37092500925064087,
      "learning_rate": 0.00016275764036958069,
      "loss": 0.4903,
      "step": 1145
    },
    {
      "epoch": 2.4520255863539444,
      "grad_norm": 0.35545867681503296,
      "learning_rate": 0.0001634683724235963,
      "loss": 0.5131,
      "step": 1150
    },
    {
      "epoch": 2.4626865671641793,
      "grad_norm": 0.3277740776538849,
      "learning_rate": 0.00016417910447761195,
      "loss": 0.4814,
      "step": 1155
    },
    {
      "epoch": 2.473347547974414,
      "grad_norm": 0.3226880133152008,
      "learning_rate": 0.0001648898365316276,
      "loss": 0.4944,
      "step": 1160
    },
    {
      "epoch": 2.4840085287846483,
      "grad_norm": 0.3283137381076813,
      "learning_rate": 0.0001656005685856432,
      "loss": 0.5058,
      "step": 1165
    },
    {
      "epoch": 2.4946695095948828,
      "grad_norm": 0.38707828521728516,
      "learning_rate": 0.00016631130063965885,
      "loss": 0.5108,
      "step": 1170
    },
    {
      "epoch": 2.5053304904051172,
      "grad_norm": 0.3053881824016571,
      "learning_rate": 0.0001670220326936745,
      "loss": 0.4751,
      "step": 1175
    },
    {
      "epoch": 2.5159914712153517,
      "grad_norm": 0.29871490597724915,
      "learning_rate": 0.00016773276474769014,
      "loss": 0.4848,
      "step": 1180
    },
    {
      "epoch": 2.526652452025586,
      "grad_norm": 0.3135201930999756,
      "learning_rate": 0.00016844349680170576,
      "loss": 0.4852,
      "step": 1185
    },
    {
      "epoch": 2.5373134328358207,
      "grad_norm": 0.31287622451782227,
      "learning_rate": 0.0001691542288557214,
      "loss": 0.4804,
      "step": 1190
    },
    {
      "epoch": 2.5479744136460556,
      "grad_norm": 0.30184197425842285,
      "learning_rate": 0.00016986496090973705,
      "loss": 0.5006,
      "step": 1195
    },
    {
      "epoch": 2.55863539445629,
      "grad_norm": 0.29948562383651733,
      "learning_rate": 0.00017057569296375266,
      "loss": 0.4934,
      "step": 1200
    },
    {
      "epoch": 2.5692963752665245,
      "grad_norm": 0.29258280992507935,
      "learning_rate": 0.0001712864250177683,
      "loss": 0.4887,
      "step": 1205
    },
    {
      "epoch": 2.579957356076759,
      "grad_norm": 0.29767826199531555,
      "learning_rate": 0.00017199715707178395,
      "loss": 0.4958,
      "step": 1210
    },
    {
      "epoch": 2.5906183368869935,
      "grad_norm": 0.29649823904037476,
      "learning_rate": 0.0001727078891257996,
      "loss": 0.51,
      "step": 1215
    },
    {
      "epoch": 2.6012793176972284,
      "grad_norm": 0.30332130193710327,
      "learning_rate": 0.0001734186211798152,
      "loss": 0.4954,
      "step": 1220
    },
    {
      "epoch": 2.611940298507463,
      "grad_norm": 0.3551209270954132,
      "learning_rate": 0.00017412935323383086,
      "loss": 0.5088,
      "step": 1225
    },
    {
      "epoch": 2.6226012793176974,
      "grad_norm": 0.33677777647972107,
      "learning_rate": 0.0001748400852878465,
      "loss": 0.5248,
      "step": 1230
    },
    {
      "epoch": 2.633262260127932,
      "grad_norm": 0.29216548800468445,
      "learning_rate": 0.00017555081734186212,
      "loss": 0.4954,
      "step": 1235
    },
    {
      "epoch": 2.6439232409381663,
      "grad_norm": 0.32732442021369934,
      "learning_rate": 0.00017626154939587776,
      "loss": 0.5048,
      "step": 1240
    },
    {
      "epoch": 2.654584221748401,
      "grad_norm": 0.29788029193878174,
      "learning_rate": 0.0001769722814498934,
      "loss": 0.5056,
      "step": 1245
    },
    {
      "epoch": 2.6652452025586353,
      "grad_norm": 0.3407440185546875,
      "learning_rate": 0.00017768301350390902,
      "loss": 0.5385,
      "step": 1250
    },
    {
      "epoch": 2.6759061833688698,
      "grad_norm": 0.2790848910808563,
      "learning_rate": 0.00017839374555792467,
      "loss": 0.5014,
      "step": 1255
    },
    {
      "epoch": 2.6865671641791042,
      "grad_norm": 0.30173078179359436,
      "learning_rate": 0.0001791044776119403,
      "loss": 0.5118,
      "step": 1260
    },
    {
      "epoch": 2.697228144989339,
      "grad_norm": 0.2736753821372986,
      "learning_rate": 0.00017981520966595596,
      "loss": 0.5018,
      "step": 1265
    },
    {
      "epoch": 2.7078891257995736,
      "grad_norm": 0.2970294952392578,
      "learning_rate": 0.00018052594171997157,
      "loss": 0.4966,
      "step": 1270
    },
    {
      "epoch": 2.718550106609808,
      "grad_norm": 0.2721494138240814,
      "learning_rate": 0.00018123667377398722,
      "loss": 0.4746,
      "step": 1275
    },
    {
      "epoch": 2.7292110874200426,
      "grad_norm": 0.29144713282585144,
      "learning_rate": 0.00018194740582800286,
      "loss": 0.4739,
      "step": 1280
    },
    {
      "epoch": 2.739872068230277,
      "grad_norm": 0.3217550814151764,
      "learning_rate": 0.00018265813788201848,
      "loss": 0.4868,
      "step": 1285
    },
    {
      "epoch": 2.750533049040512,
      "grad_norm": 0.25847169756889343,
      "learning_rate": 0.00018336886993603412,
      "loss": 0.4664,
      "step": 1290
    },
    {
      "epoch": 2.7611940298507465,
      "grad_norm": 0.2917424142360687,
      "learning_rate": 0.00018407960199004977,
      "loss": 0.4659,
      "step": 1295
    },
    {
      "epoch": 2.771855010660981,
      "grad_norm": 0.29807865619659424,
      "learning_rate": 0.0001847903340440654,
      "loss": 0.4838,
      "step": 1300
    },
    {
      "epoch": 2.7825159914712154,
      "grad_norm": 0.28630420565605164,
      "learning_rate": 0.00018550106609808103,
      "loss": 0.4658,
      "step": 1305
    },
    {
      "epoch": 2.79317697228145,
      "grad_norm": 0.2946392595767975,
      "learning_rate": 0.00018621179815209667,
      "loss": 0.5037,
      "step": 1310
    },
    {
      "epoch": 2.8038379530916844,
      "grad_norm": 0.38894176483154297,
      "learning_rate": 0.0001869225302061123,
      "loss": 0.525,
      "step": 1315
    },
    {
      "epoch": 2.814498933901919,
      "grad_norm": 0.28793737292289734,
      "learning_rate": 0.00018763326226012793,
      "loss": 0.5238,
      "step": 1320
    },
    {
      "epoch": 2.8251599147121533,
      "grad_norm": 0.3103950023651123,
      "learning_rate": 0.00018834399431414358,
      "loss": 0.4932,
      "step": 1325
    },
    {
      "epoch": 2.835820895522388,
      "grad_norm": 0.2969878017902374,
      "learning_rate": 0.00018905472636815922,
      "loss": 0.4807,
      "step": 1330
    },
    {
      "epoch": 2.8464818763326227,
      "grad_norm": 0.2937600612640381,
      "learning_rate": 0.00018976545842217486,
      "loss": 0.4862,
      "step": 1335
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.2892070710659027,
      "learning_rate": 0.00019047619047619048,
      "loss": 0.526,
      "step": 1340
    },
    {
      "epoch": 2.8678038379530917,
      "grad_norm": 0.28446847200393677,
      "learning_rate": 0.00019118692253020613,
      "loss": 0.4846,
      "step": 1345
    },
    {
      "epoch": 2.878464818763326,
      "grad_norm": 0.2877322733402252,
      "learning_rate": 0.00019189765458422174,
      "loss": 0.4759,
      "step": 1350
    },
    {
      "epoch": 2.8891257995735606,
      "grad_norm": 0.2837788462638855,
      "learning_rate": 0.0001926083866382374,
      "loss": 0.4894,
      "step": 1355
    },
    {
      "epoch": 2.8997867803837956,
      "grad_norm": 0.3020360469818115,
      "learning_rate": 0.00019331911869225303,
      "loss": 0.4936,
      "step": 1360
    },
    {
      "epoch": 2.91044776119403,
      "grad_norm": 0.28344911336898804,
      "learning_rate": 0.00019402985074626867,
      "loss": 0.4881,
      "step": 1365
    },
    {
      "epoch": 2.9211087420042645,
      "grad_norm": 0.2753186821937561,
      "learning_rate": 0.00019474058280028432,
      "loss": 0.4826,
      "step": 1370
    },
    {
      "epoch": 2.931769722814499,
      "grad_norm": 0.2922317385673523,
      "learning_rate": 0.00019545131485429994,
      "loss": 0.4759,
      "step": 1375
    },
    {
      "epoch": 2.9424307036247335,
      "grad_norm": 0.3179524540901184,
      "learning_rate": 0.00019616204690831555,
      "loss": 0.4883,
      "step": 1380
    },
    {
      "epoch": 2.953091684434968,
      "grad_norm": 0.2944222688674927,
      "learning_rate": 0.0001968727789623312,
      "loss": 0.4804,
      "step": 1385
    },
    {
      "epoch": 2.9637526652452024,
      "grad_norm": 0.2687291204929352,
      "learning_rate": 0.00019758351101634684,
      "loss": 0.4891,
      "step": 1390
    },
    {
      "epoch": 2.974413646055437,
      "grad_norm": 0.25935596227645874,
      "learning_rate": 0.00019829424307036249,
      "loss": 0.4902,
      "step": 1395
    },
    {
      "epoch": 2.9850746268656714,
      "grad_norm": 0.30086612701416016,
      "learning_rate": 0.00019900497512437813,
      "loss": 0.4942,
      "step": 1400
    },
    {
      "epoch": 2.9957356076759063,
      "grad_norm": 0.2930257022380829,
      "learning_rate": 0.00019971570717839377,
      "loss": 0.513,
      "step": 1405
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.5142309069633484,
      "eval_runtime": 377.5199,
      "eval_samples_per_second": 1.091,
      "eval_steps_per_second": 1.091,
      "step": 1407
    },
    {
      "epoch": 3.0063965884861408,
      "grad_norm": 0.28208208084106445,
      "learning_rate": 0.00019999997230259856,
      "loss": 0.467,
      "step": 1410
    },
    {
      "epoch": 3.0170575692963753,
      "grad_norm": 0.290385365486145,
      "learning_rate": 0.00019999980304075655,
      "loss": 0.44,
      "step": 1415
    },
    {
      "epoch": 3.0277185501066097,
      "grad_norm": 0.27436771988868713,
      "learning_rate": 0.00019999947990477788,
      "loss": 0.4876,
      "step": 1420
    },
    {
      "epoch": 3.038379530916844,
      "grad_norm": 0.2883841395378113,
      "learning_rate": 0.00019999900289515975,
      "loss": 0.4509,
      "step": 1425
    },
    {
      "epoch": 3.0490405117270787,
      "grad_norm": 0.279857337474823,
      "learning_rate": 0.00019999837201263622,
      "loss": 0.4431,
      "step": 1430
    },
    {
      "epoch": 3.0597014925373136,
      "grad_norm": 0.31563228368759155,
      "learning_rate": 0.000199997587258178,
      "loss": 0.4789,
      "step": 1435
    },
    {
      "epoch": 3.070362473347548,
      "grad_norm": 0.302135169506073,
      "learning_rate": 0.00019999664863299267,
      "loss": 0.4685,
      "step": 1440
    },
    {
      "epoch": 3.0810234541577826,
      "grad_norm": 0.2668147385120392,
      "learning_rate": 0.00019999555613852449,
      "loss": 0.4361,
      "step": 1445
    },
    {
      "epoch": 3.091684434968017,
      "grad_norm": 0.28701773285865784,
      "learning_rate": 0.00019999430977645457,
      "loss": 0.4417,
      "step": 1450
    },
    {
      "epoch": 3.1023454157782515,
      "grad_norm": 0.2622893154621124,
      "learning_rate": 0.00019999290954870073,
      "loss": 0.4524,
      "step": 1455
    },
    {
      "epoch": 3.113006396588486,
      "grad_norm": 0.2776693105697632,
      "learning_rate": 0.00019999135545741755,
      "loss": 0.463,
      "step": 1460
    },
    {
      "epoch": 3.1236673773987205,
      "grad_norm": 0.26774516701698303,
      "learning_rate": 0.00019998964750499637,
      "loss": 0.4732,
      "step": 1465
    },
    {
      "epoch": 3.1343283582089554,
      "grad_norm": 0.26958051323890686,
      "learning_rate": 0.0001999877856940653,
      "loss": 0.4517,
      "step": 1470
    },
    {
      "epoch": 3.14498933901919,
      "grad_norm": 0.2604299485683441,
      "learning_rate": 0.00019998577002748924,
      "loss": 0.4476,
      "step": 1475
    },
    {
      "epoch": 3.1556503198294243,
      "grad_norm": 1.0628249645233154,
      "learning_rate": 0.00019998360050836974,
      "loss": 0.4542,
      "step": 1480
    },
    {
      "epoch": 3.166311300639659,
      "grad_norm": 0.26215219497680664,
      "learning_rate": 0.0001999812771400451,
      "loss": 0.4608,
      "step": 1485
    },
    {
      "epoch": 3.1769722814498933,
      "grad_norm": 0.2745310068130493,
      "learning_rate": 0.00019997879992609047,
      "loss": 0.4532,
      "step": 1490
    },
    {
      "epoch": 3.1876332622601278,
      "grad_norm": 0.3186289072036743,
      "learning_rate": 0.0001999761688703176,
      "loss": 0.4854,
      "step": 1495
    },
    {
      "epoch": 3.1982942430703627,
      "grad_norm": 0.2697219252586365,
      "learning_rate": 0.000199973383976775,
      "loss": 0.4759,
      "step": 1500
    },
    {
      "epoch": 3.208955223880597,
      "grad_norm": 0.32173436880111694,
      "learning_rate": 0.00019997044524974799,
      "loss": 0.47,
      "step": 1505
    },
    {
      "epoch": 3.2196162046908317,
      "grad_norm": 0.28551211953163147,
      "learning_rate": 0.00019996735269375843,
      "loss": 0.4537,
      "step": 1510
    },
    {
      "epoch": 3.230277185501066,
      "grad_norm": 0.2618770897388458,
      "learning_rate": 0.00019996410631356498,
      "loss": 0.455,
      "step": 1515
    },
    {
      "epoch": 3.2409381663113006,
      "grad_norm": 0.3189204931259155,
      "learning_rate": 0.00019996070611416305,
      "loss": 0.4869,
      "step": 1520
    },
    {
      "epoch": 3.251599147121535,
      "grad_norm": 0.2555652856826782,
      "learning_rate": 0.00019995715210078464,
      "loss": 0.4582,
      "step": 1525
    },
    {
      "epoch": 3.2622601279317696,
      "grad_norm": 0.45129457116127014,
      "learning_rate": 0.00019995344427889845,
      "loss": 0.5055,
      "step": 1530
    },
    {
      "epoch": 3.272921108742004,
      "grad_norm": 0.2851119637489319,
      "learning_rate": 0.0001999495826542099,
      "loss": 0.4495,
      "step": 1535
    },
    {
      "epoch": 3.283582089552239,
      "grad_norm": 0.4647831916809082,
      "learning_rate": 0.00019994556723266103,
      "loss": 0.4442,
      "step": 1540
    },
    {
      "epoch": 3.2942430703624734,
      "grad_norm": 0.28650426864624023,
      "learning_rate": 0.00019994139802043055,
      "loss": 0.488,
      "step": 1545
    },
    {
      "epoch": 3.304904051172708,
      "grad_norm": 0.2804616093635559,
      "learning_rate": 0.0001999370750239338,
      "loss": 0.4538,
      "step": 1550
    },
    {
      "epoch": 3.3155650319829424,
      "grad_norm": 0.2778622508049011,
      "learning_rate": 0.0001999325982498228,
      "loss": 0.4468,
      "step": 1555
    },
    {
      "epoch": 3.326226012793177,
      "grad_norm": 0.26577600836753845,
      "learning_rate": 0.00019992796770498616,
      "loss": 0.4805,
      "step": 1560
    },
    {
      "epoch": 3.3368869936034113,
      "grad_norm": 0.25679486989974976,
      "learning_rate": 0.00019992318339654905,
      "loss": 0.4648,
      "step": 1565
    },
    {
      "epoch": 3.3475479744136463,
      "grad_norm": 0.263921856880188,
      "learning_rate": 0.00019991824533187335,
      "loss": 0.4638,
      "step": 1570
    },
    {
      "epoch": 3.3582089552238807,
      "grad_norm": 0.25445836782455444,
      "learning_rate": 0.00019991315351855748,
      "loss": 0.4395,
      "step": 1575
    },
    {
      "epoch": 3.368869936034115,
      "grad_norm": 0.2354278415441513,
      "learning_rate": 0.0001999079079644364,
      "loss": 0.487,
      "step": 1580
    },
    {
      "epoch": 3.3795309168443497,
      "grad_norm": 0.2561117708683014,
      "learning_rate": 0.0001999025086775817,
      "loss": 0.4562,
      "step": 1585
    },
    {
      "epoch": 3.390191897654584,
      "grad_norm": 0.3330647349357605,
      "learning_rate": 0.00019989695566630152,
      "loss": 0.4445,
      "step": 1590
    },
    {
      "epoch": 3.4008528784648187,
      "grad_norm": 0.26299235224723816,
      "learning_rate": 0.00019989124893914046,
      "loss": 0.4488,
      "step": 1595
    },
    {
      "epoch": 3.411513859275053,
      "grad_norm": 0.299434095621109,
      "learning_rate": 0.0001998853885048798,
      "loss": 0.4563,
      "step": 1600
    },
    {
      "epoch": 3.4221748400852876,
      "grad_norm": 0.23711760342121124,
      "learning_rate": 0.0001998793743725372,
      "loss": 0.4473,
      "step": 1605
    },
    {
      "epoch": 3.4328358208955225,
      "grad_norm": 0.24863874912261963,
      "learning_rate": 0.00019987320655136693,
      "loss": 0.4574,
      "step": 1610
    },
    {
      "epoch": 3.443496801705757,
      "grad_norm": 0.24471955001354218,
      "learning_rate": 0.00019986688505085957,
      "loss": 0.4665,
      "step": 1615
    },
    {
      "epoch": 3.4541577825159915,
      "grad_norm": 0.2540249526500702,
      "learning_rate": 0.00019986040988074238,
      "loss": 0.4689,
      "step": 1620
    },
    {
      "epoch": 3.464818763326226,
      "grad_norm": 0.2666712701320648,
      "learning_rate": 0.00019985378105097902,
      "loss": 0.4477,
      "step": 1625
    },
    {
      "epoch": 3.4754797441364604,
      "grad_norm": 0.27709081768989563,
      "learning_rate": 0.0001998469985717695,
      "loss": 0.4403,
      "step": 1630
    },
    {
      "epoch": 3.486140724946695,
      "grad_norm": 0.27587834000587463,
      "learning_rate": 0.00019984006245355037,
      "loss": 0.4565,
      "step": 1635
    },
    {
      "epoch": 3.49680170575693,
      "grad_norm": 0.22859402000904083,
      "learning_rate": 0.00019983297270699448,
      "loss": 0.4514,
      "step": 1640
    },
    {
      "epoch": 3.5074626865671643,
      "grad_norm": 0.3489368259906769,
      "learning_rate": 0.00019982572934301122,
      "loss": 0.4727,
      "step": 1645
    },
    {
      "epoch": 3.518123667377399,
      "grad_norm": 0.2632017135620117,
      "learning_rate": 0.00019981833237274618,
      "loss": 0.4415,
      "step": 1650
    },
    {
      "epoch": 3.5287846481876333,
      "grad_norm": 0.27099326252937317,
      "learning_rate": 0.00019981078180758154,
      "loss": 0.4489,
      "step": 1655
    },
    {
      "epoch": 3.5394456289978677,
      "grad_norm": 0.2415977120399475,
      "learning_rate": 0.00019980307765913552,
      "loss": 0.4764,
      "step": 1660
    },
    {
      "epoch": 3.550106609808102,
      "grad_norm": 0.23986046016216278,
      "learning_rate": 0.000199795219939263,
      "loss": 0.4458,
      "step": 1665
    },
    {
      "epoch": 3.5607675906183367,
      "grad_norm": 0.28455114364624023,
      "learning_rate": 0.00019978720866005488,
      "loss": 0.4846,
      "step": 1670
    },
    {
      "epoch": 3.571428571428571,
      "grad_norm": 0.2913159430027008,
      "learning_rate": 0.0001997790438338385,
      "loss": 0.4547,
      "step": 1675
    },
    {
      "epoch": 3.582089552238806,
      "grad_norm": 0.25150275230407715,
      "learning_rate": 0.0001997707254731775,
      "loss": 0.4599,
      "step": 1680
    },
    {
      "epoch": 3.5927505330490406,
      "grad_norm": 0.23482745885849,
      "learning_rate": 0.00019976225359087164,
      "loss": 0.4315,
      "step": 1685
    },
    {
      "epoch": 3.603411513859275,
      "grad_norm": 0.23308737576007843,
      "learning_rate": 0.00019975362819995703,
      "loss": 0.449,
      "step": 1690
    },
    {
      "epoch": 3.6140724946695095,
      "grad_norm": 0.2528814375400543,
      "learning_rate": 0.00019974484931370592,
      "loss": 0.4392,
      "step": 1695
    },
    {
      "epoch": 3.624733475479744,
      "grad_norm": 0.25079530477523804,
      "learning_rate": 0.00019973591694562678,
      "loss": 0.4536,
      "step": 1700
    },
    {
      "epoch": 3.635394456289979,
      "grad_norm": 0.2929099202156067,
      "learning_rate": 0.00019972683110946421,
      "loss": 0.4426,
      "step": 1705
    },
    {
      "epoch": 3.6460554371002134,
      "grad_norm": 0.23356157541275024,
      "learning_rate": 0.00019971759181919903,
      "loss": 0.4602,
      "step": 1710
    },
    {
      "epoch": 3.656716417910448,
      "grad_norm": 0.3128319978713989,
      "learning_rate": 0.00019970819908904814,
      "loss": 0.4629,
      "step": 1715
    },
    {
      "epoch": 3.6673773987206824,
      "grad_norm": 0.23164990544319153,
      "learning_rate": 0.00019969865293346454,
      "loss": 0.4662,
      "step": 1720
    },
    {
      "epoch": 3.678038379530917,
      "grad_norm": 0.43762582540512085,
      "learning_rate": 0.00019968895336713733,
      "loss": 0.4685,
      "step": 1725
    },
    {
      "epoch": 3.6886993603411513,
      "grad_norm": 0.34830760955810547,
      "learning_rate": 0.00019967910040499164,
      "loss": 0.4504,
      "step": 1730
    },
    {
      "epoch": 3.699360341151386,
      "grad_norm": 0.2538786828517914,
      "learning_rate": 0.00019966909406218868,
      "loss": 0.4967,
      "step": 1735
    },
    {
      "epoch": 3.7100213219616203,
      "grad_norm": 0.23103195428848267,
      "learning_rate": 0.0001996589343541257,
      "loss": 0.4556,
      "step": 1740
    },
    {
      "epoch": 3.7206823027718547,
      "grad_norm": 0.2618430554866791,
      "learning_rate": 0.0001996486212964358,
      "loss": 0.4453,
      "step": 1745
    },
    {
      "epoch": 3.7313432835820897,
      "grad_norm": 0.23393474519252777,
      "learning_rate": 0.00019963815490498817,
      "loss": 0.4613,
      "step": 1750
    },
    {
      "epoch": 3.742004264392324,
      "grad_norm": 0.2798391282558441,
      "learning_rate": 0.00019962753519588798,
      "loss": 0.4668,
      "step": 1755
    },
    {
      "epoch": 3.7526652452025586,
      "grad_norm": 0.24927425384521484,
      "learning_rate": 0.00019961676218547617,
      "loss": 0.4424,
      "step": 1760
    },
    {
      "epoch": 3.763326226012793,
      "grad_norm": 0.2537556290626526,
      "learning_rate": 0.00019960583589032966,
      "loss": 0.4413,
      "step": 1765
    },
    {
      "epoch": 3.7739872068230276,
      "grad_norm": 0.2401181310415268,
      "learning_rate": 0.00019959475632726128,
      "loss": 0.4365,
      "step": 1770
    },
    {
      "epoch": 3.7846481876332625,
      "grad_norm": 0.22927629947662354,
      "learning_rate": 0.00019958352351331956,
      "loss": 0.4455,
      "step": 1775
    },
    {
      "epoch": 3.795309168443497,
      "grad_norm": 0.21933622658252716,
      "learning_rate": 0.00019957213746578902,
      "loss": 0.4661,
      "step": 1780
    },
    {
      "epoch": 3.8059701492537314,
      "grad_norm": 0.28884589672088623,
      "learning_rate": 0.00019956059820218982,
      "loss": 0.4931,
      "step": 1785
    },
    {
      "epoch": 3.816631130063966,
      "grad_norm": 0.2619436979293823,
      "learning_rate": 0.00019954890574027797,
      "loss": 0.4446,
      "step": 1790
    },
    {
      "epoch": 3.8272921108742004,
      "grad_norm": 0.22175399959087372,
      "learning_rate": 0.00019953706009804512,
      "loss": 0.4482,
      "step": 1795
    },
    {
      "epoch": 3.837953091684435,
      "grad_norm": 0.23060369491577148,
      "learning_rate": 0.00019952506129371873,
      "loss": 0.451,
      "step": 1800
    },
    {
      "epoch": 3.8486140724946694,
      "grad_norm": 0.2313724309206009,
      "learning_rate": 0.0001995129093457619,
      "loss": 0.4496,
      "step": 1805
    },
    {
      "epoch": 3.859275053304904,
      "grad_norm": 0.23518264293670654,
      "learning_rate": 0.00019950060427287335,
      "loss": 0.4581,
      "step": 1810
    },
    {
      "epoch": 3.8699360341151388,
      "grad_norm": 0.22398614883422852,
|
"learning_rate": 0.00019948814609398746, |
|
"loss": 0.4382, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 3.8805970149253732, |
|
"grad_norm": 0.21408702433109283, |
|
"learning_rate": 0.00019947553482827418, |
|
"loss": 0.4517, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 3.8912579957356077, |
|
"grad_norm": 0.26791512966156006, |
|
"learning_rate": 0.00019946277049513904, |
|
"loss": 0.4671, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 3.901918976545842, |
|
"grad_norm": 0.37972912192344666, |
|
"learning_rate": 0.00019944985311422304, |
|
"loss": 0.4665, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 3.9125799573560767, |
|
"grad_norm": 0.2744680941104889, |
|
"learning_rate": 0.00019943678270540276, |
|
"loss": 0.4627, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 3.923240938166311, |
|
"grad_norm": 0.3253777325153351, |
|
"learning_rate": 0.00019942355928879023, |
|
"loss": 0.468, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 3.933901918976546, |
|
"grad_norm": 0.32431936264038086, |
|
"learning_rate": 0.00019941018288473285, |
|
"loss": 0.4497, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 3.9445628997867805, |
|
"grad_norm": 0.2247323989868164, |
|
"learning_rate": 0.00019939665351381355, |
|
"loss": 0.4444, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.955223880597015, |
|
"grad_norm": 0.35610342025756836, |
|
"learning_rate": 0.00019938297119685054, |
|
"loss": 0.4563, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 3.9658848614072495, |
|
"grad_norm": 0.2513818144798279, |
|
"learning_rate": 0.00019936913595489743, |
|
"loss": 0.442, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 3.976545842217484, |
|
"grad_norm": 0.3135777711868286, |
|
"learning_rate": 0.0001993551478092431, |
|
"loss": 0.4377, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 3.9872068230277184, |
|
"grad_norm": 0.24127310514450073, |
|
"learning_rate": 0.0001993410067814118, |
|
"loss": 0.4478, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.997867803837953, |
|
"grad_norm": 0.23388491570949554, |
|
"learning_rate": 0.00019932671289316282, |
|
"loss": 0.4306, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.5043795108795166, |
|
"eval_runtime": 377.5601, |
|
"eval_samples_per_second": 1.091, |
|
"eval_steps_per_second": 1.091, |
|
"step": 1876 |
|
}, |
|
{ |
|
"epoch": 4.008528784648187, |
|
"grad_norm": 0.3674967288970947, |
|
"learning_rate": 0.0001993122661664909, |
|
"loss": 0.4371, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 4.019189765458422, |
|
"grad_norm": 0.2773316204547882, |
|
"learning_rate": 0.00019929766662362585, |
|
"loss": 0.4043, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 4.029850746268656, |
|
"grad_norm": 0.2394101619720459, |
|
"learning_rate": 0.00019928291428703262, |
|
"loss": 0.413, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 4.040511727078891, |
|
"grad_norm": 0.23238113522529602, |
|
"learning_rate": 0.00019926800917941128, |
|
"loss": 0.4021, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 4.051172707889126, |
|
"grad_norm": 0.22244401276111603, |
|
"learning_rate": 0.000199252951323697, |
|
"loss": 0.4101, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.061833688699361, |
|
"grad_norm": 0.24964463710784912, |
|
"learning_rate": 0.00019923774074306, |
|
"loss": 0.4123, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 4.072494669509595, |
|
"grad_norm": 0.23066940903663635, |
|
"learning_rate": 0.00019922237746090537, |
|
"loss": 0.4267, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 4.08315565031983, |
|
"grad_norm": 0.23452460765838623, |
|
"learning_rate": 0.00019920686150087336, |
|
"loss": 0.4223, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 4.093816631130064, |
|
"grad_norm": 0.3032955527305603, |
|
"learning_rate": 0.00019919119288683908, |
|
"loss": 0.432, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 4.104477611940299, |
|
"grad_norm": 0.3310707211494446, |
|
"learning_rate": 0.00019917537164291244, |
|
"loss": 0.42, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 4.115138592750533, |
|
"grad_norm": 0.24135416746139526, |
|
"learning_rate": 0.00019915939779343838, |
|
"loss": 0.4289, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 4.1257995735607675, |
|
"grad_norm": 0.23443254828453064, |
|
"learning_rate": 0.00019914327136299651, |
|
"loss": 0.4216, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 4.136460554371002, |
|
"grad_norm": 0.3196619749069214, |
|
"learning_rate": 0.0001991269923764013, |
|
"loss": 0.4387, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 4.1471215351812365, |
|
"grad_norm": 0.2881762981414795, |
|
"learning_rate": 0.00019911056085870197, |
|
"loss": 0.4176, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 4.157782515991471, |
|
"grad_norm": 0.25249961018562317, |
|
"learning_rate": 0.00019909397683518242, |
|
"loss": 0.4221, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 4.1684434968017055, |
|
"grad_norm": 0.22756356000900269, |
|
"learning_rate": 0.00019907724033136118, |
|
"loss": 0.413, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 4.17910447761194, |
|
"grad_norm": 0.24332334101200104, |
|
"learning_rate": 0.0001990603513729915, |
|
"loss": 0.4218, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 4.189765458422174, |
|
"grad_norm": 0.23593220114707947, |
|
"learning_rate": 0.00019904330998606116, |
|
"loss": 0.4114, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 4.20042643923241, |
|
"grad_norm": 0.266313374042511, |
|
"learning_rate": 0.00019902611619679252, |
|
"loss": 0.4309, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 4.211087420042644, |
|
"grad_norm": 0.3359983563423157, |
|
"learning_rate": 0.00019900877003164235, |
|
"loss": 0.4339, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 4.221748400852879, |
|
"grad_norm": 0.22711415588855743, |
|
"learning_rate": 0.00019899127151730206, |
|
"loss": 0.4165, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 4.232409381663113, |
|
"grad_norm": 0.2225334793329239, |
|
"learning_rate": 0.00019897362068069732, |
|
"loss": 0.4094, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 4.243070362473348, |
|
"grad_norm": 0.2701500356197357, |
|
"learning_rate": 0.0001989558175489883, |
|
"loss": 0.4239, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 4.253731343283582, |
|
"grad_norm": 0.2480495721101761, |
|
"learning_rate": 0.00019893786214956945, |
|
"loss": 0.4137, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 4.264392324093817, |
|
"grad_norm": 0.22299885749816895, |
|
"learning_rate": 0.00019891975451006953, |
|
"loss": 0.4273, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.275053304904051, |
|
"grad_norm": 0.2259630262851715, |
|
"learning_rate": 0.0001989014946583516, |
|
"loss": 0.4223, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 4.285714285714286, |
|
"grad_norm": 0.3351574242115021, |
|
"learning_rate": 0.00019888308262251285, |
|
"loss": 0.4483, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 4.29637526652452, |
|
"grad_norm": 0.21363438665866852, |
|
"learning_rate": 0.0001988645184308848, |
|
"loss": 0.4138, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 4.3070362473347545, |
|
"grad_norm": 0.2409023493528366, |
|
"learning_rate": 0.00019884580211203287, |
|
"loss": 0.4166, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 4.317697228144989, |
|
"grad_norm": 0.24684803187847137, |
|
"learning_rate": 0.00019882693369475675, |
|
"loss": 0.4089, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 4.3283582089552235, |
|
"grad_norm": 0.24175861477851868, |
|
"learning_rate": 0.0001988079132080901, |
|
"loss": 0.4169, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 4.339019189765459, |
|
"grad_norm": 0.3582640290260315, |
|
"learning_rate": 0.00019878874068130062, |
|
"loss": 0.4207, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 4.349680170575693, |
|
"grad_norm": 0.23563334345817566, |
|
"learning_rate": 0.00019876941614388992, |
|
"loss": 0.4056, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 4.360341151385928, |
|
"grad_norm": 0.24959246814250946, |
|
"learning_rate": 0.0001987499396255935, |
|
"loss": 0.4152, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 4.371002132196162, |
|
"grad_norm": 0.2378864586353302, |
|
"learning_rate": 0.00019873031115638073, |
|
"loss": 0.428, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 4.381663113006397, |
|
"grad_norm": 0.25769662857055664, |
|
"learning_rate": 0.00019871053076645488, |
|
"loss": 0.4273, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 4.392324093816631, |
|
"grad_norm": 0.2148350328207016, |
|
"learning_rate": 0.0001986905984862528, |
|
"loss": 0.4341, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 4.402985074626866, |
|
"grad_norm": 0.22630667686462402, |
|
"learning_rate": 0.0001986705143464453, |
|
"loss": 0.43, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 4.4136460554371, |
|
"grad_norm": 0.23718136548995972, |
|
"learning_rate": 0.00019865027837793665, |
|
"loss": 0.4193, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 4.424307036247335, |
|
"grad_norm": 0.26240232586860657, |
|
"learning_rate": 0.00019862989061186483, |
|
"loss": 0.4327, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 4.434968017057569, |
|
"grad_norm": 0.21503274142742157, |
|
"learning_rate": 0.0001986093510796015, |
|
"loss": 0.4208, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 4.445628997867804, |
|
"grad_norm": 0.31747710704803467, |
|
"learning_rate": 0.0001985886598127516, |
|
"loss": 0.4348, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 4.456289978678038, |
|
"grad_norm": 0.24618090689182281, |
|
"learning_rate": 0.00019856781684315382, |
|
"loss": 0.4247, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 4.466950959488273, |
|
"grad_norm": 0.33112359046936035, |
|
"learning_rate": 0.00019854682220288013, |
|
"loss": 0.4175, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 4.477611940298507, |
|
"grad_norm": 0.23943935334682465, |
|
"learning_rate": 0.0001985256759242359, |
|
"loss": 0.4271, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.4882729211087415, |
|
"grad_norm": 0.24192848801612854, |
|
"learning_rate": 0.00019850437803975988, |
|
"loss": 0.4221, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 4.498933901918977, |
|
"grad_norm": 0.22631579637527466, |
|
"learning_rate": 0.00019848292858222401, |
|
"loss": 0.4233, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 4.509594882729211, |
|
"grad_norm": 0.23344965279102325, |
|
"learning_rate": 0.00019846132758463356, |
|
"loss": 0.4161, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 4.520255863539446, |
|
"grad_norm": 0.22698044776916504, |
|
"learning_rate": 0.000198439575080227, |
|
"loss": 0.4112, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 4.53091684434968, |
|
"grad_norm": 0.3037104308605194, |
|
"learning_rate": 0.00019841767110247575, |
|
"loss": 0.4362, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 4.541577825159915, |
|
"grad_norm": 0.24173210561275482, |
|
"learning_rate": 0.00019839561568508454, |
|
"loss": 0.4223, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 4.552238805970149, |
|
"grad_norm": 0.2352645844221115, |
|
"learning_rate": 0.00019837340886199096, |
|
"loss": 0.4274, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 4.562899786780384, |
|
"grad_norm": 0.2779860496520996, |
|
"learning_rate": 0.0001983510506673657, |
|
"loss": 0.4316, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 4.573560767590618, |
|
"grad_norm": 0.24002455174922943, |
|
"learning_rate": 0.0001983285411356122, |
|
"loss": 0.4159, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 4.584221748400853, |
|
"grad_norm": 0.22028042376041412, |
|
"learning_rate": 0.00019830588030136698, |
|
"loss": 0.4296, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 4.594882729211087, |
|
"grad_norm": 0.3180830776691437, |
|
"learning_rate": 0.0001982830681994992, |
|
"loss": 0.4339, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 4.605543710021322, |
|
"grad_norm": 0.2228025496006012, |
|
"learning_rate": 0.00019826010486511091, |
|
"loss": 0.4149, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 4.616204690831556, |
|
"grad_norm": 0.2128361463546753, |
|
"learning_rate": 0.00019823699033353677, |
|
"loss": 0.4126, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 4.6268656716417915, |
|
"grad_norm": 0.2322179228067398, |
|
"learning_rate": 0.00019821372464034416, |
|
"loss": 0.4128, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 4.637526652452026, |
|
"grad_norm": 0.30600860714912415, |
|
"learning_rate": 0.00019819030782133304, |
|
"loss": 0.414, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 4.6481876332622605, |
|
"grad_norm": 0.22045232355594635, |
|
"learning_rate": 0.00019816673991253586, |
|
"loss": 0.409, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 4.658848614072495, |
|
"grad_norm": 0.2302045375108719, |
|
"learning_rate": 0.00019814302095021768, |
|
"loss": 0.4199, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 4.669509594882729, |
|
"grad_norm": 0.22577248513698578, |
|
"learning_rate": 0.00019811915097087587, |
|
"loss": 0.4058, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 4.680170575692964, |
|
"grad_norm": 0.6790816187858582, |
|
"learning_rate": 0.00019809513001124024, |
|
"loss": 0.4356, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 4.690831556503198, |
|
"grad_norm": 0.2510231137275696, |
|
"learning_rate": 0.00019807095810827293, |
|
"loss": 0.4062, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.701492537313433, |
|
"grad_norm": 0.24071648716926575, |
|
"learning_rate": 0.00019804663529916826, |
|
"loss": 0.4282, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 4.712153518123667, |
|
"grad_norm": 0.2886710464954376, |
|
"learning_rate": 0.00019802216162135287, |
|
"loss": 0.4254, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 4.722814498933902, |
|
"grad_norm": 0.2941761910915375, |
|
"learning_rate": 0.0001979975371124855, |
|
"loss": 0.4343, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 4.733475479744136, |
|
"grad_norm": 0.2591281533241272, |
|
"learning_rate": 0.00019797276181045693, |
|
"loss": 0.4165, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 4.744136460554371, |
|
"grad_norm": 0.2245703637599945, |
|
"learning_rate": 0.00019794783575339004, |
|
"loss": 0.4112, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 4.754797441364605, |
|
"grad_norm": 0.48405957221984863, |
|
"learning_rate": 0.00019792275897963967, |
|
"loss": 0.4279, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 4.76545842217484, |
|
"grad_norm": 0.22091209888458252, |
|
"learning_rate": 0.00019789753152779258, |
|
"loss": 0.4371, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 4.776119402985074, |
|
"grad_norm": 0.23672465980052948, |
|
"learning_rate": 0.00019787215343666732, |
|
"loss": 0.4166, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 4.786780383795309, |
|
"grad_norm": 0.43999361991882324, |
|
"learning_rate": 0.0001978466247453143, |
|
"loss": 0.4167, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 4.797441364605544, |
|
"grad_norm": 0.2732659578323364, |
|
"learning_rate": 0.0001978209454930157, |
|
"loss": 0.4326, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 4.8081023454157785, |
|
"grad_norm": 0.27667996287345886, |
|
"learning_rate": 0.00019779511571928527, |
|
"loss": 0.4192, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 4.818763326226013, |
|
"grad_norm": 0.24479329586029053, |
|
"learning_rate": 0.00019776913546386843, |
|
"loss": 0.4158, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 4.8294243070362475, |
|
"grad_norm": 0.21344681084156036, |
|
"learning_rate": 0.0001977430047667422, |
|
"loss": 0.4112, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 4.840085287846482, |
|
"grad_norm": 0.24819132685661316, |
|
"learning_rate": 0.00019771672366811503, |
|
"loss": 0.414, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 4.850746268656716, |
|
"grad_norm": 0.2435145080089569, |
|
"learning_rate": 0.00019769029220842677, |
|
"loss": 0.4172, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 4.861407249466951, |
|
"grad_norm": 0.21831800043582916, |
|
"learning_rate": 0.0001976637104283487, |
|
"loss": 0.4168, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 4.872068230277185, |
|
"grad_norm": 0.3001014292240143, |
|
"learning_rate": 0.00019763697836878343, |
|
"loss": 0.4271, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 4.88272921108742, |
|
"grad_norm": 0.3473288118839264, |
|
"learning_rate": 0.00019761009607086472, |
|
"loss": 0.4256, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 4.893390191897654, |
|
"grad_norm": 0.2094939649105072, |
|
"learning_rate": 0.00019758306357595755, |
|
"loss": 0.4207, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 4.904051172707889, |
|
"grad_norm": 0.224636048078537, |
|
"learning_rate": 0.00019755588092565805, |
|
"loss": 0.4214, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.914712153518123, |
|
"grad_norm": 0.22260229289531708, |
|
"learning_rate": 0.00019752854816179336, |
|
"loss": 0.4226, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 4.925373134328359, |
|
"grad_norm": 0.21004381775856018, |
|
"learning_rate": 0.0001975010653264216, |
|
"loss": 0.414, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 4.936034115138593, |
|
"grad_norm": 0.2120514214038849, |
|
"learning_rate": 0.00019747343246183185, |
|
"loss": 0.4152, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 4.946695095948828, |
|
"grad_norm": 0.2152203619480133, |
|
"learning_rate": 0.00019744564961054402, |
|
"loss": 0.4159, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 4.957356076759062, |
|
"grad_norm": 0.22371242940425873, |
|
"learning_rate": 0.0001974177168153088, |
|
"loss": 0.4095, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 4.968017057569297, |
|
"grad_norm": 0.21865862607955933, |
|
"learning_rate": 0.00019738963411910766, |
|
"loss": 0.4261, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 4.978678038379531, |
|
"grad_norm": 0.3230665326118469, |
|
"learning_rate": 0.0001973614015651527, |
|
"loss": 0.4116, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 4.9893390191897655, |
|
"grad_norm": 0.21557492017745972, |
|
"learning_rate": 0.00019733301919688651, |
|
"loss": 0.4161, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.21153585612773895, |
|
"learning_rate": 0.00019730448705798239, |
|
"loss": 0.4128, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.5016890168190002, |
|
"eval_runtime": 377.5434, |
|
"eval_samples_per_second": 1.091, |
|
"eval_steps_per_second": 1.091, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 5.0106609808102345, |
|
"grad_norm": 0.20196357369422913, |
|
"learning_rate": 0.000197275805192344, |
|
"loss": 0.3909, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 5.021321961620469, |
|
"grad_norm": 0.2446993738412857, |
|
"learning_rate": 0.00019724697364410535, |
|
"loss": 0.3876, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 5.031982942430703, |
|
"grad_norm": 0.22501204907894135, |
|
"learning_rate": 0.00019721799245763088, |
|
"loss": 0.3882, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 5.042643923240938, |
|
"grad_norm": 0.23419953882694244, |
|
"learning_rate": 0.0001971888616775152, |
|
"loss": 0.3786, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 5.053304904051172, |
|
"grad_norm": 0.23151536285877228, |
|
"learning_rate": 0.00019715958134858315, |
|
"loss": 0.3925, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 5.063965884861407, |
|
"grad_norm": 0.23873166739940643, |
|
"learning_rate": 0.00019713015151588966, |
|
"loss": 0.3927, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 5.074626865671641, |
|
"grad_norm": 0.23083342611789703, |
|
"learning_rate": 0.00019710057222471967, |
|
"loss": 0.3836, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 5.085287846481877, |
|
"grad_norm": 0.22406326234340668, |
|
"learning_rate": 0.00019707084352058827, |
|
"loss": 0.389, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 5.095948827292111, |
|
"grad_norm": 0.37570300698280334, |
|
"learning_rate": 0.00019704096544924022, |
|
"loss": 0.3999, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 5.106609808102346, |
|
"grad_norm": 0.21594493091106415, |
|
"learning_rate": 0.0001970109380566503, |
|
"loss": 0.38, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 5.11727078891258, |
|
"grad_norm": 0.2725168466567993, |
|
"learning_rate": 0.00019698076138902298, |
|
"loss": 0.3848, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 5.127931769722815, |
|
"grad_norm": 0.2510855495929718, |
|
"learning_rate": 0.00019695043549279243, |
|
"loss": 0.3859, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 5.138592750533049, |
|
"grad_norm": 0.23722735047340393, |
|
"learning_rate": 0.00019691996041462244, |
|
"loss": 0.3876, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 5.149253731343284, |
|
"grad_norm": 0.35469353199005127, |
|
"learning_rate": 0.00019688933620140637, |
|
"loss": 0.3863, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 5.159914712153518, |
|
"grad_norm": 0.23087090253829956, |
|
"learning_rate": 0.0001968585629002671, |
|
"loss": 0.3898, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 5.1705756929637525, |
|
"grad_norm": 0.21194830536842346, |
|
"learning_rate": 0.00019682764055855683, |
|
"loss": 0.3832, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 5.181236673773987, |
|
"grad_norm": 0.23261596262454987, |
|
"learning_rate": 0.00019679656922385715, |
|
"loss": 0.3895, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 5.1918976545842215, |
|
"grad_norm": 0.24160555005073547, |
|
"learning_rate": 0.0001967653489439789, |
|
"loss": 0.391, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 5.202558635394456, |
|
"grad_norm": 0.23709999024868011, |
|
"learning_rate": 0.00019673397976696216, |
|
"loss": 0.3904, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 5.21321961620469, |
|
"grad_norm": 0.2529030740261078, |
|
"learning_rate": 0.00019670246174107597, |
|
"loss": 0.3853, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 5.223880597014926, |
|
"grad_norm": 0.22068992257118225, |
|
"learning_rate": 0.0001966707949148186, |
|
"loss": 0.3791, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 5.23454157782516, |
|
"grad_norm": 0.23219233751296997, |
|
"learning_rate": 0.00019663897933691718, |
|
"loss": 0.3904, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 5.245202558635395, |
|
"grad_norm": 0.25079360604286194, |
|
"learning_rate": 0.00019660701505632772, |
|
"loss": 0.3995, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 5.255863539445629, |
|
"grad_norm": 0.2510697841644287, |
|
"learning_rate": 0.00019657490212223515, |
|
"loss": 0.3861, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 5.266524520255864, |
|
"grad_norm": 0.25218454003334045, |
|
"learning_rate": 0.000196542640584053, |
|
"loss": 0.3878, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 5.277185501066098, |
|
"grad_norm": 0.21124300360679626, |
|
"learning_rate": 0.00019651023049142356, |
|
"loss": 0.3881, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 5.287846481876333, |
|
"grad_norm": 0.23286496102809906, |
|
"learning_rate": 0.0001964776718942177, |
|
"loss": 0.3893, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 5.298507462686567, |
|
"grad_norm": 0.2385607361793518, |
|
"learning_rate": 0.00019644496484253474, |
|
"loss": 0.381, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 5.309168443496802, |
|
"grad_norm": 0.22742030024528503, |
|
"learning_rate": 0.00019641210938670247, |
|
"loss": 0.393, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 5.319829424307036, |
|
"grad_norm": 0.22051115334033966, |
|
"learning_rate": 0.00019637910557727706, |
|
"loss": 0.3933, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 5.330490405117271, |
|
"grad_norm": 0.23317855596542358, |
|
"learning_rate": 0.00019634595346504293, |
|
"loss": 0.3877, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 5.341151385927505, |
|
"grad_norm": 0.23425228893756866, |
|
"learning_rate": 0.00019631265310101272, |
|
"loss": 0.4158, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 5.3518123667377395, |
|
"grad_norm": 0.25701725482940674, |
|
"learning_rate": 0.00019627920453642715, |
|
"loss": 0.3835, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 5.362473347547974, |
|
"grad_norm": 0.23093344271183014, |
|
"learning_rate": 0.00019624560782275505, |
|
"loss": 0.3846, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 5.373134328358209, |
|
"grad_norm": 0.2600732147693634, |
|
"learning_rate": 0.00019621186301169315, |
|
"loss": 0.3917, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 5.383795309168444, |
|
"grad_norm": 0.2647717595100403, |
|
"learning_rate": 0.00019617797015516607, |
|
"loss": 0.3938, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 5.394456289978678, |
|
"grad_norm": 0.24304771423339844, |
|
"learning_rate": 0.0001961439293053263, |
|
"loss": 0.3925, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 5.405117270788913, |
|
"grad_norm": 0.2271909862756729, |
|
"learning_rate": 0.00019610974051455398, |
|
"loss": 0.3878, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 5.415778251599147, |
|
"grad_norm": 0.22085613012313843, |
|
"learning_rate": 0.00019607540383545692, |
|
"loss": 0.4025, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 5.426439232409382, |
|
"grad_norm": 0.2830078899860382, |
|
"learning_rate": 0.0001960409193208705, |
|
"loss": 0.3935, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 5.437100213219616, |
|
"grad_norm": 0.37187430262565613, |
|
"learning_rate": 0.00019600628702385751, |
|
"loss": 0.3896, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 5.447761194029851, |
|
"grad_norm": 0.23631027340888977, |
|
"learning_rate": 0.00019597150699770835, |
|
"loss": 0.3911, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 5.458422174840085, |
|
"grad_norm": 0.224113330245018, |
|
"learning_rate": 0.00019593657929594044, |
|
"loss": 0.3876, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 5.46908315565032, |
|
"grad_norm": 0.29911914467811584, |
|
"learning_rate": 0.00019590150397229866, |
|
"loss": 0.3966, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 5.479744136460554, |
|
"grad_norm": 0.22963348031044006, |
|
"learning_rate": 0.000195866281080755, |
|
"loss": 0.3931, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 5.490405117270789, |
|
"grad_norm": 0.24756336212158203, |
|
"learning_rate": 0.0001958309106755084, |
|
"loss": 0.3827, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 5.501066098081023, |
|
"grad_norm": 0.22494661808013916, |
|
"learning_rate": 0.00019579539281098493, |
|
"loss": 0.3884, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 5.5117270788912585, |
|
"grad_norm": 0.2217581868171692, |
|
"learning_rate": 0.00019575972754183748, |
|
"loss": 0.3954, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 5.522388059701493, |
|
"grad_norm": 0.22264057397842407, |
|
"learning_rate": 0.0001957239149229458, |
|
"loss": 0.3925, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 5.533049040511727, |
|
"grad_norm": 0.24900676310062408, |
|
"learning_rate": 0.00019568795500941635, |
|
"loss": 0.3938, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 5.543710021321962, |
|
"grad_norm": 0.22802846133708954, |
|
"learning_rate": 0.00019565184785658223, |
|
"loss": 0.3903, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.554371002132196, |
|
"grad_norm": 0.2182716578245163, |
|
"learning_rate": 0.00019561559352000317, |
|
"loss": 0.3929, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 5.565031982942431, |
|
"grad_norm": 0.23668424785137177, |
|
"learning_rate": 0.00019557919205546526, |
|
"loss": 0.3815, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 5.575692963752665, |
|
"grad_norm": 0.22820915281772614, |
|
"learning_rate": 0.0001955426435189811, |
|
"loss": 0.3937, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 5.5863539445629, |
|
"grad_norm": 0.21698084473609924, |
|
"learning_rate": 0.00019550594796678952, |
|
"loss": 0.3925, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 5.597014925373134, |
|
"grad_norm": 0.22192837297916412, |
|
"learning_rate": 0.00019546910545535558, |
|
"loss": 0.3858, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 5.607675906183369, |
|
"grad_norm": 0.22095522284507751, |
|
"learning_rate": 0.00019543211604137052, |
|
"loss": 0.3863, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 5.618336886993603, |
|
"grad_norm": 0.22427357733249664, |
|
"learning_rate": 0.0001953949797817516, |
|
"loss": 0.3836, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 5.628997867803838, |
|
"grad_norm": 0.23269647359848022, |
|
"learning_rate": 0.00019535769673364203, |
|
"loss": 0.3913, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 5.639658848614072, |
|
"grad_norm": 0.21933898329734802, |
|
"learning_rate": 0.00019532026695441083, |
|
"loss": 0.3948, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 5.650319829424307, |
|
"grad_norm": 0.227766752243042, |
|
"learning_rate": 0.00019528269050165297, |
|
"loss": 0.3861, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 5.660980810234541, |
|
"grad_norm": 0.22262893617153168, |
|
"learning_rate": 0.00019524496743318891, |
|
"loss": 0.3921, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 5.6716417910447765, |
|
"grad_norm": 0.28188657760620117, |
|
"learning_rate": 0.00019520709780706486, |
|
"loss": 0.3802, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 5.682302771855011, |
|
"grad_norm": 0.22414395213127136, |
|
"learning_rate": 0.00019516908168155245, |
|
"loss": 0.3858, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 5.6929637526652455, |
|
"grad_norm": 0.222300723195076, |
|
"learning_rate": 0.00019513091911514885, |
|
"loss": 0.3886, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 5.70362473347548, |
|
"grad_norm": 0.2155119776725769, |
|
"learning_rate": 0.00019509261016657643, |
|
"loss": 0.3948, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"grad_norm": 0.23029391467571259, |
|
"learning_rate": 0.0001950541548947829, |
|
"loss": 0.3915, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 5.724946695095949, |
|
"grad_norm": 0.23538485169410706, |
|
"learning_rate": 0.0001950155533589411, |
|
"loss": 0.4005, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 5.735607675906183, |
|
"grad_norm": 0.249455988407135, |
|
"learning_rate": 0.00019497680561844893, |
|
"loss": 0.386, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 5.746268656716418, |
|
"grad_norm": 0.21184088289737701, |
|
"learning_rate": 0.00019493791173292923, |
|
"loss": 0.3931, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 5.756929637526652, |
|
"grad_norm": 0.21931645274162292, |
|
"learning_rate": 0.00019489887176222975, |
|
"loss": 0.3981, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.767590618336887, |
|
"grad_norm": 0.2259492725133896, |
|
"learning_rate": 0.00019485968576642308, |
|
"loss": 0.3848, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 5.778251599147121, |
|
"grad_norm": 0.23413480818271637, |
|
"learning_rate": 0.00019482035380580638, |
|
"loss": 0.3875, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 5.788912579957356, |
|
"grad_norm": 0.22880232334136963, |
|
"learning_rate": 0.00019478087594090155, |
|
"loss": 0.3838, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 5.79957356076759, |
|
"grad_norm": 0.22865185141563416, |
|
"learning_rate": 0.00019474125223245488, |
|
"loss": 0.3855, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 5.810234541577826, |
|
"grad_norm": 0.24277456104755402, |
|
"learning_rate": 0.00019470148274143713, |
|
"loss": 0.3938, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 5.82089552238806, |
|
"grad_norm": 0.2189398854970932, |
|
"learning_rate": 0.00019466156752904343, |
|
"loss": 0.4008, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 5.8315565031982945, |
|
"grad_norm": 0.21893605589866638, |
|
"learning_rate": 0.00019462150665669302, |
|
"loss": 0.3874, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 5.842217484008529, |
|
"grad_norm": 0.23077057301998138, |
|
"learning_rate": 0.00019458130018602945, |
|
"loss": 0.3929, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 5.8528784648187635, |
|
"grad_norm": 0.2599683701992035, |
|
"learning_rate": 0.00019454094817892008, |
|
"loss": 0.3892, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 5.863539445628998, |
|
"grad_norm": 0.22645121812820435, |
|
"learning_rate": 0.00019450045069745642, |
|
"loss": 0.3913, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 5.8742004264392325, |
|
"grad_norm": 0.22834275662899017, |
|
"learning_rate": 0.00019445980780395368, |
|
"loss": 0.3958, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 5.884861407249467, |
|
"grad_norm": 0.24456727504730225, |
|
"learning_rate": 0.00019441901956095093, |
|
"loss": 0.3939, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 5.895522388059701, |
|
"grad_norm": 0.21773149073123932, |
|
"learning_rate": 0.00019437808603121087, |
|
"loss": 0.3988, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 5.906183368869936, |
|
"grad_norm": 0.21768063306808472, |
|
"learning_rate": 0.00019433700727771965, |
|
"loss": 0.3894, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 5.91684434968017, |
|
"grad_norm": 0.2415178418159485, |
|
"learning_rate": 0.00019429578336368708, |
|
"loss": 0.3931, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 5.927505330490405, |
|
"grad_norm": 0.21271879971027374, |
|
"learning_rate": 0.00019425441435254616, |
|
"loss": 0.3957, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 5.938166311300639, |
|
"grad_norm": 0.21745960414409637, |
|
"learning_rate": 0.00019421290030795322, |
|
"loss": 0.3948, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 5.948827292110874, |
|
"grad_norm": 0.22035416960716248, |
|
"learning_rate": 0.0001941712412937878, |
|
"loss": 0.3922, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 5.959488272921108, |
|
"grad_norm": 0.20828816294670105, |
|
"learning_rate": 0.00019412943737415246, |
|
"loss": 0.3976, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 5.970149253731344, |
|
"grad_norm": 0.19749729335308075, |
|
"learning_rate": 0.00019408748861337273, |
|
"loss": 0.3994, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 5.980810234541578, |
|
"grad_norm": 0.20768584311008453, |
|
"learning_rate": 0.00019404539507599707, |
|
"loss": 0.3869, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 5.991471215351813, |
|
"grad_norm": 0.2182578146457672, |
|
"learning_rate": 0.00019400315682679663, |
|
"loss": 0.3924, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.5093127489089966, |
|
"eval_runtime": 377.4947, |
|
"eval_samples_per_second": 1.091, |
|
"eval_steps_per_second": 1.091, |
|
"step": 2814 |
|
}, |
|
{ |
|
"epoch": 6.002132196162047, |
|
"grad_norm": 0.21125191450119019, |
|
"learning_rate": 0.0001939607739307653, |
|
"loss": 0.3874, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 6.0127931769722816, |
|
"grad_norm": 0.31068113446235657, |
|
"learning_rate": 0.0001939182464531195, |
|
"loss": 0.3704, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 6.023454157782516, |
|
"grad_norm": 0.23276059329509735, |
|
"learning_rate": 0.00019387557445929823, |
|
"loss": 0.353, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 6.0341151385927505, |
|
"grad_norm": 0.25309714674949646, |
|
"learning_rate": 0.00019383275801496268, |
|
"loss": 0.3494, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 6.044776119402985, |
|
"grad_norm": 0.2310338020324707, |
|
"learning_rate": 0.00019378979718599645, |
|
"loss": 0.3534, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 6.0554371002132195, |
|
"grad_norm": 0.23623259365558624, |
|
"learning_rate": 0.00019374669203850532, |
|
"loss": 0.3513, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 6.066098081023454, |
|
"grad_norm": 0.2299884408712387, |
|
"learning_rate": 0.00019370344263881702, |
|
"loss": 0.3534, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 6.076759061833688, |
|
"grad_norm": 0.5613902807235718, |
|
"learning_rate": 0.0001936600490534814, |
|
"loss": 0.3615, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 6.087420042643923, |
|
"grad_norm": 0.22940614819526672, |
|
"learning_rate": 0.00019361651134927003, |
|
"loss": 0.3522, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 6.098081023454157, |
|
"grad_norm": 0.22831672430038452, |
|
"learning_rate": 0.0001935728295931763, |
|
"loss": 0.3523, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 6.108742004264393, |
|
"grad_norm": 0.23445968329906464, |
|
"learning_rate": 0.00019352900385241536, |
|
"loss": 0.369, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 6.119402985074627, |
|
"grad_norm": 0.2444639503955841, |
|
"learning_rate": 0.0001934850341944237, |
|
"loss": 0.355, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 6.130063965884862, |
|
"grad_norm": 0.2400490790605545, |
|
"learning_rate": 0.00019344092068685948, |
|
"loss": 0.3625, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 6.140724946695096, |
|
"grad_norm": 0.2361455261707306, |
|
"learning_rate": 0.00019339666339760207, |
|
"loss": 0.3649, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 6.151385927505331, |
|
"grad_norm": 0.26625874638557434, |
|
"learning_rate": 0.00019335226239475215, |
|
"loss": 0.3572, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 6.162046908315565, |
|
"grad_norm": 0.2775781750679016, |
|
"learning_rate": 0.0001933077177466315, |
|
"loss": 0.3446, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 6.1727078891258, |
|
"grad_norm": 0.25833654403686523, |
|
"learning_rate": 0.00019326302952178294, |
|
"loss": 0.3624, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 6.183368869936034, |
|
"grad_norm": 0.2403610199689865, |
|
"learning_rate": 0.00019321819778897023, |
|
"loss": 0.3578, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 6.1940298507462686, |
|
"grad_norm": 0.2580753266811371, |
|
"learning_rate": 0.00019317322261717794, |
|
"loss": 0.3536, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 6.204690831556503, |
|
"grad_norm": 0.2725096046924591, |
|
"learning_rate": 0.0001931281040756114, |
|
"loss": 0.3689, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 6.2153518123667375, |
|
"grad_norm": 0.27059614658355713, |
|
"learning_rate": 0.00019308284223369646, |
|
"loss": 0.3656, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 6.226012793176972, |
|
"grad_norm": 0.24707560241222382, |
|
"learning_rate": 0.00019303743716107957, |
|
"loss": 0.3682, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 6.2366737739872065, |
|
"grad_norm": 0.23825524747371674, |
|
"learning_rate": 0.00019299188892762752, |
|
"loss": 0.3578, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 6.247334754797441, |
|
"grad_norm": 0.24557247757911682, |
|
"learning_rate": 0.00019294619760342737, |
|
"loss": 0.3624, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 6.257995735607676, |
|
"grad_norm": 0.2559678256511688, |
|
"learning_rate": 0.00019290036325878644, |
|
"loss": 0.3693, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 6.268656716417911, |
|
"grad_norm": 0.25294074416160583, |
|
"learning_rate": 0.00019285438596423204, |
|
"loss": 0.3651, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 6.279317697228145, |
|
"grad_norm": 0.24387520551681519, |
|
"learning_rate": 0.00019280826579051147, |
|
"loss": 0.3589, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 6.28997867803838, |
|
"grad_norm": 0.22580432891845703, |
|
"learning_rate": 0.0001927620028085919, |
|
"loss": 0.3703, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 6.300639658848614, |
|
"grad_norm": 0.24953973293304443, |
|
"learning_rate": 0.00019271559708966023, |
|
"loss": 0.3606, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 6.311300639658849, |
|
"grad_norm": 0.2454618364572525, |
|
"learning_rate": 0.000192669048705123, |
|
"loss": 0.362, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 6.321961620469083, |
|
"grad_norm": 0.2393016368150711, |
|
"learning_rate": 0.00019262235772660627, |
|
"loss": 0.3695, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 6.332622601279318, |
|
"grad_norm": 0.2463667392730713, |
|
"learning_rate": 0.00019257552422595554, |
|
"loss": 0.3658, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 6.343283582089552, |
|
"grad_norm": 0.24116967618465424, |
|
"learning_rate": 0.00019252854827523557, |
|
"loss": 0.3671, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 6.353944562899787, |
|
"grad_norm": 0.2345789670944214, |
|
"learning_rate": 0.00019248142994673036, |
|
"loss": 0.368, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 6.364605543710021, |
|
"grad_norm": 0.26505357027053833, |
|
"learning_rate": 0.000192434169312943, |
|
"loss": 0.3695, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 6.3752665245202556, |
|
"grad_norm": 0.2504933476448059, |
|
"learning_rate": 0.00019238676644659546, |
|
"loss": 0.3605, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 6.38592750533049, |
|
"grad_norm": 0.24889980256557465, |
|
"learning_rate": 0.0001923392214206287, |
|
"loss": 0.3684, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 6.396588486140725, |
|
"grad_norm": 0.2319326400756836, |
|
"learning_rate": 0.00019229153430820232, |
|
"loss": 0.3621, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 6.40724946695096, |
|
"grad_norm": 0.2329808622598648, |
|
"learning_rate": 0.00019224370518269458, |
|
"loss": 0.3649, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 6.417910447761194, |
|
"grad_norm": 0.2565195560455322, |
|
"learning_rate": 0.00019219573411770235, |
|
"loss": 0.3602, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 6.428571428571429, |
|
"grad_norm": 0.24189329147338867, |
|
"learning_rate": 0.00019214762118704076, |
|
"loss": 0.3691, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 6.439232409381663, |
|
"grad_norm": 0.2512595057487488, |
|
"learning_rate": 0.0001920993664647434, |
|
"loss": 0.364, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 6.449893390191898, |
|
"grad_norm": 0.24277447164058685, |
|
"learning_rate": 0.00019205097002506185, |
|
"loss": 0.3732, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 6.460554371002132, |
|
"grad_norm": 0.242990642786026, |
|
"learning_rate": 0.00019200243194246594, |
|
"loss": 0.3674, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 6.471215351812367, |
|
"grad_norm": 0.23621074855327606, |
|
"learning_rate": 0.00019195375229164334, |
|
"loss": 0.3599, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 6.481876332622601, |
|
"grad_norm": 0.26253125071525574, |
|
"learning_rate": 0.0001919049311474996, |
|
"loss": 0.3708, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 6.492537313432836, |
|
"grad_norm": 0.2214423567056656, |
|
"learning_rate": 0.000191855968585158, |
|
"loss": 0.3612, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 6.50319829424307, |
|
"grad_norm": 0.24866749346256256, |
|
"learning_rate": 0.00019180686467995935, |
|
"loss": 0.3682, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 6.513859275053305, |
|
"grad_norm": 0.2474697232246399, |
|
"learning_rate": 0.00019175761950746204, |
|
"loss": 0.354, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 6.524520255863539, |
|
"grad_norm": 0.26961109042167664, |
|
"learning_rate": 0.00019170823314344185, |
|
"loss": 0.3708, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 6.535181236673774, |
|
"grad_norm": 0.2510351538658142, |
|
"learning_rate": 0.0001916587056638917, |
|
"loss": 0.3667, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 6.545842217484008, |
|
"grad_norm": 0.24457301199436188, |
|
"learning_rate": 0.00019160903714502173, |
|
"loss": 0.3679, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 6.556503198294243, |
|
"grad_norm": 0.23988381028175354, |
|
"learning_rate": 0.00019155922766325918, |
|
"loss": 0.3608, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 6.567164179104478, |
|
"grad_norm": 0.2317483127117157, |
|
"learning_rate": 0.000191509277295248, |
|
"loss": 0.3761, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 6.577825159914712, |
|
"grad_norm": 0.2614232301712036, |
|
"learning_rate": 0.0001914591861178491, |
|
"loss": 0.3606, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 6.588486140724947, |
|
"grad_norm": 0.24253317713737488, |
|
"learning_rate": 0.00019140895420813997, |
|
"loss": 0.362, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 6.599147121535181, |
|
"grad_norm": 0.2507173418998718, |
|
"learning_rate": 0.00019135858164341473, |
|
"loss": 0.3594, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 6.609808102345416, |
|
"grad_norm": 0.23574085533618927, |
|
"learning_rate": 0.0001913080685011838, |
|
"loss": 0.3661, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 6.62046908315565, |
|
"grad_norm": 0.2325553447008133, |
|
"learning_rate": 0.00019125741485917405, |
|
"loss": 0.3756, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 6.631130063965885, |
|
"grad_norm": 0.2191423624753952, |
|
"learning_rate": 0.00019120662079532853, |
|
"loss": 0.354, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 6.641791044776119, |
|
"grad_norm": 0.21787339448928833, |
|
"learning_rate": 0.00019115568638780622, |
|
"loss": 0.3657, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 6.652452025586354, |
|
"grad_norm": 0.21904399991035461, |
|
"learning_rate": 0.0001911046117149822, |
|
"loss": 0.367, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 6.663113006396588, |
|
"grad_norm": 0.23119735717773438, |
|
"learning_rate": 0.00019105339685544735, |
|
"loss": 0.3646, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 6.673773987206823, |
|
"grad_norm": 0.24613478779792786, |
|
"learning_rate": 0.00019100204188800827, |
|
"loss": 0.3682, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 6.684434968017058, |
|
"grad_norm": 0.2366684079170227, |
|
"learning_rate": 0.00019095054689168705, |
|
"loss": 0.3714, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 6.6950959488272925, |
|
"grad_norm": 0.2413744032382965, |
|
"learning_rate": 0.0001908989119457214, |
|
"loss": 0.3682, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 6.705756929637527, |
|
"grad_norm": 0.23421700298786163, |
|
"learning_rate": 0.00019084713712956428, |
|
"loss": 0.3639, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 6.7164179104477615, |
|
"grad_norm": 0.23423875868320465, |
|
"learning_rate": 0.00019079522252288386, |
|
"loss": 0.3655, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 6.727078891257996, |
|
"grad_norm": 0.23802149295806885, |
|
"learning_rate": 0.00019074316820556352, |
|
"loss": 0.3708, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 6.73773987206823, |
|
"grad_norm": 0.25665974617004395, |
|
"learning_rate": 0.00019069097425770154, |
|
"loss": 0.3762, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 6.748400852878465, |
|
"grad_norm": 0.23551535606384277, |
|
"learning_rate": 0.00019063864075961098, |
|
"loss": 0.3687, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 6.759061833688699, |
|
"grad_norm": 0.24098068475723267, |
|
"learning_rate": 0.00019058616779181982, |
|
"loss": 0.3659, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 6.769722814498934, |
|
"grad_norm": 0.22562439739704132, |
|
"learning_rate": 0.0001905335554350705, |
|
"loss": 0.3724, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 6.780383795309168, |
|
"grad_norm": 0.224997878074646, |
|
"learning_rate": 0.00019048080377031995, |
|
"loss": 0.3705, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 6.791044776119403, |
|
"grad_norm": 0.2575388252735138, |
|
"learning_rate": 0.00019042791287873957, |
|
"loss": 0.3611, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 6.801705756929637, |
|
"grad_norm": 0.231009379029274, |
|
"learning_rate": 0.0001903748828417149, |
|
"loss": 0.3653, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 6.812366737739872, |
|
"grad_norm": 0.23769618570804596, |
|
"learning_rate": 0.0001903217137408456, |
|
"loss": 0.3615, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 6.823027718550106, |
|
"grad_norm": 0.23301640152931213, |
|
"learning_rate": 0.00019026840565794536, |
|
"loss": 0.366, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 6.833688699360341, |
|
"grad_norm": 0.2212369292974472, |
|
"learning_rate": 0.00019021495867504163, |
|
"loss": 0.3632, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 6.844349680170575, |
|
"grad_norm": 0.23795363306999207, |
|
"learning_rate": 0.0001901613728743757, |
|
"loss": 0.3681, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 6.855010660980811, |
|
"grad_norm": 0.24354343116283417, |
|
"learning_rate": 0.00019010764833840243, |
|
"loss": 0.3695, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 6.865671641791045, |
|
"grad_norm": 0.24145299196243286, |
|
"learning_rate": 0.00019005378514979008, |
|
"loss": 0.3667, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 6.8763326226012795, |
|
"grad_norm": 0.24070268869400024, |
|
"learning_rate": 0.0001899997833914204, |
|
"loss": 0.3693, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 6.886993603411514, |
|
"grad_norm": 0.22578920423984528, |
|
"learning_rate": 0.00018994564314638832, |
|
"loss": 0.3692, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 6.8976545842217485, |
|
"grad_norm": 0.22691179811954498, |
|
"learning_rate": 0.00018989136449800174, |
|
"loss": 0.3766, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 6.908315565031983, |
|
"grad_norm": 0.2194678634405136, |
|
"learning_rate": 0.0001898369475297817, |
|
"loss": 0.3668, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 6.918976545842217, |
|
"grad_norm": 0.22618421912193298, |
|
"learning_rate": 0.000189782392325462, |
|
"loss": 0.3592, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 6.929637526652452, |
|
"grad_norm": 0.2549285292625427, |
|
"learning_rate": 0.0001897276989689891, |
|
"loss": 0.3653, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 6.940298507462686, |
|
"grad_norm": 0.23101598024368286, |
|
"learning_rate": 0.00018967286754452214, |
|
"loss": 0.3569, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 6.950959488272921, |
|
"grad_norm": 0.2506960332393646, |
|
"learning_rate": 0.00018961789813643268, |
|
"loss": 0.3633, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 6.961620469083155, |
|
"grad_norm": 0.2284671515226364, |
|
"learning_rate": 0.00018956279082930455, |
|
"loss": 0.3624, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 6.97228144989339, |
|
"grad_norm": 0.22146272659301758, |
|
"learning_rate": 0.00018950754570793384, |
|
"loss": 0.37, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 6.982942430703625, |
|
"grad_norm": 0.2425510585308075, |
|
"learning_rate": 0.00018945216285732864, |
|
"loss": 0.366, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 6.99360341151386, |
|
"grad_norm": 0.2304454892873764, |
|
"learning_rate": 0.00018939664236270907, |
|
"loss": 0.3684, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.5168320536613464, |
|
"eval_runtime": 377.6098, |
|
"eval_samples_per_second": 1.091, |
|
"eval_steps_per_second": 1.091, |
|
"step": 3283 |
|
}, |
|
{ |
|
"epoch": 7.004264392324094, |
|
"grad_norm": 0.2056385576725006, |
|
"learning_rate": 0.00018934098430950692, |
|
"loss": 0.3479, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 7.014925373134329, |
|
"grad_norm": 0.2757323086261749, |
|
"learning_rate": 0.0001892851887833657, |
|
"loss": 0.333, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 7.025586353944563, |
|
"grad_norm": 0.25871726870536804, |
|
"learning_rate": 0.00018922925587014046, |
|
"loss": 0.3212, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 7.036247334754798, |
|
"grad_norm": 0.2494359016418457, |
|
"learning_rate": 0.00018917318565589772, |
|
"loss": 0.3248, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 7.046908315565032, |
|
"grad_norm": 0.2385275512933731, |
|
"learning_rate": 0.00018911697822691516, |
|
"loss": 0.3189, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 7.0575692963752665, |
|
"grad_norm": 0.2520158588886261, |
|
"learning_rate": 0.00018906063366968165, |
|
"loss": 0.3268, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 7.068230277185501, |
|
"grad_norm": 0.25822409987449646, |
|
"learning_rate": 0.00018900415207089708, |
|
"loss": 0.3169, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 7.0788912579957355, |
|
"grad_norm": 0.2619076669216156, |
|
"learning_rate": 0.00018894753351747214, |
|
"loss": 0.3279, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 7.08955223880597, |
|
"grad_norm": 0.30978551506996155, |
|
"learning_rate": 0.0001888907780965284, |
|
"loss": 0.327, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 7.100213219616204, |
|
"grad_norm": 0.25372347235679626, |
|
"learning_rate": 0.00018883388589539785, |
|
"loss": 0.3254, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 7.110874200426439, |
|
"grad_norm": 0.27630311250686646, |
|
"learning_rate": 0.0001887768570016231, |
|
"loss": 0.3291, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 7.121535181236673, |
|
"grad_norm": 0.2716643810272217, |
|
"learning_rate": 0.00018871969150295706, |
|
"loss": 0.3241, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 7.132196162046908, |
|
"grad_norm": 0.2678888440132141, |
|
"learning_rate": 0.00018866238948736278, |
|
"loss": 0.3304, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 7.142857142857143, |
|
"grad_norm": 0.2532709240913391, |
|
"learning_rate": 0.00018860495104301345, |
|
"loss": 0.3331, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 7.153518123667378, |
|
"grad_norm": 0.2671636939048767, |
|
"learning_rate": 0.0001885473762582921, |
|
"loss": 0.3315, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 7.164179104477612, |
|
"grad_norm": 0.2550068497657776, |
|
"learning_rate": 0.00018848966522179168, |
|
"loss": 0.3306, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 7.174840085287847, |
|
"grad_norm": 0.2700331211090088, |
|
"learning_rate": 0.00018843181802231465, |
|
"loss": 0.329, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 7.185501066098081, |
|
"grad_norm": 0.26168689131736755, |
|
"learning_rate": 0.00018837383474887314, |
|
"loss": 0.3327, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 7.196162046908316, |
|
"grad_norm": 0.24964787065982819, |
|
"learning_rate": 0.00018831571549068852, |
|
"loss": 0.3353, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 7.20682302771855, |
|
"grad_norm": 0.2676330804824829, |
|
"learning_rate": 0.00018825746033719149, |
|
"loss": 0.3316, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 7.217484008528785, |
|
"grad_norm": 0.25253960490226746, |
|
"learning_rate": 0.0001881990693780219, |
|
"loss": 0.3316, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 7.228144989339019, |
|
"grad_norm": 0.257114440202713, |
|
"learning_rate": 0.0001881405427030284, |
|
"loss": 0.3307, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 7.2388059701492535, |
|
"grad_norm": 0.25102248787879944, |
|
"learning_rate": 0.00018808188040226868, |
|
"loss": 0.3348, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 7.249466950959488, |
|
"grad_norm": 0.25489816069602966, |
|
"learning_rate": 0.000188023082566009, |
|
"loss": 0.3342, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 7.2601279317697225, |
|
"grad_norm": 0.27044063806533813, |
|
"learning_rate": 0.00018796414928472417, |
|
"loss": 0.3391, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 7.270788912579957, |
|
"grad_norm": 0.26209956407546997, |
|
"learning_rate": 0.00018790508064909746, |
|
"loss": 0.3318, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 7.281449893390192, |
|
"grad_norm": 0.25549113750457764, |
|
"learning_rate": 0.00018784587675002045, |
|
"loss": 0.3322, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 7.292110874200427, |
|
"grad_norm": 0.26465660333633423, |
|
"learning_rate": 0.00018778653767859274, |
|
"loss": 0.3319, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 7.302771855010661, |
|
"grad_norm": 0.2753106951713562, |
|
"learning_rate": 0.00018772706352612203, |
|
"loss": 0.3329, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 7.313432835820896, |
|
"grad_norm": 0.2526467740535736, |
|
"learning_rate": 0.00018766745438412384, |
|
"loss": 0.3311, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 7.32409381663113, |
|
"grad_norm": 0.2626464068889618, |
|
"learning_rate": 0.00018760771034432138, |
|
"loss": 0.3318, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 7.334754797441365, |
|
"grad_norm": 0.2631151080131531, |
|
"learning_rate": 0.0001875478314986455, |
|
"loss": 0.3453, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 7.345415778251599, |
|
"grad_norm": 0.25757527351379395, |
|
"learning_rate": 0.0001874878179392344, |
|
"loss": 0.3373, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 7.356076759061834, |
|
"grad_norm": 0.2395113706588745, |
|
"learning_rate": 0.0001874276697584336, |
|
"loss": 0.331, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 7.366737739872068, |
|
"grad_norm": 0.2804111838340759, |
|
"learning_rate": 0.0001873673870487958, |
|
"loss": 0.3378, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 7.377398720682303, |
|
"grad_norm": 0.24439595639705658, |
|
"learning_rate": 0.00018730696990308069, |
|
"loss": 0.3381, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 7.388059701492537, |
|
"grad_norm": 0.270958811044693, |
|
"learning_rate": 0.00018724641841425478, |
|
"loss": 0.3418, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 7.398720682302772, |
|
"grad_norm": 0.2635878324508667, |
|
"learning_rate": 0.0001871857326754914, |
|
"loss": 0.3433, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 7.409381663113006, |
|
"grad_norm": 0.24128612875938416, |
|
"learning_rate": 0.00018712491278017032, |
|
"loss": 0.3395, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 7.4200426439232405, |
|
"grad_norm": 0.2588317096233368, |
|
"learning_rate": 0.00018706395882187783, |
|
"loss": 0.3415, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 7.430703624733475, |
|
"grad_norm": 0.2590773105621338, |
|
"learning_rate": 0.0001870028708944065, |
|
"loss": 0.3392, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 7.44136460554371, |
|
"grad_norm": 0.25688695907592773, |
|
"learning_rate": 0.00018694164909175505, |
|
"loss": 0.3385, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 7.452025586353945, |
|
"grad_norm": 0.23704120516777039, |
|
"learning_rate": 0.00018688029350812817, |
|
"loss": 0.3356, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 7.462686567164179, |
|
"grad_norm": 0.2817398011684418, |
|
"learning_rate": 0.00018681880423793642, |
|
"loss": 0.3368, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 7.473347547974414, |
|
"grad_norm": 0.2590171694755554, |
|
"learning_rate": 0.00018675718137579607, |
|
"loss": 0.3382, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 7.484008528784648, |
|
"grad_norm": 0.2843134105205536, |
|
"learning_rate": 0.00018669542501652896, |
|
"loss": 0.3304, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 7.494669509594883, |
|
"grad_norm": 0.25284621119499207, |
|
"learning_rate": 0.00018663353525516234, |
|
"loss": 0.3337, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 7.505330490405117, |
|
"grad_norm": 0.24715737998485565, |
|
"learning_rate": 0.00018657151218692873, |
|
"loss": 0.3373, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 7.515991471215352, |
|
"grad_norm": 0.28074926137924194, |
|
"learning_rate": 0.0001865093559072658, |
|
"loss": 0.3376, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 7.526652452025586, |
|
"grad_norm": 0.2531152367591858, |
|
"learning_rate": 0.00018644706651181614, |
|
"loss": 0.3329, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 7.537313432835821, |
|
"grad_norm": 0.27217596769332886, |
|
"learning_rate": 0.00018638464409642723, |
|
"loss": 0.3486, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 7.547974413646055, |
|
"grad_norm": 0.2517159581184387, |
|
"learning_rate": 0.0001863220887571512, |
|
"loss": 0.343, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 7.55863539445629, |
|
"grad_norm": 0.2538190484046936, |
|
"learning_rate": 0.00018625940059024477, |
|
"loss": 0.3343, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 7.569296375266525, |
|
"grad_norm": 0.26679527759552, |
|
"learning_rate": 0.00018619657969216893, |
|
"loss": 0.348, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 7.5799573560767595, |
|
"grad_norm": 0.24433985352516174, |
|
"learning_rate": 0.00018613362615958905, |
|
"loss": 0.3455, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 7.590618336886994, |
|
"grad_norm": 0.2719508111476898, |
|
"learning_rate": 0.0001860705400893745, |
|
"loss": 0.3414, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 7.601279317697228, |
|
"grad_norm": 0.2666242718696594, |
|
"learning_rate": 0.00018600732157859863, |
|
"loss": 0.3384, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 7.611940298507463, |
|
"grad_norm": 0.24249517917633057, |
|
"learning_rate": 0.00018594397072453856, |
|
"loss": 0.339, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 7.622601279317697, |
|
"grad_norm": 0.2475687712430954, |
|
"learning_rate": 0.00018588048762467502, |
|
"loss": 0.3428, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 7.633262260127932, |
|
"grad_norm": 0.2500527799129486, |
|
"learning_rate": 0.00018581687237669234, |
|
"loss": 0.3332, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 7.643923240938166, |
|
"grad_norm": 0.2528587281703949, |
|
"learning_rate": 0.0001857531250784781, |
|
"loss": 0.3429, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 7.654584221748401, |
|
"grad_norm": 0.2627830505371094, |
|
"learning_rate": 0.0001856892458281231, |
|
"loss": 0.3396, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 7.665245202558635, |
|
"grad_norm": 0.2573624849319458, |
|
"learning_rate": 0.00018562523472392118, |
|
"loss": 0.3391, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 7.67590618336887, |
|
"grad_norm": 0.2411065399646759, |
|
"learning_rate": 0.0001855610918643691, |
|
"loss": 0.3384, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 7.686567164179104, |
|
"grad_norm": 0.2589527666568756, |
|
"learning_rate": 0.00018549681734816623, |
|
"loss": 0.3429, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 7.697228144989339, |
|
"grad_norm": 0.2436107099056244, |
|
"learning_rate": 0.00018543241127421474, |
|
"loss": 0.3435, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 7.707889125799573, |
|
"grad_norm": 0.272020161151886, |
|
"learning_rate": 0.00018536787374161902, |
|
"loss": 0.3418, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 7.718550106609808, |
|
"grad_norm": 0.26080530881881714, |
|
"learning_rate": 0.00018530320484968588, |
|
"loss": 0.3367, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 7.729211087420042, |
|
"grad_norm": 0.2503691613674164, |
|
"learning_rate": 0.0001852384046979242, |
|
"loss": 0.3367, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 7.7398720682302775, |
|
"grad_norm": 0.26822352409362793, |
|
"learning_rate": 0.0001851734733860449, |
|
"loss": 0.3498, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 7.750533049040512, |
|
"grad_norm": 0.28552523255348206, |
|
"learning_rate": 0.00018510841101396062, |
|
"loss": 0.3406, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 7.7611940298507465, |
|
"grad_norm": 0.2446276843547821, |
|
"learning_rate": 0.0001850432176817857, |
|
"loss": 0.3465, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 7.771855010660981, |
|
"grad_norm": 0.24052871763706207, |
|
"learning_rate": 0.00018497789348983606, |
|
"loss": 0.3434, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 7.782515991471215, |
|
"grad_norm": 0.23899152874946594, |
|
"learning_rate": 0.00018491243853862893, |
|
"loss": 0.3365, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 7.79317697228145, |
|
"grad_norm": 0.24732346832752228, |
|
"learning_rate": 0.00018484685292888278, |
|
"loss": 0.3382, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 7.803837953091684, |
|
"grad_norm": 0.2519215941429138, |
|
"learning_rate": 0.00018478113676151703, |
|
"loss": 0.3463, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 7.814498933901919, |
|
"grad_norm": 0.24091705679893494, |
|
"learning_rate": 0.00018471529013765209, |
|
"loss": 0.3404, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 7.825159914712153, |
|
"grad_norm": 0.2794884443283081, |
|
"learning_rate": 0.0001846493131586091, |
|
"loss": 0.3469, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 7.835820895522388, |
|
"grad_norm": 0.24296560883522034, |
|
"learning_rate": 0.00018458320592590975, |
|
"loss": 0.3434, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 7.846481876332622, |
|
"grad_norm": 0.24800756573677063, |
|
"learning_rate": 0.00018451696854127617, |
|
"loss": 0.3384, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 7.857142857142857, |
|
"grad_norm": 0.2350349873304367, |
|
"learning_rate": 0.0001844506011066308, |
|
"loss": 0.3428, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 7.867803837953092, |
|
"grad_norm": 0.2573322355747223, |
|
"learning_rate": 0.0001843841037240961, |
|
"loss": 0.3463, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 7.878464818763327, |
|
"grad_norm": 0.256381630897522, |
|
"learning_rate": 0.00018431747649599463, |
|
"loss": 0.3397, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 7.889125799573561, |
|
"grad_norm": 0.23707297444343567, |
|
"learning_rate": 0.0001842507195248486, |
|
"loss": 0.3437, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 7.899786780383796, |
|
"grad_norm": 0.24699944257736206, |
|
"learning_rate": 0.00018418383291337988, |
|
"loss": 0.3398, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 7.91044776119403, |
|
"grad_norm": 0.25237977504730225, |
|
"learning_rate": 0.00018411681676450999, |
|
"loss": 0.3409, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 7.9211087420042645, |
|
"grad_norm": 0.2656485438346863, |
|
"learning_rate": 0.00018404967118135955, |
|
"loss": 0.3487, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 7.931769722814499, |
|
"grad_norm": 0.23709309101104736, |
|
"learning_rate": 0.0001839823962672485, |
|
"loss": 0.3398, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 7.9424307036247335, |
|
"grad_norm": 0.24946698546409607, |
|
"learning_rate": 0.00018391499212569573, |
|
"loss": 0.3459, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 7.953091684434968, |
|
"grad_norm": 0.2608436346054077, |
|
"learning_rate": 0.00018384745886041898, |
|
"loss": 0.3394, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 7.963752665245202, |
|
"grad_norm": 0.2503463625907898, |
|
"learning_rate": 0.00018377979657533468, |
|
"loss": 0.3436, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 7.974413646055437, |
|
"grad_norm": 0.2556673586368561, |
|
"learning_rate": 0.0001837120053745578, |
|
"loss": 0.3519, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 7.985074626865671, |
|
"grad_norm": 0.24612018465995789, |
|
"learning_rate": 0.0001836440853624017, |
|
"loss": 0.3388, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 7.995735607675906, |
|
"grad_norm": 0.26963427662849426, |
|
"learning_rate": 0.00018357603664337786, |
|
"loss": 0.3403, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.5337910056114197, |
|
"eval_runtime": 377.6371, |
|
"eval_samples_per_second": 1.091, |
|
"eval_steps_per_second": 1.091, |
|
"step": 3752 |
|
}, |
|
{ "epoch": 8.00639658848614, "grad_norm": 0.2208224982023239, "learning_rate": 0.00018350785932219588, "loss": 0.3081, "step": 3755 },
{ "epoch": 8.017057569296375, "grad_norm": 0.30632683634757996, "learning_rate": 0.00018343955350376325, "loss": 0.2978, "step": 3760 },
{ "epoch": 8.02771855010661, "grad_norm": 0.25390052795410156, "learning_rate": 0.00018337111929318516, "loss": 0.2948, "step": 3765 },
{ "epoch": 8.038379530916844, "grad_norm": 0.296369731426239, "learning_rate": 0.00018330255679576438, "loss": 0.2963, "step": 3770 },
{ "epoch": 8.049040511727078, "grad_norm": 0.2958175837993622, "learning_rate": 0.00018323386611700105, "loss": 0.2905, "step": 3775 },
{ "epoch": 8.059701492537313, "grad_norm": 0.2595365047454834, "learning_rate": 0.00018316504736259255, "loss": 0.2918, "step": 3780 },
{ "epoch": 8.070362473347547, "grad_norm": 0.2825353145599365, "learning_rate": 0.00018309610063843337, "loss": 0.3, "step": 3785 },
{ "epoch": 8.081023454157782, "grad_norm": 0.2677433490753174, "learning_rate": 0.00018302702605061492, "loss": 0.2964, "step": 3790 },
{ "epoch": 8.091684434968018, "grad_norm": 0.28075000643730164, "learning_rate": 0.00018295782370542532, "loss": 0.2979, "step": 3795 },
{ "epoch": 8.102345415778252, "grad_norm": 0.2629709243774414, "learning_rate": 0.00018288849370934926, "loss": 0.3005, "step": 3800 },
{ "epoch": 8.113006396588487, "grad_norm": 0.2850215435028076, "learning_rate": 0.00018281903616906796, "loss": 0.2976, "step": 3805 },
{ "epoch": 8.123667377398721, "grad_norm": 0.29631924629211426, "learning_rate": 0.0001827494511914587, "loss": 0.2938, "step": 3810 },
{ "epoch": 8.134328358208956, "grad_norm": 0.26315709948539734, "learning_rate": 0.00018267973888359509, "loss": 0.3021, "step": 3815 },
{ "epoch": 8.14498933901919, "grad_norm": 0.30577051639556885, "learning_rate": 0.0001826098993527465, "loss": 0.2996, "step": 3820 },
{ "epoch": 8.155650319829425, "grad_norm": 0.2897678315639496, "learning_rate": 0.0001825399327063781, "loss": 0.3048, "step": 3825 },
{ "epoch": 8.16631130063966, "grad_norm": 0.3003354072570801, "learning_rate": 0.00018246983905215075, "loss": 0.3075, "step": 3830 },
{ "epoch": 8.176972281449894, "grad_norm": 0.28864815831184387, "learning_rate": 0.00018239961849792055, "loss": 0.3091, "step": 3835 },
{ "epoch": 8.187633262260128, "grad_norm": 0.28102535009384155, "learning_rate": 0.0001823292711517391, "loss": 0.2969, "step": 3840 },
{ "epoch": 8.198294243070363, "grad_norm": 0.2669455409049988, "learning_rate": 0.00018225879712185293, "loss": 0.3061, "step": 3845 },
{ "epoch": 8.208955223880597, "grad_norm": 0.2893795669078827, "learning_rate": 0.00018218819651670356, "loss": 0.3003, "step": 3850 },
{ "epoch": 8.219616204690832, "grad_norm": 0.31041857600212097, "learning_rate": 0.00018211746944492727, "loss": 0.3069, "step": 3855 },
{ "epoch": 8.230277185501066, "grad_norm": 0.2678110599517822, "learning_rate": 0.000182046616015355, "loss": 0.3023, "step": 3860 },
{ "epoch": 8.2409381663113, "grad_norm": 0.3051944375038147, "learning_rate": 0.00018197563633701196, "loss": 0.3095, "step": 3865 },
{ "epoch": 8.251599147121535, "grad_norm": 0.267646461725235, "learning_rate": 0.00018190453051911782, "loss": 0.3047, "step": 3870 },
{ "epoch": 8.26226012793177, "grad_norm": 0.27988821268081665, "learning_rate": 0.00018183329867108624, "loss": 0.3132, "step": 3875 },
{ "epoch": 8.272921108742004, "grad_norm": 0.293363094329834, "learning_rate": 0.0001817619409025248, "loss": 0.3054, "step": 3880 },
{ "epoch": 8.283582089552239, "grad_norm": 0.28679507970809937, "learning_rate": 0.00018169045732323492, "loss": 0.3049, "step": 3885 },
{ "epoch": 8.294243070362473, "grad_norm": 0.28792116045951843, "learning_rate": 0.0001816188480432115, "loss": 0.3112, "step": 3890 },
{ "epoch": 8.304904051172707, "grad_norm": 0.2938394844532013, "learning_rate": 0.00018154711317264297, "loss": 0.3101, "step": 3895 },
{ "epoch": 8.315565031982942, "grad_norm": 0.2776646316051483, "learning_rate": 0.00018147525282191093, "loss": 0.3046, "step": 3900 },
{ "epoch": 8.326226012793176, "grad_norm": 0.2619486153125763, "learning_rate": 0.00018140326710159007, "loss": 0.3066, "step": 3905 },
{ "epoch": 8.336886993603411, "grad_norm": 0.2895703911781311, "learning_rate": 0.00018133115612244807, "loss": 0.3122, "step": 3910 },
{ "epoch": 8.347547974413645, "grad_norm": 0.2928364872932434, "learning_rate": 0.00018125891999544525, "loss": 0.303, "step": 3915 },
{ "epoch": 8.35820895522388, "grad_norm": 0.27352485060691833, "learning_rate": 0.00018118655883173456, "loss": 0.301, "step": 3920 },
{ "epoch": 8.368869936034114, "grad_norm": 0.3004440665245056, "learning_rate": 0.00018111407274266136, "loss": 0.3084, "step": 3925 },
{ "epoch": 8.379530916844349, "grad_norm": 0.26515400409698486, "learning_rate": 0.00018104146183976316, "loss": 0.3052, "step": 3930 },
{ "epoch": 8.390191897654585, "grad_norm": 0.29159972071647644, "learning_rate": 0.00018096872623476963, "loss": 0.3018, "step": 3935 },
{ "epoch": 8.40085287846482, "grad_norm": 0.31077924370765686, "learning_rate": 0.00018089586603960224, "loss": 0.3139, "step": 3940 },
{ "epoch": 8.411513859275054, "grad_norm": 0.2826644480228424, "learning_rate": 0.00018082288136637422, "loss": 0.2955, "step": 3945 },
{ "epoch": 8.422174840085288, "grad_norm": 0.2825087308883667, "learning_rate": 0.00018074977232739031, "loss": 0.3127, "step": 3950 },
{ "epoch": 8.432835820895523, "grad_norm": 0.2901898920536041, "learning_rate": 0.0001806765390351467, "loss": 0.3099, "step": 3955 },
{ "epoch": 8.443496801705757, "grad_norm": 0.28308314085006714, "learning_rate": 0.00018060318160233063, "loss": 0.3122, "step": 3960 },
{ "epoch": 8.454157782515992, "grad_norm": 0.26890453696250916, "learning_rate": 0.00018052970014182046, "loss": 0.3156, "step": 3965 },
{ "epoch": 8.464818763326226, "grad_norm": 0.2962822914123535, "learning_rate": 0.00018045609476668545, "loss": 0.3184, "step": 3970 },
{ "epoch": 8.47547974413646, "grad_norm": 0.2848854959011078, "learning_rate": 0.00018038236559018533, "loss": 0.309, "step": 3975 },
{ "epoch": 8.486140724946695, "grad_norm": 0.3047114312648773, "learning_rate": 0.00018030851272577051, "loss": 0.3118, "step": 3980 },
{ "epoch": 8.49680170575693, "grad_norm": 0.28175976872444153, "learning_rate": 0.00018023453628708173, "loss": 0.3074, "step": 3985 },
{ "epoch": 8.507462686567164, "grad_norm": 0.27742594480514526, "learning_rate": 0.00018016043638794974, "loss": 0.3127, "step": 3990 },
{ "epoch": 8.518123667377399, "grad_norm": 0.28773581981658936, "learning_rate": 0.0001800862131423954, "loss": 0.3057, "step": 3995 },
{ "epoch": 8.528784648187633, "grad_norm": 0.2765009105205536, "learning_rate": 0.00018001186666462927, "loss": 0.3128, "step": 4000 },
{ "epoch": 8.539445628997868, "grad_norm": 0.2800111174583435, "learning_rate": 0.00017993739706905162, "loss": 0.3096, "step": 4005 },
{ "epoch": 8.550106609808102, "grad_norm": 0.30302369594573975, "learning_rate": 0.00017986280447025209, "loss": 0.3016, "step": 4010 },
{ "epoch": 8.560767590618337, "grad_norm": 0.2798007130622864, "learning_rate": 0.0001797880889830096, "loss": 0.3061, "step": 4015 },
{ "epoch": 8.571428571428571, "grad_norm": 0.29015523195266724, "learning_rate": 0.00017971325072229226, "loss": 0.3134, "step": 4020 },
{ "epoch": 8.582089552238806, "grad_norm": 0.3815457820892334, "learning_rate": 0.00017963828980325697, "loss": 0.3131, "step": 4025 },
{ "epoch": 8.59275053304904, "grad_norm": 0.2907319664955139, "learning_rate": 0.00017956320634124944, "loss": 0.314, "step": 4030 },
{ "epoch": 8.603411513859275, "grad_norm": 0.29612481594085693, "learning_rate": 0.00017948800045180393, "loss": 0.3168, "step": 4035 },
{ "epoch": 8.614072494669509, "grad_norm": 0.2797704339027405, "learning_rate": 0.00017941267225064306, "loss": 0.3144, "step": 4040 },
{ "epoch": 8.624733475479744, "grad_norm": 0.27811723947525024, "learning_rate": 0.00017933722185367774, "loss": 0.303, "step": 4045 },
{ "epoch": 8.635394456289978, "grad_norm": 0.2933618724346161, "learning_rate": 0.00017926164937700676, "loss": 0.3097, "step": 4050 },
{ "epoch": 8.646055437100213, "grad_norm": 0.282921701669693, "learning_rate": 0.0001791859549369169, "loss": 0.3104, "step": 4055 },
{ "epoch": 8.656716417910447, "grad_norm": 0.2758900225162506, "learning_rate": 0.00017911013864988252, "loss": 0.3108, "step": 4060 },
{ "epoch": 8.667377398720681, "grad_norm": 0.2904449999332428, "learning_rate": 0.00017903420063256555, "loss": 0.3209, "step": 4065 },
{ "epoch": 8.678038379530918, "grad_norm": 0.28849634528160095, "learning_rate": 0.00017895814100181515, "loss": 0.3055, "step": 4070 },
{ "epoch": 8.688699360341152, "grad_norm": 0.2709294259548187, "learning_rate": 0.0001788819598746677, "loss": 0.3167, "step": 4075 },
{ "epoch": 8.699360341151387, "grad_norm": 0.28200262784957886, "learning_rate": 0.0001788056573683464, "loss": 0.307, "step": 4080 },
{ "epoch": 8.710021321961621, "grad_norm": 0.27431854605674744, "learning_rate": 0.00017872923360026137, "loss": 0.3163, "step": 4085 },
{ "epoch": 8.720682302771856, "grad_norm": 0.28479164838790894, "learning_rate": 0.00017865268868800925, "loss": 0.3257, "step": 4090 },
{ "epoch": 8.73134328358209, "grad_norm": 0.2959545850753784, "learning_rate": 0.00017857602274937308, "loss": 0.3138, "step": 4095 },
{ "epoch": 8.742004264392325, "grad_norm": 0.270533949136734, "learning_rate": 0.00017849923590232213, "loss": 0.3182, "step": 4100 },
{ "epoch": 8.752665245202559, "grad_norm": 0.26438501477241516, "learning_rate": 0.0001784223282650118, "loss": 0.3084, "step": 4105 },
{ "epoch": 8.763326226012794, "grad_norm": 0.2890710234642029, "learning_rate": 0.00017834529995578317, "loss": 0.3093, "step": 4110 },
{ "epoch": 8.773987206823028, "grad_norm": 0.2725368142127991, "learning_rate": 0.0001782681510931632, "loss": 0.3185, "step": 4115 },
{ "epoch": 8.784648187633262, "grad_norm": 0.2648097276687622, "learning_rate": 0.00017819088179586427, "loss": 0.3126, "step": 4120 },
{ "epoch": 8.795309168443497, "grad_norm": 0.27868813276290894, "learning_rate": 0.00017811349218278407, "loss": 0.3157, "step": 4125 },
{ "epoch": 8.805970149253731, "grad_norm": 0.3133993446826935, "learning_rate": 0.00017803598237300537, "loss": 0.3128, "step": 4130 },
{ "epoch": 8.816631130063966, "grad_norm": 0.270416796207428, "learning_rate": 0.00017795835248579606, "loss": 0.3087, "step": 4135 },
{ "epoch": 8.8272921108742, "grad_norm": 0.299452543258667, "learning_rate": 0.00017788060264060864, "loss": 0.3126, "step": 4140 },
{ "epoch": 8.837953091684435, "grad_norm": 0.2789115607738495, "learning_rate": 0.00017780273295708025, "loss": 0.3149, "step": 4145 },
{ "epoch": 8.84861407249467, "grad_norm": 0.2616700828075409, "learning_rate": 0.0001777247435550324, "loss": 0.3151, "step": 4150 },
{ "epoch": 8.859275053304904, "grad_norm": 0.2998231053352356, "learning_rate": 0.0001776466345544709, "loss": 0.3143, "step": 4155 },
{ "epoch": 8.869936034115138, "grad_norm": 0.2851693034172058, "learning_rate": 0.00017756840607558553, "loss": 0.3153, "step": 4160 },
{ "epoch": 8.880597014925373, "grad_norm": 0.2862933874130249, "learning_rate": 0.00017749005823874988, "loss": 0.3124, "step": 4165 },
{ "epoch": 8.891257995735607, "grad_norm": 0.29242345690727234, "learning_rate": 0.00017741159116452132, "loss": 0.3137, "step": 4170 },
{ "epoch": 8.901918976545842, "grad_norm": 0.3226570188999176, "learning_rate": 0.00017733300497364054, "loss": 0.3168, "step": 4175 },
{ "epoch": 8.912579957356076, "grad_norm": 0.31018882989883423, "learning_rate": 0.00017725429978703163, "loss": 0.3162, "step": 4180 },
{ "epoch": 8.92324093816631, "grad_norm": 0.30581411719322205, "learning_rate": 0.00017717547572580178, "loss": 0.3166, "step": 4185 },
{ "epoch": 8.933901918976545, "grad_norm": 0.27954214811325073, "learning_rate": 0.00017709653291124103, "loss": 0.3175, "step": 4190 },
{ "epoch": 8.94456289978678, "grad_norm": 0.2803252041339874, "learning_rate": 0.00017701747146482222, "loss": 0.3228, "step": 4195 },
{ "epoch": 8.955223880597014, "grad_norm": 0.27694806456565857, "learning_rate": 0.00017693829150820068, "loss": 0.3152, "step": 4200 },
{ "epoch": 8.96588486140725, "grad_norm": 0.2755722403526306, "learning_rate": 0.00017685899316321422, "loss": 0.3105, "step": 4205 },
{ "epoch": 8.976545842217483, "grad_norm": 0.26287201046943665, "learning_rate": 0.00017677957655188258, "loss": 0.3146, "step": 4210 },
{ "epoch": 8.98720682302772, "grad_norm": 0.2679538428783417, "learning_rate": 0.00017670004179640774, "loss": 0.3196, "step": 4215 },
{ "epoch": 8.997867803837954, "grad_norm": 0.2998240292072296, "learning_rate": 0.0001766203890191733, "loss": 0.311, "step": 4220 },
{ "epoch": 9.0, "eval_loss": 0.556614875793457, "eval_runtime": 377.56, "eval_samples_per_second": 1.091, "eval_steps_per_second": 1.091, "step": 4221 },
{ "epoch": 9.008528784648188, "grad_norm": 0.2680657207965851, "learning_rate": 0.00017654061834274453, "loss": 0.2787, "step": 4225 },
{ "epoch": 9.019189765458423, "grad_norm": 0.28186333179473877, "learning_rate": 0.00017646072988986816, "loss": 0.2668, "step": 4230 },
{ "epoch": 9.029850746268657, "grad_norm": 0.3159712255001068, "learning_rate": 0.00017638072378347203, "loss": 0.2681, "step": 4235 },
{ "epoch": 9.040511727078892, "grad_norm": 0.29439476132392883, "learning_rate": 0.00017630060014666514, "loss": 0.2644, "step": 4240 },
{ "epoch": 9.051172707889126, "grad_norm": 0.27110064029693604, "learning_rate": 0.00017622035910273726, "loss": 0.2645, "step": 4245 },
{ "epoch": 9.06183368869936, "grad_norm": 0.3253141939640045, "learning_rate": 0.00017614000077515886, "loss": 0.2668, "step": 4250 },
{ "epoch": 9.072494669509595, "grad_norm": 0.27271440625190735, "learning_rate": 0.00017605952528758085, "loss": 0.2636, "step": 4255 },
{ "epoch": 9.08315565031983, "grad_norm": 0.3024181127548218, "learning_rate": 0.00017597893276383446, "loss": 0.2651, "step": 4260 },
{ "epoch": 9.093816631130064, "grad_norm": 0.29704058170318604, "learning_rate": 0.00017589822332793098, "loss": 0.2705, "step": 4265 },
{ "epoch": 9.104477611940299, "grad_norm": 0.3102332055568695, "learning_rate": 0.0001758173971040616, "loss": 0.2645, "step": 4270 },
{ "epoch": 9.115138592750533, "grad_norm": 0.28398755192756653, "learning_rate": 0.00017573645421659715, "loss": 0.2695, "step": 4275 },
{ "epoch": 9.125799573560768, "grad_norm": 0.3188519775867462, "learning_rate": 0.00017565539479008814, "loss": 0.272, "step": 4280 },
{ "epoch": 9.136460554371002, "grad_norm": 0.30803632736206055, "learning_rate": 0.0001755742189492643, "loss": 0.268, "step": 4285 },
{ "epoch": 9.147121535181236, "grad_norm": 0.3042227327823639, "learning_rate": 0.00017549292681903444, "loss": 0.2659, "step": 4290 },
{ "epoch": 9.157782515991471, "grad_norm": 0.3055075407028198, "learning_rate": 0.00017541151852448644, "loss": 0.2705, "step": 4295 },
{ "epoch": 9.168443496801705, "grad_norm": 0.3084838092327118, "learning_rate": 0.00017532999419088682, "loss": 0.2711, "step": 4300 },
{ "epoch": 9.17910447761194, "grad_norm": 0.3110904395580292, "learning_rate": 0.00017524835394368065, "loss": 0.2678, "step": 4305 },
{ "epoch": 9.189765458422174, "grad_norm": 0.3138080835342407, "learning_rate": 0.0001751665979084915, "loss": 0.2715, "step": 4310 },
{ "epoch": 9.200426439232409, "grad_norm": 0.2787773609161377, "learning_rate": 0.00017508472621112093, "loss": 0.2764, "step": 4315 },
{ "epoch": 9.211087420042643, "grad_norm": 0.31073546409606934, "learning_rate": 0.0001750027389775486, "loss": 0.2745, "step": 4320 },
{ "epoch": 9.221748400852878, "grad_norm": 0.3100415766239166, "learning_rate": 0.00017492063633393188, "loss": 0.2731, "step": 4325 },
{ "epoch": 9.232409381663112, "grad_norm": 0.300081342458725, "learning_rate": 0.00017483841840660577, "loss": 0.2711, "step": 4330 },
{ "epoch": 9.243070362473347, "grad_norm": 0.31163203716278076, "learning_rate": 0.0001747560853220826, "loss": 0.2786, "step": 4335 },
{ "epoch": 9.253731343283581, "grad_norm": 0.33607375621795654, "learning_rate": 0.00017467363720705204, "loss": 0.2728, "step": 4340 },
{ "epoch": 9.264392324093816, "grad_norm": 0.300729900598526, "learning_rate": 0.0001745910741883806, "loss": 0.2749, "step": 4345 },
{ "epoch": 9.275053304904052, "grad_norm": 0.3036794364452362, "learning_rate": 0.00017450839639311162, "loss": 0.2726, "step": 4350 },
{ "epoch": 9.285714285714286, "grad_norm": 0.32798221707344055, "learning_rate": 0.00017442560394846516, "loss": 0.2752, "step": 4355 },
{ "epoch": 9.296375266524521, "grad_norm": 0.2973875105381012, "learning_rate": 0.00017434269698183763, "loss": 0.2743, "step": 4360 },
{ "epoch": 9.307036247334755, "grad_norm": 0.3339863717556, "learning_rate": 0.00017425967562080167, "loss": 0.2766, "step": 4365 },
{ "epoch": 9.31769722814499, "grad_norm": 0.30738508701324463, "learning_rate": 0.00017417653999310585, "loss": 0.2728, "step": 4370 },
{ "epoch": 9.328358208955224, "grad_norm": 0.3430582284927368, "learning_rate": 0.0001740932902266747, "loss": 0.2744, "step": 4375 },
{ "epoch": 9.339019189765459, "grad_norm": 0.2887689769268036, "learning_rate": 0.00017400992644960842, "loss": 0.2772, "step": 4380 },
{ "epoch": 9.349680170575693, "grad_norm": 0.3249075412750244, "learning_rate": 0.0001739264487901824, "loss": 0.2757, "step": 4385 },
{ "epoch": 9.360341151385928, "grad_norm": 0.31958818435668945, "learning_rate": 0.00017384285737684753, "loss": 0.2744, "step": 4390 },
{ "epoch": 9.371002132196162, "grad_norm": 0.31824401021003723, "learning_rate": 0.0001737591523382296, "loss": 0.2809, "step": 4395 },
{ "epoch": 9.381663113006397, "grad_norm": 0.3125913143157959, "learning_rate": 0.00017367533380312924, "loss": 0.276, "step": 4400 },
{ "epoch": 9.392324093816631, "grad_norm": 0.32215094566345215, "learning_rate": 0.0001735914019005218, "loss": 0.2746, "step": 4405 },
{ "epoch": 9.402985074626866, "grad_norm": 0.3145129382610321, "learning_rate": 0.00017350735675955697, "loss": 0.2818, "step": 4410 },
{ "epoch": 9.4136460554371, "grad_norm": 0.3180083930492401, "learning_rate": 0.0001734231985095588, "loss": 0.2782, "step": 4415 },
{ "epoch": 9.424307036247335, "grad_norm": 0.307829350233078, "learning_rate": 0.00017333892728002527, "loss": 0.2744, "step": 4420 },
{ "epoch": 9.43496801705757, "grad_norm": 0.3098660111427307, "learning_rate": 0.00017325454320062832, "loss": 0.2794, "step": 4425 },
{ "epoch": 9.445628997867804, "grad_norm": 0.2991037666797638, "learning_rate": 0.0001731700464012134, "loss": 0.2778, "step": 4430 },
{ "epoch": 9.456289978678038, "grad_norm": 0.3197588622570038, "learning_rate": 0.0001730854370117996, "loss": 0.2764, "step": 4435 },
{ "epoch": 9.466950959488273, "grad_norm": 0.31818678975105286, "learning_rate": 0.00017300071516257904, "loss": 0.2754, "step": 4440 },
{ "epoch": 9.477611940298507, "grad_norm": 0.3030422031879425, "learning_rate": 0.000172915880983917, "loss": 0.2795, "step": 4445 },
{ "epoch": 9.488272921108742, "grad_norm": 0.304565966129303, "learning_rate": 0.00017283093460635166, "loss": 0.2837, "step": 4450 },
{ "epoch": 9.498933901918976, "grad_norm": 0.3034186363220215, "learning_rate": 0.00017274587616059376, "loss": 0.2768, "step": 4455 },
{ "epoch": 9.50959488272921, "grad_norm": 0.30095112323760986, "learning_rate": 0.00017266070577752647, "loss": 0.2786, "step": 4460 },
{ "epoch": 9.520255863539445, "grad_norm": 0.3102254271507263, "learning_rate": 0.0001725754235882053, "loss": 0.2776, "step": 4465 },
{ "epoch": 9.53091684434968, "grad_norm": 0.2985278367996216, "learning_rate": 0.00017249002972385765, "loss": 0.2784, "step": 4470 },
{ "epoch": 9.541577825159914, "grad_norm": 0.32831713557243347, "learning_rate": 0.00017240452431588294, "loss": 0.2869, "step": 4475 },
{ "epoch": 9.552238805970148, "grad_norm": 0.3177868127822876, "learning_rate": 0.0001723189074958521, "loss": 0.2784, "step": 4480 },
{ "epoch": 9.562899786780385, "grad_norm": 0.3071228265762329, "learning_rate": 0.00017223317939550753, "loss": 0.2804, "step": 4485 },
{ "epoch": 9.57356076759062, "grad_norm": 0.3183000981807709, "learning_rate": 0.00017214734014676288, "loss": 0.2799, "step": 4490 },
{ "epoch": 9.584221748400854, "grad_norm": 0.33166825771331787, "learning_rate": 0.00017206138988170281, "loss": 0.2828, "step": 4495 },
{ "epoch": 9.594882729211088, "grad_norm": 0.3132229149341583, "learning_rate": 0.0001719753287325828, "loss": 0.279, "step": 4500 },
{ "epoch": 9.605543710021323, "grad_norm": 0.3281535506248474, "learning_rate": 0.00017188915683182896, "loss": 0.2767, "step": 4505 },
{ "epoch": 9.616204690831557, "grad_norm": 0.31389063596725464, "learning_rate": 0.00017180287431203781, "loss": 0.2851, "step": 4510 },
{ "epoch": 9.626865671641792, "grad_norm": 0.315807580947876, "learning_rate": 0.00017171648130597612, "loss": 0.2816, "step": 4515 },
{ "epoch": 9.637526652452026, "grad_norm": 0.3103027939796448, "learning_rate": 0.0001716299779465806, "loss": 0.2797, "step": 4520 },
{ "epoch": 9.64818763326226, "grad_norm": 0.3018797039985657, "learning_rate": 0.00017154336436695785, "loss": 0.2827, "step": 4525 },
{ "epoch": 9.658848614072495, "grad_norm": 0.3306185007095337, "learning_rate": 0.00017145664070038406, "loss": 0.2861, "step": 4530 },
{ "epoch": 9.66950959488273, "grad_norm": 0.3151242434978485, "learning_rate": 0.0001713698070803047, "loss": 0.2855, "step": 4535 },
{ "epoch": 9.680170575692964, "grad_norm": 0.3073995113372803, "learning_rate": 0.0001712828636403346, "loss": 0.2825, "step": 4540 },
{ "epoch": 9.690831556503198, "grad_norm": 0.31615933775901794, "learning_rate": 0.00017119581051425742, "loss": 0.2791, "step": 4545 },
{ "epoch": 9.701492537313433, "grad_norm": 0.3101312816143036, "learning_rate": 0.0001711086478360257, "loss": 0.287, "step": 4550 },
{ "epoch": 9.712153518123667, "grad_norm": 0.3094468116760254, "learning_rate": 0.00017102137573976058, "loss": 0.2804, "step": 4555 },
{ "epoch": 9.722814498933902, "grad_norm": 0.33349186182022095, "learning_rate": 0.00017093399435975142, "loss": 0.2773, "step": 4560 },
{ "epoch": 9.733475479744136, "grad_norm": 0.2954055368900299, "learning_rate": 0.00017084650383045587, "loss": 0.2762, "step": 4565 },
{ "epoch": 9.74413646055437, "grad_norm": 0.2962237000465393, "learning_rate": 0.0001707589042864995, "loss": 0.2861, "step": 4570 },
{ "epoch": 9.754797441364605, "grad_norm": 0.3323478698730469, "learning_rate": 0.00017067119586267556, "loss": 0.2861, "step": 4575 },
{ "epoch": 9.76545842217484, "grad_norm": 0.2926410138607025, "learning_rate": 0.000170583378693945, "loss": 0.2817, "step": 4580 },
{ "epoch": 9.776119402985074, "grad_norm": 0.3227819502353668, "learning_rate": 0.0001704954529154359, "loss": 0.2884, "step": 4585 },
{ "epoch": 9.786780383795309, "grad_norm": 0.32089999318122864, "learning_rate": 0.00017040741866244358, "loss": 0.2881, "step": 4590 },
{ "epoch": 9.797441364605543, "grad_norm": 0.3188937306404114, "learning_rate": 0.0001703192760704303, "loss": 0.2855, "step": 4595 },
{ "epoch": 9.808102345415778, "grad_norm": 0.3184082508087158, "learning_rate": 0.00017023102527502496, "loss": 0.2842, "step": 4600 },
{ "epoch": 9.818763326226012, "grad_norm": 0.2914822995662689, "learning_rate": 0.00017014266641202292, "loss": 0.274, "step": 4605 },
{ "epoch": 9.829424307036247, "grad_norm": 0.33117881417274475, "learning_rate": 0.00017005419961738593, "loss": 0.2888, "step": 4610 },
{ "epoch": 9.840085287846481, "grad_norm": 0.32017573714256287, "learning_rate": 0.0001699656250272418, "loss": 0.2785, "step": 4615 },
{ "epoch": 9.850746268656717, "grad_norm": 0.29259586334228516, "learning_rate": 0.00016987694277788417, "loss": 0.2888, "step": 4620 },
{ "epoch": 9.86140724946695, "grad_norm": 0.29314401745796204, "learning_rate": 0.00016978815300577234, "loss": 0.2826, "step": 4625 },
{ "epoch": 9.872068230277186, "grad_norm": 0.3312009572982788, "learning_rate": 0.00016969925584753108, "loss": 0.2828, "step": 4630 },
{ "epoch": 9.88272921108742, "grad_norm": 0.31798672676086426, "learning_rate": 0.00016961025143995037, "loss": 0.2777, "step": 4635 },
{ "epoch": 9.893390191897655, "grad_norm": 0.2987801134586334, "learning_rate": 0.00016952113991998527, "loss": 0.2818, "step": 4640 },
{ "epoch": 9.90405117270789, "grad_norm": 0.3148316442966461, "learning_rate": 0.00016943192142475564, "loss": 0.2853, "step": 4645 },
{ "epoch": 9.914712153518124, "grad_norm": 0.3207818269729614, "learning_rate": 0.00016934259609154592, "loss": 0.2835, "step": 4650 },
{ "epoch": 9.925373134328359, "grad_norm": 0.29595887660980225, "learning_rate": 0.000169253164057805, "loss": 0.2845, "step": 4655 },
{ "epoch": 9.936034115138593, "grad_norm": 0.2958875894546509, "learning_rate": 0.00016916362546114585, "loss": 0.2793, "step": 4660 },
{ "epoch": 9.946695095948828, "grad_norm": 0.2999938726425171, "learning_rate": 0.00016907398043934557, "loss": 0.2794, "step": 4665 },
{ "epoch": 9.957356076759062, "grad_norm": 0.29154959321022034, "learning_rate": 0.00016898422913034486, "loss": 0.2891, "step": 4670 },
{ "epoch": 9.968017057569297, "grad_norm": 0.30298835039138794, "learning_rate": 0.0001688943716722481, "loss": 0.2859, "step": 4675 },
{ "epoch": 9.978678038379531, "grad_norm": 0.3251824975013733, "learning_rate": 0.00016880440820332291, "loss": 0.283, "step": 4680 },
{ "epoch": 9.989339019189766, "grad_norm": 0.29153597354888916, "learning_rate": 0.0001687143388620001, "loss": 0.2871, "step": 4685 },
{ "epoch": 10.0, "grad_norm": 0.3233014643192291, "learning_rate": 0.0001686241637868734, "loss": 0.2853, "step": 4690 },
{ "epoch": 10.0, "eval_loss": 0.5920408368110657, "eval_runtime": 377.5422, "eval_samples_per_second": 1.091, "eval_steps_per_second": 1.091, "step": 4690 },
{ "epoch": 10.0, "step": 4690, "total_flos": 3.4794514845867704e+18, "train_loss": 0.46453510172077334, "train_runtime": 112907.655, "train_samples_per_second": 0.997, "train_steps_per_second": 0.125 }
],
"logging_steps": 5,
"max_steps": 14070,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 50,
"stateful_callbacks": {
  "EarlyStoppingCallback": {
    "args": {
      "early_stopping_patience": 5,
      "early_stopping_threshold": 0.0
    },
    "attributes": {
      "early_stopping_patience_counter": 0
    }
  },
  "TrainerControl": {
    "args": {
      "should_epoch_stop": false,
      "should_evaluate": false,
      "should_log": false,
      "should_save": true,
      "should_training_stop": true
    },
    "attributes": {}
  }
},
"total_flos": 3.4794514845867704e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}