|
{
  "best_metric": 0.23624150454998016,
  "best_model_checkpoint": "data/hansken_human_hql/checkpoint-511",
  "epoch": 9.995110024449877,
  "eval_steps": 500,
  "global_step": 1022,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009779951100244499,
      "grad_norm": 1.0709587335586548,
      "learning_rate": 9.80392156862745e-07,
      "loss": 1.4707,
      "step": 1
    },
    {
      "epoch": 0.0488997555012225,
      "grad_norm": 1.1860318183898926,
      "learning_rate": 4.901960784313726e-06,
      "loss": 1.4227,
      "step": 5
    },
    {
      "epoch": 0.097799511002445,
      "grad_norm": 1.145372986793518,
      "learning_rate": 9.803921568627451e-06,
      "loss": 1.444,
      "step": 10
    },
    {
      "epoch": 0.1466992665036675,
      "grad_norm": 1.0051456689834595,
      "learning_rate": 1.4705882352941177e-05,
      "loss": 1.3414,
      "step": 15
    },
    {
      "epoch": 0.19559902200489,
      "grad_norm": 0.5133008360862732,
      "learning_rate": 1.9607843137254903e-05,
      "loss": 1.2116,
      "step": 20
    },
    {
      "epoch": 0.24449877750611246,
      "grad_norm": 0.41832494735717773,
      "learning_rate": 2.4509803921568626e-05,
      "loss": 1.1424,
      "step": 25
    },
    {
      "epoch": 0.293398533007335,
      "grad_norm": 0.41829216480255127,
      "learning_rate": 2.9411764705882354e-05,
      "loss": 1.143,
      "step": 30
    },
    {
      "epoch": 0.3422982885085575,
      "grad_norm": 0.35495856404304504,
      "learning_rate": 3.431372549019608e-05,
      "loss": 1.0472,
      "step": 35
    },
    {
      "epoch": 0.39119804400978,
      "grad_norm": 0.40399229526519775,
      "learning_rate": 3.9215686274509805e-05,
      "loss": 0.9879,
      "step": 40
    },
    {
      "epoch": 0.4400977995110024,
      "grad_norm": 0.31430941820144653,
      "learning_rate": 4.411764705882353e-05,
      "loss": 0.9467,
      "step": 45
    },
    {
      "epoch": 0.4889975550122249,
      "grad_norm": 0.29712405800819397,
      "learning_rate": 4.901960784313725e-05,
      "loss": 0.885,
      "step": 50
    },
    {
      "epoch": 0.5378973105134475,
      "grad_norm": 0.40078112483024597,
      "learning_rate": 5.392156862745098e-05,
      "loss": 0.7973,
      "step": 55
    },
    {
      "epoch": 0.58679706601467,
      "grad_norm": 0.34199750423431396,
      "learning_rate": 5.882352941176471e-05,
      "loss": 0.776,
      "step": 60
    },
    {
      "epoch": 0.6356968215158925,
      "grad_norm": 0.4243955910205841,
      "learning_rate": 6.372549019607843e-05,
      "loss": 0.651,
      "step": 65
    },
    {
      "epoch": 0.684596577017115,
      "grad_norm": 0.30432572960853577,
      "learning_rate": 6.862745098039216e-05,
      "loss": 0.5769,
      "step": 70
    },
    {
      "epoch": 0.7334963325183375,
      "grad_norm": 0.27279356122016907,
      "learning_rate": 7.352941176470589e-05,
      "loss": 0.5436,
      "step": 75
    },
    {
      "epoch": 0.78239608801956,
      "grad_norm": 0.2576221823692322,
      "learning_rate": 7.843137254901961e-05,
      "loss": 0.501,
      "step": 80
    },
    {
      "epoch": 0.8312958435207825,
      "grad_norm": 0.22290439903736115,
      "learning_rate": 8.333333333333334e-05,
      "loss": 0.4915,
      "step": 85
    },
    {
      "epoch": 0.8801955990220048,
      "grad_norm": 0.21740856766700745,
      "learning_rate": 8.823529411764706e-05,
      "loss": 0.4487,
      "step": 90
    },
    {
      "epoch": 0.9290953545232273,
      "grad_norm": 0.21560043096542358,
      "learning_rate": 9.313725490196079e-05,
      "loss": 0.4351,
      "step": 95
    },
    {
      "epoch": 0.9779951100244498,
      "grad_norm": 0.2607389986515045,
      "learning_rate": 9.80392156862745e-05,
      "loss": 0.4508,
      "step": 100
    },
    {
      "epoch": 0.9975550122249389,
      "eval_loss": 0.44326770305633545,
      "eval_runtime": 398.4802,
      "eval_samples_per_second": 1.029,
      "eval_steps_per_second": 1.029,
      "step": 102
    },
    {
      "epoch": 1.0268948655256724,
      "grad_norm": 0.21888603270053864,
      "learning_rate": 0.00010294117647058823,
      "loss": 0.3968,
      "step": 105
    },
    {
      "epoch": 1.075794621026895,
      "grad_norm": 0.21742066740989685,
      "learning_rate": 0.00010784313725490196,
      "loss": 0.3785,
      "step": 110
    },
    {
      "epoch": 1.1246943765281174,
      "grad_norm": 0.2523965537548065,
      "learning_rate": 0.0001127450980392157,
      "loss": 0.4034,
      "step": 115
    },
    {
      "epoch": 1.17359413202934,
      "grad_norm": 0.2155005782842636,
      "learning_rate": 0.00011764705882352942,
      "loss": 0.3766,
      "step": 120
    },
    {
      "epoch": 1.2224938875305624,
      "grad_norm": 0.25576308369636536,
      "learning_rate": 0.00012254901960784316,
      "loss": 0.364,
      "step": 125
    },
    {
      "epoch": 1.271393643031785,
      "grad_norm": 0.2288295179605484,
      "learning_rate": 0.00012745098039215687,
      "loss": 0.3451,
      "step": 130
    },
    {
      "epoch": 1.3202933985330074,
      "grad_norm": 0.2045079469680786,
      "learning_rate": 0.0001323529411764706,
      "loss": 0.3425,
      "step": 135
    },
    {
      "epoch": 1.36919315403423,
      "grad_norm": 0.2297014445066452,
      "learning_rate": 0.0001372549019607843,
      "loss": 0.3658,
      "step": 140
    },
    {
      "epoch": 1.4180929095354524,
      "grad_norm": 0.2170581817626953,
      "learning_rate": 0.00014215686274509804,
      "loss": 0.3482,
      "step": 145
    },
    {
      "epoch": 1.466992665036675,
      "grad_norm": 0.2250969409942627,
      "learning_rate": 0.00014705882352941178,
      "loss": 0.3353,
      "step": 150
    },
    {
      "epoch": 1.5158924205378974,
      "grad_norm": 0.23191578686237335,
      "learning_rate": 0.00015196078431372549,
      "loss": 0.3271,
      "step": 155
    },
    {
      "epoch": 1.56479217603912,
      "grad_norm": 0.2477528601884842,
      "learning_rate": 0.00015686274509803922,
      "loss": 0.3549,
      "step": 160
    },
    {
      "epoch": 1.6136919315403424,
      "grad_norm": 0.20846064388751984,
      "learning_rate": 0.00016176470588235295,
      "loss": 0.3171,
      "step": 165
    },
    {
      "epoch": 1.662591687041565,
      "grad_norm": 0.21829602122306824,
      "learning_rate": 0.0001666666666666667,
      "loss": 0.3642,
      "step": 170
    },
    {
      "epoch": 1.7114914425427874,
      "grad_norm": 0.22842282056808472,
      "learning_rate": 0.0001715686274509804,
      "loss": 0.3116,
      "step": 175
    },
    {
      "epoch": 1.76039119804401,
      "grad_norm": 0.24106037616729736,
      "learning_rate": 0.00017647058823529413,
      "loss": 0.3066,
      "step": 180
    },
    {
      "epoch": 1.8092909535452324,
      "grad_norm": 0.25696486234664917,
      "learning_rate": 0.00018137254901960786,
      "loss": 0.3053,
      "step": 185
    },
    {
      "epoch": 1.858190709046455,
      "grad_norm": 0.22010771930217743,
      "learning_rate": 0.00018627450980392157,
      "loss": 0.3233,
      "step": 190
    },
    {
      "epoch": 1.9070904645476774,
      "grad_norm": 0.2373352199792862,
      "learning_rate": 0.0001911764705882353,
      "loss": 0.3102,
      "step": 195
    },
    {
      "epoch": 1.9559902200488999,
      "grad_norm": 0.21177123486995697,
      "learning_rate": 0.000196078431372549,
      "loss": 0.302,
      "step": 200
    },
    {
      "epoch": 1.9951100244498776,
      "eval_loss": 0.3139691650867462,
      "eval_runtime": 387.4792,
      "eval_samples_per_second": 1.058,
      "eval_steps_per_second": 1.058,
      "step": 204
    },
    {
      "epoch": 2.0048899755501224,
      "grad_norm": 0.2193712592124939,
      "learning_rate": 0.00019999985360565867,
      "loss": 0.2813,
      "step": 205
    },
    {
      "epoch": 2.053789731051345,
      "grad_norm": 0.3371932804584503,
      "learning_rate": 0.00019999472984871732,
      "loss": 0.2844,
      "step": 210
    },
    {
      "epoch": 2.1026894865525674,
      "grad_norm": 0.23578821122646332,
      "learning_rate": 0.00019998228680332932,
      "loss": 0.263,
      "step": 215
    },
    {
      "epoch": 2.15158924205379,
      "grad_norm": 0.27435311675071716,
      "learning_rate": 0.00019996252538028507,
      "loss": 0.2752,
      "step": 220
    },
    {
      "epoch": 2.2004889975550124,
      "grad_norm": 0.24362725019454956,
      "learning_rate": 0.00019993544702605638,
      "loss": 0.2572,
      "step": 225
    },
    {
      "epoch": 2.249388753056235,
      "grad_norm": 0.24360118806362152,
      "learning_rate": 0.0001999010537226905,
      "loss": 0.3191,
      "step": 230
    },
    {
      "epoch": 2.2982885085574574,
      "grad_norm": 0.2612737715244293,
      "learning_rate": 0.0001998593479876652,
      "loss": 0.2506,
      "step": 235
    },
    {
      "epoch": 2.34718826405868,
      "grad_norm": 0.21556636691093445,
      "learning_rate": 0.00019981033287370443,
      "loss": 0.2416,
      "step": 240
    },
    {
      "epoch": 2.3960880195599024,
      "grad_norm": 0.22406277060508728,
      "learning_rate": 0.00019975401196855482,
      "loss": 0.273,
      "step": 245
    },
    {
      "epoch": 2.444987775061125,
      "grad_norm": 0.3020350933074951,
      "learning_rate": 0.00019969038939472315,
      "loss": 0.2457,
      "step": 250
    },
    {
      "epoch": 2.4938875305623474,
      "grad_norm": 0.20698243379592896,
      "learning_rate": 0.00019961946980917456,
      "loss": 0.2569,
      "step": 255
    },
    {
      "epoch": 2.54278728606357,
      "grad_norm": 0.4294751286506653,
      "learning_rate": 0.00019954125840299165,
      "loss": 0.2246,
      "step": 260
    },
    {
      "epoch": 2.591687041564792,
      "grad_norm": 0.37185847759246826,
      "learning_rate": 0.00019945576090099452,
      "loss": 0.229,
      "step": 265
    },
    {
      "epoch": 2.640586797066015,
      "grad_norm": 0.2863105237483978,
      "learning_rate": 0.00019936298356132176,
      "loss": 0.2338,
      "step": 270
    },
    {
      "epoch": 2.689486552567237,
      "grad_norm": 0.19301028549671173,
      "learning_rate": 0.00019926293317497245,
      "loss": 0.2167,
      "step": 275
    },
    {
      "epoch": 2.73838630806846,
      "grad_norm": 0.22075964510440826,
      "learning_rate": 0.00019915561706530883,
      "loss": 0.2367,
      "step": 280
    },
    {
      "epoch": 2.787286063569682,
      "grad_norm": 0.22829142212867737,
      "learning_rate": 0.0001990410430875205,
      "loss": 0.245,
      "step": 285
    },
    {
      "epoch": 2.836185819070905,
      "grad_norm": 0.1982724666595459,
      "learning_rate": 0.00019891921962804943,
      "loss": 0.217,
      "step": 290
    },
    {
      "epoch": 2.885085574572127,
      "grad_norm": 0.23672354221343994,
      "learning_rate": 0.00019879015560397587,
      "loss": 0.2298,
      "step": 295
    },
    {
      "epoch": 2.93398533007335,
      "grad_norm": 0.21391943097114563,
      "learning_rate": 0.00019865386046236596,
      "loss": 0.2326,
      "step": 300
    },
    {
      "epoch": 2.982885085574572,
      "grad_norm": 0.19821615517139435,
      "learning_rate": 0.00019851034417958,
      "loss": 0.2692,
      "step": 305
    },
    {
      "epoch": 2.9926650366748166,
      "eval_loss": 0.2616053521633148,
      "eval_runtime": 387.9813,
      "eval_samples_per_second": 1.057,
      "eval_steps_per_second": 1.057,
      "step": 306
    },
    {
      "epoch": 3.031784841075795,
      "grad_norm": 0.22108572721481323,
      "learning_rate": 0.0001983596172605423,
      "loss": 0.2104,
      "step": 310
    },
    {
      "epoch": 3.0806845965770173,
      "grad_norm": 0.24487629532814026,
      "learning_rate": 0.00019820169073797228,
      "loss": 0.1942,
      "step": 315
    },
    {
      "epoch": 3.12958435207824,
      "grad_norm": 0.2110164612531662,
      "learning_rate": 0.0001980365761715769,
      "loss": 0.1833,
      "step": 320
    },
    {
      "epoch": 3.178484107579462,
      "grad_norm": 0.20861805975437164,
      "learning_rate": 0.0001978642856472045,
      "loss": 0.1981,
      "step": 325
    },
    {
      "epoch": 3.227383863080685,
      "grad_norm": 0.1969948559999466,
      "learning_rate": 0.0001976848317759601,
      "loss": 0.1868,
      "step": 330
    },
    {
      "epoch": 3.276283618581907,
      "grad_norm": 0.19443638622760773,
      "learning_rate": 0.0001974982276932824,
      "loss": 0.1902,
      "step": 335
    },
    {
      "epoch": 3.32518337408313,
      "grad_norm": 0.30058181285858154,
      "learning_rate": 0.00019730448705798239,
      "loss": 0.2241,
      "step": 340
    },
    {
      "epoch": 3.374083129584352,
      "grad_norm": 0.21647138893604279,
      "learning_rate": 0.00019710362405124334,
      "loss": 0.1838,
      "step": 345
    },
    {
      "epoch": 3.422982885085575,
      "grad_norm": 0.19676022231578827,
      "learning_rate": 0.00019689565337558288,
      "loss": 0.1961,
      "step": 350
    },
    {
      "epoch": 3.471882640586797,
      "grad_norm": 0.2371009737253189,
      "learning_rate": 0.00019668059025377703,
      "loss": 0.2052,
      "step": 355
    },
    {
      "epoch": 3.52078239608802,
      "grad_norm": 0.21014133095741272,
      "learning_rate": 0.00019645845042774553,
      "loss": 0.1987,
      "step": 360
    },
    {
      "epoch": 3.569682151589242,
      "grad_norm": 0.2178957760334015,
      "learning_rate": 0.00019622925015739997,
      "loss": 0.1903,
      "step": 365
    },
    {
      "epoch": 3.618581907090465,
      "grad_norm": 0.21497862040996552,
      "learning_rate": 0.0001959930062194534,
      "loss": 0.2,
      "step": 370
    },
    {
      "epoch": 3.667481662591687,
      "grad_norm": 0.22582682967185974,
      "learning_rate": 0.00019574973590619243,
      "loss": 0.1868,
      "step": 375
    },
    {
      "epoch": 3.71638141809291,
      "grad_norm": 0.1827058047056198,
      "learning_rate": 0.00019549945702421144,
      "loss": 0.2172,
      "step": 380
    },
    {
      "epoch": 3.765281173594132,
      "grad_norm": 0.19827835261821747,
      "learning_rate": 0.00019524218789310912,
      "loss": 0.1785,
      "step": 385
    },
    {
      "epoch": 3.8141809290953548,
      "grad_norm": 0.2121572494506836,
      "learning_rate": 0.0001949779473441478,
      "loss": 0.1795,
      "step": 390
    },
    {
      "epoch": 3.863080684596577,
      "grad_norm": 0.20197761058807373,
      "learning_rate": 0.0001947067547188747,
      "loss": 0.19,
      "step": 395
    },
    {
      "epoch": 3.9119804400977998,
      "grad_norm": 0.21854069828987122,
      "learning_rate": 0.00019442862986770646,
      "loss": 0.1886,
      "step": 400
    },
    {
      "epoch": 3.960880195599022,
      "grad_norm": 0.20974552631378174,
      "learning_rate": 0.0001941435931484761,
      "loss": 0.177,
      "step": 405
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.24309584498405457,
      "eval_runtime": 399.7773,
      "eval_samples_per_second": 1.026,
      "eval_steps_per_second": 1.026,
      "step": 409
    },
    {
      "epoch": 4.009779951100245,
      "grad_norm": 0.19041913747787476,
      "learning_rate": 0.0001938516654249428,
      "loss": 0.1709,
      "step": 410
    },
    {
      "epoch": 4.058679706601467,
      "grad_norm": 0.22610776126384735,
      "learning_rate": 0.00019355286806526493,
      "loss": 0.158,
      "step": 415
    },
    {
      "epoch": 4.10757946210269,
      "grad_norm": 0.2044234424829483,
      "learning_rate": 0.00019324722294043558,
      "loss": 0.1522,
      "step": 420
    },
    {
      "epoch": 4.156479217603912,
      "grad_norm": 0.2402704805135727,
      "learning_rate": 0.00019293475242268223,
      "loss": 0.1509,
      "step": 425
    },
    {
      "epoch": 4.205378973105135,
      "grad_norm": 0.20224688947200775,
      "learning_rate": 0.0001926154793838288,
      "loss": 0.1565,
      "step": 430
    },
    {
      "epoch": 4.254278728606357,
      "grad_norm": 0.21887710690498352,
      "learning_rate": 0.00019228942719362143,
      "loss": 0.1551,
      "step": 435
    },
    {
      "epoch": 4.30317848410758,
      "grad_norm": 0.20886527001857758,
      "learning_rate": 0.00019195661971801827,
      "loss": 0.1568,
      "step": 440
    },
    {
      "epoch": 4.352078239608802,
      "grad_norm": 0.21612216532230377,
      "learning_rate": 0.00019161708131744222,
      "loss": 0.1516,
      "step": 445
    },
    {
      "epoch": 4.400977995110025,
      "grad_norm": 0.20036669075489044,
      "learning_rate": 0.00019127083684499806,
      "loss": 0.1529,
      "step": 450
    },
    {
      "epoch": 4.449877750611247,
      "grad_norm": 0.3197900950908661,
      "learning_rate": 0.00019091791164465305,
      "loss": 0.1854,
      "step": 455
    },
    {
      "epoch": 4.49877750611247,
      "grad_norm": 0.18851010501384735,
      "learning_rate": 0.00019055833154938207,
      "loss": 0.1574,
      "step": 460
    },
    {
      "epoch": 4.547677261613692,
      "grad_norm": 0.214978888630867,
      "learning_rate": 0.00019019212287927663,
      "loss": 0.1555,
      "step": 465
    },
    {
      "epoch": 4.596577017114915,
      "grad_norm": 0.21155217289924622,
      "learning_rate": 0.00018981931243961824,
      "loss": 0.176,
      "step": 470
    },
    {
      "epoch": 4.645476772616137,
      "grad_norm": 0.18137674033641815,
      "learning_rate": 0.00018943992751891653,
      "loss": 0.1575,
      "step": 475
    },
    {
      "epoch": 4.69437652811736,
      "grad_norm": 0.24663567543029785,
      "learning_rate": 0.00018905399588691163,
      "loss": 0.1568,
      "step": 480
    },
    {
      "epoch": 4.743276283618582,
      "grad_norm": 0.19319510459899902,
      "learning_rate": 0.0001886615457925417,
      "loss": 0.1547,
      "step": 485
    },
    {
      "epoch": 4.792176039119805,
      "grad_norm": 0.18611547350883484,
      "learning_rate": 0.00018826260596187505,
      "loss": 0.1755,
      "step": 490
    },
    {
      "epoch": 4.841075794621027,
      "grad_norm": 0.47814473509788513,
      "learning_rate": 0.00018785720559600752,
      "loss": 0.1647,
      "step": 495
    },
    {
      "epoch": 4.88997555012225,
      "grad_norm": 0.19350242614746094,
      "learning_rate": 0.00018744537436892516,
      "loss": 0.155,
      "step": 500
    },
    {
      "epoch": 4.938875305623472,
      "grad_norm": 0.19956329464912415,
      "learning_rate": 0.00018702714242533204,
      "loss": 0.156,
      "step": 505
    },
    {
      "epoch": 4.987775061124695,
      "grad_norm": 0.20709875226020813,
      "learning_rate": 0.00018660254037844388,
      "loss": 0.1616,
      "step": 510
    },
    {
      "epoch": 4.997555012224939,
      "eval_loss": 0.23624150454998016,
      "eval_runtime": 387.7068,
      "eval_samples_per_second": 1.058,
      "eval_steps_per_second": 1.058,
      "step": 511
    },
    {
      "epoch": 5.036674816625917,
      "grad_norm": 0.22790652513504028,
      "learning_rate": 0.00018617159930774715,
      "loss": 0.1377,
      "step": 515
    },
    {
      "epoch": 5.08557457212714,
      "grad_norm": 0.21796418726444244,
      "learning_rate": 0.00018573435075672424,
      "loss": 0.1326,
      "step": 520
    },
    {
      "epoch": 5.134474327628362,
      "grad_norm": 0.19105204939842224,
      "learning_rate": 0.00018529082673054457,
      "loss": 0.1303,
      "step": 525
    },
    {
      "epoch": 5.183374083129585,
      "grad_norm": 0.2682870328426361,
      "learning_rate": 0.00018484105969372182,
      "loss": 0.1316,
      "step": 530
    },
    {
      "epoch": 5.232273838630807,
      "grad_norm": 0.18370023369789124,
      "learning_rate": 0.00018438508256773785,
      "loss": 0.1323,
      "step": 535
    },
    {
      "epoch": 5.28117359413203,
      "grad_norm": 0.24072639644145966,
      "learning_rate": 0.00018392292872863267,
      "loss": 0.1332,
      "step": 540
    },
    {
      "epoch": 5.330073349633252,
      "grad_norm": 0.19523735344409943,
      "learning_rate": 0.00018345463200456164,
      "loss": 0.1344,
      "step": 545
    },
    {
      "epoch": 5.378973105134475,
      "grad_norm": 0.24865508079528809,
      "learning_rate": 0.0001829802266733193,
      "loss": 0.1359,
      "step": 550
    },
    {
      "epoch": 5.427872860635697,
      "grad_norm": 0.2039840966463089,
      "learning_rate": 0.00018249974745983023,
      "loss": 0.1337,
      "step": 555
    },
    {
      "epoch": 5.47677261613692,
      "grad_norm": 0.20024679601192474,
      "learning_rate": 0.00018201322953360758,
      "loss": 0.154,
      "step": 560
    },
    {
      "epoch": 5.525672371638142,
      "grad_norm": 0.1976476013660431,
      "learning_rate": 0.0001815207085061784,
      "loss": 0.1353,
      "step": 565
    },
    {
      "epoch": 5.574572127139365,
      "grad_norm": 0.1974327266216278,
      "learning_rate": 0.00018102222042847737,
      "loss": 0.1373,
      "step": 570
    },
    {
      "epoch": 5.623471882640587,
      "grad_norm": 0.27005520462989807,
      "learning_rate": 0.00018051780178820765,
      "loss": 0.1437,
      "step": 575
    },
    {
      "epoch": 5.67237163814181,
      "grad_norm": 0.20781448483467102,
      "learning_rate": 0.00018000748950717038,
      "loss": 0.1322,
      "step": 580
    },
    {
      "epoch": 5.721271393643032,
      "grad_norm": 0.20179703831672668,
      "learning_rate": 0.000179491320938562,
      "loss": 0.1378,
      "step": 585
    },
    {
      "epoch": 5.770171149144255,
      "grad_norm": 0.22105282545089722,
      "learning_rate": 0.00017896933386423998,
      "loss": 0.136,
      "step": 590
    },
    {
      "epoch": 5.819070904645477,
      "grad_norm": 0.4113224446773529,
      "learning_rate": 0.00017844156649195759,
      "loss": 0.1495,
      "step": 595
    },
    {
      "epoch": 5.8679706601467,
      "grad_norm": 0.20451286435127258,
      "learning_rate": 0.00017790805745256704,
      "loss": 0.1318,
      "step": 600
    },
    {
      "epoch": 5.916870415647922,
      "grad_norm": 0.18566569685935974,
      "learning_rate": 0.0001773688457971919,
      "loss": 0.1359,
      "step": 605
    },
    {
      "epoch": 5.965770171149144,
      "grad_norm": 0.1862591803073883,
      "learning_rate": 0.0001768239709943686,
      "loss": 0.1358,
      "step": 610
    },
    {
      "epoch": 5.995110024449878,
      "eval_loss": 0.23938237130641937,
      "eval_runtime": 387.8478,
      "eval_samples_per_second": 1.057,
      "eval_steps_per_second": 1.057,
      "step": 613
    },
    {
      "epoch": 6.014669926650367,
      "grad_norm": 0.16670842468738556,
      "learning_rate": 0.0001762734729271575,
      "loss": 0.1275,
      "step": 615
    },
    {
      "epoch": 6.06356968215159,
      "grad_norm": 0.23901741206645966,
      "learning_rate": 0.00017571739189022365,
      "loss": 0.1113,
      "step": 620
    },
    {
      "epoch": 6.112469437652812,
      "grad_norm": 0.19317218661308289,
      "learning_rate": 0.00017515576858688722,
      "loss": 0.1101,
      "step": 625
    },
    {
      "epoch": 6.161369193154035,
      "grad_norm": 0.21369099617004395,
      "learning_rate": 0.00017458864412614434,
      "loss": 0.1122,
      "step": 630
    },
    {
      "epoch": 6.210268948655257,
      "grad_norm": 0.21011659502983093,
      "learning_rate": 0.00017401606001965782,
      "loss": 0.1136,
      "step": 635
    },
    {
      "epoch": 6.25916870415648,
      "grad_norm": 0.1860456019639969,
      "learning_rate": 0.00017343805817871886,
      "loss": 0.1305,
      "step": 640
    },
    {
      "epoch": 6.308068459657702,
      "grad_norm": 0.23417602479457855,
      "learning_rate": 0.00017285468091117904,
      "loss": 0.1165,
      "step": 645
    },
    {
      "epoch": 6.356968215158924,
      "grad_norm": 0.189472958445549,
      "learning_rate": 0.00017226597091835378,
      "loss": 0.119,
      "step": 650
    },
    {
      "epoch": 6.405867970660147,
      "grad_norm": 0.2460348904132843,
      "learning_rate": 0.00017167197129189652,
      "loss": 0.1188,
      "step": 655
    },
    {
      "epoch": 6.45476772616137,
      "grad_norm": 0.20059679448604584,
      "learning_rate": 0.00017107272551064473,
      "loss": 0.1194,
      "step": 660
    },
    {
      "epoch": 6.503667481662592,
      "grad_norm": 0.19838838279247284,
      "learning_rate": 0.00017046827743743726,
      "loss": 0.1165,
      "step": 665
    },
    {
      "epoch": 6.552567237163814,
      "grad_norm": 0.20280085504055023,
      "learning_rate": 0.00016985867131590383,
      "loss": 0.1168,
      "step": 670
    },
    {
      "epoch": 6.601466992665037,
      "grad_norm": 0.27974265813827515,
      "learning_rate": 0.00016924395176722647,
      "loss": 0.122,
      "step": 675
    },
    {
      "epoch": 6.65036674816626,
      "grad_norm": 0.1994495540857315,
      "learning_rate": 0.0001686241637868734,
      "loss": 0.1173,
      "step": 680
    },
    {
      "epoch": 6.699266503667482,
      "grad_norm": 0.20043040812015533,
      "learning_rate": 0.00016799935274130546,
      "loss": 0.1183,
      "step": 685
    },
    {
      "epoch": 6.748166259168704,
      "grad_norm": 0.19184747338294983,
      "learning_rate": 0.00016736956436465573,
      "loss": 0.1192,
      "step": 690
    },
    {
      "epoch": 6.797066014669927,
      "grad_norm": 0.20747938752174377,
      "learning_rate": 0.00016673484475538146,
      "loss": 0.1188,
      "step": 695
    },
    {
      "epoch": 6.84596577017115,
      "grad_norm": 0.19285354018211365,
      "learning_rate": 0.00016609524037289019,
      "loss": 0.117,
      "step": 700
    },
    {
      "epoch": 6.894865525672372,
      "grad_norm": 0.18242338299751282,
      "learning_rate": 0.00016545079803413892,
      "loss": 0.1208,
      "step": 705
    },
    {
      "epoch": 6.943765281173594,
      "grad_norm": 0.19887416064739227,
      "learning_rate": 0.00016480156491020727,
      "loss": 0.1227,
      "step": 710
    },
    {
      "epoch": 6.992665036674817,
      "grad_norm": 0.19773922860622406,
      "learning_rate": 0.00016414758852284478,
      "loss": 0.1199,
      "step": 715
    },
    {
      "epoch": 6.992665036674817,
      "eval_loss": 0.24741248786449432,
      "eval_runtime": 387.335,
      "eval_samples_per_second": 1.059,
      "eval_steps_per_second": 1.059,
      "step": 715
    },
    {
      "epoch": 7.041564792176039,
      "grad_norm": 0.5106807351112366,
      "learning_rate": 0.0001634889167409923,
      "loss": 0.1051,
      "step": 720
    },
    {
      "epoch": 7.090464547677262,
      "grad_norm": 0.18619847297668457,
      "learning_rate": 0.0001628255977772784,
      "loss": 0.0979,
      "step": 725
    },
    {
      "epoch": 7.139364303178484,
      "grad_norm": 0.18676620721817017,
      "learning_rate": 0.00016215768018449012,
      "loss": 0.1009,
      "step": 730
    },
    {
      "epoch": 7.188264058679707,
      "grad_norm": 0.2054695338010788,
      "learning_rate": 0.00016148521285201927,
      "loss": 0.1002,
      "step": 735
    },
    {
      "epoch": 7.237163814180929,
      "grad_norm": 0.20496530830860138,
      "learning_rate": 0.00016080824500228367,
      "loss": 0.1011,
      "step": 740
    },
    {
      "epoch": 7.286063569682152,
      "grad_norm": 0.18679122626781464,
      "learning_rate": 0.0001601268261871244,
      "loss": 0.1052,
      "step": 745
    },
    {
      "epoch": 7.334963325183374,
      "grad_norm": 0.20614224672317505,
      "learning_rate": 0.00015944100628417868,
      "loss": 0.1021,
      "step": 750
    },
    {
      "epoch": 7.383863080684597,
      "grad_norm": 0.20026642084121704,
      "learning_rate": 0.00015875083549322908,
      "loss": 0.1019,
      "step": 755
    },
    {
      "epoch": 7.432762836185819,
      "grad_norm": 0.1852520853281021,
      "learning_rate": 0.00015805636433252891,
      "loss": 0.1028,
      "step": 760
    },
    {
      "epoch": 7.481662591687042,
      "grad_norm": 0.19096429646015167,
      "learning_rate": 0.0001573576436351046,
      "loss": 0.1031,
      "step": 765
    },
    {
      "epoch": 7.530562347188264,
      "grad_norm": 0.18263529241085052,
      "learning_rate": 0.00015665472454503483,
      "loss": 0.1033,
      "step": 770
    },
    {
      "epoch": 7.579462102689487,
      "grad_norm": 0.1884106546640396,
      "learning_rate": 0.00015594765851370684,
      "loss": 0.1063,
      "step": 775
    },
    {
      "epoch": 7.628361858190709,
      "grad_norm": 0.2005338817834854,
      "learning_rate": 0.0001552364972960506,
      "loss": 0.1054,
      "step": 780
    },
    {
      "epoch": 7.677261613691932,
      "grad_norm": 0.184016153216362,
      "learning_rate": 0.0001545212929467503,
      "loss": 0.1048,
      "step": 785
    },
    {
      "epoch": 7.726161369193154,
      "grad_norm": 0.19765067100524902,
      "learning_rate": 0.0001538020978164341,
      "loss": 0.1044,
      "step": 790
    },
    {
      "epoch": 7.775061124694377,
      "grad_norm": 0.18265607953071594,
      "learning_rate": 0.0001530789645478426,
      "loss": 0.1051,
      "step": 795
    },
    {
      "epoch": 7.823960880195599,
      "grad_norm": 0.19815443456172943,
      "learning_rate": 0.00015235194607197508,
      "loss": 0.1081,
      "step": 800
    },
    {
      "epoch": 7.872860635696822,
      "grad_norm": 0.22219662368297577,
      "learning_rate": 0.0001516210956042153,
      "loss": 0.1071,
      "step": 805
    },
    {
      "epoch": 7.921760391198044,
      "grad_norm": 0.20078670978546143,
      "learning_rate": 0.0001508864666404365,
      "loss": 0.1075,
      "step": 810
    },
    {
      "epoch": 7.970660146699267,
      "grad_norm": 0.17794115841388702,
      "learning_rate": 0.00015014811295308543,
      "loss": 0.1051,
      "step": 815
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.2625426948070526,
      "eval_runtime": 387.4946,
      "eval_samples_per_second": 1.058,
      "eval_steps_per_second": 1.058,
      "step": 818
    },
    {
      "epoch": 8.01955990220049,
      "grad_norm": 0.1608039289712906,
      "learning_rate": 0.0001494060885872464,
      "loss": 0.0994,
      "step": 820
    },
    {
      "epoch": 8.06845965770171,
      "grad_norm": 0.2323434203863144,
      "learning_rate": 0.00014866044785668563,
      "loss": 0.0895,
      "step": 825
    },
    {
      "epoch": 8.117359413202934,
      "grad_norm": 0.17606528103351593,
      "learning_rate": 0.0001479112453398753,
      "loss": 0.0849,
      "step": 830
    },
    {
      "epoch": 8.166259168704157,
      "grad_norm": 0.19025173783302307,
      "learning_rate": 0.0001471585358759987,
      "loss": 0.0886,
      "step": 835
    },
    {
      "epoch": 8.21515892420538,
      "grad_norm": 0.1990627497434616,
      "learning_rate": 0.00014640237456093634,
      "loss": 0.0905,
      "step": 840
    },
    {
      "epoch": 8.2640586797066,
      "grad_norm": 0.1725684553384781,
      "learning_rate": 0.00014564281674323297,
      "loss": 0.0899,
      "step": 845
    },
    {
      "epoch": 8.312958435207824,
      "grad_norm": 0.18845060467720032,
      "learning_rate": 0.00014487991802004623,
      "loss": 0.0886,
      "step": 850
    },
    {
      "epoch": 8.361858190709047,
      "grad_norm": 0.23856212198734283,
      "learning_rate": 0.00014411373423307714,
      "loss": 0.0924,
      "step": 855
    },
    {
      "epoch": 8.41075794621027,
      "grad_norm": 0.18084120750427246,
      "learning_rate": 0.00014334432146448272,
      "loss": 0.0918,
      "step": 860
    },
    {
      "epoch": 8.45965770171149,
      "grad_norm": 0.18600909411907196,
      "learning_rate": 0.00014257173603277095,
      "loss": 0.0913,
      "step": 865
    },
    {
      "epoch": 8.508557457212714,
      "grad_norm": 0.1851680874824524,
      "learning_rate": 0.00014179603448867835,
      "loss": 0.0912,
      "step": 870
    },
    {
      "epoch": 8.557457212713937,
      "grad_norm": 0.1818709820508957,
      "learning_rate": 0.00014101727361103076,
      "loss": 0.0903,
      "step": 875
    },
    {
      "epoch": 8.60635696821516,
      "grad_norm": 0.19458520412445068,
      "learning_rate": 0.00014023551040258725,
      "loss": 0.0916,
      "step": 880
    },
    {
      "epoch": 8.65525672371638,
      "grad_norm": 0.17777447402477264,
      "learning_rate": 0.00013945080208586775,
      "loss": 0.0928,
      "step": 885
    },
    {
      "epoch": 8.704156479217604,
      "grad_norm": 0.20647075772285461,
      "learning_rate": 0.00013866320609896447,
      "loss": 0.0926,
      "step": 890
    },
    {
      "epoch": 8.753056234718827,
      "grad_norm": 0.18589670956134796,
      "learning_rate": 0.00013787278009133776,
      "loss": 0.0934,
      "step": 895
    },
    {
      "epoch": 8.80195599022005,
      "grad_norm": 0.19582615792751312,
      "learning_rate": 0.00013707958191959608,
      "loss": 0.0954,
      "step": 900
    },
    {
      "epoch": 8.85085574572127,
      "grad_norm": 0.19688870012760162,
      "learning_rate": 0.00013628366964326153,
      "loss": 0.0925,
      "step": 905
    },
    {
      "epoch": 8.899755501222494,
      "grad_norm": 0.1874823123216629,
      "learning_rate": 0.00013548510152051963,
      "loss": 0.0939,
      "step": 910
    },
    {
      "epoch": 8.948655256723717,
      "grad_norm": 0.1876133382320404,
      "learning_rate": 0.00013468393600395525,
      "loss": 0.097,
      "step": 915
    },
    {
      "epoch": 8.99755501222494,
      "grad_norm": 0.1735718548297882,
      "learning_rate": 0.00013388023173627414,
      "loss": 0.0945,
      "step": 920
    },
    {
      "epoch": 8.99755501222494,
      "eval_loss": 0.27974453568458557,
      "eval_runtime": 387.9073,
      "eval_samples_per_second": 1.057,
      "eval_steps_per_second": 1.057,
      "step": 920
    },
    {
      "epoch": 9.04645476772616,
      "grad_norm": 0.1655295491218567,
      "learning_rate": 0.00013307404754601013,
      "loss": 0.0806,
      "step": 925
    },
    {
      "epoch": 9.095354523227384,
      "grad_norm": 0.19395217299461365,
      "learning_rate": 0.0001322654424432195,
      "loss": 0.0788,
      "step": 930
    },
    {
      "epoch": 9.144254278728607,
      "grad_norm": 0.18941174447536469,
      "learning_rate": 0.00013145447561516138,
      "loss": 0.0793,
      "step": 935
    },
    {
      "epoch": 9.19315403422983,
      "grad_norm": 0.20010443031787872,
      "learning_rate": 0.00013064120642196548,
      "loss": 0.0807,
      "step": 940
    },
    {
      "epoch": 9.24205378973105,
      "grad_norm": 0.20777645707130432,
      "learning_rate": 0.00012982569439228713,
      "loss": 0.08,
      "step": 945
    },
    {
      "epoch": 9.290953545232274,
      "grad_norm": 0.173665389418602,
      "learning_rate": 0.00012900799921895003,
      "loss": 0.0808,
      "step": 950
    },
    {
      "epoch": 9.339853300733497,
      "grad_norm": 0.20865468680858612,
      "learning_rate": 0.0001281881807545769,
      "loss": 0.0808,
      "step": 955
    },
    {
      "epoch": 9.38875305623472,
      "grad_norm": 0.18372130393981934,
      "learning_rate": 0.0001273662990072083,
      "loss": 0.0804,
      "step": 960
    },
    {
      "epoch": 9.43765281173594,
      "grad_norm": 0.1785283237695694,
      "learning_rate": 0.00012654241413591054,
      "loss": 0.0812,
      "step": 965
    },
    {
      "epoch": 9.486552567237164,
      "grad_norm": 0.17695043981075287,
      "learning_rate": 0.000125716586446372,
      "loss": 0.0827,
      "step": 970
    },
    {
      "epoch": 9.535452322738386,
      "grad_norm": 0.18287776410579681,
      "learning_rate": 0.00012488887638648907,
      "loss": 0.083,
      "step": 975
    },
    {
      "epoch": 9.58435207823961,
      "grad_norm": 0.20748884975910187,
      "learning_rate": 0.00012405934454194146,
      "loss": 0.0816,
      "step": 980
    },
    {
      "epoch": 9.63325183374083,
      "grad_norm": 0.18160052597522736,
      "learning_rate": 0.00012322805163175762,
      "loss": 0.0823,
      "step": 985
    },
    {
      "epoch": 9.682151589242054,
      "grad_norm": 0.17889925837516785,
      "learning_rate": 0.0001223950585038703,
      "loss": 0.0822,
      "step": 990
    },
    {
      "epoch": 9.731051344743276,
      "grad_norm": 0.1896965056657791,
      "learning_rate": 0.00012156042613066258,
      "loss": 0.0839,
      "step": 995
    },
    {
      "epoch": 9.7799511002445,
      "grad_norm": 0.19203361868858337,
      "learning_rate": 0.00012072421560450497,
      "loss": 0.0828,
      "step": 1000
    },
    {
      "epoch": 9.82885085574572,
      "grad_norm": 0.18262554705142975,
      "learning_rate": 0.00011988648813328367,
      "loss": 0.0838,
      "step": 1005
    },
    {
      "epoch": 9.877750611246944,
      "grad_norm": 0.18471267819404602,
      "learning_rate": 0.0001190473050359203,
      "loss": 0.084,
      "step": 1010
    },
    {
      "epoch": 9.926650366748166,
      "grad_norm": 0.18675756454467773,
      "learning_rate": 0.00011820672773788353,
      "loss": 0.0835,
      "step": 1015
    },
    {
      "epoch": 9.97555012224939,
      "grad_norm": 0.17983846366405487,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.0843,
      "step": 1020
    },
    {
      "epoch": 9.995110024449877,
      "eval_loss": 0.2892283499240875,
      "eval_runtime": 389.1699,
      "eval_samples_per_second": 1.054,
      "eval_steps_per_second": 1.054,
      "step": 1022
    },
    {
      "epoch": 9.995110024449877,
      "step": 1022,
      "total_flos": 7.585797735459062e+17,
      "train_loss": 0.2326957560244605,
      "train_runtime": 28787.7929,
      "train_samples_per_second": 0.568,
      "train_steps_per_second": 0.071
    }
  ],
  "logging_steps": 5,
  "max_steps": 2040,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.585797735459062e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}