{ "best_metric": 0.23624150454998016, "best_model_checkpoint": "data/hansken_human_hql/checkpoint-511", "epoch": 9.995110024449877, "eval_steps": 500, "global_step": 1022, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009779951100244499, "grad_norm": 1.0709587335586548, "learning_rate": 9.80392156862745e-07, "loss": 1.4707, "step": 1 }, { "epoch": 0.0488997555012225, "grad_norm": 1.1860318183898926, "learning_rate": 4.901960784313726e-06, "loss": 1.4227, "step": 5 }, { "epoch": 0.097799511002445, "grad_norm": 1.145372986793518, "learning_rate": 9.803921568627451e-06, "loss": 1.444, "step": 10 }, { "epoch": 0.1466992665036675, "grad_norm": 1.0051456689834595, "learning_rate": 1.4705882352941177e-05, "loss": 1.3414, "step": 15 }, { "epoch": 0.19559902200489, "grad_norm": 0.5133008360862732, "learning_rate": 1.9607843137254903e-05, "loss": 1.2116, "step": 20 }, { "epoch": 0.24449877750611246, "grad_norm": 0.41832494735717773, "learning_rate": 2.4509803921568626e-05, "loss": 1.1424, "step": 25 }, { "epoch": 0.293398533007335, "grad_norm": 0.41829216480255127, "learning_rate": 2.9411764705882354e-05, "loss": 1.143, "step": 30 }, { "epoch": 0.3422982885085575, "grad_norm": 0.35495856404304504, "learning_rate": 3.431372549019608e-05, "loss": 1.0472, "step": 35 }, { "epoch": 0.39119804400978, "grad_norm": 0.40399229526519775, "learning_rate": 3.9215686274509805e-05, "loss": 0.9879, "step": 40 }, { "epoch": 0.4400977995110024, "grad_norm": 0.31430941820144653, "learning_rate": 4.411764705882353e-05, "loss": 0.9467, "step": 45 }, { "epoch": 0.4889975550122249, "grad_norm": 0.29712405800819397, "learning_rate": 4.901960784313725e-05, "loss": 0.885, "step": 50 }, { "epoch": 0.5378973105134475, "grad_norm": 0.40078112483024597, "learning_rate": 5.392156862745098e-05, "loss": 0.7973, "step": 55 }, { "epoch": 0.58679706601467, "grad_norm": 0.34199750423431396, "learning_rate": 5.882352941176471e-05, "loss": 0.776, "step": 60 }, { "epoch": 0.6356968215158925, "grad_norm": 0.4243955910205841, "learning_rate": 6.372549019607843e-05, "loss": 0.651, "step": 65 }, { "epoch": 0.684596577017115, "grad_norm": 0.30432572960853577, "learning_rate": 6.862745098039216e-05, "loss": 0.5769, "step": 70 }, { "epoch": 0.7334963325183375, "grad_norm": 0.27279356122016907, "learning_rate": 7.352941176470589e-05, "loss": 0.5436, "step": 75 }, { "epoch": 0.78239608801956, "grad_norm": 0.2576221823692322, "learning_rate": 7.843137254901961e-05, "loss": 0.501, "step": 80 }, { "epoch": 0.8312958435207825, "grad_norm": 0.22290439903736115, "learning_rate": 8.333333333333334e-05, "loss": 0.4915, "step": 85 }, { "epoch": 0.8801955990220048, "grad_norm": 0.21740856766700745, "learning_rate": 8.823529411764706e-05, "loss": 0.4487, "step": 90 }, { "epoch": 0.9290953545232273, "grad_norm": 0.21560043096542358, "learning_rate": 9.313725490196079e-05, "loss": 0.4351, "step": 95 }, { "epoch": 0.9779951100244498, "grad_norm": 0.2607389986515045, "learning_rate": 9.80392156862745e-05, "loss": 0.4508, "step": 100 }, { "epoch": 0.9975550122249389, "eval_loss": 0.44326770305633545, "eval_runtime": 398.4802, "eval_samples_per_second": 1.029, "eval_steps_per_second": 1.029, "step": 102 }, { "epoch": 1.0268948655256724, "grad_norm": 0.21888603270053864, "learning_rate": 0.00010294117647058823, "loss": 0.3968, "step": 105 }, { "epoch": 1.075794621026895, "grad_norm": 0.21742066740989685, "learning_rate": 0.00010784313725490196, "loss": 0.3785, "step": 110 }, { "epoch": 1.1246943765281174, "grad_norm": 0.2523965537548065, "learning_rate": 0.0001127450980392157, "loss": 0.4034, "step": 115 }, { "epoch": 1.17359413202934, "grad_norm": 0.2155005782842636, "learning_rate": 0.00011764705882352942, "loss": 0.3766, "step": 120 }, { "epoch": 1.2224938875305624, "grad_norm": 0.25576308369636536, "learning_rate": 0.00012254901960784316, "loss": 0.364, "step": 125 }, { "epoch": 1.271393643031785, "grad_norm": 0.2288295179605484, "learning_rate": 0.00012745098039215687, "loss": 0.3451, "step": 130 }, { "epoch": 1.3202933985330074, "grad_norm": 0.2045079469680786, "learning_rate": 0.0001323529411764706, "loss": 0.3425, "step": 135 }, { "epoch": 1.36919315403423, "grad_norm": 0.2297014445066452, "learning_rate": 0.0001372549019607843, "loss": 0.3658, "step": 140 }, { "epoch": 1.4180929095354524, "grad_norm": 0.2170581817626953, "learning_rate": 0.00014215686274509804, "loss": 0.3482, "step": 145 }, { "epoch": 1.466992665036675, "grad_norm": 0.2250969409942627, "learning_rate": 0.00014705882352941178, "loss": 0.3353, "step": 150 }, { "epoch": 1.5158924205378974, "grad_norm": 0.23191578686237335, "learning_rate": 0.00015196078431372549, "loss": 0.3271, "step": 155 }, { "epoch": 1.56479217603912, "grad_norm": 0.2477528601884842, "learning_rate": 0.00015686274509803922, "loss": 0.3549, "step": 160 }, { "epoch": 1.6136919315403424, "grad_norm": 0.20846064388751984, "learning_rate": 0.00016176470588235295, "loss": 0.3171, "step": 165 }, { "epoch": 1.662591687041565, "grad_norm": 0.21829602122306824, "learning_rate": 0.0001666666666666667, "loss": 0.3642, "step": 170 }, { "epoch": 1.7114914425427874, "grad_norm": 0.22842282056808472, "learning_rate": 0.0001715686274509804, "loss": 0.3116, "step": 175 }, { "epoch": 1.76039119804401, "grad_norm": 0.24106037616729736, "learning_rate": 0.00017647058823529413, "loss": 0.3066, "step": 180 }, { "epoch": 1.8092909535452324, "grad_norm": 0.25696486234664917, "learning_rate": 0.00018137254901960786, "loss": 0.3053, "step": 185 }, { "epoch": 1.858190709046455, "grad_norm": 0.22010771930217743, "learning_rate": 0.00018627450980392157, "loss": 0.3233, "step": 190 }, { "epoch": 1.9070904645476774, "grad_norm": 0.2373352199792862, "learning_rate": 0.0001911764705882353, "loss": 0.3102, "step": 195 }, { "epoch": 1.9559902200488999, "grad_norm": 0.21177123486995697, "learning_rate": 0.000196078431372549, "loss": 0.302, "step": 200 }, { "epoch": 1.9951100244498776, "eval_loss": 0.3139691650867462, "eval_runtime": 387.4792, "eval_samples_per_second": 1.058, "eval_steps_per_second": 1.058, "step": 204 }, { "epoch": 2.0048899755501224, "grad_norm": 0.2193712592124939, "learning_rate": 0.00019999985360565867, "loss": 0.2813, "step": 205 }, { "epoch": 2.053789731051345, "grad_norm": 0.3371932804584503, "learning_rate": 0.00019999472984871732, "loss": 0.2844, "step": 210 }, { "epoch": 2.1026894865525674, "grad_norm": 0.23578821122646332, "learning_rate": 0.00019998228680332932, "loss": 0.263, "step": 215 }, { "epoch": 2.15158924205379, "grad_norm": 0.27435311675071716, "learning_rate": 0.00019996252538028507, "loss": 0.2752, "step": 220 }, { "epoch": 2.2004889975550124, "grad_norm": 0.24362725019454956, "learning_rate": 0.00019993544702605638, "loss": 0.2572, "step": 225 }, { "epoch": 2.249388753056235, "grad_norm": 0.24360118806362152, "learning_rate": 0.0001999010537226905, "loss": 0.3191, "step": 230 }, { "epoch": 2.2982885085574574, "grad_norm": 0.2612737715244293, "learning_rate": 0.0001998593479876652, "loss": 0.2506, "step": 235 }, { "epoch": 2.34718826405868, "grad_norm": 0.21556636691093445, "learning_rate": 0.00019981033287370443, "loss": 0.2416, "step": 240 }, { "epoch": 2.3960880195599024, "grad_norm": 0.22406277060508728, "learning_rate": 0.00019975401196855482, "loss": 0.273, "step": 245 }, { "epoch": 2.444987775061125, "grad_norm": 0.3020350933074951, "learning_rate": 0.00019969038939472315, "loss": 0.2457, "step": 250 }, { "epoch": 2.4938875305623474, "grad_norm": 0.20698243379592896, "learning_rate": 0.00019961946980917456, "loss": 0.2569, "step": 255 }, { "epoch": 2.54278728606357, "grad_norm": 0.4294751286506653, "learning_rate": 0.00019954125840299165, "loss": 0.2246, "step": 260 }, { "epoch": 2.591687041564792, "grad_norm": 0.37185847759246826, "learning_rate": 0.00019945576090099452, "loss": 0.229, "step": 265 }, { "epoch": 2.640586797066015, "grad_norm": 0.2863105237483978, "learning_rate": 0.00019936298356132176, "loss": 0.2338, "step": 270 }, { "epoch": 2.689486552567237, "grad_norm": 0.19301028549671173, "learning_rate": 0.00019926293317497245, "loss": 0.2167, "step": 275 }, { "epoch": 2.73838630806846, "grad_norm": 0.22075964510440826, "learning_rate": 0.00019915561706530883, "loss": 0.2367, "step": 280 }, { "epoch": 2.787286063569682, "grad_norm": 0.22829142212867737, "learning_rate": 0.0001990410430875205, "loss": 0.245, "step": 285 }, { "epoch": 2.836185819070905, "grad_norm": 0.1982724666595459, "learning_rate": 0.00019891921962804943, "loss": 0.217, "step": 290 }, { "epoch": 2.885085574572127, "grad_norm": 0.23672354221343994, "learning_rate": 0.00019879015560397587, "loss": 0.2298, "step": 295 }, { "epoch": 2.93398533007335, "grad_norm": 0.21391943097114563, "learning_rate": 0.00019865386046236596, "loss": 0.2326, "step": 300 }, { "epoch": 2.982885085574572, "grad_norm": 0.19821615517139435, "learning_rate": 0.00019851034417958, "loss": 0.2692, "step": 305 }, { "epoch": 2.9926650366748166, "eval_loss": 0.2616053521633148, "eval_runtime": 387.9813, "eval_samples_per_second": 1.057, "eval_steps_per_second": 1.057, "step": 306 }, { "epoch": 3.031784841075795, "grad_norm": 0.22108572721481323, "learning_rate": 0.0001983596172605423, "loss": 0.2104, "step": 310 }, { "epoch": 3.0806845965770173, "grad_norm": 0.24487629532814026, "learning_rate": 0.00019820169073797228, "loss": 0.1942, "step": 315 }, { "epoch": 3.12958435207824, "grad_norm": 0.2110164612531662, "learning_rate": 0.0001980365761715769, "loss": 0.1833, "step": 320 }, { "epoch": 3.178484107579462, "grad_norm": 0.20861805975437164, "learning_rate": 0.0001978642856472045, "loss": 0.1981, "step": 325 }, { "epoch": 3.227383863080685, "grad_norm": 0.1969948559999466, "learning_rate": 0.0001976848317759601, "loss": 0.1868, "step": 330 }, { "epoch": 3.276283618581907, "grad_norm": 0.19443638622760773, "learning_rate": 0.0001974982276932824, "loss": 0.1902, "step": 335 }, { "epoch": 3.32518337408313, "grad_norm": 0.30058181285858154, "learning_rate": 0.00019730448705798239, "loss": 0.2241, "step": 340 }, { "epoch": 3.374083129584352, "grad_norm": 0.21647138893604279, "learning_rate": 0.00019710362405124334, "loss": 0.1838, "step": 345 }, { "epoch": 3.422982885085575, "grad_norm": 0.19676022231578827, "learning_rate": 0.00019689565337558288, "loss": 0.1961, "step": 350 }, { "epoch": 3.471882640586797, "grad_norm": 0.2371009737253189, "learning_rate": 0.00019668059025377703, "loss": 0.2052, "step": 355 }, { "epoch": 3.52078239608802, "grad_norm": 0.21014133095741272, "learning_rate": 0.00019645845042774553, "loss": 0.1987, "step": 360 }, { "epoch": 3.569682151589242, "grad_norm": 0.2178957760334015, "learning_rate": 0.00019622925015739997, "loss": 0.1903, "step": 365 }, { "epoch": 3.618581907090465, "grad_norm": 0.21497862040996552, "learning_rate": 0.0001959930062194534, "loss": 0.2, "step": 370 }, { "epoch": 3.667481662591687, "grad_norm": 0.22582682967185974, "learning_rate": 0.00019574973590619243, "loss": 0.1868, "step": 375 }, { "epoch": 3.71638141809291, "grad_norm": 0.1827058047056198, "learning_rate": 0.00019549945702421144, "loss": 0.2172, "step": 380 }, { "epoch": 3.765281173594132, "grad_norm": 0.19827835261821747, "learning_rate": 0.00019524218789310912, "loss": 0.1785, "step": 385 }, { "epoch": 3.8141809290953548, "grad_norm": 0.2121572494506836, "learning_rate": 0.0001949779473441478, "loss": 0.1795, "step": 390 }, { "epoch": 3.863080684596577, "grad_norm": 0.20197761058807373, "learning_rate": 0.0001947067547188747, "loss": 0.19, "step": 395 }, { "epoch": 3.9119804400977998, "grad_norm": 0.21854069828987122, "learning_rate": 0.00019442862986770646, "loss": 0.1886, "step": 400 }, { "epoch": 3.960880195599022, "grad_norm": 0.20974552631378174, "learning_rate": 0.0001941435931484761, "loss": 0.177, "step": 405 }, { "epoch": 4.0, "eval_loss": 0.24309584498405457, "eval_runtime": 399.7773, "eval_samples_per_second": 1.026, "eval_steps_per_second": 1.026, "step": 409 }, { "epoch": 4.009779951100245, "grad_norm": 0.19041913747787476, "learning_rate": 0.0001938516654249428, "loss": 0.1709, "step": 410 }, { "epoch": 4.058679706601467, "grad_norm": 0.22610776126384735, "learning_rate": 0.00019355286806526493, "loss": 0.158, "step": 415 }, { "epoch": 4.10757946210269, "grad_norm": 0.2044234424829483, "learning_rate": 0.00019324722294043558, "loss": 0.1522, "step": 420 }, { "epoch": 4.156479217603912, "grad_norm": 0.2402704805135727, "learning_rate": 0.00019293475242268223, "loss": 0.1509, "step": 425 }, { "epoch": 4.205378973105135, "grad_norm": 0.20224688947200775, "learning_rate": 0.0001926154793838288, "loss": 0.1565, "step": 430 }, { "epoch": 4.254278728606357, "grad_norm": 0.21887710690498352, "learning_rate": 0.00019228942719362143, "loss": 0.1551, "step": 435 }, { "epoch": 4.30317848410758, "grad_norm": 0.20886527001857758, "learning_rate": 0.00019195661971801827, "loss": 0.1568, "step": 440 }, { "epoch": 4.352078239608802, "grad_norm": 0.21612216532230377, "learning_rate": 0.00019161708131744222, "loss": 0.1516, "step": 445 }, { "epoch": 4.400977995110025, "grad_norm": 0.20036669075489044, "learning_rate": 0.00019127083684499806, "loss": 0.1529, "step": 450 }, { "epoch": 4.449877750611247, "grad_norm": 0.3197900950908661, "learning_rate": 0.00019091791164465305, "loss": 0.1854, "step": 455 }, { "epoch": 4.49877750611247, "grad_norm": 0.18851010501384735, "learning_rate": 0.00019055833154938207, "loss": 0.1574, "step": 460 }, { "epoch": 4.547677261613692, "grad_norm": 0.214978888630867, "learning_rate": 0.00019019212287927663, "loss": 0.1555, "step": 465 }, { "epoch": 4.596577017114915, "grad_norm": 0.21155217289924622, "learning_rate": 0.00018981931243961824, "loss": 0.176, "step": 470 }, { "epoch": 4.645476772616137, "grad_norm": 0.18137674033641815, "learning_rate": 0.00018943992751891653, "loss": 0.1575, "step": 475 }, { "epoch": 4.69437652811736, "grad_norm": 0.24663567543029785, "learning_rate": 0.00018905399588691163, "loss": 0.1568, "step": 480 }, { "epoch": 4.743276283618582, "grad_norm": 0.19319510459899902, "learning_rate": 0.0001886615457925417, "loss": 0.1547, "step": 485 }, { "epoch": 4.792176039119805, "grad_norm": 0.18611547350883484, "learning_rate": 0.00018826260596187505, "loss": 0.1755, "step": 490 }, { "epoch": 4.841075794621027, "grad_norm": 0.47814473509788513, "learning_rate": 0.00018785720559600752, "loss": 0.1647, "step": 495 }, { "epoch": 4.88997555012225, "grad_norm": 0.19350242614746094, "learning_rate": 0.00018744537436892516, "loss": 0.155, "step": 500 }, { "epoch": 4.938875305623472, "grad_norm": 0.19956329464912415, "learning_rate": 0.00018702714242533204, "loss": 0.156, "step": 505 }, { "epoch": 4.987775061124695, "grad_norm": 0.20709875226020813, "learning_rate": 0.00018660254037844388, "loss": 0.1616, "step": 510 }, { "epoch": 4.997555012224939, "eval_loss": 0.23624150454998016, "eval_runtime": 387.7068, "eval_samples_per_second": 1.058, "eval_steps_per_second": 1.058, "step": 511 }, { "epoch": 5.036674816625917, "grad_norm": 0.22790652513504028, "learning_rate": 0.00018617159930774715, "loss": 0.1377, "step": 515 }, { "epoch": 5.08557457212714, "grad_norm": 0.21796418726444244, "learning_rate": 0.00018573435075672424, "loss": 0.1326, "step": 520 }, { "epoch": 5.134474327628362, "grad_norm": 0.19105204939842224, "learning_rate": 0.00018529082673054457, "loss": 0.1303, "step": 525 }, { "epoch": 5.183374083129585, "grad_norm": 0.2682870328426361, "learning_rate": 0.00018484105969372182, "loss": 0.1316, "step": 530 }, { "epoch": 5.232273838630807, "grad_norm": 0.18370023369789124, "learning_rate": 0.00018438508256773785, "loss": 0.1323, "step": 535 }, { "epoch": 5.28117359413203, "grad_norm": 0.24072639644145966, "learning_rate": 0.00018392292872863267, "loss": 0.1332, "step": 540 }, { "epoch": 5.330073349633252, "grad_norm": 0.19523735344409943, "learning_rate": 0.00018345463200456164, "loss": 0.1344, "step": 545 }, { "epoch": 5.378973105134475, "grad_norm": 0.24865508079528809, "learning_rate": 0.0001829802266733193, "loss": 0.1359, "step": 550 }, { "epoch": 5.427872860635697, "grad_norm": 0.2039840966463089, "learning_rate": 0.00018249974745983023, "loss": 0.1337, "step": 555 }, { "epoch": 5.47677261613692, "grad_norm": 0.20024679601192474, "learning_rate": 0.00018201322953360758, "loss": 0.154, "step": 560 }, { "epoch": 5.525672371638142, "grad_norm": 0.1976476013660431, "learning_rate": 0.0001815207085061784, "loss": 0.1353, "step": 565 }, { "epoch": 5.574572127139365, "grad_norm": 0.1974327266216278, "learning_rate": 0.00018102222042847737, "loss": 0.1373, "step": 570 }, { "epoch": 5.623471882640587, "grad_norm": 0.27005520462989807, "learning_rate": 0.00018051780178820765, "loss": 0.1437, "step": 575 }, { "epoch": 5.67237163814181, "grad_norm": 0.20781448483467102, "learning_rate": 0.00018000748950717038, "loss": 0.1322, "step": 580 }, { "epoch": 5.721271393643032, "grad_norm": 0.20179703831672668, "learning_rate": 0.000179491320938562, "loss": 0.1378, "step": 585 }, { "epoch": 5.770171149144255, "grad_norm": 0.22105282545089722, "learning_rate": 0.00017896933386423998, "loss": 0.136, "step": 590 }, { "epoch": 5.819070904645477, "grad_norm": 0.4113224446773529, "learning_rate": 0.00017844156649195759, "loss": 0.1495, "step": 595 }, { "epoch": 5.8679706601467, "grad_norm": 0.20451286435127258, "learning_rate": 0.00017790805745256704, "loss": 0.1318, "step": 600 }, { "epoch": 5.916870415647922, "grad_norm": 0.18566569685935974, "learning_rate": 0.0001773688457971919, "loss": 0.1359, "step": 605 }, { "epoch": 5.965770171149144, "grad_norm": 0.1862591803073883, "learning_rate": 0.0001768239709943686, "loss": 0.1358, "step": 610 }, { "epoch": 5.995110024449878, "eval_loss": 0.23938237130641937, "eval_runtime": 387.8478, "eval_samples_per_second": 1.057, "eval_steps_per_second": 1.057, "step": 613 }, { "epoch": 6.014669926650367, "grad_norm": 0.16670842468738556, "learning_rate": 0.0001762734729271575, "loss": 0.1275, "step": 615 }, { "epoch": 6.06356968215159, "grad_norm": 0.23901741206645966, "learning_rate": 0.00017571739189022365, "loss": 0.1113, "step": 620 }, { "epoch": 6.112469437652812, "grad_norm": 0.19317218661308289, "learning_rate": 0.00017515576858688722, "loss": 0.1101, "step": 625 }, { "epoch": 6.161369193154035, "grad_norm": 0.21369099617004395, "learning_rate": 0.00017458864412614434, "loss": 0.1122, "step": 630 }, { "epoch": 6.210268948655257, "grad_norm": 0.21011659502983093, "learning_rate": 0.00017401606001965782, "loss": 0.1136, "step": 635 }, { "epoch": 6.25916870415648, "grad_norm": 0.1860456019639969, "learning_rate": 0.00017343805817871886, "loss": 0.1305, "step": 640 }, { "epoch": 6.308068459657702, "grad_norm": 0.23417602479457855, "learning_rate": 0.00017285468091117904, "loss": 0.1165, "step": 645 }, { "epoch": 6.356968215158924, "grad_norm": 0.189472958445549, "learning_rate": 0.00017226597091835378, "loss": 0.119, "step": 650 }, { "epoch": 6.405867970660147, "grad_norm": 0.2460348904132843, "learning_rate": 0.00017167197129189652, "loss": 0.1188, "step": 655 }, { "epoch": 6.45476772616137, "grad_norm": 0.20059679448604584, "learning_rate": 0.00017107272551064473, "loss": 0.1194, "step": 660 }, { "epoch": 6.503667481662592, "grad_norm": 0.19838838279247284, "learning_rate": 0.00017046827743743726, "loss": 0.1165, "step": 665 }, { "epoch": 6.552567237163814, "grad_norm": 0.20280085504055023, "learning_rate": 0.00016985867131590383, "loss": 0.1168, "step": 670 }, { "epoch": 6.601466992665037, "grad_norm": 0.27974265813827515, "learning_rate": 0.00016924395176722647, "loss": 0.122, "step": 675 }, { "epoch": 6.65036674816626, "grad_norm": 0.1994495540857315, "learning_rate": 0.0001686241637868734, "loss": 0.1173, "step": 680 }, { "epoch": 6.699266503667482, "grad_norm": 0.20043040812015533, "learning_rate": 0.00016799935274130546, "loss": 0.1183, "step": 685 }, { "epoch": 6.748166259168704, "grad_norm": 0.19184747338294983, "learning_rate": 0.00016736956436465573, "loss": 0.1192, "step": 690 }, { "epoch": 6.797066014669927, "grad_norm": 0.20747938752174377, "learning_rate": 0.00016673484475538146, "loss": 0.1188, "step": 695 }, { "epoch": 6.84596577017115, "grad_norm": 0.19285354018211365, "learning_rate": 0.00016609524037289019, "loss": 0.117, "step": 700 }, { "epoch": 6.894865525672372, "grad_norm": 0.18242338299751282, "learning_rate": 0.00016545079803413892, "loss": 0.1208, "step": 705 }, { "epoch": 6.943765281173594, "grad_norm": 0.19887416064739227, "learning_rate": 0.00016480156491020727, "loss": 0.1227, "step": 710 }, { "epoch": 6.992665036674817, "grad_norm": 0.19773922860622406, "learning_rate": 0.00016414758852284478, "loss": 0.1199, "step": 715 }, { "epoch": 6.992665036674817, "eval_loss": 0.24741248786449432, "eval_runtime": 387.335, "eval_samples_per_second": 1.059, "eval_steps_per_second": 1.059, "step": 715 }, { "epoch": 7.041564792176039, "grad_norm": 0.5106807351112366, "learning_rate": 0.0001634889167409923, "loss": 0.1051, "step": 720 }, { "epoch": 7.090464547677262, "grad_norm": 0.18619847297668457, "learning_rate": 0.0001628255977772784, "loss": 0.0979, "step": 725 }, { "epoch": 7.139364303178484, "grad_norm": 0.18676620721817017, "learning_rate": 0.00016215768018449012, "loss": 0.1009, "step": 730 }, { "epoch": 7.188264058679707, "grad_norm": 0.2054695338010788, "learning_rate": 0.00016148521285201927, "loss": 0.1002, "step": 735 }, { "epoch": 7.237163814180929, "grad_norm": 0.20496530830860138, "learning_rate": 0.00016080824500228367, "loss": 0.1011, "step": 740 }, { "epoch": 7.286063569682152, "grad_norm": 0.18679122626781464, "learning_rate": 0.0001601268261871244, "loss": 0.1052, "step": 745 }, { "epoch": 7.334963325183374, "grad_norm": 0.20614224672317505, "learning_rate": 0.00015944100628417868, "loss": 0.1021, "step": 750 }, { "epoch": 7.383863080684597, "grad_norm": 0.20026642084121704, "learning_rate": 0.00015875083549322908, "loss": 0.1019, "step": 755 }, { "epoch": 7.432762836185819, "grad_norm": 0.1852520853281021, "learning_rate": 0.00015805636433252891, "loss": 0.1028, "step": 760 }, { "epoch": 7.481662591687042, "grad_norm": 0.19096429646015167, "learning_rate": 0.0001573576436351046, "loss": 0.1031, "step": 765 }, { "epoch": 7.530562347188264, "grad_norm": 0.18263529241085052, "learning_rate": 0.00015665472454503483, "loss": 0.1033, "step": 770 }, { "epoch": 7.579462102689487, "grad_norm": 0.1884106546640396, "learning_rate": 0.00015594765851370684, "loss": 0.1063, "step": 775 }, { "epoch": 7.628361858190709, "grad_norm": 0.2005338817834854, "learning_rate": 0.0001552364972960506, "loss": 0.1054, "step": 780 }, { "epoch": 7.677261613691932, "grad_norm": 0.184016153216362, "learning_rate": 0.0001545212929467503, "loss": 0.1048, "step": 785 }, { "epoch": 7.726161369193154, "grad_norm": 0.19765067100524902, "learning_rate": 0.0001538020978164341, "loss": 0.1044, "step": 790 }, { "epoch": 7.775061124694377, "grad_norm": 0.18265607953071594, "learning_rate": 0.0001530789645478426, "loss": 0.1051, "step": 795 }, { "epoch": 7.823960880195599, "grad_norm": 0.19815443456172943, "learning_rate": 0.00015235194607197508, "loss": 0.1081, "step": 800 }, { "epoch": 7.872860635696822, "grad_norm": 0.22219662368297577, "learning_rate": 0.0001516210956042153, "loss": 0.1071, "step": 805 }, { "epoch": 7.921760391198044, "grad_norm": 0.20078670978546143, "learning_rate": 0.0001508864666404365, "loss": 0.1075, "step": 810 }, { "epoch": 7.970660146699267, "grad_norm": 0.17794115841388702, "learning_rate": 0.00015014811295308543, "loss": 0.1051, "step": 815 }, { "epoch": 8.0, "eval_loss": 0.2625426948070526, "eval_runtime": 387.4946, "eval_samples_per_second": 1.058, "eval_steps_per_second": 1.058, "step": 818 }, { "epoch": 8.01955990220049, "grad_norm": 0.1608039289712906, "learning_rate": 0.0001494060885872464, "loss": 0.0994, "step": 820 }, { "epoch": 8.06845965770171, "grad_norm": 0.2323434203863144, "learning_rate": 0.00014866044785668563, "loss": 0.0895, "step": 825 }, { "epoch": 8.117359413202934, "grad_norm": 0.17606528103351593, "learning_rate": 0.0001479112453398753, "loss": 0.0849, "step": 830 }, { "epoch": 8.166259168704157, "grad_norm": 0.19025173783302307, "learning_rate": 0.0001471585358759987, "loss": 0.0886, "step": 835 }, { "epoch": 8.21515892420538, "grad_norm": 0.1990627497434616, "learning_rate": 0.00014640237456093634, "loss": 0.0905, "step": 840 }, { "epoch": 8.2640586797066, "grad_norm": 0.1725684553384781, "learning_rate": 0.00014564281674323297, "loss": 0.0899, "step": 845 }, { "epoch": 8.312958435207824, "grad_norm": 0.18845060467720032, "learning_rate": 0.00014487991802004623, "loss": 0.0886, "step": 850 }, { "epoch": 8.361858190709047, "grad_norm": 0.23856212198734283, "learning_rate": 0.00014411373423307714, "loss": 0.0924, "step": 855 }, { "epoch": 8.41075794621027, "grad_norm": 0.18084120750427246, "learning_rate": 0.00014334432146448272, "loss": 0.0918, "step": 860 }, { "epoch": 8.45965770171149, "grad_norm": 0.18600909411907196, "learning_rate": 0.00014257173603277095, "loss": 0.0913, "step": 865 }, { "epoch": 8.508557457212714, "grad_norm": 0.1851680874824524, "learning_rate": 0.00014179603448867835, "loss": 0.0912, "step": 870 }, { "epoch": 8.557457212713937, "grad_norm": 0.1818709820508957, "learning_rate": 0.00014101727361103076, "loss": 0.0903, "step": 875 }, { "epoch": 8.60635696821516, "grad_norm": 0.19458520412445068, "learning_rate": 0.00014023551040258725, "loss": 0.0916, "step": 880 }, { "epoch": 8.65525672371638, "grad_norm": 0.17777447402477264, "learning_rate": 0.00013945080208586775, "loss": 0.0928, "step": 885 }, { "epoch": 8.704156479217604, "grad_norm": 0.20647075772285461, "learning_rate": 0.00013866320609896447, "loss": 0.0926, "step": 890 }, { "epoch": 8.753056234718827, "grad_norm": 0.18589670956134796, "learning_rate": 0.00013787278009133776, "loss": 0.0934, "step": 895 }, { "epoch": 8.80195599022005, "grad_norm": 0.19582615792751312, "learning_rate": 0.00013707958191959608, "loss": 0.0954, "step": 900 }, { "epoch": 8.85085574572127, "grad_norm": 0.19688870012760162, "learning_rate": 0.00013628366964326153, "loss": 0.0925, "step": 905 }, { "epoch": 8.899755501222494, "grad_norm": 0.1874823123216629, "learning_rate": 0.00013548510152051963, "loss": 0.0939, "step": 910 }, { "epoch": 8.948655256723717, "grad_norm": 0.1876133382320404, "learning_rate": 0.00013468393600395525, "loss": 0.097, "step": 915 }, { "epoch": 8.99755501222494, "grad_norm": 0.1735718548297882, "learning_rate": 0.00013388023173627414, "loss": 0.0945, "step": 920 }, { "epoch": 8.99755501222494, "eval_loss": 0.27974453568458557, "eval_runtime": 387.9073, "eval_samples_per_second": 1.057, "eval_steps_per_second": 1.057, "step": 920 }, { "epoch": 9.04645476772616, "grad_norm": 0.1655295491218567, "learning_rate": 0.00013307404754601013, "loss": 0.0806, "step": 925 }, { "epoch": 9.095354523227384, "grad_norm": 0.19395217299461365, "learning_rate": 0.0001322654424432195, "loss": 0.0788, "step": 930 }, { "epoch": 9.144254278728607, "grad_norm": 0.18941174447536469, "learning_rate": 0.00013145447561516138, "loss": 0.0793, "step": 935 }, { "epoch": 9.19315403422983, "grad_norm": 0.20010443031787872, "learning_rate": 0.00013064120642196548, "loss": 0.0807, "step": 940 }, { "epoch": 9.24205378973105, "grad_norm": 0.20777645707130432, "learning_rate": 0.00012982569439228713, "loss": 0.08, "step": 945 }, { "epoch": 9.290953545232274, "grad_norm": 0.173665389418602, "learning_rate": 0.00012900799921895003, "loss": 0.0808, "step": 950 }, { "epoch": 9.339853300733497, "grad_norm": 0.20865468680858612, "learning_rate": 0.0001281881807545769, "loss": 0.0808, "step": 955 }, { "epoch": 9.38875305623472, "grad_norm": 0.18372130393981934, "learning_rate": 0.0001273662990072083, "loss": 0.0804, "step": 960 }, { "epoch": 9.43765281173594, "grad_norm": 0.1785283237695694, "learning_rate": 0.00012654241413591054, "loss": 0.0812, "step": 965 }, { "epoch": 9.486552567237164, "grad_norm": 0.17695043981075287, "learning_rate": 0.000125716586446372, "loss": 0.0827, "step": 970 }, { "epoch": 9.535452322738386, "grad_norm": 0.18287776410579681, "learning_rate": 0.00012488887638648907, "loss": 0.083, "step": 975 }, { "epoch": 9.58435207823961, "grad_norm": 0.20748884975910187, "learning_rate": 0.00012405934454194146, "loss": 0.0816, "step": 980 }, { "epoch": 9.63325183374083, "grad_norm": 0.18160052597522736, "learning_rate": 0.00012322805163175762, "loss": 0.0823, "step": 985 }, { "epoch": 9.682151589242054, "grad_norm": 0.17889925837516785, "learning_rate": 0.0001223950585038703, "loss": 0.0822, "step": 990 }, { "epoch": 9.731051344743276, "grad_norm": 0.1896965056657791, "learning_rate": 0.00012156042613066258, "loss": 0.0839, "step": 995 }, { "epoch": 9.7799511002445, "grad_norm": 0.19203361868858337, "learning_rate": 0.00012072421560450497, "loss": 0.0828, "step": 1000 }, { "epoch": 9.82885085574572, "grad_norm": 0.18262554705142975, "learning_rate": 0.00011988648813328367, "loss": 0.0838, "step": 1005 }, { "epoch": 9.877750611246944, "grad_norm": 0.18471267819404602, "learning_rate": 0.0001190473050359203, "loss": 0.084, "step": 1010 }, { "epoch": 9.926650366748166, "grad_norm": 0.18675756454467773, "learning_rate": 0.00011820672773788353, "loss": 0.0835, "step": 1015 }, { "epoch": 9.97555012224939, "grad_norm": 0.17983846366405487, "learning_rate": 0.00011736481776669306, "loss": 0.0843, "step": 1020 }, { "epoch": 9.995110024449877, "eval_loss": 0.2892283499240875, "eval_runtime": 389.1699, "eval_samples_per_second": 1.054, "eval_steps_per_second": 1.054, "step": 1022 }, { "epoch": 9.995110024449877, "step": 1022, "total_flos": 7.585797735459062e+17, "train_loss": 0.2326957560244605, "train_runtime": 28787.7929, "train_samples_per_second": 0.568, "train_steps_per_second": 0.071 } ], "logging_steps": 5, "max_steps": 2040, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.585797735459062e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }