{ "best_metric": null, "best_model_checkpoint": null, "epoch": 32.467532467532465, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006493506493506494, "grad_norm": 33.13806915283203, "learning_rate": 9.999999013039593e-05, "loss": 5.645, "step": 1 }, { "epoch": 0.012987012987012988, "grad_norm": 23.576648712158203, "learning_rate": 9.99999605215876e-05, "loss": 3.9143, "step": 2 }, { "epoch": 0.01948051948051948, "grad_norm": 8.88882827758789, "learning_rate": 9.999991117358668e-05, "loss": 3.524, "step": 3 }, { "epoch": 0.025974025974025976, "grad_norm": 7.060589790344238, "learning_rate": 9.999984208641271e-05, "loss": 3.3346, "step": 4 }, { "epoch": 0.032467532467532464, "grad_norm": 6.797203063964844, "learning_rate": 9.999975326009292e-05, "loss": 3.097, "step": 5 }, { "epoch": 0.03896103896103896, "grad_norm": 5.928432464599609, "learning_rate": 9.999964469466236e-05, "loss": 3.0141, "step": 6 }, { "epoch": 0.045454545454545456, "grad_norm": 5.673449516296387, "learning_rate": 9.999951639016395e-05, "loss": 3.1291, "step": 7 }, { "epoch": 0.05194805194805195, "grad_norm": 5.7318596839904785, "learning_rate": 9.99993683466483e-05, "loss": 2.9019, "step": 8 }, { "epoch": 0.05844155844155844, "grad_norm": 6.2937493324279785, "learning_rate": 9.999920056417385e-05, "loss": 2.9567, "step": 9 }, { "epoch": 0.06493506493506493, "grad_norm": 5.941080570220947, "learning_rate": 9.999901304280685e-05, "loss": 3.2404, "step": 10 }, { "epoch": 0.07142857142857142, "grad_norm": 5.431388854980469, "learning_rate": 9.999880578262135e-05, "loss": 2.8637, "step": 11 }, { "epoch": 0.07792207792207792, "grad_norm": 5.87606143951416, "learning_rate": 9.999857878369916e-05, "loss": 2.9333, "step": 12 }, { "epoch": 0.08441558441558442, "grad_norm": 5.996065616607666, "learning_rate": 9.999833204612988e-05, "loss": 3.2531, "step": 13 }, { "epoch": 0.09090909090909091, "grad_norm": 5.6484832763671875, "learning_rate": 9.999806557001093e-05, "loss": 2.8224, "step": 14 }, { "epoch": 0.09740259740259741, "grad_norm": 5.376333713531494, "learning_rate": 9.99977793554475e-05, "loss": 2.7981, "step": 15 }, { "epoch": 0.1038961038961039, "grad_norm": 5.346993446350098, "learning_rate": 9.999747340255259e-05, "loss": 2.6952, "step": 16 }, { "epoch": 0.11038961038961038, "grad_norm": 5.37903356552124, "learning_rate": 9.999714771144701e-05, "loss": 2.7209, "step": 17 }, { "epoch": 0.11688311688311688, "grad_norm": 5.450172424316406, "learning_rate": 9.99968022822593e-05, "loss": 2.7421, "step": 18 }, { "epoch": 0.12337662337662338, "grad_norm": 5.602138042449951, "learning_rate": 9.999643711512586e-05, "loss": 2.4858, "step": 19 }, { "epoch": 0.12987012987012986, "grad_norm": 5.360761642456055, "learning_rate": 9.999605221019081e-05, "loss": 2.7158, "step": 20 }, { "epoch": 0.13636363636363635, "grad_norm": 6.177570343017578, "learning_rate": 9.999564756760615e-05, "loss": 2.6152, "step": 21 }, { "epoch": 0.14285714285714285, "grad_norm": 6.498959541320801, "learning_rate": 9.99952231875316e-05, "loss": 2.7611, "step": 22 }, { "epoch": 0.14935064935064934, "grad_norm": 5.865311622619629, "learning_rate": 9.999477907013473e-05, "loss": 2.6758, "step": 23 }, { "epoch": 0.15584415584415584, "grad_norm": 6.0329508781433105, "learning_rate": 9.999431521559082e-05, "loss": 2.6535, "step": 24 }, { "epoch": 0.16233766233766234, "grad_norm": 5.814820766448975, "learning_rate": 9.999383162408304e-05, "loss": 2.6313, "step": 25 }, { "epoch": 0.16883116883116883, "grad_norm": 6.224546432495117, "learning_rate": 9.999332829580226e-05, "loss": 2.627, "step": 26 }, { "epoch": 0.17532467532467533, "grad_norm": 5.967427730560303, "learning_rate": 9.999280523094724e-05, "loss": 2.5997, "step": 27 }, { "epoch": 0.18181818181818182, "grad_norm": 5.6386213302612305, "learning_rate": 9.999226242972444e-05, "loss": 2.4397, "step": 28 }, { "epoch": 0.18831168831168832, "grad_norm": 5.742101669311523, "learning_rate": 9.999169989234815e-05, "loss": 2.6343, "step": 29 }, { "epoch": 0.19480519480519481, "grad_norm": 5.685164928436279, "learning_rate": 9.999111761904046e-05, "loss": 2.3948, "step": 30 }, { "epoch": 0.2012987012987013, "grad_norm": 5.332027912139893, "learning_rate": 9.999051561003123e-05, "loss": 2.4506, "step": 31 }, { "epoch": 0.2077922077922078, "grad_norm": 5.785808086395264, "learning_rate": 9.998989386555814e-05, "loss": 2.7135, "step": 32 }, { "epoch": 0.21428571428571427, "grad_norm": 6.223892688751221, "learning_rate": 9.998925238586665e-05, "loss": 2.5245, "step": 33 }, { "epoch": 0.22077922077922077, "grad_norm": 5.84872579574585, "learning_rate": 9.998859117121e-05, "loss": 2.4981, "step": 34 }, { "epoch": 0.22727272727272727, "grad_norm": 5.572268486022949, "learning_rate": 9.998791022184922e-05, "loss": 2.608, "step": 35 }, { "epoch": 0.23376623376623376, "grad_norm": 5.090015411376953, "learning_rate": 9.998720953805312e-05, "loss": 2.4585, "step": 36 }, { "epoch": 0.24025974025974026, "grad_norm": 5.478267192840576, "learning_rate": 9.998648912009835e-05, "loss": 2.3653, "step": 37 }, { "epoch": 0.24675324675324675, "grad_norm": 5.772948741912842, "learning_rate": 9.998574896826931e-05, "loss": 2.438, "step": 38 }, { "epoch": 0.2532467532467532, "grad_norm": 4.907359600067139, "learning_rate": 9.998498908285819e-05, "loss": 2.2816, "step": 39 }, { "epoch": 0.2597402597402597, "grad_norm": 5.5392045974731445, "learning_rate": 9.9984209464165e-05, "loss": 2.3041, "step": 40 }, { "epoch": 0.2662337662337662, "grad_norm": 4.999871253967285, "learning_rate": 9.99834101124975e-05, "loss": 2.1686, "step": 41 }, { "epoch": 0.2727272727272727, "grad_norm": 6.051657676696777, "learning_rate": 9.998259102817129e-05, "loss": 2.3825, "step": 42 }, { "epoch": 0.2792207792207792, "grad_norm": 4.7146687507629395, "learning_rate": 9.99817522115097e-05, "loss": 2.2417, "step": 43 }, { "epoch": 0.2857142857142857, "grad_norm": 4.671162128448486, "learning_rate": 9.998089366284391e-05, "loss": 2.241, "step": 44 }, { "epoch": 0.2922077922077922, "grad_norm": 6.131312847137451, "learning_rate": 9.998001538251282e-05, "loss": 2.5224, "step": 45 }, { "epoch": 0.2987012987012987, "grad_norm": 5.212986469268799, "learning_rate": 9.997911737086322e-05, "loss": 2.452, "step": 46 }, { "epoch": 0.3051948051948052, "grad_norm": 4.900334358215332, "learning_rate": 9.997819962824957e-05, "loss": 2.4364, "step": 47 }, { "epoch": 0.3116883116883117, "grad_norm": 5.208124160766602, "learning_rate": 9.997726215503422e-05, "loss": 2.3829, "step": 48 }, { "epoch": 0.3181818181818182, "grad_norm": 4.627975940704346, "learning_rate": 9.997630495158728e-05, "loss": 2.0702, "step": 49 }, { "epoch": 0.3246753246753247, "grad_norm": 5.099819183349609, "learning_rate": 9.997532801828658e-05, "loss": 2.3067, "step": 50 }, { "epoch": 0.33116883116883117, "grad_norm": 4.694891929626465, "learning_rate": 9.997433135551786e-05, "loss": 2.3014, "step": 51 }, { "epoch": 0.33766233766233766, "grad_norm": 5.41646146774292, "learning_rate": 9.997331496367455e-05, "loss": 2.4805, "step": 52 }, { "epoch": 0.34415584415584416, "grad_norm": 5.233139514923096, "learning_rate": 9.997227884315791e-05, "loss": 2.2605, "step": 53 }, { "epoch": 0.35064935064935066, "grad_norm": 5.671755313873291, "learning_rate": 9.9971222994377e-05, "loss": 2.3303, "step": 54 }, { "epoch": 0.35714285714285715, "grad_norm": 5.070577144622803, "learning_rate": 9.997014741774866e-05, "loss": 2.3019, "step": 55 }, { "epoch": 0.36363636363636365, "grad_norm": 4.925657272338867, "learning_rate": 9.996905211369748e-05, "loss": 2.3937, "step": 56 }, { "epoch": 0.37012987012987014, "grad_norm": 5.051799774169922, "learning_rate": 9.996793708265586e-05, "loss": 2.358, "step": 57 }, { "epoch": 0.37662337662337664, "grad_norm": 4.3391828536987305, "learning_rate": 9.996680232506405e-05, "loss": 2.0576, "step": 58 }, { "epoch": 0.38311688311688313, "grad_norm": 4.852685451507568, "learning_rate": 9.996564784137e-05, "loss": 2.3678, "step": 59 }, { "epoch": 0.38961038961038963, "grad_norm": 4.842132091522217, "learning_rate": 9.996447363202946e-05, "loss": 2.3493, "step": 60 }, { "epoch": 0.3961038961038961, "grad_norm": 4.50392484664917, "learning_rate": 9.996327969750605e-05, "loss": 2.516, "step": 61 }, { "epoch": 0.4025974025974026, "grad_norm": 5.139745712280273, "learning_rate": 9.996206603827105e-05, "loss": 2.2679, "step": 62 }, { "epoch": 0.4090909090909091, "grad_norm": 4.663613319396973, "learning_rate": 9.996083265480365e-05, "loss": 2.2269, "step": 63 }, { "epoch": 0.4155844155844156, "grad_norm": 4.955394744873047, "learning_rate": 9.995957954759071e-05, "loss": 2.5408, "step": 64 }, { "epoch": 0.42207792207792205, "grad_norm": 4.935218334197998, "learning_rate": 9.9958306717127e-05, "loss": 2.4971, "step": 65 }, { "epoch": 0.42857142857142855, "grad_norm": 4.313268661499023, "learning_rate": 9.995701416391499e-05, "loss": 2.2366, "step": 66 }, { "epoch": 0.43506493506493504, "grad_norm": 4.81383752822876, "learning_rate": 9.995570188846495e-05, "loss": 2.7036, "step": 67 }, { "epoch": 0.44155844155844154, "grad_norm": 5.073368549346924, "learning_rate": 9.995436989129495e-05, "loss": 2.2867, "step": 68 }, { "epoch": 0.44805194805194803, "grad_norm": 4.764294147491455, "learning_rate": 9.995301817293084e-05, "loss": 2.5181, "step": 69 }, { "epoch": 0.45454545454545453, "grad_norm": 4.322338104248047, "learning_rate": 9.995164673390625e-05, "loss": 2.3062, "step": 70 }, { "epoch": 0.461038961038961, "grad_norm": 4.6427764892578125, "learning_rate": 9.995025557476261e-05, "loss": 2.2503, "step": 71 }, { "epoch": 0.4675324675324675, "grad_norm": 4.552596092224121, "learning_rate": 9.994884469604912e-05, "loss": 2.268, "step": 72 }, { "epoch": 0.474025974025974, "grad_norm": 3.8830368518829346, "learning_rate": 9.99474140983228e-05, "loss": 2.0846, "step": 73 }, { "epoch": 0.4805194805194805, "grad_norm": 4.275032043457031, "learning_rate": 9.994596378214841e-05, "loss": 2.2989, "step": 74 }, { "epoch": 0.487012987012987, "grad_norm": 4.56163215637207, "learning_rate": 9.994449374809851e-05, "loss": 2.0471, "step": 75 }, { "epoch": 0.4935064935064935, "grad_norm": 4.3414626121521, "learning_rate": 9.994300399675342e-05, "loss": 2.2404, "step": 76 }, { "epoch": 0.5, "grad_norm": 4.33914041519165, "learning_rate": 9.994149452870133e-05, "loss": 2.419, "step": 77 }, { "epoch": 0.5064935064935064, "grad_norm": 4.387986660003662, "learning_rate": 9.99399653445381e-05, "loss": 2.3415, "step": 78 }, { "epoch": 0.512987012987013, "grad_norm": 4.46196985244751, "learning_rate": 9.993841644486747e-05, "loss": 2.0966, "step": 79 }, { "epoch": 0.5194805194805194, "grad_norm": 4.377128601074219, "learning_rate": 9.993684783030088e-05, "loss": 2.1728, "step": 80 }, { "epoch": 0.525974025974026, "grad_norm": 4.3036789894104, "learning_rate": 9.99352595014576e-05, "loss": 2.3914, "step": 81 }, { "epoch": 0.5324675324675324, "grad_norm": 3.62605619430542, "learning_rate": 9.993365145896473e-05, "loss": 1.768, "step": 82 }, { "epoch": 0.538961038961039, "grad_norm": 4.524649143218994, "learning_rate": 9.993202370345705e-05, "loss": 2.3089, "step": 83 }, { "epoch": 0.5454545454545454, "grad_norm": 4.413171291351318, "learning_rate": 9.993037623557716e-05, "loss": 2.3156, "step": 84 }, { "epoch": 0.551948051948052, "grad_norm": 4.606533527374268, "learning_rate": 9.992870905597548e-05, "loss": 2.4141, "step": 85 }, { "epoch": 0.5584415584415584, "grad_norm": 4.309783935546875, "learning_rate": 9.99270221653102e-05, "loss": 2.1423, "step": 86 }, { "epoch": 0.564935064935065, "grad_norm": 4.384764671325684, "learning_rate": 9.992531556424726e-05, "loss": 2.5358, "step": 87 }, { "epoch": 0.5714285714285714, "grad_norm": 4.653176784515381, "learning_rate": 9.99235892534604e-05, "loss": 2.5152, "step": 88 }, { "epoch": 0.577922077922078, "grad_norm": 4.79496955871582, "learning_rate": 9.992184323363112e-05, "loss": 2.1936, "step": 89 }, { "epoch": 0.5844155844155844, "grad_norm": 3.893005847930908, "learning_rate": 9.992007750544876e-05, "loss": 1.916, "step": 90 }, { "epoch": 0.5909090909090909, "grad_norm": 4.456315040588379, "learning_rate": 9.991829206961037e-05, "loss": 2.6147, "step": 91 }, { "epoch": 0.5974025974025974, "grad_norm": 4.070108890533447, "learning_rate": 9.991648692682083e-05, "loss": 2.3899, "step": 92 }, { "epoch": 0.6038961038961039, "grad_norm": 4.310725212097168, "learning_rate": 9.991466207779278e-05, "loss": 1.9895, "step": 93 }, { "epoch": 0.6103896103896104, "grad_norm": 4.388233184814453, "learning_rate": 9.991281752324664e-05, "loss": 2.43, "step": 94 }, { "epoch": 0.6168831168831169, "grad_norm": 4.071033000946045, "learning_rate": 9.99109532639106e-05, "loss": 2.6234, "step": 95 }, { "epoch": 0.6233766233766234, "grad_norm": 4.230044841766357, "learning_rate": 9.990906930052064e-05, "loss": 2.2542, "step": 96 }, { "epoch": 0.6298701298701299, "grad_norm": 4.155112266540527, "learning_rate": 9.990716563382055e-05, "loss": 2.386, "step": 97 }, { "epoch": 0.6363636363636364, "grad_norm": 3.8967134952545166, "learning_rate": 9.990524226456182e-05, "loss": 2.1869, "step": 98 }, { "epoch": 0.6428571428571429, "grad_norm": 3.701253890991211, "learning_rate": 9.99032991935038e-05, "loss": 2.0778, "step": 99 }, { "epoch": 0.6493506493506493, "grad_norm": 3.9027299880981445, "learning_rate": 9.990133642141359e-05, "loss": 2.3151, "step": 100 }, { "epoch": 0.6558441558441559, "grad_norm": 3.9109201431274414, "learning_rate": 9.989935394906602e-05, "loss": 2.2484, "step": 101 }, { "epoch": 0.6623376623376623, "grad_norm": 3.9390170574188232, "learning_rate": 9.989735177724378e-05, "loss": 2.1411, "step": 102 }, { "epoch": 0.6688311688311688, "grad_norm": 3.8148396015167236, "learning_rate": 9.989532990673728e-05, "loss": 2.238, "step": 103 }, { "epoch": 0.6753246753246753, "grad_norm": 3.8671321868896484, "learning_rate": 9.989328833834471e-05, "loss": 2.1264, "step": 104 }, { "epoch": 0.6818181818181818, "grad_norm": 4.0604448318481445, "learning_rate": 9.989122707287208e-05, "loss": 2.3146, "step": 105 }, { "epoch": 0.6883116883116883, "grad_norm": 4.460545539855957, "learning_rate": 9.988914611113311e-05, "loss": 2.2619, "step": 106 }, { "epoch": 0.6948051948051948, "grad_norm": 3.8163511753082275, "learning_rate": 9.988704545394936e-05, "loss": 2.3224, "step": 107 }, { "epoch": 0.7012987012987013, "grad_norm": 3.963921070098877, "learning_rate": 9.988492510215011e-05, "loss": 2.0558, "step": 108 }, { "epoch": 0.7077922077922078, "grad_norm": 3.638936996459961, "learning_rate": 9.988278505657247e-05, "loss": 2.2582, "step": 109 }, { "epoch": 0.7142857142857143, "grad_norm": 3.886962890625, "learning_rate": 9.988062531806126e-05, "loss": 2.369, "step": 110 }, { "epoch": 0.7207792207792207, "grad_norm": 3.7281506061553955, "learning_rate": 9.987844588746915e-05, "loss": 2.3923, "step": 111 }, { "epoch": 0.7272727272727273, "grad_norm": 4.045536041259766, "learning_rate": 9.987624676565652e-05, "loss": 2.2701, "step": 112 }, { "epoch": 0.7337662337662337, "grad_norm": 3.914747953414917, "learning_rate": 9.987402795349154e-05, "loss": 2.3457, "step": 113 }, { "epoch": 0.7402597402597403, "grad_norm": 3.742039203643799, "learning_rate": 9.98717894518502e-05, "loss": 2.1281, "step": 114 }, { "epoch": 0.7467532467532467, "grad_norm": 3.6615986824035645, "learning_rate": 9.986953126161619e-05, "loss": 2.2539, "step": 115 }, { "epoch": 0.7532467532467533, "grad_norm": 4.145374298095703, "learning_rate": 9.986725338368102e-05, "loss": 2.168, "step": 116 }, { "epoch": 0.7597402597402597, "grad_norm": 3.4575271606445312, "learning_rate": 9.986495581894395e-05, "loss": 2.2219, "step": 117 }, { "epoch": 0.7662337662337663, "grad_norm": 3.4362294673919678, "learning_rate": 9.986263856831204e-05, "loss": 2.1648, "step": 118 }, { "epoch": 0.7727272727272727, "grad_norm": 3.6311452388763428, "learning_rate": 9.986030163270011e-05, "loss": 2.2759, "step": 119 }, { "epoch": 0.7792207792207793, "grad_norm": 3.4129133224487305, "learning_rate": 9.98579450130307e-05, "loss": 1.9268, "step": 120 }, { "epoch": 0.7857142857142857, "grad_norm": 3.2100963592529297, "learning_rate": 9.98555687102342e-05, "loss": 2.0587, "step": 121 }, { "epoch": 0.7922077922077922, "grad_norm": 3.662796974182129, "learning_rate": 9.985317272524876e-05, "loss": 2.0628, "step": 122 }, { "epoch": 0.7987012987012987, "grad_norm": 3.4176554679870605, "learning_rate": 9.985075705902022e-05, "loss": 2.1833, "step": 123 }, { "epoch": 0.8051948051948052, "grad_norm": 3.2369673252105713, "learning_rate": 9.98483217125023e-05, "loss": 2.226, "step": 124 }, { "epoch": 0.8116883116883117, "grad_norm": 3.5990474224090576, "learning_rate": 9.98458666866564e-05, "loss": 2.2601, "step": 125 }, { "epoch": 0.8181818181818182, "grad_norm": 2.904496431350708, "learning_rate": 9.984339198245175e-05, "loss": 1.7978, "step": 126 }, { "epoch": 0.8246753246753247, "grad_norm": 3.4239206314086914, "learning_rate": 9.98408976008653e-05, "loss": 1.8985, "step": 127 }, { "epoch": 0.8311688311688312, "grad_norm": 3.4764034748077393, "learning_rate": 9.983838354288181e-05, "loss": 2.0324, "step": 128 }, { "epoch": 0.8376623376623377, "grad_norm": 3.777717351913452, "learning_rate": 9.98358498094938e-05, "loss": 2.35, "step": 129 }, { "epoch": 0.8441558441558441, "grad_norm": 3.3230550289154053, "learning_rate": 9.983329640170149e-05, "loss": 2.0381, "step": 130 }, { "epoch": 0.8506493506493507, "grad_norm": 3.5832202434539795, "learning_rate": 9.9830723320513e-05, "loss": 2.1329, "step": 131 }, { "epoch": 0.8571428571428571, "grad_norm": 3.628079414367676, "learning_rate": 9.982813056694412e-05, "loss": 2.1468, "step": 132 }, { "epoch": 0.8636363636363636, "grad_norm": 3.3164730072021484, "learning_rate": 9.982551814201839e-05, "loss": 2.1018, "step": 133 }, { "epoch": 0.8701298701298701, "grad_norm": 3.202061414718628, "learning_rate": 9.98228860467672e-05, "loss": 2.1209, "step": 134 }, { "epoch": 0.8766233766233766, "grad_norm": 3.5353541374206543, "learning_rate": 9.982023428222962e-05, "loss": 2.3157, "step": 135 }, { "epoch": 0.8831168831168831, "grad_norm": 3.284064292907715, "learning_rate": 9.981756284945256e-05, "loss": 2.1389, "step": 136 }, { "epoch": 0.8896103896103896, "grad_norm": 3.548656463623047, "learning_rate": 9.981487174949065e-05, "loss": 2.0996, "step": 137 }, { "epoch": 0.8961038961038961, "grad_norm": 3.6342179775238037, "learning_rate": 9.981216098340629e-05, "loss": 2.2534, "step": 138 }, { "epoch": 0.9025974025974026, "grad_norm": 3.3490617275238037, "learning_rate": 9.980943055226964e-05, "loss": 2.0916, "step": 139 }, { "epoch": 0.9090909090909091, "grad_norm": 3.469787120819092, "learning_rate": 9.980668045715864e-05, "loss": 2.0929, "step": 140 }, { "epoch": 0.9155844155844156, "grad_norm": 3.651165723800659, "learning_rate": 9.980391069915897e-05, "loss": 2.3875, "step": 141 }, { "epoch": 0.922077922077922, "grad_norm": 3.3916146755218506, "learning_rate": 9.980112127936409e-05, "loss": 2.2071, "step": 142 }, { "epoch": 0.9285714285714286, "grad_norm": 3.484081745147705, "learning_rate": 9.979831219887525e-05, "loss": 2.2033, "step": 143 }, { "epoch": 0.935064935064935, "grad_norm": 3.538928270339966, "learning_rate": 9.979548345880141e-05, "loss": 2.2221, "step": 144 }, { "epoch": 0.9415584415584416, "grad_norm": 3.1394541263580322, "learning_rate": 9.979263506025929e-05, "loss": 2.1657, "step": 145 }, { "epoch": 0.948051948051948, "grad_norm": 3.273376941680908, "learning_rate": 9.978976700437342e-05, "loss": 2.0307, "step": 146 }, { "epoch": 0.9545454545454546, "grad_norm": 3.474287748336792, "learning_rate": 9.978687929227606e-05, "loss": 2.2569, "step": 147 }, { "epoch": 0.961038961038961, "grad_norm": 3.40504789352417, "learning_rate": 9.978397192510721e-05, "loss": 2.2194, "step": 148 }, { "epoch": 0.9675324675324676, "grad_norm": 3.5432350635528564, "learning_rate": 9.978104490401467e-05, "loss": 2.208, "step": 149 }, { "epoch": 0.974025974025974, "grad_norm": 3.4987633228302, "learning_rate": 9.977809823015401e-05, "loss": 2.108, "step": 150 }, { "epoch": 0.9805194805194806, "grad_norm": 2.981435775756836, "learning_rate": 9.977513190468848e-05, "loss": 1.9615, "step": 151 }, { "epoch": 0.987012987012987, "grad_norm": 3.688192129135132, "learning_rate": 9.977214592878916e-05, "loss": 2.3847, "step": 152 }, { "epoch": 0.9935064935064936, "grad_norm": 3.7138729095458984, "learning_rate": 9.976914030363487e-05, "loss": 2.1349, "step": 153 }, { "epoch": 1.0, "grad_norm": 798.8204956054688, "learning_rate": 9.976611503041218e-05, "loss": 1.9848, "step": 154 }, { "epoch": 1.0064935064935066, "grad_norm": 3.2269575595855713, "learning_rate": 9.976307011031542e-05, "loss": 1.985, "step": 155 }, { "epoch": 1.0129870129870129, "grad_norm": 3.4152016639709473, "learning_rate": 9.976000554454668e-05, "loss": 1.8731, "step": 156 }, { "epoch": 1.0194805194805194, "grad_norm": 2.8445022106170654, "learning_rate": 9.975692133431579e-05, "loss": 1.6258, "step": 157 }, { "epoch": 1.025974025974026, "grad_norm": 3.289297580718994, "learning_rate": 9.975381748084035e-05, "loss": 1.9762, "step": 158 }, { "epoch": 1.0324675324675325, "grad_norm": 3.303457260131836, "learning_rate": 9.975069398534574e-05, "loss": 1.8754, "step": 159 }, { "epoch": 1.0389610389610389, "grad_norm": 3.0584588050842285, "learning_rate": 9.974755084906502e-05, "loss": 1.7928, "step": 160 }, { "epoch": 1.0454545454545454, "grad_norm": 3.4114456176757812, "learning_rate": 9.974438807323907e-05, "loss": 1.9171, "step": 161 }, { "epoch": 1.051948051948052, "grad_norm": 3.154326915740967, "learning_rate": 9.974120565911652e-05, "loss": 1.8924, "step": 162 }, { "epoch": 1.0584415584415585, "grad_norm": 4.066158771514893, "learning_rate": 9.973800360795372e-05, "loss": 2.1918, "step": 163 }, { "epoch": 1.0649350649350648, "grad_norm": 3.5308237075805664, "learning_rate": 9.97347819210148e-05, "loss": 2.0059, "step": 164 }, { "epoch": 1.0714285714285714, "grad_norm": 3.352773427963257, "learning_rate": 9.973154059957162e-05, "loss": 2.0407, "step": 165 }, { "epoch": 1.077922077922078, "grad_norm": 3.2745213508605957, "learning_rate": 9.972827964490381e-05, "loss": 1.9063, "step": 166 }, { "epoch": 1.0844155844155845, "grad_norm": 3.0867488384246826, "learning_rate": 9.972499905829875e-05, "loss": 1.7633, "step": 167 }, { "epoch": 1.0909090909090908, "grad_norm": 3.094118118286133, "learning_rate": 9.972169884105153e-05, "loss": 1.7058, "step": 168 }, { "epoch": 1.0974025974025974, "grad_norm": 3.409409284591675, "learning_rate": 9.971837899446505e-05, "loss": 1.9263, "step": 169 }, { "epoch": 1.103896103896104, "grad_norm": 3.7799603939056396, "learning_rate": 9.971503951984995e-05, "loss": 2.0513, "step": 170 }, { "epoch": 1.1103896103896105, "grad_norm": 3.531250238418579, "learning_rate": 9.971168041852456e-05, "loss": 1.7284, "step": 171 }, { "epoch": 1.1168831168831168, "grad_norm": 3.0355734825134277, "learning_rate": 9.970830169181505e-05, "loss": 1.7999, "step": 172 }, { "epoch": 1.1233766233766234, "grad_norm": 3.7033843994140625, "learning_rate": 9.970490334105524e-05, "loss": 2.1174, "step": 173 }, { "epoch": 1.12987012987013, "grad_norm": 3.9485671520233154, "learning_rate": 9.970148536758677e-05, "loss": 2.0429, "step": 174 }, { "epoch": 1.1363636363636362, "grad_norm": 3.2483878135681152, "learning_rate": 9.9698047772759e-05, "loss": 1.8513, "step": 175 }, { "epoch": 1.1428571428571428, "grad_norm": 3.4199907779693604, "learning_rate": 9.969459055792903e-05, "loss": 1.7395, "step": 176 }, { "epoch": 1.1493506493506493, "grad_norm": 3.7996020317077637, "learning_rate": 9.969111372446171e-05, "loss": 1.9037, "step": 177 }, { "epoch": 1.155844155844156, "grad_norm": 3.3956806659698486, "learning_rate": 9.968761727372964e-05, "loss": 1.6993, "step": 178 }, { "epoch": 1.1623376623376624, "grad_norm": 3.3464372158050537, "learning_rate": 9.96841012071132e-05, "loss": 1.7862, "step": 179 }, { "epoch": 1.1688311688311688, "grad_norm": 3.4040322303771973, "learning_rate": 9.968056552600043e-05, "loss": 2.0307, "step": 180 }, { "epoch": 1.1753246753246753, "grad_norm": 3.239704132080078, "learning_rate": 9.967701023178717e-05, "loss": 1.7453, "step": 181 }, { "epoch": 1.1818181818181819, "grad_norm": 3.682248115539551, "learning_rate": 9.967343532587702e-05, "loss": 1.8286, "step": 182 }, { "epoch": 1.1883116883116882, "grad_norm": 3.8498799800872803, "learning_rate": 9.966984080968128e-05, "loss": 2.072, "step": 183 }, { "epoch": 1.1948051948051948, "grad_norm": 3.3957226276397705, "learning_rate": 9.9666226684619e-05, "loss": 1.8372, "step": 184 }, { "epoch": 1.2012987012987013, "grad_norm": 3.5456008911132812, "learning_rate": 9.966259295211697e-05, "loss": 1.9703, "step": 185 }, { "epoch": 1.2077922077922079, "grad_norm": 3.291201591491699, "learning_rate": 9.965893961360976e-05, "loss": 1.9931, "step": 186 }, { "epoch": 1.2142857142857142, "grad_norm": 3.3770711421966553, "learning_rate": 9.965526667053963e-05, "loss": 1.9248, "step": 187 }, { "epoch": 1.2207792207792207, "grad_norm": 3.346139669418335, "learning_rate": 9.965157412435663e-05, "loss": 1.9951, "step": 188 }, { "epoch": 1.2272727272727273, "grad_norm": 2.8916828632354736, "learning_rate": 9.964786197651847e-05, "loss": 1.7675, "step": 189 }, { "epoch": 1.2337662337662338, "grad_norm": 3.3172147274017334, "learning_rate": 9.964413022849068e-05, "loss": 1.7783, "step": 190 }, { "epoch": 1.2402597402597402, "grad_norm": 3.2727859020233154, "learning_rate": 9.96403788817465e-05, "loss": 2.0687, "step": 191 }, { "epoch": 1.2467532467532467, "grad_norm": 2.8867673873901367, "learning_rate": 9.963660793776688e-05, "loss": 1.467, "step": 192 }, { "epoch": 1.2532467532467533, "grad_norm": 3.398193359375, "learning_rate": 9.963281739804054e-05, "loss": 2.084, "step": 193 }, { "epoch": 1.2597402597402598, "grad_norm": 3.4608664512634277, "learning_rate": 9.962900726406391e-05, "loss": 2.0284, "step": 194 }, { "epoch": 1.2662337662337662, "grad_norm": 2.9325497150421143, "learning_rate": 9.96251775373412e-05, "loss": 1.7167, "step": 195 }, { "epoch": 1.2727272727272727, "grad_norm": 3.263169765472412, "learning_rate": 9.96213282193843e-05, "loss": 1.9843, "step": 196 }, { "epoch": 1.2792207792207793, "grad_norm": 3.2453436851501465, "learning_rate": 9.961745931171287e-05, "loss": 1.804, "step": 197 }, { "epoch": 1.2857142857142856, "grad_norm": 3.2929818630218506, "learning_rate": 9.96135708158543e-05, "loss": 2.0279, "step": 198 }, { "epoch": 1.2922077922077921, "grad_norm": 3.565657377243042, "learning_rate": 9.96096627333437e-05, "loss": 1.9242, "step": 199 }, { "epoch": 1.2987012987012987, "grad_norm": 3.3671059608459473, "learning_rate": 9.96057350657239e-05, "loss": 1.7902, "step": 200 }, { "epoch": 1.3051948051948052, "grad_norm": 3.2640137672424316, "learning_rate": 9.96017878145455e-05, "loss": 1.9485, "step": 201 }, { "epoch": 1.3116883116883118, "grad_norm": 3.6283884048461914, "learning_rate": 9.959782098136683e-05, "loss": 2.204, "step": 202 }, { "epoch": 1.3181818181818181, "grad_norm": 3.5066027641296387, "learning_rate": 9.959383456775391e-05, "loss": 2.0808, "step": 203 }, { "epoch": 1.3246753246753247, "grad_norm": 3.4553568363189697, "learning_rate": 9.958982857528052e-05, "loss": 1.8267, "step": 204 }, { "epoch": 1.3311688311688312, "grad_norm": 3.434098482131958, "learning_rate": 9.958580300552815e-05, "loss": 2.0149, "step": 205 }, { "epoch": 1.3376623376623376, "grad_norm": 3.090224504470825, "learning_rate": 9.958175786008604e-05, "loss": 1.6188, "step": 206 }, { "epoch": 1.344155844155844, "grad_norm": 3.104416847229004, "learning_rate": 9.957769314055117e-05, "loss": 1.8435, "step": 207 }, { "epoch": 1.3506493506493507, "grad_norm": 3.1267154216766357, "learning_rate": 9.957360884852817e-05, "loss": 1.839, "step": 208 }, { "epoch": 1.3571428571428572, "grad_norm": 3.384131908416748, "learning_rate": 9.956950498562953e-05, "loss": 2.0758, "step": 209 }, { "epoch": 1.3636363636363638, "grad_norm": 3.1655869483947754, "learning_rate": 9.956538155347534e-05, "loss": 1.6829, "step": 210 }, { "epoch": 1.37012987012987, "grad_norm": 3.373323917388916, "learning_rate": 9.956123855369346e-05, "loss": 1.8551, "step": 211 }, { "epoch": 1.3766233766233766, "grad_norm": 3.1319708824157715, "learning_rate": 9.955707598791952e-05, "loss": 1.7109, "step": 212 }, { "epoch": 1.3831168831168832, "grad_norm": 3.38543963432312, "learning_rate": 9.95528938577968e-05, "loss": 1.9705, "step": 213 }, { "epoch": 1.3896103896103895, "grad_norm": 3.442453145980835, "learning_rate": 9.954869216497635e-05, "loss": 1.7815, "step": 214 }, { "epoch": 1.396103896103896, "grad_norm": 3.13667368888855, "learning_rate": 9.954447091111694e-05, "loss": 1.9754, "step": 215 }, { "epoch": 1.4025974025974026, "grad_norm": 3.447659492492676, "learning_rate": 9.954023009788504e-05, "loss": 2.269, "step": 216 }, { "epoch": 1.4090909090909092, "grad_norm": 3.6094558238983154, "learning_rate": 9.953596972695487e-05, "loss": 2.0854, "step": 217 }, { "epoch": 1.4155844155844157, "grad_norm": 3.1890039443969727, "learning_rate": 9.953168980000835e-05, "loss": 1.993, "step": 218 }, { "epoch": 1.422077922077922, "grad_norm": 3.3757541179656982, "learning_rate": 9.952739031873512e-05, "loss": 2.0556, "step": 219 }, { "epoch": 1.4285714285714286, "grad_norm": 3.047400712966919, "learning_rate": 9.952307128483256e-05, "loss": 1.9276, "step": 220 }, { "epoch": 1.435064935064935, "grad_norm": 3.213284730911255, "learning_rate": 9.951873270000576e-05, "loss": 2.0443, "step": 221 }, { "epoch": 1.4415584415584415, "grad_norm": 3.250971794128418, "learning_rate": 9.95143745659675e-05, "loss": 1.9392, "step": 222 }, { "epoch": 1.448051948051948, "grad_norm": 3.3131914138793945, "learning_rate": 9.950999688443833e-05, "loss": 1.8581, "step": 223 }, { "epoch": 1.4545454545454546, "grad_norm": 2.970548629760742, "learning_rate": 9.950559965714648e-05, "loss": 1.7365, "step": 224 }, { "epoch": 1.4610389610389611, "grad_norm": 3.0069758892059326, "learning_rate": 9.950118288582788e-05, "loss": 1.4736, "step": 225 }, { "epoch": 1.4675324675324675, "grad_norm": 2.8806533813476562, "learning_rate": 9.949674657222624e-05, "loss": 1.6931, "step": 226 }, { "epoch": 1.474025974025974, "grad_norm": 2.8493435382843018, "learning_rate": 9.949229071809293e-05, "loss": 1.6771, "step": 227 }, { "epoch": 1.4805194805194806, "grad_norm": 3.4888033866882324, "learning_rate": 9.948781532518705e-05, "loss": 1.9593, "step": 228 }, { "epoch": 1.487012987012987, "grad_norm": 3.370680332183838, "learning_rate": 9.948332039527541e-05, "loss": 1.9331, "step": 229 }, { "epoch": 1.4935064935064934, "grad_norm": 3.5115890502929688, "learning_rate": 9.947880593013255e-05, "loss": 2.0011, "step": 230 }, { "epoch": 1.5, "grad_norm": 3.091661214828491, "learning_rate": 9.947427193154071e-05, "loss": 1.8513, "step": 231 }, { "epoch": 1.5064935064935066, "grad_norm": 3.062349796295166, "learning_rate": 9.946971840128981e-05, "loss": 1.7876, "step": 232 }, { "epoch": 1.512987012987013, "grad_norm": 3.1313397884368896, "learning_rate": 9.946514534117754e-05, "loss": 1.8766, "step": 233 }, { "epoch": 1.5194805194805194, "grad_norm": 3.2378571033477783, "learning_rate": 9.946055275300928e-05, "loss": 1.9461, "step": 234 }, { "epoch": 1.525974025974026, "grad_norm": 3.385910987854004, "learning_rate": 9.945594063859809e-05, "loss": 1.8297, "step": 235 }, { "epoch": 1.5324675324675323, "grad_norm": 3.524550676345825, "learning_rate": 9.945130899976477e-05, "loss": 1.9914, "step": 236 }, { "epoch": 1.5389610389610389, "grad_norm": 3.3748812675476074, "learning_rate": 9.944665783833782e-05, "loss": 1.8368, "step": 237 }, { "epoch": 1.5454545454545454, "grad_norm": 3.096031904220581, "learning_rate": 9.944198715615342e-05, "loss": 1.783, "step": 238 }, { "epoch": 1.551948051948052, "grad_norm": 3.375197410583496, "learning_rate": 9.943729695505552e-05, "loss": 2.073, "step": 239 }, { "epoch": 1.5584415584415585, "grad_norm": 2.887650966644287, "learning_rate": 9.94325872368957e-05, "loss": 1.8864, "step": 240 }, { "epoch": 1.564935064935065, "grad_norm": 3.060448169708252, "learning_rate": 9.942785800353332e-05, "loss": 1.7054, "step": 241 }, { "epoch": 1.5714285714285714, "grad_norm": 3.2923696041107178, "learning_rate": 9.942310925683538e-05, "loss": 1.999, "step": 242 }, { "epoch": 1.577922077922078, "grad_norm": 2.896110773086548, "learning_rate": 9.941834099867659e-05, "loss": 1.832, "step": 243 }, { "epoch": 1.5844155844155843, "grad_norm": 2.9442901611328125, "learning_rate": 9.941355323093943e-05, "loss": 2.1465, "step": 244 }, { "epoch": 1.5909090909090908, "grad_norm": 2.99206280708313, "learning_rate": 9.940874595551404e-05, "loss": 1.7449, "step": 245 }, { "epoch": 1.5974025974025974, "grad_norm": 3.049651622772217, "learning_rate": 9.940391917429818e-05, "loss": 1.5516, "step": 246 }, { "epoch": 1.603896103896104, "grad_norm": 3.3082385063171387, "learning_rate": 9.939907288919747e-05, "loss": 2.0167, "step": 247 }, { "epoch": 1.6103896103896105, "grad_norm": 3.1149022579193115, "learning_rate": 9.939420710212511e-05, "loss": 2.0022, "step": 248 }, { "epoch": 1.616883116883117, "grad_norm": 2.9124245643615723, "learning_rate": 9.938932181500205e-05, "loss": 1.9863, "step": 249 }, { "epoch": 1.6233766233766234, "grad_norm": 3.180870294570923, "learning_rate": 9.938441702975689e-05, "loss": 1.9872, "step": 250 }, { "epoch": 1.62987012987013, "grad_norm": 3.2378902435302734, "learning_rate": 9.9379492748326e-05, "loss": 1.9957, "step": 251 }, { "epoch": 1.6363636363636362, "grad_norm": 2.9901888370513916, "learning_rate": 9.937454897265337e-05, "loss": 1.7305, "step": 252 }, { "epoch": 1.6428571428571428, "grad_norm": 3.472109794616699, "learning_rate": 9.936958570469077e-05, "loss": 2.1072, "step": 253 }, { "epoch": 1.6493506493506493, "grad_norm": 3.1424944400787354, "learning_rate": 9.93646029463976e-05, "loss": 1.961, "step": 254 }, { "epoch": 1.655844155844156, "grad_norm": 3.177325487136841, "learning_rate": 9.935960069974096e-05, "loss": 1.7897, "step": 255 }, { "epoch": 1.6623376623376624, "grad_norm": 3.207707166671753, "learning_rate": 9.935457896669568e-05, "loss": 1.8971, "step": 256 }, { "epoch": 1.6688311688311688, "grad_norm": 3.142314910888672, "learning_rate": 9.934953774924424e-05, "loss": 1.8993, "step": 257 }, { "epoch": 1.6753246753246753, "grad_norm": 3.1800239086151123, "learning_rate": 9.934447704937684e-05, "loss": 1.9482, "step": 258 }, { "epoch": 1.6818181818181817, "grad_norm": 2.71478271484375, "learning_rate": 9.933939686909137e-05, "loss": 1.7173, "step": 259 }, { "epoch": 1.6883116883116882, "grad_norm": 3.2946617603302, "learning_rate": 9.93342972103934e-05, "loss": 2.0823, "step": 260 }, { "epoch": 1.6948051948051948, "grad_norm": 3.1132423877716064, "learning_rate": 9.93291780752962e-05, "loss": 1.9475, "step": 261 }, { "epoch": 1.7012987012987013, "grad_norm": 3.383772373199463, "learning_rate": 9.932403946582072e-05, "loss": 2.1187, "step": 262 }, { "epoch": 1.7077922077922079, "grad_norm": 3.0686593055725098, "learning_rate": 9.931888138399561e-05, "loss": 1.7843, "step": 263 }, { "epoch": 1.7142857142857144, "grad_norm": 3.1363070011138916, "learning_rate": 9.931370383185718e-05, "loss": 2.0091, "step": 264 }, { "epoch": 1.7207792207792207, "grad_norm": 3.5003199577331543, "learning_rate": 9.930850681144945e-05, "loss": 2.1529, "step": 265 }, { "epoch": 1.7272727272727273, "grad_norm": 3.456496000289917, "learning_rate": 9.930329032482413e-05, "loss": 2.0149, "step": 266 }, { "epoch": 1.7337662337662336, "grad_norm": 2.9732820987701416, "learning_rate": 9.92980543740406e-05, "loss": 1.7583, "step": 267 }, { "epoch": 1.7402597402597402, "grad_norm": 3.0580830574035645, "learning_rate": 9.929279896116594e-05, "loss": 1.9094, "step": 268 }, { "epoch": 1.7467532467532467, "grad_norm": 2.9448652267456055, "learning_rate": 9.92875240882749e-05, "loss": 1.8479, "step": 269 }, { "epoch": 1.7532467532467533, "grad_norm": 3.3782246112823486, "learning_rate": 9.928222975744991e-05, "loss": 1.7582, "step": 270 }, { "epoch": 1.7597402597402598, "grad_norm": 3.2021212577819824, "learning_rate": 9.927691597078108e-05, "loss": 2.0634, "step": 271 }, { "epoch": 1.7662337662337664, "grad_norm": 3.008000135421753, "learning_rate": 9.927158273036625e-05, "loss": 1.8569, "step": 272 }, { "epoch": 1.7727272727272727, "grad_norm": 3.255671501159668, "learning_rate": 9.926623003831084e-05, "loss": 1.8678, "step": 273 }, { "epoch": 1.7792207792207793, "grad_norm": 3.4665205478668213, "learning_rate": 9.926085789672806e-05, "loss": 2.1407, "step": 274 }, { "epoch": 1.7857142857142856, "grad_norm": 3.0679426193237305, "learning_rate": 9.92554663077387e-05, "loss": 2.0964, "step": 275 }, { "epoch": 1.7922077922077921, "grad_norm": 3.3258745670318604, "learning_rate": 9.92500552734713e-05, "loss": 2.0542, "step": 276 }, { "epoch": 1.7987012987012987, "grad_norm": 3.138080596923828, "learning_rate": 9.924462479606207e-05, "loss": 1.9672, "step": 277 }, { "epoch": 1.8051948051948052, "grad_norm": 3.225432872772217, "learning_rate": 9.923917487765484e-05, "loss": 1.9354, "step": 278 }, { "epoch": 1.8116883116883118, "grad_norm": 2.762915849685669, "learning_rate": 9.923370552040116e-05, "loss": 1.7821, "step": 279 }, { "epoch": 1.8181818181818183, "grad_norm": 3.107593059539795, "learning_rate": 9.922821672646027e-05, "loss": 1.8868, "step": 280 }, { "epoch": 1.8246753246753247, "grad_norm": 3.052213668823242, "learning_rate": 9.922270849799905e-05, "loss": 1.8148, "step": 281 }, { "epoch": 1.8311688311688312, "grad_norm": 3.04420804977417, "learning_rate": 9.921718083719203e-05, "loss": 2.0534, "step": 282 }, { "epoch": 1.8376623376623376, "grad_norm": 3.083042860031128, "learning_rate": 9.921163374622147e-05, "loss": 2.0427, "step": 283 }, { "epoch": 1.844155844155844, "grad_norm": 2.7837564945220947, "learning_rate": 9.920606722727725e-05, "loss": 1.5982, "step": 284 }, { "epoch": 1.8506493506493507, "grad_norm": 2.737576961517334, "learning_rate": 9.920048128255699e-05, "loss": 1.6942, "step": 285 }, { "epoch": 1.8571428571428572, "grad_norm": 2.9239234924316406, "learning_rate": 9.919487591426591e-05, "loss": 1.9588, "step": 286 }, { "epoch": 1.8636363636363638, "grad_norm": 2.5686419010162354, "learning_rate": 9.918925112461688e-05, "loss": 1.798, "step": 287 }, { "epoch": 1.87012987012987, "grad_norm": 3.106424570083618, "learning_rate": 9.918360691583056e-05, "loss": 1.9961, "step": 288 }, { "epoch": 1.8766233766233766, "grad_norm": 3.469996452331543, "learning_rate": 9.91779432901351e-05, "loss": 2.1417, "step": 289 }, { "epoch": 1.883116883116883, "grad_norm": 3.0456113815307617, "learning_rate": 9.917226024976649e-05, "loss": 1.922, "step": 290 }, { "epoch": 1.8896103896103895, "grad_norm": 3.158688545227051, "learning_rate": 9.916655779696826e-05, "loss": 1.8516, "step": 291 }, { "epoch": 1.896103896103896, "grad_norm": 2.9286715984344482, "learning_rate": 9.916083593399166e-05, "loss": 2.0368, "step": 292 }, { "epoch": 1.9025974025974026, "grad_norm": 2.7478342056274414, "learning_rate": 9.91550946630956e-05, "loss": 1.7571, "step": 293 }, { "epoch": 1.9090909090909092, "grad_norm": 2.5094292163848877, "learning_rate": 9.914933398654663e-05, "loss": 1.7007, "step": 294 }, { "epoch": 1.9155844155844157, "grad_norm": 2.9641265869140625, "learning_rate": 9.914355390661896e-05, "loss": 1.7764, "step": 295 }, { "epoch": 1.922077922077922, "grad_norm": 3.029686689376831, "learning_rate": 9.913775442559452e-05, "loss": 1.9938, "step": 296 }, { "epoch": 1.9285714285714286, "grad_norm": 3.169807195663452, "learning_rate": 9.91319355457628e-05, "loss": 1.9341, "step": 297 }, { "epoch": 1.935064935064935, "grad_norm": 2.8388686180114746, "learning_rate": 9.912609726942103e-05, "loss": 1.96, "step": 298 }, { "epoch": 1.9415584415584415, "grad_norm": 3.2075181007385254, "learning_rate": 9.912023959887408e-05, "loss": 2.0209, "step": 299 }, { "epoch": 1.948051948051948, "grad_norm": 2.870790481567383, "learning_rate": 9.911436253643445e-05, "loss": 1.7023, "step": 300 }, { "epoch": 1.9545454545454546, "grad_norm": 3.17559814453125, "learning_rate": 9.910846608442229e-05, "loss": 1.8861, "step": 301 }, { "epoch": 1.9610389610389611, "grad_norm": 3.2682406902313232, "learning_rate": 9.910255024516546e-05, "loss": 1.9807, "step": 302 }, { "epoch": 1.9675324675324677, "grad_norm": 3.0713870525360107, "learning_rate": 9.909661502099943e-05, "loss": 1.7816, "step": 303 }, { "epoch": 1.974025974025974, "grad_norm": 2.817713975906372, "learning_rate": 9.909066041426733e-05, "loss": 1.802, "step": 304 }, { "epoch": 1.9805194805194806, "grad_norm": 3.0280492305755615, "learning_rate": 9.908468642731995e-05, "loss": 2.1874, "step": 305 }, { "epoch": 1.987012987012987, "grad_norm": 2.799920082092285, "learning_rate": 9.907869306251572e-05, "loss": 1.6937, "step": 306 }, { "epoch": 1.9935064935064934, "grad_norm": 2.9983012676239014, "learning_rate": 9.907268032222071e-05, "loss": 1.9305, "step": 307 }, { "epoch": 2.0, "grad_norm": 4888.9140625, "learning_rate": 9.90666482088087e-05, "loss": 1.771, "step": 308 }, { "epoch": 2.0064935064935066, "grad_norm": 3.2918434143066406, "learning_rate": 9.906059672466101e-05, "loss": 1.651, "step": 309 }, { "epoch": 2.012987012987013, "grad_norm": 3.1957743167877197, "learning_rate": 9.90545258721667e-05, "loss": 1.7395, "step": 310 }, { "epoch": 2.0194805194805197, "grad_norm": 3.090162515640259, "learning_rate": 9.904843565372248e-05, "loss": 1.6515, "step": 311 }, { "epoch": 2.0259740259740258, "grad_norm": 3.333552598953247, "learning_rate": 9.904232607173262e-05, "loss": 1.6349, "step": 312 }, { "epoch": 2.0324675324675323, "grad_norm": 3.266960382461548, "learning_rate": 9.903619712860912e-05, "loss": 1.6787, "step": 313 }, { "epoch": 2.038961038961039, "grad_norm": 3.407783031463623, "learning_rate": 9.903004882677156e-05, "loss": 1.5224, "step": 314 }, { "epoch": 2.0454545454545454, "grad_norm": 3.5633609294891357, "learning_rate": 9.902388116864722e-05, "loss": 1.7562, "step": 315 }, { "epoch": 2.051948051948052, "grad_norm": 3.1980199813842773, "learning_rate": 9.901769415667099e-05, "loss": 1.6488, "step": 316 }, { "epoch": 2.0584415584415585, "grad_norm": 2.946018695831299, "learning_rate": 9.90114877932854e-05, "loss": 1.3956, "step": 317 }, { "epoch": 2.064935064935065, "grad_norm": 3.0959718227386475, "learning_rate": 9.900526208094061e-05, "loss": 1.6679, "step": 318 }, { "epoch": 2.0714285714285716, "grad_norm": 3.6357126235961914, "learning_rate": 9.899901702209445e-05, "loss": 1.6934, "step": 319 }, { "epoch": 2.0779220779220777, "grad_norm": 3.2969744205474854, "learning_rate": 9.899275261921234e-05, "loss": 1.7037, "step": 320 }, { "epoch": 2.0844155844155843, "grad_norm": 3.406505584716797, "learning_rate": 9.898646887476741e-05, "loss": 1.6802, "step": 321 }, { "epoch": 2.090909090909091, "grad_norm": 3.356058120727539, "learning_rate": 9.898016579124037e-05, "loss": 1.6779, "step": 322 }, { "epoch": 2.0974025974025974, "grad_norm": 3.2132608890533447, "learning_rate": 9.897384337111957e-05, "loss": 1.6753, "step": 323 }, { "epoch": 2.103896103896104, "grad_norm": 3.860316514968872, "learning_rate": 9.8967501616901e-05, "loss": 1.7702, "step": 324 }, { "epoch": 2.1103896103896105, "grad_norm": 3.283928871154785, "learning_rate": 9.896114053108829e-05, "loss": 1.874, "step": 325 }, { "epoch": 2.116883116883117, "grad_norm": 3.458469867706299, "learning_rate": 9.895476011619269e-05, "loss": 1.6123, "step": 326 }, { "epoch": 2.1233766233766236, "grad_norm": 3.1832275390625, "learning_rate": 9.89483603747331e-05, "loss": 1.5368, "step": 327 }, { "epoch": 2.1298701298701297, "grad_norm": 3.380354166030884, "learning_rate": 9.894194130923602e-05, "loss": 1.8633, "step": 328 }, { "epoch": 2.1363636363636362, "grad_norm": 3.0025081634521484, "learning_rate": 9.89355029222356e-05, "loss": 1.693, "step": 329 }, { "epoch": 2.142857142857143, "grad_norm": 3.223750591278076, "learning_rate": 9.892904521627361e-05, "loss": 1.6421, "step": 330 }, { "epoch": 2.1493506493506493, "grad_norm": 2.978912830352783, "learning_rate": 9.892256819389947e-05, "loss": 1.6442, "step": 331 }, { "epoch": 2.155844155844156, "grad_norm": 3.126190185546875, "learning_rate": 9.891607185767018e-05, "loss": 1.642, "step": 332 }, { "epoch": 2.1623376623376624, "grad_norm": 3.196380615234375, "learning_rate": 9.890955621015039e-05, "loss": 1.8094, "step": 333 }, { "epoch": 2.168831168831169, "grad_norm": 3.2463197708129883, "learning_rate": 9.890302125391239e-05, "loss": 1.7948, "step": 334 }, { "epoch": 2.175324675324675, "grad_norm": 3.05271053314209, "learning_rate": 9.88964669915361e-05, "loss": 1.7538, "step": 335 }, { "epoch": 2.1818181818181817, "grad_norm": 2.841326951980591, "learning_rate": 9.888989342560899e-05, "loss": 1.7039, "step": 336 }, { "epoch": 2.188311688311688, "grad_norm": 2.989499807357788, "learning_rate": 9.888330055872623e-05, "loss": 1.6692, "step": 337 }, { "epoch": 2.1948051948051948, "grad_norm": 3.3557872772216797, "learning_rate": 9.887668839349057e-05, "loss": 1.6721, "step": 338 }, { "epoch": 2.2012987012987013, "grad_norm": 3.342548370361328, "learning_rate": 9.88700569325124e-05, "loss": 1.83, "step": 339 }, { "epoch": 2.207792207792208, "grad_norm": 2.9942660331726074, "learning_rate": 9.886340617840968e-05, "loss": 1.8081, "step": 340 }, { "epoch": 2.2142857142857144, "grad_norm": 3.1562952995300293, "learning_rate": 9.885673613380806e-05, "loss": 1.7479, "step": 341 }, { "epoch": 2.220779220779221, "grad_norm": 2.7380170822143555, "learning_rate": 9.885004680134076e-05, "loss": 1.3577, "step": 342 }, { "epoch": 2.227272727272727, "grad_norm": 2.7151942253112793, "learning_rate": 9.884333818364861e-05, "loss": 1.3956, "step": 343 }, { "epoch": 2.2337662337662336, "grad_norm": 3.0294029712677, "learning_rate": 9.883661028338008e-05, "loss": 1.5567, "step": 344 }, { "epoch": 2.24025974025974, "grad_norm": 3.0623183250427246, "learning_rate": 9.882986310319124e-05, "loss": 1.6769, "step": 345 }, { "epoch": 2.2467532467532467, "grad_norm": 3.1786789894104004, "learning_rate": 9.882309664574575e-05, "loss": 1.595, "step": 346 }, { "epoch": 2.2532467532467533, "grad_norm": 3.2232789993286133, "learning_rate": 9.881631091371491e-05, "loss": 1.6652, "step": 347 }, { "epoch": 2.25974025974026, "grad_norm": 2.559070110321045, "learning_rate": 9.880950590977765e-05, "loss": 1.3795, "step": 348 }, { "epoch": 2.2662337662337664, "grad_norm": 3.3524162769317627, "learning_rate": 9.880268163662042e-05, "loss": 1.8738, "step": 349 }, { "epoch": 2.2727272727272725, "grad_norm": 3.0645384788513184, "learning_rate": 9.879583809693738e-05, "loss": 1.6108, "step": 350 }, { "epoch": 2.279220779220779, "grad_norm": 3.547124147415161, "learning_rate": 9.878897529343023e-05, "loss": 1.9958, "step": 351 }, { "epoch": 2.2857142857142856, "grad_norm": 2.875279188156128, "learning_rate": 9.87820932288083e-05, "loss": 1.5304, "step": 352 }, { "epoch": 2.292207792207792, "grad_norm": 3.039005756378174, "learning_rate": 9.877519190578852e-05, "loss": 1.6787, "step": 353 }, { "epoch": 2.2987012987012987, "grad_norm": 3.231783151626587, "learning_rate": 9.876827132709544e-05, "loss": 1.8304, "step": 354 }, { "epoch": 2.3051948051948052, "grad_norm": 3.3293938636779785, "learning_rate": 9.876133149546118e-05, "loss": 1.8101, "step": 355 }, { "epoch": 2.311688311688312, "grad_norm": 3.1368279457092285, "learning_rate": 9.875437241362546e-05, "loss": 1.7841, "step": 356 }, { "epoch": 2.3181818181818183, "grad_norm": 3.4561681747436523, "learning_rate": 9.874739408433565e-05, "loss": 1.8775, "step": 357 }, { "epoch": 2.324675324675325, "grad_norm": 2.352166175842285, "learning_rate": 9.874039651034666e-05, "loss": 1.4749, "step": 358 }, { "epoch": 2.331168831168831, "grad_norm": 2.7010676860809326, "learning_rate": 9.873337969442101e-05, "loss": 1.6247, "step": 359 }, { "epoch": 2.3376623376623376, "grad_norm": 2.420346260070801, "learning_rate": 9.872634363932887e-05, "loss": 1.2657, "step": 360 }, { "epoch": 2.344155844155844, "grad_norm": 3.2130322456359863, "learning_rate": 9.871928834784792e-05, "loss": 1.796, "step": 361 }, { "epoch": 2.3506493506493507, "grad_norm": 2.915154457092285, "learning_rate": 9.87122138227635e-05, "loss": 1.6665, "step": 362 }, { "epoch": 2.357142857142857, "grad_norm": 3.347503185272217, "learning_rate": 9.870512006686851e-05, "loss": 1.599, "step": 363 }, { "epoch": 2.3636363636363638, "grad_norm": 3.082561492919922, "learning_rate": 9.869800708296346e-05, "loss": 1.9103, "step": 364 }, { "epoch": 2.3701298701298703, "grad_norm": 2.9106686115264893, "learning_rate": 9.869087487385644e-05, "loss": 1.8083, "step": 365 }, { "epoch": 2.3766233766233764, "grad_norm": 3.256690740585327, "learning_rate": 9.868372344236313e-05, "loss": 1.6163, "step": 366 }, { "epoch": 2.383116883116883, "grad_norm": 3.0932395458221436, "learning_rate": 9.867655279130683e-05, "loss": 1.7318, "step": 367 }, { "epoch": 2.3896103896103895, "grad_norm": 2.975876569747925, "learning_rate": 9.866936292351836e-05, "loss": 1.8791, "step": 368 }, { "epoch": 2.396103896103896, "grad_norm": 3.221210241317749, "learning_rate": 9.866215384183619e-05, "loss": 1.8514, "step": 369 }, { "epoch": 2.4025974025974026, "grad_norm": 3.188079833984375, "learning_rate": 9.865492554910633e-05, "loss": 1.8689, "step": 370 }, { "epoch": 2.409090909090909, "grad_norm": 3.175229072570801, "learning_rate": 9.864767804818243e-05, "loss": 1.8805, "step": 371 }, { "epoch": 2.4155844155844157, "grad_norm": 3.077758312225342, "learning_rate": 9.864041134192563e-05, "loss": 1.9167, "step": 372 }, { "epoch": 2.4220779220779223, "grad_norm": 2.952587366104126, "learning_rate": 9.863312543320477e-05, "loss": 1.7636, "step": 373 }, { "epoch": 2.4285714285714284, "grad_norm": 2.8241958618164062, "learning_rate": 9.86258203248962e-05, "loss": 1.583, "step": 374 }, { "epoch": 2.435064935064935, "grad_norm": 3.0791385173797607, "learning_rate": 9.861849601988383e-05, "loss": 1.5679, "step": 375 }, { "epoch": 2.4415584415584415, "grad_norm": 3.0636885166168213, "learning_rate": 9.861115252105921e-05, "loss": 1.6682, "step": 376 }, { "epoch": 2.448051948051948, "grad_norm": 2.9893672466278076, "learning_rate": 9.860378983132143e-05, "loss": 1.8759, "step": 377 }, { "epoch": 2.4545454545454546, "grad_norm": 2.585669755935669, "learning_rate": 9.859640795357716e-05, "loss": 1.602, "step": 378 }, { "epoch": 2.461038961038961, "grad_norm": 2.8519511222839355, "learning_rate": 9.858900689074064e-05, "loss": 1.8531, "step": 379 }, { "epoch": 2.4675324675324677, "grad_norm": 2.6581485271453857, "learning_rate": 9.85815866457337e-05, "loss": 1.6298, "step": 380 }, { "epoch": 2.474025974025974, "grad_norm": 2.6713006496429443, "learning_rate": 9.857414722148574e-05, "loss": 1.7688, "step": 381 }, { "epoch": 2.4805194805194803, "grad_norm": 2.8108949661254883, "learning_rate": 9.856668862093372e-05, "loss": 1.7104, "step": 382 }, { "epoch": 2.487012987012987, "grad_norm": 2.4003746509552, "learning_rate": 9.855921084702219e-05, "loss": 1.4872, "step": 383 }, { "epoch": 2.4935064935064934, "grad_norm": 2.9604358673095703, "learning_rate": 9.855171390270324e-05, "loss": 1.6916, "step": 384 }, { "epoch": 2.5, "grad_norm": 2.9553635120391846, "learning_rate": 9.854419779093655e-05, "loss": 1.65, "step": 385 }, { "epoch": 2.5064935064935066, "grad_norm": 2.6719701290130615, "learning_rate": 9.853666251468937e-05, "loss": 1.5492, "step": 386 }, { "epoch": 2.512987012987013, "grad_norm": 2.918327808380127, "learning_rate": 9.85291080769365e-05, "loss": 1.8282, "step": 387 }, { "epoch": 2.5194805194805197, "grad_norm": 3.0886974334716797, "learning_rate": 9.852153448066032e-05, "loss": 1.7593, "step": 388 }, { "epoch": 2.525974025974026, "grad_norm": 2.843238592147827, "learning_rate": 9.851394172885074e-05, "loss": 1.7201, "step": 389 }, { "epoch": 2.5324675324675323, "grad_norm": 2.817976951599121, "learning_rate": 9.85063298245053e-05, "loss": 1.7933, "step": 390 }, { "epoch": 2.538961038961039, "grad_norm": 2.8269622325897217, "learning_rate": 9.849869877062902e-05, "loss": 1.7015, "step": 391 }, { "epoch": 2.5454545454545454, "grad_norm": 2.727435827255249, "learning_rate": 9.849104857023455e-05, "loss": 1.481, "step": 392 }, { "epoch": 2.551948051948052, "grad_norm": 2.7080495357513428, "learning_rate": 9.848337922634206e-05, "loss": 1.4346, "step": 393 }, { "epoch": 2.5584415584415585, "grad_norm": 3.0474908351898193, "learning_rate": 9.847569074197926e-05, "loss": 1.6259, "step": 394 }, { "epoch": 2.564935064935065, "grad_norm": 2.7546396255493164, "learning_rate": 9.846798312018146e-05, "loss": 1.5626, "step": 395 }, { "epoch": 2.571428571428571, "grad_norm": 2.9604411125183105, "learning_rate": 9.846025636399152e-05, "loss": 2.0086, "step": 396 }, { "epoch": 2.5779220779220777, "grad_norm": 3.0381553173065186, "learning_rate": 9.845251047645983e-05, "loss": 2.0266, "step": 397 }, { "epoch": 2.5844155844155843, "grad_norm": 2.540466070175171, "learning_rate": 9.844474546064435e-05, "loss": 1.4928, "step": 398 }, { "epoch": 2.590909090909091, "grad_norm": 2.5260584354400635, "learning_rate": 9.843696131961058e-05, "loss": 1.5486, "step": 399 }, { "epoch": 2.5974025974025974, "grad_norm": 2.7667641639709473, "learning_rate": 9.842915805643155e-05, "loss": 1.6139, "step": 400 }, { "epoch": 2.603896103896104, "grad_norm": 2.7333874702453613, "learning_rate": 9.842133567418792e-05, "loss": 1.747, "step": 401 }, { "epoch": 2.6103896103896105, "grad_norm": 2.620067596435547, "learning_rate": 9.841349417596779e-05, "loss": 1.6353, "step": 402 }, { "epoch": 2.616883116883117, "grad_norm": 2.723745584487915, "learning_rate": 9.84056335648669e-05, "loss": 1.5413, "step": 403 }, { "epoch": 2.6233766233766236, "grad_norm": 2.789571523666382, "learning_rate": 9.839775384398847e-05, "loss": 1.9104, "step": 404 }, { "epoch": 2.62987012987013, "grad_norm": 2.624312162399292, "learning_rate": 9.838985501644328e-05, "loss": 1.7595, "step": 405 }, { "epoch": 2.6363636363636362, "grad_norm": 2.454328775405884, "learning_rate": 9.838193708534968e-05, "loss": 1.5174, "step": 406 }, { "epoch": 2.642857142857143, "grad_norm": 2.664504051208496, "learning_rate": 9.837400005383354e-05, "loss": 1.5799, "step": 407 }, { "epoch": 2.6493506493506493, "grad_norm": 2.553387403488159, "learning_rate": 9.83660439250283e-05, "loss": 1.6922, "step": 408 }, { "epoch": 2.655844155844156, "grad_norm": 2.5766408443450928, "learning_rate": 9.835806870207487e-05, "loss": 1.8922, "step": 409 }, { "epoch": 2.6623376623376624, "grad_norm": 2.8266332149505615, "learning_rate": 9.835007438812177e-05, "loss": 1.6223, "step": 410 }, { "epoch": 2.6688311688311686, "grad_norm": 2.6766135692596436, "learning_rate": 9.834206098632499e-05, "loss": 1.6929, "step": 411 }, { "epoch": 2.675324675324675, "grad_norm": 2.5043816566467285, "learning_rate": 9.833402849984815e-05, "loss": 1.5207, "step": 412 }, { "epoch": 2.6818181818181817, "grad_norm": 2.6566975116729736, "learning_rate": 9.832597693186232e-05, "loss": 1.597, "step": 413 }, { "epoch": 2.688311688311688, "grad_norm": 2.4868650436401367, "learning_rate": 9.831790628554612e-05, "loss": 1.6636, "step": 414 }, { "epoch": 2.6948051948051948, "grad_norm": 2.5328221321105957, "learning_rate": 9.830981656408574e-05, "loss": 1.7525, "step": 415 }, { "epoch": 2.7012987012987013, "grad_norm": 2.5664961338043213, "learning_rate": 9.830170777067485e-05, "loss": 1.6539, "step": 416 }, { "epoch": 2.707792207792208, "grad_norm": 2.7295408248901367, "learning_rate": 9.829357990851468e-05, "loss": 1.8107, "step": 417 }, { "epoch": 2.7142857142857144, "grad_norm": 2.501190662384033, "learning_rate": 9.8285432980814e-05, "loss": 1.4882, "step": 418 }, { "epoch": 2.720779220779221, "grad_norm": 2.678788185119629, "learning_rate": 9.827726699078908e-05, "loss": 1.711, "step": 419 }, { "epoch": 2.7272727272727275, "grad_norm": 2.4673514366149902, "learning_rate": 9.82690819416637e-05, "loss": 1.6805, "step": 420 }, { "epoch": 2.7337662337662336, "grad_norm": 2.555209159851074, "learning_rate": 9.826087783666921e-05, "loss": 1.705, "step": 421 }, { "epoch": 2.74025974025974, "grad_norm": 2.5114858150482178, "learning_rate": 9.825265467904445e-05, "loss": 1.6545, "step": 422 }, { "epoch": 2.7467532467532467, "grad_norm": 2.7648122310638428, "learning_rate": 9.824441247203579e-05, "loss": 1.741, "step": 423 }, { "epoch": 2.7532467532467533, "grad_norm": 2.949734687805176, "learning_rate": 9.823615121889716e-05, "loss": 1.8766, "step": 424 }, { "epoch": 2.75974025974026, "grad_norm": 2.765125036239624, "learning_rate": 9.822787092288991e-05, "loss": 1.674, "step": 425 }, { "epoch": 2.7662337662337664, "grad_norm": 2.5355770587921143, "learning_rate": 9.821957158728301e-05, "loss": 1.7228, "step": 426 }, { "epoch": 2.7727272727272725, "grad_norm": 2.7192254066467285, "learning_rate": 9.82112532153529e-05, "loss": 1.8595, "step": 427 }, { "epoch": 2.779220779220779, "grad_norm": 2.6829566955566406, "learning_rate": 9.820291581038355e-05, "loss": 1.758, "step": 428 }, { "epoch": 2.7857142857142856, "grad_norm": 2.2499940395355225, "learning_rate": 9.819455937566642e-05, "loss": 1.427, "step": 429 }, { "epoch": 2.792207792207792, "grad_norm": 2.475795030593872, "learning_rate": 9.81861839145005e-05, "loss": 1.7117, "step": 430 }, { "epoch": 2.7987012987012987, "grad_norm": 2.8484835624694824, "learning_rate": 9.817778943019228e-05, "loss": 1.8138, "step": 431 }, { "epoch": 2.8051948051948052, "grad_norm": 2.821648359298706, "learning_rate": 9.816937592605579e-05, "loss": 1.6179, "step": 432 }, { "epoch": 2.811688311688312, "grad_norm": 2.4450900554656982, "learning_rate": 9.816094340541256e-05, "loss": 1.4323, "step": 433 }, { "epoch": 2.8181818181818183, "grad_norm": 2.6112494468688965, "learning_rate": 9.815249187159157e-05, "loss": 1.8037, "step": 434 }, { "epoch": 2.824675324675325, "grad_norm": 2.6128196716308594, "learning_rate": 9.814402132792939e-05, "loss": 1.6578, "step": 435 }, { "epoch": 2.8311688311688314, "grad_norm": 2.560913562774658, "learning_rate": 9.813553177777003e-05, "loss": 1.8626, "step": 436 }, { "epoch": 2.8376623376623376, "grad_norm": 2.2726757526397705, "learning_rate": 9.812702322446505e-05, "loss": 1.3101, "step": 437 }, { "epoch": 2.844155844155844, "grad_norm": 2.4498019218444824, "learning_rate": 9.81184956713735e-05, "loss": 1.6713, "step": 438 }, { "epoch": 2.8506493506493507, "grad_norm": 2.671379804611206, "learning_rate": 9.810994912186189e-05, "loss": 1.7618, "step": 439 }, { "epoch": 2.857142857142857, "grad_norm": 2.9830217361450195, "learning_rate": 9.81013835793043e-05, "loss": 1.8335, "step": 440 }, { "epoch": 2.8636363636363638, "grad_norm": 2.728466749191284, "learning_rate": 9.809279904708224e-05, "loss": 1.6927, "step": 441 }, { "epoch": 2.87012987012987, "grad_norm": 2.514505624771118, "learning_rate": 9.808419552858477e-05, "loss": 1.7542, "step": 442 }, { "epoch": 2.8766233766233764, "grad_norm": 2.831462860107422, "learning_rate": 9.80755730272084e-05, "loss": 1.6759, "step": 443 }, { "epoch": 2.883116883116883, "grad_norm": 2.8181638717651367, "learning_rate": 9.806693154635718e-05, "loss": 1.8944, "step": 444 }, { "epoch": 2.8896103896103895, "grad_norm": 2.7978835105895996, "learning_rate": 9.80582710894426e-05, "loss": 1.7783, "step": 445 }, { "epoch": 2.896103896103896, "grad_norm": 2.823338747024536, "learning_rate": 9.80495916598837e-05, "loss": 1.7405, "step": 446 }, { "epoch": 2.9025974025974026, "grad_norm": 2.665881395339966, "learning_rate": 9.804089326110697e-05, "loss": 1.6666, "step": 447 }, { "epoch": 2.909090909090909, "grad_norm": 2.69315767288208, "learning_rate": 9.80321758965464e-05, "loss": 1.759, "step": 448 }, { "epoch": 2.9155844155844157, "grad_norm": 2.6156058311462402, "learning_rate": 9.802343956964348e-05, "loss": 1.6456, "step": 449 }, { "epoch": 2.9220779220779223, "grad_norm": 2.9610443115234375, "learning_rate": 9.801468428384716e-05, "loss": 1.8471, "step": 450 }, { "epoch": 2.928571428571429, "grad_norm": 2.6673457622528076, "learning_rate": 9.800591004261388e-05, "loss": 1.6938, "step": 451 }, { "epoch": 2.935064935064935, "grad_norm": 2.542597532272339, "learning_rate": 9.79971168494076e-05, "loss": 1.5416, "step": 452 }, { "epoch": 2.9415584415584415, "grad_norm": 2.7253355979919434, "learning_rate": 9.79883047076997e-05, "loss": 1.7721, "step": 453 }, { "epoch": 2.948051948051948, "grad_norm": 2.6640748977661133, "learning_rate": 9.797947362096908e-05, "loss": 1.9404, "step": 454 }, { "epoch": 2.9545454545454546, "grad_norm": 2.5272445678710938, "learning_rate": 9.797062359270215e-05, "loss": 1.6494, "step": 455 }, { "epoch": 2.961038961038961, "grad_norm": 2.679551839828491, "learning_rate": 9.796175462639272e-05, "loss": 1.5135, "step": 456 }, { "epoch": 2.9675324675324677, "grad_norm": 2.832418918609619, "learning_rate": 9.795286672554213e-05, "loss": 1.7057, "step": 457 }, { "epoch": 2.974025974025974, "grad_norm": 2.6729843616485596, "learning_rate": 9.794395989365918e-05, "loss": 1.7592, "step": 458 }, { "epoch": 2.9805194805194803, "grad_norm": 2.605210065841675, "learning_rate": 9.793503413426015e-05, "loss": 1.7291, "step": 459 }, { "epoch": 2.987012987012987, "grad_norm": 2.906468391418457, "learning_rate": 9.79260894508688e-05, "loss": 1.6162, "step": 460 }, { "epoch": 2.9935064935064934, "grad_norm": 2.4108238220214844, "learning_rate": 9.791712584701634e-05, "loss": 1.5179, "step": 461 }, { "epoch": 3.0, "grad_norm": 3461.14306640625, "learning_rate": 9.790814332624143e-05, "loss": 1.6267, "step": 462 }, { "epoch": 3.0064935064935066, "grad_norm": 2.691493511199951, "learning_rate": 9.789914189209029e-05, "loss": 1.4611, "step": 463 }, { "epoch": 3.012987012987013, "grad_norm": 2.130276918411255, "learning_rate": 9.789012154811647e-05, "loss": 1.0821, "step": 464 }, { "epoch": 3.0194805194805197, "grad_norm": 2.652697801589966, "learning_rate": 9.788108229788111e-05, "loss": 1.4394, "step": 465 }, { "epoch": 3.0259740259740258, "grad_norm": 2.6161065101623535, "learning_rate": 9.787202414495276e-05, "loss": 1.5084, "step": 466 }, { "epoch": 3.0324675324675323, "grad_norm": 2.4646644592285156, "learning_rate": 9.786294709290741e-05, "loss": 1.375, "step": 467 }, { "epoch": 3.038961038961039, "grad_norm": 2.6410462856292725, "learning_rate": 9.785385114532857e-05, "loss": 1.4476, "step": 468 }, { "epoch": 3.0454545454545454, "grad_norm": 2.5742151737213135, "learning_rate": 9.784473630580713e-05, "loss": 1.4318, "step": 469 }, { "epoch": 3.051948051948052, "grad_norm": 2.7610526084899902, "learning_rate": 9.783560257794154e-05, "loss": 1.5746, "step": 470 }, { "epoch": 3.0584415584415585, "grad_norm": 2.6887526512145996, "learning_rate": 9.78264499653376e-05, "loss": 1.5727, "step": 471 }, { "epoch": 3.064935064935065, "grad_norm": 2.533906936645508, "learning_rate": 9.781727847160865e-05, "loss": 1.2746, "step": 472 }, { "epoch": 3.0714285714285716, "grad_norm": 2.960747241973877, "learning_rate": 9.780808810037543e-05, "loss": 1.5435, "step": 473 }, { "epoch": 3.0779220779220777, "grad_norm": 2.580984115600586, "learning_rate": 9.779887885526615e-05, "loss": 1.4734, "step": 474 }, { "epoch": 3.0844155844155843, "grad_norm": 2.5366058349609375, "learning_rate": 9.778965073991651e-05, "loss": 1.2849, "step": 475 }, { "epoch": 3.090909090909091, "grad_norm": 2.808206796646118, "learning_rate": 9.778040375796959e-05, "loss": 1.3771, "step": 476 }, { "epoch": 3.0974025974025974, "grad_norm": 3.073631763458252, "learning_rate": 9.777113791307598e-05, "loss": 1.3693, "step": 477 }, { "epoch": 3.103896103896104, "grad_norm": 2.810788869857788, "learning_rate": 9.776185320889363e-05, "loss": 1.6245, "step": 478 }, { "epoch": 3.1103896103896105, "grad_norm": 2.9978721141815186, "learning_rate": 9.775254964908807e-05, "loss": 1.5436, "step": 479 }, { "epoch": 3.116883116883117, "grad_norm": 2.744044065475464, "learning_rate": 9.774322723733216e-05, "loss": 1.6038, "step": 480 }, { "epoch": 3.1233766233766236, "grad_norm": 2.7648191452026367, "learning_rate": 9.773388597730623e-05, "loss": 1.3454, "step": 481 }, { "epoch": 3.1298701298701297, "grad_norm": 2.9355039596557617, "learning_rate": 9.772452587269808e-05, "loss": 1.5168, "step": 482 }, { "epoch": 3.1363636363636362, "grad_norm": 2.6012725830078125, "learning_rate": 9.771514692720293e-05, "loss": 1.4007, "step": 483 }, { "epoch": 3.142857142857143, "grad_norm": 2.895432949066162, "learning_rate": 9.770574914452343e-05, "loss": 1.5449, "step": 484 }, { "epoch": 3.1493506493506493, "grad_norm": 3.0441057682037354, "learning_rate": 9.769633252836969e-05, "loss": 1.7258, "step": 485 }, { "epoch": 3.155844155844156, "grad_norm": 2.6639978885650635, "learning_rate": 9.768689708245922e-05, "loss": 1.2904, "step": 486 }, { "epoch": 3.1623376623376624, "grad_norm": 2.972113847732544, "learning_rate": 9.767744281051701e-05, "loss": 1.5279, "step": 487 }, { "epoch": 3.168831168831169, "grad_norm": 2.602752685546875, "learning_rate": 9.766796971627543e-05, "loss": 1.4921, "step": 488 }, { "epoch": 3.175324675324675, "grad_norm": 2.472797393798828, "learning_rate": 9.765847780347432e-05, "loss": 1.4381, "step": 489 }, { "epoch": 3.1818181818181817, "grad_norm": 2.773179531097412, "learning_rate": 9.764896707586096e-05, "loss": 1.471, "step": 490 }, { "epoch": 3.188311688311688, "grad_norm": 2.8864376544952393, "learning_rate": 9.763943753718998e-05, "loss": 1.4751, "step": 491 }, { "epoch": 3.1948051948051948, "grad_norm": 2.8007543087005615, "learning_rate": 9.762988919122355e-05, "loss": 1.3703, "step": 492 }, { "epoch": 3.2012987012987013, "grad_norm": 2.5158145427703857, "learning_rate": 9.762032204173116e-05, "loss": 1.2792, "step": 493 }, { "epoch": 3.207792207792208, "grad_norm": 2.662209987640381, "learning_rate": 9.761073609248981e-05, "loss": 1.5026, "step": 494 }, { "epoch": 3.2142857142857144, "grad_norm": 2.8098185062408447, "learning_rate": 9.760113134728384e-05, "loss": 1.5946, "step": 495 }, { "epoch": 3.220779220779221, "grad_norm": 2.5345005989074707, "learning_rate": 9.759150780990507e-05, "loss": 1.4414, "step": 496 }, { "epoch": 3.227272727272727, "grad_norm": 2.4922263622283936, "learning_rate": 9.758186548415273e-05, "loss": 1.4034, "step": 497 }, { "epoch": 3.2337662337662336, "grad_norm": 2.613332509994507, "learning_rate": 9.757220437383346e-05, "loss": 1.5602, "step": 498 }, { "epoch": 3.24025974025974, "grad_norm": 2.9960076808929443, "learning_rate": 9.756252448276127e-05, "loss": 1.4437, "step": 499 }, { "epoch": 3.2467532467532467, "grad_norm": 2.567506790161133, "learning_rate": 9.755282581475769e-05, "loss": 1.5557, "step": 500 }, { "epoch": 3.2532467532467533, "grad_norm": 2.7986555099487305, "learning_rate": 9.754310837365155e-05, "loss": 1.458, "step": 501 }, { "epoch": 3.25974025974026, "grad_norm": 2.6835241317749023, "learning_rate": 9.753337216327917e-05, "loss": 1.4782, "step": 502 }, { "epoch": 3.2662337662337664, "grad_norm": 2.7536139488220215, "learning_rate": 9.752361718748423e-05, "loss": 1.5209, "step": 503 }, { "epoch": 3.2727272727272725, "grad_norm": 2.784055709838867, "learning_rate": 9.751384345011787e-05, "loss": 1.5919, "step": 504 }, { "epoch": 3.279220779220779, "grad_norm": 2.5098867416381836, "learning_rate": 9.750405095503859e-05, "loss": 1.5246, "step": 505 }, { "epoch": 3.2857142857142856, "grad_norm": 2.397177219390869, "learning_rate": 9.749423970611231e-05, "loss": 1.3737, "step": 506 }, { "epoch": 3.292207792207792, "grad_norm": 2.790895938873291, "learning_rate": 9.748440970721236e-05, "loss": 1.5794, "step": 507 }, { "epoch": 3.2987012987012987, "grad_norm": 2.7680368423461914, "learning_rate": 9.747456096221945e-05, "loss": 1.5599, "step": 508 }, { "epoch": 3.3051948051948052, "grad_norm": 2.880476951599121, "learning_rate": 9.746469347502174e-05, "loss": 1.6697, "step": 509 }, { "epoch": 3.311688311688312, "grad_norm": 2.7532639503479004, "learning_rate": 9.745480724951473e-05, "loss": 1.5952, "step": 510 }, { "epoch": 3.3181818181818183, "grad_norm": 2.685209035873413, "learning_rate": 9.744490228960138e-05, "loss": 1.5274, "step": 511 }, { "epoch": 3.324675324675325, "grad_norm": 2.940244674682617, "learning_rate": 9.743497859919196e-05, "loss": 1.5009, "step": 512 }, { "epoch": 3.331168831168831, "grad_norm": 2.9467644691467285, "learning_rate": 9.742503618220422e-05, "loss": 1.6353, "step": 513 }, { "epoch": 3.3376623376623376, "grad_norm": 2.8322834968566895, "learning_rate": 9.741507504256327e-05, "loss": 1.653, "step": 514 }, { "epoch": 3.344155844155844, "grad_norm": 3.078629493713379, "learning_rate": 9.74050951842016e-05, "loss": 1.6408, "step": 515 }, { "epoch": 3.3506493506493507, "grad_norm": 2.8227908611297607, "learning_rate": 9.739509661105912e-05, "loss": 1.6134, "step": 516 }, { "epoch": 3.357142857142857, "grad_norm": 2.8084356784820557, "learning_rate": 9.738507932708307e-05, "loss": 1.6953, "step": 517 }, { "epoch": 3.3636363636363638, "grad_norm": 2.492335557937622, "learning_rate": 9.737504333622813e-05, "loss": 1.534, "step": 518 }, { "epoch": 3.3701298701298703, "grad_norm": 2.6720969676971436, "learning_rate": 9.736498864245638e-05, "loss": 1.4763, "step": 519 }, { "epoch": 3.3766233766233764, "grad_norm": 2.631711006164551, "learning_rate": 9.735491524973722e-05, "loss": 1.5452, "step": 520 }, { "epoch": 3.383116883116883, "grad_norm": 3.2638559341430664, "learning_rate": 9.734482316204747e-05, "loss": 1.5939, "step": 521 }, { "epoch": 3.3896103896103895, "grad_norm": 2.783334255218506, "learning_rate": 9.733471238337136e-05, "loss": 1.448, "step": 522 }, { "epoch": 3.396103896103896, "grad_norm": 2.8106093406677246, "learning_rate": 9.73245829177004e-05, "loss": 1.378, "step": 523 }, { "epoch": 3.4025974025974026, "grad_norm": 2.796281099319458, "learning_rate": 9.73144347690336e-05, "loss": 1.5344, "step": 524 }, { "epoch": 3.409090909090909, "grad_norm": 2.7715847492218018, "learning_rate": 9.730426794137727e-05, "loss": 1.3997, "step": 525 }, { "epoch": 3.4155844155844157, "grad_norm": 2.7550740242004395, "learning_rate": 9.729408243874511e-05, "loss": 1.6974, "step": 526 }, { "epoch": 3.4220779220779223, "grad_norm": 2.8042678833007812, "learning_rate": 9.728387826515819e-05, "loss": 1.432, "step": 527 }, { "epoch": 3.4285714285714284, "grad_norm": 2.7147512435913086, "learning_rate": 9.727365542464497e-05, "loss": 1.6309, "step": 528 }, { "epoch": 3.435064935064935, "grad_norm": 2.6216771602630615, "learning_rate": 9.726341392124127e-05, "loss": 1.4175, "step": 529 }, { "epoch": 3.4415584415584415, "grad_norm": 2.668849468231201, "learning_rate": 9.725315375899024e-05, "loss": 1.6321, "step": 530 }, { "epoch": 3.448051948051948, "grad_norm": 2.8898913860321045, "learning_rate": 9.724287494194247e-05, "loss": 1.5724, "step": 531 }, { "epoch": 3.4545454545454546, "grad_norm": 2.313223361968994, "learning_rate": 9.723257747415584e-05, "loss": 1.357, "step": 532 }, { "epoch": 3.461038961038961, "grad_norm": 2.627986431121826, "learning_rate": 9.722226135969566e-05, "loss": 1.5693, "step": 533 }, { "epoch": 3.4675324675324677, "grad_norm": 2.4512863159179688, "learning_rate": 9.721192660263453e-05, "loss": 1.6062, "step": 534 }, { "epoch": 3.474025974025974, "grad_norm": 2.7816243171691895, "learning_rate": 9.72015732070525e-05, "loss": 1.6978, "step": 535 }, { "epoch": 3.4805194805194803, "grad_norm": 2.9086880683898926, "learning_rate": 9.719120117703687e-05, "loss": 1.5865, "step": 536 }, { "epoch": 3.487012987012987, "grad_norm": 2.2172327041625977, "learning_rate": 9.718081051668239e-05, "loss": 1.3989, "step": 537 }, { "epoch": 3.4935064935064934, "grad_norm": 2.8032913208007812, "learning_rate": 9.717040123009111e-05, "loss": 1.6331, "step": 538 }, { "epoch": 3.5, "grad_norm": 2.4231390953063965, "learning_rate": 9.715997332137248e-05, "loss": 1.5196, "step": 539 }, { "epoch": 3.5064935064935066, "grad_norm": 2.8790225982666016, "learning_rate": 9.714952679464323e-05, "loss": 1.5077, "step": 540 }, { "epoch": 3.512987012987013, "grad_norm": 2.6722092628479004, "learning_rate": 9.713906165402751e-05, "loss": 1.6147, "step": 541 }, { "epoch": 3.5194805194805197, "grad_norm": 2.7589612007141113, "learning_rate": 9.71285779036568e-05, "loss": 1.428, "step": 542 }, { "epoch": 3.525974025974026, "grad_norm": 2.4026832580566406, "learning_rate": 9.71180755476699e-05, "loss": 1.2818, "step": 543 }, { "epoch": 3.5324675324675323, "grad_norm": 2.9387869834899902, "learning_rate": 9.710755459021296e-05, "loss": 1.5423, "step": 544 }, { "epoch": 3.538961038961039, "grad_norm": 2.7179949283599854, "learning_rate": 9.709701503543954e-05, "loss": 1.5384, "step": 545 }, { "epoch": 3.5454545454545454, "grad_norm": 2.889209508895874, "learning_rate": 9.708645688751044e-05, "loss": 1.6434, "step": 546 }, { "epoch": 3.551948051948052, "grad_norm": 2.6073267459869385, "learning_rate": 9.707588015059386e-05, "loss": 1.729, "step": 547 }, { "epoch": 3.5584415584415585, "grad_norm": 2.7963855266571045, "learning_rate": 9.706528482886535e-05, "loss": 1.7076, "step": 548 }, { "epoch": 3.564935064935065, "grad_norm": 2.9273459911346436, "learning_rate": 9.705467092650775e-05, "loss": 1.5023, "step": 549 }, { "epoch": 3.571428571428571, "grad_norm": 2.3651621341705322, "learning_rate": 9.704403844771128e-05, "loss": 1.3576, "step": 550 }, { "epoch": 3.5779220779220777, "grad_norm": 2.7819669246673584, "learning_rate": 9.703338739667346e-05, "loss": 1.7064, "step": 551 }, { "epoch": 3.5844155844155843, "grad_norm": 2.7776331901550293, "learning_rate": 9.702271777759916e-05, "loss": 1.6858, "step": 552 }, { "epoch": 3.590909090909091, "grad_norm": 2.456737995147705, "learning_rate": 9.701202959470058e-05, "loss": 1.5001, "step": 553 }, { "epoch": 3.5974025974025974, "grad_norm": 2.6922426223754883, "learning_rate": 9.700132285219724e-05, "loss": 1.6717, "step": 554 }, { "epoch": 3.603896103896104, "grad_norm": 2.968545913696289, "learning_rate": 9.699059755431598e-05, "loss": 1.6364, "step": 555 }, { "epoch": 3.6103896103896105, "grad_norm": 2.709141731262207, "learning_rate": 9.697985370529101e-05, "loss": 1.7164, "step": 556 }, { "epoch": 3.616883116883117, "grad_norm": 2.5271835327148438, "learning_rate": 9.696909130936382e-05, "loss": 1.3943, "step": 557 }, { "epoch": 3.6233766233766236, "grad_norm": 2.51953125, "learning_rate": 9.695831037078322e-05, "loss": 1.5367, "step": 558 }, { "epoch": 3.62987012987013, "grad_norm": 2.145310401916504, "learning_rate": 9.694751089380536e-05, "loss": 1.2712, "step": 559 }, { "epoch": 3.6363636363636362, "grad_norm": 2.6842525005340576, "learning_rate": 9.693669288269372e-05, "loss": 1.5694, "step": 560 }, { "epoch": 3.642857142857143, "grad_norm": 2.610161781311035, "learning_rate": 9.692585634171905e-05, "loss": 1.6228, "step": 561 }, { "epoch": 3.6493506493506493, "grad_norm": 2.376155376434326, "learning_rate": 9.691500127515945e-05, "loss": 1.438, "step": 562 }, { "epoch": 3.655844155844156, "grad_norm": 3.225393533706665, "learning_rate": 9.690412768730035e-05, "loss": 1.7239, "step": 563 }, { "epoch": 3.6623376623376624, "grad_norm": 2.3960697650909424, "learning_rate": 9.689323558243446e-05, "loss": 1.4696, "step": 564 }, { "epoch": 3.6688311688311686, "grad_norm": 2.8606388568878174, "learning_rate": 9.688232496486178e-05, "loss": 1.5147, "step": 565 }, { "epoch": 3.675324675324675, "grad_norm": 2.504814863204956, "learning_rate": 9.687139583888972e-05, "loss": 1.5208, "step": 566 }, { "epoch": 3.6818181818181817, "grad_norm": 2.563558340072632, "learning_rate": 9.686044820883285e-05, "loss": 1.3693, "step": 567 }, { "epoch": 3.688311688311688, "grad_norm": 2.8867974281311035, "learning_rate": 9.684948207901315e-05, "loss": 1.6681, "step": 568 }, { "epoch": 3.6948051948051948, "grad_norm": 2.5309319496154785, "learning_rate": 9.68384974537599e-05, "loss": 1.5182, "step": 569 }, { "epoch": 3.7012987012987013, "grad_norm": 2.7548654079437256, "learning_rate": 9.682749433740962e-05, "loss": 1.7318, "step": 570 }, { "epoch": 3.707792207792208, "grad_norm": 2.2360949516296387, "learning_rate": 9.681647273430618e-05, "loss": 1.223, "step": 571 }, { "epoch": 3.7142857142857144, "grad_norm": 2.6854865550994873, "learning_rate": 9.680543264880076e-05, "loss": 1.6428, "step": 572 }, { "epoch": 3.720779220779221, "grad_norm": 2.4563329219818115, "learning_rate": 9.679437408525174e-05, "loss": 1.5267, "step": 573 }, { "epoch": 3.7272727272727275, "grad_norm": 2.505563735961914, "learning_rate": 9.678329704802494e-05, "loss": 1.4726, "step": 574 }, { "epoch": 3.7337662337662336, "grad_norm": 2.7705132961273193, "learning_rate": 9.677220154149336e-05, "loss": 1.6072, "step": 575 }, { "epoch": 3.74025974025974, "grad_norm": 2.2069296836853027, "learning_rate": 9.676108757003735e-05, "loss": 1.1699, "step": 576 }, { "epoch": 3.7467532467532467, "grad_norm": 2.6705710887908936, "learning_rate": 9.674995513804452e-05, "loss": 1.5892, "step": 577 }, { "epoch": 3.7532467532467533, "grad_norm": 2.477724313735962, "learning_rate": 9.673880424990977e-05, "loss": 1.4687, "step": 578 }, { "epoch": 3.75974025974026, "grad_norm": 2.465447425842285, "learning_rate": 9.672763491003531e-05, "loss": 1.5392, "step": 579 }, { "epoch": 3.7662337662337664, "grad_norm": 2.462146282196045, "learning_rate": 9.671644712283061e-05, "loss": 1.5552, "step": 580 }, { "epoch": 3.7727272727272725, "grad_norm": 2.4764628410339355, "learning_rate": 9.670524089271242e-05, "loss": 1.6651, "step": 581 }, { "epoch": 3.779220779220779, "grad_norm": 2.6047585010528564, "learning_rate": 9.669401622410482e-05, "loss": 1.4174, "step": 582 }, { "epoch": 3.7857142857142856, "grad_norm": 2.6200366020202637, "learning_rate": 9.668277312143907e-05, "loss": 1.5273, "step": 583 }, { "epoch": 3.792207792207792, "grad_norm": 3.028610944747925, "learning_rate": 9.667151158915382e-05, "loss": 1.698, "step": 584 }, { "epoch": 3.7987012987012987, "grad_norm": 2.632977247238159, "learning_rate": 9.666023163169493e-05, "loss": 1.6539, "step": 585 }, { "epoch": 3.8051948051948052, "grad_norm": 2.5790421962738037, "learning_rate": 9.664893325351555e-05, "loss": 1.5818, "step": 586 }, { "epoch": 3.811688311688312, "grad_norm": 2.546786069869995, "learning_rate": 9.663761645907609e-05, "loss": 1.6406, "step": 587 }, { "epoch": 3.8181818181818183, "grad_norm": 2.581007957458496, "learning_rate": 9.662628125284425e-05, "loss": 1.4747, "step": 588 }, { "epoch": 3.824675324675325, "grad_norm": 2.5376641750335693, "learning_rate": 9.6614927639295e-05, "loss": 1.5814, "step": 589 }, { "epoch": 3.8311688311688314, "grad_norm": 2.6359288692474365, "learning_rate": 9.660355562291055e-05, "loss": 1.5488, "step": 590 }, { "epoch": 3.8376623376623376, "grad_norm": 2.6092121601104736, "learning_rate": 9.65921652081804e-05, "loss": 1.4414, "step": 591 }, { "epoch": 3.844155844155844, "grad_norm": 2.4724388122558594, "learning_rate": 9.65807563996013e-05, "loss": 1.4849, "step": 592 }, { "epoch": 3.8506493506493507, "grad_norm": 2.303741693496704, "learning_rate": 9.656932920167727e-05, "loss": 1.6192, "step": 593 }, { "epoch": 3.857142857142857, "grad_norm": 2.2833895683288574, "learning_rate": 9.65578836189196e-05, "loss": 1.389, "step": 594 }, { "epoch": 3.8636363636363638, "grad_norm": 2.262861728668213, "learning_rate": 9.654641965584678e-05, "loss": 1.5055, "step": 595 }, { "epoch": 3.87012987012987, "grad_norm": 2.427997350692749, "learning_rate": 9.653493731698467e-05, "loss": 1.5428, "step": 596 }, { "epoch": 3.8766233766233764, "grad_norm": 2.410557270050049, "learning_rate": 9.652343660686626e-05, "loss": 1.5879, "step": 597 }, { "epoch": 3.883116883116883, "grad_norm": 2.3627400398254395, "learning_rate": 9.651191753003186e-05, "loss": 1.4858, "step": 598 }, { "epoch": 3.8896103896103895, "grad_norm": 2.57161283493042, "learning_rate": 9.650038009102905e-05, "loss": 1.6244, "step": 599 }, { "epoch": 3.896103896103896, "grad_norm": 2.7876200675964355, "learning_rate": 9.648882429441257e-05, "loss": 1.6798, "step": 600 }, { "epoch": 3.9025974025974026, "grad_norm": 2.5130650997161865, "learning_rate": 9.647725014474452e-05, "loss": 1.4941, "step": 601 }, { "epoch": 3.909090909090909, "grad_norm": 2.831350326538086, "learning_rate": 9.646565764659417e-05, "loss": 1.6509, "step": 602 }, { "epoch": 3.9155844155844157, "grad_norm": 2.643336534500122, "learning_rate": 9.645404680453805e-05, "loss": 1.5174, "step": 603 }, { "epoch": 3.9220779220779223, "grad_norm": 2.471973180770874, "learning_rate": 9.644241762315995e-05, "loss": 1.6618, "step": 604 }, { "epoch": 3.928571428571429, "grad_norm": 2.491856575012207, "learning_rate": 9.643077010705087e-05, "loss": 1.4538, "step": 605 }, { "epoch": 3.935064935064935, "grad_norm": 2.444056749343872, "learning_rate": 9.641910426080908e-05, "loss": 1.6486, "step": 606 }, { "epoch": 3.9415584415584415, "grad_norm": 2.5531413555145264, "learning_rate": 9.640742008904005e-05, "loss": 1.5494, "step": 607 }, { "epoch": 3.948051948051948, "grad_norm": 2.229311466217041, "learning_rate": 9.639571759635654e-05, "loss": 1.3976, "step": 608 }, { "epoch": 3.9545454545454546, "grad_norm": 2.342977285385132, "learning_rate": 9.638399678737848e-05, "loss": 1.6625, "step": 609 }, { "epoch": 3.961038961038961, "grad_norm": 2.172034978866577, "learning_rate": 9.637225766673307e-05, "loss": 1.3824, "step": 610 }, { "epoch": 3.9675324675324677, "grad_norm": 2.5296199321746826, "learning_rate": 9.636050023905473e-05, "loss": 1.6165, "step": 611 }, { "epoch": 3.974025974025974, "grad_norm": 2.536747455596924, "learning_rate": 9.63487245089851e-05, "loss": 1.7385, "step": 612 }, { "epoch": 3.9805194805194803, "grad_norm": 2.177907705307007, "learning_rate": 9.633693048117306e-05, "loss": 1.4146, "step": 613 }, { "epoch": 3.987012987012987, "grad_norm": 2.305320978164673, "learning_rate": 9.632511816027469e-05, "loss": 1.4506, "step": 614 }, { "epoch": 3.9935064935064934, "grad_norm": 2.482697010040283, "learning_rate": 9.631328755095333e-05, "loss": 1.5763, "step": 615 }, { "epoch": 4.0, "grad_norm": 5.43980073928833, "learning_rate": 9.630143865787951e-05, "loss": 1.6993, "step": 616 }, { "epoch": 4.0064935064935066, "grad_norm": 2.2191660404205322, "learning_rate": 9.628957148573098e-05, "loss": 1.2847, "step": 617 }, { "epoch": 4.012987012987013, "grad_norm": 2.345017671585083, "learning_rate": 9.62776860391927e-05, "loss": 1.2782, "step": 618 }, { "epoch": 4.01948051948052, "grad_norm": 2.5423858165740967, "learning_rate": 9.626578232295689e-05, "loss": 1.6394, "step": 619 }, { "epoch": 4.025974025974026, "grad_norm": 2.2237322330474854, "learning_rate": 9.62538603417229e-05, "loss": 1.2239, "step": 620 }, { "epoch": 4.032467532467533, "grad_norm": 2.228304147720337, "learning_rate": 9.62419201001974e-05, "loss": 1.3274, "step": 621 }, { "epoch": 4.038961038961039, "grad_norm": 2.5428571701049805, "learning_rate": 9.622996160309414e-05, "loss": 1.2906, "step": 622 }, { "epoch": 4.045454545454546, "grad_norm": 2.316067695617676, "learning_rate": 9.62179848551342e-05, "loss": 1.2922, "step": 623 }, { "epoch": 4.0519480519480515, "grad_norm": 2.2502663135528564, "learning_rate": 9.620598986104578e-05, "loss": 1.2758, "step": 624 }, { "epoch": 4.058441558441558, "grad_norm": 2.4511966705322266, "learning_rate": 9.619397662556435e-05, "loss": 1.3631, "step": 625 }, { "epoch": 4.064935064935065, "grad_norm": 2.5539820194244385, "learning_rate": 9.61819451534325e-05, "loss": 1.3669, "step": 626 }, { "epoch": 4.071428571428571, "grad_norm": 2.432616949081421, "learning_rate": 9.616989544940009e-05, "loss": 1.4103, "step": 627 }, { "epoch": 4.077922077922078, "grad_norm": 2.256044387817383, "learning_rate": 9.615782751822413e-05, "loss": 1.2974, "step": 628 }, { "epoch": 4.084415584415584, "grad_norm": 2.4633290767669678, "learning_rate": 9.614574136466888e-05, "loss": 1.3829, "step": 629 }, { "epoch": 4.090909090909091, "grad_norm": 2.8581559658050537, "learning_rate": 9.613363699350575e-05, "loss": 1.4883, "step": 630 }, { "epoch": 4.097402597402597, "grad_norm": 2.2781195640563965, "learning_rate": 9.612151440951334e-05, "loss": 1.362, "step": 631 }, { "epoch": 4.103896103896104, "grad_norm": 2.3701205253601074, "learning_rate": 9.610937361747748e-05, "loss": 1.2678, "step": 632 }, { "epoch": 4.1103896103896105, "grad_norm": 2.225470542907715, "learning_rate": 9.609721462219114e-05, "loss": 1.2274, "step": 633 }, { "epoch": 4.116883116883117, "grad_norm": 2.603336811065674, "learning_rate": 9.60850374284545e-05, "loss": 1.4969, "step": 634 }, { "epoch": 4.123376623376624, "grad_norm": 2.4754090309143066, "learning_rate": 9.607284204107493e-05, "loss": 1.2693, "step": 635 }, { "epoch": 4.12987012987013, "grad_norm": 2.260408639907837, "learning_rate": 9.606062846486698e-05, "loss": 1.2367, "step": 636 }, { "epoch": 4.136363636363637, "grad_norm": 2.4500088691711426, "learning_rate": 9.604839670465236e-05, "loss": 1.3531, "step": 637 }, { "epoch": 4.142857142857143, "grad_norm": 2.718536853790283, "learning_rate": 9.603614676526e-05, "loss": 1.4347, "step": 638 }, { "epoch": 4.14935064935065, "grad_norm": 2.782520294189453, "learning_rate": 9.602387865152597e-05, "loss": 1.3553, "step": 639 }, { "epoch": 4.1558441558441555, "grad_norm": 2.552777051925659, "learning_rate": 9.601159236829352e-05, "loss": 1.4462, "step": 640 }, { "epoch": 4.162337662337662, "grad_norm": 2.2942702770233154, "learning_rate": 9.599928792041308e-05, "loss": 1.2738, "step": 641 }, { "epoch": 4.1688311688311686, "grad_norm": 2.392411231994629, "learning_rate": 9.598696531274227e-05, "loss": 1.3295, "step": 642 }, { "epoch": 4.175324675324675, "grad_norm": 2.619590997695923, "learning_rate": 9.597462455014585e-05, "loss": 1.3489, "step": 643 }, { "epoch": 4.181818181818182, "grad_norm": 2.656822443008423, "learning_rate": 9.596226563749575e-05, "loss": 1.511, "step": 644 }, { "epoch": 4.188311688311688, "grad_norm": 2.4362051486968994, "learning_rate": 9.594988857967106e-05, "loss": 1.2914, "step": 645 }, { "epoch": 4.194805194805195, "grad_norm": 2.1643314361572266, "learning_rate": 9.593749338155809e-05, "loss": 1.1751, "step": 646 }, { "epoch": 4.201298701298701, "grad_norm": 2.725790023803711, "learning_rate": 9.592508004805023e-05, "loss": 1.5275, "step": 647 }, { "epoch": 4.207792207792208, "grad_norm": 2.422140598297119, "learning_rate": 9.59126485840481e-05, "loss": 1.3667, "step": 648 }, { "epoch": 4.214285714285714, "grad_norm": 2.537302017211914, "learning_rate": 9.59001989944594e-05, "loss": 1.4091, "step": 649 }, { "epoch": 4.220779220779221, "grad_norm": 2.6183526515960693, "learning_rate": 9.588773128419906e-05, "loss": 1.4391, "step": 650 }, { "epoch": 4.2272727272727275, "grad_norm": 2.544766426086426, "learning_rate": 9.587524545818913e-05, "loss": 1.4811, "step": 651 }, { "epoch": 4.233766233766234, "grad_norm": 2.4699995517730713, "learning_rate": 9.586274152135884e-05, "loss": 1.3246, "step": 652 }, { "epoch": 4.240259740259741, "grad_norm": 2.2006990909576416, "learning_rate": 9.58502194786445e-05, "loss": 1.2254, "step": 653 }, { "epoch": 4.246753246753247, "grad_norm": 2.6973538398742676, "learning_rate": 9.583767933498964e-05, "loss": 1.4668, "step": 654 }, { "epoch": 4.253246753246753, "grad_norm": 2.2840769290924072, "learning_rate": 9.58251210953449e-05, "loss": 1.2807, "step": 655 }, { "epoch": 4.259740259740259, "grad_norm": 2.4915285110473633, "learning_rate": 9.58125447646681e-05, "loss": 1.4697, "step": 656 }, { "epoch": 4.266233766233766, "grad_norm": 2.6363418102264404, "learning_rate": 9.579995034792414e-05, "loss": 1.3831, "step": 657 }, { "epoch": 4.2727272727272725, "grad_norm": 2.5724446773529053, "learning_rate": 9.578733785008513e-05, "loss": 1.4606, "step": 658 }, { "epoch": 4.279220779220779, "grad_norm": 2.6677348613739014, "learning_rate": 9.577470727613025e-05, "loss": 1.488, "step": 659 }, { "epoch": 4.285714285714286, "grad_norm": 2.5872716903686523, "learning_rate": 9.576205863104588e-05, "loss": 1.4098, "step": 660 }, { "epoch": 4.292207792207792, "grad_norm": 2.636482000350952, "learning_rate": 9.57493919198255e-05, "loss": 1.4056, "step": 661 }, { "epoch": 4.298701298701299, "grad_norm": 2.4787089824676514, "learning_rate": 9.573670714746972e-05, "loss": 1.3252, "step": 662 }, { "epoch": 4.305194805194805, "grad_norm": 2.544703245162964, "learning_rate": 9.572400431898627e-05, "loss": 1.213, "step": 663 }, { "epoch": 4.311688311688312, "grad_norm": 2.3949763774871826, "learning_rate": 9.571128343939005e-05, "loss": 1.4002, "step": 664 }, { "epoch": 4.318181818181818, "grad_norm": 2.5433011054992676, "learning_rate": 9.569854451370307e-05, "loss": 1.4224, "step": 665 }, { "epoch": 4.324675324675325, "grad_norm": 2.485722780227661, "learning_rate": 9.568578754695442e-05, "loss": 1.3944, "step": 666 }, { "epoch": 4.3311688311688314, "grad_norm": 2.7001330852508545, "learning_rate": 9.567301254418038e-05, "loss": 1.3847, "step": 667 }, { "epoch": 4.337662337662338, "grad_norm": 2.6091723442077637, "learning_rate": 9.566021951042433e-05, "loss": 1.48, "step": 668 }, { "epoch": 4.3441558441558445, "grad_norm": 2.37465238571167, "learning_rate": 9.56474084507367e-05, "loss": 1.2859, "step": 669 }, { "epoch": 4.35064935064935, "grad_norm": 2.298042058944702, "learning_rate": 9.563457937017515e-05, "loss": 1.2833, "step": 670 }, { "epoch": 4.357142857142857, "grad_norm": 2.418095827102661, "learning_rate": 9.562173227380436e-05, "loss": 1.2609, "step": 671 }, { "epoch": 4.363636363636363, "grad_norm": 2.5434913635253906, "learning_rate": 9.56088671666962e-05, "loss": 1.2695, "step": 672 }, { "epoch": 4.37012987012987, "grad_norm": 2.607151508331299, "learning_rate": 9.559598405392958e-05, "loss": 1.4324, "step": 673 }, { "epoch": 4.376623376623376, "grad_norm": 2.42596173286438, "learning_rate": 9.558308294059054e-05, "loss": 1.4373, "step": 674 }, { "epoch": 4.383116883116883, "grad_norm": 2.508871078491211, "learning_rate": 9.557016383177227e-05, "loss": 1.4397, "step": 675 }, { "epoch": 4.3896103896103895, "grad_norm": 2.110358715057373, "learning_rate": 9.555722673257501e-05, "loss": 1.139, "step": 676 }, { "epoch": 4.396103896103896, "grad_norm": 2.3169806003570557, "learning_rate": 9.554427164810611e-05, "loss": 1.231, "step": 677 }, { "epoch": 4.402597402597403, "grad_norm": 2.418757200241089, "learning_rate": 9.553129858348006e-05, "loss": 1.4744, "step": 678 }, { "epoch": 4.409090909090909, "grad_norm": 2.496831178665161, "learning_rate": 9.55183075438184e-05, "loss": 1.4325, "step": 679 }, { "epoch": 4.415584415584416, "grad_norm": 2.282007932662964, "learning_rate": 9.550529853424979e-05, "loss": 1.2713, "step": 680 }, { "epoch": 4.422077922077922, "grad_norm": 2.46980619430542, "learning_rate": 9.549227155990999e-05, "loss": 1.426, "step": 681 }, { "epoch": 4.428571428571429, "grad_norm": 2.3510262966156006, "learning_rate": 9.547922662594183e-05, "loss": 1.2731, "step": 682 }, { "epoch": 4.435064935064935, "grad_norm": 2.6384055614471436, "learning_rate": 9.546616373749525e-05, "loss": 1.4716, "step": 683 }, { "epoch": 4.441558441558442, "grad_norm": 2.60732102394104, "learning_rate": 9.545308289972728e-05, "loss": 1.4314, "step": 684 }, { "epoch": 4.448051948051948, "grad_norm": 2.352541923522949, "learning_rate": 9.543998411780201e-05, "loss": 1.3909, "step": 685 }, { "epoch": 4.454545454545454, "grad_norm": 2.4203426837921143, "learning_rate": 9.542686739689065e-05, "loss": 1.3476, "step": 686 }, { "epoch": 4.461038961038961, "grad_norm": 2.63857364654541, "learning_rate": 9.541373274217145e-05, "loss": 1.4261, "step": 687 }, { "epoch": 4.467532467532467, "grad_norm": 2.3278229236602783, "learning_rate": 9.540058015882979e-05, "loss": 1.2848, "step": 688 }, { "epoch": 4.474025974025974, "grad_norm": 2.334977388381958, "learning_rate": 9.538740965205808e-05, "loss": 1.3625, "step": 689 }, { "epoch": 4.48051948051948, "grad_norm": 2.4834561347961426, "learning_rate": 9.537422122705585e-05, "loss": 1.302, "step": 690 }, { "epoch": 4.487012987012987, "grad_norm": 2.0580313205718994, "learning_rate": 9.536101488902966e-05, "loss": 1.1823, "step": 691 }, { "epoch": 4.4935064935064934, "grad_norm": 2.5352213382720947, "learning_rate": 9.534779064319318e-05, "loss": 1.3543, "step": 692 }, { "epoch": 4.5, "grad_norm": 2.424487829208374, "learning_rate": 9.533454849476712e-05, "loss": 1.4382, "step": 693 }, { "epoch": 4.5064935064935066, "grad_norm": 2.486064910888672, "learning_rate": 9.532128844897928e-05, "loss": 1.3491, "step": 694 }, { "epoch": 4.512987012987013, "grad_norm": 2.4160115718841553, "learning_rate": 9.530801051106449e-05, "loss": 1.401, "step": 695 }, { "epoch": 4.51948051948052, "grad_norm": 2.027177095413208, "learning_rate": 9.529471468626472e-05, "loss": 1.1387, "step": 696 }, { "epoch": 4.525974025974026, "grad_norm": 2.3802859783172607, "learning_rate": 9.528140097982889e-05, "loss": 1.504, "step": 697 }, { "epoch": 4.532467532467533, "grad_norm": 2.4218432903289795, "learning_rate": 9.526806939701309e-05, "loss": 1.3862, "step": 698 }, { "epoch": 4.538961038961039, "grad_norm": 2.2717883586883545, "learning_rate": 9.52547199430804e-05, "loss": 1.1086, "step": 699 }, { "epoch": 4.545454545454545, "grad_norm": 2.2099993228912354, "learning_rate": 9.524135262330098e-05, "loss": 1.3446, "step": 700 }, { "epoch": 4.551948051948052, "grad_norm": 2.0593318939208984, "learning_rate": 9.522796744295202e-05, "loss": 1.0246, "step": 701 }, { "epoch": 4.558441558441558, "grad_norm": 2.5161662101745605, "learning_rate": 9.52145644073178e-05, "loss": 1.51, "step": 702 }, { "epoch": 4.564935064935065, "grad_norm": 2.523601531982422, "learning_rate": 9.520114352168958e-05, "loss": 1.4339, "step": 703 }, { "epoch": 4.571428571428571, "grad_norm": 2.5355663299560547, "learning_rate": 9.518770479136578e-05, "loss": 1.3231, "step": 704 }, { "epoch": 4.577922077922078, "grad_norm": 2.8767991065979004, "learning_rate": 9.517424822165175e-05, "loss": 1.5899, "step": 705 }, { "epoch": 4.584415584415584, "grad_norm": 2.6482222080230713, "learning_rate": 9.516077381785994e-05, "loss": 1.337, "step": 706 }, { "epoch": 4.590909090909091, "grad_norm": 2.7159810066223145, "learning_rate": 9.514728158530983e-05, "loss": 1.4175, "step": 707 }, { "epoch": 4.597402597402597, "grad_norm": 2.59430193901062, "learning_rate": 9.513377152932796e-05, "loss": 1.4217, "step": 708 }, { "epoch": 4.603896103896104, "grad_norm": 2.4445676803588867, "learning_rate": 9.512024365524787e-05, "loss": 1.4127, "step": 709 }, { "epoch": 4.6103896103896105, "grad_norm": 2.483778953552246, "learning_rate": 9.510669796841014e-05, "loss": 1.4122, "step": 710 }, { "epoch": 4.616883116883117, "grad_norm": 2.4489808082580566, "learning_rate": 9.509313447416242e-05, "loss": 1.5403, "step": 711 }, { "epoch": 4.623376623376624, "grad_norm": 2.4049267768859863, "learning_rate": 9.507955317785934e-05, "loss": 1.4104, "step": 712 }, { "epoch": 4.62987012987013, "grad_norm": 2.906292200088501, "learning_rate": 9.506595408486259e-05, "loss": 1.7337, "step": 713 }, { "epoch": 4.636363636363637, "grad_norm": 2.2860770225524902, "learning_rate": 9.505233720054087e-05, "loss": 1.3166, "step": 714 }, { "epoch": 4.642857142857143, "grad_norm": 2.660557270050049, "learning_rate": 9.503870253026991e-05, "loss": 1.4689, "step": 715 }, { "epoch": 4.64935064935065, "grad_norm": 2.4646763801574707, "learning_rate": 9.502505007943248e-05, "loss": 1.4584, "step": 716 }, { "epoch": 4.6558441558441555, "grad_norm": 2.287764072418213, "learning_rate": 9.501137985341832e-05, "loss": 1.4074, "step": 717 }, { "epoch": 4.662337662337662, "grad_norm": 3.6270928382873535, "learning_rate": 9.499769185762425e-05, "loss": 1.5872, "step": 718 }, { "epoch": 4.6688311688311686, "grad_norm": 2.5962812900543213, "learning_rate": 9.498398609745405e-05, "loss": 1.5825, "step": 719 }, { "epoch": 4.675324675324675, "grad_norm": 2.382645845413208, "learning_rate": 9.497026257831855e-05, "loss": 1.3595, "step": 720 }, { "epoch": 4.681818181818182, "grad_norm": 2.0103063583374023, "learning_rate": 9.49565213056356e-05, "loss": 1.1057, "step": 721 }, { "epoch": 4.688311688311688, "grad_norm": 2.2708091735839844, "learning_rate": 9.494276228482998e-05, "loss": 1.3579, "step": 722 }, { "epoch": 4.694805194805195, "grad_norm": 2.530606508255005, "learning_rate": 9.492898552133358e-05, "loss": 1.4606, "step": 723 }, { "epoch": 4.701298701298701, "grad_norm": 2.033069372177124, "learning_rate": 9.491519102058522e-05, "loss": 1.1266, "step": 724 }, { "epoch": 4.707792207792208, "grad_norm": 2.533560037612915, "learning_rate": 9.490137878803079e-05, "loss": 1.5121, "step": 725 }, { "epoch": 4.714285714285714, "grad_norm": 2.3064355850219727, "learning_rate": 9.48875488291231e-05, "loss": 1.3564, "step": 726 }, { "epoch": 4.720779220779221, "grad_norm": 2.3468894958496094, "learning_rate": 9.487370114932202e-05, "loss": 1.4063, "step": 727 }, { "epoch": 4.7272727272727275, "grad_norm": 2.2847111225128174, "learning_rate": 9.485983575409438e-05, "loss": 1.3642, "step": 728 }, { "epoch": 4.733766233766234, "grad_norm": 2.188500165939331, "learning_rate": 9.484595264891402e-05, "loss": 1.2824, "step": 729 }, { "epoch": 4.740259740259741, "grad_norm": 2.442934036254883, "learning_rate": 9.483205183926181e-05, "loss": 1.4042, "step": 730 }, { "epoch": 4.746753246753247, "grad_norm": 2.4196629524230957, "learning_rate": 9.48181333306255e-05, "loss": 1.4626, "step": 731 }, { "epoch": 4.753246753246753, "grad_norm": 2.290989637374878, "learning_rate": 9.480419712849995e-05, "loss": 1.2912, "step": 732 }, { "epoch": 4.759740259740259, "grad_norm": 2.430495262145996, "learning_rate": 9.479024323838693e-05, "loss": 1.4117, "step": 733 }, { "epoch": 4.766233766233766, "grad_norm": 2.0131633281707764, "learning_rate": 9.477627166579522e-05, "loss": 1.1689, "step": 734 }, { "epoch": 4.7727272727272725, "grad_norm": 2.582155466079712, "learning_rate": 9.476228241624059e-05, "loss": 1.6276, "step": 735 }, { "epoch": 4.779220779220779, "grad_norm": 2.3215701580047607, "learning_rate": 9.474827549524574e-05, "loss": 1.3326, "step": 736 }, { "epoch": 4.785714285714286, "grad_norm": 2.2553138732910156, "learning_rate": 9.473425090834041e-05, "loss": 1.3568, "step": 737 }, { "epoch": 4.792207792207792, "grad_norm": 2.5642342567443848, "learning_rate": 9.472020866106128e-05, "loss": 1.4779, "step": 738 }, { "epoch": 4.798701298701299, "grad_norm": 2.465707778930664, "learning_rate": 9.470614875895201e-05, "loss": 1.4478, "step": 739 }, { "epoch": 4.805194805194805, "grad_norm": 2.395329236984253, "learning_rate": 9.46920712075632e-05, "loss": 1.3339, "step": 740 }, { "epoch": 4.811688311688312, "grad_norm": 2.5474603176116943, "learning_rate": 9.467797601245246e-05, "loss": 1.4824, "step": 741 }, { "epoch": 4.818181818181818, "grad_norm": 2.498077154159546, "learning_rate": 9.466386317918436e-05, "loss": 1.4558, "step": 742 }, { "epoch": 4.824675324675325, "grad_norm": 2.258089065551758, "learning_rate": 9.464973271333042e-05, "loss": 1.3571, "step": 743 }, { "epoch": 4.8311688311688314, "grad_norm": 2.5701329708099365, "learning_rate": 9.463558462046912e-05, "loss": 1.5302, "step": 744 }, { "epoch": 4.837662337662338, "grad_norm": 2.347843885421753, "learning_rate": 9.46214189061859e-05, "loss": 1.3368, "step": 745 }, { "epoch": 4.8441558441558445, "grad_norm": 2.3211004734039307, "learning_rate": 9.460723557607316e-05, "loss": 1.3966, "step": 746 }, { "epoch": 4.85064935064935, "grad_norm": 2.381417989730835, "learning_rate": 9.459303463573026e-05, "loss": 1.267, "step": 747 }, { "epoch": 4.857142857142857, "grad_norm": 2.3821098804473877, "learning_rate": 9.457881609076352e-05, "loss": 1.4147, "step": 748 }, { "epoch": 4.863636363636363, "grad_norm": 2.3875110149383545, "learning_rate": 9.456457994678616e-05, "loss": 1.3116, "step": 749 }, { "epoch": 4.87012987012987, "grad_norm": 2.3099076747894287, "learning_rate": 9.45503262094184e-05, "loss": 1.2902, "step": 750 }, { "epoch": 4.876623376623376, "grad_norm": 2.0546610355377197, "learning_rate": 9.45360548842874e-05, "loss": 1.1813, "step": 751 }, { "epoch": 4.883116883116883, "grad_norm": 2.4475390911102295, "learning_rate": 9.452176597702725e-05, "loss": 1.5366, "step": 752 }, { "epoch": 4.8896103896103895, "grad_norm": 2.145540237426758, "learning_rate": 9.450745949327896e-05, "loss": 1.1983, "step": 753 }, { "epoch": 4.896103896103896, "grad_norm": 2.694192886352539, "learning_rate": 9.449313543869055e-05, "loss": 1.5095, "step": 754 }, { "epoch": 4.902597402597403, "grad_norm": 2.1562998294830322, "learning_rate": 9.447879381891692e-05, "loss": 1.1382, "step": 755 }, { "epoch": 4.909090909090909, "grad_norm": 2.4446427822113037, "learning_rate": 9.446443463961986e-05, "loss": 1.4053, "step": 756 }, { "epoch": 4.915584415584416, "grad_norm": 2.055360794067383, "learning_rate": 9.445005790646819e-05, "loss": 1.3194, "step": 757 }, { "epoch": 4.922077922077922, "grad_norm": 2.397583246231079, "learning_rate": 9.443566362513763e-05, "loss": 1.5682, "step": 758 }, { "epoch": 4.928571428571429, "grad_norm": 2.185490846633911, "learning_rate": 9.442125180131078e-05, "loss": 1.4359, "step": 759 }, { "epoch": 4.935064935064935, "grad_norm": 2.365079402923584, "learning_rate": 9.440682244067724e-05, "loss": 1.4141, "step": 760 }, { "epoch": 4.941558441558442, "grad_norm": 2.510624408721924, "learning_rate": 9.439237554893344e-05, "loss": 1.5375, "step": 761 }, { "epoch": 4.948051948051948, "grad_norm": 2.369779109954834, "learning_rate": 9.437791113178282e-05, "loss": 1.4662, "step": 762 }, { "epoch": 4.954545454545455, "grad_norm": 2.063880443572998, "learning_rate": 9.43634291949357e-05, "loss": 1.3133, "step": 763 }, { "epoch": 4.961038961038961, "grad_norm": 2.3315935134887695, "learning_rate": 9.434892974410932e-05, "loss": 1.4277, "step": 764 }, { "epoch": 4.967532467532467, "grad_norm": 2.5497238636016846, "learning_rate": 9.433441278502783e-05, "loss": 1.4894, "step": 765 }, { "epoch": 4.974025974025974, "grad_norm": 2.4642364978790283, "learning_rate": 9.431987832342228e-05, "loss": 1.4023, "step": 766 }, { "epoch": 4.98051948051948, "grad_norm": 2.380721092224121, "learning_rate": 9.430532636503068e-05, "loss": 1.3227, "step": 767 }, { "epoch": 4.987012987012987, "grad_norm": 2.2699685096740723, "learning_rate": 9.429075691559787e-05, "loss": 1.352, "step": 768 }, { "epoch": 4.9935064935064934, "grad_norm": 2.156005382537842, "learning_rate": 9.427616998087568e-05, "loss": 1.3047, "step": 769 }, { "epoch": 5.0, "grad_norm": 1483.0968017578125, "learning_rate": 9.426156556662276e-05, "loss": 1.3802, "step": 770 }, { "epoch": 5.0064935064935066, "grad_norm": 2.4799044132232666, "learning_rate": 9.424694367860473e-05, "loss": 1.214, "step": 771 }, { "epoch": 5.012987012987013, "grad_norm": 3.611436128616333, "learning_rate": 9.423230432259409e-05, "loss": 1.1361, "step": 772 }, { "epoch": 5.01948051948052, "grad_norm": 2.477595567703247, "learning_rate": 9.421764750437019e-05, "loss": 1.322, "step": 773 }, { "epoch": 5.025974025974026, "grad_norm": 2.0933218002319336, "learning_rate": 9.420297322971933e-05, "loss": 1.1123, "step": 774 }, { "epoch": 5.032467532467533, "grad_norm": 2.1223325729370117, "learning_rate": 9.418828150443467e-05, "loss": 1.2504, "step": 775 }, { "epoch": 5.038961038961039, "grad_norm": 2.640428304672241, "learning_rate": 9.41735723343163e-05, "loss": 1.3819, "step": 776 }, { "epoch": 5.045454545454546, "grad_norm": 2.309741258621216, "learning_rate": 9.415884572517113e-05, "loss": 1.2354, "step": 777 }, { "epoch": 5.0519480519480515, "grad_norm": 2.4606881141662598, "learning_rate": 9.414410168281302e-05, "loss": 1.4091, "step": 778 }, { "epoch": 5.058441558441558, "grad_norm": 2.1588926315307617, "learning_rate": 9.412934021306267e-05, "loss": 1.1284, "step": 779 }, { "epoch": 5.064935064935065, "grad_norm": 2.3062281608581543, "learning_rate": 9.411456132174767e-05, "loss": 1.2039, "step": 780 }, { "epoch": 5.071428571428571, "grad_norm": 2.493637800216675, "learning_rate": 9.40997650147025e-05, "loss": 1.2556, "step": 781 }, { "epoch": 5.077922077922078, "grad_norm": 2.254223346710205, "learning_rate": 9.408495129776852e-05, "loss": 1.14, "step": 782 }, { "epoch": 5.084415584415584, "grad_norm": 2.187127113342285, "learning_rate": 9.407012017679393e-05, "loss": 1.1906, "step": 783 }, { "epoch": 5.090909090909091, "grad_norm": 2.017028570175171, "learning_rate": 9.405527165763384e-05, "loss": 1.0582, "step": 784 }, { "epoch": 5.097402597402597, "grad_norm": 2.3100154399871826, "learning_rate": 9.404040574615018e-05, "loss": 1.3244, "step": 785 }, { "epoch": 5.103896103896104, "grad_norm": 2.231184482574463, "learning_rate": 9.402552244821182e-05, "loss": 1.0768, "step": 786 }, { "epoch": 5.1103896103896105, "grad_norm": 2.422355890274048, "learning_rate": 9.401062176969442e-05, "loss": 1.1453, "step": 787 }, { "epoch": 5.116883116883117, "grad_norm": 2.3468523025512695, "learning_rate": 9.399570371648052e-05, "loss": 1.1517, "step": 788 }, { "epoch": 5.123376623376624, "grad_norm": 2.1444785594940186, "learning_rate": 9.398076829445958e-05, "loss": 1.0645, "step": 789 }, { "epoch": 5.12987012987013, "grad_norm": 2.26538348197937, "learning_rate": 9.396581550952781e-05, "loss": 1.1867, "step": 790 }, { "epoch": 5.136363636363637, "grad_norm": 2.3012781143188477, "learning_rate": 9.395084536758838e-05, "loss": 1.1908, "step": 791 }, { "epoch": 5.142857142857143, "grad_norm": 2.353574514389038, "learning_rate": 9.393585787455124e-05, "loss": 1.1686, "step": 792 }, { "epoch": 5.14935064935065, "grad_norm": 2.434039354324341, "learning_rate": 9.392085303633323e-05, "loss": 1.1923, "step": 793 }, { "epoch": 5.1558441558441555, "grad_norm": 2.4139668941497803, "learning_rate": 9.3905830858858e-05, "loss": 1.2742, "step": 794 }, { "epoch": 5.162337662337662, "grad_norm": 2.5167605876922607, "learning_rate": 9.389079134805609e-05, "loss": 1.2763, "step": 795 }, { "epoch": 5.1688311688311686, "grad_norm": 2.29848313331604, "learning_rate": 9.387573450986484e-05, "loss": 1.2501, "step": 796 }, { "epoch": 5.175324675324675, "grad_norm": 2.4731240272521973, "learning_rate": 9.386066035022848e-05, "loss": 1.4086, "step": 797 }, { "epoch": 5.181818181818182, "grad_norm": 2.226640224456787, "learning_rate": 9.384556887509802e-05, "loss": 1.1452, "step": 798 }, { "epoch": 5.188311688311688, "grad_norm": 2.1757423877716064, "learning_rate": 9.383046009043134e-05, "loss": 1.2051, "step": 799 }, { "epoch": 5.194805194805195, "grad_norm": 2.5131468772888184, "learning_rate": 9.381533400219318e-05, "loss": 1.3115, "step": 800 }, { "epoch": 5.201298701298701, "grad_norm": 2.236072063446045, "learning_rate": 9.380019061635506e-05, "loss": 1.1337, "step": 801 }, { "epoch": 5.207792207792208, "grad_norm": 2.25288987159729, "learning_rate": 9.378502993889533e-05, "loss": 1.2222, "step": 802 }, { "epoch": 5.214285714285714, "grad_norm": 2.3511617183685303, "learning_rate": 9.37698519757992e-05, "loss": 1.228, "step": 803 }, { "epoch": 5.220779220779221, "grad_norm": 2.281393051147461, "learning_rate": 9.375465673305869e-05, "loss": 1.1854, "step": 804 }, { "epoch": 5.2272727272727275, "grad_norm": 2.451622247695923, "learning_rate": 9.373944421667265e-05, "loss": 1.2707, "step": 805 }, { "epoch": 5.233766233766234, "grad_norm": 2.3030169010162354, "learning_rate": 9.372421443264671e-05, "loss": 1.0945, "step": 806 }, { "epoch": 5.240259740259741, "grad_norm": 2.3412272930145264, "learning_rate": 9.370896738699339e-05, "loss": 1.1891, "step": 807 }, { "epoch": 5.246753246753247, "grad_norm": 2.457958936691284, "learning_rate": 9.369370308573198e-05, "loss": 1.2034, "step": 808 }, { "epoch": 5.253246753246753, "grad_norm": 2.3870041370391846, "learning_rate": 9.367842153488854e-05, "loss": 1.1308, "step": 809 }, { "epoch": 5.259740259740259, "grad_norm": 2.373983860015869, "learning_rate": 9.366312274049602e-05, "loss": 1.2246, "step": 810 }, { "epoch": 5.266233766233766, "grad_norm": 2.238525152206421, "learning_rate": 9.364780670859412e-05, "loss": 1.1794, "step": 811 }, { "epoch": 5.2727272727272725, "grad_norm": 2.2688422203063965, "learning_rate": 9.363247344522939e-05, "loss": 1.3021, "step": 812 }, { "epoch": 5.279220779220779, "grad_norm": 2.4562861919403076, "learning_rate": 9.361712295645515e-05, "loss": 1.2968, "step": 813 }, { "epoch": 5.285714285714286, "grad_norm": 1.8892732858657837, "learning_rate": 9.360175524833153e-05, "loss": 0.895, "step": 814 }, { "epoch": 5.292207792207792, "grad_norm": 2.4016454219818115, "learning_rate": 9.358637032692545e-05, "loss": 1.1788, "step": 815 }, { "epoch": 5.298701298701299, "grad_norm": 2.198923349380493, "learning_rate": 9.357096819831064e-05, "loss": 1.1777, "step": 816 }, { "epoch": 5.305194805194805, "grad_norm": 2.5778071880340576, "learning_rate": 9.355554886856762e-05, "loss": 1.2441, "step": 817 }, { "epoch": 5.311688311688312, "grad_norm": 2.578562021255493, "learning_rate": 9.354011234378369e-05, "loss": 1.3336, "step": 818 }, { "epoch": 5.318181818181818, "grad_norm": 2.29508376121521, "learning_rate": 9.352465863005296e-05, "loss": 1.3143, "step": 819 }, { "epoch": 5.324675324675325, "grad_norm": 2.3751208782196045, "learning_rate": 9.35091877334763e-05, "loss": 1.3194, "step": 820 }, { "epoch": 5.3311688311688314, "grad_norm": 2.456490993499756, "learning_rate": 9.349369966016134e-05, "loss": 1.2553, "step": 821 }, { "epoch": 5.337662337662338, "grad_norm": 1.9089747667312622, "learning_rate": 9.347819441622261e-05, "loss": 0.9778, "step": 822 }, { "epoch": 5.3441558441558445, "grad_norm": 2.301745891571045, "learning_rate": 9.346267200778126e-05, "loss": 1.1925, "step": 823 }, { "epoch": 5.35064935064935, "grad_norm": 2.3186118602752686, "learning_rate": 9.344713244096533e-05, "loss": 1.2568, "step": 824 }, { "epoch": 5.357142857142857, "grad_norm": 2.4224462509155273, "learning_rate": 9.343157572190957e-05, "loss": 1.2227, "step": 825 }, { "epoch": 5.363636363636363, "grad_norm": 2.4902286529541016, "learning_rate": 9.341600185675554e-05, "loss": 1.2466, "step": 826 }, { "epoch": 5.37012987012987, "grad_norm": 2.3658058643341064, "learning_rate": 9.340041085165155e-05, "loss": 1.288, "step": 827 }, { "epoch": 5.376623376623376, "grad_norm": 2.256941318511963, "learning_rate": 9.33848027127527e-05, "loss": 1.2251, "step": 828 }, { "epoch": 5.383116883116883, "grad_norm": 2.4632041454315186, "learning_rate": 9.336917744622081e-05, "loss": 1.3159, "step": 829 }, { "epoch": 5.3896103896103895, "grad_norm": 2.1642262935638428, "learning_rate": 9.33535350582245e-05, "loss": 1.1923, "step": 830 }, { "epoch": 5.396103896103896, "grad_norm": 2.154273748397827, "learning_rate": 9.333787555493914e-05, "loss": 1.1474, "step": 831 }, { "epoch": 5.402597402597403, "grad_norm": 2.319180965423584, "learning_rate": 9.332219894254686e-05, "loss": 1.2664, "step": 832 }, { "epoch": 5.409090909090909, "grad_norm": 2.0859668254852295, "learning_rate": 9.330650522723652e-05, "loss": 1.0003, "step": 833 }, { "epoch": 5.415584415584416, "grad_norm": 2.426114082336426, "learning_rate": 9.329079441520377e-05, "loss": 1.249, "step": 834 }, { "epoch": 5.422077922077922, "grad_norm": 2.3975894451141357, "learning_rate": 9.327506651265095e-05, "loss": 1.2747, "step": 835 }, { "epoch": 5.428571428571429, "grad_norm": 2.561692476272583, "learning_rate": 9.325932152578725e-05, "loss": 1.1971, "step": 836 }, { "epoch": 5.435064935064935, "grad_norm": 2.311870813369751, "learning_rate": 9.324355946082848e-05, "loss": 1.1388, "step": 837 }, { "epoch": 5.441558441558442, "grad_norm": 2.2985804080963135, "learning_rate": 9.322778032399728e-05, "loss": 1.1915, "step": 838 }, { "epoch": 5.448051948051948, "grad_norm": 2.5095090866088867, "learning_rate": 9.321198412152301e-05, "loss": 1.2289, "step": 839 }, { "epoch": 5.454545454545454, "grad_norm": 2.4977407455444336, "learning_rate": 9.319617085964176e-05, "loss": 1.1548, "step": 840 }, { "epoch": 5.461038961038961, "grad_norm": 2.479334831237793, "learning_rate": 9.318034054459637e-05, "loss": 1.2881, "step": 841 }, { "epoch": 5.467532467532467, "grad_norm": 2.316788911819458, "learning_rate": 9.316449318263635e-05, "loss": 1.0807, "step": 842 }, { "epoch": 5.474025974025974, "grad_norm": 2.3556737899780273, "learning_rate": 9.314862878001803e-05, "loss": 1.3983, "step": 843 }, { "epoch": 5.48051948051948, "grad_norm": 2.27823805809021, "learning_rate": 9.313274734300439e-05, "loss": 1.2266, "step": 844 }, { "epoch": 5.487012987012987, "grad_norm": 2.213878870010376, "learning_rate": 9.31168488778652e-05, "loss": 1.1607, "step": 845 }, { "epoch": 5.4935064935064934, "grad_norm": 2.411067485809326, "learning_rate": 9.310093339087692e-05, "loss": 1.3101, "step": 846 }, { "epoch": 5.5, "grad_norm": 2.3228533267974854, "learning_rate": 9.308500088832272e-05, "loss": 1.1342, "step": 847 }, { "epoch": 5.5064935064935066, "grad_norm": 2.511704683303833, "learning_rate": 9.30690513764925e-05, "loss": 1.3019, "step": 848 }, { "epoch": 5.512987012987013, "grad_norm": 2.267284393310547, "learning_rate": 9.305308486168288e-05, "loss": 1.2113, "step": 849 }, { "epoch": 5.51948051948052, "grad_norm": 2.3928427696228027, "learning_rate": 9.30371013501972e-05, "loss": 1.2584, "step": 850 }, { "epoch": 5.525974025974026, "grad_norm": 2.4797680377960205, "learning_rate": 9.302110084834545e-05, "loss": 1.2511, "step": 851 }, { "epoch": 5.532467532467533, "grad_norm": 2.280144214630127, "learning_rate": 9.300508336244444e-05, "loss": 1.3214, "step": 852 }, { "epoch": 5.538961038961039, "grad_norm": 2.3917574882507324, "learning_rate": 9.298904889881757e-05, "loss": 1.3104, "step": 853 }, { "epoch": 5.545454545454545, "grad_norm": 1.8771497011184692, "learning_rate": 9.297299746379502e-05, "loss": 1.0212, "step": 854 }, { "epoch": 5.551948051948052, "grad_norm": 2.3260936737060547, "learning_rate": 9.295692906371363e-05, "loss": 1.2671, "step": 855 }, { "epoch": 5.558441558441558, "grad_norm": 2.335958957672119, "learning_rate": 9.294084370491694e-05, "loss": 1.2515, "step": 856 }, { "epoch": 5.564935064935065, "grad_norm": 2.451331853866577, "learning_rate": 9.292474139375522e-05, "loss": 1.0896, "step": 857 }, { "epoch": 5.571428571428571, "grad_norm": 2.263322353363037, "learning_rate": 9.29086221365854e-05, "loss": 1.247, "step": 858 }, { "epoch": 5.577922077922078, "grad_norm": 2.4265804290771484, "learning_rate": 9.289248593977109e-05, "loss": 1.2482, "step": 859 }, { "epoch": 5.584415584415584, "grad_norm": 2.4868714809417725, "learning_rate": 9.287633280968261e-05, "loss": 1.2864, "step": 860 }, { "epoch": 5.590909090909091, "grad_norm": 2.502161979675293, "learning_rate": 9.286016275269698e-05, "loss": 1.2699, "step": 861 }, { "epoch": 5.597402597402597, "grad_norm": 2.301053762435913, "learning_rate": 9.284397577519788e-05, "loss": 1.2422, "step": 862 }, { "epoch": 5.603896103896104, "grad_norm": 2.7383460998535156, "learning_rate": 9.282777188357565e-05, "loss": 1.3225, "step": 863 }, { "epoch": 5.6103896103896105, "grad_norm": 2.5754849910736084, "learning_rate": 9.281155108422733e-05, "loss": 1.3382, "step": 864 }, { "epoch": 5.616883116883117, "grad_norm": 2.30465030670166, "learning_rate": 9.279531338355666e-05, "loss": 1.2114, "step": 865 }, { "epoch": 5.623376623376624, "grad_norm": 2.1572651863098145, "learning_rate": 9.2779058787974e-05, "loss": 1.2831, "step": 866 }, { "epoch": 5.62987012987013, "grad_norm": 2.3018293380737305, "learning_rate": 9.276278730389642e-05, "loss": 1.1312, "step": 867 }, { "epoch": 5.636363636363637, "grad_norm": 2.3740503787994385, "learning_rate": 9.274649893774767e-05, "loss": 1.2546, "step": 868 }, { "epoch": 5.642857142857143, "grad_norm": 2.359429121017456, "learning_rate": 9.273019369595809e-05, "loss": 1.2289, "step": 869 }, { "epoch": 5.64935064935065, "grad_norm": 2.7115275859832764, "learning_rate": 9.271387158496476e-05, "loss": 1.3554, "step": 870 }, { "epoch": 5.6558441558441555, "grad_norm": 1.951027274131775, "learning_rate": 9.269753261121138e-05, "loss": 1.1395, "step": 871 }, { "epoch": 5.662337662337662, "grad_norm": 2.2826426029205322, "learning_rate": 9.268117678114834e-05, "loss": 1.2125, "step": 872 }, { "epoch": 5.6688311688311686, "grad_norm": 2.1922526359558105, "learning_rate": 9.266480410123264e-05, "loss": 1.0959, "step": 873 }, { "epoch": 5.675324675324675, "grad_norm": 2.3663859367370605, "learning_rate": 9.264841457792795e-05, "loss": 1.2634, "step": 874 }, { "epoch": 5.681818181818182, "grad_norm": 2.077533483505249, "learning_rate": 9.263200821770461e-05, "loss": 1.1805, "step": 875 }, { "epoch": 5.688311688311688, "grad_norm": 2.255629062652588, "learning_rate": 9.26155850270396e-05, "loss": 1.3269, "step": 876 }, { "epoch": 5.694805194805195, "grad_norm": 2.387958288192749, "learning_rate": 9.259914501241652e-05, "loss": 1.2001, "step": 877 }, { "epoch": 5.701298701298701, "grad_norm": 2.3133974075317383, "learning_rate": 9.258268818032561e-05, "loss": 1.2889, "step": 878 }, { "epoch": 5.707792207792208, "grad_norm": 2.17250919342041, "learning_rate": 9.256621453726379e-05, "loss": 1.2761, "step": 879 }, { "epoch": 5.714285714285714, "grad_norm": 2.288163900375366, "learning_rate": 9.254972408973461e-05, "loss": 1.2545, "step": 880 }, { "epoch": 5.720779220779221, "grad_norm": 2.2603843212127686, "learning_rate": 9.25332168442482e-05, "loss": 1.2539, "step": 881 }, { "epoch": 5.7272727272727275, "grad_norm": 2.111243963241577, "learning_rate": 9.251669280732137e-05, "loss": 1.2284, "step": 882 }, { "epoch": 5.733766233766234, "grad_norm": 9.176555633544922, "learning_rate": 9.250015198547757e-05, "loss": 1.3237, "step": 883 }, { "epoch": 5.740259740259741, "grad_norm": 2.652176856994629, "learning_rate": 9.248359438524683e-05, "loss": 1.3154, "step": 884 }, { "epoch": 5.746753246753247, "grad_norm": 2.4932117462158203, "learning_rate": 9.246702001316583e-05, "loss": 1.2712, "step": 885 }, { "epoch": 5.753246753246753, "grad_norm": 2.2778451442718506, "learning_rate": 9.245042887577788e-05, "loss": 1.3088, "step": 886 }, { "epoch": 5.759740259740259, "grad_norm": 2.240194082260132, "learning_rate": 9.243382097963291e-05, "loss": 1.1532, "step": 887 }, { "epoch": 5.766233766233766, "grad_norm": 2.4618711471557617, "learning_rate": 9.241719633128743e-05, "loss": 1.4605, "step": 888 }, { "epoch": 5.7727272727272725, "grad_norm": 2.5479254722595215, "learning_rate": 9.24005549373046e-05, "loss": 1.3666, "step": 889 }, { "epoch": 5.779220779220779, "grad_norm": 2.277982473373413, "learning_rate": 9.238389680425416e-05, "loss": 1.3209, "step": 890 }, { "epoch": 5.785714285714286, "grad_norm": 2.3099091053009033, "learning_rate": 9.236722193871252e-05, "loss": 1.2911, "step": 891 }, { "epoch": 5.792207792207792, "grad_norm": 2.1005160808563232, "learning_rate": 9.23505303472626e-05, "loss": 1.1609, "step": 892 }, { "epoch": 5.798701298701299, "grad_norm": 2.325089693069458, "learning_rate": 9.233382203649401e-05, "loss": 1.2268, "step": 893 }, { "epoch": 5.805194805194805, "grad_norm": 2.2837464809417725, "learning_rate": 9.231709701300293e-05, "loss": 1.2928, "step": 894 }, { "epoch": 5.811688311688312, "grad_norm": 2.1417577266693115, "learning_rate": 9.230035528339211e-05, "loss": 1.2102, "step": 895 }, { "epoch": 5.818181818181818, "grad_norm": 2.3167195320129395, "learning_rate": 9.228359685427095e-05, "loss": 1.2982, "step": 896 }, { "epoch": 5.824675324675325, "grad_norm": 2.281616449356079, "learning_rate": 9.226682173225537e-05, "loss": 1.2442, "step": 897 }, { "epoch": 5.8311688311688314, "grad_norm": 2.245262622833252, "learning_rate": 9.225002992396796e-05, "loss": 1.2757, "step": 898 }, { "epoch": 5.837662337662338, "grad_norm": 2.2756450176239014, "learning_rate": 9.223322143603785e-05, "loss": 1.1611, "step": 899 }, { "epoch": 5.8441558441558445, "grad_norm": 2.2296695709228516, "learning_rate": 9.221639627510076e-05, "loss": 1.2486, "step": 900 }, { "epoch": 5.85064935064935, "grad_norm": 2.456972599029541, "learning_rate": 9.2199554447799e-05, "loss": 1.2553, "step": 901 }, { "epoch": 5.857142857142857, "grad_norm": 2.2772555351257324, "learning_rate": 9.218269596078146e-05, "loss": 1.1621, "step": 902 }, { "epoch": 5.863636363636363, "grad_norm": 2.2198710441589355, "learning_rate": 9.216582082070358e-05, "loss": 1.1448, "step": 903 }, { "epoch": 5.87012987012987, "grad_norm": 2.201732873916626, "learning_rate": 9.214892903422744e-05, "loss": 1.1754, "step": 904 }, { "epoch": 5.876623376623376, "grad_norm": 2.263493061065674, "learning_rate": 9.213202060802161e-05, "loss": 1.3105, "step": 905 }, { "epoch": 5.883116883116883, "grad_norm": 2.3806960582733154, "learning_rate": 9.21150955487613e-05, "loss": 1.2363, "step": 906 }, { "epoch": 5.8896103896103895, "grad_norm": 2.324129104614258, "learning_rate": 9.209815386312824e-05, "loss": 1.3469, "step": 907 }, { "epoch": 5.896103896103896, "grad_norm": 2.4612674713134766, "learning_rate": 9.208119555781074e-05, "loss": 1.3424, "step": 908 }, { "epoch": 5.902597402597403, "grad_norm": 2.341952323913574, "learning_rate": 9.206422063950367e-05, "loss": 1.2235, "step": 909 }, { "epoch": 5.909090909090909, "grad_norm": 2.305021047592163, "learning_rate": 9.204722911490846e-05, "loss": 1.3399, "step": 910 }, { "epoch": 5.915584415584416, "grad_norm": 2.174239158630371, "learning_rate": 9.203022099073309e-05, "loss": 1.106, "step": 911 }, { "epoch": 5.922077922077922, "grad_norm": 2.136263847351074, "learning_rate": 9.201319627369211e-05, "loss": 1.185, "step": 912 }, { "epoch": 5.928571428571429, "grad_norm": 2.0439507961273193, "learning_rate": 9.199615497050659e-05, "loss": 1.1936, "step": 913 }, { "epoch": 5.935064935064935, "grad_norm": 2.331047773361206, "learning_rate": 9.19790970879042e-05, "loss": 1.2955, "step": 914 }, { "epoch": 5.941558441558442, "grad_norm": 2.1538991928100586, "learning_rate": 9.19620226326191e-05, "loss": 1.2146, "step": 915 }, { "epoch": 5.948051948051948, "grad_norm": 2.235663414001465, "learning_rate": 9.194493161139199e-05, "loss": 1.2987, "step": 916 }, { "epoch": 5.954545454545455, "grad_norm": 2.266671657562256, "learning_rate": 9.192782403097018e-05, "loss": 1.2618, "step": 917 }, { "epoch": 5.961038961038961, "grad_norm": 2.287224292755127, "learning_rate": 9.191069989810744e-05, "loss": 1.2801, "step": 918 }, { "epoch": 5.967532467532467, "grad_norm": 2.454397678375244, "learning_rate": 9.189355921956412e-05, "loss": 1.3529, "step": 919 }, { "epoch": 5.974025974025974, "grad_norm": 2.4021215438842773, "learning_rate": 9.187640200210708e-05, "loss": 1.2859, "step": 920 }, { "epoch": 5.98051948051948, "grad_norm": 2.4222583770751953, "learning_rate": 9.185922825250974e-05, "loss": 1.3745, "step": 921 }, { "epoch": 5.987012987012987, "grad_norm": 1.9028486013412476, "learning_rate": 9.1842037977552e-05, "loss": 1.0603, "step": 922 }, { "epoch": 5.9935064935064934, "grad_norm": 2.0316622257232666, "learning_rate": 9.182483118402033e-05, "loss": 1.1392, "step": 923 }, { "epoch": 6.0, "grad_norm": 264.6945495605469, "learning_rate": 9.180760787870765e-05, "loss": 1.1341, "step": 924 }, { "epoch": 6.0064935064935066, "grad_norm": 1.8904297351837158, "learning_rate": 9.179036806841352e-05, "loss": 0.9749, "step": 925 }, { "epoch": 6.012987012987013, "grad_norm": 1.7428091764450073, "learning_rate": 9.17731117599439e-05, "loss": 0.857, "step": 926 }, { "epoch": 6.01948051948052, "grad_norm": 2.1916327476501465, "learning_rate": 9.175583896011131e-05, "loss": 1.1159, "step": 927 }, { "epoch": 6.025974025974026, "grad_norm": 2.009938955307007, "learning_rate": 9.173854967573479e-05, "loss": 0.985, "step": 928 }, { "epoch": 6.032467532467533, "grad_norm": 2.4173030853271484, "learning_rate": 9.172124391363985e-05, "loss": 1.1468, "step": 929 }, { "epoch": 6.038961038961039, "grad_norm": 2.157717227935791, "learning_rate": 9.170392168065857e-05, "loss": 1.0836, "step": 930 }, { "epoch": 6.045454545454546, "grad_norm": 1.9934571981430054, "learning_rate": 9.168658298362946e-05, "loss": 1.0066, "step": 931 }, { "epoch": 6.0519480519480515, "grad_norm": 2.120309352874756, "learning_rate": 9.166922782939758e-05, "loss": 1.1275, "step": 932 }, { "epoch": 6.058441558441558, "grad_norm": 2.1822173595428467, "learning_rate": 9.165185622481447e-05, "loss": 1.1083, "step": 933 }, { "epoch": 6.064935064935065, "grad_norm": 2.1950483322143555, "learning_rate": 9.163446817673817e-05, "loss": 1.0423, "step": 934 }, { "epoch": 6.071428571428571, "grad_norm": 2.231881856918335, "learning_rate": 9.161706369203317e-05, "loss": 1.086, "step": 935 }, { "epoch": 6.077922077922078, "grad_norm": 2.369540214538574, "learning_rate": 9.159964277757054e-05, "loss": 0.9978, "step": 936 }, { "epoch": 6.084415584415584, "grad_norm": 2.1250226497650146, "learning_rate": 9.158220544022773e-05, "loss": 0.9875, "step": 937 }, { "epoch": 6.090909090909091, "grad_norm": 2.21665096282959, "learning_rate": 9.156475168688877e-05, "loss": 1.0885, "step": 938 }, { "epoch": 6.097402597402597, "grad_norm": 1.7771512269973755, "learning_rate": 9.154728152444408e-05, "loss": 0.789, "step": 939 }, { "epoch": 6.103896103896104, "grad_norm": 2.3178699016571045, "learning_rate": 9.152979495979063e-05, "loss": 1.0956, "step": 940 }, { "epoch": 6.1103896103896105, "grad_norm": 2.534238576889038, "learning_rate": 9.151229199983184e-05, "loss": 1.092, "step": 941 }, { "epoch": 6.116883116883117, "grad_norm": 2.4788708686828613, "learning_rate": 9.14947726514776e-05, "loss": 1.1408, "step": 942 }, { "epoch": 6.123376623376624, "grad_norm": 2.188908576965332, "learning_rate": 9.147723692164427e-05, "loss": 1.0523, "step": 943 }, { "epoch": 6.12987012987013, "grad_norm": 1.940743327140808, "learning_rate": 9.145968481725467e-05, "loss": 0.8914, "step": 944 }, { "epoch": 6.136363636363637, "grad_norm": 2.2595057487487793, "learning_rate": 9.14421163452381e-05, "loss": 1.064, "step": 945 }, { "epoch": 6.142857142857143, "grad_norm": 2.3622570037841797, "learning_rate": 9.142453151253032e-05, "loss": 1.0703, "step": 946 }, { "epoch": 6.14935064935065, "grad_norm": 2.097458600997925, "learning_rate": 9.140693032607353e-05, "loss": 0.9451, "step": 947 }, { "epoch": 6.1558441558441555, "grad_norm": 2.064142942428589, "learning_rate": 9.138931279281639e-05, "loss": 0.9752, "step": 948 }, { "epoch": 6.162337662337662, "grad_norm": 2.3236732482910156, "learning_rate": 9.137167891971407e-05, "loss": 1.0357, "step": 949 }, { "epoch": 6.1688311688311686, "grad_norm": 2.375629186630249, "learning_rate": 9.135402871372808e-05, "loss": 1.1145, "step": 950 }, { "epoch": 6.175324675324675, "grad_norm": 2.575246572494507, "learning_rate": 9.13363621818265e-05, "loss": 1.2448, "step": 951 }, { "epoch": 6.181818181818182, "grad_norm": 2.221675395965576, "learning_rate": 9.131867933098378e-05, "loss": 1.0203, "step": 952 }, { "epoch": 6.188311688311688, "grad_norm": 2.4579875469207764, "learning_rate": 9.13009801681808e-05, "loss": 1.1714, "step": 953 }, { "epoch": 6.194805194805195, "grad_norm": 2.4114532470703125, "learning_rate": 9.128326470040495e-05, "loss": 1.0732, "step": 954 }, { "epoch": 6.201298701298701, "grad_norm": 2.374720335006714, "learning_rate": 9.126553293464998e-05, "loss": 1.0896, "step": 955 }, { "epoch": 6.207792207792208, "grad_norm": 2.4675614833831787, "learning_rate": 9.124778487791615e-05, "loss": 1.1355, "step": 956 }, { "epoch": 6.214285714285714, "grad_norm": 2.0805985927581787, "learning_rate": 9.123002053721005e-05, "loss": 1.0721, "step": 957 }, { "epoch": 6.220779220779221, "grad_norm": 2.3115074634552, "learning_rate": 9.121223991954484e-05, "loss": 1.0833, "step": 958 }, { "epoch": 6.2272727272727275, "grad_norm": 2.4905924797058105, "learning_rate": 9.119444303193996e-05, "loss": 1.0787, "step": 959 }, { "epoch": 6.233766233766234, "grad_norm": 2.3526439666748047, "learning_rate": 9.117662988142138e-05, "loss": 1.1503, "step": 960 }, { "epoch": 6.240259740259741, "grad_norm": 2.615757703781128, "learning_rate": 9.115880047502142e-05, "loss": 1.1554, "step": 961 }, { "epoch": 6.246753246753247, "grad_norm": 2.563284158706665, "learning_rate": 9.114095481977888e-05, "loss": 1.3076, "step": 962 }, { "epoch": 6.253246753246753, "grad_norm": 2.29913592338562, "learning_rate": 9.112309292273891e-05, "loss": 1.1206, "step": 963 }, { "epoch": 6.259740259740259, "grad_norm": 2.1598641872406006, "learning_rate": 9.110521479095312e-05, "loss": 1.0892, "step": 964 }, { "epoch": 6.266233766233766, "grad_norm": 2.0765597820281982, "learning_rate": 9.108732043147952e-05, "loss": 1.013, "step": 965 }, { "epoch": 6.2727272727272725, "grad_norm": 2.281052589416504, "learning_rate": 9.10694098513825e-05, "loss": 1.1901, "step": 966 }, { "epoch": 6.279220779220779, "grad_norm": 2.169802665710449, "learning_rate": 9.10514830577329e-05, "loss": 1.0905, "step": 967 }, { "epoch": 6.285714285714286, "grad_norm": 2.2257354259490967, "learning_rate": 9.103354005760791e-05, "loss": 1.0368, "step": 968 }, { "epoch": 6.292207792207792, "grad_norm": 2.169473648071289, "learning_rate": 9.101558085809114e-05, "loss": 1.0166, "step": 969 }, { "epoch": 6.298701298701299, "grad_norm": 2.37520694732666, "learning_rate": 9.099760546627261e-05, "loss": 1.1822, "step": 970 }, { "epoch": 6.305194805194805, "grad_norm": 2.1943604946136475, "learning_rate": 9.097961388924873e-05, "loss": 1.1304, "step": 971 }, { "epoch": 6.311688311688312, "grad_norm": 2.553886651992798, "learning_rate": 9.096160613412228e-05, "loss": 1.3013, "step": 972 }, { "epoch": 6.318181818181818, "grad_norm": 2.3251588344573975, "learning_rate": 9.094358220800243e-05, "loss": 1.0648, "step": 973 }, { "epoch": 6.324675324675325, "grad_norm": 2.508411407470703, "learning_rate": 9.092554211800474e-05, "loss": 1.1993, "step": 974 }, { "epoch": 6.3311688311688314, "grad_norm": 2.10422682762146, "learning_rate": 9.090748587125118e-05, "loss": 1.1125, "step": 975 }, { "epoch": 6.337662337662338, "grad_norm": 2.246373176574707, "learning_rate": 9.088941347487003e-05, "loss": 1.1525, "step": 976 }, { "epoch": 6.3441558441558445, "grad_norm": 2.19040846824646, "learning_rate": 9.0871324935996e-05, "loss": 1.0981, "step": 977 }, { "epoch": 6.35064935064935, "grad_norm": 2.4398250579833984, "learning_rate": 9.085322026177017e-05, "loss": 1.2318, "step": 978 }, { "epoch": 6.357142857142857, "grad_norm": 2.36503267288208, "learning_rate": 9.083509945933997e-05, "loss": 1.1359, "step": 979 }, { "epoch": 6.363636363636363, "grad_norm": 2.545269012451172, "learning_rate": 9.081696253585921e-05, "loss": 1.0359, "step": 980 }, { "epoch": 6.37012987012987, "grad_norm": 2.456841468811035, "learning_rate": 9.079880949848805e-05, "loss": 1.1641, "step": 981 }, { "epoch": 6.376623376623376, "grad_norm": 2.2646665573120117, "learning_rate": 9.078064035439301e-05, "loss": 1.1425, "step": 982 }, { "epoch": 6.383116883116883, "grad_norm": 2.244145154953003, "learning_rate": 9.076245511074703e-05, "loss": 1.0961, "step": 983 }, { "epoch": 6.3896103896103895, "grad_norm": 2.3785531520843506, "learning_rate": 9.074425377472931e-05, "loss": 1.1008, "step": 984 }, { "epoch": 6.396103896103896, "grad_norm": 2.486354351043701, "learning_rate": 9.072603635352548e-05, "loss": 1.1167, "step": 985 }, { "epoch": 6.402597402597403, "grad_norm": 2.5050578117370605, "learning_rate": 9.070780285432745e-05, "loss": 1.1885, "step": 986 }, { "epoch": 6.409090909090909, "grad_norm": 2.260310649871826, "learning_rate": 9.068955328433355e-05, "loss": 1.1354, "step": 987 }, { "epoch": 6.415584415584416, "grad_norm": 2.1151485443115234, "learning_rate": 9.067128765074842e-05, "loss": 1.0162, "step": 988 }, { "epoch": 6.422077922077922, "grad_norm": 2.35497784614563, "learning_rate": 9.065300596078303e-05, "loss": 1.0928, "step": 989 }, { "epoch": 6.428571428571429, "grad_norm": 2.429023504257202, "learning_rate": 9.06347082216547e-05, "loss": 1.1322, "step": 990 }, { "epoch": 6.435064935064935, "grad_norm": 2.556748867034912, "learning_rate": 9.06163944405871e-05, "loss": 1.208, "step": 991 }, { "epoch": 6.441558441558442, "grad_norm": 2.451526403427124, "learning_rate": 9.059806462481023e-05, "loss": 1.0806, "step": 992 }, { "epoch": 6.448051948051948, "grad_norm": 2.4315357208251953, "learning_rate": 9.057971878156036e-05, "loss": 1.2842, "step": 993 }, { "epoch": 6.454545454545454, "grad_norm": 2.3418030738830566, "learning_rate": 9.056135691808019e-05, "loss": 1.1049, "step": 994 }, { "epoch": 6.461038961038961, "grad_norm": 2.454900026321411, "learning_rate": 9.054297904161868e-05, "loss": 1.2332, "step": 995 }, { "epoch": 6.467532467532467, "grad_norm": 2.3480823040008545, "learning_rate": 9.052458515943111e-05, "loss": 1.0631, "step": 996 }, { "epoch": 6.474025974025974, "grad_norm": 2.2208402156829834, "learning_rate": 9.050617527877911e-05, "loss": 1.1265, "step": 997 }, { "epoch": 6.48051948051948, "grad_norm": 2.285851240158081, "learning_rate": 9.048774940693062e-05, "loss": 1.0793, "step": 998 }, { "epoch": 6.487012987012987, "grad_norm": 1.9776757955551147, "learning_rate": 9.046930755115985e-05, "loss": 0.8993, "step": 999 }, { "epoch": 6.4935064935064934, "grad_norm": 2.425506591796875, "learning_rate": 9.045084971874738e-05, "loss": 1.183, "step": 1000 }, { "epoch": 6.5, "grad_norm": 2.349262237548828, "learning_rate": 9.043237591698004e-05, "loss": 1.1151, "step": 1001 }, { "epoch": 6.5064935064935066, "grad_norm": 2.294227123260498, "learning_rate": 9.041388615315102e-05, "loss": 1.1007, "step": 1002 }, { "epoch": 6.512987012987013, "grad_norm": 2.1493442058563232, "learning_rate": 9.03953804345598e-05, "loss": 1.0778, "step": 1003 }, { "epoch": 6.51948051948052, "grad_norm": 2.619725227355957, "learning_rate": 9.03768587685121e-05, "loss": 1.2408, "step": 1004 }, { "epoch": 6.525974025974026, "grad_norm": 2.156489372253418, "learning_rate": 9.035832116232001e-05, "loss": 1.0415, "step": 1005 }, { "epoch": 6.532467532467533, "grad_norm": 2.3598475456237793, "learning_rate": 9.03397676233019e-05, "loss": 1.1308, "step": 1006 }, { "epoch": 6.538961038961039, "grad_norm": 2.3119537830352783, "learning_rate": 9.032119815878236e-05, "loss": 1.1402, "step": 1007 }, { "epoch": 6.545454545454545, "grad_norm": 2.459968328475952, "learning_rate": 9.030261277609236e-05, "loss": 1.2053, "step": 1008 }, { "epoch": 6.551948051948052, "grad_norm": 2.4004056453704834, "learning_rate": 9.02840114825691e-05, "loss": 1.2789, "step": 1009 }, { "epoch": 6.558441558441558, "grad_norm": 2.6905951499938965, "learning_rate": 9.02653942855561e-05, "loss": 1.1713, "step": 1010 }, { "epoch": 6.564935064935065, "grad_norm": 2.306800365447998, "learning_rate": 9.024676119240311e-05, "loss": 1.1418, "step": 1011 }, { "epoch": 6.571428571428571, "grad_norm": 2.355865001678467, "learning_rate": 9.022811221046618e-05, "loss": 1.1373, "step": 1012 }, { "epoch": 6.577922077922078, "grad_norm": 2.5856380462646484, "learning_rate": 9.020944734710766e-05, "loss": 1.2152, "step": 1013 }, { "epoch": 6.584415584415584, "grad_norm": 2.4231441020965576, "learning_rate": 9.01907666096961e-05, "loss": 1.0314, "step": 1014 }, { "epoch": 6.590909090909091, "grad_norm": 2.426198720932007, "learning_rate": 9.017207000560638e-05, "loss": 1.3291, "step": 1015 }, { "epoch": 6.597402597402597, "grad_norm": 2.3727035522460938, "learning_rate": 9.015335754221964e-05, "loss": 1.2275, "step": 1016 }, { "epoch": 6.603896103896104, "grad_norm": 2.151705265045166, "learning_rate": 9.013462922692324e-05, "loss": 1.0354, "step": 1017 }, { "epoch": 6.6103896103896105, "grad_norm": 2.015474319458008, "learning_rate": 9.011588506711083e-05, "loss": 0.9932, "step": 1018 }, { "epoch": 6.616883116883117, "grad_norm": 2.2131268978118896, "learning_rate": 9.009712507018231e-05, "loss": 1.1619, "step": 1019 }, { "epoch": 6.623376623376624, "grad_norm": 2.402639865875244, "learning_rate": 9.007834924354383e-05, "loss": 1.2321, "step": 1020 }, { "epoch": 6.62987012987013, "grad_norm": 2.2011077404022217, "learning_rate": 9.005955759460779e-05, "loss": 1.0755, "step": 1021 }, { "epoch": 6.636363636363637, "grad_norm": 2.4937143325805664, "learning_rate": 9.004075013079283e-05, "loss": 1.1842, "step": 1022 }, { "epoch": 6.642857142857143, "grad_norm": 2.269645929336548, "learning_rate": 9.002192685952385e-05, "loss": 1.0566, "step": 1023 }, { "epoch": 6.64935064935065, "grad_norm": 2.230419874191284, "learning_rate": 9.000308778823195e-05, "loss": 1.0918, "step": 1024 }, { "epoch": 6.6558441558441555, "grad_norm": 2.2211496829986572, "learning_rate": 8.998423292435454e-05, "loss": 1.0644, "step": 1025 }, { "epoch": 6.662337662337662, "grad_norm": 2.4004454612731934, "learning_rate": 8.996536227533519e-05, "loss": 1.21, "step": 1026 }, { "epoch": 6.6688311688311686, "grad_norm": 2.172208070755005, "learning_rate": 8.994647584862374e-05, "loss": 1.1403, "step": 1027 }, { "epoch": 6.675324675324675, "grad_norm": 2.2280516624450684, "learning_rate": 8.992757365167626e-05, "loss": 1.1776, "step": 1028 }, { "epoch": 6.681818181818182, "grad_norm": 2.2168164253234863, "learning_rate": 8.990865569195502e-05, "loss": 1.0141, "step": 1029 }, { "epoch": 6.688311688311688, "grad_norm": 2.4428412914276123, "learning_rate": 8.988972197692855e-05, "loss": 1.2419, "step": 1030 }, { "epoch": 6.694805194805195, "grad_norm": 2.5520572662353516, "learning_rate": 8.987077251407158e-05, "loss": 1.2397, "step": 1031 }, { "epoch": 6.701298701298701, "grad_norm": 2.211339235305786, "learning_rate": 8.985180731086505e-05, "loss": 1.0824, "step": 1032 }, { "epoch": 6.707792207792208, "grad_norm": 1.9707832336425781, "learning_rate": 8.983282637479614e-05, "loss": 1.0771, "step": 1033 }, { "epoch": 6.714285714285714, "grad_norm": 2.4689748287200928, "learning_rate": 8.981382971335819e-05, "loss": 1.1662, "step": 1034 }, { "epoch": 6.720779220779221, "grad_norm": 2.417175769805908, "learning_rate": 8.97948173340508e-05, "loss": 1.1183, "step": 1035 }, { "epoch": 6.7272727272727275, "grad_norm": 2.415498733520508, "learning_rate": 8.977578924437974e-05, "loss": 1.1773, "step": 1036 }, { "epoch": 6.733766233766234, "grad_norm": 2.1574041843414307, "learning_rate": 8.975674545185703e-05, "loss": 1.035, "step": 1037 }, { "epoch": 6.740259740259741, "grad_norm": 2.415621042251587, "learning_rate": 8.973768596400085e-05, "loss": 1.2409, "step": 1038 }, { "epoch": 6.746753246753247, "grad_norm": 2.438495635986328, "learning_rate": 8.971861078833557e-05, "loss": 1.1134, "step": 1039 }, { "epoch": 6.753246753246753, "grad_norm": 2.4584386348724365, "learning_rate": 8.969951993239177e-05, "loss": 1.105, "step": 1040 }, { "epoch": 6.759740259740259, "grad_norm": 2.286853313446045, "learning_rate": 8.968041340370621e-05, "loss": 1.1215, "step": 1041 }, { "epoch": 6.766233766233766, "grad_norm": 2.5318830013275146, "learning_rate": 8.966129120982188e-05, "loss": 1.2174, "step": 1042 }, { "epoch": 6.7727272727272725, "grad_norm": 2.14926815032959, "learning_rate": 8.964215335828787e-05, "loss": 1.0082, "step": 1043 }, { "epoch": 6.779220779220779, "grad_norm": 2.270693302154541, "learning_rate": 8.962299985665953e-05, "loss": 1.1778, "step": 1044 }, { "epoch": 6.785714285714286, "grad_norm": 2.420628547668457, "learning_rate": 8.960383071249836e-05, "loss": 1.1542, "step": 1045 }, { "epoch": 6.792207792207792, "grad_norm": 2.379152297973633, "learning_rate": 8.958464593337202e-05, "loss": 1.2034, "step": 1046 }, { "epoch": 6.798701298701299, "grad_norm": 2.2058722972869873, "learning_rate": 8.956544552685437e-05, "loss": 1.0938, "step": 1047 }, { "epoch": 6.805194805194805, "grad_norm": 2.075653553009033, "learning_rate": 8.954622950052542e-05, "loss": 1.0615, "step": 1048 }, { "epoch": 6.811688311688312, "grad_norm": 2.1014020442962646, "learning_rate": 8.952699786197137e-05, "loss": 1.0083, "step": 1049 }, { "epoch": 6.818181818181818, "grad_norm": 2.262732982635498, "learning_rate": 8.950775061878453e-05, "loss": 1.1361, "step": 1050 }, { "epoch": 6.824675324675325, "grad_norm": 2.087364435195923, "learning_rate": 8.948848777856343e-05, "loss": 1.0599, "step": 1051 }, { "epoch": 6.8311688311688314, "grad_norm": 2.4589810371398926, "learning_rate": 8.946920934891274e-05, "loss": 1.1639, "step": 1052 }, { "epoch": 6.837662337662338, "grad_norm": 2.136878490447998, "learning_rate": 8.944991533744326e-05, "loss": 1.0882, "step": 1053 }, { "epoch": 6.8441558441558445, "grad_norm": 2.4780335426330566, "learning_rate": 8.943060575177197e-05, "loss": 1.2518, "step": 1054 }, { "epoch": 6.85064935064935, "grad_norm": 2.233759641647339, "learning_rate": 8.941128059952201e-05, "loss": 1.1386, "step": 1055 }, { "epoch": 6.857142857142857, "grad_norm": 2.1583468914031982, "learning_rate": 8.93919398883226e-05, "loss": 1.0189, "step": 1056 }, { "epoch": 6.863636363636363, "grad_norm": 2.2025363445281982, "learning_rate": 8.937258362580919e-05, "loss": 1.0769, "step": 1057 }, { "epoch": 6.87012987012987, "grad_norm": 2.1733157634735107, "learning_rate": 8.93532118196233e-05, "loss": 1.0378, "step": 1058 }, { "epoch": 6.876623376623376, "grad_norm": 2.4087839126586914, "learning_rate": 8.93338244774126e-05, "loss": 1.1531, "step": 1059 }, { "epoch": 6.883116883116883, "grad_norm": 2.359661340713501, "learning_rate": 8.931442160683094e-05, "loss": 1.0614, "step": 1060 }, { "epoch": 6.8896103896103895, "grad_norm": 2.222322940826416, "learning_rate": 8.929500321553826e-05, "loss": 1.0687, "step": 1061 }, { "epoch": 6.896103896103896, "grad_norm": 2.2607553005218506, "learning_rate": 8.92755693112006e-05, "loss": 1.2072, "step": 1062 }, { "epoch": 6.902597402597403, "grad_norm": 2.3558382987976074, "learning_rate": 8.925611990149021e-05, "loss": 1.1749, "step": 1063 }, { "epoch": 6.909090909090909, "grad_norm": 2.3526875972747803, "learning_rate": 8.923665499408536e-05, "loss": 1.193, "step": 1064 }, { "epoch": 6.915584415584416, "grad_norm": 2.508218288421631, "learning_rate": 8.92171745966705e-05, "loss": 1.3347, "step": 1065 }, { "epoch": 6.922077922077922, "grad_norm": 1.8409452438354492, "learning_rate": 8.91976787169362e-05, "loss": 0.8723, "step": 1066 }, { "epoch": 6.928571428571429, "grad_norm": 2.0151336193084717, "learning_rate": 8.917816736257912e-05, "loss": 0.952, "step": 1067 }, { "epoch": 6.935064935064935, "grad_norm": 1.8350368738174438, "learning_rate": 8.915864054130204e-05, "loss": 0.8401, "step": 1068 }, { "epoch": 6.941558441558442, "grad_norm": 2.169405698776245, "learning_rate": 8.91390982608138e-05, "loss": 1.1113, "step": 1069 }, { "epoch": 6.948051948051948, "grad_norm": 2.3240628242492676, "learning_rate": 8.91195405288294e-05, "loss": 1.1703, "step": 1070 }, { "epoch": 6.954545454545455, "grad_norm": 2.274879217147827, "learning_rate": 8.909996735306996e-05, "loss": 1.1955, "step": 1071 }, { "epoch": 6.961038961038961, "grad_norm": 2.3772335052490234, "learning_rate": 8.908037874126263e-05, "loss": 1.2593, "step": 1072 }, { "epoch": 6.967532467532467, "grad_norm": 2.1066665649414062, "learning_rate": 8.906077470114069e-05, "loss": 1.0618, "step": 1073 }, { "epoch": 6.974025974025974, "grad_norm": 2.411353826522827, "learning_rate": 8.904115524044348e-05, "loss": 1.2221, "step": 1074 }, { "epoch": 6.98051948051948, "grad_norm": 2.37351131439209, "learning_rate": 8.90215203669165e-05, "loss": 1.1621, "step": 1075 }, { "epoch": 6.987012987012987, "grad_norm": 2.0856990814208984, "learning_rate": 8.900187008831125e-05, "loss": 1.0735, "step": 1076 }, { "epoch": 6.9935064935064934, "grad_norm": 2.174851179122925, "learning_rate": 8.898220441238534e-05, "loss": 1.1343, "step": 1077 }, { "epoch": 7.0, "grad_norm": 17985.041015625, "learning_rate": 8.896252334690251e-05, "loss": 1.1035, "step": 1078 }, { "epoch": 7.0064935064935066, "grad_norm": 1.939652442932129, "learning_rate": 8.894282689963251e-05, "loss": 0.8732, "step": 1079 }, { "epoch": 7.012987012987013, "grad_norm": 2.128493547439575, "learning_rate": 8.892311507835119e-05, "loss": 0.9418, "step": 1080 }, { "epoch": 7.01948051948052, "grad_norm": 2.4166202545166016, "learning_rate": 8.890338789084045e-05, "loss": 0.9604, "step": 1081 }, { "epoch": 7.025974025974026, "grad_norm": 1.8071622848510742, "learning_rate": 8.888364534488827e-05, "loss": 0.8387, "step": 1082 }, { "epoch": 7.032467532467533, "grad_norm": 2.023660182952881, "learning_rate": 8.886388744828872e-05, "loss": 0.9013, "step": 1083 }, { "epoch": 7.038961038961039, "grad_norm": 2.2689034938812256, "learning_rate": 8.884411420884189e-05, "loss": 0.986, "step": 1084 }, { "epoch": 7.045454545454546, "grad_norm": 2.1169872283935547, "learning_rate": 8.882432563435393e-05, "loss": 0.9093, "step": 1085 }, { "epoch": 7.0519480519480515, "grad_norm": 2.1221537590026855, "learning_rate": 8.880452173263709e-05, "loss": 0.9582, "step": 1086 }, { "epoch": 7.058441558441558, "grad_norm": 2.1945886611938477, "learning_rate": 8.87847025115096e-05, "loss": 1.022, "step": 1087 }, { "epoch": 7.064935064935065, "grad_norm": 2.1672163009643555, "learning_rate": 8.876486797879579e-05, "loss": 0.9379, "step": 1088 }, { "epoch": 7.071428571428571, "grad_norm": 2.1163387298583984, "learning_rate": 8.874501814232603e-05, "loss": 0.9828, "step": 1089 }, { "epoch": 7.077922077922078, "grad_norm": 1.9327890872955322, "learning_rate": 8.872515300993669e-05, "loss": 0.8449, "step": 1090 }, { "epoch": 7.084415584415584, "grad_norm": 2.355165958404541, "learning_rate": 8.870527258947024e-05, "loss": 1.016, "step": 1091 }, { "epoch": 7.090909090909091, "grad_norm": 2.0632739067077637, "learning_rate": 8.868537688877516e-05, "loss": 0.8259, "step": 1092 }, { "epoch": 7.097402597402597, "grad_norm": 2.1498799324035645, "learning_rate": 8.866546591570592e-05, "loss": 0.8963, "step": 1093 }, { "epoch": 7.103896103896104, "grad_norm": 2.1312801837921143, "learning_rate": 8.864553967812309e-05, "loss": 0.8811, "step": 1094 }, { "epoch": 7.1103896103896105, "grad_norm": 2.255749464035034, "learning_rate": 8.862559818389322e-05, "loss": 0.9756, "step": 1095 }, { "epoch": 7.116883116883117, "grad_norm": 2.2630326747894287, "learning_rate": 8.860564144088891e-05, "loss": 0.9027, "step": 1096 }, { "epoch": 7.123376623376624, "grad_norm": 2.323734998703003, "learning_rate": 8.858566945698874e-05, "loss": 1.078, "step": 1097 }, { "epoch": 7.12987012987013, "grad_norm": 2.2903385162353516, "learning_rate": 8.856568224007736e-05, "loss": 0.8821, "step": 1098 }, { "epoch": 7.136363636363637, "grad_norm": 2.2442169189453125, "learning_rate": 8.854567979804538e-05, "loss": 0.9962, "step": 1099 }, { "epoch": 7.142857142857143, "grad_norm": 2.57180118560791, "learning_rate": 8.852566213878947e-05, "loss": 1.104, "step": 1100 }, { "epoch": 7.14935064935065, "grad_norm": 2.2986552715301514, "learning_rate": 8.850562927021227e-05, "loss": 0.9048, "step": 1101 }, { "epoch": 7.1558441558441555, "grad_norm": 2.231067419052124, "learning_rate": 8.848558120022246e-05, "loss": 0.9655, "step": 1102 }, { "epoch": 7.162337662337662, "grad_norm": 5.123160362243652, "learning_rate": 8.846551793673467e-05, "loss": 1.0098, "step": 1103 }, { "epoch": 7.1688311688311686, "grad_norm": 2.8940987586975098, "learning_rate": 8.844543948766958e-05, "loss": 0.8868, "step": 1104 }, { "epoch": 7.175324675324675, "grad_norm": 2.3510234355926514, "learning_rate": 8.842534586095383e-05, "loss": 0.9608, "step": 1105 }, { "epoch": 7.181818181818182, "grad_norm": 2.365297317504883, "learning_rate": 8.840523706452009e-05, "loss": 1.0061, "step": 1106 }, { "epoch": 7.188311688311688, "grad_norm": 2.305508852005005, "learning_rate": 8.838511310630697e-05, "loss": 0.9354, "step": 1107 }, { "epoch": 7.194805194805195, "grad_norm": 2.5390188694000244, "learning_rate": 8.83649739942591e-05, "loss": 1.0172, "step": 1108 }, { "epoch": 7.201298701298701, "grad_norm": 2.373601198196411, "learning_rate": 8.834481973632708e-05, "loss": 1.0023, "step": 1109 }, { "epoch": 7.207792207792208, "grad_norm": 2.274458408355713, "learning_rate": 8.832465034046749e-05, "loss": 0.9513, "step": 1110 }, { "epoch": 7.214285714285714, "grad_norm": 1.957666039466858, "learning_rate": 8.83044658146429e-05, "loss": 0.7818, "step": 1111 }, { "epoch": 7.220779220779221, "grad_norm": 2.1820261478424072, "learning_rate": 8.828426616682183e-05, "loss": 0.842, "step": 1112 }, { "epoch": 7.2272727272727275, "grad_norm": 2.6345229148864746, "learning_rate": 8.826405140497878e-05, "loss": 1.0009, "step": 1113 }, { "epoch": 7.233766233766234, "grad_norm": 2.6103906631469727, "learning_rate": 8.824382153709421e-05, "loss": 0.9493, "step": 1114 }, { "epoch": 7.240259740259741, "grad_norm": 2.243248701095581, "learning_rate": 8.822357657115459e-05, "loss": 0.9307, "step": 1115 }, { "epoch": 7.246753246753247, "grad_norm": 2.402841091156006, "learning_rate": 8.820331651515226e-05, "loss": 1.0666, "step": 1116 }, { "epoch": 7.253246753246753, "grad_norm": 2.3419620990753174, "learning_rate": 8.81830413770856e-05, "loss": 0.9186, "step": 1117 }, { "epoch": 7.259740259740259, "grad_norm": 2.1303787231445312, "learning_rate": 8.816275116495892e-05, "loss": 0.9052, "step": 1118 }, { "epoch": 7.266233766233766, "grad_norm": 2.5103187561035156, "learning_rate": 8.814244588678245e-05, "loss": 1.0406, "step": 1119 }, { "epoch": 7.2727272727272725, "grad_norm": 2.2306487560272217, "learning_rate": 8.81221255505724e-05, "loss": 0.8912, "step": 1120 }, { "epoch": 7.279220779220779, "grad_norm": 2.2726263999938965, "learning_rate": 8.810179016435092e-05, "loss": 0.9709, "step": 1121 }, { "epoch": 7.285714285714286, "grad_norm": 2.5551421642303467, "learning_rate": 8.808143973614611e-05, "loss": 1.0897, "step": 1122 }, { "epoch": 7.292207792207792, "grad_norm": 2.1128101348876953, "learning_rate": 8.806107427399197e-05, "loss": 0.8943, "step": 1123 }, { "epoch": 7.298701298701299, "grad_norm": 2.3904340267181396, "learning_rate": 8.804069378592849e-05, "loss": 0.9773, "step": 1124 }, { "epoch": 7.305194805194805, "grad_norm": 1.9445910453796387, "learning_rate": 8.802029828000156e-05, "loss": 0.7842, "step": 1125 }, { "epoch": 7.311688311688312, "grad_norm": 2.3209996223449707, "learning_rate": 8.799988776426298e-05, "loss": 0.989, "step": 1126 }, { "epoch": 7.318181818181818, "grad_norm": 2.2647881507873535, "learning_rate": 8.797946224677052e-05, "loss": 0.9334, "step": 1127 }, { "epoch": 7.324675324675325, "grad_norm": 2.4576051235198975, "learning_rate": 8.795902173558783e-05, "loss": 1.0929, "step": 1128 }, { "epoch": 7.3311688311688314, "grad_norm": 2.433586597442627, "learning_rate": 8.793856623878452e-05, "loss": 1.0337, "step": 1129 }, { "epoch": 7.337662337662338, "grad_norm": 2.304011821746826, "learning_rate": 8.79180957644361e-05, "loss": 0.9771, "step": 1130 }, { "epoch": 7.3441558441558445, "grad_norm": 2.250781536102295, "learning_rate": 8.789761032062397e-05, "loss": 1.0171, "step": 1131 }, { "epoch": 7.35064935064935, "grad_norm": 2.306729555130005, "learning_rate": 8.787710991543548e-05, "loss": 0.8904, "step": 1132 }, { "epoch": 7.357142857142857, "grad_norm": 2.406986951828003, "learning_rate": 8.785659455696384e-05, "loss": 1.0551, "step": 1133 }, { "epoch": 7.363636363636363, "grad_norm": 2.177974224090576, "learning_rate": 8.783606425330819e-05, "loss": 0.9901, "step": 1134 }, { "epoch": 7.37012987012987, "grad_norm": 2.277726650238037, "learning_rate": 8.78155190125736e-05, "loss": 1.0153, "step": 1135 }, { "epoch": 7.376623376623376, "grad_norm": 2.368788480758667, "learning_rate": 8.7794958842871e-05, "loss": 0.9903, "step": 1136 }, { "epoch": 7.383116883116883, "grad_norm": 2.5523831844329834, "learning_rate": 8.777438375231717e-05, "loss": 1.0406, "step": 1137 }, { "epoch": 7.3896103896103895, "grad_norm": 2.339541435241699, "learning_rate": 8.775379374903487e-05, "loss": 1.0202, "step": 1138 }, { "epoch": 7.396103896103896, "grad_norm": 2.353381633758545, "learning_rate": 8.773318884115273e-05, "loss": 0.9538, "step": 1139 }, { "epoch": 7.402597402597403, "grad_norm": 2.490757703781128, "learning_rate": 8.771256903680519e-05, "loss": 1.0989, "step": 1140 }, { "epoch": 7.409090909090909, "grad_norm": 2.4964358806610107, "learning_rate": 8.769193434413266e-05, "loss": 1.0535, "step": 1141 }, { "epoch": 7.415584415584416, "grad_norm": 2.534123182296753, "learning_rate": 8.767128477128137e-05, "loss": 1.008, "step": 1142 }, { "epoch": 7.422077922077922, "grad_norm": 2.3385679721832275, "learning_rate": 8.765062032640346e-05, "loss": 1.0306, "step": 1143 }, { "epoch": 7.428571428571429, "grad_norm": 2.352762222290039, "learning_rate": 8.76299410176569e-05, "loss": 1.037, "step": 1144 }, { "epoch": 7.435064935064935, "grad_norm": 2.549403429031372, "learning_rate": 8.760924685320557e-05, "loss": 1.0304, "step": 1145 }, { "epoch": 7.441558441558442, "grad_norm": 2.2719662189483643, "learning_rate": 8.758853784121921e-05, "loss": 0.9224, "step": 1146 }, { "epoch": 7.448051948051948, "grad_norm": 2.1252450942993164, "learning_rate": 8.75678139898734e-05, "loss": 0.8933, "step": 1147 }, { "epoch": 7.454545454545454, "grad_norm": 2.4617741107940674, "learning_rate": 8.754707530734957e-05, "loss": 1.1581, "step": 1148 }, { "epoch": 7.461038961038961, "grad_norm": 2.141832113265991, "learning_rate": 8.752632180183505e-05, "loss": 0.9494, "step": 1149 }, { "epoch": 7.467532467532467, "grad_norm": 2.639061212539673, "learning_rate": 8.750555348152298e-05, "loss": 1.0582, "step": 1150 }, { "epoch": 7.474025974025974, "grad_norm": 1.9012668132781982, "learning_rate": 8.748477035461238e-05, "loss": 0.8111, "step": 1151 }, { "epoch": 7.48051948051948, "grad_norm": 2.2771406173706055, "learning_rate": 8.746397242930807e-05, "loss": 0.9934, "step": 1152 }, { "epoch": 7.487012987012987, "grad_norm": 2.5009937286376953, "learning_rate": 8.744315971382078e-05, "loss": 1.0519, "step": 1153 }, { "epoch": 7.4935064935064934, "grad_norm": 2.192047595977783, "learning_rate": 8.7422332216367e-05, "loss": 0.9727, "step": 1154 }, { "epoch": 7.5, "grad_norm": 2.1458213329315186, "learning_rate": 8.740148994516912e-05, "loss": 0.9318, "step": 1155 }, { "epoch": 7.5064935064935066, "grad_norm": 2.565033435821533, "learning_rate": 8.738063290845535e-05, "loss": 1.0677, "step": 1156 }, { "epoch": 7.512987012987013, "grad_norm": 2.0971720218658447, "learning_rate": 8.73597611144597e-05, "loss": 0.954, "step": 1157 }, { "epoch": 7.51948051948052, "grad_norm": 2.1589765548706055, "learning_rate": 8.733887457142202e-05, "loss": 1.0059, "step": 1158 }, { "epoch": 7.525974025974026, "grad_norm": 2.1708037853240967, "learning_rate": 8.7317973287588e-05, "loss": 1.0171, "step": 1159 }, { "epoch": 7.532467532467533, "grad_norm": 2.249652624130249, "learning_rate": 8.729705727120911e-05, "loss": 1.043, "step": 1160 }, { "epoch": 7.538961038961039, "grad_norm": 2.2765090465545654, "learning_rate": 8.727612653054269e-05, "loss": 0.9805, "step": 1161 }, { "epoch": 7.545454545454545, "grad_norm": 2.298006057739258, "learning_rate": 8.725518107385187e-05, "loss": 1.0361, "step": 1162 }, { "epoch": 7.551948051948052, "grad_norm": 2.2330729961395264, "learning_rate": 8.723422090940555e-05, "loss": 0.9749, "step": 1163 }, { "epoch": 7.558441558441558, "grad_norm": 2.1578798294067383, "learning_rate": 8.72132460454785e-05, "loss": 1.029, "step": 1164 }, { "epoch": 7.564935064935065, "grad_norm": 1.9553289413452148, "learning_rate": 8.719225649035126e-05, "loss": 0.7988, "step": 1165 }, { "epoch": 7.571428571428571, "grad_norm": 2.2159762382507324, "learning_rate": 8.717125225231017e-05, "loss": 1.062, "step": 1166 }, { "epoch": 7.577922077922078, "grad_norm": 1.8637006282806396, "learning_rate": 8.715023333964736e-05, "loss": 0.8109, "step": 1167 }, { "epoch": 7.584415584415584, "grad_norm": 2.314709186553955, "learning_rate": 8.712919976066077e-05, "loss": 0.9619, "step": 1168 }, { "epoch": 7.590909090909091, "grad_norm": 2.3416078090667725, "learning_rate": 8.710815152365415e-05, "loss": 1.0572, "step": 1169 }, { "epoch": 7.597402597402597, "grad_norm": 2.2990005016326904, "learning_rate": 8.708708863693697e-05, "loss": 0.9534, "step": 1170 }, { "epoch": 7.603896103896104, "grad_norm": 2.4119932651519775, "learning_rate": 8.706601110882454e-05, "loss": 1.0283, "step": 1171 }, { "epoch": 7.6103896103896105, "grad_norm": 2.183415174484253, "learning_rate": 8.704491894763794e-05, "loss": 0.9208, "step": 1172 }, { "epoch": 7.616883116883117, "grad_norm": 2.2732317447662354, "learning_rate": 8.702381216170403e-05, "loss": 0.9305, "step": 1173 }, { "epoch": 7.623376623376624, "grad_norm": 2.3167757987976074, "learning_rate": 8.700269075935541e-05, "loss": 0.9895, "step": 1174 }, { "epoch": 7.62987012987013, "grad_norm": 2.2215218544006348, "learning_rate": 8.69815547489305e-05, "loss": 1.0183, "step": 1175 }, { "epoch": 7.636363636363637, "grad_norm": 2.470303773880005, "learning_rate": 8.696040413877344e-05, "loss": 1.0394, "step": 1176 }, { "epoch": 7.642857142857143, "grad_norm": 2.379303216934204, "learning_rate": 8.693923893723416e-05, "loss": 0.9527, "step": 1177 }, { "epoch": 7.64935064935065, "grad_norm": 2.383970022201538, "learning_rate": 8.691805915266836e-05, "loss": 0.959, "step": 1178 }, { "epoch": 7.6558441558441555, "grad_norm": 2.301018238067627, "learning_rate": 8.689686479343747e-05, "loss": 0.963, "step": 1179 }, { "epoch": 7.662337662337662, "grad_norm": 2.6751325130462646, "learning_rate": 8.68756558679087e-05, "loss": 1.1706, "step": 1180 }, { "epoch": 7.6688311688311686, "grad_norm": 2.561551094055176, "learning_rate": 8.685443238445499e-05, "loss": 1.0545, "step": 1181 }, { "epoch": 7.675324675324675, "grad_norm": 2.3014729022979736, "learning_rate": 8.683319435145503e-05, "loss": 0.917, "step": 1182 }, { "epoch": 7.681818181818182, "grad_norm": 2.1952528953552246, "learning_rate": 8.681194177729327e-05, "loss": 1.0108, "step": 1183 }, { "epoch": 7.688311688311688, "grad_norm": 2.403540849685669, "learning_rate": 8.679067467035989e-05, "loss": 1.0096, "step": 1184 }, { "epoch": 7.694805194805195, "grad_norm": 2.3405985832214355, "learning_rate": 8.67693930390508e-05, "loss": 1.0406, "step": 1185 }, { "epoch": 7.701298701298701, "grad_norm": 2.394473075866699, "learning_rate": 8.674809689176764e-05, "loss": 1.0998, "step": 1186 }, { "epoch": 7.707792207792208, "grad_norm": 1.9719188213348389, "learning_rate": 8.672678623691783e-05, "loss": 0.8589, "step": 1187 }, { "epoch": 7.714285714285714, "grad_norm": 2.511230230331421, "learning_rate": 8.670546108291443e-05, "loss": 1.0682, "step": 1188 }, { "epoch": 7.720779220779221, "grad_norm": 2.1016743183135986, "learning_rate": 8.668412143817631e-05, "loss": 0.9302, "step": 1189 }, { "epoch": 7.7272727272727275, "grad_norm": 2.0212161540985107, "learning_rate": 8.6662767311128e-05, "loss": 1.0233, "step": 1190 }, { "epoch": 7.733766233766234, "grad_norm": 2.402103900909424, "learning_rate": 8.66413987101998e-05, "loss": 1.0854, "step": 1191 }, { "epoch": 7.740259740259741, "grad_norm": 2.3115615844726562, "learning_rate": 8.662001564382767e-05, "loss": 0.9475, "step": 1192 }, { "epoch": 7.746753246753247, "grad_norm": 2.4030799865722656, "learning_rate": 8.65986181204533e-05, "loss": 1.0273, "step": 1193 }, { "epoch": 7.753246753246753, "grad_norm": 2.3997647762298584, "learning_rate": 8.657720614852411e-05, "loss": 1.0452, "step": 1194 }, { "epoch": 7.759740259740259, "grad_norm": 2.138275146484375, "learning_rate": 8.655577973649321e-05, "loss": 0.9578, "step": 1195 }, { "epoch": 7.766233766233766, "grad_norm": 2.3623483180999756, "learning_rate": 8.65343388928194e-05, "loss": 1.028, "step": 1196 }, { "epoch": 7.7727272727272725, "grad_norm": 2.2765283584594727, "learning_rate": 8.651288362596718e-05, "loss": 0.975, "step": 1197 }, { "epoch": 7.779220779220779, "grad_norm": 2.779798746109009, "learning_rate": 8.649141394440677e-05, "loss": 1.0598, "step": 1198 }, { "epoch": 7.785714285714286, "grad_norm": 2.338444232940674, "learning_rate": 8.646992985661404e-05, "loss": 1.0226, "step": 1199 }, { "epoch": 7.792207792207792, "grad_norm": 2.3311634063720703, "learning_rate": 8.644843137107059e-05, "loss": 0.9551, "step": 1200 }, { "epoch": 7.798701298701299, "grad_norm": 2.024712324142456, "learning_rate": 8.642691849626364e-05, "loss": 0.8728, "step": 1201 }, { "epoch": 7.805194805194805, "grad_norm": 2.1652753353118896, "learning_rate": 8.640539124068617e-05, "loss": 0.9638, "step": 1202 }, { "epoch": 7.811688311688312, "grad_norm": 2.3495283126831055, "learning_rate": 8.638384961283679e-05, "loss": 1.0124, "step": 1203 }, { "epoch": 7.818181818181818, "grad_norm": 2.242274522781372, "learning_rate": 8.63622936212198e-05, "loss": 0.992, "step": 1204 }, { "epoch": 7.824675324675325, "grad_norm": 2.282271146774292, "learning_rate": 8.634072327434515e-05, "loss": 1.0438, "step": 1205 }, { "epoch": 7.8311688311688314, "grad_norm": 2.7067198753356934, "learning_rate": 8.631913858072846e-05, "loss": 1.1024, "step": 1206 }, { "epoch": 7.837662337662338, "grad_norm": 2.1303887367248535, "learning_rate": 8.629753954889107e-05, "loss": 1.0117, "step": 1207 }, { "epoch": 7.8441558441558445, "grad_norm": 2.1298983097076416, "learning_rate": 8.627592618735989e-05, "loss": 0.9347, "step": 1208 }, { "epoch": 7.85064935064935, "grad_norm": 2.4259676933288574, "learning_rate": 8.625429850466756e-05, "loss": 1.0624, "step": 1209 }, { "epoch": 7.857142857142857, "grad_norm": 2.1696083545684814, "learning_rate": 8.623265650935234e-05, "loss": 0.9395, "step": 1210 }, { "epoch": 7.863636363636363, "grad_norm": 2.418948173522949, "learning_rate": 8.621100020995814e-05, "loss": 1.1141, "step": 1211 }, { "epoch": 7.87012987012987, "grad_norm": 2.3008179664611816, "learning_rate": 8.618932961503452e-05, "loss": 1.0913, "step": 1212 }, { "epoch": 7.876623376623376, "grad_norm": 2.23539662361145, "learning_rate": 8.616764473313671e-05, "loss": 1.0262, "step": 1213 }, { "epoch": 7.883116883116883, "grad_norm": 2.30684232711792, "learning_rate": 8.614594557282553e-05, "loss": 1.0216, "step": 1214 }, { "epoch": 7.8896103896103895, "grad_norm": 2.328866481781006, "learning_rate": 8.612423214266749e-05, "loss": 1.0333, "step": 1215 }, { "epoch": 7.896103896103896, "grad_norm": 2.5381362438201904, "learning_rate": 8.61025044512347e-05, "loss": 1.0486, "step": 1216 }, { "epoch": 7.902597402597403, "grad_norm": 2.353156328201294, "learning_rate": 8.60807625071049e-05, "loss": 1.0335, "step": 1217 }, { "epoch": 7.909090909090909, "grad_norm": 2.1642353534698486, "learning_rate": 8.605900631886147e-05, "loss": 0.998, "step": 1218 }, { "epoch": 7.915584415584416, "grad_norm": 2.3284685611724854, "learning_rate": 8.603723589509343e-05, "loss": 1.1231, "step": 1219 }, { "epoch": 7.922077922077922, "grad_norm": 2.2811686992645264, "learning_rate": 8.601545124439535e-05, "loss": 1.016, "step": 1220 }, { "epoch": 7.928571428571429, "grad_norm": 2.224337339401245, "learning_rate": 8.59936523753675e-05, "loss": 0.9823, "step": 1221 }, { "epoch": 7.935064935064935, "grad_norm": 1.7927359342575073, "learning_rate": 8.597183929661573e-05, "loss": 0.7452, "step": 1222 }, { "epoch": 7.941558441558442, "grad_norm": 2.179928779602051, "learning_rate": 8.595001201675147e-05, "loss": 1.0548, "step": 1223 }, { "epoch": 7.948051948051948, "grad_norm": 2.3533084392547607, "learning_rate": 8.592817054439184e-05, "loss": 1.0012, "step": 1224 }, { "epoch": 7.954545454545455, "grad_norm": 2.246802806854248, "learning_rate": 8.590631488815944e-05, "loss": 1.0731, "step": 1225 }, { "epoch": 7.961038961038961, "grad_norm": 2.3199892044067383, "learning_rate": 8.588444505668258e-05, "loss": 1.1082, "step": 1226 }, { "epoch": 7.967532467532467, "grad_norm": 2.17795991897583, "learning_rate": 8.586256105859512e-05, "loss": 0.9684, "step": 1227 }, { "epoch": 7.974025974025974, "grad_norm": 2.3997833728790283, "learning_rate": 8.58406629025365e-05, "loss": 1.1021, "step": 1228 }, { "epoch": 7.98051948051948, "grad_norm": 2.203303098678589, "learning_rate": 8.581875059715176e-05, "loss": 1.0425, "step": 1229 }, { "epoch": 7.987012987012987, "grad_norm": 2.3763298988342285, "learning_rate": 8.579682415109156e-05, "loss": 1.0595, "step": 1230 }, { "epoch": 7.9935064935064934, "grad_norm": 2.3501322269439697, "learning_rate": 8.57748835730121e-05, "loss": 1.0559, "step": 1231 }, { "epoch": 8.0, "grad_norm": 8490.2294921875, "learning_rate": 8.575292887157516e-05, "loss": 0.9554, "step": 1232 }, { "epoch": 8.006493506493506, "grad_norm": 2.023545742034912, "learning_rate": 8.573096005544811e-05, "loss": 0.8104, "step": 1233 }, { "epoch": 8.012987012987013, "grad_norm": 2.183959484100342, "learning_rate": 8.570897713330393e-05, "loss": 0.8353, "step": 1234 }, { "epoch": 8.019480519480519, "grad_norm": 2.0647823810577393, "learning_rate": 8.568698011382107e-05, "loss": 0.8269, "step": 1235 }, { "epoch": 8.025974025974026, "grad_norm": 2.166212797164917, "learning_rate": 8.566496900568363e-05, "loss": 0.8408, "step": 1236 }, { "epoch": 8.032467532467532, "grad_norm": 2.1798791885375977, "learning_rate": 8.564294381758128e-05, "loss": 0.9087, "step": 1237 }, { "epoch": 8.03896103896104, "grad_norm": 1.9627519845962524, "learning_rate": 8.562090455820918e-05, "loss": 0.7727, "step": 1238 }, { "epoch": 8.045454545454545, "grad_norm": 2.255664587020874, "learning_rate": 8.559885123626807e-05, "loss": 0.8939, "step": 1239 }, { "epoch": 8.051948051948052, "grad_norm": 2.0216453075408936, "learning_rate": 8.557678386046427e-05, "loss": 0.805, "step": 1240 }, { "epoch": 8.058441558441558, "grad_norm": 2.289820432662964, "learning_rate": 8.555470243950964e-05, "loss": 0.9322, "step": 1241 }, { "epoch": 8.064935064935066, "grad_norm": 2.1566131114959717, "learning_rate": 8.553260698212155e-05, "loss": 0.8221, "step": 1242 }, { "epoch": 8.071428571428571, "grad_norm": 2.0849456787109375, "learning_rate": 8.551049749702297e-05, "loss": 0.8451, "step": 1243 }, { "epoch": 8.077922077922079, "grad_norm": 2.2331180572509766, "learning_rate": 8.548837399294235e-05, "loss": 0.8813, "step": 1244 }, { "epoch": 8.084415584415584, "grad_norm": 2.1426122188568115, "learning_rate": 8.546623647861371e-05, "loss": 0.9262, "step": 1245 }, { "epoch": 8.090909090909092, "grad_norm": 2.1544063091278076, "learning_rate": 8.544408496277656e-05, "loss": 0.8672, "step": 1246 }, { "epoch": 8.097402597402597, "grad_norm": 2.375364065170288, "learning_rate": 8.542191945417601e-05, "loss": 0.8861, "step": 1247 }, { "epoch": 8.103896103896103, "grad_norm": 2.2304739952087402, "learning_rate": 8.539973996156265e-05, "loss": 0.8394, "step": 1248 }, { "epoch": 8.11038961038961, "grad_norm": 2.1960930824279785, "learning_rate": 8.537754649369255e-05, "loss": 0.7973, "step": 1249 }, { "epoch": 8.116883116883116, "grad_norm": 2.1149661540985107, "learning_rate": 8.535533905932738e-05, "loss": 0.8099, "step": 1250 }, { "epoch": 8.123376623376624, "grad_norm": 2.2761709690093994, "learning_rate": 8.533311766723428e-05, "loss": 0.9352, "step": 1251 }, { "epoch": 8.12987012987013, "grad_norm": 2.1831071376800537, "learning_rate": 8.531088232618588e-05, "loss": 0.8521, "step": 1252 }, { "epoch": 8.136363636363637, "grad_norm": 2.340667724609375, "learning_rate": 8.528863304496035e-05, "loss": 0.8881, "step": 1253 }, { "epoch": 8.142857142857142, "grad_norm": 2.133786201477051, "learning_rate": 8.526636983234135e-05, "loss": 0.8583, "step": 1254 }, { "epoch": 8.14935064935065, "grad_norm": 2.3610661029815674, "learning_rate": 8.524409269711809e-05, "loss": 0.9045, "step": 1255 }, { "epoch": 8.155844155844155, "grad_norm": 2.1793646812438965, "learning_rate": 8.522180164808516e-05, "loss": 0.8446, "step": 1256 }, { "epoch": 8.162337662337663, "grad_norm": 2.218177080154419, "learning_rate": 8.519949669404274e-05, "loss": 0.8123, "step": 1257 }, { "epoch": 8.168831168831169, "grad_norm": 2.30928111076355, "learning_rate": 8.51771778437965e-05, "loss": 0.8365, "step": 1258 }, { "epoch": 8.175324675324676, "grad_norm": 2.2388057708740234, "learning_rate": 8.515484510615753e-05, "loss": 0.9075, "step": 1259 }, { "epoch": 8.181818181818182, "grad_norm": 2.3225274085998535, "learning_rate": 8.513249848994246e-05, "loss": 0.9796, "step": 1260 }, { "epoch": 8.188311688311689, "grad_norm": 2.455937623977661, "learning_rate": 8.511013800397338e-05, "loss": 0.9009, "step": 1261 }, { "epoch": 8.194805194805195, "grad_norm": 2.265684127807617, "learning_rate": 8.508776365707787e-05, "loss": 0.8867, "step": 1262 }, { "epoch": 8.2012987012987, "grad_norm": 2.2409677505493164, "learning_rate": 8.506537545808892e-05, "loss": 0.8623, "step": 1263 }, { "epoch": 8.207792207792208, "grad_norm": 2.3265388011932373, "learning_rate": 8.504297341584508e-05, "loss": 0.8919, "step": 1264 }, { "epoch": 8.214285714285714, "grad_norm": 2.317533016204834, "learning_rate": 8.502055753919032e-05, "loss": 0.8108, "step": 1265 }, { "epoch": 8.220779220779221, "grad_norm": 2.1003658771514893, "learning_rate": 8.499812783697407e-05, "loss": 0.7612, "step": 1266 }, { "epoch": 8.227272727272727, "grad_norm": 2.0778393745422363, "learning_rate": 8.497568431805119e-05, "loss": 0.7407, "step": 1267 }, { "epoch": 8.233766233766234, "grad_norm": 2.2582011222839355, "learning_rate": 8.495322699128205e-05, "loss": 0.8541, "step": 1268 }, { "epoch": 8.24025974025974, "grad_norm": 2.1436638832092285, "learning_rate": 8.493075586553245e-05, "loss": 0.8317, "step": 1269 }, { "epoch": 8.246753246753247, "grad_norm": 2.144747257232666, "learning_rate": 8.490827094967363e-05, "loss": 0.8715, "step": 1270 }, { "epoch": 8.253246753246753, "grad_norm": 2.3335506916046143, "learning_rate": 8.48857722525823e-05, "loss": 0.8549, "step": 1271 }, { "epoch": 8.25974025974026, "grad_norm": 2.3309953212738037, "learning_rate": 8.486325978314055e-05, "loss": 0.892, "step": 1272 }, { "epoch": 8.266233766233766, "grad_norm": 2.2063148021698, "learning_rate": 8.484073355023596e-05, "loss": 0.8658, "step": 1273 }, { "epoch": 8.272727272727273, "grad_norm": 2.223764419555664, "learning_rate": 8.481819356276155e-05, "loss": 0.8397, "step": 1274 }, { "epoch": 8.279220779220779, "grad_norm": 2.1882009506225586, "learning_rate": 8.479563982961573e-05, "loss": 0.8788, "step": 1275 }, { "epoch": 8.285714285714286, "grad_norm": 2.291719675064087, "learning_rate": 8.477307235970236e-05, "loss": 0.8885, "step": 1276 }, { "epoch": 8.292207792207792, "grad_norm": 2.111360788345337, "learning_rate": 8.475049116193071e-05, "loss": 0.8755, "step": 1277 }, { "epoch": 8.2987012987013, "grad_norm": 2.1871907711029053, "learning_rate": 8.472789624521551e-05, "loss": 0.7544, "step": 1278 }, { "epoch": 8.305194805194805, "grad_norm": 2.198638439178467, "learning_rate": 8.470528761847684e-05, "loss": 0.8808, "step": 1279 }, { "epoch": 8.311688311688311, "grad_norm": 2.108152151107788, "learning_rate": 8.468266529064026e-05, "loss": 0.8238, "step": 1280 }, { "epoch": 8.318181818181818, "grad_norm": 2.1417620182037354, "learning_rate": 8.466002927063667e-05, "loss": 0.8019, "step": 1281 }, { "epoch": 8.324675324675324, "grad_norm": 2.3144867420196533, "learning_rate": 8.463737956740245e-05, "loss": 0.8671, "step": 1282 }, { "epoch": 8.331168831168831, "grad_norm": 1.9930377006530762, "learning_rate": 8.461471618987933e-05, "loss": 0.7778, "step": 1283 }, { "epoch": 8.337662337662337, "grad_norm": 2.3630878925323486, "learning_rate": 8.459203914701444e-05, "loss": 0.8404, "step": 1284 }, { "epoch": 8.344155844155845, "grad_norm": 2.4153730869293213, "learning_rate": 8.456934844776032e-05, "loss": 0.9375, "step": 1285 }, { "epoch": 8.35064935064935, "grad_norm": 2.2773656845092773, "learning_rate": 8.454664410107493e-05, "loss": 0.9166, "step": 1286 }, { "epoch": 8.357142857142858, "grad_norm": 2.1964240074157715, "learning_rate": 8.452392611592153e-05, "loss": 0.8462, "step": 1287 }, { "epoch": 8.363636363636363, "grad_norm": 2.1902146339416504, "learning_rate": 8.450119450126887e-05, "loss": 0.8701, "step": 1288 }, { "epoch": 8.37012987012987, "grad_norm": 2.152127742767334, "learning_rate": 8.447844926609103e-05, "loss": 0.847, "step": 1289 }, { "epoch": 8.376623376623376, "grad_norm": 2.184800386428833, "learning_rate": 8.445569041936743e-05, "loss": 0.9207, "step": 1290 }, { "epoch": 8.383116883116884, "grad_norm": 2.275554895401001, "learning_rate": 8.443291797008293e-05, "loss": 0.98, "step": 1291 }, { "epoch": 8.38961038961039, "grad_norm": 2.2402615547180176, "learning_rate": 8.441013192722773e-05, "loss": 0.9529, "step": 1292 }, { "epoch": 8.396103896103897, "grad_norm": 2.243180513381958, "learning_rate": 8.438733229979741e-05, "loss": 0.8537, "step": 1293 }, { "epoch": 8.402597402597403, "grad_norm": 2.4076995849609375, "learning_rate": 8.436451909679287e-05, "loss": 0.9193, "step": 1294 }, { "epoch": 8.409090909090908, "grad_norm": 2.4312427043914795, "learning_rate": 8.434169232722043e-05, "loss": 0.8835, "step": 1295 }, { "epoch": 8.415584415584416, "grad_norm": 2.348674774169922, "learning_rate": 8.431885200009171e-05, "loss": 0.9248, "step": 1296 }, { "epoch": 8.422077922077921, "grad_norm": 2.3464605808258057, "learning_rate": 8.429599812442373e-05, "loss": 0.9194, "step": 1297 }, { "epoch": 8.428571428571429, "grad_norm": 2.3472964763641357, "learning_rate": 8.427313070923885e-05, "loss": 0.9237, "step": 1298 }, { "epoch": 8.435064935064934, "grad_norm": 2.479236364364624, "learning_rate": 8.425024976356474e-05, "loss": 1.0045, "step": 1299 }, { "epoch": 8.441558441558442, "grad_norm": 2.499112367630005, "learning_rate": 8.422735529643444e-05, "loss": 0.9375, "step": 1300 }, { "epoch": 8.448051948051948, "grad_norm": 2.4231646060943604, "learning_rate": 8.420444731688633e-05, "loss": 0.9719, "step": 1301 }, { "epoch": 8.454545454545455, "grad_norm": 2.3807730674743652, "learning_rate": 8.41815258339641e-05, "loss": 0.8821, "step": 1302 }, { "epoch": 8.46103896103896, "grad_norm": 2.4089229106903076, "learning_rate": 8.415859085671683e-05, "loss": 0.8724, "step": 1303 }, { "epoch": 8.467532467532468, "grad_norm": 2.2512381076812744, "learning_rate": 8.413564239419884e-05, "loss": 0.8726, "step": 1304 }, { "epoch": 8.474025974025974, "grad_norm": 2.4043405055999756, "learning_rate": 8.411268045546983e-05, "loss": 0.9624, "step": 1305 }, { "epoch": 8.480519480519481, "grad_norm": 2.2187204360961914, "learning_rate": 8.408970504959483e-05, "loss": 0.9099, "step": 1306 }, { "epoch": 8.487012987012987, "grad_norm": 2.2931110858917236, "learning_rate": 8.406671618564415e-05, "loss": 1.0083, "step": 1307 }, { "epoch": 8.493506493506494, "grad_norm": 2.2975666522979736, "learning_rate": 8.404371387269342e-05, "loss": 0.8684, "step": 1308 }, { "epoch": 8.5, "grad_norm": 2.3833606243133545, "learning_rate": 8.40206981198236e-05, "loss": 0.8932, "step": 1309 }, { "epoch": 8.506493506493506, "grad_norm": 2.406860589981079, "learning_rate": 8.399766893612096e-05, "loss": 0.9119, "step": 1310 }, { "epoch": 8.512987012987013, "grad_norm": 1.7942426204681396, "learning_rate": 8.397462633067705e-05, "loss": 0.6829, "step": 1311 }, { "epoch": 8.519480519480519, "grad_norm": 2.5286710262298584, "learning_rate": 8.395157031258871e-05, "loss": 0.9069, "step": 1312 }, { "epoch": 8.525974025974026, "grad_norm": 2.38651967048645, "learning_rate": 8.392850089095809e-05, "loss": 0.8739, "step": 1313 }, { "epoch": 8.532467532467532, "grad_norm": 2.5119848251342773, "learning_rate": 8.390541807489265e-05, "loss": 0.9988, "step": 1314 }, { "epoch": 8.53896103896104, "grad_norm": 2.1202709674835205, "learning_rate": 8.388232187350512e-05, "loss": 0.7731, "step": 1315 }, { "epoch": 8.545454545454545, "grad_norm": 2.3501884937286377, "learning_rate": 8.38592122959135e-05, "loss": 0.9621, "step": 1316 }, { "epoch": 8.551948051948052, "grad_norm": 2.2469823360443115, "learning_rate": 8.38360893512411e-05, "loss": 0.8462, "step": 1317 }, { "epoch": 8.558441558441558, "grad_norm": 2.3673346042633057, "learning_rate": 8.381295304861647e-05, "loss": 0.888, "step": 1318 }, { "epoch": 8.564935064935066, "grad_norm": 2.33341908454895, "learning_rate": 8.378980339717349e-05, "loss": 0.8749, "step": 1319 }, { "epoch": 8.571428571428571, "grad_norm": 2.006885290145874, "learning_rate": 8.376664040605122e-05, "loss": 0.8391, "step": 1320 }, { "epoch": 8.577922077922079, "grad_norm": 2.2687089443206787, "learning_rate": 8.374346408439411e-05, "loss": 0.8874, "step": 1321 }, { "epoch": 8.584415584415584, "grad_norm": 2.21208119392395, "learning_rate": 8.372027444135176e-05, "loss": 0.8589, "step": 1322 }, { "epoch": 8.590909090909092, "grad_norm": 1.9771323204040527, "learning_rate": 8.36970714860791e-05, "loss": 0.7858, "step": 1323 }, { "epoch": 8.597402597402597, "grad_norm": 1.8855808973312378, "learning_rate": 8.367385522773627e-05, "loss": 0.6732, "step": 1324 }, { "epoch": 8.603896103896105, "grad_norm": 2.59310245513916, "learning_rate": 8.365062567548867e-05, "loss": 0.9894, "step": 1325 }, { "epoch": 8.61038961038961, "grad_norm": 2.2608039379119873, "learning_rate": 8.3627382838507e-05, "loss": 0.8949, "step": 1326 }, { "epoch": 8.616883116883116, "grad_norm": 2.323054790496826, "learning_rate": 8.360412672596712e-05, "loss": 0.9194, "step": 1327 }, { "epoch": 8.623376623376624, "grad_norm": 1.8838788270950317, "learning_rate": 8.358085734705022e-05, "loss": 0.6433, "step": 1328 }, { "epoch": 8.62987012987013, "grad_norm": 2.2631521224975586, "learning_rate": 8.355757471094263e-05, "loss": 0.88, "step": 1329 }, { "epoch": 8.636363636363637, "grad_norm": 2.5464706420898438, "learning_rate": 8.3534278826836e-05, "loss": 0.9375, "step": 1330 }, { "epoch": 8.642857142857142, "grad_norm": 2.3076460361480713, "learning_rate": 8.351096970392717e-05, "loss": 0.9402, "step": 1331 }, { "epoch": 8.64935064935065, "grad_norm": 1.9361927509307861, "learning_rate": 8.348764735141823e-05, "loss": 0.6748, "step": 1332 }, { "epoch": 8.655844155844155, "grad_norm": 2.359254837036133, "learning_rate": 8.346431177851644e-05, "loss": 0.9341, "step": 1333 }, { "epoch": 8.662337662337663, "grad_norm": 2.4542675018310547, "learning_rate": 8.344096299443434e-05, "loss": 1.0064, "step": 1334 }, { "epoch": 8.668831168831169, "grad_norm": 2.462061882019043, "learning_rate": 8.341760100838965e-05, "loss": 0.943, "step": 1335 }, { "epoch": 8.675324675324676, "grad_norm": 2.2975666522979736, "learning_rate": 8.339422582960532e-05, "loss": 0.833, "step": 1336 }, { "epoch": 8.681818181818182, "grad_norm": 2.5934624671936035, "learning_rate": 8.33708374673095e-05, "loss": 1.0069, "step": 1337 }, { "epoch": 8.688311688311689, "grad_norm": 2.2661728858947754, "learning_rate": 8.334743593073553e-05, "loss": 0.9142, "step": 1338 }, { "epoch": 8.694805194805195, "grad_norm": 2.22139835357666, "learning_rate": 8.332402122912198e-05, "loss": 0.8557, "step": 1339 }, { "epoch": 8.7012987012987, "grad_norm": 2.3808929920196533, "learning_rate": 8.330059337171258e-05, "loss": 0.9882, "step": 1340 }, { "epoch": 8.707792207792208, "grad_norm": 2.285977840423584, "learning_rate": 8.327715236775633e-05, "loss": 0.9428, "step": 1341 }, { "epoch": 8.714285714285714, "grad_norm": 2.046553373336792, "learning_rate": 8.32536982265073e-05, "loss": 0.8864, "step": 1342 }, { "epoch": 8.720779220779221, "grad_norm": 2.305896520614624, "learning_rate": 8.323023095722486e-05, "loss": 0.9274, "step": 1343 }, { "epoch": 8.727272727272727, "grad_norm": 2.6005120277404785, "learning_rate": 8.320675056917352e-05, "loss": 0.9657, "step": 1344 }, { "epoch": 8.733766233766234, "grad_norm": 2.3337910175323486, "learning_rate": 8.318325707162293e-05, "loss": 0.9651, "step": 1345 }, { "epoch": 8.74025974025974, "grad_norm": 2.21706223487854, "learning_rate": 8.315975047384798e-05, "loss": 0.8629, "step": 1346 }, { "epoch": 8.746753246753247, "grad_norm": 2.2574703693389893, "learning_rate": 8.313623078512869e-05, "loss": 0.854, "step": 1347 }, { "epoch": 8.753246753246753, "grad_norm": 2.4945363998413086, "learning_rate": 8.311269801475026e-05, "loss": 0.923, "step": 1348 }, { "epoch": 8.75974025974026, "grad_norm": 2.107628107070923, "learning_rate": 8.308915217200306e-05, "loss": 0.8478, "step": 1349 }, { "epoch": 8.766233766233766, "grad_norm": 2.124215841293335, "learning_rate": 8.306559326618259e-05, "loss": 0.8439, "step": 1350 }, { "epoch": 8.772727272727273, "grad_norm": 2.057939052581787, "learning_rate": 8.304202130658959e-05, "loss": 0.8302, "step": 1351 }, { "epoch": 8.779220779220779, "grad_norm": 2.1522345542907715, "learning_rate": 8.301843630252985e-05, "loss": 0.8959, "step": 1352 }, { "epoch": 8.785714285714286, "grad_norm": 2.0224645137786865, "learning_rate": 8.299483826331437e-05, "loss": 0.7963, "step": 1353 }, { "epoch": 8.792207792207792, "grad_norm": 2.25050950050354, "learning_rate": 8.297122719825927e-05, "loss": 0.9035, "step": 1354 }, { "epoch": 8.7987012987013, "grad_norm": 2.3904287815093994, "learning_rate": 8.294760311668586e-05, "loss": 0.9492, "step": 1355 }, { "epoch": 8.805194805194805, "grad_norm": 2.241687536239624, "learning_rate": 8.29239660279205e-05, "loss": 0.8868, "step": 1356 }, { "epoch": 8.811688311688311, "grad_norm": 2.4004805088043213, "learning_rate": 8.29003159412948e-05, "loss": 0.9366, "step": 1357 }, { "epoch": 8.818181818181818, "grad_norm": 2.3381006717681885, "learning_rate": 8.287665286614538e-05, "loss": 0.9514, "step": 1358 }, { "epoch": 8.824675324675324, "grad_norm": 2.573312520980835, "learning_rate": 8.285297681181408e-05, "loss": 0.9617, "step": 1359 }, { "epoch": 8.831168831168831, "grad_norm": 2.1311933994293213, "learning_rate": 8.282928778764783e-05, "loss": 0.8366, "step": 1360 }, { "epoch": 8.837662337662337, "grad_norm": 2.3406474590301514, "learning_rate": 8.280558580299867e-05, "loss": 0.9838, "step": 1361 }, { "epoch": 8.844155844155845, "grad_norm": 2.5213472843170166, "learning_rate": 8.278187086722379e-05, "loss": 0.9613, "step": 1362 }, { "epoch": 8.85064935064935, "grad_norm": 2.03776216506958, "learning_rate": 8.275814298968544e-05, "loss": 0.8486, "step": 1363 }, { "epoch": 8.857142857142858, "grad_norm": 2.144857406616211, "learning_rate": 8.273440217975103e-05, "loss": 0.8526, "step": 1364 }, { "epoch": 8.863636363636363, "grad_norm": 2.279902935028076, "learning_rate": 8.271064844679306e-05, "loss": 0.918, "step": 1365 }, { "epoch": 8.87012987012987, "grad_norm": 2.033252477645874, "learning_rate": 8.268688180018912e-05, "loss": 0.8447, "step": 1366 }, { "epoch": 8.876623376623376, "grad_norm": 1.9368460178375244, "learning_rate": 8.26631022493219e-05, "loss": 0.8335, "step": 1367 }, { "epoch": 8.883116883116884, "grad_norm": 2.206130027770996, "learning_rate": 8.263930980357919e-05, "loss": 0.9042, "step": 1368 }, { "epoch": 8.88961038961039, "grad_norm": 2.219876289367676, "learning_rate": 8.261550447235389e-05, "loss": 0.9174, "step": 1369 }, { "epoch": 8.896103896103895, "grad_norm": 2.1504364013671875, "learning_rate": 8.259168626504395e-05, "loss": 0.8749, "step": 1370 }, { "epoch": 8.902597402597403, "grad_norm": 2.304591655731201, "learning_rate": 8.256785519105241e-05, "loss": 0.8961, "step": 1371 }, { "epoch": 8.909090909090908, "grad_norm": 2.122875690460205, "learning_rate": 8.254401125978743e-05, "loss": 0.7802, "step": 1372 }, { "epoch": 8.915584415584416, "grad_norm": 2.283535957336426, "learning_rate": 8.25201544806622e-05, "loss": 0.9022, "step": 1373 }, { "epoch": 8.922077922077921, "grad_norm": 2.3608994483947754, "learning_rate": 8.249628486309502e-05, "loss": 0.8685, "step": 1374 }, { "epoch": 8.928571428571429, "grad_norm": 2.3611888885498047, "learning_rate": 8.247240241650919e-05, "loss": 0.9948, "step": 1375 }, { "epoch": 8.935064935064934, "grad_norm": 2.3135151863098145, "learning_rate": 8.244850715033317e-05, "loss": 0.8179, "step": 1376 }, { "epoch": 8.941558441558442, "grad_norm": 2.3506641387939453, "learning_rate": 8.242459907400039e-05, "loss": 0.9665, "step": 1377 }, { "epoch": 8.948051948051948, "grad_norm": 2.306962251663208, "learning_rate": 8.240067819694942e-05, "loss": 0.9321, "step": 1378 }, { "epoch": 8.954545454545455, "grad_norm": 2.266268014907837, "learning_rate": 8.237674452862382e-05, "loss": 0.9511, "step": 1379 }, { "epoch": 8.96103896103896, "grad_norm": 2.541228771209717, "learning_rate": 8.235279807847223e-05, "loss": 0.9981, "step": 1380 }, { "epoch": 8.967532467532468, "grad_norm": 2.1593658924102783, "learning_rate": 8.232883885594832e-05, "loss": 0.8888, "step": 1381 }, { "epoch": 8.974025974025974, "grad_norm": 2.110729455947876, "learning_rate": 8.230486687051082e-05, "loss": 0.8165, "step": 1382 }, { "epoch": 8.980519480519481, "grad_norm": 2.523921489715576, "learning_rate": 8.22808821316235e-05, "loss": 0.9764, "step": 1383 }, { "epoch": 8.987012987012987, "grad_norm": 2.3435425758361816, "learning_rate": 8.225688464875514e-05, "loss": 0.9116, "step": 1384 }, { "epoch": 8.993506493506494, "grad_norm": 2.383795738220215, "learning_rate": 8.223287443137957e-05, "loss": 0.9476, "step": 1385 }, { "epoch": 9.0, "grad_norm": 2654.56103515625, "learning_rate": 8.220885148897565e-05, "loss": 1.0768, "step": 1386 }, { "epoch": 9.006493506493506, "grad_norm": 2.0752618312835693, "learning_rate": 8.218481583102726e-05, "loss": 0.7959, "step": 1387 }, { "epoch": 9.012987012987013, "grad_norm": 2.4385485649108887, "learning_rate": 8.216076746702327e-05, "loss": 0.8151, "step": 1388 }, { "epoch": 9.019480519480519, "grad_norm": 2.270139217376709, "learning_rate": 8.213670640645762e-05, "loss": 0.8186, "step": 1389 }, { "epoch": 9.025974025974026, "grad_norm": 2.112992763519287, "learning_rate": 8.211263265882923e-05, "loss": 0.7687, "step": 1390 }, { "epoch": 9.032467532467532, "grad_norm": 2.1796791553497314, "learning_rate": 8.208854623364202e-05, "loss": 0.7425, "step": 1391 }, { "epoch": 9.03896103896104, "grad_norm": 2.2237792015075684, "learning_rate": 8.206444714040495e-05, "loss": 0.7597, "step": 1392 }, { "epoch": 9.045454545454545, "grad_norm": 2.2954790592193604, "learning_rate": 8.204033538863197e-05, "loss": 0.7829, "step": 1393 }, { "epoch": 9.051948051948052, "grad_norm": 1.7754745483398438, "learning_rate": 8.201621098784198e-05, "loss": 0.6863, "step": 1394 }, { "epoch": 9.058441558441558, "grad_norm": 2.219863176345825, "learning_rate": 8.199207394755893e-05, "loss": 0.7826, "step": 1395 }, { "epoch": 9.064935064935066, "grad_norm": 2.1216280460357666, "learning_rate": 8.196792427731175e-05, "loss": 0.7987, "step": 1396 }, { "epoch": 9.071428571428571, "grad_norm": 2.249668598175049, "learning_rate": 8.194376198663434e-05, "loss": 0.825, "step": 1397 }, { "epoch": 9.077922077922079, "grad_norm": 2.171496629714966, "learning_rate": 8.191958708506558e-05, "loss": 0.7567, "step": 1398 }, { "epoch": 9.084415584415584, "grad_norm": 2.2283010482788086, "learning_rate": 8.189539958214935e-05, "loss": 0.7638, "step": 1399 }, { "epoch": 9.090909090909092, "grad_norm": 2.242703914642334, "learning_rate": 8.18711994874345e-05, "loss": 0.8351, "step": 1400 }, { "epoch": 9.097402597402597, "grad_norm": 2.1934618949890137, "learning_rate": 8.184698681047482e-05, "loss": 0.7873, "step": 1401 }, { "epoch": 9.103896103896103, "grad_norm": 2.263496160507202, "learning_rate": 8.18227615608291e-05, "loss": 0.7448, "step": 1402 }, { "epoch": 9.11038961038961, "grad_norm": 2.06032395362854, "learning_rate": 8.179852374806112e-05, "loss": 0.7561, "step": 1403 }, { "epoch": 9.116883116883116, "grad_norm": 1.9076995849609375, "learning_rate": 8.177427338173953e-05, "loss": 0.7034, "step": 1404 }, { "epoch": 9.123376623376624, "grad_norm": 2.372173547744751, "learning_rate": 8.175001047143804e-05, "loss": 0.8417, "step": 1405 }, { "epoch": 9.12987012987013, "grad_norm": 1.8342652320861816, "learning_rate": 8.172573502673523e-05, "loss": 0.6436, "step": 1406 }, { "epoch": 9.136363636363637, "grad_norm": 2.130004405975342, "learning_rate": 8.170144705721466e-05, "loss": 0.6889, "step": 1407 }, { "epoch": 9.142857142857142, "grad_norm": 2.177459478378296, "learning_rate": 8.167714657246486e-05, "loss": 0.7952, "step": 1408 }, { "epoch": 9.14935064935065, "grad_norm": 2.637460231781006, "learning_rate": 8.165283358207924e-05, "loss": 0.8493, "step": 1409 }, { "epoch": 9.155844155844155, "grad_norm": 2.201258897781372, "learning_rate": 8.162850809565623e-05, "loss": 0.8234, "step": 1410 }, { "epoch": 9.162337662337663, "grad_norm": 2.2699949741363525, "learning_rate": 8.160417012279911e-05, "loss": 0.8444, "step": 1411 }, { "epoch": 9.168831168831169, "grad_norm": 2.2601559162139893, "learning_rate": 8.157981967311614e-05, "loss": 0.757, "step": 1412 }, { "epoch": 9.175324675324676, "grad_norm": 2.369778871536255, "learning_rate": 8.15554567562205e-05, "loss": 0.8558, "step": 1413 }, { "epoch": 9.181818181818182, "grad_norm": 2.1335222721099854, "learning_rate": 8.153108138173027e-05, "loss": 0.7702, "step": 1414 }, { "epoch": 9.188311688311689, "grad_norm": 2.2661004066467285, "learning_rate": 8.150669355926846e-05, "loss": 0.771, "step": 1415 }, { "epoch": 9.194805194805195, "grad_norm": 2.3101413249969482, "learning_rate": 8.148229329846301e-05, "loss": 0.7875, "step": 1416 }, { "epoch": 9.2012987012987, "grad_norm": 2.1832332611083984, "learning_rate": 8.145788060894674e-05, "loss": 0.7662, "step": 1417 }, { "epoch": 9.207792207792208, "grad_norm": 2.008483409881592, "learning_rate": 8.143345550035741e-05, "loss": 0.7156, "step": 1418 }, { "epoch": 9.214285714285714, "grad_norm": 2.1562743186950684, "learning_rate": 8.140901798233767e-05, "loss": 0.7594, "step": 1419 }, { "epoch": 9.220779220779221, "grad_norm": 2.1420395374298096, "learning_rate": 8.138456806453503e-05, "loss": 0.7728, "step": 1420 }, { "epoch": 9.227272727272727, "grad_norm": 2.2543087005615234, "learning_rate": 8.136010575660196e-05, "loss": 0.7595, "step": 1421 }, { "epoch": 9.233766233766234, "grad_norm": 2.3767106533050537, "learning_rate": 8.13356310681958e-05, "loss": 0.8208, "step": 1422 }, { "epoch": 9.24025974025974, "grad_norm": 2.0140974521636963, "learning_rate": 8.131114400897874e-05, "loss": 0.7136, "step": 1423 }, { "epoch": 9.246753246753247, "grad_norm": 2.367492198944092, "learning_rate": 8.12866445886179e-05, "loss": 0.8795, "step": 1424 }, { "epoch": 9.253246753246753, "grad_norm": 2.0226759910583496, "learning_rate": 8.126213281678528e-05, "loss": 0.7512, "step": 1425 }, { "epoch": 9.25974025974026, "grad_norm": 2.336621046066284, "learning_rate": 8.123760870315768e-05, "loss": 0.8067, "step": 1426 }, { "epoch": 9.266233766233766, "grad_norm": 2.2011666297912598, "learning_rate": 8.12130722574169e-05, "loss": 0.7792, "step": 1427 }, { "epoch": 9.272727272727273, "grad_norm": 2.476090669631958, "learning_rate": 8.118852348924953e-05, "loss": 0.775, "step": 1428 }, { "epoch": 9.279220779220779, "grad_norm": 2.2978625297546387, "learning_rate": 8.116396240834698e-05, "loss": 0.7948, "step": 1429 }, { "epoch": 9.285714285714286, "grad_norm": 2.2786600589752197, "learning_rate": 8.113938902440564e-05, "loss": 0.7556, "step": 1430 }, { "epoch": 9.292207792207792, "grad_norm": 2.113959312438965, "learning_rate": 8.111480334712665e-05, "loss": 0.7449, "step": 1431 }, { "epoch": 9.2987012987013, "grad_norm": 2.136590003967285, "learning_rate": 8.109020538621606e-05, "loss": 0.7521, "step": 1432 }, { "epoch": 9.305194805194805, "grad_norm": 2.285116672515869, "learning_rate": 8.106559515138477e-05, "loss": 0.7869, "step": 1433 }, { "epoch": 9.311688311688311, "grad_norm": 2.320920467376709, "learning_rate": 8.104097265234848e-05, "loss": 0.8478, "step": 1434 }, { "epoch": 9.318181818181818, "grad_norm": 2.108393907546997, "learning_rate": 8.101633789882781e-05, "loss": 0.7533, "step": 1435 }, { "epoch": 9.324675324675324, "grad_norm": 2.2300140857696533, "learning_rate": 8.099169090054813e-05, "loss": 0.7973, "step": 1436 }, { "epoch": 9.331168831168831, "grad_norm": 2.217241048812866, "learning_rate": 8.096703166723968e-05, "loss": 0.784, "step": 1437 }, { "epoch": 9.337662337662337, "grad_norm": 2.127923011779785, "learning_rate": 8.094236020863757e-05, "loss": 0.7696, "step": 1438 }, { "epoch": 9.344155844155845, "grad_norm": 2.279656171798706, "learning_rate": 8.091767653448167e-05, "loss": 0.7762, "step": 1439 }, { "epoch": 9.35064935064935, "grad_norm": 2.1744582653045654, "learning_rate": 8.089298065451672e-05, "loss": 0.7161, "step": 1440 }, { "epoch": 9.357142857142858, "grad_norm": 2.01845121383667, "learning_rate": 8.086827257849226e-05, "loss": 0.6556, "step": 1441 }, { "epoch": 9.363636363636363, "grad_norm": 2.32757830619812, "learning_rate": 8.084355231616265e-05, "loss": 0.8107, "step": 1442 }, { "epoch": 9.37012987012987, "grad_norm": 2.1850666999816895, "learning_rate": 8.081881987728703e-05, "loss": 0.7491, "step": 1443 }, { "epoch": 9.376623376623376, "grad_norm": 2.2571535110473633, "learning_rate": 8.079407527162944e-05, "loss": 0.8356, "step": 1444 }, { "epoch": 9.383116883116884, "grad_norm": 2.1496357917785645, "learning_rate": 8.076931850895859e-05, "loss": 0.7408, "step": 1445 }, { "epoch": 9.38961038961039, "grad_norm": 2.1930642127990723, "learning_rate": 8.074454959904807e-05, "loss": 0.751, "step": 1446 }, { "epoch": 9.396103896103897, "grad_norm": 1.8324921131134033, "learning_rate": 8.071976855167629e-05, "loss": 0.6065, "step": 1447 }, { "epoch": 9.402597402597403, "grad_norm": 2.2916312217712402, "learning_rate": 8.069497537662639e-05, "loss": 0.7501, "step": 1448 }, { "epoch": 9.409090909090908, "grad_norm": 2.2877542972564697, "learning_rate": 8.067017008368632e-05, "loss": 0.8381, "step": 1449 }, { "epoch": 9.415584415584416, "grad_norm": 1.8830348253250122, "learning_rate": 8.064535268264883e-05, "loss": 0.6252, "step": 1450 }, { "epoch": 9.422077922077921, "grad_norm": 2.369856834411621, "learning_rate": 8.062052318331142e-05, "loss": 0.8398, "step": 1451 }, { "epoch": 9.428571428571429, "grad_norm": 2.233564615249634, "learning_rate": 8.059568159547641e-05, "loss": 0.7632, "step": 1452 }, { "epoch": 9.435064935064934, "grad_norm": 2.254913806915283, "learning_rate": 8.057082792895083e-05, "loss": 0.8431, "step": 1453 }, { "epoch": 9.441558441558442, "grad_norm": 1.880247712135315, "learning_rate": 8.054596219354654e-05, "loss": 0.6151, "step": 1454 }, { "epoch": 9.448051948051948, "grad_norm": 2.2374584674835205, "learning_rate": 8.052108439908013e-05, "loss": 0.8184, "step": 1455 }, { "epoch": 9.454545454545455, "grad_norm": 2.427372455596924, "learning_rate": 8.049619455537296e-05, "loss": 0.7961, "step": 1456 }, { "epoch": 9.46103896103896, "grad_norm": 1.6904860734939575, "learning_rate": 8.047129267225115e-05, "loss": 0.5814, "step": 1457 }, { "epoch": 9.467532467532468, "grad_norm": 2.3941128253936768, "learning_rate": 8.044637875954556e-05, "loss": 0.8157, "step": 1458 }, { "epoch": 9.474025974025974, "grad_norm": 2.4132771492004395, "learning_rate": 8.042145282709182e-05, "loss": 0.869, "step": 1459 }, { "epoch": 9.480519480519481, "grad_norm": 2.3084611892700195, "learning_rate": 8.039651488473028e-05, "loss": 0.7596, "step": 1460 }, { "epoch": 9.487012987012987, "grad_norm": 2.256843328475952, "learning_rate": 8.037156494230604e-05, "loss": 0.8579, "step": 1461 }, { "epoch": 9.493506493506494, "grad_norm": 2.3451285362243652, "learning_rate": 8.034660300966898e-05, "loss": 0.8552, "step": 1462 }, { "epoch": 9.5, "grad_norm": 2.4393768310546875, "learning_rate": 8.032162909667362e-05, "loss": 0.8209, "step": 1463 }, { "epoch": 9.506493506493506, "grad_norm": 2.1030781269073486, "learning_rate": 8.029664321317932e-05, "loss": 0.8008, "step": 1464 }, { "epoch": 9.512987012987013, "grad_norm": 2.3159143924713135, "learning_rate": 8.027164536905008e-05, "loss": 0.8277, "step": 1465 }, { "epoch": 9.519480519480519, "grad_norm": 2.08125638961792, "learning_rate": 8.024663557415464e-05, "loss": 0.7845, "step": 1466 }, { "epoch": 9.525974025974026, "grad_norm": 2.2004313468933105, "learning_rate": 8.022161383836652e-05, "loss": 0.8462, "step": 1467 }, { "epoch": 9.532467532467532, "grad_norm": 2.0208821296691895, "learning_rate": 8.019658017156386e-05, "loss": 0.703, "step": 1468 }, { "epoch": 9.53896103896104, "grad_norm": 2.086357355117798, "learning_rate": 8.017153458362956e-05, "loss": 0.7341, "step": 1469 }, { "epoch": 9.545454545454545, "grad_norm": 2.44262957572937, "learning_rate": 8.014647708445124e-05, "loss": 0.8811, "step": 1470 }, { "epoch": 9.551948051948052, "grad_norm": 2.260904550552368, "learning_rate": 8.01214076839212e-05, "loss": 0.7389, "step": 1471 }, { "epoch": 9.558441558441558, "grad_norm": 2.211005687713623, "learning_rate": 8.009632639193643e-05, "loss": 0.7509, "step": 1472 }, { "epoch": 9.564935064935066, "grad_norm": 2.503387212753296, "learning_rate": 8.007123321839864e-05, "loss": 0.8911, "step": 1473 }, { "epoch": 9.571428571428571, "grad_norm": 2.088785171508789, "learning_rate": 8.00461281732142e-05, "loss": 0.8056, "step": 1474 }, { "epoch": 9.577922077922079, "grad_norm": 2.1521730422973633, "learning_rate": 8.002101126629421e-05, "loss": 0.8136, "step": 1475 }, { "epoch": 9.584415584415584, "grad_norm": 2.2391040325164795, "learning_rate": 7.999588250755442e-05, "loss": 0.8104, "step": 1476 }, { "epoch": 9.590909090909092, "grad_norm": 2.1446774005889893, "learning_rate": 7.997074190691522e-05, "loss": 0.8641, "step": 1477 }, { "epoch": 9.597402597402597, "grad_norm": 2.460958957672119, "learning_rate": 7.994558947430179e-05, "loss": 0.8526, "step": 1478 }, { "epoch": 9.603896103896105, "grad_norm": 2.084301471710205, "learning_rate": 7.992042521964389e-05, "loss": 0.7874, "step": 1479 }, { "epoch": 9.61038961038961, "grad_norm": 2.258708953857422, "learning_rate": 7.989524915287595e-05, "loss": 0.8217, "step": 1480 }, { "epoch": 9.616883116883116, "grad_norm": 2.2106471061706543, "learning_rate": 7.987006128393709e-05, "loss": 0.8194, "step": 1481 }, { "epoch": 9.623376623376624, "grad_norm": 2.176764488220215, "learning_rate": 7.98448616227711e-05, "loss": 0.7876, "step": 1482 }, { "epoch": 9.62987012987013, "grad_norm": 2.358097791671753, "learning_rate": 7.981965017932638e-05, "loss": 0.8781, "step": 1483 }, { "epoch": 9.636363636363637, "grad_norm": 2.351985216140747, "learning_rate": 7.979442696355602e-05, "loss": 0.7921, "step": 1484 }, { "epoch": 9.642857142857142, "grad_norm": 2.128723382949829, "learning_rate": 7.976919198541776e-05, "loss": 0.7536, "step": 1485 }, { "epoch": 9.64935064935065, "grad_norm": 2.456420660018921, "learning_rate": 7.974394525487394e-05, "loss": 0.843, "step": 1486 }, { "epoch": 9.655844155844155, "grad_norm": 2.167208671569824, "learning_rate": 7.971868678189161e-05, "loss": 0.7295, "step": 1487 }, { "epoch": 9.662337662337663, "grad_norm": 2.31095814704895, "learning_rate": 7.969341657644237e-05, "loss": 0.7792, "step": 1488 }, { "epoch": 9.668831168831169, "grad_norm": 2.448061227798462, "learning_rate": 7.966813464850251e-05, "loss": 0.9162, "step": 1489 }, { "epoch": 9.675324675324676, "grad_norm": 2.2463135719299316, "learning_rate": 7.964284100805297e-05, "loss": 0.8312, "step": 1490 }, { "epoch": 9.681818181818182, "grad_norm": 2.4721384048461914, "learning_rate": 7.961753566507924e-05, "loss": 0.8892, "step": 1491 }, { "epoch": 9.688311688311689, "grad_norm": 2.483168840408325, "learning_rate": 7.959221862957148e-05, "loss": 0.8688, "step": 1492 }, { "epoch": 9.694805194805195, "grad_norm": 2.2723283767700195, "learning_rate": 7.956688991152445e-05, "loss": 0.8294, "step": 1493 }, { "epoch": 9.7012987012987, "grad_norm": 2.0583672523498535, "learning_rate": 7.954154952093756e-05, "loss": 0.769, "step": 1494 }, { "epoch": 9.707792207792208, "grad_norm": 2.319795846939087, "learning_rate": 7.951619746781474e-05, "loss": 0.8562, "step": 1495 }, { "epoch": 9.714285714285714, "grad_norm": 2.2907357215881348, "learning_rate": 7.94908337621646e-05, "loss": 0.8138, "step": 1496 }, { "epoch": 9.720779220779221, "grad_norm": 1.994308590888977, "learning_rate": 7.946545841400035e-05, "loss": 0.7778, "step": 1497 }, { "epoch": 9.727272727272727, "grad_norm": 2.314957618713379, "learning_rate": 7.944007143333975e-05, "loss": 0.8457, "step": 1498 }, { "epoch": 9.733766233766234, "grad_norm": 2.4140167236328125, "learning_rate": 7.94146728302052e-05, "loss": 0.8849, "step": 1499 }, { "epoch": 9.74025974025974, "grad_norm": 2.302248954772949, "learning_rate": 7.938926261462367e-05, "loss": 0.8035, "step": 1500 }, { "epoch": 9.746753246753247, "grad_norm": 2.2283337116241455, "learning_rate": 7.936384079662666e-05, "loss": 0.754, "step": 1501 }, { "epoch": 9.753246753246753, "grad_norm": 2.261359214782715, "learning_rate": 7.933840738625036e-05, "loss": 0.8227, "step": 1502 }, { "epoch": 9.75974025974026, "grad_norm": 2.03810453414917, "learning_rate": 7.931296239353546e-05, "loss": 0.7251, "step": 1503 }, { "epoch": 9.766233766233766, "grad_norm": 2.1554248332977295, "learning_rate": 7.928750582852722e-05, "loss": 0.7853, "step": 1504 }, { "epoch": 9.772727272727273, "grad_norm": 1.9828263521194458, "learning_rate": 7.926203770127552e-05, "loss": 0.7483, "step": 1505 }, { "epoch": 9.779220779220779, "grad_norm": 2.166712522506714, "learning_rate": 7.923655802183474e-05, "loss": 0.7987, "step": 1506 }, { "epoch": 9.785714285714286, "grad_norm": 2.025691270828247, "learning_rate": 7.921106680026387e-05, "loss": 0.7963, "step": 1507 }, { "epoch": 9.792207792207792, "grad_norm": 2.278630256652832, "learning_rate": 7.918556404662644e-05, "loss": 0.8426, "step": 1508 }, { "epoch": 9.7987012987013, "grad_norm": 2.3733408451080322, "learning_rate": 7.916004977099055e-05, "loss": 0.8032, "step": 1509 }, { "epoch": 9.805194805194805, "grad_norm": 2.116952896118164, "learning_rate": 7.913452398342881e-05, "loss": 0.7179, "step": 1510 }, { "epoch": 9.811688311688311, "grad_norm": 2.2615432739257812, "learning_rate": 7.91089866940184e-05, "loss": 0.8633, "step": 1511 }, { "epoch": 9.818181818181818, "grad_norm": 1.9311797618865967, "learning_rate": 7.908343791284105e-05, "loss": 0.6622, "step": 1512 }, { "epoch": 9.824675324675324, "grad_norm": 2.372143268585205, "learning_rate": 7.905787764998299e-05, "loss": 0.8368, "step": 1513 }, { "epoch": 9.831168831168831, "grad_norm": 2.188035488128662, "learning_rate": 7.903230591553504e-05, "loss": 0.7697, "step": 1514 }, { "epoch": 9.837662337662337, "grad_norm": 2.158198118209839, "learning_rate": 7.900672271959249e-05, "loss": 0.7336, "step": 1515 }, { "epoch": 9.844155844155845, "grad_norm": 1.9618550539016724, "learning_rate": 7.898112807225517e-05, "loss": 0.7297, "step": 1516 }, { "epoch": 9.85064935064935, "grad_norm": 2.266169548034668, "learning_rate": 7.895552198362748e-05, "loss": 0.8173, "step": 1517 }, { "epoch": 9.857142857142858, "grad_norm": 2.3293449878692627, "learning_rate": 7.892990446381827e-05, "loss": 0.8151, "step": 1518 }, { "epoch": 9.863636363636363, "grad_norm": 2.265075445175171, "learning_rate": 7.890427552294093e-05, "loss": 0.8879, "step": 1519 }, { "epoch": 9.87012987012987, "grad_norm": 2.1060047149658203, "learning_rate": 7.887863517111338e-05, "loss": 0.8157, "step": 1520 }, { "epoch": 9.876623376623376, "grad_norm": 2.202415704727173, "learning_rate": 7.885298341845802e-05, "loss": 0.7675, "step": 1521 }, { "epoch": 9.883116883116884, "grad_norm": 2.1517200469970703, "learning_rate": 7.882732027510174e-05, "loss": 0.811, "step": 1522 }, { "epoch": 9.88961038961039, "grad_norm": 2.144573211669922, "learning_rate": 7.880164575117597e-05, "loss": 0.7476, "step": 1523 }, { "epoch": 9.896103896103895, "grad_norm": 2.335124969482422, "learning_rate": 7.877595985681656e-05, "loss": 0.872, "step": 1524 }, { "epoch": 9.902597402597403, "grad_norm": 2.477221727371216, "learning_rate": 7.875026260216393e-05, "loss": 0.9285, "step": 1525 }, { "epoch": 9.909090909090908, "grad_norm": 1.9062142372131348, "learning_rate": 7.872455399736295e-05, "loss": 0.7905, "step": 1526 }, { "epoch": 9.915584415584416, "grad_norm": 2.1966278553009033, "learning_rate": 7.869883405256295e-05, "loss": 0.8347, "step": 1527 }, { "epoch": 9.922077922077921, "grad_norm": 2.1748650074005127, "learning_rate": 7.867310277791778e-05, "loss": 0.7988, "step": 1528 }, { "epoch": 9.928571428571429, "grad_norm": 2.3306756019592285, "learning_rate": 7.864736018358571e-05, "loss": 0.8781, "step": 1529 }, { "epoch": 9.935064935064934, "grad_norm": 2.344447135925293, "learning_rate": 7.862160627972955e-05, "loss": 0.8185, "step": 1530 }, { "epoch": 9.941558441558442, "grad_norm": 1.9969489574432373, "learning_rate": 7.85958410765165e-05, "loss": 0.719, "step": 1531 }, { "epoch": 9.948051948051948, "grad_norm": 2.3205368518829346, "learning_rate": 7.857006458411826e-05, "loss": 0.8242, "step": 1532 }, { "epoch": 9.954545454545455, "grad_norm": 2.0827131271362305, "learning_rate": 7.854427681271099e-05, "loss": 0.7715, "step": 1533 }, { "epoch": 9.96103896103896, "grad_norm": 2.0778398513793945, "learning_rate": 7.851847777247528e-05, "loss": 0.7781, "step": 1534 }, { "epoch": 9.967532467532468, "grad_norm": 2.2156105041503906, "learning_rate": 7.84926674735962e-05, "loss": 0.851, "step": 1535 }, { "epoch": 9.974025974025974, "grad_norm": 2.1992075443267822, "learning_rate": 7.846684592626323e-05, "loss": 0.8263, "step": 1536 }, { "epoch": 9.980519480519481, "grad_norm": 2.202113628387451, "learning_rate": 7.844101314067032e-05, "loss": 0.8584, "step": 1537 }, { "epoch": 9.987012987012987, "grad_norm": 2.097062349319458, "learning_rate": 7.841516912701585e-05, "loss": 0.784, "step": 1538 }, { "epoch": 9.993506493506494, "grad_norm": 2.44765305519104, "learning_rate": 7.838931389550259e-05, "loss": 0.8952, "step": 1539 }, { "epoch": 10.0, "grad_norm": 2912.900390625, "learning_rate": 7.836344745633783e-05, "loss": 0.8677, "step": 1540 }, { "epoch": 10.006493506493506, "grad_norm": 1.9397345781326294, "learning_rate": 7.83375698197332e-05, "loss": 0.6288, "step": 1541 }, { "epoch": 10.012987012987013, "grad_norm": 2.1335549354553223, "learning_rate": 7.831168099590478e-05, "loss": 0.7503, "step": 1542 }, { "epoch": 10.019480519480519, "grad_norm": 2.2006771564483643, "learning_rate": 7.828578099507308e-05, "loss": 0.6708, "step": 1543 }, { "epoch": 10.025974025974026, "grad_norm": 1.8945122957229614, "learning_rate": 7.8259869827463e-05, "loss": 0.6138, "step": 1544 }, { "epoch": 10.032467532467532, "grad_norm": 2.2254040241241455, "learning_rate": 7.823394750330387e-05, "loss": 0.7047, "step": 1545 }, { "epoch": 10.03896103896104, "grad_norm": 2.0241615772247314, "learning_rate": 7.820801403282939e-05, "loss": 0.6689, "step": 1546 }, { "epoch": 10.045454545454545, "grad_norm": 2.107599973678589, "learning_rate": 7.81820694262777e-05, "loss": 0.6813, "step": 1547 }, { "epoch": 10.051948051948052, "grad_norm": 2.0900888442993164, "learning_rate": 7.815611369389133e-05, "loss": 0.6593, "step": 1548 }, { "epoch": 10.058441558441558, "grad_norm": 2.0664241313934326, "learning_rate": 7.813014684591718e-05, "loss": 0.6814, "step": 1549 }, { "epoch": 10.064935064935066, "grad_norm": 2.1444430351257324, "learning_rate": 7.810416889260653e-05, "loss": 0.664, "step": 1550 }, { "epoch": 10.071428571428571, "grad_norm": 2.220968246459961, "learning_rate": 7.80781798442151e-05, "loss": 0.6684, "step": 1551 }, { "epoch": 10.077922077922079, "grad_norm": 2.3568577766418457, "learning_rate": 7.805217971100295e-05, "loss": 0.69, "step": 1552 }, { "epoch": 10.084415584415584, "grad_norm": 2.0757124423980713, "learning_rate": 7.802616850323449e-05, "loss": 0.6463, "step": 1553 }, { "epoch": 10.090909090909092, "grad_norm": 2.0199642181396484, "learning_rate": 7.800014623117857e-05, "loss": 0.639, "step": 1554 }, { "epoch": 10.097402597402597, "grad_norm": 2.1582894325256348, "learning_rate": 7.797411290510837e-05, "loss": 0.7168, "step": 1555 }, { "epoch": 10.103896103896103, "grad_norm": 2.1365647315979004, "learning_rate": 7.794806853530138e-05, "loss": 0.7253, "step": 1556 }, { "epoch": 10.11038961038961, "grad_norm": 2.1413962841033936, "learning_rate": 7.792201313203957e-05, "loss": 0.6625, "step": 1557 }, { "epoch": 10.116883116883116, "grad_norm": 2.0579428672790527, "learning_rate": 7.789594670560917e-05, "loss": 0.6876, "step": 1558 }, { "epoch": 10.123376623376624, "grad_norm": 1.8744693994522095, "learning_rate": 7.78698692663008e-05, "loss": 0.5653, "step": 1559 }, { "epoch": 10.12987012987013, "grad_norm": 2.3525032997131348, "learning_rate": 7.784378082440941e-05, "loss": 0.6893, "step": 1560 }, { "epoch": 10.136363636363637, "grad_norm": 2.229753017425537, "learning_rate": 7.78176813902343e-05, "loss": 0.7107, "step": 1561 }, { "epoch": 10.142857142857142, "grad_norm": 2.3776869773864746, "learning_rate": 7.779157097407915e-05, "loss": 0.7396, "step": 1562 }, { "epoch": 10.14935064935065, "grad_norm": 2.364971876144409, "learning_rate": 7.77654495862519e-05, "loss": 0.7148, "step": 1563 }, { "epoch": 10.155844155844155, "grad_norm": 2.2058353424072266, "learning_rate": 7.773931723706487e-05, "loss": 0.7041, "step": 1564 }, { "epoch": 10.162337662337663, "grad_norm": 2.3210866451263428, "learning_rate": 7.771317393683471e-05, "loss": 0.7472, "step": 1565 }, { "epoch": 10.168831168831169, "grad_norm": 2.044942855834961, "learning_rate": 7.768701969588237e-05, "loss": 0.654, "step": 1566 }, { "epoch": 10.175324675324676, "grad_norm": 2.2671778202056885, "learning_rate": 7.766085452453312e-05, "loss": 0.7198, "step": 1567 }, { "epoch": 10.181818181818182, "grad_norm": 2.2623441219329834, "learning_rate": 7.763467843311658e-05, "loss": 0.6947, "step": 1568 }, { "epoch": 10.188311688311689, "grad_norm": 2.348132610321045, "learning_rate": 7.760849143196665e-05, "loss": 0.7413, "step": 1569 }, { "epoch": 10.194805194805195, "grad_norm": 2.34828782081604, "learning_rate": 7.758229353142152e-05, "loss": 0.7909, "step": 1570 }, { "epoch": 10.2012987012987, "grad_norm": 2.1013669967651367, "learning_rate": 7.755608474182373e-05, "loss": 0.6481, "step": 1571 }, { "epoch": 10.207792207792208, "grad_norm": 1.9958415031433105, "learning_rate": 7.752986507352008e-05, "loss": 0.7038, "step": 1572 }, { "epoch": 10.214285714285714, "grad_norm": 2.4060864448547363, "learning_rate": 7.75036345368617e-05, "loss": 0.7174, "step": 1573 }, { "epoch": 10.220779220779221, "grad_norm": 2.2779603004455566, "learning_rate": 7.747739314220396e-05, "loss": 0.6656, "step": 1574 }, { "epoch": 10.227272727272727, "grad_norm": 2.348428249359131, "learning_rate": 7.74511408999066e-05, "loss": 0.753, "step": 1575 }, { "epoch": 10.233766233766234, "grad_norm": 2.427178382873535, "learning_rate": 7.742487782033354e-05, "loss": 0.7245, "step": 1576 }, { "epoch": 10.24025974025974, "grad_norm": 2.1331942081451416, "learning_rate": 7.739860391385303e-05, "loss": 0.6282, "step": 1577 }, { "epoch": 10.246753246753247, "grad_norm": 2.1701390743255615, "learning_rate": 7.737231919083761e-05, "loss": 0.675, "step": 1578 }, { "epoch": 10.253246753246753, "grad_norm": 2.2979564666748047, "learning_rate": 7.734602366166406e-05, "loss": 0.7126, "step": 1579 }, { "epoch": 10.25974025974026, "grad_norm": 2.368772268295288, "learning_rate": 7.731971733671346e-05, "loss": 0.72, "step": 1580 }, { "epoch": 10.266233766233766, "grad_norm": 2.1105165481567383, "learning_rate": 7.729340022637112e-05, "loss": 0.6476, "step": 1581 }, { "epoch": 10.272727272727273, "grad_norm": 2.5445973873138428, "learning_rate": 7.726707234102659e-05, "loss": 0.8015, "step": 1582 }, { "epoch": 10.279220779220779, "grad_norm": 2.3810203075408936, "learning_rate": 7.724073369107376e-05, "loss": 0.733, "step": 1583 }, { "epoch": 10.285714285714286, "grad_norm": 2.166701316833496, "learning_rate": 7.721438428691065e-05, "loss": 0.7144, "step": 1584 }, { "epoch": 10.292207792207792, "grad_norm": 1.981048345565796, "learning_rate": 7.718802413893963e-05, "loss": 0.6094, "step": 1585 }, { "epoch": 10.2987012987013, "grad_norm": 2.206669330596924, "learning_rate": 7.716165325756725e-05, "loss": 0.6947, "step": 1586 }, { "epoch": 10.305194805194805, "grad_norm": 2.164898157119751, "learning_rate": 7.713527165320434e-05, "loss": 0.7293, "step": 1587 }, { "epoch": 10.311688311688311, "grad_norm": 2.202178716659546, "learning_rate": 7.710887933626589e-05, "loss": 0.6746, "step": 1588 }, { "epoch": 10.318181818181818, "grad_norm": 2.296952247619629, "learning_rate": 7.708247631717122e-05, "loss": 0.7263, "step": 1589 }, { "epoch": 10.324675324675324, "grad_norm": 2.1997387409210205, "learning_rate": 7.705606260634379e-05, "loss": 0.6513, "step": 1590 }, { "epoch": 10.331168831168831, "grad_norm": 2.4745490550994873, "learning_rate": 7.702963821421133e-05, "loss": 0.7661, "step": 1591 }, { "epoch": 10.337662337662337, "grad_norm": 2.397670269012451, "learning_rate": 7.70032031512058e-05, "loss": 0.7387, "step": 1592 }, { "epoch": 10.344155844155845, "grad_norm": 2.434361696243286, "learning_rate": 7.69767574277633e-05, "loss": 0.7354, "step": 1593 }, { "epoch": 10.35064935064935, "grad_norm": 1.842675805091858, "learning_rate": 7.695030105432417e-05, "loss": 0.5535, "step": 1594 }, { "epoch": 10.357142857142858, "grad_norm": 2.0731892585754395, "learning_rate": 7.692383404133301e-05, "loss": 0.6616, "step": 1595 }, { "epoch": 10.363636363636363, "grad_norm": 2.039299488067627, "learning_rate": 7.689735639923858e-05, "loss": 0.6804, "step": 1596 }, { "epoch": 10.37012987012987, "grad_norm": 1.9815698862075806, "learning_rate": 7.687086813849378e-05, "loss": 0.6807, "step": 1597 }, { "epoch": 10.376623376623376, "grad_norm": 2.2085583209991455, "learning_rate": 7.684436926955582e-05, "loss": 0.6732, "step": 1598 }, { "epoch": 10.383116883116884, "grad_norm": 2.3801488876342773, "learning_rate": 7.6817859802886e-05, "loss": 0.7524, "step": 1599 }, { "epoch": 10.38961038961039, "grad_norm": 1.944014549255371, "learning_rate": 7.679133974894983e-05, "loss": 0.6785, "step": 1600 }, { "epoch": 10.396103896103897, "grad_norm": 1.7202798128128052, "learning_rate": 7.676480911821705e-05, "loss": 0.5849, "step": 1601 }, { "epoch": 10.402597402597403, "grad_norm": 2.336012601852417, "learning_rate": 7.673826792116146e-05, "loss": 0.6768, "step": 1602 }, { "epoch": 10.409090909090908, "grad_norm": 2.4163119792938232, "learning_rate": 7.671171616826119e-05, "loss": 0.8147, "step": 1603 }, { "epoch": 10.415584415584416, "grad_norm": 2.2862460613250732, "learning_rate": 7.668515386999837e-05, "loss": 0.7144, "step": 1604 }, { "epoch": 10.422077922077921, "grad_norm": 2.4034481048583984, "learning_rate": 7.665858103685944e-05, "loss": 0.7784, "step": 1605 }, { "epoch": 10.428571428571429, "grad_norm": 2.230703592300415, "learning_rate": 7.663199767933489e-05, "loss": 0.7181, "step": 1606 }, { "epoch": 10.435064935064934, "grad_norm": 2.1745998859405518, "learning_rate": 7.660540380791942e-05, "loss": 0.6556, "step": 1607 }, { "epoch": 10.441558441558442, "grad_norm": 2.2904982566833496, "learning_rate": 7.65787994331119e-05, "loss": 0.7285, "step": 1608 }, { "epoch": 10.448051948051948, "grad_norm": 2.110764503479004, "learning_rate": 7.655218456541529e-05, "loss": 0.642, "step": 1609 }, { "epoch": 10.454545454545455, "grad_norm": 2.170912027359009, "learning_rate": 7.65255592153367e-05, "loss": 0.6743, "step": 1610 }, { "epoch": 10.46103896103896, "grad_norm": 2.565704584121704, "learning_rate": 7.649892339338744e-05, "loss": 0.7778, "step": 1611 }, { "epoch": 10.467532467532468, "grad_norm": 2.2946760654449463, "learning_rate": 7.647227711008287e-05, "loss": 0.754, "step": 1612 }, { "epoch": 10.474025974025974, "grad_norm": 2.269430160522461, "learning_rate": 7.644562037594254e-05, "loss": 0.74, "step": 1613 }, { "epoch": 10.480519480519481, "grad_norm": 1.9184123277664185, "learning_rate": 7.64189532014901e-05, "loss": 0.5831, "step": 1614 }, { "epoch": 10.487012987012987, "grad_norm": 2.157374858856201, "learning_rate": 7.639227559725332e-05, "loss": 0.7455, "step": 1615 }, { "epoch": 10.493506493506494, "grad_norm": 2.176115036010742, "learning_rate": 7.636558757376413e-05, "loss": 0.7088, "step": 1616 }, { "epoch": 10.5, "grad_norm": 2.1193859577178955, "learning_rate": 7.633888914155852e-05, "loss": 0.6377, "step": 1617 }, { "epoch": 10.506493506493506, "grad_norm": 2.1090433597564697, "learning_rate": 7.631218031117658e-05, "loss": 0.6521, "step": 1618 }, { "epoch": 10.512987012987013, "grad_norm": 2.215869426727295, "learning_rate": 7.628546109316257e-05, "loss": 0.7566, "step": 1619 }, { "epoch": 10.519480519480519, "grad_norm": 2.28544282913208, "learning_rate": 7.62587314980648e-05, "loss": 0.7332, "step": 1620 }, { "epoch": 10.525974025974026, "grad_norm": 2.1070797443389893, "learning_rate": 7.623199153643569e-05, "loss": 0.7118, "step": 1621 }, { "epoch": 10.532467532467532, "grad_norm": 2.4239230155944824, "learning_rate": 7.620524121883174e-05, "loss": 0.768, "step": 1622 }, { "epoch": 10.53896103896104, "grad_norm": 2.3072590827941895, "learning_rate": 7.617848055581361e-05, "loss": 0.7294, "step": 1623 }, { "epoch": 10.545454545454545, "grad_norm": 2.4551761150360107, "learning_rate": 7.615170955794591e-05, "loss": 0.8272, "step": 1624 }, { "epoch": 10.551948051948052, "grad_norm": 2.317012310028076, "learning_rate": 7.612492823579745e-05, "loss": 0.7588, "step": 1625 }, { "epoch": 10.558441558441558, "grad_norm": 2.036930799484253, "learning_rate": 7.609813659994108e-05, "loss": 0.6712, "step": 1626 }, { "epoch": 10.564935064935066, "grad_norm": 2.1572816371917725, "learning_rate": 7.607133466095366e-05, "loss": 0.712, "step": 1627 }, { "epoch": 10.571428571428571, "grad_norm": 1.9979349374771118, "learning_rate": 7.604452242941622e-05, "loss": 0.6711, "step": 1628 }, { "epoch": 10.577922077922079, "grad_norm": 2.5580434799194336, "learning_rate": 7.60176999159138e-05, "loss": 0.7787, "step": 1629 }, { "epoch": 10.584415584415584, "grad_norm": 2.1501293182373047, "learning_rate": 7.599086713103548e-05, "loss": 0.7245, "step": 1630 }, { "epoch": 10.590909090909092, "grad_norm": 2.2791953086853027, "learning_rate": 7.596402408537443e-05, "loss": 0.7575, "step": 1631 }, { "epoch": 10.597402597402597, "grad_norm": 2.2916219234466553, "learning_rate": 7.593717078952788e-05, "loss": 0.7194, "step": 1632 }, { "epoch": 10.603896103896105, "grad_norm": 2.251642942428589, "learning_rate": 7.591030725409707e-05, "loss": 0.7858, "step": 1633 }, { "epoch": 10.61038961038961, "grad_norm": 2.1227355003356934, "learning_rate": 7.588343348968728e-05, "loss": 0.7496, "step": 1634 }, { "epoch": 10.616883116883116, "grad_norm": 1.8362125158309937, "learning_rate": 7.585654950690786e-05, "loss": 0.5487, "step": 1635 }, { "epoch": 10.623376623376624, "grad_norm": 2.174795150756836, "learning_rate": 7.582965531637221e-05, "loss": 0.7212, "step": 1636 }, { "epoch": 10.62987012987013, "grad_norm": 2.2185142040252686, "learning_rate": 7.580275092869766e-05, "loss": 0.7792, "step": 1637 }, { "epoch": 10.636363636363637, "grad_norm": 2.1538960933685303, "learning_rate": 7.577583635450571e-05, "loss": 0.7158, "step": 1638 }, { "epoch": 10.642857142857142, "grad_norm": 2.1951744556427, "learning_rate": 7.574891160442179e-05, "loss": 0.7092, "step": 1639 }, { "epoch": 10.64935064935065, "grad_norm": 1.9684754610061646, "learning_rate": 7.572197668907532e-05, "loss": 0.6546, "step": 1640 }, { "epoch": 10.655844155844155, "grad_norm": 2.06463885307312, "learning_rate": 7.569503161909983e-05, "loss": 0.7007, "step": 1641 }, { "epoch": 10.662337662337663, "grad_norm": 2.0905561447143555, "learning_rate": 7.566807640513278e-05, "loss": 0.7559, "step": 1642 }, { "epoch": 10.668831168831169, "grad_norm": 2.309191942214966, "learning_rate": 7.564111105781568e-05, "loss": 0.7564, "step": 1643 }, { "epoch": 10.675324675324676, "grad_norm": 2.3112220764160156, "learning_rate": 7.561413558779402e-05, "loss": 0.7723, "step": 1644 }, { "epoch": 10.681818181818182, "grad_norm": 2.321855068206787, "learning_rate": 7.558715000571726e-05, "loss": 0.7907, "step": 1645 }, { "epoch": 10.688311688311689, "grad_norm": 2.232640027999878, "learning_rate": 7.55601543222389e-05, "loss": 0.794, "step": 1646 }, { "epoch": 10.694805194805195, "grad_norm": 1.7704936265945435, "learning_rate": 7.553314854801641e-05, "loss": 0.545, "step": 1647 }, { "epoch": 10.7012987012987, "grad_norm": 2.0758585929870605, "learning_rate": 7.550613269371124e-05, "loss": 0.7549, "step": 1648 }, { "epoch": 10.707792207792208, "grad_norm": 2.173079252243042, "learning_rate": 7.547910676998883e-05, "loss": 0.7017, "step": 1649 }, { "epoch": 10.714285714285714, "grad_norm": 2.047727584838867, "learning_rate": 7.545207078751857e-05, "loss": 0.6592, "step": 1650 }, { "epoch": 10.720779220779221, "grad_norm": 2.197402000427246, "learning_rate": 7.542502475697385e-05, "loss": 0.6863, "step": 1651 }, { "epoch": 10.727272727272727, "grad_norm": 2.0295863151550293, "learning_rate": 7.5397968689032e-05, "loss": 0.6389, "step": 1652 }, { "epoch": 10.733766233766234, "grad_norm": 2.347686529159546, "learning_rate": 7.537090259437435e-05, "loss": 0.7571, "step": 1653 }, { "epoch": 10.74025974025974, "grad_norm": 2.224231719970703, "learning_rate": 7.534382648368616e-05, "loss": 0.6876, "step": 1654 }, { "epoch": 10.746753246753247, "grad_norm": 2.1998109817504883, "learning_rate": 7.531674036765664e-05, "loss": 0.7749, "step": 1655 }, { "epoch": 10.753246753246753, "grad_norm": 2.3216822147369385, "learning_rate": 7.528964425697896e-05, "loss": 0.7631, "step": 1656 }, { "epoch": 10.75974025974026, "grad_norm": 2.265143632888794, "learning_rate": 7.526253816235023e-05, "loss": 0.762, "step": 1657 }, { "epoch": 10.766233766233766, "grad_norm": 2.359100341796875, "learning_rate": 7.523542209447152e-05, "loss": 0.7892, "step": 1658 }, { "epoch": 10.772727272727273, "grad_norm": 2.244819164276123, "learning_rate": 7.52082960640478e-05, "loss": 0.7531, "step": 1659 }, { "epoch": 10.779220779220779, "grad_norm": 1.8464090824127197, "learning_rate": 7.518116008178805e-05, "loss": 0.6299, "step": 1660 }, { "epoch": 10.785714285714286, "grad_norm": 2.5119171142578125, "learning_rate": 7.515401415840509e-05, "loss": 0.7831, "step": 1661 }, { "epoch": 10.792207792207792, "grad_norm": 2.5271401405334473, "learning_rate": 7.512685830461568e-05, "loss": 0.8111, "step": 1662 }, { "epoch": 10.7987012987013, "grad_norm": 2.099280595779419, "learning_rate": 7.509969253114055e-05, "loss": 0.7015, "step": 1663 }, { "epoch": 10.805194805194805, "grad_norm": 2.3430895805358887, "learning_rate": 7.507251684870433e-05, "loss": 0.7549, "step": 1664 }, { "epoch": 10.811688311688311, "grad_norm": 2.0067107677459717, "learning_rate": 7.50453312680355e-05, "loss": 0.6117, "step": 1665 }, { "epoch": 10.818181818181818, "grad_norm": 2.2068874835968018, "learning_rate": 7.501813579986656e-05, "loss": 0.7337, "step": 1666 }, { "epoch": 10.824675324675324, "grad_norm": 2.0831525325775146, "learning_rate": 7.499093045493379e-05, "loss": 0.6972, "step": 1667 }, { "epoch": 10.831168831168831, "grad_norm": 2.1348683834075928, "learning_rate": 7.496371524397746e-05, "loss": 0.6775, "step": 1668 }, { "epoch": 10.837662337662337, "grad_norm": 2.137401580810547, "learning_rate": 7.493649017774171e-05, "loss": 0.7376, "step": 1669 }, { "epoch": 10.844155844155845, "grad_norm": 2.398810625076294, "learning_rate": 7.490925526697455e-05, "loss": 0.7626, "step": 1670 }, { "epoch": 10.85064935064935, "grad_norm": 1.9947322607040405, "learning_rate": 7.48820105224279e-05, "loss": 0.6033, "step": 1671 }, { "epoch": 10.857142857142858, "grad_norm": 2.4584691524505615, "learning_rate": 7.485475595485756e-05, "loss": 0.806, "step": 1672 }, { "epoch": 10.863636363636363, "grad_norm": 2.294435739517212, "learning_rate": 7.482749157502318e-05, "loss": 0.7604, "step": 1673 }, { "epoch": 10.87012987012987, "grad_norm": 2.5067639350891113, "learning_rate": 7.480021739368833e-05, "loss": 0.7548, "step": 1674 }, { "epoch": 10.876623376623376, "grad_norm": 1.932142972946167, "learning_rate": 7.477293342162039e-05, "loss": 0.6518, "step": 1675 }, { "epoch": 10.883116883116884, "grad_norm": 2.36539626121521, "learning_rate": 7.474563966959067e-05, "loss": 0.7516, "step": 1676 }, { "epoch": 10.88961038961039, "grad_norm": 2.2556207180023193, "learning_rate": 7.471833614837431e-05, "loss": 0.737, "step": 1677 }, { "epoch": 10.896103896103895, "grad_norm": 2.3657636642456055, "learning_rate": 7.469102286875029e-05, "loss": 0.7405, "step": 1678 }, { "epoch": 10.902597402597403, "grad_norm": 1.943218469619751, "learning_rate": 7.46636998415015e-05, "loss": 0.6355, "step": 1679 }, { "epoch": 10.909090909090908, "grad_norm": 2.4874868392944336, "learning_rate": 7.463636707741458e-05, "loss": 0.793, "step": 1680 }, { "epoch": 10.915584415584416, "grad_norm": 2.346367597579956, "learning_rate": 7.460902458728012e-05, "loss": 0.749, "step": 1681 }, { "epoch": 10.922077922077921, "grad_norm": 2.232698678970337, "learning_rate": 7.458167238189248e-05, "loss": 0.7215, "step": 1682 }, { "epoch": 10.928571428571429, "grad_norm": 2.218801259994507, "learning_rate": 7.455431047204988e-05, "loss": 0.782, "step": 1683 }, { "epoch": 10.935064935064934, "grad_norm": 2.312042713165283, "learning_rate": 7.452693886855438e-05, "loss": 0.7656, "step": 1684 }, { "epoch": 10.941558441558442, "grad_norm": 2.2011115550994873, "learning_rate": 7.449955758221183e-05, "loss": 0.7446, "step": 1685 }, { "epoch": 10.948051948051948, "grad_norm": 2.228243350982666, "learning_rate": 7.447216662383196e-05, "loss": 0.6661, "step": 1686 }, { "epoch": 10.954545454545455, "grad_norm": 2.1731884479522705, "learning_rate": 7.444476600422828e-05, "loss": 0.7674, "step": 1687 }, { "epoch": 10.96103896103896, "grad_norm": 2.086169481277466, "learning_rate": 7.441735573421809e-05, "loss": 0.7016, "step": 1688 }, { "epoch": 10.967532467532468, "grad_norm": 2.0726120471954346, "learning_rate": 7.438993582462256e-05, "loss": 0.6815, "step": 1689 }, { "epoch": 10.974025974025974, "grad_norm": 2.0607035160064697, "learning_rate": 7.436250628626662e-05, "loss": 0.7075, "step": 1690 }, { "epoch": 10.980519480519481, "grad_norm": 2.23492169380188, "learning_rate": 7.433506712997904e-05, "loss": 0.7345, "step": 1691 }, { "epoch": 10.987012987012987, "grad_norm": 2.3279659748077393, "learning_rate": 7.430761836659235e-05, "loss": 0.7348, "step": 1692 }, { "epoch": 10.993506493506494, "grad_norm": 2.2870876789093018, "learning_rate": 7.428016000694286e-05, "loss": 0.7223, "step": 1693 }, { "epoch": 11.0, "grad_norm": 1362.704833984375, "learning_rate": 7.425269206187075e-05, "loss": 0.7249, "step": 1694 }, { "epoch": 11.006493506493506, "grad_norm": 1.971757173538208, "learning_rate": 7.42252145422199e-05, "loss": 0.5723, "step": 1695 }, { "epoch": 11.012987012987013, "grad_norm": 2.09525203704834, "learning_rate": 7.4197727458838e-05, "loss": 0.6099, "step": 1696 }, { "epoch": 11.019480519480519, "grad_norm": 1.9457815885543823, "learning_rate": 7.417023082257652e-05, "loss": 0.5656, "step": 1697 }, { "epoch": 11.025974025974026, "grad_norm": 2.0404152870178223, "learning_rate": 7.414272464429068e-05, "loss": 0.6151, "step": 1698 }, { "epoch": 11.032467532467532, "grad_norm": 2.177133560180664, "learning_rate": 7.411520893483951e-05, "loss": 0.6507, "step": 1699 }, { "epoch": 11.03896103896104, "grad_norm": 2.029690980911255, "learning_rate": 7.408768370508576e-05, "loss": 0.6013, "step": 1700 }, { "epoch": 11.045454545454545, "grad_norm": 2.175691604614258, "learning_rate": 7.406014896589597e-05, "loss": 0.6494, "step": 1701 }, { "epoch": 11.051948051948052, "grad_norm": 1.9023364782333374, "learning_rate": 7.403260472814039e-05, "loss": 0.5249, "step": 1702 }, { "epoch": 11.058441558441558, "grad_norm": 1.9307712316513062, "learning_rate": 7.400505100269308e-05, "loss": 0.605, "step": 1703 }, { "epoch": 11.064935064935066, "grad_norm": 2.0047030448913574, "learning_rate": 7.397748780043179e-05, "loss": 0.593, "step": 1704 }, { "epoch": 11.071428571428571, "grad_norm": 2.054450750350952, "learning_rate": 7.394991513223806e-05, "loss": 0.6133, "step": 1705 }, { "epoch": 11.077922077922079, "grad_norm": 2.20436954498291, "learning_rate": 7.392233300899712e-05, "loss": 0.6176, "step": 1706 }, { "epoch": 11.084415584415584, "grad_norm": 2.08552622795105, "learning_rate": 7.389474144159796e-05, "loss": 0.5731, "step": 1707 }, { "epoch": 11.090909090909092, "grad_norm": 1.974770188331604, "learning_rate": 7.38671404409333e-05, "loss": 0.5748, "step": 1708 }, { "epoch": 11.097402597402597, "grad_norm": 2.1373484134674072, "learning_rate": 7.38395300178996e-05, "loss": 0.6429, "step": 1709 }, { "epoch": 11.103896103896103, "grad_norm": 2.1213269233703613, "learning_rate": 7.381191018339696e-05, "loss": 0.6421, "step": 1710 }, { "epoch": 11.11038961038961, "grad_norm": 2.0944926738739014, "learning_rate": 7.378428094832931e-05, "loss": 0.6206, "step": 1711 }, { "epoch": 11.116883116883116, "grad_norm": 2.2668039798736572, "learning_rate": 7.37566423236042e-05, "loss": 0.658, "step": 1712 }, { "epoch": 11.123376623376624, "grad_norm": 1.8874119520187378, "learning_rate": 7.372899432013294e-05, "loss": 0.5201, "step": 1713 }, { "epoch": 11.12987012987013, "grad_norm": 2.1221206188201904, "learning_rate": 7.370133694883051e-05, "loss": 0.6018, "step": 1714 }, { "epoch": 11.136363636363637, "grad_norm": 1.8658677339553833, "learning_rate": 7.36736702206156e-05, "loss": 0.5639, "step": 1715 }, { "epoch": 11.142857142857142, "grad_norm": 2.2183215618133545, "learning_rate": 7.364599414641064e-05, "loss": 0.667, "step": 1716 }, { "epoch": 11.14935064935065, "grad_norm": 2.2229409217834473, "learning_rate": 7.361830873714165e-05, "loss": 0.651, "step": 1717 }, { "epoch": 11.155844155844155, "grad_norm": 2.024958610534668, "learning_rate": 7.35906140037384e-05, "loss": 0.5851, "step": 1718 }, { "epoch": 11.162337662337663, "grad_norm": 2.61043643951416, "learning_rate": 7.356290995713437e-05, "loss": 0.7094, "step": 1719 }, { "epoch": 11.168831168831169, "grad_norm": 1.994360089302063, "learning_rate": 7.353519660826665e-05, "loss": 0.597, "step": 1720 }, { "epoch": 11.175324675324676, "grad_norm": 2.4047257900238037, "learning_rate": 7.350747396807601e-05, "loss": 0.6236, "step": 1721 }, { "epoch": 11.181818181818182, "grad_norm": 2.2192437648773193, "learning_rate": 7.347974204750696e-05, "loss": 0.6513, "step": 1722 }, { "epoch": 11.188311688311689, "grad_norm": 2.100661039352417, "learning_rate": 7.345200085750757e-05, "loss": 0.6196, "step": 1723 }, { "epoch": 11.194805194805195, "grad_norm": 1.9950624704360962, "learning_rate": 7.342425040902967e-05, "loss": 0.588, "step": 1724 }, { "epoch": 11.2012987012987, "grad_norm": 2.0585994720458984, "learning_rate": 7.339649071302867e-05, "loss": 0.6089, "step": 1725 }, { "epoch": 11.207792207792208, "grad_norm": 2.2247488498687744, "learning_rate": 7.336872178046368e-05, "loss": 0.702, "step": 1726 }, { "epoch": 11.214285714285714, "grad_norm": 2.0867512226104736, "learning_rate": 7.334094362229739e-05, "loss": 0.5719, "step": 1727 }, { "epoch": 11.220779220779221, "grad_norm": 2.1678314208984375, "learning_rate": 7.331315624949625e-05, "loss": 0.6033, "step": 1728 }, { "epoch": 11.227272727272727, "grad_norm": 1.8890186548233032, "learning_rate": 7.32853596730302e-05, "loss": 0.5229, "step": 1729 }, { "epoch": 11.233766233766234, "grad_norm": 1.9374103546142578, "learning_rate": 7.325755390387292e-05, "loss": 0.5743, "step": 1730 }, { "epoch": 11.24025974025974, "grad_norm": 2.0223710536956787, "learning_rate": 7.32297389530017e-05, "loss": 0.5544, "step": 1731 }, { "epoch": 11.246753246753247, "grad_norm": 1.9823747873306274, "learning_rate": 7.320191483139742e-05, "loss": 0.5758, "step": 1732 }, { "epoch": 11.253246753246753, "grad_norm": 2.0667495727539062, "learning_rate": 7.317408155004462e-05, "loss": 0.5883, "step": 1733 }, { "epoch": 11.25974025974026, "grad_norm": 2.946258306503296, "learning_rate": 7.314623911993142e-05, "loss": 0.6296, "step": 1734 }, { "epoch": 11.266233766233766, "grad_norm": 2.1733124256134033, "learning_rate": 7.311838755204959e-05, "loss": 0.6713, "step": 1735 }, { "epoch": 11.272727272727273, "grad_norm": 2.237227201461792, "learning_rate": 7.309052685739448e-05, "loss": 0.6404, "step": 1736 }, { "epoch": 11.279220779220779, "grad_norm": 2.2390263080596924, "learning_rate": 7.306265704696504e-05, "loss": 0.65, "step": 1737 }, { "epoch": 11.285714285714286, "grad_norm": 1.8883891105651855, "learning_rate": 7.303477813176385e-05, "loss": 0.5331, "step": 1738 }, { "epoch": 11.292207792207792, "grad_norm": 1.9942971467971802, "learning_rate": 7.300689012279706e-05, "loss": 0.5833, "step": 1739 }, { "epoch": 11.2987012987013, "grad_norm": 2.324103355407715, "learning_rate": 7.297899303107441e-05, "loss": 0.6633, "step": 1740 }, { "epoch": 11.305194805194805, "grad_norm": 2.0713562965393066, "learning_rate": 7.29510868676092e-05, "loss": 0.6185, "step": 1741 }, { "epoch": 11.311688311688311, "grad_norm": 2.018885612487793, "learning_rate": 7.29231716434184e-05, "loss": 0.6032, "step": 1742 }, { "epoch": 11.318181818181818, "grad_norm": 2.5611448287963867, "learning_rate": 7.289524736952245e-05, "loss": 0.7274, "step": 1743 }, { "epoch": 11.324675324675324, "grad_norm": 2.0800020694732666, "learning_rate": 7.286731405694544e-05, "loss": 0.6414, "step": 1744 }, { "epoch": 11.331168831168831, "grad_norm": 1.932077169418335, "learning_rate": 7.283937171671498e-05, "loss": 0.5886, "step": 1745 }, { "epoch": 11.337662337662337, "grad_norm": 2.0475950241088867, "learning_rate": 7.281142035986227e-05, "loss": 0.6354, "step": 1746 }, { "epoch": 11.344155844155845, "grad_norm": 2.118962287902832, "learning_rate": 7.278345999742208e-05, "loss": 0.6537, "step": 1747 }, { "epoch": 11.35064935064935, "grad_norm": 2.1595616340637207, "learning_rate": 7.275549064043268e-05, "loss": 0.6664, "step": 1748 }, { "epoch": 11.357142857142858, "grad_norm": 2.058976411819458, "learning_rate": 7.272751229993598e-05, "loss": 0.5731, "step": 1749 }, { "epoch": 11.363636363636363, "grad_norm": 2.0019469261169434, "learning_rate": 7.269952498697734e-05, "loss": 0.5478, "step": 1750 }, { "epoch": 11.37012987012987, "grad_norm": 1.9929718971252441, "learning_rate": 7.267152871260573e-05, "loss": 0.566, "step": 1751 }, { "epoch": 11.376623376623376, "grad_norm": 2.4305410385131836, "learning_rate": 7.264352348787364e-05, "loss": 0.6865, "step": 1752 }, { "epoch": 11.383116883116884, "grad_norm": 2.2349486351013184, "learning_rate": 7.261550932383707e-05, "loss": 0.5859, "step": 1753 }, { "epoch": 11.38961038961039, "grad_norm": 2.297055959701538, "learning_rate": 7.258748623155558e-05, "loss": 0.6629, "step": 1754 }, { "epoch": 11.396103896103897, "grad_norm": 2.401366949081421, "learning_rate": 7.255945422209227e-05, "loss": 0.7201, "step": 1755 }, { "epoch": 11.402597402597403, "grad_norm": 2.1199653148651123, "learning_rate": 7.253141330651368e-05, "loss": 0.6021, "step": 1756 }, { "epoch": 11.409090909090908, "grad_norm": 2.2622432708740234, "learning_rate": 7.250336349588994e-05, "loss": 0.6528, "step": 1757 }, { "epoch": 11.415584415584416, "grad_norm": 2.107171058654785, "learning_rate": 7.247530480129469e-05, "loss": 0.6177, "step": 1758 }, { "epoch": 11.422077922077921, "grad_norm": 2.6965367794036865, "learning_rate": 7.244723723380505e-05, "loss": 0.724, "step": 1759 }, { "epoch": 11.428571428571429, "grad_norm": 2.36151385307312, "learning_rate": 7.241916080450163e-05, "loss": 0.6813, "step": 1760 }, { "epoch": 11.435064935064934, "grad_norm": 2.007305860519409, "learning_rate": 7.239107552446857e-05, "loss": 0.6151, "step": 1761 }, { "epoch": 11.441558441558442, "grad_norm": 2.315720319747925, "learning_rate": 7.236298140479351e-05, "loss": 0.6959, "step": 1762 }, { "epoch": 11.448051948051948, "grad_norm": 2.2148401737213135, "learning_rate": 7.233487845656754e-05, "loss": 0.6723, "step": 1763 }, { "epoch": 11.454545454545455, "grad_norm": 2.250305414199829, "learning_rate": 7.23067666908853e-05, "loss": 0.6875, "step": 1764 }, { "epoch": 11.46103896103896, "grad_norm": 2.0240795612335205, "learning_rate": 7.227864611884483e-05, "loss": 0.6607, "step": 1765 }, { "epoch": 11.467532467532468, "grad_norm": 2.3691110610961914, "learning_rate": 7.225051675154767e-05, "loss": 0.7038, "step": 1766 }, { "epoch": 11.474025974025974, "grad_norm": 2.293552875518799, "learning_rate": 7.222237860009892e-05, "loss": 0.7139, "step": 1767 }, { "epoch": 11.480519480519481, "grad_norm": 2.2120654582977295, "learning_rate": 7.219423167560701e-05, "loss": 0.6508, "step": 1768 }, { "epoch": 11.487012987012987, "grad_norm": 1.8835054636001587, "learning_rate": 7.216607598918393e-05, "loss": 0.4962, "step": 1769 }, { "epoch": 11.493506493506494, "grad_norm": 2.0173633098602295, "learning_rate": 7.21379115519451e-05, "loss": 0.5846, "step": 1770 }, { "epoch": 11.5, "grad_norm": 2.0470130443573, "learning_rate": 7.210973837500937e-05, "loss": 0.6357, "step": 1771 }, { "epoch": 11.506493506493506, "grad_norm": 2.2059829235076904, "learning_rate": 7.208155646949908e-05, "loss": 0.6786, "step": 1772 }, { "epoch": 11.512987012987013, "grad_norm": 2.187005043029785, "learning_rate": 7.205336584653999e-05, "loss": 0.6637, "step": 1773 }, { "epoch": 11.519480519480519, "grad_norm": 2.1610119342803955, "learning_rate": 7.202516651726134e-05, "loss": 0.6523, "step": 1774 }, { "epoch": 11.525974025974026, "grad_norm": 2.28117036819458, "learning_rate": 7.199695849279576e-05, "loss": 0.6649, "step": 1775 }, { "epoch": 11.532467532467532, "grad_norm": 2.059276580810547, "learning_rate": 7.196874178427933e-05, "loss": 0.6081, "step": 1776 }, { "epoch": 11.53896103896104, "grad_norm": 2.2199676036834717, "learning_rate": 7.194051640285157e-05, "loss": 0.6859, "step": 1777 }, { "epoch": 11.545454545454545, "grad_norm": 2.2723867893218994, "learning_rate": 7.191228235965538e-05, "loss": 0.6717, "step": 1778 }, { "epoch": 11.551948051948052, "grad_norm": 2.3416223526000977, "learning_rate": 7.188403966583716e-05, "loss": 0.6552, "step": 1779 }, { "epoch": 11.558441558441558, "grad_norm": 2.1369590759277344, "learning_rate": 7.185578833254664e-05, "loss": 0.6509, "step": 1780 }, { "epoch": 11.564935064935066, "grad_norm": 2.2083840370178223, "learning_rate": 7.182752837093702e-05, "loss": 0.6168, "step": 1781 }, { "epoch": 11.571428571428571, "grad_norm": 1.9752053022384644, "learning_rate": 7.179925979216491e-05, "loss": 0.6355, "step": 1782 }, { "epoch": 11.577922077922079, "grad_norm": 2.114065408706665, "learning_rate": 7.177098260739024e-05, "loss": 0.6583, "step": 1783 }, { "epoch": 11.584415584415584, "grad_norm": 2.2280924320220947, "learning_rate": 7.174269682777641e-05, "loss": 0.665, "step": 1784 }, { "epoch": 11.590909090909092, "grad_norm": 2.106236696243286, "learning_rate": 7.171440246449024e-05, "loss": 0.6574, "step": 1785 }, { "epoch": 11.597402597402597, "grad_norm": 2.398568630218506, "learning_rate": 7.168609952870184e-05, "loss": 0.6788, "step": 1786 }, { "epoch": 11.603896103896105, "grad_norm": 2.0673725605010986, "learning_rate": 7.16577880315848e-05, "loss": 0.6203, "step": 1787 }, { "epoch": 11.61038961038961, "grad_norm": 2.2148146629333496, "learning_rate": 7.162946798431604e-05, "loss": 0.7175, "step": 1788 }, { "epoch": 11.616883116883116, "grad_norm": 2.2335145473480225, "learning_rate": 7.160113939807587e-05, "loss": 0.6621, "step": 1789 }, { "epoch": 11.623376623376624, "grad_norm": 2.2381131649017334, "learning_rate": 7.157280228404795e-05, "loss": 0.6576, "step": 1790 }, { "epoch": 11.62987012987013, "grad_norm": 2.1792495250701904, "learning_rate": 7.154445665341933e-05, "loss": 0.6911, "step": 1791 }, { "epoch": 11.636363636363637, "grad_norm": 2.3321566581726074, "learning_rate": 7.151610251738045e-05, "loss": 0.6945, "step": 1792 }, { "epoch": 11.642857142857142, "grad_norm": 2.0206103324890137, "learning_rate": 7.148773988712503e-05, "loss": 0.6102, "step": 1793 }, { "epoch": 11.64935064935065, "grad_norm": 2.0455446243286133, "learning_rate": 7.145936877385018e-05, "loss": 0.6816, "step": 1794 }, { "epoch": 11.655844155844155, "grad_norm": 2.3636724948883057, "learning_rate": 7.143098918875643e-05, "loss": 0.6943, "step": 1795 }, { "epoch": 11.662337662337663, "grad_norm": 2.1322073936462402, "learning_rate": 7.140260114304751e-05, "loss": 0.7023, "step": 1796 }, { "epoch": 11.668831168831169, "grad_norm": 2.060019016265869, "learning_rate": 7.137420464793063e-05, "loss": 0.5799, "step": 1797 }, { "epoch": 11.675324675324676, "grad_norm": 1.8431174755096436, "learning_rate": 7.134579971461627e-05, "loss": 0.5107, "step": 1798 }, { "epoch": 11.681818181818182, "grad_norm": 2.0622732639312744, "learning_rate": 7.131738635431822e-05, "loss": 0.6045, "step": 1799 }, { "epoch": 11.688311688311689, "grad_norm": 1.8862097263336182, "learning_rate": 7.128896457825364e-05, "loss": 0.6123, "step": 1800 }, { "epoch": 11.694805194805195, "grad_norm": 2.087017297744751, "learning_rate": 7.126053439764299e-05, "loss": 0.6562, "step": 1801 }, { "epoch": 11.7012987012987, "grad_norm": 1.8770846128463745, "learning_rate": 7.123209582371006e-05, "loss": 0.6068, "step": 1802 }, { "epoch": 11.707792207792208, "grad_norm": 2.0145421028137207, "learning_rate": 7.120364886768197e-05, "loss": 0.645, "step": 1803 }, { "epoch": 11.714285714285714, "grad_norm": 1.9685002565383911, "learning_rate": 7.11751935407891e-05, "loss": 0.6093, "step": 1804 }, { "epoch": 11.720779220779221, "grad_norm": 2.0800533294677734, "learning_rate": 7.114672985426516e-05, "loss": 0.6473, "step": 1805 }, { "epoch": 11.727272727272727, "grad_norm": 1.9733763933181763, "learning_rate": 7.111825781934719e-05, "loss": 0.637, "step": 1806 }, { "epoch": 11.733766233766234, "grad_norm": 2.161607265472412, "learning_rate": 7.108977744727547e-05, "loss": 0.6241, "step": 1807 }, { "epoch": 11.74025974025974, "grad_norm": 2.179612636566162, "learning_rate": 7.106128874929363e-05, "loss": 0.7628, "step": 1808 }, { "epoch": 11.746753246753247, "grad_norm": 2.0560779571533203, "learning_rate": 7.103279173664851e-05, "loss": 0.5887, "step": 1809 }, { "epoch": 11.753246753246753, "grad_norm": 2.240081548690796, "learning_rate": 7.100428642059033e-05, "loss": 0.733, "step": 1810 }, { "epoch": 11.75974025974026, "grad_norm": 2.139436960220337, "learning_rate": 7.097577281237249e-05, "loss": 0.6425, "step": 1811 }, { "epoch": 11.766233766233766, "grad_norm": 2.2671170234680176, "learning_rate": 7.094725092325176e-05, "loss": 0.6823, "step": 1812 }, { "epoch": 11.772727272727273, "grad_norm": 2.1800804138183594, "learning_rate": 7.09187207644881e-05, "loss": 0.7122, "step": 1813 }, { "epoch": 11.779220779220779, "grad_norm": 1.9480642080307007, "learning_rate": 7.089018234734476e-05, "loss": 0.556, "step": 1814 }, { "epoch": 11.785714285714286, "grad_norm": 1.8281680345535278, "learning_rate": 7.086163568308828e-05, "loss": 0.5396, "step": 1815 }, { "epoch": 11.792207792207792, "grad_norm": 2.3980553150177, "learning_rate": 7.08330807829884e-05, "loss": 0.7549, "step": 1816 }, { "epoch": 11.7987012987013, "grad_norm": 2.317260503768921, "learning_rate": 7.080451765831817e-05, "loss": 0.6985, "step": 1817 }, { "epoch": 11.805194805194805, "grad_norm": 2.4132368564605713, "learning_rate": 7.077594632035385e-05, "loss": 0.7135, "step": 1818 }, { "epoch": 11.811688311688311, "grad_norm": 2.272742509841919, "learning_rate": 7.074736678037494e-05, "loss": 0.668, "step": 1819 }, { "epoch": 11.818181818181818, "grad_norm": 2.309525728225708, "learning_rate": 7.071877904966423e-05, "loss": 0.7512, "step": 1820 }, { "epoch": 11.824675324675324, "grad_norm": 2.0604569911956787, "learning_rate": 7.069018313950763e-05, "loss": 0.6374, "step": 1821 }, { "epoch": 11.831168831168831, "grad_norm": 2.2944161891937256, "learning_rate": 7.066157906119441e-05, "loss": 0.7265, "step": 1822 }, { "epoch": 11.837662337662337, "grad_norm": 2.2441184520721436, "learning_rate": 7.063296682601702e-05, "loss": 0.6809, "step": 1823 }, { "epoch": 11.844155844155845, "grad_norm": 2.2128078937530518, "learning_rate": 7.060434644527105e-05, "loss": 0.6976, "step": 1824 }, { "epoch": 11.85064935064935, "grad_norm": 2.045984983444214, "learning_rate": 7.057571793025544e-05, "loss": 0.6242, "step": 1825 }, { "epoch": 11.857142857142858, "grad_norm": 2.0411674976348877, "learning_rate": 7.054708129227224e-05, "loss": 0.6031, "step": 1826 }, { "epoch": 11.863636363636363, "grad_norm": 2.0572099685668945, "learning_rate": 7.051843654262676e-05, "loss": 0.5997, "step": 1827 }, { "epoch": 11.87012987012987, "grad_norm": 2.142810821533203, "learning_rate": 7.048978369262747e-05, "loss": 0.5677, "step": 1828 }, { "epoch": 11.876623376623376, "grad_norm": 2.1098294258117676, "learning_rate": 7.046112275358608e-05, "loss": 0.6425, "step": 1829 }, { "epoch": 11.883116883116884, "grad_norm": 2.272458791732788, "learning_rate": 7.043245373681747e-05, "loss": 0.7284, "step": 1830 }, { "epoch": 11.88961038961039, "grad_norm": 2.2965171337127686, "learning_rate": 7.040377665363969e-05, "loss": 0.6809, "step": 1831 }, { "epoch": 11.896103896103895, "grad_norm": 2.2099809646606445, "learning_rate": 7.037509151537403e-05, "loss": 0.6389, "step": 1832 }, { "epoch": 11.902597402597403, "grad_norm": 2.201387882232666, "learning_rate": 7.034639833334494e-05, "loss": 0.6783, "step": 1833 }, { "epoch": 11.909090909090908, "grad_norm": 2.2194228172302246, "learning_rate": 7.031769711888e-05, "loss": 0.6866, "step": 1834 }, { "epoch": 11.915584415584416, "grad_norm": 2.3768062591552734, "learning_rate": 7.028898788331e-05, "loss": 0.7316, "step": 1835 }, { "epoch": 11.922077922077921, "grad_norm": 2.1263155937194824, "learning_rate": 7.026027063796891e-05, "loss": 0.6822, "step": 1836 }, { "epoch": 11.928571428571429, "grad_norm": 2.297542095184326, "learning_rate": 7.023154539419384e-05, "loss": 0.6611, "step": 1837 }, { "epoch": 11.935064935064934, "grad_norm": 2.1204769611358643, "learning_rate": 7.020281216332503e-05, "loss": 0.658, "step": 1838 }, { "epoch": 11.941558441558442, "grad_norm": 1.9579997062683105, "learning_rate": 7.017407095670593e-05, "loss": 0.5524, "step": 1839 }, { "epoch": 11.948051948051948, "grad_norm": 2.3692562580108643, "learning_rate": 7.014532178568314e-05, "loss": 0.6995, "step": 1840 }, { "epoch": 11.954545454545455, "grad_norm": 2.3002681732177734, "learning_rate": 7.011656466160632e-05, "loss": 0.7267, "step": 1841 }, { "epoch": 11.96103896103896, "grad_norm": 2.3371801376342773, "learning_rate": 7.008779959582837e-05, "loss": 0.7695, "step": 1842 }, { "epoch": 11.967532467532468, "grad_norm": 2.0627670288085938, "learning_rate": 7.005902659970528e-05, "loss": 0.635, "step": 1843 }, { "epoch": 11.974025974025974, "grad_norm": 2.0339272022247314, "learning_rate": 7.003024568459614e-05, "loss": 0.6353, "step": 1844 }, { "epoch": 11.980519480519481, "grad_norm": 2.1179451942443848, "learning_rate": 7.000145686186324e-05, "loss": 0.6786, "step": 1845 }, { "epoch": 11.987012987012987, "grad_norm": 2.1404683589935303, "learning_rate": 6.997266014287193e-05, "loss": 0.7124, "step": 1846 }, { "epoch": 11.993506493506494, "grad_norm": 2.2253522872924805, "learning_rate": 6.994385553899069e-05, "loss": 0.7233, "step": 1847 }, { "epoch": 12.0, "grad_norm": 1076.5802001953125, "learning_rate": 6.991504306159114e-05, "loss": 0.6579, "step": 1848 }, { "epoch": 12.006493506493506, "grad_norm": 1.9225164651870728, "learning_rate": 6.988622272204799e-05, "loss": 0.5623, "step": 1849 }, { "epoch": 12.012987012987013, "grad_norm": 1.8583208322525024, "learning_rate": 6.985739453173903e-05, "loss": 0.53, "step": 1850 }, { "epoch": 12.019480519480519, "grad_norm": 1.985059380531311, "learning_rate": 6.98285585020452e-05, "loss": 0.5346, "step": 1851 }, { "epoch": 12.025974025974026, "grad_norm": 2.0284557342529297, "learning_rate": 6.97997146443505e-05, "loss": 0.5592, "step": 1852 }, { "epoch": 12.032467532467532, "grad_norm": 2.00041127204895, "learning_rate": 6.977086297004202e-05, "loss": 0.5619, "step": 1853 }, { "epoch": 12.03896103896104, "grad_norm": 2.134187936782837, "learning_rate": 6.974200349050996e-05, "loss": 0.6185, "step": 1854 }, { "epoch": 12.045454545454545, "grad_norm": 1.9643243551254272, "learning_rate": 6.971313621714756e-05, "loss": 0.5066, "step": 1855 }, { "epoch": 12.051948051948052, "grad_norm": 2.0005311965942383, "learning_rate": 6.968426116135118e-05, "loss": 0.591, "step": 1856 }, { "epoch": 12.058441558441558, "grad_norm": 1.8810878992080688, "learning_rate": 6.965537833452024e-05, "loss": 0.4961, "step": 1857 }, { "epoch": 12.064935064935066, "grad_norm": 1.9006643295288086, "learning_rate": 6.96264877480572e-05, "loss": 0.5309, "step": 1858 }, { "epoch": 12.071428571428571, "grad_norm": 2.020428419113159, "learning_rate": 6.959758941336762e-05, "loss": 0.5334, "step": 1859 }, { "epoch": 12.077922077922079, "grad_norm": 1.9589020013809204, "learning_rate": 6.956868334186013e-05, "loss": 0.5894, "step": 1860 }, { "epoch": 12.084415584415584, "grad_norm": 1.9384912252426147, "learning_rate": 6.953976954494635e-05, "loss": 0.5343, "step": 1861 }, { "epoch": 12.090909090909092, "grad_norm": 2.0977766513824463, "learning_rate": 6.9510848034041e-05, "loss": 0.5726, "step": 1862 }, { "epoch": 12.097402597402597, "grad_norm": 2.350893497467041, "learning_rate": 6.948191882056185e-05, "loss": 0.5778, "step": 1863 }, { "epoch": 12.103896103896103, "grad_norm": 1.8108704090118408, "learning_rate": 6.945298191592967e-05, "loss": 0.4941, "step": 1864 }, { "epoch": 12.11038961038961, "grad_norm": 1.985229253768921, "learning_rate": 6.942403733156832e-05, "loss": 0.5245, "step": 1865 }, { "epoch": 12.116883116883116, "grad_norm": 2.0757787227630615, "learning_rate": 6.939508507890464e-05, "loss": 0.5347, "step": 1866 }, { "epoch": 12.123376623376624, "grad_norm": 2.1860249042510986, "learning_rate": 6.936612516936852e-05, "loss": 0.5951, "step": 1867 }, { "epoch": 12.12987012987013, "grad_norm": 2.0259830951690674, "learning_rate": 6.93371576143929e-05, "loss": 0.5316, "step": 1868 }, { "epoch": 12.136363636363637, "grad_norm": 1.9689854383468628, "learning_rate": 6.930818242541368e-05, "loss": 0.5568, "step": 1869 }, { "epoch": 12.142857142857142, "grad_norm": 2.054041862487793, "learning_rate": 6.927919961386984e-05, "loss": 0.5711, "step": 1870 }, { "epoch": 12.14935064935065, "grad_norm": 1.7612172365188599, "learning_rate": 6.92502091912033e-05, "loss": 0.4729, "step": 1871 }, { "epoch": 12.155844155844155, "grad_norm": 2.099085569381714, "learning_rate": 6.922121116885903e-05, "loss": 0.5802, "step": 1872 }, { "epoch": 12.162337662337663, "grad_norm": 2.230409860610962, "learning_rate": 6.919220555828502e-05, "loss": 0.5908, "step": 1873 }, { "epoch": 12.168831168831169, "grad_norm": 2.0448951721191406, "learning_rate": 6.916319237093219e-05, "loss": 0.5871, "step": 1874 }, { "epoch": 12.175324675324676, "grad_norm": 1.6726332902908325, "learning_rate": 6.91341716182545e-05, "loss": 0.4441, "step": 1875 }, { "epoch": 12.181818181818182, "grad_norm": 1.9495742321014404, "learning_rate": 6.910514331170888e-05, "loss": 0.5741, "step": 1876 }, { "epoch": 12.188311688311689, "grad_norm": 1.893748164176941, "learning_rate": 6.907610746275523e-05, "loss": 0.5568, "step": 1877 }, { "epoch": 12.194805194805195, "grad_norm": 2.1908624172210693, "learning_rate": 6.904706408285648e-05, "loss": 0.6201, "step": 1878 }, { "epoch": 12.2012987012987, "grad_norm": 2.14664363861084, "learning_rate": 6.901801318347847e-05, "loss": 0.6187, "step": 1879 }, { "epoch": 12.207792207792208, "grad_norm": 2.0984911918640137, "learning_rate": 6.898895477609007e-05, "loss": 0.608, "step": 1880 }, { "epoch": 12.214285714285714, "grad_norm": 2.1119325160980225, "learning_rate": 6.895988887216302e-05, "loss": 0.6473, "step": 1881 }, { "epoch": 12.220779220779221, "grad_norm": 2.1093862056732178, "learning_rate": 6.893081548317211e-05, "loss": 0.6126, "step": 1882 }, { "epoch": 12.227272727272727, "grad_norm": 2.203214645385742, "learning_rate": 6.890173462059506e-05, "loss": 0.6142, "step": 1883 }, { "epoch": 12.233766233766234, "grad_norm": 2.082893133163452, "learning_rate": 6.887264629591254e-05, "loss": 0.584, "step": 1884 }, { "epoch": 12.24025974025974, "grad_norm": 1.9005697965621948, "learning_rate": 6.884355052060814e-05, "loss": 0.494, "step": 1885 }, { "epoch": 12.246753246753247, "grad_norm": 2.1379759311676025, "learning_rate": 6.881444730616842e-05, "loss": 0.5557, "step": 1886 }, { "epoch": 12.253246753246753, "grad_norm": 1.7855029106140137, "learning_rate": 6.878533666408286e-05, "loss": 0.4827, "step": 1887 }, { "epoch": 12.25974025974026, "grad_norm": 2.110853433609009, "learning_rate": 6.87562186058439e-05, "loss": 0.5239, "step": 1888 }, { "epoch": 12.266233766233766, "grad_norm": 2.10565185546875, "learning_rate": 6.872709314294685e-05, "loss": 0.5986, "step": 1889 }, { "epoch": 12.272727272727273, "grad_norm": 2.247499465942383, "learning_rate": 6.869796028689001e-05, "loss": 0.5874, "step": 1890 }, { "epoch": 12.279220779220779, "grad_norm": 2.151932954788208, "learning_rate": 6.86688200491746e-05, "loss": 0.5771, "step": 1891 }, { "epoch": 12.285714285714286, "grad_norm": 1.9797194004058838, "learning_rate": 6.863967244130467e-05, "loss": 0.5452, "step": 1892 }, { "epoch": 12.292207792207792, "grad_norm": 2.0008790493011475, "learning_rate": 6.861051747478726e-05, "loss": 0.5515, "step": 1893 }, { "epoch": 12.2987012987013, "grad_norm": 2.157362937927246, "learning_rate": 6.858135516113226e-05, "loss": 0.5479, "step": 1894 }, { "epoch": 12.305194805194805, "grad_norm": 2.224337577819824, "learning_rate": 6.855218551185255e-05, "loss": 0.6381, "step": 1895 }, { "epoch": 12.311688311688311, "grad_norm": 1.938456416130066, "learning_rate": 6.852300853846381e-05, "loss": 0.5629, "step": 1896 }, { "epoch": 12.318181818181818, "grad_norm": 1.7481359243392944, "learning_rate": 6.849382425248464e-05, "loss": 0.4578, "step": 1897 }, { "epoch": 12.324675324675324, "grad_norm": 1.9388666152954102, "learning_rate": 6.84646326654365e-05, "loss": 0.5472, "step": 1898 }, { "epoch": 12.331168831168831, "grad_norm": 2.1877782344818115, "learning_rate": 6.843543378884387e-05, "loss": 0.6215, "step": 1899 }, { "epoch": 12.337662337662337, "grad_norm": 2.2899937629699707, "learning_rate": 6.840622763423391e-05, "loss": 0.6257, "step": 1900 }, { "epoch": 12.344155844155845, "grad_norm": 2.166365146636963, "learning_rate": 6.837701421313677e-05, "loss": 0.5647, "step": 1901 }, { "epoch": 12.35064935064935, "grad_norm": 2.299274444580078, "learning_rate": 6.834779353708548e-05, "loss": 0.635, "step": 1902 }, { "epoch": 12.357142857142858, "grad_norm": 2.1747353076934814, "learning_rate": 6.831856561761585e-05, "loss": 0.6053, "step": 1903 }, { "epoch": 12.363636363636363, "grad_norm": 2.0763461589813232, "learning_rate": 6.828933046626664e-05, "loss": 0.5943, "step": 1904 }, { "epoch": 12.37012987012987, "grad_norm": 2.133894920349121, "learning_rate": 6.82600880945794e-05, "loss": 0.5598, "step": 1905 }, { "epoch": 12.376623376623376, "grad_norm": 2.123075485229492, "learning_rate": 6.823083851409857e-05, "loss": 0.5706, "step": 1906 }, { "epoch": 12.383116883116884, "grad_norm": 2.181898832321167, "learning_rate": 6.820158173637142e-05, "loss": 0.5897, "step": 1907 }, { "epoch": 12.38961038961039, "grad_norm": 2.125399589538574, "learning_rate": 6.817231777294803e-05, "loss": 0.5686, "step": 1908 }, { "epoch": 12.396103896103897, "grad_norm": 2.233853578567505, "learning_rate": 6.814304663538142e-05, "loss": 0.6248, "step": 1909 }, { "epoch": 12.402597402597403, "grad_norm": 2.1631298065185547, "learning_rate": 6.811376833522729e-05, "loss": 0.591, "step": 1910 }, { "epoch": 12.409090909090908, "grad_norm": 2.2579503059387207, "learning_rate": 6.808448288404431e-05, "loss": 0.6415, "step": 1911 }, { "epoch": 12.415584415584416, "grad_norm": 2.1994619369506836, "learning_rate": 6.805519029339387e-05, "loss": 0.5854, "step": 1912 }, { "epoch": 12.422077922077921, "grad_norm": 1.9904730319976807, "learning_rate": 6.802589057484027e-05, "loss": 0.5822, "step": 1913 }, { "epoch": 12.428571428571429, "grad_norm": 2.0386736392974854, "learning_rate": 6.799658373995053e-05, "loss": 0.5614, "step": 1914 }, { "epoch": 12.435064935064934, "grad_norm": 2.15208101272583, "learning_rate": 6.796726980029454e-05, "loss": 0.5926, "step": 1915 }, { "epoch": 12.441558441558442, "grad_norm": 1.9997342824935913, "learning_rate": 6.793794876744499e-05, "loss": 0.5069, "step": 1916 }, { "epoch": 12.448051948051948, "grad_norm": 2.0709683895111084, "learning_rate": 6.790862065297732e-05, "loss": 0.5953, "step": 1917 }, { "epoch": 12.454545454545455, "grad_norm": 1.7772856950759888, "learning_rate": 6.787928546846987e-05, "loss": 0.5253, "step": 1918 }, { "epoch": 12.46103896103896, "grad_norm": 2.1581130027770996, "learning_rate": 6.784994322550365e-05, "loss": 0.5836, "step": 1919 }, { "epoch": 12.467532467532468, "grad_norm": 1.9727606773376465, "learning_rate": 6.782059393566253e-05, "loss": 0.5558, "step": 1920 }, { "epoch": 12.474025974025974, "grad_norm": 2.0561611652374268, "learning_rate": 6.779123761053317e-05, "loss": 0.5259, "step": 1921 }, { "epoch": 12.480519480519481, "grad_norm": 2.3199853897094727, "learning_rate": 6.776187426170493e-05, "loss": 0.6292, "step": 1922 }, { "epoch": 12.487012987012987, "grad_norm": 2.1790263652801514, "learning_rate": 6.773250390077006e-05, "loss": 0.5693, "step": 1923 }, { "epoch": 12.493506493506494, "grad_norm": 2.2785651683807373, "learning_rate": 6.770312653932345e-05, "loss": 0.6542, "step": 1924 }, { "epoch": 12.5, "grad_norm": 1.8668270111083984, "learning_rate": 6.767374218896286e-05, "loss": 0.5379, "step": 1925 }, { "epoch": 12.506493506493506, "grad_norm": 2.2081425189971924, "learning_rate": 6.764435086128876e-05, "loss": 0.5977, "step": 1926 }, { "epoch": 12.512987012987013, "grad_norm": 2.148101329803467, "learning_rate": 6.761495256790435e-05, "loss": 0.5996, "step": 1927 }, { "epoch": 12.519480519480519, "grad_norm": 1.9085726737976074, "learning_rate": 6.758554732041564e-05, "loss": 0.5474, "step": 1928 }, { "epoch": 12.525974025974026, "grad_norm": 1.6701866388320923, "learning_rate": 6.755613513043137e-05, "loss": 0.4075, "step": 1929 }, { "epoch": 12.532467532467532, "grad_norm": 2.0828919410705566, "learning_rate": 6.752671600956295e-05, "loss": 0.5761, "step": 1930 }, { "epoch": 12.53896103896104, "grad_norm": 2.122612237930298, "learning_rate": 6.749728996942463e-05, "loss": 0.5902, "step": 1931 }, { "epoch": 12.545454545454545, "grad_norm": 2.474417209625244, "learning_rate": 6.746785702163336e-05, "loss": 0.6628, "step": 1932 }, { "epoch": 12.551948051948052, "grad_norm": 2.0963616371154785, "learning_rate": 6.743841717780874e-05, "loss": 0.6209, "step": 1933 }, { "epoch": 12.558441558441558, "grad_norm": 1.8984389305114746, "learning_rate": 6.740897044957322e-05, "loss": 0.5674, "step": 1934 }, { "epoch": 12.564935064935066, "grad_norm": 2.200878620147705, "learning_rate": 6.737951684855185e-05, "loss": 0.6193, "step": 1935 }, { "epoch": 12.571428571428571, "grad_norm": 1.947191834449768, "learning_rate": 6.735005638637248e-05, "loss": 0.5233, "step": 1936 }, { "epoch": 12.577922077922079, "grad_norm": 1.9447444677352905, "learning_rate": 6.73205890746656e-05, "loss": 0.5566, "step": 1937 }, { "epoch": 12.584415584415584, "grad_norm": 2.252058267593384, "learning_rate": 6.729111492506449e-05, "loss": 0.648, "step": 1938 }, { "epoch": 12.590909090909092, "grad_norm": 1.99386465549469, "learning_rate": 6.726163394920503e-05, "loss": 0.5509, "step": 1939 }, { "epoch": 12.597402597402597, "grad_norm": 2.0936877727508545, "learning_rate": 6.723214615872585e-05, "loss": 0.572, "step": 1940 }, { "epoch": 12.603896103896105, "grad_norm": 2.110973596572876, "learning_rate": 6.720265156526828e-05, "loss": 0.5817, "step": 1941 }, { "epoch": 12.61038961038961, "grad_norm": 2.3618104457855225, "learning_rate": 6.71731501804763e-05, "loss": 0.6308, "step": 1942 }, { "epoch": 12.616883116883116, "grad_norm": 2.1770923137664795, "learning_rate": 6.714364201599662e-05, "loss": 0.5744, "step": 1943 }, { "epoch": 12.623376623376624, "grad_norm": 2.2065415382385254, "learning_rate": 6.711412708347856e-05, "loss": 0.6431, "step": 1944 }, { "epoch": 12.62987012987013, "grad_norm": 2.123046398162842, "learning_rate": 6.708460539457418e-05, "loss": 0.577, "step": 1945 }, { "epoch": 12.636363636363637, "grad_norm": 2.3211543560028076, "learning_rate": 6.705507696093814e-05, "loss": 0.6264, "step": 1946 }, { "epoch": 12.642857142857142, "grad_norm": 1.948301076889038, "learning_rate": 6.702554179422783e-05, "loss": 0.5625, "step": 1947 }, { "epoch": 12.64935064935065, "grad_norm": 1.979990839958191, "learning_rate": 6.699599990610323e-05, "loss": 0.5623, "step": 1948 }, { "epoch": 12.655844155844155, "grad_norm": 1.8006978034973145, "learning_rate": 6.696645130822705e-05, "loss": 0.5595, "step": 1949 }, { "epoch": 12.662337662337663, "grad_norm": 2.1235246658325195, "learning_rate": 6.693689601226458e-05, "loss": 0.5588, "step": 1950 }, { "epoch": 12.668831168831169, "grad_norm": 2.1899781227111816, "learning_rate": 6.690733402988379e-05, "loss": 0.615, "step": 1951 }, { "epoch": 12.675324675324676, "grad_norm": 1.9493566751480103, "learning_rate": 6.68777653727553e-05, "loss": 0.5755, "step": 1952 }, { "epoch": 12.681818181818182, "grad_norm": 2.196223497390747, "learning_rate": 6.684819005255232e-05, "loss": 0.6394, "step": 1953 }, { "epoch": 12.688311688311689, "grad_norm": 2.0495898723602295, "learning_rate": 6.681860808095074e-05, "loss": 0.5432, "step": 1954 }, { "epoch": 12.694805194805195, "grad_norm": 2.1735448837280273, "learning_rate": 6.678901946962903e-05, "loss": 0.6549, "step": 1955 }, { "epoch": 12.7012987012987, "grad_norm": 2.1591930389404297, "learning_rate": 6.675942423026833e-05, "loss": 0.5885, "step": 1956 }, { "epoch": 12.707792207792208, "grad_norm": 2.046126127243042, "learning_rate": 6.672982237455237e-05, "loss": 0.5849, "step": 1957 }, { "epoch": 12.714285714285714, "grad_norm": 2.0392959117889404, "learning_rate": 6.670021391416749e-05, "loss": 0.5539, "step": 1958 }, { "epoch": 12.720779220779221, "grad_norm": 2.007087230682373, "learning_rate": 6.667059886080262e-05, "loss": 0.5605, "step": 1959 }, { "epoch": 12.727272727272727, "grad_norm": 2.019771099090576, "learning_rate": 6.664097722614934e-05, "loss": 0.612, "step": 1960 }, { "epoch": 12.733766233766234, "grad_norm": 2.1823174953460693, "learning_rate": 6.66113490219018e-05, "loss": 0.6417, "step": 1961 }, { "epoch": 12.74025974025974, "grad_norm": 2.022130250930786, "learning_rate": 6.658171425975672e-05, "loss": 0.5359, "step": 1962 }, { "epoch": 12.746753246753247, "grad_norm": 2.0918095111846924, "learning_rate": 6.655207295141346e-05, "loss": 0.5774, "step": 1963 }, { "epoch": 12.753246753246753, "grad_norm": 2.2147998809814453, "learning_rate": 6.652242510857394e-05, "loss": 0.6046, "step": 1964 }, { "epoch": 12.75974025974026, "grad_norm": 2.1559667587280273, "learning_rate": 6.649277074294264e-05, "loss": 0.5821, "step": 1965 }, { "epoch": 12.766233766233766, "grad_norm": 1.8574299812316895, "learning_rate": 6.646310986622668e-05, "loss": 0.5442, "step": 1966 }, { "epoch": 12.772727272727273, "grad_norm": 2.072185754776001, "learning_rate": 6.643344249013562e-05, "loss": 0.5747, "step": 1967 }, { "epoch": 12.779220779220779, "grad_norm": 1.8414840698242188, "learning_rate": 6.640376862638176e-05, "loss": 0.4852, "step": 1968 }, { "epoch": 12.785714285714286, "grad_norm": 2.0400550365448, "learning_rate": 6.637408828667982e-05, "loss": 0.5534, "step": 1969 }, { "epoch": 12.792207792207792, "grad_norm": 2.1789374351501465, "learning_rate": 6.634440148274713e-05, "loss": 0.5932, "step": 1970 }, { "epoch": 12.7987012987013, "grad_norm": 2.1213948726654053, "learning_rate": 6.631470822630359e-05, "loss": 0.6047, "step": 1971 }, { "epoch": 12.805194805194805, "grad_norm": 1.9664524793624878, "learning_rate": 6.628500852907161e-05, "loss": 0.5529, "step": 1972 }, { "epoch": 12.811688311688311, "grad_norm": 2.3267581462860107, "learning_rate": 6.625530240277618e-05, "loss": 0.6179, "step": 1973 }, { "epoch": 12.818181818181818, "grad_norm": 2.1917669773101807, "learning_rate": 6.622558985914477e-05, "loss": 0.6275, "step": 1974 }, { "epoch": 12.824675324675324, "grad_norm": 2.1599512100219727, "learning_rate": 6.619587090990746e-05, "loss": 0.6153, "step": 1975 }, { "epoch": 12.831168831168831, "grad_norm": 2.2975082397460938, "learning_rate": 6.616614556679683e-05, "loss": 0.6387, "step": 1976 }, { "epoch": 12.837662337662337, "grad_norm": 2.11966609954834, "learning_rate": 6.613641384154794e-05, "loss": 0.5674, "step": 1977 }, { "epoch": 12.844155844155845, "grad_norm": 2.677170991897583, "learning_rate": 6.61066757458984e-05, "loss": 0.701, "step": 1978 }, { "epoch": 12.85064935064935, "grad_norm": 2.129204273223877, "learning_rate": 6.607693129158837e-05, "loss": 0.5654, "step": 1979 }, { "epoch": 12.857142857142858, "grad_norm": 1.9715017080307007, "learning_rate": 6.604718049036048e-05, "loss": 0.5539, "step": 1980 }, { "epoch": 12.863636363636363, "grad_norm": 2.3241770267486572, "learning_rate": 6.601742335395987e-05, "loss": 0.6669, "step": 1981 }, { "epoch": 12.87012987012987, "grad_norm": 2.105726718902588, "learning_rate": 6.598765989413419e-05, "loss": 0.6325, "step": 1982 }, { "epoch": 12.876623376623376, "grad_norm": 1.9479717016220093, "learning_rate": 6.595789012263355e-05, "loss": 0.5448, "step": 1983 }, { "epoch": 12.883116883116884, "grad_norm": 2.1983015537261963, "learning_rate": 6.592811405121064e-05, "loss": 0.6193, "step": 1984 }, { "epoch": 12.88961038961039, "grad_norm": 2.3505144119262695, "learning_rate": 6.589833169162054e-05, "loss": 0.677, "step": 1985 }, { "epoch": 12.896103896103895, "grad_norm": 2.245614767074585, "learning_rate": 6.586854305562088e-05, "loss": 0.6554, "step": 1986 }, { "epoch": 12.902597402597403, "grad_norm": 1.899040937423706, "learning_rate": 6.583874815497175e-05, "loss": 0.5566, "step": 1987 }, { "epoch": 12.909090909090908, "grad_norm": 2.100031614303589, "learning_rate": 6.580894700143565e-05, "loss": 0.5871, "step": 1988 }, { "epoch": 12.915584415584416, "grad_norm": 2.2208034992218018, "learning_rate": 6.577913960677766e-05, "loss": 0.6821, "step": 1989 }, { "epoch": 12.922077922077921, "grad_norm": 2.1982948780059814, "learning_rate": 6.574932598276525e-05, "loss": 0.6474, "step": 1990 }, { "epoch": 12.928571428571429, "grad_norm": 1.8316792249679565, "learning_rate": 6.571950614116835e-05, "loss": 0.5621, "step": 1991 }, { "epoch": 12.935064935064934, "grad_norm": 2.208531379699707, "learning_rate": 6.568968009375937e-05, "loss": 0.6198, "step": 1992 }, { "epoch": 12.941558441558442, "grad_norm": 2.179624557495117, "learning_rate": 6.565984785231318e-05, "loss": 0.6272, "step": 1993 }, { "epoch": 12.948051948051948, "grad_norm": 2.1235885620117188, "learning_rate": 6.563000942860706e-05, "loss": 0.6084, "step": 1994 }, { "epoch": 12.954545454545455, "grad_norm": 2.1948156356811523, "learning_rate": 6.560016483442075e-05, "loss": 0.6801, "step": 1995 }, { "epoch": 12.96103896103896, "grad_norm": 1.9589859247207642, "learning_rate": 6.557031408153642e-05, "loss": 0.5824, "step": 1996 }, { "epoch": 12.967532467532468, "grad_norm": 2.1491918563842773, "learning_rate": 6.554045718173867e-05, "loss": 0.6364, "step": 1997 }, { "epoch": 12.974025974025974, "grad_norm": 2.1677355766296387, "learning_rate": 6.551059414681455e-05, "loss": 0.6551, "step": 1998 }, { "epoch": 12.980519480519481, "grad_norm": 1.8698400259017944, "learning_rate": 6.54807249885535e-05, "loss": 0.5743, "step": 1999 }, { "epoch": 12.987012987012987, "grad_norm": 1.9512232542037964, "learning_rate": 6.545084971874738e-05, "loss": 0.5545, "step": 2000 }, { "epoch": 12.993506493506494, "grad_norm": 2.3677287101745605, "learning_rate": 6.542096834919049e-05, "loss": 0.6814, "step": 2001 }, { "epoch": 13.0, "grad_norm": 5.212640762329102, "learning_rate": 6.539108089167953e-05, "loss": 0.6413, "step": 2002 }, { "epoch": 13.006493506493506, "grad_norm": 1.8685005903244019, "learning_rate": 6.536118735801356e-05, "loss": 0.4756, "step": 2003 }, { "epoch": 13.012987012987013, "grad_norm": 2.1032299995422363, "learning_rate": 6.533128775999411e-05, "loss": 0.5265, "step": 2004 }, { "epoch": 13.019480519480519, "grad_norm": 1.9253900051116943, "learning_rate": 6.530138210942505e-05, "loss": 0.4767, "step": 2005 }, { "epoch": 13.025974025974026, "grad_norm": 1.9877099990844727, "learning_rate": 6.527147041811266e-05, "loss": 0.5098, "step": 2006 }, { "epoch": 13.032467532467532, "grad_norm": 1.713268518447876, "learning_rate": 6.52415526978656e-05, "loss": 0.4504, "step": 2007 }, { "epoch": 13.03896103896104, "grad_norm": 2.010927200317383, "learning_rate": 6.52116289604949e-05, "loss": 0.5143, "step": 2008 }, { "epoch": 13.045454545454545, "grad_norm": 2.0579028129577637, "learning_rate": 6.518169921781402e-05, "loss": 0.5334, "step": 2009 }, { "epoch": 13.051948051948052, "grad_norm": 1.8461856842041016, "learning_rate": 6.51517634816387e-05, "loss": 0.4309, "step": 2010 }, { "epoch": 13.058441558441558, "grad_norm": 1.6491504907608032, "learning_rate": 6.512182176378713e-05, "loss": 0.4266, "step": 2011 }, { "epoch": 13.064935064935066, "grad_norm": 1.8561513423919678, "learning_rate": 6.50918740760798e-05, "loss": 0.5071, "step": 2012 }, { "epoch": 13.071428571428571, "grad_norm": 1.9853945970535278, "learning_rate": 6.506192043033959e-05, "loss": 0.4812, "step": 2013 }, { "epoch": 13.077922077922079, "grad_norm": 1.909090280532837, "learning_rate": 6.503196083839174e-05, "loss": 0.5097, "step": 2014 }, { "epoch": 13.084415584415584, "grad_norm": 1.947060227394104, "learning_rate": 6.500199531206382e-05, "loss": 0.4531, "step": 2015 }, { "epoch": 13.090909090909092, "grad_norm": 1.9234563112258911, "learning_rate": 6.497202386318573e-05, "loss": 0.4907, "step": 2016 }, { "epoch": 13.097402597402597, "grad_norm": 2.059049129486084, "learning_rate": 6.494204650358973e-05, "loss": 0.5197, "step": 2017 }, { "epoch": 13.103896103896103, "grad_norm": 2.2307331562042236, "learning_rate": 6.491206324511039e-05, "loss": 0.6148, "step": 2018 }, { "epoch": 13.11038961038961, "grad_norm": 1.892966866493225, "learning_rate": 6.488207409958466e-05, "loss": 0.5067, "step": 2019 }, { "epoch": 13.116883116883116, "grad_norm": 1.8496367931365967, "learning_rate": 6.485207907885175e-05, "loss": 0.4889, "step": 2020 }, { "epoch": 13.123376623376624, "grad_norm": 2.266213893890381, "learning_rate": 6.482207819475323e-05, "loss": 0.5595, "step": 2021 }, { "epoch": 13.12987012987013, "grad_norm": 1.8475916385650635, "learning_rate": 6.4792071459133e-05, "loss": 0.4796, "step": 2022 }, { "epoch": 13.136363636363637, "grad_norm": 2.1203629970550537, "learning_rate": 6.476205888383719e-05, "loss": 0.5184, "step": 2023 }, { "epoch": 13.142857142857142, "grad_norm": 1.9380943775177002, "learning_rate": 6.473204048071432e-05, "loss": 0.4669, "step": 2024 }, { "epoch": 13.14935064935065, "grad_norm": 1.887258768081665, "learning_rate": 6.47020162616152e-05, "loss": 0.4896, "step": 2025 }, { "epoch": 13.155844155844155, "grad_norm": 1.96815025806427, "learning_rate": 6.467198623839288e-05, "loss": 0.4959, "step": 2026 }, { "epoch": 13.162337662337663, "grad_norm": 2.083922863006592, "learning_rate": 6.464195042290277e-05, "loss": 0.5191, "step": 2027 }, { "epoch": 13.168831168831169, "grad_norm": 2.1014833450317383, "learning_rate": 6.46119088270025e-05, "loss": 0.5548, "step": 2028 }, { "epoch": 13.175324675324676, "grad_norm": 1.9475595951080322, "learning_rate": 6.458186146255203e-05, "loss": 0.4752, "step": 2029 }, { "epoch": 13.181818181818182, "grad_norm": 2.1587653160095215, "learning_rate": 6.455180834141359e-05, "loss": 0.545, "step": 2030 }, { "epoch": 13.188311688311689, "grad_norm": 2.103052854537964, "learning_rate": 6.452174947545167e-05, "loss": 0.5186, "step": 2031 }, { "epoch": 13.194805194805195, "grad_norm": 1.8780242204666138, "learning_rate": 6.449168487653305e-05, "loss": 0.4593, "step": 2032 }, { "epoch": 13.2012987012987, "grad_norm": 1.9651139974594116, "learning_rate": 6.446161455652674e-05, "loss": 0.51, "step": 2033 }, { "epoch": 13.207792207792208, "grad_norm": 2.1292293071746826, "learning_rate": 6.443153852730404e-05, "loss": 0.553, "step": 2034 }, { "epoch": 13.214285714285714, "grad_norm": 1.8487555980682373, "learning_rate": 6.440145680073847e-05, "loss": 0.4712, "step": 2035 }, { "epoch": 13.220779220779221, "grad_norm": 2.0636463165283203, "learning_rate": 6.437136938870583e-05, "loss": 0.5354, "step": 2036 }, { "epoch": 13.227272727272727, "grad_norm": 1.924094796180725, "learning_rate": 6.434127630308416e-05, "loss": 0.5052, "step": 2037 }, { "epoch": 13.233766233766234, "grad_norm": 2.0354607105255127, "learning_rate": 6.43111775557537e-05, "loss": 0.5963, "step": 2038 }, { "epoch": 13.24025974025974, "grad_norm": 1.9856023788452148, "learning_rate": 6.428107315859702e-05, "loss": 0.5152, "step": 2039 }, { "epoch": 13.246753246753247, "grad_norm": 1.7797104120254517, "learning_rate": 6.42509631234988e-05, "loss": 0.4549, "step": 2040 }, { "epoch": 13.253246753246753, "grad_norm": 1.9620555639266968, "learning_rate": 6.422084746234604e-05, "loss": 0.4979, "step": 2041 }, { "epoch": 13.25974025974026, "grad_norm": 2.117298126220703, "learning_rate": 6.419072618702793e-05, "loss": 0.5939, "step": 2042 }, { "epoch": 13.266233766233766, "grad_norm": 2.0733323097229004, "learning_rate": 6.416059930943585e-05, "loss": 0.5434, "step": 2043 }, { "epoch": 13.272727272727273, "grad_norm": 1.9714899063110352, "learning_rate": 6.413046684146343e-05, "loss": 0.4654, "step": 2044 }, { "epoch": 13.279220779220779, "grad_norm": 2.035008430480957, "learning_rate": 6.410032879500647e-05, "loss": 0.5418, "step": 2045 }, { "epoch": 13.285714285714286, "grad_norm": 2.1320652961730957, "learning_rate": 6.407018518196303e-05, "loss": 0.5639, "step": 2046 }, { "epoch": 13.292207792207792, "grad_norm": 2.251023769378662, "learning_rate": 6.404003601423329e-05, "loss": 0.5101, "step": 2047 }, { "epoch": 13.2987012987013, "grad_norm": 1.9076013565063477, "learning_rate": 6.400988130371969e-05, "loss": 0.4994, "step": 2048 }, { "epoch": 13.305194805194805, "grad_norm": 1.9021027088165283, "learning_rate": 6.397972106232681e-05, "loss": 0.4868, "step": 2049 }, { "epoch": 13.311688311688311, "grad_norm": 2.0925509929656982, "learning_rate": 6.394955530196147e-05, "loss": 0.5395, "step": 2050 }, { "epoch": 13.318181818181818, "grad_norm": 2.2410504817962646, "learning_rate": 6.39193840345326e-05, "loss": 0.5421, "step": 2051 }, { "epoch": 13.324675324675324, "grad_norm": 2.4037466049194336, "learning_rate": 6.388920727195138e-05, "loss": 0.6181, "step": 2052 }, { "epoch": 13.331168831168831, "grad_norm": 1.830690860748291, "learning_rate": 6.385902502613106e-05, "loss": 0.4437, "step": 2053 }, { "epoch": 13.337662337662337, "grad_norm": 2.139547824859619, "learning_rate": 6.382883730898717e-05, "loss": 0.547, "step": 2054 }, { "epoch": 13.344155844155845, "grad_norm": 1.7549453973770142, "learning_rate": 6.37986441324373e-05, "loss": 0.5073, "step": 2055 }, { "epoch": 13.35064935064935, "grad_norm": 2.10517954826355, "learning_rate": 6.376844550840125e-05, "loss": 0.5434, "step": 2056 }, { "epoch": 13.357142857142858, "grad_norm": 2.1346967220306396, "learning_rate": 6.373824144880098e-05, "loss": 0.5551, "step": 2057 }, { "epoch": 13.363636363636363, "grad_norm": 1.5481007099151611, "learning_rate": 6.370803196556055e-05, "loss": 0.3688, "step": 2058 }, { "epoch": 13.37012987012987, "grad_norm": 1.8314411640167236, "learning_rate": 6.36778170706062e-05, "loss": 0.4539, "step": 2059 }, { "epoch": 13.376623376623376, "grad_norm": 2.0875957012176514, "learning_rate": 6.364759677586627e-05, "loss": 0.5223, "step": 2060 }, { "epoch": 13.383116883116884, "grad_norm": 2.1524407863616943, "learning_rate": 6.361737109327128e-05, "loss": 0.5842, "step": 2061 }, { "epoch": 13.38961038961039, "grad_norm": 1.9753766059875488, "learning_rate": 6.358714003475384e-05, "loss": 0.5446, "step": 2062 }, { "epoch": 13.396103896103897, "grad_norm": 1.9236072301864624, "learning_rate": 6.355690361224869e-05, "loss": 0.475, "step": 2063 }, { "epoch": 13.402597402597403, "grad_norm": 1.9570033550262451, "learning_rate": 6.35266618376927e-05, "loss": 0.4705, "step": 2064 }, { "epoch": 13.409090909090908, "grad_norm": 2.0021793842315674, "learning_rate": 6.349641472302483e-05, "loss": 0.5146, "step": 2065 }, { "epoch": 13.415584415584416, "grad_norm": 2.1636292934417725, "learning_rate": 6.346616228018616e-05, "loss": 0.5342, "step": 2066 }, { "epoch": 13.422077922077921, "grad_norm": 1.7105865478515625, "learning_rate": 6.34359045211199e-05, "loss": 0.4019, "step": 2067 }, { "epoch": 13.428571428571429, "grad_norm": 2.270984649658203, "learning_rate": 6.340564145777132e-05, "loss": 0.5322, "step": 2068 }, { "epoch": 13.435064935064934, "grad_norm": 2.0992496013641357, "learning_rate": 6.337537310208778e-05, "loss": 0.5545, "step": 2069 }, { "epoch": 13.441558441558442, "grad_norm": 2.1089553833007812, "learning_rate": 6.334509946601879e-05, "loss": 0.562, "step": 2070 }, { "epoch": 13.448051948051948, "grad_norm": 2.043635845184326, "learning_rate": 6.331482056151585e-05, "loss": 0.5461, "step": 2071 }, { "epoch": 13.454545454545455, "grad_norm": 2.079118490219116, "learning_rate": 6.328453640053263e-05, "loss": 0.5147, "step": 2072 }, { "epoch": 13.46103896103896, "grad_norm": 1.6074239015579224, "learning_rate": 6.325424699502483e-05, "loss": 0.4172, "step": 2073 }, { "epoch": 13.467532467532468, "grad_norm": 1.7791599035263062, "learning_rate": 6.322395235695022e-05, "loss": 0.451, "step": 2074 }, { "epoch": 13.474025974025974, "grad_norm": 2.134434700012207, "learning_rate": 6.319365249826865e-05, "loss": 0.4966, "step": 2075 }, { "epoch": 13.480519480519481, "grad_norm": 2.86612606048584, "learning_rate": 6.316334743094201e-05, "loss": 0.5469, "step": 2076 }, { "epoch": 13.487012987012987, "grad_norm": 2.0249807834625244, "learning_rate": 6.313303716693428e-05, "loss": 0.5334, "step": 2077 }, { "epoch": 13.493506493506494, "grad_norm": 2.0734705924987793, "learning_rate": 6.310272171821146e-05, "loss": 0.5299, "step": 2078 }, { "epoch": 13.5, "grad_norm": 2.2216503620147705, "learning_rate": 6.307240109674162e-05, "loss": 0.5377, "step": 2079 }, { "epoch": 13.506493506493506, "grad_norm": 2.039626359939575, "learning_rate": 6.304207531449486e-05, "loss": 0.5285, "step": 2080 }, { "epoch": 13.512987012987013, "grad_norm": 2.066758871078491, "learning_rate": 6.301174438344328e-05, "loss": 0.5145, "step": 2081 }, { "epoch": 13.519480519480519, "grad_norm": 2.039462089538574, "learning_rate": 6.298140831556112e-05, "loss": 0.5039, "step": 2082 }, { "epoch": 13.525974025974026, "grad_norm": 2.188901424407959, "learning_rate": 6.295106712282451e-05, "loss": 0.5432, "step": 2083 }, { "epoch": 13.532467532467532, "grad_norm": 2.0759294033050537, "learning_rate": 6.292072081721173e-05, "loss": 0.5137, "step": 2084 }, { "epoch": 13.53896103896104, "grad_norm": 1.954957127571106, "learning_rate": 6.2890369410703e-05, "loss": 0.4908, "step": 2085 }, { "epoch": 13.545454545454545, "grad_norm": 1.8928133249282837, "learning_rate": 6.286001291528056e-05, "loss": 0.491, "step": 2086 }, { "epoch": 13.551948051948052, "grad_norm": 2.024346351623535, "learning_rate": 6.282965134292868e-05, "loss": 0.5316, "step": 2087 }, { "epoch": 13.558441558441558, "grad_norm": 1.8604222536087036, "learning_rate": 6.279928470563364e-05, "loss": 0.4553, "step": 2088 }, { "epoch": 13.564935064935066, "grad_norm": 1.866055965423584, "learning_rate": 6.27689130153837e-05, "loss": 0.5058, "step": 2089 }, { "epoch": 13.571428571428571, "grad_norm": 1.6331592798233032, "learning_rate": 6.273853628416911e-05, "loss": 0.4194, "step": 2090 }, { "epoch": 13.577922077922079, "grad_norm": 1.8331129550933838, "learning_rate": 6.270815452398215e-05, "loss": 0.4645, "step": 2091 }, { "epoch": 13.584415584415584, "grad_norm": 1.97976553440094, "learning_rate": 6.267776774681703e-05, "loss": 0.5311, "step": 2092 }, { "epoch": 13.590909090909092, "grad_norm": 2.0776448249816895, "learning_rate": 6.264737596466998e-05, "loss": 0.5425, "step": 2093 }, { "epoch": 13.597402597402597, "grad_norm": 1.9937676191329956, "learning_rate": 6.261697918953921e-05, "loss": 0.4808, "step": 2094 }, { "epoch": 13.603896103896105, "grad_norm": 2.2069528102874756, "learning_rate": 6.258657743342486e-05, "loss": 0.5907, "step": 2095 }, { "epoch": 13.61038961038961, "grad_norm": 1.902036428451538, "learning_rate": 6.255617070832908e-05, "loss": 0.5159, "step": 2096 }, { "epoch": 13.616883116883116, "grad_norm": 2.1040568351745605, "learning_rate": 6.252575902625595e-05, "loss": 0.5484, "step": 2097 }, { "epoch": 13.623376623376624, "grad_norm": 1.8775359392166138, "learning_rate": 6.249534239921153e-05, "loss": 0.5038, "step": 2098 }, { "epoch": 13.62987012987013, "grad_norm": 1.9108651876449585, "learning_rate": 6.24649208392038e-05, "loss": 0.4966, "step": 2099 }, { "epoch": 13.636363636363637, "grad_norm": 2.1672561168670654, "learning_rate": 6.243449435824276e-05, "loss": 0.5691, "step": 2100 }, { "epoch": 13.642857142857142, "grad_norm": 2.215721607208252, "learning_rate": 6.240406296834024e-05, "loss": 0.5141, "step": 2101 }, { "epoch": 13.64935064935065, "grad_norm": 1.9970991611480713, "learning_rate": 6.237362668151012e-05, "loss": 0.5436, "step": 2102 }, { "epoch": 13.655844155844155, "grad_norm": 2.052436590194702, "learning_rate": 6.234318550976815e-05, "loss": 0.5346, "step": 2103 }, { "epoch": 13.662337662337663, "grad_norm": 2.0946736335754395, "learning_rate": 6.231273946513201e-05, "loss": 0.563, "step": 2104 }, { "epoch": 13.668831168831169, "grad_norm": 2.2018373012542725, "learning_rate": 6.228228855962133e-05, "loss": 0.5879, "step": 2105 }, { "epoch": 13.675324675324676, "grad_norm": 1.985517978668213, "learning_rate": 6.225183280525763e-05, "loss": 0.5054, "step": 2106 }, { "epoch": 13.681818181818182, "grad_norm": 2.2698051929473877, "learning_rate": 6.222137221406439e-05, "loss": 0.5837, "step": 2107 }, { "epoch": 13.688311688311689, "grad_norm": 1.78743314743042, "learning_rate": 6.219090679806693e-05, "loss": 0.4551, "step": 2108 }, { "epoch": 13.694805194805195, "grad_norm": 2.0907576084136963, "learning_rate": 6.216043656929253e-05, "loss": 0.5884, "step": 2109 }, { "epoch": 13.7012987012987, "grad_norm": 1.94028902053833, "learning_rate": 6.212996153977037e-05, "loss": 0.518, "step": 2110 }, { "epoch": 13.707792207792208, "grad_norm": 2.4052321910858154, "learning_rate": 6.20994817215315e-05, "loss": 0.6378, "step": 2111 }, { "epoch": 13.714285714285714, "grad_norm": 2.077237844467163, "learning_rate": 6.206899712660886e-05, "loss": 0.5693, "step": 2112 }, { "epoch": 13.720779220779221, "grad_norm": 2.1299149990081787, "learning_rate": 6.20385077670373e-05, "loss": 0.5964, "step": 2113 }, { "epoch": 13.727272727272727, "grad_norm": 2.0773046016693115, "learning_rate": 6.20080136548535e-05, "loss": 0.5483, "step": 2114 }, { "epoch": 13.733766233766234, "grad_norm": 2.1190061569213867, "learning_rate": 6.19775148020961e-05, "loss": 0.5367, "step": 2115 }, { "epoch": 13.74025974025974, "grad_norm": 2.0737929344177246, "learning_rate": 6.194701122080553e-05, "loss": 0.5265, "step": 2116 }, { "epoch": 13.746753246753247, "grad_norm": 2.3500454425811768, "learning_rate": 6.191650292302414e-05, "loss": 0.5931, "step": 2117 }, { "epoch": 13.753246753246753, "grad_norm": 2.088576078414917, "learning_rate": 6.188598992079613e-05, "loss": 0.5832, "step": 2118 }, { "epoch": 13.75974025974026, "grad_norm": 2.0208451747894287, "learning_rate": 6.185547222616752e-05, "loss": 0.5538, "step": 2119 }, { "epoch": 13.766233766233766, "grad_norm": 2.2917118072509766, "learning_rate": 6.182494985118624e-05, "loss": 0.5753, "step": 2120 }, { "epoch": 13.772727272727273, "grad_norm": 2.0528573989868164, "learning_rate": 6.179442280790202e-05, "loss": 0.498, "step": 2121 }, { "epoch": 13.779220779220779, "grad_norm": 2.072192907333374, "learning_rate": 6.176389110836647e-05, "loss": 0.548, "step": 2122 }, { "epoch": 13.785714285714286, "grad_norm": 2.331056594848633, "learning_rate": 6.173335476463302e-05, "loss": 0.5495, "step": 2123 }, { "epoch": 13.792207792207792, "grad_norm": 1.977685570716858, "learning_rate": 6.170281378875692e-05, "loss": 0.5183, "step": 2124 }, { "epoch": 13.7987012987013, "grad_norm": 1.922735333442688, "learning_rate": 6.167226819279528e-05, "loss": 0.4729, "step": 2125 }, { "epoch": 13.805194805194805, "grad_norm": 2.081286907196045, "learning_rate": 6.164171798880699e-05, "loss": 0.5667, "step": 2126 }, { "epoch": 13.811688311688311, "grad_norm": 1.8858697414398193, "learning_rate": 6.161116318885283e-05, "loss": 0.534, "step": 2127 }, { "epoch": 13.818181818181818, "grad_norm": 1.888351559638977, "learning_rate": 6.158060380499533e-05, "loss": 0.5312, "step": 2128 }, { "epoch": 13.824675324675324, "grad_norm": 2.015016794204712, "learning_rate": 6.155003984929883e-05, "loss": 0.5091, "step": 2129 }, { "epoch": 13.831168831168831, "grad_norm": 2.031311273574829, "learning_rate": 6.151947133382954e-05, "loss": 0.54, "step": 2130 }, { "epoch": 13.837662337662337, "grad_norm": 2.2997567653656006, "learning_rate": 6.148889827065537e-05, "loss": 0.6281, "step": 2131 }, { "epoch": 13.844155844155845, "grad_norm": 2.062107801437378, "learning_rate": 6.145832067184614e-05, "loss": 0.5306, "step": 2132 }, { "epoch": 13.85064935064935, "grad_norm": 2.2079269886016846, "learning_rate": 6.142773854947336e-05, "loss": 0.5328, "step": 2133 }, { "epoch": 13.857142857142858, "grad_norm": 2.250805139541626, "learning_rate": 6.139715191561038e-05, "loss": 0.56, "step": 2134 }, { "epoch": 13.863636363636363, "grad_norm": 1.9680566787719727, "learning_rate": 6.136656078233232e-05, "loss": 0.5111, "step": 2135 }, { "epoch": 13.87012987012987, "grad_norm": 2.3636484146118164, "learning_rate": 6.133596516171609e-05, "loss": 0.568, "step": 2136 }, { "epoch": 13.876623376623376, "grad_norm": 2.1117160320281982, "learning_rate": 6.130536506584032e-05, "loss": 0.5228, "step": 2137 }, { "epoch": 13.883116883116884, "grad_norm": 2.2921080589294434, "learning_rate": 6.127476050678547e-05, "loss": 0.5301, "step": 2138 }, { "epoch": 13.88961038961039, "grad_norm": 2.155996322631836, "learning_rate": 6.124415149663374e-05, "loss": 0.5723, "step": 2139 }, { "epoch": 13.896103896103895, "grad_norm": 1.989581823348999, "learning_rate": 6.121353804746906e-05, "loss": 0.5407, "step": 2140 }, { "epoch": 13.902597402597403, "grad_norm": 2.180414915084839, "learning_rate": 6.118292017137716e-05, "loss": 0.605, "step": 2141 }, { "epoch": 13.909090909090908, "grad_norm": 2.10390043258667, "learning_rate": 6.115229788044548e-05, "loss": 0.5455, "step": 2142 }, { "epoch": 13.915584415584416, "grad_norm": 2.1462514400482178, "learning_rate": 6.11216711867632e-05, "loss": 0.55, "step": 2143 }, { "epoch": 13.922077922077921, "grad_norm": 2.065823554992676, "learning_rate": 6.109104010242128e-05, "loss": 0.6055, "step": 2144 }, { "epoch": 13.928571428571429, "grad_norm": 2.0353028774261475, "learning_rate": 6.106040463951237e-05, "loss": 0.5752, "step": 2145 }, { "epoch": 13.935064935064934, "grad_norm": 2.097501277923584, "learning_rate": 6.1029764810130864e-05, "loss": 0.5526, "step": 2146 }, { "epoch": 13.941558441558442, "grad_norm": 2.058262825012207, "learning_rate": 6.0999120626372894e-05, "loss": 0.5508, "step": 2147 }, { "epoch": 13.948051948051948, "grad_norm": 2.0566976070404053, "learning_rate": 6.0968472100336305e-05, "loss": 0.5525, "step": 2148 }, { "epoch": 13.954545454545455, "grad_norm": 1.9276599884033203, "learning_rate": 6.093781924412063e-05, "loss": 0.5248, "step": 2149 }, { "epoch": 13.96103896103896, "grad_norm": 2.349358081817627, "learning_rate": 6.090716206982714e-05, "loss": 0.6566, "step": 2150 }, { "epoch": 13.967532467532468, "grad_norm": 2.0224742889404297, "learning_rate": 6.0876500589558796e-05, "loss": 0.5618, "step": 2151 }, { "epoch": 13.974025974025974, "grad_norm": 2.0278806686401367, "learning_rate": 6.084583481542027e-05, "loss": 0.5395, "step": 2152 }, { "epoch": 13.980519480519481, "grad_norm": 1.888813853263855, "learning_rate": 6.081516475951793e-05, "loss": 0.5143, "step": 2153 }, { "epoch": 13.987012987012987, "grad_norm": 2.149365186691284, "learning_rate": 6.078449043395982e-05, "loss": 0.5807, "step": 2154 }, { "epoch": 13.993506493506494, "grad_norm": 1.9563267230987549, "learning_rate": 6.075381185085568e-05, "loss": 0.5255, "step": 2155 }, { "epoch": 14.0, "grad_norm": 430.6167907714844, "learning_rate": 6.072312902231691e-05, "loss": 0.6761, "step": 2156 }, { "epoch": 14.006493506493506, "grad_norm": 1.723750114440918, "learning_rate": 6.0692441960456657e-05, "loss": 0.398, "step": 2157 }, { "epoch": 14.012987012987013, "grad_norm": 1.9454874992370605, "learning_rate": 6.066175067738964e-05, "loss": 0.4885, "step": 2158 }, { "epoch": 14.019480519480519, "grad_norm": 2.0680899620056152, "learning_rate": 6.06310551852323e-05, "loss": 0.4981, "step": 2159 }, { "epoch": 14.025974025974026, "grad_norm": 1.7846578359603882, "learning_rate": 6.060035549610273e-05, "loss": 0.4461, "step": 2160 }, { "epoch": 14.032467532467532, "grad_norm": 1.6244697570800781, "learning_rate": 6.056965162212072e-05, "loss": 0.391, "step": 2161 }, { "epoch": 14.03896103896104, "grad_norm": 1.861986756324768, "learning_rate": 6.05389435754076e-05, "loss": 0.4462, "step": 2162 }, { "epoch": 14.045454545454545, "grad_norm": 1.9358868598937988, "learning_rate": 6.0508231368086484e-05, "loss": 0.4518, "step": 2163 }, { "epoch": 14.051948051948052, "grad_norm": 2.105084180831909, "learning_rate": 6.0477515012282024e-05, "loss": 0.5559, "step": 2164 }, { "epoch": 14.058441558441558, "grad_norm": 2.202756643295288, "learning_rate": 6.0446794520120584e-05, "loss": 0.5361, "step": 2165 }, { "epoch": 14.064935064935066, "grad_norm": 1.9239821434020996, "learning_rate": 6.041606990373012e-05, "loss": 0.4578, "step": 2166 }, { "epoch": 14.071428571428571, "grad_norm": 2.157377004623413, "learning_rate": 6.03853411752402e-05, "loss": 0.534, "step": 2167 }, { "epoch": 14.077922077922079, "grad_norm": 2.0353739261627197, "learning_rate": 6.035460834678207e-05, "loss": 0.5045, "step": 2168 }, { "epoch": 14.084415584415584, "grad_norm": 1.8081053495407104, "learning_rate": 6.032387143048853e-05, "loss": 0.4518, "step": 2169 }, { "epoch": 14.090909090909092, "grad_norm": 1.8117995262145996, "learning_rate": 6.029313043849407e-05, "loss": 0.4494, "step": 2170 }, { "epoch": 14.097402597402597, "grad_norm": 1.8994969129562378, "learning_rate": 6.026238538293472e-05, "loss": 0.4844, "step": 2171 }, { "epoch": 14.103896103896103, "grad_norm": 1.8953076601028442, "learning_rate": 6.0231636275948135e-05, "loss": 0.4819, "step": 2172 }, { "epoch": 14.11038961038961, "grad_norm": 1.9595729112625122, "learning_rate": 6.0200883129673604e-05, "loss": 0.4689, "step": 2173 }, { "epoch": 14.116883116883116, "grad_norm": 1.87164306640625, "learning_rate": 6.0170125956251934e-05, "loss": 0.458, "step": 2174 }, { "epoch": 14.123376623376624, "grad_norm": 1.9536057710647583, "learning_rate": 6.0139364767825626e-05, "loss": 0.5087, "step": 2175 }, { "epoch": 14.12987012987013, "grad_norm": 1.9607187509536743, "learning_rate": 6.010859957653868e-05, "loss": 0.4764, "step": 2176 }, { "epoch": 14.136363636363637, "grad_norm": 1.5424551963806152, "learning_rate": 6.00778303945367e-05, "loss": 0.3429, "step": 2177 }, { "epoch": 14.142857142857142, "grad_norm": 1.9886722564697266, "learning_rate": 6.00470572339669e-05, "loss": 0.4564, "step": 2178 }, { "epoch": 14.14935064935065, "grad_norm": 1.8871906995773315, "learning_rate": 6.001628010697799e-05, "loss": 0.4469, "step": 2179 }, { "epoch": 14.155844155844155, "grad_norm": 1.8248738050460815, "learning_rate": 5.9985499025720346e-05, "loss": 0.4714, "step": 2180 }, { "epoch": 14.162337662337663, "grad_norm": 1.942944049835205, "learning_rate": 5.9954714002345836e-05, "loss": 0.4656, "step": 2181 }, { "epoch": 14.168831168831169, "grad_norm": 1.7118310928344727, "learning_rate": 5.9923925049007855e-05, "loss": 0.4223, "step": 2182 }, { "epoch": 14.175324675324676, "grad_norm": 2.1456005573272705, "learning_rate": 5.9893132177861454e-05, "loss": 0.5301, "step": 2183 }, { "epoch": 14.181818181818182, "grad_norm": 2.018801212310791, "learning_rate": 5.9862335401063155e-05, "loss": 0.4639, "step": 2184 }, { "epoch": 14.188311688311689, "grad_norm": 1.9553627967834473, "learning_rate": 5.9831534730771e-05, "loss": 0.441, "step": 2185 }, { "epoch": 14.194805194805195, "grad_norm": 1.910554051399231, "learning_rate": 5.9800730179144656e-05, "loss": 0.4913, "step": 2186 }, { "epoch": 14.2012987012987, "grad_norm": 2.2660701274871826, "learning_rate": 5.9769921758345246e-05, "loss": 0.509, "step": 2187 }, { "epoch": 14.207792207792208, "grad_norm": 1.819960355758667, "learning_rate": 5.973910948053545e-05, "loss": 0.4436, "step": 2188 }, { "epoch": 14.214285714285714, "grad_norm": 2.024562358856201, "learning_rate": 5.970829335787946e-05, "loss": 0.5243, "step": 2189 }, { "epoch": 14.220779220779221, "grad_norm": 1.7835793495178223, "learning_rate": 5.967747340254303e-05, "loss": 0.4159, "step": 2190 }, { "epoch": 14.227272727272727, "grad_norm": 1.9386513233184814, "learning_rate": 5.964664962669333e-05, "loss": 0.4776, "step": 2191 }, { "epoch": 14.233766233766234, "grad_norm": 1.990627408027649, "learning_rate": 5.9615822042499146e-05, "loss": 0.461, "step": 2192 }, { "epoch": 14.24025974025974, "grad_norm": 1.928742527961731, "learning_rate": 5.95849906621307e-05, "loss": 0.4615, "step": 2193 }, { "epoch": 14.246753246753247, "grad_norm": 2.0883946418762207, "learning_rate": 5.955415549775974e-05, "loss": 0.4687, "step": 2194 }, { "epoch": 14.253246753246753, "grad_norm": 2.1362526416778564, "learning_rate": 5.9523316561559503e-05, "loss": 0.518, "step": 2195 }, { "epoch": 14.25974025974026, "grad_norm": 2.0090742111206055, "learning_rate": 5.949247386570471e-05, "loss": 0.4146, "step": 2196 }, { "epoch": 14.266233766233766, "grad_norm": 1.8732699155807495, "learning_rate": 5.946162742237154e-05, "loss": 0.4452, "step": 2197 }, { "epoch": 14.272727272727273, "grad_norm": 1.767877459526062, "learning_rate": 5.9430777243737746e-05, "loss": 0.4327, "step": 2198 }, { "epoch": 14.279220779220779, "grad_norm": 1.8981797695159912, "learning_rate": 5.939992334198242e-05, "loss": 0.4409, "step": 2199 }, { "epoch": 14.285714285714286, "grad_norm": 1.9029948711395264, "learning_rate": 5.9369065729286245e-05, "loss": 0.4419, "step": 2200 }, { "epoch": 14.292207792207792, "grad_norm": 2.0496606826782227, "learning_rate": 5.933820441783129e-05, "loss": 0.4936, "step": 2201 }, { "epoch": 14.2987012987013, "grad_norm": 1.840586543083191, "learning_rate": 5.930733941980111e-05, "loss": 0.4705, "step": 2202 }, { "epoch": 14.305194805194805, "grad_norm": 1.8446810245513916, "learning_rate": 5.927647074738074e-05, "loss": 0.4955, "step": 2203 }, { "epoch": 14.311688311688311, "grad_norm": 2.1083133220672607, "learning_rate": 5.924559841275661e-05, "loss": 0.5316, "step": 2204 }, { "epoch": 14.318181818181818, "grad_norm": 1.8302310705184937, "learning_rate": 5.921472242811668e-05, "loss": 0.4348, "step": 2205 }, { "epoch": 14.324675324675324, "grad_norm": 1.7570937871932983, "learning_rate": 5.918384280565025e-05, "loss": 0.4268, "step": 2206 }, { "epoch": 14.331168831168831, "grad_norm": 1.8031936883926392, "learning_rate": 5.9152959557548117e-05, "loss": 0.4316, "step": 2207 }, { "epoch": 14.337662337662337, "grad_norm": 1.9004753828048706, "learning_rate": 5.912207269600252e-05, "loss": 0.4888, "step": 2208 }, { "epoch": 14.344155844155845, "grad_norm": 1.8461647033691406, "learning_rate": 5.9091182233207075e-05, "loss": 0.4542, "step": 2209 }, { "epoch": 14.35064935064935, "grad_norm": 2.096561908721924, "learning_rate": 5.906028818135687e-05, "loss": 0.5014, "step": 2210 }, { "epoch": 14.357142857142858, "grad_norm": 1.800281047821045, "learning_rate": 5.902939055264838e-05, "loss": 0.4459, "step": 2211 }, { "epoch": 14.363636363636363, "grad_norm": 2.1272525787353516, "learning_rate": 5.89984893592795e-05, "loss": 0.489, "step": 2212 }, { "epoch": 14.37012987012987, "grad_norm": 2.0823850631713867, "learning_rate": 5.896758461344952e-05, "loss": 0.5373, "step": 2213 }, { "epoch": 14.376623376623376, "grad_norm": 2.00270676612854, "learning_rate": 5.8936676327359154e-05, "loss": 0.4883, "step": 2214 }, { "epoch": 14.383116883116884, "grad_norm": 2.108412265777588, "learning_rate": 5.89057645132105e-05, "loss": 0.5143, "step": 2215 }, { "epoch": 14.38961038961039, "grad_norm": 2.0262179374694824, "learning_rate": 5.887484918320708e-05, "loss": 0.5128, "step": 2216 }, { "epoch": 14.396103896103897, "grad_norm": 2.045429229736328, "learning_rate": 5.8843930349553736e-05, "loss": 0.5009, "step": 2217 }, { "epoch": 14.402597402597403, "grad_norm": 1.8671499490737915, "learning_rate": 5.881300802445675e-05, "loss": 0.4645, "step": 2218 }, { "epoch": 14.409090909090908, "grad_norm": 1.9127780199050903, "learning_rate": 5.878208222012377e-05, "loss": 0.4593, "step": 2219 }, { "epoch": 14.415584415584416, "grad_norm": 1.7372092008590698, "learning_rate": 5.875115294876381e-05, "loss": 0.4668, "step": 2220 }, { "epoch": 14.422077922077921, "grad_norm": 2.0599021911621094, "learning_rate": 5.872022022258725e-05, "loss": 0.4908, "step": 2221 }, { "epoch": 14.428571428571429, "grad_norm": 1.7940086126327515, "learning_rate": 5.868928405380585e-05, "loss": 0.4514, "step": 2222 }, { "epoch": 14.435064935064934, "grad_norm": 1.8995345830917358, "learning_rate": 5.865834445463273e-05, "loss": 0.4686, "step": 2223 }, { "epoch": 14.441558441558442, "grad_norm": 1.9151577949523926, "learning_rate": 5.8627401437282334e-05, "loss": 0.4608, "step": 2224 }, { "epoch": 14.448051948051948, "grad_norm": 1.841570496559143, "learning_rate": 5.859645501397048e-05, "loss": 0.4694, "step": 2225 }, { "epoch": 14.454545454545455, "grad_norm": 1.9435954093933105, "learning_rate": 5.856550519691433e-05, "loss": 0.4697, "step": 2226 }, { "epoch": 14.46103896103896, "grad_norm": 1.835790991783142, "learning_rate": 5.853455199833238e-05, "loss": 0.4625, "step": 2227 }, { "epoch": 14.467532467532468, "grad_norm": 1.958960771560669, "learning_rate": 5.850359543044446e-05, "loss": 0.4853, "step": 2228 }, { "epoch": 14.474025974025974, "grad_norm": 1.9147467613220215, "learning_rate": 5.847263550547174e-05, "loss": 0.4745, "step": 2229 }, { "epoch": 14.480519480519481, "grad_norm": 1.7217332124710083, "learning_rate": 5.844167223563669e-05, "loss": 0.391, "step": 2230 }, { "epoch": 14.487012987012987, "grad_norm": 1.7955234050750732, "learning_rate": 5.841070563316315e-05, "loss": 0.4369, "step": 2231 }, { "epoch": 14.493506493506494, "grad_norm": 2.0770103931427, "learning_rate": 5.837973571027621e-05, "loss": 0.5369, "step": 2232 }, { "epoch": 14.5, "grad_norm": 1.7267543077468872, "learning_rate": 5.834876247920233e-05, "loss": 0.4511, "step": 2233 }, { "epoch": 14.506493506493506, "grad_norm": 2.0531458854675293, "learning_rate": 5.831778595216924e-05, "loss": 0.4808, "step": 2234 }, { "epoch": 14.512987012987013, "grad_norm": 1.8746572732925415, "learning_rate": 5.828680614140599e-05, "loss": 0.448, "step": 2235 }, { "epoch": 14.519480519480519, "grad_norm": 2.0527100563049316, "learning_rate": 5.82558230591429e-05, "loss": 0.5507, "step": 2236 }, { "epoch": 14.525974025974026, "grad_norm": 2.0019044876098633, "learning_rate": 5.822483671761163e-05, "loss": 0.5131, "step": 2237 }, { "epoch": 14.532467532467532, "grad_norm": 2.0601959228515625, "learning_rate": 5.819384712904508e-05, "loss": 0.4807, "step": 2238 }, { "epoch": 14.53896103896104, "grad_norm": 1.9823836088180542, "learning_rate": 5.8162854305677425e-05, "loss": 0.4708, "step": 2239 }, { "epoch": 14.545454545454545, "grad_norm": 2.0328471660614014, "learning_rate": 5.813185825974419e-05, "loss": 0.5208, "step": 2240 }, { "epoch": 14.551948051948052, "grad_norm": 2.6339499950408936, "learning_rate": 5.810085900348209e-05, "loss": 0.5001, "step": 2241 }, { "epoch": 14.558441558441558, "grad_norm": 2.0323269367218018, "learning_rate": 5.806985654912914e-05, "loss": 0.5019, "step": 2242 }, { "epoch": 14.564935064935066, "grad_norm": 2.112851619720459, "learning_rate": 5.8038850908924636e-05, "loss": 0.5332, "step": 2243 }, { "epoch": 14.571428571428571, "grad_norm": 1.640142560005188, "learning_rate": 5.80078420951091e-05, "loss": 0.3836, "step": 2244 }, { "epoch": 14.577922077922079, "grad_norm": 1.8446474075317383, "learning_rate": 5.797683011992432e-05, "loss": 0.4286, "step": 2245 }, { "epoch": 14.584415584415584, "grad_norm": 2.1189794540405273, "learning_rate": 5.794581499561334e-05, "loss": 0.5665, "step": 2246 }, { "epoch": 14.590909090909092, "grad_norm": 1.9349459409713745, "learning_rate": 5.791479673442044e-05, "loss": 0.4824, "step": 2247 }, { "epoch": 14.597402597402597, "grad_norm": 2.1034860610961914, "learning_rate": 5.788377534859114e-05, "loss": 0.5042, "step": 2248 }, { "epoch": 14.603896103896105, "grad_norm": 1.9855496883392334, "learning_rate": 5.785275085037218e-05, "loss": 0.5432, "step": 2249 }, { "epoch": 14.61038961038961, "grad_norm": 1.9206759929656982, "learning_rate": 5.782172325201155e-05, "loss": 0.4845, "step": 2250 }, { "epoch": 14.616883116883116, "grad_norm": 2.0690195560455322, "learning_rate": 5.779069256575845e-05, "loss": 0.5385, "step": 2251 }, { "epoch": 14.623376623376624, "grad_norm": 1.9303492307662964, "learning_rate": 5.7759658803863304e-05, "loss": 0.4833, "step": 2252 }, { "epoch": 14.62987012987013, "grad_norm": 1.9649652242660522, "learning_rate": 5.772862197857776e-05, "loss": 0.4803, "step": 2253 }, { "epoch": 14.636363636363637, "grad_norm": 2.056445360183716, "learning_rate": 5.769758210215466e-05, "loss": 0.4954, "step": 2254 }, { "epoch": 14.642857142857142, "grad_norm": 2.22932767868042, "learning_rate": 5.7666539186848036e-05, "loss": 0.541, "step": 2255 }, { "epoch": 14.64935064935065, "grad_norm": 1.9158599376678467, "learning_rate": 5.763549324491316e-05, "loss": 0.5104, "step": 2256 }, { "epoch": 14.655844155844155, "grad_norm": 2.056835651397705, "learning_rate": 5.7604444288606474e-05, "loss": 0.514, "step": 2257 }, { "epoch": 14.662337662337663, "grad_norm": 1.7276153564453125, "learning_rate": 5.7573392330185625e-05, "loss": 0.4321, "step": 2258 }, { "epoch": 14.668831168831169, "grad_norm": 2.047945976257324, "learning_rate": 5.7542337381909417e-05, "loss": 0.5074, "step": 2259 }, { "epoch": 14.675324675324676, "grad_norm": 1.9653900861740112, "learning_rate": 5.751127945603786e-05, "loss": 0.4676, "step": 2260 }, { "epoch": 14.681818181818182, "grad_norm": 1.8870630264282227, "learning_rate": 5.7480218564832125e-05, "loss": 0.4707, "step": 2261 }, { "epoch": 14.688311688311689, "grad_norm": 1.6896121501922607, "learning_rate": 5.744915472055456e-05, "loss": 0.4522, "step": 2262 }, { "epoch": 14.694805194805195, "grad_norm": 1.9910752773284912, "learning_rate": 5.741808793546869e-05, "loss": 0.5132, "step": 2263 }, { "epoch": 14.7012987012987, "grad_norm": 1.7974369525909424, "learning_rate": 5.7387018221839197e-05, "loss": 0.4556, "step": 2264 }, { "epoch": 14.707792207792208, "grad_norm": 2.076993703842163, "learning_rate": 5.735594559193187e-05, "loss": 0.5445, "step": 2265 }, { "epoch": 14.714285714285714, "grad_norm": 1.960834264755249, "learning_rate": 5.7324870058013736e-05, "loss": 0.5218, "step": 2266 }, { "epoch": 14.720779220779221, "grad_norm": 1.7765799760818481, "learning_rate": 5.72937916323529e-05, "loss": 0.4668, "step": 2267 }, { "epoch": 14.727272727272727, "grad_norm": 1.595685601234436, "learning_rate": 5.726271032721864e-05, "loss": 0.3903, "step": 2268 }, { "epoch": 14.733766233766234, "grad_norm": 1.900124192237854, "learning_rate": 5.723162615488137e-05, "loss": 0.4938, "step": 2269 }, { "epoch": 14.74025974025974, "grad_norm": 1.9192895889282227, "learning_rate": 5.7200539127612604e-05, "loss": 0.4916, "step": 2270 }, { "epoch": 14.746753246753247, "grad_norm": 2.0334246158599854, "learning_rate": 5.716944925768505e-05, "loss": 0.4675, "step": 2271 }, { "epoch": 14.753246753246753, "grad_norm": 1.9807862043380737, "learning_rate": 5.713835655737244e-05, "loss": 0.4895, "step": 2272 }, { "epoch": 14.75974025974026, "grad_norm": 1.8962934017181396, "learning_rate": 5.7107261038949734e-05, "loss": 0.4736, "step": 2273 }, { "epoch": 14.766233766233766, "grad_norm": 2.073451519012451, "learning_rate": 5.707616271469293e-05, "loss": 0.5383, "step": 2274 }, { "epoch": 14.772727272727273, "grad_norm": 2.0890538692474365, "learning_rate": 5.7045061596879134e-05, "loss": 0.5472, "step": 2275 }, { "epoch": 14.779220779220779, "grad_norm": 2.1617870330810547, "learning_rate": 5.7013957697786605e-05, "loss": 0.524, "step": 2276 }, { "epoch": 14.785714285714286, "grad_norm": 1.795647144317627, "learning_rate": 5.6982851029694636e-05, "loss": 0.4337, "step": 2277 }, { "epoch": 14.792207792207792, "grad_norm": 1.8621337413787842, "learning_rate": 5.695174160488369e-05, "loss": 0.4717, "step": 2278 }, { "epoch": 14.7987012987013, "grad_norm": 1.8423058986663818, "learning_rate": 5.6920629435635256e-05, "loss": 0.4771, "step": 2279 }, { "epoch": 14.805194805194805, "grad_norm": 2.1172149181365967, "learning_rate": 5.68895145342319e-05, "loss": 0.5208, "step": 2280 }, { "epoch": 14.811688311688311, "grad_norm": 1.8154979944229126, "learning_rate": 5.685839691295733e-05, "loss": 0.4517, "step": 2281 }, { "epoch": 14.818181818181818, "grad_norm": 2.148499011993408, "learning_rate": 5.682727658409628e-05, "loss": 0.5284, "step": 2282 }, { "epoch": 14.824675324675324, "grad_norm": 2.024648666381836, "learning_rate": 5.679615355993455e-05, "loss": 0.4543, "step": 2283 }, { "epoch": 14.831168831168831, "grad_norm": 1.9004312753677368, "learning_rate": 5.676502785275901e-05, "loss": 0.4675, "step": 2284 }, { "epoch": 14.837662337662337, "grad_norm": 1.9394006729125977, "learning_rate": 5.6733899474857634e-05, "loss": 0.4755, "step": 2285 }, { "epoch": 14.844155844155845, "grad_norm": 1.9973540306091309, "learning_rate": 5.670276843851938e-05, "loss": 0.4964, "step": 2286 }, { "epoch": 14.85064935064935, "grad_norm": 1.6694719791412354, "learning_rate": 5.6671634756034295e-05, "loss": 0.4034, "step": 2287 }, { "epoch": 14.857142857142858, "grad_norm": 2.1099071502685547, "learning_rate": 5.664049843969348e-05, "loss": 0.5722, "step": 2288 }, { "epoch": 14.863636363636363, "grad_norm": 2.096684217453003, "learning_rate": 5.660935950178904e-05, "loss": 0.503, "step": 2289 }, { "epoch": 14.87012987012987, "grad_norm": 2.207970380783081, "learning_rate": 5.6578217954614134e-05, "loss": 0.5296, "step": 2290 }, { "epoch": 14.876623376623376, "grad_norm": 2.032879590988159, "learning_rate": 5.6547073810462956e-05, "loss": 0.5042, "step": 2291 }, { "epoch": 14.883116883116884, "grad_norm": 1.8430237770080566, "learning_rate": 5.651592708163074e-05, "loss": 0.4276, "step": 2292 }, { "epoch": 14.88961038961039, "grad_norm": 1.9308630228042603, "learning_rate": 5.6484777780413686e-05, "loss": 0.5392, "step": 2293 }, { "epoch": 14.896103896103895, "grad_norm": 2.139652967453003, "learning_rate": 5.6453625919109074e-05, "loss": 0.5203, "step": 2294 }, { "epoch": 14.902597402597403, "grad_norm": 2.140904426574707, "learning_rate": 5.642247151001515e-05, "loss": 0.5293, "step": 2295 }, { "epoch": 14.909090909090908, "grad_norm": 2.118394374847412, "learning_rate": 5.639131456543119e-05, "loss": 0.539, "step": 2296 }, { "epoch": 14.915584415584416, "grad_norm": 1.873718023300171, "learning_rate": 5.6360155097657464e-05, "loss": 0.4581, "step": 2297 }, { "epoch": 14.922077922077921, "grad_norm": 2.013944387435913, "learning_rate": 5.632899311899521e-05, "loss": 0.4919, "step": 2298 }, { "epoch": 14.928571428571429, "grad_norm": 1.8649920225143433, "learning_rate": 5.629782864174672e-05, "loss": 0.5073, "step": 2299 }, { "epoch": 14.935064935064934, "grad_norm": 1.8919544219970703, "learning_rate": 5.6266661678215216e-05, "loss": 0.5138, "step": 2300 }, { "epoch": 14.941558441558442, "grad_norm": 1.8159269094467163, "learning_rate": 5.6235492240704936e-05, "loss": 0.4491, "step": 2301 }, { "epoch": 14.948051948051948, "grad_norm": 1.9950357675552368, "learning_rate": 5.620432034152106e-05, "loss": 0.528, "step": 2302 }, { "epoch": 14.954545454545455, "grad_norm": 2.1367409229278564, "learning_rate": 5.617314599296977e-05, "loss": 0.5164, "step": 2303 }, { "epoch": 14.96103896103896, "grad_norm": 2.0098366737365723, "learning_rate": 5.614196920735821e-05, "loss": 0.533, "step": 2304 }, { "epoch": 14.967532467532468, "grad_norm": 1.7905912399291992, "learning_rate": 5.6110789996994474e-05, "loss": 0.4644, "step": 2305 }, { "epoch": 14.974025974025974, "grad_norm": 1.7596795558929443, "learning_rate": 5.607960837418763e-05, "loss": 0.4699, "step": 2306 }, { "epoch": 14.980519480519481, "grad_norm": 1.941423773765564, "learning_rate": 5.6048424351247686e-05, "loss": 0.4874, "step": 2307 }, { "epoch": 14.987012987012987, "grad_norm": 1.6929208040237427, "learning_rate": 5.601723794048558e-05, "loss": 0.383, "step": 2308 }, { "epoch": 14.993506493506494, "grad_norm": 1.8159369230270386, "learning_rate": 5.598604915421324e-05, "loss": 0.4806, "step": 2309 }, { "epoch": 15.0, "grad_norm": 27394.884765625, "learning_rate": 5.595485800474348e-05, "loss": 0.5541, "step": 2310 }, { "epoch": 15.006493506493506, "grad_norm": 1.7933733463287354, "learning_rate": 5.592366450439012e-05, "loss": 0.4224, "step": 2311 }, { "epoch": 15.012987012987013, "grad_norm": 1.8115507364273071, "learning_rate": 5.58924686654678e-05, "loss": 0.4107, "step": 2312 }, { "epoch": 15.019480519480519, "grad_norm": 1.659523367881775, "learning_rate": 5.5861270500292175e-05, "loss": 0.4019, "step": 2313 }, { "epoch": 15.025974025974026, "grad_norm": 1.903672456741333, "learning_rate": 5.583007002117978e-05, "loss": 0.4265, "step": 2314 }, { "epoch": 15.032467532467532, "grad_norm": 1.5900388956069946, "learning_rate": 5.579886724044807e-05, "loss": 0.3745, "step": 2315 }, { "epoch": 15.03896103896104, "grad_norm": 1.9903018474578857, "learning_rate": 5.576766217041541e-05, "loss": 0.4743, "step": 2316 }, { "epoch": 15.045454545454545, "grad_norm": 1.851680874824524, "learning_rate": 5.573645482340107e-05, "loss": 0.3734, "step": 2317 }, { "epoch": 15.051948051948052, "grad_norm": 1.81151282787323, "learning_rate": 5.570524521172522e-05, "loss": 0.3987, "step": 2318 }, { "epoch": 15.058441558441558, "grad_norm": 1.8062418699264526, "learning_rate": 5.567403334770891e-05, "loss": 0.4004, "step": 2319 }, { "epoch": 15.064935064935066, "grad_norm": 1.8997695446014404, "learning_rate": 5.564281924367408e-05, "loss": 0.4241, "step": 2320 }, { "epoch": 15.071428571428571, "grad_norm": 1.7678101062774658, "learning_rate": 5.5611602911943596e-05, "loss": 0.3893, "step": 2321 }, { "epoch": 15.077922077922079, "grad_norm": 1.8739129304885864, "learning_rate": 5.5580384364841157e-05, "loss": 0.4488, "step": 2322 }, { "epoch": 15.084415584415584, "grad_norm": 1.8988037109375, "learning_rate": 5.554916361469133e-05, "loss": 0.4413, "step": 2323 }, { "epoch": 15.090909090909092, "grad_norm": 1.884633183479309, "learning_rate": 5.551794067381959e-05, "loss": 0.4371, "step": 2324 }, { "epoch": 15.097402597402597, "grad_norm": 1.9238171577453613, "learning_rate": 5.548671555455226e-05, "loss": 0.4408, "step": 2325 }, { "epoch": 15.103896103896103, "grad_norm": 1.983849048614502, "learning_rate": 5.5455488269216525e-05, "loss": 0.4599, "step": 2326 }, { "epoch": 15.11038961038961, "grad_norm": 1.9247374534606934, "learning_rate": 5.542425883014043e-05, "loss": 0.4407, "step": 2327 }, { "epoch": 15.116883116883116, "grad_norm": 1.8628653287887573, "learning_rate": 5.539302724965284e-05, "loss": 0.4222, "step": 2328 }, { "epoch": 15.123376623376624, "grad_norm": 2.01529598236084, "learning_rate": 5.5361793540083505e-05, "loss": 0.4371, "step": 2329 }, { "epoch": 15.12987012987013, "grad_norm": 1.873279094696045, "learning_rate": 5.5330557713763e-05, "loss": 0.4066, "step": 2330 }, { "epoch": 15.136363636363637, "grad_norm": 1.7380694150924683, "learning_rate": 5.529931978302272e-05, "loss": 0.3627, "step": 2331 }, { "epoch": 15.142857142857142, "grad_norm": 1.8929762840270996, "learning_rate": 5.526807976019492e-05, "loss": 0.4085, "step": 2332 }, { "epoch": 15.14935064935065, "grad_norm": 1.6444894075393677, "learning_rate": 5.5236837657612664e-05, "loss": 0.3539, "step": 2333 }, { "epoch": 15.155844155844155, "grad_norm": 1.9875844717025757, "learning_rate": 5.5205593487609844e-05, "loss": 0.4465, "step": 2334 }, { "epoch": 15.162337662337663, "grad_norm": 1.7953152656555176, "learning_rate": 5.517434726252113e-05, "loss": 0.3634, "step": 2335 }, { "epoch": 15.168831168831169, "grad_norm": 1.7829195261001587, "learning_rate": 5.5143098994682084e-05, "loss": 0.3887, "step": 2336 }, { "epoch": 15.175324675324676, "grad_norm": 1.9508140087127686, "learning_rate": 5.5111848696429e-05, "loss": 0.4567, "step": 2337 }, { "epoch": 15.181818181818182, "grad_norm": 1.8800532817840576, "learning_rate": 5.5080596380099006e-05, "loss": 0.4333, "step": 2338 }, { "epoch": 15.188311688311689, "grad_norm": 1.8089532852172852, "learning_rate": 5.504934205803002e-05, "loss": 0.4133, "step": 2339 }, { "epoch": 15.194805194805195, "grad_norm": 1.860283613204956, "learning_rate": 5.5018085742560744e-05, "loss": 0.444, "step": 2340 }, { "epoch": 15.2012987012987, "grad_norm": 1.8280411958694458, "learning_rate": 5.498682744603071e-05, "loss": 0.4475, "step": 2341 }, { "epoch": 15.207792207792208, "grad_norm": 1.9756337404251099, "learning_rate": 5.4955567180780164e-05, "loss": 0.4557, "step": 2342 }, { "epoch": 15.214285714285714, "grad_norm": 1.8537825345993042, "learning_rate": 5.4924304959150175e-05, "loss": 0.4368, "step": 2343 }, { "epoch": 15.220779220779221, "grad_norm": 1.9010072946548462, "learning_rate": 5.489304079348258e-05, "loss": 0.4385, "step": 2344 }, { "epoch": 15.227272727272727, "grad_norm": 2.10852313041687, "learning_rate": 5.486177469611998e-05, "loss": 0.4751, "step": 2345 }, { "epoch": 15.233766233766234, "grad_norm": 2.0710091590881348, "learning_rate": 5.48305066794057e-05, "loss": 0.4492, "step": 2346 }, { "epoch": 15.24025974025974, "grad_norm": 2.18450927734375, "learning_rate": 5.4799236755683914e-05, "loss": 0.4304, "step": 2347 }, { "epoch": 15.246753246753247, "grad_norm": 1.8866140842437744, "learning_rate": 5.476796493729943e-05, "loss": 0.4226, "step": 2348 }, { "epoch": 15.253246753246753, "grad_norm": 1.9405475854873657, "learning_rate": 5.473669123659793e-05, "loss": 0.4327, "step": 2349 }, { "epoch": 15.25974025974026, "grad_norm": 1.8188602924346924, "learning_rate": 5.470541566592573e-05, "loss": 0.4094, "step": 2350 }, { "epoch": 15.266233766233766, "grad_norm": 1.9213995933532715, "learning_rate": 5.467413823762993e-05, "loss": 0.4275, "step": 2351 }, { "epoch": 15.272727272727273, "grad_norm": 1.7504397630691528, "learning_rate": 5.46428589640584e-05, "loss": 0.4055, "step": 2352 }, { "epoch": 15.279220779220779, "grad_norm": 1.8797755241394043, "learning_rate": 5.461157785755967e-05, "loss": 0.4395, "step": 2353 }, { "epoch": 15.285714285714286, "grad_norm": 1.9434428215026855, "learning_rate": 5.458029493048303e-05, "loss": 0.4396, "step": 2354 }, { "epoch": 15.292207792207792, "grad_norm": 1.8237086534500122, "learning_rate": 5.4549010195178505e-05, "loss": 0.4072, "step": 2355 }, { "epoch": 15.2987012987013, "grad_norm": 2.064702033996582, "learning_rate": 5.4517723663996775e-05, "loss": 0.424, "step": 2356 }, { "epoch": 15.305194805194805, "grad_norm": 1.4699984788894653, "learning_rate": 5.448643534928931e-05, "loss": 0.3071, "step": 2357 }, { "epoch": 15.311688311688311, "grad_norm": 1.8575775623321533, "learning_rate": 5.4455145263408215e-05, "loss": 0.4012, "step": 2358 }, { "epoch": 15.318181818181818, "grad_norm": 1.7640053033828735, "learning_rate": 5.4423853418706327e-05, "loss": 0.3897, "step": 2359 }, { "epoch": 15.324675324675324, "grad_norm": 2.0353124141693115, "learning_rate": 5.439255982753717e-05, "loss": 0.4375, "step": 2360 }, { "epoch": 15.331168831168831, "grad_norm": 1.5689446926116943, "learning_rate": 5.436126450225495e-05, "loss": 0.3462, "step": 2361 }, { "epoch": 15.337662337662337, "grad_norm": 2.0782582759857178, "learning_rate": 5.432996745521458e-05, "loss": 0.4741, "step": 2362 }, { "epoch": 15.344155844155845, "grad_norm": 2.0686850547790527, "learning_rate": 5.429866869877163e-05, "loss": 0.4635, "step": 2363 }, { "epoch": 15.35064935064935, "grad_norm": 1.948331594467163, "learning_rate": 5.4267368245282345e-05, "loss": 0.4466, "step": 2364 }, { "epoch": 15.357142857142858, "grad_norm": 1.9553227424621582, "learning_rate": 5.423606610710368e-05, "loss": 0.4608, "step": 2365 }, { "epoch": 15.363636363636363, "grad_norm": 1.8363107442855835, "learning_rate": 5.420476229659318e-05, "loss": 0.4229, "step": 2366 }, { "epoch": 15.37012987012987, "grad_norm": 1.9885114431381226, "learning_rate": 5.417345682610914e-05, "loss": 0.4334, "step": 2367 }, { "epoch": 15.376623376623376, "grad_norm": 1.9855831861495972, "learning_rate": 5.414214970801042e-05, "loss": 0.4427, "step": 2368 }, { "epoch": 15.383116883116884, "grad_norm": 1.8806434869766235, "learning_rate": 5.411084095465661e-05, "loss": 0.4567, "step": 2369 }, { "epoch": 15.38961038961039, "grad_norm": 2.067573308944702, "learning_rate": 5.407953057840789e-05, "loss": 0.5001, "step": 2370 }, { "epoch": 15.396103896103897, "grad_norm": 1.7322285175323486, "learning_rate": 5.404821859162509e-05, "loss": 0.4374, "step": 2371 }, { "epoch": 15.402597402597403, "grad_norm": 1.9472609758377075, "learning_rate": 5.4016905006669715e-05, "loss": 0.4335, "step": 2372 }, { "epoch": 15.409090909090908, "grad_norm": 2.0601353645324707, "learning_rate": 5.3985589835903846e-05, "loss": 0.4836, "step": 2373 }, { "epoch": 15.415584415584416, "grad_norm": 1.789538025856018, "learning_rate": 5.3954273091690245e-05, "loss": 0.4205, "step": 2374 }, { "epoch": 15.422077922077921, "grad_norm": 1.9714622497558594, "learning_rate": 5.392295478639225e-05, "loss": 0.4546, "step": 2375 }, { "epoch": 15.428571428571429, "grad_norm": 1.7235947847366333, "learning_rate": 5.3891634932373825e-05, "loss": 0.3673, "step": 2376 }, { "epoch": 15.435064935064934, "grad_norm": 1.7255702018737793, "learning_rate": 5.386031354199956e-05, "loss": 0.3839, "step": 2377 }, { "epoch": 15.441558441558442, "grad_norm": 1.7942345142364502, "learning_rate": 5.382899062763466e-05, "loss": 0.4179, "step": 2378 }, { "epoch": 15.448051948051948, "grad_norm": 2.123368978500366, "learning_rate": 5.379766620164488e-05, "loss": 0.4714, "step": 2379 }, { "epoch": 15.454545454545455, "grad_norm": 1.8634470701217651, "learning_rate": 5.3766340276396646e-05, "loss": 0.4181, "step": 2380 }, { "epoch": 15.46103896103896, "grad_norm": 1.5813051462173462, "learning_rate": 5.373501286425691e-05, "loss": 0.3406, "step": 2381 }, { "epoch": 15.467532467532468, "grad_norm": 1.8893381357192993, "learning_rate": 5.370368397759324e-05, "loss": 0.4624, "step": 2382 }, { "epoch": 15.474025974025974, "grad_norm": 1.9371058940887451, "learning_rate": 5.367235362877378e-05, "loss": 0.4078, "step": 2383 }, { "epoch": 15.480519480519481, "grad_norm": 2.030733585357666, "learning_rate": 5.3641021830167296e-05, "loss": 0.4414, "step": 2384 }, { "epoch": 15.487012987012987, "grad_norm": 1.9892330169677734, "learning_rate": 5.360968859414305e-05, "loss": 0.4081, "step": 2385 }, { "epoch": 15.493506493506494, "grad_norm": 2.044180393218994, "learning_rate": 5.357835393307089e-05, "loss": 0.4166, "step": 2386 }, { "epoch": 15.5, "grad_norm": 2.041566848754883, "learning_rate": 5.354701785932129e-05, "loss": 0.5239, "step": 2387 }, { "epoch": 15.506493506493506, "grad_norm": 1.9771960973739624, "learning_rate": 5.3515680385265196e-05, "loss": 0.4926, "step": 2388 }, { "epoch": 15.512987012987013, "grad_norm": 2.1107778549194336, "learning_rate": 5.348434152327417e-05, "loss": 0.479, "step": 2389 }, { "epoch": 15.519480519480519, "grad_norm": 1.9286577701568604, "learning_rate": 5.345300128572031e-05, "loss": 0.4558, "step": 2390 }, { "epoch": 15.525974025974026, "grad_norm": 2.0163707733154297, "learning_rate": 5.3421659684976197e-05, "loss": 0.4816, "step": 2391 }, { "epoch": 15.532467532467532, "grad_norm": 1.9194059371948242, "learning_rate": 5.3390316733415044e-05, "loss": 0.4382, "step": 2392 }, { "epoch": 15.53896103896104, "grad_norm": 1.9520113468170166, "learning_rate": 5.335897244341054e-05, "loss": 0.4539, "step": 2393 }, { "epoch": 15.545454545454545, "grad_norm": 1.8568896055221558, "learning_rate": 5.332762682733691e-05, "loss": 0.415, "step": 2394 }, { "epoch": 15.551948051948052, "grad_norm": 2.010601282119751, "learning_rate": 5.32962798975689e-05, "loss": 0.4754, "step": 2395 }, { "epoch": 15.558441558441558, "grad_norm": 1.8659908771514893, "learning_rate": 5.3264931666481786e-05, "loss": 0.433, "step": 2396 }, { "epoch": 15.564935064935066, "grad_norm": 1.715757966041565, "learning_rate": 5.3233582146451375e-05, "loss": 0.3923, "step": 2397 }, { "epoch": 15.571428571428571, "grad_norm": 2.116550922393799, "learning_rate": 5.320223134985392e-05, "loss": 0.4569, "step": 2398 }, { "epoch": 15.577922077922079, "grad_norm": 1.9593226909637451, "learning_rate": 5.317087928906627e-05, "loss": 0.4612, "step": 2399 }, { "epoch": 15.584415584415584, "grad_norm": 1.9149504899978638, "learning_rate": 5.313952597646568e-05, "loss": 0.4297, "step": 2400 }, { "epoch": 15.590909090909092, "grad_norm": 1.9268628358840942, "learning_rate": 5.310817142442994e-05, "loss": 0.4562, "step": 2401 }, { "epoch": 15.597402597402597, "grad_norm": 1.98369562625885, "learning_rate": 5.307681564533736e-05, "loss": 0.4519, "step": 2402 }, { "epoch": 15.603896103896105, "grad_norm": 2.0358848571777344, "learning_rate": 5.3045458651566695e-05, "loss": 0.4636, "step": 2403 }, { "epoch": 15.61038961038961, "grad_norm": 1.849379539489746, "learning_rate": 5.301410045549718e-05, "loss": 0.415, "step": 2404 }, { "epoch": 15.616883116883116, "grad_norm": 1.8836501836776733, "learning_rate": 5.298274106950854e-05, "loss": 0.4354, "step": 2405 }, { "epoch": 15.623376623376624, "grad_norm": 1.98408842086792, "learning_rate": 5.295138050598096e-05, "loss": 0.4544, "step": 2406 }, { "epoch": 15.62987012987013, "grad_norm": 1.646842122077942, "learning_rate": 5.29200187772951e-05, "loss": 0.4206, "step": 2407 }, { "epoch": 15.636363636363637, "grad_norm": 1.6809951066970825, "learning_rate": 5.288865589583207e-05, "loss": 0.384, "step": 2408 }, { "epoch": 15.642857142857142, "grad_norm": 1.9409666061401367, "learning_rate": 5.2857291873973435e-05, "loss": 0.4559, "step": 2409 }, { "epoch": 15.64935064935065, "grad_norm": 1.9693655967712402, "learning_rate": 5.2825926724101236e-05, "loss": 0.4862, "step": 2410 }, { "epoch": 15.655844155844155, "grad_norm": 1.8932464122772217, "learning_rate": 5.2794560458597897e-05, "loss": 0.4696, "step": 2411 }, { "epoch": 15.662337662337663, "grad_norm": 1.9884529113769531, "learning_rate": 5.276319308984636e-05, "loss": 0.5044, "step": 2412 }, { "epoch": 15.668831168831169, "grad_norm": 1.8495194911956787, "learning_rate": 5.273182463022995e-05, "loss": 0.4292, "step": 2413 }, { "epoch": 15.675324675324676, "grad_norm": 1.7464812994003296, "learning_rate": 5.2700455092132436e-05, "loss": 0.4102, "step": 2414 }, { "epoch": 15.681818181818182, "grad_norm": 1.8720086812973022, "learning_rate": 5.266908448793803e-05, "loss": 0.4426, "step": 2415 }, { "epoch": 15.688311688311689, "grad_norm": 1.8257683515548706, "learning_rate": 5.263771283003133e-05, "loss": 0.437, "step": 2416 }, { "epoch": 15.694805194805195, "grad_norm": 1.7542264461517334, "learning_rate": 5.260634013079737e-05, "loss": 0.4043, "step": 2417 }, { "epoch": 15.7012987012987, "grad_norm": 1.9541983604431152, "learning_rate": 5.257496640262162e-05, "loss": 0.4401, "step": 2418 }, { "epoch": 15.707792207792208, "grad_norm": 1.9968219995498657, "learning_rate": 5.25435916578899e-05, "loss": 0.4885, "step": 2419 }, { "epoch": 15.714285714285714, "grad_norm": 1.9374127388000488, "learning_rate": 5.2512215908988484e-05, "loss": 0.437, "step": 2420 }, { "epoch": 15.720779220779221, "grad_norm": 1.8781362771987915, "learning_rate": 5.2480839168304e-05, "loss": 0.4244, "step": 2421 }, { "epoch": 15.727272727272727, "grad_norm": 2.1212801933288574, "learning_rate": 5.2449461448223517e-05, "loss": 0.5161, "step": 2422 }, { "epoch": 15.733766233766234, "grad_norm": 1.9629656076431274, "learning_rate": 5.2418082761134445e-05, "loss": 0.4562, "step": 2423 }, { "epoch": 15.74025974025974, "grad_norm": 2.031665086746216, "learning_rate": 5.2386703119424584e-05, "loss": 0.4555, "step": 2424 }, { "epoch": 15.746753246753247, "grad_norm": 2.124204397201538, "learning_rate": 5.235532253548213e-05, "loss": 0.5088, "step": 2425 }, { "epoch": 15.753246753246753, "grad_norm": 1.8898444175720215, "learning_rate": 5.232394102169565e-05, "loss": 0.4031, "step": 2426 }, { "epoch": 15.75974025974026, "grad_norm": 1.8912581205368042, "learning_rate": 5.229255859045406e-05, "loss": 0.4112, "step": 2427 }, { "epoch": 15.766233766233766, "grad_norm": 1.984041452407837, "learning_rate": 5.226117525414663e-05, "loss": 0.4798, "step": 2428 }, { "epoch": 15.772727272727273, "grad_norm": 1.8399240970611572, "learning_rate": 5.2229791025163046e-05, "loss": 0.3877, "step": 2429 }, { "epoch": 15.779220779220779, "grad_norm": 1.9852311611175537, "learning_rate": 5.219840591589325e-05, "loss": 0.4805, "step": 2430 }, { "epoch": 15.785714285714286, "grad_norm": 1.99494469165802, "learning_rate": 5.216701993872762e-05, "loss": 0.452, "step": 2431 }, { "epoch": 15.792207792207792, "grad_norm": 2.0795602798461914, "learning_rate": 5.213563310605686e-05, "loss": 0.5183, "step": 2432 }, { "epoch": 15.7987012987013, "grad_norm": 1.877393126487732, "learning_rate": 5.2104245430271946e-05, "loss": 0.4696, "step": 2433 }, { "epoch": 15.805194805194805, "grad_norm": 2.1190946102142334, "learning_rate": 5.2072856923764266e-05, "loss": 0.4675, "step": 2434 }, { "epoch": 15.811688311688311, "grad_norm": 1.897771954536438, "learning_rate": 5.204146759892551e-05, "loss": 0.4613, "step": 2435 }, { "epoch": 15.818181818181818, "grad_norm": 1.79399836063385, "learning_rate": 5.2010077468147665e-05, "loss": 0.4218, "step": 2436 }, { "epoch": 15.824675324675324, "grad_norm": 1.9332948923110962, "learning_rate": 5.1978686543823076e-05, "loss": 0.4769, "step": 2437 }, { "epoch": 15.831168831168831, "grad_norm": 1.8081494569778442, "learning_rate": 5.194729483834438e-05, "loss": 0.4254, "step": 2438 }, { "epoch": 15.837662337662337, "grad_norm": 1.5738856792449951, "learning_rate": 5.1915902364104506e-05, "loss": 0.3907, "step": 2439 }, { "epoch": 15.844155844155845, "grad_norm": 1.990145206451416, "learning_rate": 5.188450913349674e-05, "loss": 0.5046, "step": 2440 }, { "epoch": 15.85064935064935, "grad_norm": 1.8861225843429565, "learning_rate": 5.185311515891459e-05, "loss": 0.4484, "step": 2441 }, { "epoch": 15.857142857142858, "grad_norm": 1.9962940216064453, "learning_rate": 5.1821720452751945e-05, "loss": 0.4763, "step": 2442 }, { "epoch": 15.863636363636363, "grad_norm": 1.7997182607650757, "learning_rate": 5.179032502740291e-05, "loss": 0.4409, "step": 2443 }, { "epoch": 15.87012987012987, "grad_norm": 1.897403597831726, "learning_rate": 5.175892889526189e-05, "loss": 0.4464, "step": 2444 }, { "epoch": 15.876623376623376, "grad_norm": 1.7679868936538696, "learning_rate": 5.172753206872363e-05, "loss": 0.4087, "step": 2445 }, { "epoch": 15.883116883116884, "grad_norm": 1.93822181224823, "learning_rate": 5.169613456018304e-05, "loss": 0.485, "step": 2446 }, { "epoch": 15.88961038961039, "grad_norm": 2.119961977005005, "learning_rate": 5.166473638203539e-05, "loss": 0.4766, "step": 2447 }, { "epoch": 15.896103896103895, "grad_norm": 2.0220601558685303, "learning_rate": 5.1633337546676196e-05, "loss": 0.4645, "step": 2448 }, { "epoch": 15.902597402597403, "grad_norm": 1.9838132858276367, "learning_rate": 5.1601938066501196e-05, "loss": 0.4651, "step": 2449 }, { "epoch": 15.909090909090908, "grad_norm": 2.0590314865112305, "learning_rate": 5.157053795390642e-05, "loss": 0.4939, "step": 2450 }, { "epoch": 15.915584415584416, "grad_norm": 1.9032071828842163, "learning_rate": 5.153913722128813e-05, "loss": 0.4437, "step": 2451 }, { "epoch": 15.922077922077921, "grad_norm": 1.606368899345398, "learning_rate": 5.150773588104284e-05, "loss": 0.4026, "step": 2452 }, { "epoch": 15.928571428571429, "grad_norm": 1.7986513376235962, "learning_rate": 5.147633394556731e-05, "loss": 0.4346, "step": 2453 }, { "epoch": 15.935064935064934, "grad_norm": 1.85692298412323, "learning_rate": 5.14449314272585e-05, "loss": 0.4668, "step": 2454 }, { "epoch": 15.941558441558442, "grad_norm": 1.9175212383270264, "learning_rate": 5.141352833851367e-05, "loss": 0.4514, "step": 2455 }, { "epoch": 15.948051948051948, "grad_norm": 1.8391081094741821, "learning_rate": 5.138212469173022e-05, "loss": 0.4637, "step": 2456 }, { "epoch": 15.954545454545455, "grad_norm": 1.8607032299041748, "learning_rate": 5.1350720499305835e-05, "loss": 0.4288, "step": 2457 }, { "epoch": 15.96103896103896, "grad_norm": 2.236193895339966, "learning_rate": 5.13193157736384e-05, "loss": 0.4957, "step": 2458 }, { "epoch": 15.967532467532468, "grad_norm": 1.895599126815796, "learning_rate": 5.128791052712597e-05, "loss": 0.4621, "step": 2459 }, { "epoch": 15.974025974025974, "grad_norm": 1.7029972076416016, "learning_rate": 5.125650477216688e-05, "loss": 0.3869, "step": 2460 }, { "epoch": 15.980519480519481, "grad_norm": 1.895903468132019, "learning_rate": 5.1225098521159585e-05, "loss": 0.4756, "step": 2461 }, { "epoch": 15.987012987012987, "grad_norm": 1.8416452407836914, "learning_rate": 5.119369178650282e-05, "loss": 0.4264, "step": 2462 }, { "epoch": 15.993506493506494, "grad_norm": 1.8602122068405151, "learning_rate": 5.116228458059543e-05, "loss": 0.424, "step": 2463 }, { "epoch": 16.0, "grad_norm": 19.60172462463379, "learning_rate": 5.113087691583649e-05, "loss": 0.5386, "step": 2464 }, { "epoch": 16.006493506493506, "grad_norm": 1.536372184753418, "learning_rate": 5.109946880462526e-05, "loss": 0.3336, "step": 2465 }, { "epoch": 16.01298701298701, "grad_norm": 1.8130605220794678, "learning_rate": 5.1068060259361153e-05, "loss": 0.412, "step": 2466 }, { "epoch": 16.01948051948052, "grad_norm": 1.7264176607131958, "learning_rate": 5.1036651292443774e-05, "loss": 0.4077, "step": 2467 }, { "epoch": 16.025974025974026, "grad_norm": 1.9557987451553345, "learning_rate": 5.1005241916272886e-05, "loss": 0.4346, "step": 2468 }, { "epoch": 16.032467532467532, "grad_norm": 1.8347598314285278, "learning_rate": 5.09738321432484e-05, "loss": 0.4012, "step": 2469 }, { "epoch": 16.038961038961038, "grad_norm": 1.9019415378570557, "learning_rate": 5.094242198577042e-05, "loss": 0.3958, "step": 2470 }, { "epoch": 16.045454545454547, "grad_norm": 1.6096651554107666, "learning_rate": 5.0911011456239157e-05, "loss": 0.3694, "step": 2471 }, { "epoch": 16.051948051948052, "grad_norm": 2.033205986022949, "learning_rate": 5.087960056705499e-05, "loss": 0.4224, "step": 2472 }, { "epoch": 16.058441558441558, "grad_norm": 1.9416512250900269, "learning_rate": 5.0848189330618456e-05, "loss": 0.4474, "step": 2473 }, { "epoch": 16.064935064935064, "grad_norm": 1.7071467638015747, "learning_rate": 5.0816777759330215e-05, "loss": 0.3728, "step": 2474 }, { "epoch": 16.071428571428573, "grad_norm": 1.6450309753417969, "learning_rate": 5.078536586559104e-05, "loss": 0.369, "step": 2475 }, { "epoch": 16.07792207792208, "grad_norm": 1.6530896425247192, "learning_rate": 5.075395366180186e-05, "loss": 0.3653, "step": 2476 }, { "epoch": 16.084415584415584, "grad_norm": 1.9590433835983276, "learning_rate": 5.0722541160363726e-05, "loss": 0.4025, "step": 2477 }, { "epoch": 16.09090909090909, "grad_norm": 1.6475454568862915, "learning_rate": 5.069112837367776e-05, "loss": 0.3674, "step": 2478 }, { "epoch": 16.0974025974026, "grad_norm": 1.753271460533142, "learning_rate": 5.065971531414528e-05, "loss": 0.3733, "step": 2479 }, { "epoch": 16.103896103896105, "grad_norm": 1.917557954788208, "learning_rate": 5.062830199416764e-05, "loss": 0.4473, "step": 2480 }, { "epoch": 16.11038961038961, "grad_norm": 1.8115241527557373, "learning_rate": 5.0596888426146325e-05, "loss": 0.3771, "step": 2481 }, { "epoch": 16.116883116883116, "grad_norm": 1.8796888589859009, "learning_rate": 5.05654746224829e-05, "loss": 0.3898, "step": 2482 }, { "epoch": 16.123376623376622, "grad_norm": 1.8184095621109009, "learning_rate": 5.053406059557906e-05, "loss": 0.3855, "step": 2483 }, { "epoch": 16.12987012987013, "grad_norm": 1.9025495052337646, "learning_rate": 5.0502646357836535e-05, "loss": 0.4127, "step": 2484 }, { "epoch": 16.136363636363637, "grad_norm": 1.698099136352539, "learning_rate": 5.047123192165721e-05, "loss": 0.3463, "step": 2485 }, { "epoch": 16.142857142857142, "grad_norm": 1.5930640697479248, "learning_rate": 5.0439817299442983e-05, "loss": 0.3534, "step": 2486 }, { "epoch": 16.149350649350648, "grad_norm": 1.8648321628570557, "learning_rate": 5.0408402503595845e-05, "loss": 0.3917, "step": 2487 }, { "epoch": 16.155844155844157, "grad_norm": 1.850818395614624, "learning_rate": 5.037698754651786e-05, "loss": 0.4185, "step": 2488 }, { "epoch": 16.162337662337663, "grad_norm": 1.7078030109405518, "learning_rate": 5.034557244061117e-05, "loss": 0.367, "step": 2489 }, { "epoch": 16.16883116883117, "grad_norm": 1.9454824924468994, "learning_rate": 5.0314157198277954e-05, "loss": 0.4301, "step": 2490 }, { "epoch": 16.175324675324674, "grad_norm": 1.7974553108215332, "learning_rate": 5.0282741831920454e-05, "loss": 0.3781, "step": 2491 }, { "epoch": 16.181818181818183, "grad_norm": 1.8951915502548218, "learning_rate": 5.025132635394095e-05, "loss": 0.4169, "step": 2492 }, { "epoch": 16.18831168831169, "grad_norm": 1.7715855836868286, "learning_rate": 5.021991077674179e-05, "loss": 0.3676, "step": 2493 }, { "epoch": 16.194805194805195, "grad_norm": 1.8014692068099976, "learning_rate": 5.018849511272532e-05, "loss": 0.3514, "step": 2494 }, { "epoch": 16.2012987012987, "grad_norm": 1.8222955465316772, "learning_rate": 5.0157079374293983e-05, "loss": 0.4101, "step": 2495 }, { "epoch": 16.207792207792206, "grad_norm": 2.0071604251861572, "learning_rate": 5.0125663573850204e-05, "loss": 0.4358, "step": 2496 }, { "epoch": 16.214285714285715, "grad_norm": 1.7885559797286987, "learning_rate": 5.0094247723796403e-05, "loss": 0.3885, "step": 2497 }, { "epoch": 16.22077922077922, "grad_norm": 1.9011200666427612, "learning_rate": 5.006283183653513e-05, "loss": 0.4215, "step": 2498 }, { "epoch": 16.227272727272727, "grad_norm": 1.7357369661331177, "learning_rate": 5.003141592446882e-05, "loss": 0.3403, "step": 2499 }, { "epoch": 16.233766233766232, "grad_norm": 1.8025972843170166, "learning_rate": 5e-05, "loss": 0.3856, "step": 2500 }, { "epoch": 16.24025974025974, "grad_norm": 1.7521417140960693, "learning_rate": 4.996858407553119e-05, "loss": 0.368, "step": 2501 }, { "epoch": 16.246753246753247, "grad_norm": 1.8964296579360962, "learning_rate": 4.9937168163464897e-05, "loss": 0.387, "step": 2502 }, { "epoch": 16.253246753246753, "grad_norm": 1.6628857851028442, "learning_rate": 4.990575227620359e-05, "loss": 0.3405, "step": 2503 }, { "epoch": 16.25974025974026, "grad_norm": 1.7666095495224, "learning_rate": 4.9874336426149814e-05, "loss": 0.3699, "step": 2504 }, { "epoch": 16.266233766233768, "grad_norm": 1.871014952659607, "learning_rate": 4.984292062570602e-05, "loss": 0.4222, "step": 2505 }, { "epoch": 16.272727272727273, "grad_norm": 1.8126856088638306, "learning_rate": 4.981150488727469e-05, "loss": 0.3925, "step": 2506 }, { "epoch": 16.27922077922078, "grad_norm": 1.7900199890136719, "learning_rate": 4.978008922325824e-05, "loss": 0.3612, "step": 2507 }, { "epoch": 16.285714285714285, "grad_norm": 1.8557946681976318, "learning_rate": 4.974867364605905e-05, "loss": 0.39, "step": 2508 }, { "epoch": 16.292207792207794, "grad_norm": 1.7613614797592163, "learning_rate": 4.971725816807956e-05, "loss": 0.3769, "step": 2509 }, { "epoch": 16.2987012987013, "grad_norm": 1.4544224739074707, "learning_rate": 4.968584280172206e-05, "loss": 0.3248, "step": 2510 }, { "epoch": 16.305194805194805, "grad_norm": 1.8322076797485352, "learning_rate": 4.965442755938884e-05, "loss": 0.4105, "step": 2511 }, { "epoch": 16.31168831168831, "grad_norm": 1.747609257698059, "learning_rate": 4.962301245348216e-05, "loss": 0.3871, "step": 2512 }, { "epoch": 16.318181818181817, "grad_norm": 1.7011605501174927, "learning_rate": 4.959159749640416e-05, "loss": 0.3463, "step": 2513 }, { "epoch": 16.324675324675326, "grad_norm": 1.8105734586715698, "learning_rate": 4.956018270055703e-05, "loss": 0.4268, "step": 2514 }, { "epoch": 16.33116883116883, "grad_norm": 1.8925591707229614, "learning_rate": 4.95287680783428e-05, "loss": 0.4644, "step": 2515 }, { "epoch": 16.337662337662337, "grad_norm": 1.8447585105895996, "learning_rate": 4.9497353642163476e-05, "loss": 0.3975, "step": 2516 }, { "epoch": 16.344155844155843, "grad_norm": 1.8829737901687622, "learning_rate": 4.9465939404420966e-05, "loss": 0.4104, "step": 2517 }, { "epoch": 16.350649350649352, "grad_norm": 1.849299669265747, "learning_rate": 4.9434525377517114e-05, "loss": 0.4154, "step": 2518 }, { "epoch": 16.357142857142858, "grad_norm": 1.7983893156051636, "learning_rate": 4.9403111573853686e-05, "loss": 0.4104, "step": 2519 }, { "epoch": 16.363636363636363, "grad_norm": 1.629199504852295, "learning_rate": 4.9371698005832365e-05, "loss": 0.3114, "step": 2520 }, { "epoch": 16.37012987012987, "grad_norm": 1.8307642936706543, "learning_rate": 4.934028468585473e-05, "loss": 0.3906, "step": 2521 }, { "epoch": 16.376623376623378, "grad_norm": 1.6470370292663574, "learning_rate": 4.930887162632224e-05, "loss": 0.326, "step": 2522 }, { "epoch": 16.383116883116884, "grad_norm": 1.867017149925232, "learning_rate": 4.9277458839636285e-05, "loss": 0.3932, "step": 2523 }, { "epoch": 16.38961038961039, "grad_norm": 1.7679275274276733, "learning_rate": 4.9246046338198147e-05, "loss": 0.377, "step": 2524 }, { "epoch": 16.396103896103895, "grad_norm": 2.562811851501465, "learning_rate": 4.921463413440898e-05, "loss": 0.4232, "step": 2525 }, { "epoch": 16.4025974025974, "grad_norm": 1.7845680713653564, "learning_rate": 4.9183222240669796e-05, "loss": 0.4371, "step": 2526 }, { "epoch": 16.40909090909091, "grad_norm": 1.8348369598388672, "learning_rate": 4.9151810669381556e-05, "loss": 0.4095, "step": 2527 }, { "epoch": 16.415584415584416, "grad_norm": 1.9595232009887695, "learning_rate": 4.9120399432945016e-05, "loss": 0.3927, "step": 2528 }, { "epoch": 16.42207792207792, "grad_norm": 1.7152999639511108, "learning_rate": 4.908898854376086e-05, "loss": 0.3965, "step": 2529 }, { "epoch": 16.428571428571427, "grad_norm": 1.8380136489868164, "learning_rate": 4.90575780142296e-05, "loss": 0.3981, "step": 2530 }, { "epoch": 16.435064935064936, "grad_norm": 1.8340736627578735, "learning_rate": 4.902616785675161e-05, "loss": 0.4278, "step": 2531 }, { "epoch": 16.441558441558442, "grad_norm": 1.5227307081222534, "learning_rate": 4.899475808372714e-05, "loss": 0.3029, "step": 2532 }, { "epoch": 16.448051948051948, "grad_norm": 1.7773783206939697, "learning_rate": 4.896334870755623e-05, "loss": 0.415, "step": 2533 }, { "epoch": 16.454545454545453, "grad_norm": 1.886927843093872, "learning_rate": 4.893193974063885e-05, "loss": 0.4087, "step": 2534 }, { "epoch": 16.461038961038962, "grad_norm": 1.9630560874938965, "learning_rate": 4.890053119537475e-05, "loss": 0.4291, "step": 2535 }, { "epoch": 16.467532467532468, "grad_norm": 1.908774495124817, "learning_rate": 4.8869123084163524e-05, "loss": 0.4307, "step": 2536 }, { "epoch": 16.474025974025974, "grad_norm": 1.9670474529266357, "learning_rate": 4.883771541940459e-05, "loss": 0.4306, "step": 2537 }, { "epoch": 16.48051948051948, "grad_norm": 1.737168550491333, "learning_rate": 4.8806308213497184e-05, "loss": 0.3743, "step": 2538 }, { "epoch": 16.48701298701299, "grad_norm": 1.7427324056625366, "learning_rate": 4.877490147884041e-05, "loss": 0.3952, "step": 2539 }, { "epoch": 16.493506493506494, "grad_norm": 2.0852608680725098, "learning_rate": 4.874349522783313e-05, "loss": 0.4108, "step": 2540 }, { "epoch": 16.5, "grad_norm": 1.7185182571411133, "learning_rate": 4.871208947287404e-05, "loss": 0.3882, "step": 2541 }, { "epoch": 16.506493506493506, "grad_norm": 1.9440059661865234, "learning_rate": 4.868068422636162e-05, "loss": 0.4613, "step": 2542 }, { "epoch": 16.51298701298701, "grad_norm": 2.0143227577209473, "learning_rate": 4.864927950069416e-05, "loss": 0.4414, "step": 2543 }, { "epoch": 16.51948051948052, "grad_norm": 1.5256792306900024, "learning_rate": 4.8617875308269787e-05, "loss": 0.3096, "step": 2544 }, { "epoch": 16.525974025974026, "grad_norm": 1.9585679769515991, "learning_rate": 4.858647166148634e-05, "loss": 0.45, "step": 2545 }, { "epoch": 16.532467532467532, "grad_norm": 2.0001230239868164, "learning_rate": 4.8555068572741505e-05, "loss": 0.4472, "step": 2546 }, { "epoch": 16.538961038961038, "grad_norm": 2.0032620429992676, "learning_rate": 4.852366605443271e-05, "loss": 0.461, "step": 2547 }, { "epoch": 16.545454545454547, "grad_norm": 1.9636540412902832, "learning_rate": 4.8492264118957156e-05, "loss": 0.4092, "step": 2548 }, { "epoch": 16.551948051948052, "grad_norm": 1.7286251783370972, "learning_rate": 4.846086277871187e-05, "loss": 0.389, "step": 2549 }, { "epoch": 16.558441558441558, "grad_norm": 1.6896281242370605, "learning_rate": 4.8429462046093585e-05, "loss": 0.36, "step": 2550 }, { "epoch": 16.564935064935064, "grad_norm": 1.8248164653778076, "learning_rate": 4.8398061933498816e-05, "loss": 0.3979, "step": 2551 }, { "epoch": 16.571428571428573, "grad_norm": 2.0552384853363037, "learning_rate": 4.836666245332382e-05, "loss": 0.473, "step": 2552 }, { "epoch": 16.57792207792208, "grad_norm": 1.703555703163147, "learning_rate": 4.8335263617964606e-05, "loss": 0.3605, "step": 2553 }, { "epoch": 16.584415584415584, "grad_norm": 1.8883312940597534, "learning_rate": 4.8303865439816966e-05, "loss": 0.386, "step": 2554 }, { "epoch": 16.59090909090909, "grad_norm": 1.6219338178634644, "learning_rate": 4.827246793127639e-05, "loss": 0.3748, "step": 2555 }, { "epoch": 16.5974025974026, "grad_norm": 1.5926085710525513, "learning_rate": 4.824107110473812e-05, "loss": 0.3464, "step": 2556 }, { "epoch": 16.603896103896105, "grad_norm": 2.0677969455718994, "learning_rate": 4.8209674972597114e-05, "loss": 0.4676, "step": 2557 }, { "epoch": 16.61038961038961, "grad_norm": 2.003356456756592, "learning_rate": 4.817827954724805e-05, "loss": 0.4578, "step": 2558 }, { "epoch": 16.616883116883116, "grad_norm": 1.8525855541229248, "learning_rate": 4.81468848410854e-05, "loss": 0.4384, "step": 2559 }, { "epoch": 16.623376623376622, "grad_norm": 1.7237621545791626, "learning_rate": 4.811549086650327e-05, "loss": 0.38, "step": 2560 }, { "epoch": 16.62987012987013, "grad_norm": 1.9740512371063232, "learning_rate": 4.8084097635895505e-05, "loss": 0.4612, "step": 2561 }, { "epoch": 16.636363636363637, "grad_norm": 1.9930429458618164, "learning_rate": 4.8052705161655644e-05, "loss": 0.4381, "step": 2562 }, { "epoch": 16.642857142857142, "grad_norm": 1.9352537393569946, "learning_rate": 4.802131345617694e-05, "loss": 0.4585, "step": 2563 }, { "epoch": 16.649350649350648, "grad_norm": 1.957864761352539, "learning_rate": 4.798992253185233e-05, "loss": 0.4014, "step": 2564 }, { "epoch": 16.655844155844157, "grad_norm": 1.7447909116744995, "learning_rate": 4.7958532401074504e-05, "loss": 0.3998, "step": 2565 }, { "epoch": 16.662337662337663, "grad_norm": 1.7870335578918457, "learning_rate": 4.7927143076235745e-05, "loss": 0.4279, "step": 2566 }, { "epoch": 16.66883116883117, "grad_norm": 1.8535168170928955, "learning_rate": 4.7895754569728066e-05, "loss": 0.4267, "step": 2567 }, { "epoch": 16.675324675324674, "grad_norm": 1.8935611248016357, "learning_rate": 4.7864366893943166e-05, "loss": 0.4296, "step": 2568 }, { "epoch": 16.681818181818183, "grad_norm": 1.7043728828430176, "learning_rate": 4.7832980061272384e-05, "loss": 0.3804, "step": 2569 }, { "epoch": 16.68831168831169, "grad_norm": 1.8452892303466797, "learning_rate": 4.7801594084106763e-05, "loss": 0.4273, "step": 2570 }, { "epoch": 16.694805194805195, "grad_norm": 1.8952516317367554, "learning_rate": 4.777020897483697e-05, "loss": 0.4181, "step": 2571 }, { "epoch": 16.7012987012987, "grad_norm": 1.8324185609817505, "learning_rate": 4.7738824745853374e-05, "loss": 0.4145, "step": 2572 }, { "epoch": 16.707792207792206, "grad_norm": 1.5182191133499146, "learning_rate": 4.770744140954596e-05, "loss": 0.3243, "step": 2573 }, { "epoch": 16.714285714285715, "grad_norm": 1.8509901762008667, "learning_rate": 4.767605897830435e-05, "loss": 0.4123, "step": 2574 }, { "epoch": 16.72077922077922, "grad_norm": 1.8853600025177002, "learning_rate": 4.7644677464517874e-05, "loss": 0.4459, "step": 2575 }, { "epoch": 16.727272727272727, "grad_norm": 1.9594619274139404, "learning_rate": 4.761329688057543e-05, "loss": 0.428, "step": 2576 }, { "epoch": 16.733766233766232, "grad_norm": 1.831547737121582, "learning_rate": 4.758191723886557e-05, "loss": 0.373, "step": 2577 }, { "epoch": 16.74025974025974, "grad_norm": 2.012741804122925, "learning_rate": 4.7550538551776495e-05, "loss": 0.4414, "step": 2578 }, { "epoch": 16.746753246753247, "grad_norm": 1.7638040781021118, "learning_rate": 4.7519160831695994e-05, "loss": 0.4082, "step": 2579 }, { "epoch": 16.753246753246753, "grad_norm": 1.8421845436096191, "learning_rate": 4.748778409101153e-05, "loss": 0.3676, "step": 2580 }, { "epoch": 16.75974025974026, "grad_norm": 1.9414628744125366, "learning_rate": 4.745640834211011e-05, "loss": 0.4308, "step": 2581 }, { "epoch": 16.766233766233768, "grad_norm": 1.656212329864502, "learning_rate": 4.742503359737841e-05, "loss": 0.3771, "step": 2582 }, { "epoch": 16.772727272727273, "grad_norm": 2.0582118034362793, "learning_rate": 4.739365986920265e-05, "loss": 0.4719, "step": 2583 }, { "epoch": 16.77922077922078, "grad_norm": 1.9124013185501099, "learning_rate": 4.736228716996868e-05, "loss": 0.3998, "step": 2584 }, { "epoch": 16.785714285714285, "grad_norm": 2.0058646202087402, "learning_rate": 4.7330915512061976e-05, "loss": 0.4754, "step": 2585 }, { "epoch": 16.792207792207794, "grad_norm": 1.6502070426940918, "learning_rate": 4.729954490786757e-05, "loss": 0.3287, "step": 2586 }, { "epoch": 16.7987012987013, "grad_norm": 2.0556440353393555, "learning_rate": 4.7268175369770066e-05, "loss": 0.4468, "step": 2587 }, { "epoch": 16.805194805194805, "grad_norm": 1.6197199821472168, "learning_rate": 4.723680691015366e-05, "loss": 0.3483, "step": 2588 }, { "epoch": 16.81168831168831, "grad_norm": 1.722090244293213, "learning_rate": 4.72054395414021e-05, "loss": 0.3676, "step": 2589 }, { "epoch": 16.818181818181817, "grad_norm": 1.9909186363220215, "learning_rate": 4.7174073275898776e-05, "loss": 0.451, "step": 2590 }, { "epoch": 16.824675324675326, "grad_norm": 1.7004845142364502, "learning_rate": 4.714270812602657e-05, "loss": 0.3918, "step": 2591 }, { "epoch": 16.83116883116883, "grad_norm": 1.9880518913269043, "learning_rate": 4.711134410416794e-05, "loss": 0.4583, "step": 2592 }, { "epoch": 16.837662337662337, "grad_norm": 1.7336833477020264, "learning_rate": 4.707998122270492e-05, "loss": 0.381, "step": 2593 }, { "epoch": 16.844155844155843, "grad_norm": 1.8251391649246216, "learning_rate": 4.7048619494019045e-05, "loss": 0.4075, "step": 2594 }, { "epoch": 16.850649350649352, "grad_norm": 1.8285964727401733, "learning_rate": 4.701725893049147e-05, "loss": 0.4203, "step": 2595 }, { "epoch": 16.857142857142858, "grad_norm": 1.9775934219360352, "learning_rate": 4.698589954450283e-05, "loss": 0.4004, "step": 2596 }, { "epoch": 16.863636363636363, "grad_norm": 1.754765272140503, "learning_rate": 4.6954541348433316e-05, "loss": 0.4089, "step": 2597 }, { "epoch": 16.87012987012987, "grad_norm": 1.974440097808838, "learning_rate": 4.692318435466265e-05, "loss": 0.4422, "step": 2598 }, { "epoch": 16.876623376623378, "grad_norm": 1.7709462642669678, "learning_rate": 4.6891828575570055e-05, "loss": 0.4002, "step": 2599 }, { "epoch": 16.883116883116884, "grad_norm": 2.0820789337158203, "learning_rate": 4.6860474023534335e-05, "loss": 0.4795, "step": 2600 }, { "epoch": 16.88961038961039, "grad_norm": 1.9320366382598877, "learning_rate": 4.682912071093374e-05, "loss": 0.424, "step": 2601 }, { "epoch": 16.896103896103895, "grad_norm": 1.8082619905471802, "learning_rate": 4.679776865014608e-05, "loss": 0.4278, "step": 2602 }, { "epoch": 16.9025974025974, "grad_norm": 1.7421860694885254, "learning_rate": 4.676641785354864e-05, "loss": 0.3793, "step": 2603 }, { "epoch": 16.90909090909091, "grad_norm": 1.6660720109939575, "learning_rate": 4.6735068333518205e-05, "loss": 0.3865, "step": 2604 }, { "epoch": 16.915584415584416, "grad_norm": 1.8764604330062866, "learning_rate": 4.670372010243111e-05, "loss": 0.4167, "step": 2605 }, { "epoch": 16.92207792207792, "grad_norm": 1.7141389846801758, "learning_rate": 4.667237317266311e-05, "loss": 0.3797, "step": 2606 }, { "epoch": 16.928571428571427, "grad_norm": 1.8355389833450317, "learning_rate": 4.664102755658948e-05, "loss": 0.3709, "step": 2607 }, { "epoch": 16.935064935064936, "grad_norm": 2.0633785724639893, "learning_rate": 4.6609683266584974e-05, "loss": 0.456, "step": 2608 }, { "epoch": 16.941558441558442, "grad_norm": 1.9902276992797852, "learning_rate": 4.657834031502381e-05, "loss": 0.4454, "step": 2609 }, { "epoch": 16.948051948051948, "grad_norm": 1.9734420776367188, "learning_rate": 4.654699871427971e-05, "loss": 0.4317, "step": 2610 }, { "epoch": 16.954545454545453, "grad_norm": 2.0436525344848633, "learning_rate": 4.6515658476725834e-05, "loss": 0.4827, "step": 2611 }, { "epoch": 16.961038961038962, "grad_norm": 1.8276691436767578, "learning_rate": 4.6484319614734815e-05, "loss": 0.4292, "step": 2612 }, { "epoch": 16.967532467532468, "grad_norm": 1.9008606672286987, "learning_rate": 4.6452982140678737e-05, "loss": 0.442, "step": 2613 }, { "epoch": 16.974025974025974, "grad_norm": 1.8933931589126587, "learning_rate": 4.642164606692912e-05, "loss": 0.4016, "step": 2614 }, { "epoch": 16.98051948051948, "grad_norm": 1.8439511060714722, "learning_rate": 4.639031140585697e-05, "loss": 0.3959, "step": 2615 }, { "epoch": 16.98701298701299, "grad_norm": 1.657429575920105, "learning_rate": 4.6358978169832716e-05, "loss": 0.3449, "step": 2616 }, { "epoch": 16.993506493506494, "grad_norm": 1.9203455448150635, "learning_rate": 4.632764637122622e-05, "loss": 0.4123, "step": 2617 }, { "epoch": 17.0, "grad_norm": 556.3326416015625, "learning_rate": 4.6296316022406776e-05, "loss": 0.4423, "step": 2618 }, { "epoch": 17.006493506493506, "grad_norm": 1.6963850259780884, "learning_rate": 4.6264987135743104e-05, "loss": 0.3987, "step": 2619 }, { "epoch": 17.01298701298701, "grad_norm": 1.464180827140808, "learning_rate": 4.623365972360337e-05, "loss": 0.2908, "step": 2620 }, { "epoch": 17.01948051948052, "grad_norm": 1.7919656038284302, "learning_rate": 4.620233379835513e-05, "loss": 0.3931, "step": 2621 }, { "epoch": 17.025974025974026, "grad_norm": 1.6347843408584595, "learning_rate": 4.617100937236535e-05, "loss": 0.3652, "step": 2622 }, { "epoch": 17.032467532467532, "grad_norm": 1.9906270503997803, "learning_rate": 4.613968645800044e-05, "loss": 0.388, "step": 2623 }, { "epoch": 17.038961038961038, "grad_norm": 1.695497989654541, "learning_rate": 4.6108365067626173e-05, "loss": 0.3361, "step": 2624 }, { "epoch": 17.045454545454547, "grad_norm": 1.6013907194137573, "learning_rate": 4.607704521360775e-05, "loss": 0.3529, "step": 2625 }, { "epoch": 17.051948051948052, "grad_norm": 1.7877893447875977, "learning_rate": 4.604572690830976e-05, "loss": 0.3788, "step": 2626 }, { "epoch": 17.058441558441558, "grad_norm": 1.753179907798767, "learning_rate": 4.601441016409616e-05, "loss": 0.3366, "step": 2627 }, { "epoch": 17.064935064935064, "grad_norm": 1.8334606885910034, "learning_rate": 4.5983094993330296e-05, "loss": 0.3868, "step": 2628 }, { "epoch": 17.071428571428573, "grad_norm": 1.8474700450897217, "learning_rate": 4.595178140837491e-05, "loss": 0.3494, "step": 2629 }, { "epoch": 17.07792207792208, "grad_norm": 1.5837911367416382, "learning_rate": 4.592046942159213e-05, "loss": 0.3204, "step": 2630 }, { "epoch": 17.084415584415584, "grad_norm": 1.7848855257034302, "learning_rate": 4.5889159045343404e-05, "loss": 0.3784, "step": 2631 }, { "epoch": 17.09090909090909, "grad_norm": 1.7343623638153076, "learning_rate": 4.585785029198959e-05, "loss": 0.3682, "step": 2632 }, { "epoch": 17.0974025974026, "grad_norm": 1.8310134410858154, "learning_rate": 4.582654317389089e-05, "loss": 0.3325, "step": 2633 }, { "epoch": 17.103896103896105, "grad_norm": 1.6972695589065552, "learning_rate": 4.5795237703406815e-05, "loss": 0.3462, "step": 2634 }, { "epoch": 17.11038961038961, "grad_norm": 1.6968369483947754, "learning_rate": 4.576393389289633e-05, "loss": 0.3514, "step": 2635 }, { "epoch": 17.116883116883116, "grad_norm": 1.8728262186050415, "learning_rate": 4.573263175471766e-05, "loss": 0.3878, "step": 2636 }, { "epoch": 17.123376623376622, "grad_norm": 1.6569467782974243, "learning_rate": 4.570133130122839e-05, "loss": 0.3288, "step": 2637 }, { "epoch": 17.12987012987013, "grad_norm": 1.6950936317443848, "learning_rate": 4.5670032544785444e-05, "loss": 0.3613, "step": 2638 }, { "epoch": 17.136363636363637, "grad_norm": 2.0666229724884033, "learning_rate": 4.563873549774506e-05, "loss": 0.421, "step": 2639 }, { "epoch": 17.142857142857142, "grad_norm": 1.605417251586914, "learning_rate": 4.560744017246284e-05, "loss": 0.3439, "step": 2640 }, { "epoch": 17.149350649350648, "grad_norm": 1.8102631568908691, "learning_rate": 4.5576146581293685e-05, "loss": 0.347, "step": 2641 }, { "epoch": 17.155844155844157, "grad_norm": 1.6798969507217407, "learning_rate": 4.55448547365918e-05, "loss": 0.3587, "step": 2642 }, { "epoch": 17.162337662337663, "grad_norm": 1.8787752389907837, "learning_rate": 4.5513564650710706e-05, "loss": 0.384, "step": 2643 }, { "epoch": 17.16883116883117, "grad_norm": 1.726641058921814, "learning_rate": 4.5482276336003216e-05, "loss": 0.3529, "step": 2644 }, { "epoch": 17.175324675324674, "grad_norm": 1.798478364944458, "learning_rate": 4.5450989804821506e-05, "loss": 0.3896, "step": 2645 }, { "epoch": 17.181818181818183, "grad_norm": 1.7787280082702637, "learning_rate": 4.541970506951697e-05, "loss": 0.3562, "step": 2646 }, { "epoch": 17.18831168831169, "grad_norm": 1.7351130247116089, "learning_rate": 4.538842214244035e-05, "loss": 0.3803, "step": 2647 }, { "epoch": 17.194805194805195, "grad_norm": 1.817060947418213, "learning_rate": 4.535714103594162e-05, "loss": 0.3637, "step": 2648 }, { "epoch": 17.2012987012987, "grad_norm": 1.5878382921218872, "learning_rate": 4.5325861762370065e-05, "loss": 0.3189, "step": 2649 }, { "epoch": 17.207792207792206, "grad_norm": 1.7303762435913086, "learning_rate": 4.529458433407428e-05, "loss": 0.353, "step": 2650 }, { "epoch": 17.214285714285715, "grad_norm": 1.7414559125900269, "learning_rate": 4.5263308763402084e-05, "loss": 0.401, "step": 2651 }, { "epoch": 17.22077922077922, "grad_norm": 1.6657047271728516, "learning_rate": 4.5232035062700576e-05, "loss": 0.3355, "step": 2652 }, { "epoch": 17.227272727272727, "grad_norm": 1.6541603803634644, "learning_rate": 4.520076324431611e-05, "loss": 0.3215, "step": 2653 }, { "epoch": 17.233766233766232, "grad_norm": 1.8703174591064453, "learning_rate": 4.516949332059429e-05, "loss": 0.4035, "step": 2654 }, { "epoch": 17.24025974025974, "grad_norm": 1.7139230966567993, "learning_rate": 4.513822530388003e-05, "loss": 0.3451, "step": 2655 }, { "epoch": 17.246753246753247, "grad_norm": 1.6760883331298828, "learning_rate": 4.510695920651742e-05, "loss": 0.3406, "step": 2656 }, { "epoch": 17.253246753246753, "grad_norm": 1.9123879671096802, "learning_rate": 4.507569504084983e-05, "loss": 0.4033, "step": 2657 }, { "epoch": 17.25974025974026, "grad_norm": 1.6761144399642944, "learning_rate": 4.504443281921985e-05, "loss": 0.348, "step": 2658 }, { "epoch": 17.266233766233768, "grad_norm": 1.7181462049484253, "learning_rate": 4.50131725539693e-05, "loss": 0.3511, "step": 2659 }, { "epoch": 17.272727272727273, "grad_norm": 1.8143903017044067, "learning_rate": 4.498191425743925e-05, "loss": 0.3533, "step": 2660 }, { "epoch": 17.27922077922078, "grad_norm": 1.786970615386963, "learning_rate": 4.495065794196999e-05, "loss": 0.3727, "step": 2661 }, { "epoch": 17.285714285714285, "grad_norm": 1.7093663215637207, "learning_rate": 4.491940361990101e-05, "loss": 0.3428, "step": 2662 }, { "epoch": 17.292207792207794, "grad_norm": 1.6793053150177002, "learning_rate": 4.488815130357102e-05, "loss": 0.3436, "step": 2663 }, { "epoch": 17.2987012987013, "grad_norm": 1.768581748008728, "learning_rate": 4.4856901005317934e-05, "loss": 0.365, "step": 2664 }, { "epoch": 17.305194805194805, "grad_norm": 1.8647191524505615, "learning_rate": 4.482565273747888e-05, "loss": 0.4012, "step": 2665 }, { "epoch": 17.31168831168831, "grad_norm": 1.8021681308746338, "learning_rate": 4.4794406512390175e-05, "loss": 0.359, "step": 2666 }, { "epoch": 17.318181818181817, "grad_norm": 1.8641501665115356, "learning_rate": 4.476316234238734e-05, "loss": 0.3598, "step": 2667 }, { "epoch": 17.324675324675326, "grad_norm": 1.55692458152771, "learning_rate": 4.473192023980509e-05, "loss": 0.3413, "step": 2668 }, { "epoch": 17.33116883116883, "grad_norm": 1.716506004333496, "learning_rate": 4.4700680216977284e-05, "loss": 0.3661, "step": 2669 }, { "epoch": 17.337662337662337, "grad_norm": 1.8231031894683838, "learning_rate": 4.466944228623701e-05, "loss": 0.3692, "step": 2670 }, { "epoch": 17.344155844155843, "grad_norm": 1.843737006187439, "learning_rate": 4.463820645991651e-05, "loss": 0.3593, "step": 2671 }, { "epoch": 17.350649350649352, "grad_norm": 1.4156854152679443, "learning_rate": 4.460697275034717e-05, "loss": 0.3026, "step": 2672 }, { "epoch": 17.357142857142858, "grad_norm": 1.778434157371521, "learning_rate": 4.457574116985958e-05, "loss": 0.3415, "step": 2673 }, { "epoch": 17.363636363636363, "grad_norm": 1.7231427431106567, "learning_rate": 4.4544511730783466e-05, "loss": 0.3477, "step": 2674 }, { "epoch": 17.37012987012987, "grad_norm": 1.5872408151626587, "learning_rate": 4.451328444544774e-05, "loss": 0.3341, "step": 2675 }, { "epoch": 17.376623376623378, "grad_norm": 1.6699098348617554, "learning_rate": 4.448205932618042e-05, "loss": 0.3019, "step": 2676 }, { "epoch": 17.383116883116884, "grad_norm": 1.838097333908081, "learning_rate": 4.4450836385308684e-05, "loss": 0.3664, "step": 2677 }, { "epoch": 17.38961038961039, "grad_norm": 1.6604009866714478, "learning_rate": 4.4419615635158875e-05, "loss": 0.3643, "step": 2678 }, { "epoch": 17.396103896103895, "grad_norm": 1.7098332643508911, "learning_rate": 4.43883970880564e-05, "loss": 0.3629, "step": 2679 }, { "epoch": 17.4025974025974, "grad_norm": 1.7673627138137817, "learning_rate": 4.435718075632592e-05, "loss": 0.3516, "step": 2680 }, { "epoch": 17.40909090909091, "grad_norm": 1.7328159809112549, "learning_rate": 4.4325966652291103e-05, "loss": 0.3904, "step": 2681 }, { "epoch": 17.415584415584416, "grad_norm": 1.6068525314331055, "learning_rate": 4.4294754788274796e-05, "loss": 0.3365, "step": 2682 }, { "epoch": 17.42207792207792, "grad_norm": 1.8727941513061523, "learning_rate": 4.426354517659894e-05, "loss": 0.4211, "step": 2683 }, { "epoch": 17.428571428571427, "grad_norm": 1.8097883462905884, "learning_rate": 4.4232337829584585e-05, "loss": 0.3741, "step": 2684 }, { "epoch": 17.435064935064936, "grad_norm": 1.8538397550582886, "learning_rate": 4.4201132759551934e-05, "loss": 0.3956, "step": 2685 }, { "epoch": 17.441558441558442, "grad_norm": 1.915847897529602, "learning_rate": 4.4169929978820227e-05, "loss": 0.439, "step": 2686 }, { "epoch": 17.448051948051948, "grad_norm": 1.6578996181488037, "learning_rate": 4.4138729499707844e-05, "loss": 0.3662, "step": 2687 }, { "epoch": 17.454545454545453, "grad_norm": 1.997751235961914, "learning_rate": 4.410753133453222e-05, "loss": 0.4039, "step": 2688 }, { "epoch": 17.461038961038962, "grad_norm": 1.6750801801681519, "learning_rate": 4.4076335495609914e-05, "loss": 0.3447, "step": 2689 }, { "epoch": 17.467532467532468, "grad_norm": 1.831777572631836, "learning_rate": 4.404514199525651e-05, "loss": 0.3775, "step": 2690 }, { "epoch": 17.474025974025974, "grad_norm": 1.9386937618255615, "learning_rate": 4.4013950845786764e-05, "loss": 0.3797, "step": 2691 }, { "epoch": 17.48051948051948, "grad_norm": 1.960254430770874, "learning_rate": 4.398276205951443e-05, "loss": 0.3782, "step": 2692 }, { "epoch": 17.48701298701299, "grad_norm": 1.953656554222107, "learning_rate": 4.395157564875234e-05, "loss": 0.4166, "step": 2693 }, { "epoch": 17.493506493506494, "grad_norm": 1.8518905639648438, "learning_rate": 4.392039162581239e-05, "loss": 0.3922, "step": 2694 }, { "epoch": 17.5, "grad_norm": 1.8920955657958984, "learning_rate": 4.3889210003005524e-05, "loss": 0.404, "step": 2695 }, { "epoch": 17.506493506493506, "grad_norm": 1.8469494581222534, "learning_rate": 4.38580307926418e-05, "loss": 0.3604, "step": 2696 }, { "epoch": 17.51298701298701, "grad_norm": 1.9427776336669922, "learning_rate": 4.382685400703024e-05, "loss": 0.3724, "step": 2697 }, { "epoch": 17.51948051948052, "grad_norm": 1.779253602027893, "learning_rate": 4.379567965847896e-05, "loss": 0.3737, "step": 2698 }, { "epoch": 17.525974025974026, "grad_norm": 1.773774266242981, "learning_rate": 4.376450775929509e-05, "loss": 0.3573, "step": 2699 }, { "epoch": 17.532467532467532, "grad_norm": 1.6752811670303345, "learning_rate": 4.373333832178478e-05, "loss": 0.3688, "step": 2700 }, { "epoch": 17.538961038961038, "grad_norm": 1.5234564542770386, "learning_rate": 4.370217135825329e-05, "loss": 0.3044, "step": 2701 }, { "epoch": 17.545454545454547, "grad_norm": 2.0020792484283447, "learning_rate": 4.36710068810048e-05, "loss": 0.4208, "step": 2702 }, { "epoch": 17.551948051948052, "grad_norm": 2.003157615661621, "learning_rate": 4.363984490234256e-05, "loss": 0.4105, "step": 2703 }, { "epoch": 17.558441558441558, "grad_norm": 1.762168288230896, "learning_rate": 4.360868543456883e-05, "loss": 0.3403, "step": 2704 }, { "epoch": 17.564935064935064, "grad_norm": 1.7689694166183472, "learning_rate": 4.3577528489984854e-05, "loss": 0.3379, "step": 2705 }, { "epoch": 17.571428571428573, "grad_norm": 1.7085871696472168, "learning_rate": 4.354637408089093e-05, "loss": 0.3477, "step": 2706 }, { "epoch": 17.57792207792208, "grad_norm": 2.000523567199707, "learning_rate": 4.3515222219586325e-05, "loss": 0.4261, "step": 2707 }, { "epoch": 17.584415584415584, "grad_norm": 1.7377650737762451, "learning_rate": 4.348407291836928e-05, "loss": 0.3524, "step": 2708 }, { "epoch": 17.59090909090909, "grad_norm": 1.874767541885376, "learning_rate": 4.3452926189537056e-05, "loss": 0.393, "step": 2709 }, { "epoch": 17.5974025974026, "grad_norm": 1.8511624336242676, "learning_rate": 4.342178204538588e-05, "loss": 0.42, "step": 2710 }, { "epoch": 17.603896103896105, "grad_norm": 1.9704468250274658, "learning_rate": 4.339064049821097e-05, "loss": 0.3672, "step": 2711 }, { "epoch": 17.61038961038961, "grad_norm": 1.6300371885299683, "learning_rate": 4.3359501560306535e-05, "loss": 0.3519, "step": 2712 }, { "epoch": 17.616883116883116, "grad_norm": 2.0243921279907227, "learning_rate": 4.332836524396571e-05, "loss": 0.409, "step": 2713 }, { "epoch": 17.623376623376622, "grad_norm": 1.8632975816726685, "learning_rate": 4.329723156148063e-05, "loss": 0.3849, "step": 2714 }, { "epoch": 17.62987012987013, "grad_norm": 1.7771358489990234, "learning_rate": 4.326610052514237e-05, "loss": 0.3591, "step": 2715 }, { "epoch": 17.636363636363637, "grad_norm": 1.7966933250427246, "learning_rate": 4.3234972147240996e-05, "loss": 0.3752, "step": 2716 }, { "epoch": 17.642857142857142, "grad_norm": 1.7768160104751587, "learning_rate": 4.320384644006546e-05, "loss": 0.3789, "step": 2717 }, { "epoch": 17.649350649350648, "grad_norm": 1.9179160594940186, "learning_rate": 4.317272341590373e-05, "loss": 0.4499, "step": 2718 }, { "epoch": 17.655844155844157, "grad_norm": 1.9707324504852295, "learning_rate": 4.314160308704268e-05, "loss": 0.4113, "step": 2719 }, { "epoch": 17.662337662337663, "grad_norm": 1.993179202079773, "learning_rate": 4.3110485465768096e-05, "loss": 0.4244, "step": 2720 }, { "epoch": 17.66883116883117, "grad_norm": 1.8198834657669067, "learning_rate": 4.3079370564364755e-05, "loss": 0.3837, "step": 2721 }, { "epoch": 17.675324675324674, "grad_norm": 1.6581958532333374, "learning_rate": 4.304825839511632e-05, "loss": 0.3737, "step": 2722 }, { "epoch": 17.681818181818183, "grad_norm": 1.9836208820343018, "learning_rate": 4.301714897030537e-05, "loss": 0.4257, "step": 2723 }, { "epoch": 17.68831168831169, "grad_norm": 1.7951767444610596, "learning_rate": 4.298604230221341e-05, "loss": 0.3925, "step": 2724 }, { "epoch": 17.694805194805195, "grad_norm": 1.8457961082458496, "learning_rate": 4.295493840312087e-05, "loss": 0.3986, "step": 2725 }, { "epoch": 17.7012987012987, "grad_norm": 1.9345061779022217, "learning_rate": 4.292383728530708e-05, "loss": 0.4044, "step": 2726 }, { "epoch": 17.707792207792206, "grad_norm": 1.492247462272644, "learning_rate": 4.289273896105027e-05, "loss": 0.322, "step": 2727 }, { "epoch": 17.714285714285715, "grad_norm": 1.974740982055664, "learning_rate": 4.2861643442627564e-05, "loss": 0.4248, "step": 2728 }, { "epoch": 17.72077922077922, "grad_norm": 1.7309398651123047, "learning_rate": 4.283055074231498e-05, "loss": 0.3993, "step": 2729 }, { "epoch": 17.727272727272727, "grad_norm": 1.7410228252410889, "learning_rate": 4.2799460872387394e-05, "loss": 0.3579, "step": 2730 }, { "epoch": 17.733766233766232, "grad_norm": 1.7010009288787842, "learning_rate": 4.2768373845118634e-05, "loss": 0.3611, "step": 2731 }, { "epoch": 17.74025974025974, "grad_norm": 1.7742600440979004, "learning_rate": 4.273728967278137e-05, "loss": 0.383, "step": 2732 }, { "epoch": 17.746753246753247, "grad_norm": 1.7612049579620361, "learning_rate": 4.270620836764712e-05, "loss": 0.3879, "step": 2733 }, { "epoch": 17.753246753246753, "grad_norm": 1.7385255098342896, "learning_rate": 4.267512994198629e-05, "loss": 0.367, "step": 2734 }, { "epoch": 17.75974025974026, "grad_norm": 1.9281290769577026, "learning_rate": 4.264405440806813e-05, "loss": 0.4054, "step": 2735 }, { "epoch": 17.766233766233768, "grad_norm": 1.699860692024231, "learning_rate": 4.261298177816082e-05, "loss": 0.3631, "step": 2736 }, { "epoch": 17.772727272727273, "grad_norm": 1.6525532007217407, "learning_rate": 4.258191206453131e-05, "loss": 0.3216, "step": 2737 }, { "epoch": 17.77922077922078, "grad_norm": 2.0260276794433594, "learning_rate": 4.255084527944545e-05, "loss": 0.4445, "step": 2738 }, { "epoch": 17.785714285714285, "grad_norm": 1.752463936805725, "learning_rate": 4.251978143516789e-05, "loss": 0.403, "step": 2739 }, { "epoch": 17.792207792207794, "grad_norm": 1.74684739112854, "learning_rate": 4.2488720543962146e-05, "loss": 0.3457, "step": 2740 }, { "epoch": 17.7987012987013, "grad_norm": 1.8829489946365356, "learning_rate": 4.245766261809059e-05, "loss": 0.4042, "step": 2741 }, { "epoch": 17.805194805194805, "grad_norm": 1.8385472297668457, "learning_rate": 4.242660766981438e-05, "loss": 0.3831, "step": 2742 }, { "epoch": 17.81168831168831, "grad_norm": 1.8216915130615234, "learning_rate": 4.239555571139353e-05, "loss": 0.3995, "step": 2743 }, { "epoch": 17.818181818181817, "grad_norm": 1.844178557395935, "learning_rate": 4.236450675508685e-05, "loss": 0.3864, "step": 2744 }, { "epoch": 17.824675324675326, "grad_norm": 1.8179407119750977, "learning_rate": 4.233346081315196e-05, "loss": 0.3717, "step": 2745 }, { "epoch": 17.83116883116883, "grad_norm": 1.8247555494308472, "learning_rate": 4.230241789784535e-05, "loss": 0.375, "step": 2746 }, { "epoch": 17.837662337662337, "grad_norm": 2.0518603324890137, "learning_rate": 4.2271378021422246e-05, "loss": 0.4371, "step": 2747 }, { "epoch": 17.844155844155843, "grad_norm": 1.748353362083435, "learning_rate": 4.22403411961367e-05, "loss": 0.3734, "step": 2748 }, { "epoch": 17.850649350649352, "grad_norm": 1.8702881336212158, "learning_rate": 4.2209307434241566e-05, "loss": 0.4179, "step": 2749 }, { "epoch": 17.857142857142858, "grad_norm": 1.793709635734558, "learning_rate": 4.2178276747988446e-05, "loss": 0.3758, "step": 2750 }, { "epoch": 17.863636363636363, "grad_norm": 1.9019750356674194, "learning_rate": 4.2147249149627824e-05, "loss": 0.3735, "step": 2751 }, { "epoch": 17.87012987012987, "grad_norm": 1.9998383522033691, "learning_rate": 4.211622465140887e-05, "loss": 0.4174, "step": 2752 }, { "epoch": 17.876623376623378, "grad_norm": 1.9233485460281372, "learning_rate": 4.208520326557957e-05, "loss": 0.4312, "step": 2753 }, { "epoch": 17.883116883116884, "grad_norm": 1.8096988201141357, "learning_rate": 4.205418500438667e-05, "loss": 0.3878, "step": 2754 }, { "epoch": 17.88961038961039, "grad_norm": 1.7870423793792725, "learning_rate": 4.202316988007567e-05, "loss": 0.3795, "step": 2755 }, { "epoch": 17.896103896103895, "grad_norm": 1.7298967838287354, "learning_rate": 4.1992157904890905e-05, "loss": 0.3722, "step": 2756 }, { "epoch": 17.9025974025974, "grad_norm": 1.8235975503921509, "learning_rate": 4.1961149091075376e-05, "loss": 0.3867, "step": 2757 }, { "epoch": 17.90909090909091, "grad_norm": 1.6556315422058105, "learning_rate": 4.193014345087087e-05, "loss": 0.3617, "step": 2758 }, { "epoch": 17.915584415584416, "grad_norm": 1.9418789148330688, "learning_rate": 4.1899140996517934e-05, "loss": 0.4103, "step": 2759 }, { "epoch": 17.92207792207792, "grad_norm": 1.6838346719741821, "learning_rate": 4.1868141740255823e-05, "loss": 0.3368, "step": 2760 }, { "epoch": 17.928571428571427, "grad_norm": 1.7457374334335327, "learning_rate": 4.183714569432258e-05, "loss": 0.3816, "step": 2761 }, { "epoch": 17.935064935064936, "grad_norm": 1.7318838834762573, "learning_rate": 4.1806152870954935e-05, "loss": 0.3735, "step": 2762 }, { "epoch": 17.941558441558442, "grad_norm": 1.7461531162261963, "learning_rate": 4.177516328238838e-05, "loss": 0.3603, "step": 2763 }, { "epoch": 17.948051948051948, "grad_norm": 1.9221315383911133, "learning_rate": 4.1744176940857107e-05, "loss": 0.4195, "step": 2764 }, { "epoch": 17.954545454545453, "grad_norm": 1.7519389390945435, "learning_rate": 4.171319385859401e-05, "loss": 0.3933, "step": 2765 }, { "epoch": 17.961038961038962, "grad_norm": 1.6337523460388184, "learning_rate": 4.168221404783076e-05, "loss": 0.3438, "step": 2766 }, { "epoch": 17.967532467532468, "grad_norm": 1.9572718143463135, "learning_rate": 4.165123752079768e-05, "loss": 0.4389, "step": 2767 }, { "epoch": 17.974025974025974, "grad_norm": 1.9795647859573364, "learning_rate": 4.16202642897238e-05, "loss": 0.4443, "step": 2768 }, { "epoch": 17.98051948051948, "grad_norm": 1.9309934377670288, "learning_rate": 4.158929436683686e-05, "loss": 0.3811, "step": 2769 }, { "epoch": 17.98701298701299, "grad_norm": 1.843565583229065, "learning_rate": 4.15583277643633e-05, "loss": 0.3763, "step": 2770 }, { "epoch": 17.993506493506494, "grad_norm": 1.6526652574539185, "learning_rate": 4.152736449452827e-05, "loss": 0.3714, "step": 2771 }, { "epoch": 18.0, "grad_norm": 437.5696716308594, "learning_rate": 4.149640456955555e-05, "loss": 0.3076, "step": 2772 }, { "epoch": 18.006493506493506, "grad_norm": 1.841978669166565, "learning_rate": 4.146544800166764e-05, "loss": 0.3601, "step": 2773 }, { "epoch": 18.01298701298701, "grad_norm": 1.8681656122207642, "learning_rate": 4.143449480308569e-05, "loss": 0.3841, "step": 2774 }, { "epoch": 18.01948051948052, "grad_norm": 1.8575022220611572, "learning_rate": 4.140354498602952e-05, "loss": 0.3686, "step": 2775 }, { "epoch": 18.025974025974026, "grad_norm": 1.8024266958236694, "learning_rate": 4.137259856271767e-05, "loss": 0.3586, "step": 2776 }, { "epoch": 18.032467532467532, "grad_norm": 1.7548918724060059, "learning_rate": 4.134165554536728e-05, "loss": 0.3455, "step": 2777 }, { "epoch": 18.038961038961038, "grad_norm": 1.745211124420166, "learning_rate": 4.131071594619416e-05, "loss": 0.3417, "step": 2778 }, { "epoch": 18.045454545454547, "grad_norm": 1.6816892623901367, "learning_rate": 4.127977977741276e-05, "loss": 0.3465, "step": 2779 }, { "epoch": 18.051948051948052, "grad_norm": 1.405104637145996, "learning_rate": 4.1248847051236195e-05, "loss": 0.2655, "step": 2780 }, { "epoch": 18.058441558441558, "grad_norm": 1.5914933681488037, "learning_rate": 4.1217917779876235e-05, "loss": 0.3392, "step": 2781 }, { "epoch": 18.064935064935064, "grad_norm": 1.7541837692260742, "learning_rate": 4.118699197554326e-05, "loss": 0.3566, "step": 2782 }, { "epoch": 18.071428571428573, "grad_norm": 1.7105783224105835, "learning_rate": 4.115606965044628e-05, "loss": 0.3254, "step": 2783 }, { "epoch": 18.07792207792208, "grad_norm": 1.8172639608383179, "learning_rate": 4.112515081679294e-05, "loss": 0.3437, "step": 2784 }, { "epoch": 18.084415584415584, "grad_norm": 1.8359875679016113, "learning_rate": 4.109423548678949e-05, "loss": 0.3812, "step": 2785 }, { "epoch": 18.09090909090909, "grad_norm": 1.814726710319519, "learning_rate": 4.1063323672640844e-05, "loss": 0.3743, "step": 2786 }, { "epoch": 18.0974025974026, "grad_norm": 1.6159942150115967, "learning_rate": 4.103241538655049e-05, "loss": 0.3147, "step": 2787 }, { "epoch": 18.103896103896105, "grad_norm": 1.7023439407348633, "learning_rate": 4.100151064072052e-05, "loss": 0.3237, "step": 2788 }, { "epoch": 18.11038961038961, "grad_norm": 1.512913465499878, "learning_rate": 4.097060944735164e-05, "loss": 0.2579, "step": 2789 }, { "epoch": 18.116883116883116, "grad_norm": 1.7844362258911133, "learning_rate": 4.093971181864313e-05, "loss": 0.3757, "step": 2790 }, { "epoch": 18.123376623376622, "grad_norm": 1.62827467918396, "learning_rate": 4.0908817766792923e-05, "loss": 0.3283, "step": 2791 }, { "epoch": 18.12987012987013, "grad_norm": 1.5056383609771729, "learning_rate": 4.087792730399749e-05, "loss": 0.2963, "step": 2792 }, { "epoch": 18.136363636363637, "grad_norm": 1.7066633701324463, "learning_rate": 4.0847040442451895e-05, "loss": 0.3201, "step": 2793 }, { "epoch": 18.142857142857142, "grad_norm": 1.7512034177780151, "learning_rate": 4.081615719434977e-05, "loss": 0.3273, "step": 2794 }, { "epoch": 18.149350649350648, "grad_norm": 1.8144875764846802, "learning_rate": 4.078527757188333e-05, "loss": 0.3372, "step": 2795 }, { "epoch": 18.155844155844157, "grad_norm": 1.670381784439087, "learning_rate": 4.075440158724338e-05, "loss": 0.3273, "step": 2796 }, { "epoch": 18.162337662337663, "grad_norm": 1.6355849504470825, "learning_rate": 4.072352925261927e-05, "loss": 0.3242, "step": 2797 }, { "epoch": 18.16883116883117, "grad_norm": 1.871863842010498, "learning_rate": 4.06926605801989e-05, "loss": 0.3401, "step": 2798 }, { "epoch": 18.175324675324674, "grad_norm": 1.826072335243225, "learning_rate": 4.066179558216874e-05, "loss": 0.3683, "step": 2799 }, { "epoch": 18.181818181818183, "grad_norm": 1.5390807390213013, "learning_rate": 4.063093427071376e-05, "loss": 0.2713, "step": 2800 }, { "epoch": 18.18831168831169, "grad_norm": 1.5793615579605103, "learning_rate": 4.0600076658017585e-05, "loss": 0.3185, "step": 2801 }, { "epoch": 18.194805194805195, "grad_norm": 1.7266685962677002, "learning_rate": 4.056922275626227e-05, "loss": 0.3375, "step": 2802 }, { "epoch": 18.2012987012987, "grad_norm": 1.694481611251831, "learning_rate": 4.053837257762846e-05, "loss": 0.3055, "step": 2803 }, { "epoch": 18.207792207792206, "grad_norm": 1.4903581142425537, "learning_rate": 4.0507526134295314e-05, "loss": 0.3, "step": 2804 }, { "epoch": 18.214285714285715, "grad_norm": 1.7501603364944458, "learning_rate": 4.047668343844051e-05, "loss": 0.3634, "step": 2805 }, { "epoch": 18.22077922077922, "grad_norm": 1.8231741189956665, "learning_rate": 4.044584450224026e-05, "loss": 0.3531, "step": 2806 }, { "epoch": 18.227272727272727, "grad_norm": 1.8030606508255005, "learning_rate": 4.04150093378693e-05, "loss": 0.3794, "step": 2807 }, { "epoch": 18.233766233766232, "grad_norm": 1.8993045091629028, "learning_rate": 4.0384177957500866e-05, "loss": 0.3746, "step": 2808 }, { "epoch": 18.24025974025974, "grad_norm": 1.7921812534332275, "learning_rate": 4.035335037330668e-05, "loss": 0.3513, "step": 2809 }, { "epoch": 18.246753246753247, "grad_norm": 1.8197550773620605, "learning_rate": 4.032252659745699e-05, "loss": 0.3511, "step": 2810 }, { "epoch": 18.253246753246753, "grad_norm": 1.6334441900253296, "learning_rate": 4.029170664212054e-05, "loss": 0.3345, "step": 2811 }, { "epoch": 18.25974025974026, "grad_norm": 1.8141868114471436, "learning_rate": 4.026089051946457e-05, "loss": 0.3591, "step": 2812 }, { "epoch": 18.266233766233768, "grad_norm": 1.4973831176757812, "learning_rate": 4.023007824165476e-05, "loss": 0.3168, "step": 2813 }, { "epoch": 18.272727272727273, "grad_norm": 1.5394607782363892, "learning_rate": 4.0199269820855355e-05, "loss": 0.2791, "step": 2814 }, { "epoch": 18.27922077922078, "grad_norm": 1.6770823001861572, "learning_rate": 4.0168465269229007e-05, "loss": 0.3641, "step": 2815 }, { "epoch": 18.285714285714285, "grad_norm": 1.8351795673370361, "learning_rate": 4.0137664598936857e-05, "loss": 0.3658, "step": 2816 }, { "epoch": 18.292207792207794, "grad_norm": 1.497967004776001, "learning_rate": 4.010686782213855e-05, "loss": 0.2957, "step": 2817 }, { "epoch": 18.2987012987013, "grad_norm": 1.9181640148162842, "learning_rate": 4.007607495099215e-05, "loss": 0.4133, "step": 2818 }, { "epoch": 18.305194805194805, "grad_norm": 1.8355867862701416, "learning_rate": 4.004528599765419e-05, "loss": 0.3352, "step": 2819 }, { "epoch": 18.31168831168831, "grad_norm": 1.7657628059387207, "learning_rate": 4.001450097427966e-05, "loss": 0.3513, "step": 2820 }, { "epoch": 18.318181818181817, "grad_norm": 1.620801568031311, "learning_rate": 3.9983719893022e-05, "loss": 0.3259, "step": 2821 }, { "epoch": 18.324675324675326, "grad_norm": 1.5936059951782227, "learning_rate": 3.9952942766033114e-05, "loss": 0.3073, "step": 2822 }, { "epoch": 18.33116883116883, "grad_norm": 1.7015280723571777, "learning_rate": 3.992216960546331e-05, "loss": 0.3482, "step": 2823 }, { "epoch": 18.337662337662337, "grad_norm": 1.7215479612350464, "learning_rate": 3.989140042346134e-05, "loss": 0.36, "step": 2824 }, { "epoch": 18.344155844155843, "grad_norm": 1.6612660884857178, "learning_rate": 3.986063523217439e-05, "loss": 0.3187, "step": 2825 }, { "epoch": 18.350649350649352, "grad_norm": 1.6512391567230225, "learning_rate": 3.9829874043748064e-05, "loss": 0.335, "step": 2826 }, { "epoch": 18.357142857142858, "grad_norm": 1.6896615028381348, "learning_rate": 3.9799116870326415e-05, "loss": 0.3393, "step": 2827 }, { "epoch": 18.363636363636363, "grad_norm": 1.7979546785354614, "learning_rate": 3.976836372405188e-05, "loss": 0.365, "step": 2828 }, { "epoch": 18.37012987012987, "grad_norm": 1.6790097951889038, "learning_rate": 3.97376146170653e-05, "loss": 0.3236, "step": 2829 }, { "epoch": 18.376623376623378, "grad_norm": 1.6865733861923218, "learning_rate": 3.9706869561505946e-05, "loss": 0.3563, "step": 2830 }, { "epoch": 18.383116883116884, "grad_norm": 1.7411903142929077, "learning_rate": 3.967612856951146e-05, "loss": 0.3521, "step": 2831 }, { "epoch": 18.38961038961039, "grad_norm": 1.6182326078414917, "learning_rate": 3.964539165321794e-05, "loss": 0.3391, "step": 2832 }, { "epoch": 18.396103896103895, "grad_norm": 1.5759528875350952, "learning_rate": 3.9614658824759806e-05, "loss": 0.3185, "step": 2833 }, { "epoch": 18.4025974025974, "grad_norm": 1.7490513324737549, "learning_rate": 3.95839300962699e-05, "loss": 0.3671, "step": 2834 }, { "epoch": 18.40909090909091, "grad_norm": 1.6510854959487915, "learning_rate": 3.955320547987943e-05, "loss": 0.3546, "step": 2835 }, { "epoch": 18.415584415584416, "grad_norm": 1.653049349784851, "learning_rate": 3.952248498771797e-05, "loss": 0.3631, "step": 2836 }, { "epoch": 18.42207792207792, "grad_norm": 1.5997154712677002, "learning_rate": 3.949176863191353e-05, "loss": 0.3167, "step": 2837 }, { "epoch": 18.428571428571427, "grad_norm": 1.8495252132415771, "learning_rate": 3.9461056424592416e-05, "loss": 0.3752, "step": 2838 }, { "epoch": 18.435064935064936, "grad_norm": 1.8384591341018677, "learning_rate": 3.943034837787931e-05, "loss": 0.386, "step": 2839 }, { "epoch": 18.441558441558442, "grad_norm": 1.7558294534683228, "learning_rate": 3.939964450389728e-05, "loss": 0.3559, "step": 2840 }, { "epoch": 18.448051948051948, "grad_norm": 1.6538680791854858, "learning_rate": 3.93689448147677e-05, "loss": 0.3625, "step": 2841 }, { "epoch": 18.454545454545453, "grad_norm": 1.6745864152908325, "learning_rate": 3.933824932261037e-05, "loss": 0.3418, "step": 2842 }, { "epoch": 18.461038961038962, "grad_norm": 1.6941906213760376, "learning_rate": 3.9307558039543355e-05, "loss": 0.3432, "step": 2843 }, { "epoch": 18.467532467532468, "grad_norm": 1.6110925674438477, "learning_rate": 3.927687097768309e-05, "loss": 0.3184, "step": 2844 }, { "epoch": 18.474025974025974, "grad_norm": 1.6785582304000854, "learning_rate": 3.924618814914434e-05, "loss": 0.3508, "step": 2845 }, { "epoch": 18.48051948051948, "grad_norm": 1.727658748626709, "learning_rate": 3.9215509566040184e-05, "loss": 0.3422, "step": 2846 }, { "epoch": 18.48701298701299, "grad_norm": 1.7198506593704224, "learning_rate": 3.918483524048207e-05, "loss": 0.3722, "step": 2847 }, { "epoch": 18.493506493506494, "grad_norm": 1.7855008840560913, "learning_rate": 3.915416518457974e-05, "loss": 0.3792, "step": 2848 }, { "epoch": 18.5, "grad_norm": 1.8613656759262085, "learning_rate": 3.912349941044122e-05, "loss": 0.3964, "step": 2849 }, { "epoch": 18.506493506493506, "grad_norm": 1.662196397781372, "learning_rate": 3.9092837930172884e-05, "loss": 0.3297, "step": 2850 }, { "epoch": 18.51298701298701, "grad_norm": 1.7454947233200073, "learning_rate": 3.9062180755879373e-05, "loss": 0.3956, "step": 2851 }, { "epoch": 18.51948051948052, "grad_norm": 1.6198335886001587, "learning_rate": 3.90315278996637e-05, "loss": 0.3381, "step": 2852 }, { "epoch": 18.525974025974026, "grad_norm": 1.8167154788970947, "learning_rate": 3.900087937362711e-05, "loss": 0.3992, "step": 2853 }, { "epoch": 18.532467532467532, "grad_norm": 1.5783976316452026, "learning_rate": 3.897023518986915e-05, "loss": 0.339, "step": 2854 }, { "epoch": 18.538961038961038, "grad_norm": 1.6981533765792847, "learning_rate": 3.893959536048765e-05, "loss": 0.338, "step": 2855 }, { "epoch": 18.545454545454547, "grad_norm": 1.7285051345825195, "learning_rate": 3.890895989757874e-05, "loss": 0.3378, "step": 2856 }, { "epoch": 18.551948051948052, "grad_norm": 1.6622600555419922, "learning_rate": 3.887832881323681e-05, "loss": 0.3304, "step": 2857 }, { "epoch": 18.558441558441558, "grad_norm": 1.5686423778533936, "learning_rate": 3.884770211955454e-05, "loss": 0.3254, "step": 2858 }, { "epoch": 18.564935064935064, "grad_norm": 1.7750526666641235, "learning_rate": 3.8817079828622856e-05, "loss": 0.35, "step": 2859 }, { "epoch": 18.571428571428573, "grad_norm": 1.7655175924301147, "learning_rate": 3.878646195253095e-05, "loss": 0.3663, "step": 2860 }, { "epoch": 18.57792207792208, "grad_norm": 1.6904159784317017, "learning_rate": 3.875584850336627e-05, "loss": 0.327, "step": 2861 }, { "epoch": 18.584415584415584, "grad_norm": 1.5708426237106323, "learning_rate": 3.872523949321454e-05, "loss": 0.3237, "step": 2862 }, { "epoch": 18.59090909090909, "grad_norm": 1.6561986207962036, "learning_rate": 3.869463493415969e-05, "loss": 0.3312, "step": 2863 }, { "epoch": 18.5974025974026, "grad_norm": 1.74215567111969, "learning_rate": 3.8664034838283924e-05, "loss": 0.3773, "step": 2864 }, { "epoch": 18.603896103896105, "grad_norm": 1.8055598735809326, "learning_rate": 3.863343921766769e-05, "loss": 0.3939, "step": 2865 }, { "epoch": 18.61038961038961, "grad_norm": 1.6521251201629639, "learning_rate": 3.860284808438962e-05, "loss": 0.345, "step": 2866 }, { "epoch": 18.616883116883116, "grad_norm": 1.633084774017334, "learning_rate": 3.857226145052665e-05, "loss": 0.3369, "step": 2867 }, { "epoch": 18.623376623376622, "grad_norm": 1.579702377319336, "learning_rate": 3.854167932815387e-05, "loss": 0.3075, "step": 2868 }, { "epoch": 18.62987012987013, "grad_norm": 1.4390311241149902, "learning_rate": 3.851110172934463e-05, "loss": 0.2763, "step": 2869 }, { "epoch": 18.636363636363637, "grad_norm": 1.6653575897216797, "learning_rate": 3.848052866617049e-05, "loss": 0.3611, "step": 2870 }, { "epoch": 18.642857142857142, "grad_norm": 1.8293627500534058, "learning_rate": 3.8449960150701165e-05, "loss": 0.3347, "step": 2871 }, { "epoch": 18.649350649350648, "grad_norm": 1.6536465883255005, "learning_rate": 3.841939619500468e-05, "loss": 0.3093, "step": 2872 }, { "epoch": 18.655844155844157, "grad_norm": 1.714642882347107, "learning_rate": 3.8388836811147176e-05, "loss": 0.328, "step": 2873 }, { "epoch": 18.662337662337663, "grad_norm": 1.7932056188583374, "learning_rate": 3.835828201119301e-05, "loss": 0.3588, "step": 2874 }, { "epoch": 18.66883116883117, "grad_norm": 1.773572564125061, "learning_rate": 3.832773180720475e-05, "loss": 0.3578, "step": 2875 }, { "epoch": 18.675324675324674, "grad_norm": 1.6852892637252808, "learning_rate": 3.829718621124308e-05, "loss": 0.34, "step": 2876 }, { "epoch": 18.681818181818183, "grad_norm": 1.751536250114441, "learning_rate": 3.8266645235367e-05, "loss": 0.3527, "step": 2877 }, { "epoch": 18.68831168831169, "grad_norm": 1.6013262271881104, "learning_rate": 3.8236108891633535e-05, "loss": 0.3312, "step": 2878 }, { "epoch": 18.694805194805195, "grad_norm": 1.51932954788208, "learning_rate": 3.820557719209799e-05, "loss": 0.2631, "step": 2879 }, { "epoch": 18.7012987012987, "grad_norm": 1.4073882102966309, "learning_rate": 3.817505014881378e-05, "loss": 0.294, "step": 2880 }, { "epoch": 18.707792207792206, "grad_norm": 1.9614096879959106, "learning_rate": 3.8144527773832476e-05, "loss": 0.3926, "step": 2881 }, { "epoch": 18.714285714285715, "grad_norm": 1.650420904159546, "learning_rate": 3.811401007920388e-05, "loss": 0.3025, "step": 2882 }, { "epoch": 18.72077922077922, "grad_norm": 1.8370767831802368, "learning_rate": 3.808349707697586e-05, "loss": 0.3491, "step": 2883 }, { "epoch": 18.727272727272727, "grad_norm": 1.7715657949447632, "learning_rate": 3.8052988779194475e-05, "loss": 0.3387, "step": 2884 }, { "epoch": 18.733766233766232, "grad_norm": 1.6910730600357056, "learning_rate": 3.8022485197903925e-05, "loss": 0.3481, "step": 2885 }, { "epoch": 18.74025974025974, "grad_norm": 1.5854307413101196, "learning_rate": 3.7991986345146505e-05, "loss": 0.3068, "step": 2886 }, { "epoch": 18.746753246753247, "grad_norm": 1.5841996669769287, "learning_rate": 3.796149223296272e-05, "loss": 0.3202, "step": 2887 }, { "epoch": 18.753246753246753, "grad_norm": 1.6605578660964966, "learning_rate": 3.7931002873391154e-05, "loss": 0.3359, "step": 2888 }, { "epoch": 18.75974025974026, "grad_norm": 1.7045667171478271, "learning_rate": 3.790051827846851e-05, "loss": 0.3556, "step": 2889 }, { "epoch": 18.766233766233768, "grad_norm": 1.8820583820343018, "learning_rate": 3.787003846022964e-05, "loss": 0.3696, "step": 2890 }, { "epoch": 18.772727272727273, "grad_norm": 1.895405888557434, "learning_rate": 3.783956343070746e-05, "loss": 0.3668, "step": 2891 }, { "epoch": 18.77922077922078, "grad_norm": 1.6554292440414429, "learning_rate": 3.780909320193308e-05, "loss": 0.3216, "step": 2892 }, { "epoch": 18.785714285714285, "grad_norm": 1.6428215503692627, "learning_rate": 3.7778627785935626e-05, "loss": 0.3389, "step": 2893 }, { "epoch": 18.792207792207794, "grad_norm": 1.714737892150879, "learning_rate": 3.774816719474238e-05, "loss": 0.3754, "step": 2894 }, { "epoch": 18.7987012987013, "grad_norm": 1.8068455457687378, "learning_rate": 3.7717711440378694e-05, "loss": 0.3714, "step": 2895 }, { "epoch": 18.805194805194805, "grad_norm": 1.7066638469696045, "learning_rate": 3.7687260534868e-05, "loss": 0.3427, "step": 2896 }, { "epoch": 18.81168831168831, "grad_norm": 1.665046215057373, "learning_rate": 3.7656814490231865e-05, "loss": 0.3427, "step": 2897 }, { "epoch": 18.818181818181817, "grad_norm": 1.769525170326233, "learning_rate": 3.7626373318489886e-05, "loss": 0.3635, "step": 2898 }, { "epoch": 18.824675324675326, "grad_norm": 1.657949447631836, "learning_rate": 3.759593703165977e-05, "loss": 0.3259, "step": 2899 }, { "epoch": 18.83116883116883, "grad_norm": 1.7202316522598267, "learning_rate": 3.756550564175727e-05, "loss": 0.3456, "step": 2900 }, { "epoch": 18.837662337662337, "grad_norm": 1.6598058938980103, "learning_rate": 3.75350791607962e-05, "loss": 0.3365, "step": 2901 }, { "epoch": 18.844155844155843, "grad_norm": 1.5769715309143066, "learning_rate": 3.7504657600788485e-05, "loss": 0.2986, "step": 2902 }, { "epoch": 18.850649350649352, "grad_norm": 1.7609301805496216, "learning_rate": 3.7474240973744066e-05, "loss": 0.3745, "step": 2903 }, { "epoch": 18.857142857142858, "grad_norm": 1.7873097658157349, "learning_rate": 3.744382929167094e-05, "loss": 0.3685, "step": 2904 }, { "epoch": 18.863636363636363, "grad_norm": 1.8870104551315308, "learning_rate": 3.741342256657515e-05, "loss": 0.3903, "step": 2905 }, { "epoch": 18.87012987012987, "grad_norm": 1.7489452362060547, "learning_rate": 3.73830208104608e-05, "loss": 0.3384, "step": 2906 }, { "epoch": 18.876623376623378, "grad_norm": 1.8287419080734253, "learning_rate": 3.735262403533002e-05, "loss": 0.3783, "step": 2907 }, { "epoch": 18.883116883116884, "grad_norm": 1.9345002174377441, "learning_rate": 3.732223225318298e-05, "loss": 0.3855, "step": 2908 }, { "epoch": 18.88961038961039, "grad_norm": 1.6523834466934204, "learning_rate": 3.7291845476017863e-05, "loss": 0.3345, "step": 2909 }, { "epoch": 18.896103896103895, "grad_norm": 1.8098790645599365, "learning_rate": 3.72614637158309e-05, "loss": 0.3535, "step": 2910 }, { "epoch": 18.9025974025974, "grad_norm": 1.6670156717300415, "learning_rate": 3.723108698461631e-05, "loss": 0.3531, "step": 2911 }, { "epoch": 18.90909090909091, "grad_norm": 1.406030297279358, "learning_rate": 3.720071529436637e-05, "loss": 0.2886, "step": 2912 }, { "epoch": 18.915584415584416, "grad_norm": 1.7998902797698975, "learning_rate": 3.717034865707133e-05, "loss": 0.3804, "step": 2913 }, { "epoch": 18.92207792207792, "grad_norm": 1.549731969833374, "learning_rate": 3.713998708471946e-05, "loss": 0.2818, "step": 2914 }, { "epoch": 18.928571428571427, "grad_norm": 1.6668057441711426, "learning_rate": 3.710963058929701e-05, "loss": 0.3345, "step": 2915 }, { "epoch": 18.935064935064936, "grad_norm": 1.7145839929580688, "learning_rate": 3.7079279182788265e-05, "loss": 0.3366, "step": 2916 }, { "epoch": 18.941558441558442, "grad_norm": 2.05721378326416, "learning_rate": 3.704893287717548e-05, "loss": 0.3865, "step": 2917 }, { "epoch": 18.948051948051948, "grad_norm": 1.7486335039138794, "learning_rate": 3.7018591684438895e-05, "loss": 0.3478, "step": 2918 }, { "epoch": 18.954545454545453, "grad_norm": 1.9624111652374268, "learning_rate": 3.698825561655672e-05, "loss": 0.3507, "step": 2919 }, { "epoch": 18.961038961038962, "grad_norm": 1.841869592666626, "learning_rate": 3.6957924685505167e-05, "loss": 0.3801, "step": 2920 }, { "epoch": 18.967532467532468, "grad_norm": 1.7049486637115479, "learning_rate": 3.6927598903258374e-05, "loss": 0.3277, "step": 2921 }, { "epoch": 18.974025974025974, "grad_norm": 1.6004366874694824, "learning_rate": 3.689727828178854e-05, "loss": 0.3265, "step": 2922 }, { "epoch": 18.98051948051948, "grad_norm": 1.834167242050171, "learning_rate": 3.686696283306572e-05, "loss": 0.3823, "step": 2923 }, { "epoch": 18.98701298701299, "grad_norm": 1.7891381978988647, "learning_rate": 3.683665256905799e-05, "loss": 0.3357, "step": 2924 }, { "epoch": 18.993506493506494, "grad_norm": 1.7981270551681519, "learning_rate": 3.680634750173137e-05, "loss": 0.3763, "step": 2925 }, { "epoch": 19.0, "grad_norm": 2582.5048828125, "learning_rate": 3.677604764304978e-05, "loss": 0.4122, "step": 2926 }, { "epoch": 19.006493506493506, "grad_norm": 1.5977998971939087, "learning_rate": 3.674575300497517e-05, "loss": 0.3098, "step": 2927 }, { "epoch": 19.01298701298701, "grad_norm": 1.6671226024627686, "learning_rate": 3.671546359946737e-05, "loss": 0.3317, "step": 2928 }, { "epoch": 19.01948051948052, "grad_norm": 1.695551872253418, "learning_rate": 3.668517943848416e-05, "loss": 0.3399, "step": 2929 }, { "epoch": 19.025974025974026, "grad_norm": 1.5789490938186646, "learning_rate": 3.665490053398123e-05, "loss": 0.3255, "step": 2930 }, { "epoch": 19.032467532467532, "grad_norm": 1.682943344116211, "learning_rate": 3.662462689791221e-05, "loss": 0.3249, "step": 2931 }, { "epoch": 19.038961038961038, "grad_norm": 1.6995230913162231, "learning_rate": 3.659435854222869e-05, "loss": 0.3225, "step": 2932 }, { "epoch": 19.045454545454547, "grad_norm": 1.6949334144592285, "learning_rate": 3.656409547888011e-05, "loss": 0.341, "step": 2933 }, { "epoch": 19.051948051948052, "grad_norm": 1.3892732858657837, "learning_rate": 3.6533837719813844e-05, "loss": 0.2831, "step": 2934 }, { "epoch": 19.058441558441558, "grad_norm": 1.6560840606689453, "learning_rate": 3.650358527697519e-05, "loss": 0.2893, "step": 2935 }, { "epoch": 19.064935064935064, "grad_norm": 1.8238203525543213, "learning_rate": 3.6473338162307314e-05, "loss": 0.3687, "step": 2936 }, { "epoch": 19.071428571428573, "grad_norm": 1.453324556350708, "learning_rate": 3.644309638775132e-05, "loss": 0.2899, "step": 2937 }, { "epoch": 19.07792207792208, "grad_norm": 1.6432727575302124, "learning_rate": 3.641285996524617e-05, "loss": 0.3141, "step": 2938 }, { "epoch": 19.084415584415584, "grad_norm": 1.5617375373840332, "learning_rate": 3.6382628906728736e-05, "loss": 0.3165, "step": 2939 }, { "epoch": 19.09090909090909, "grad_norm": 1.6537740230560303, "learning_rate": 3.635240322413374e-05, "loss": 0.2997, "step": 2940 }, { "epoch": 19.0974025974026, "grad_norm": 1.7960238456726074, "learning_rate": 3.632218292939383e-05, "loss": 0.3379, "step": 2941 }, { "epoch": 19.103896103896105, "grad_norm": 1.6603929996490479, "learning_rate": 3.629196803443946e-05, "loss": 0.3082, "step": 2942 }, { "epoch": 19.11038961038961, "grad_norm": 1.590101957321167, "learning_rate": 3.626175855119903e-05, "loss": 0.3037, "step": 2943 }, { "epoch": 19.116883116883116, "grad_norm": 1.5472991466522217, "learning_rate": 3.6231554491598754e-05, "loss": 0.28, "step": 2944 }, { "epoch": 19.123376623376622, "grad_norm": 1.5755720138549805, "learning_rate": 3.620135586756273e-05, "loss": 0.3161, "step": 2945 }, { "epoch": 19.12987012987013, "grad_norm": 1.9312375783920288, "learning_rate": 3.617116269101286e-05, "loss": 0.3817, "step": 2946 }, { "epoch": 19.136363636363637, "grad_norm": 1.5177223682403564, "learning_rate": 3.614097497386894e-05, "loss": 0.2559, "step": 2947 }, { "epoch": 19.142857142857142, "grad_norm": 1.3481799364089966, "learning_rate": 3.6110792728048635e-05, "loss": 0.2471, "step": 2948 }, { "epoch": 19.149350649350648, "grad_norm": 1.6340221166610718, "learning_rate": 3.60806159654674e-05, "loss": 0.2777, "step": 2949 }, { "epoch": 19.155844155844157, "grad_norm": 1.69704270362854, "learning_rate": 3.605044469803854e-05, "loss": 0.3263, "step": 2950 }, { "epoch": 19.162337662337663, "grad_norm": 1.886120080947876, "learning_rate": 3.60202789376732e-05, "loss": 0.3291, "step": 2951 }, { "epoch": 19.16883116883117, "grad_norm": 1.6609972715377808, "learning_rate": 3.599011869628033e-05, "loss": 0.3177, "step": 2952 }, { "epoch": 19.175324675324674, "grad_norm": 1.59373939037323, "learning_rate": 3.595996398576672e-05, "loss": 0.2988, "step": 2953 }, { "epoch": 19.181818181818183, "grad_norm": 1.4958374500274658, "learning_rate": 3.592981481803699e-05, "loss": 0.2744, "step": 2954 }, { "epoch": 19.18831168831169, "grad_norm": 1.7184253931045532, "learning_rate": 3.589967120499353e-05, "loss": 0.2971, "step": 2955 }, { "epoch": 19.194805194805195, "grad_norm": 1.5311522483825684, "learning_rate": 3.5869533158536585e-05, "loss": 0.2916, "step": 2956 }, { "epoch": 19.2012987012987, "grad_norm": 1.8133670091629028, "learning_rate": 3.583940069056415e-05, "loss": 0.351, "step": 2957 }, { "epoch": 19.207792207792206, "grad_norm": 1.4929035902023315, "learning_rate": 3.5809273812972074e-05, "loss": 0.2954, "step": 2958 }, { "epoch": 19.214285714285715, "grad_norm": 1.6833645105361938, "learning_rate": 3.577915253765396e-05, "loss": 0.3147, "step": 2959 }, { "epoch": 19.22077922077922, "grad_norm": 1.6576787233352661, "learning_rate": 3.5749036876501194e-05, "loss": 0.3204, "step": 2960 }, { "epoch": 19.227272727272727, "grad_norm": 1.6584258079528809, "learning_rate": 3.5718926841402996e-05, "loss": 0.2874, "step": 2961 }, { "epoch": 19.233766233766232, "grad_norm": 1.7899978160858154, "learning_rate": 3.5688822444246294e-05, "loss": 0.3193, "step": 2962 }, { "epoch": 19.24025974025974, "grad_norm": 1.8224332332611084, "learning_rate": 3.565872369691586e-05, "loss": 0.344, "step": 2963 }, { "epoch": 19.246753246753247, "grad_norm": 1.674277424812317, "learning_rate": 3.5628630611294186e-05, "loss": 0.33, "step": 2964 }, { "epoch": 19.253246753246753, "grad_norm": 1.574242115020752, "learning_rate": 3.559854319926156e-05, "loss": 0.2882, "step": 2965 }, { "epoch": 19.25974025974026, "grad_norm": 1.7872192859649658, "learning_rate": 3.556846147269598e-05, "loss": 0.3288, "step": 2966 }, { "epoch": 19.266233766233768, "grad_norm": 1.3709115982055664, "learning_rate": 3.553838544347326e-05, "loss": 0.2571, "step": 2967 }, { "epoch": 19.272727272727273, "grad_norm": 1.5480459928512573, "learning_rate": 3.550831512346695e-05, "loss": 0.2829, "step": 2968 }, { "epoch": 19.27922077922078, "grad_norm": 1.5414947271347046, "learning_rate": 3.5478250524548326e-05, "loss": 0.2986, "step": 2969 }, { "epoch": 19.285714285714285, "grad_norm": 1.7619088888168335, "learning_rate": 3.544819165858642e-05, "loss": 0.3354, "step": 2970 }, { "epoch": 19.292207792207794, "grad_norm": 1.6412689685821533, "learning_rate": 3.5418138537447985e-05, "loss": 0.307, "step": 2971 }, { "epoch": 19.2987012987013, "grad_norm": 1.6454576253890991, "learning_rate": 3.5388091172997504e-05, "loss": 0.3346, "step": 2972 }, { "epoch": 19.305194805194805, "grad_norm": 1.6400692462921143, "learning_rate": 3.535804957709724e-05, "loss": 0.3235, "step": 2973 }, { "epoch": 19.31168831168831, "grad_norm": 1.4973965883255005, "learning_rate": 3.532801376160713e-05, "loss": 0.2836, "step": 2974 }, { "epoch": 19.318181818181817, "grad_norm": 1.6952433586120605, "learning_rate": 3.529798373838481e-05, "loss": 0.3536, "step": 2975 }, { "epoch": 19.324675324675326, "grad_norm": 1.752414584159851, "learning_rate": 3.526795951928569e-05, "loss": 0.341, "step": 2976 }, { "epoch": 19.33116883116883, "grad_norm": 1.3718217611312866, "learning_rate": 3.523794111616281e-05, "loss": 0.2255, "step": 2977 }, { "epoch": 19.337662337662337, "grad_norm": 2.0626535415649414, "learning_rate": 3.520792854086702e-05, "loss": 0.382, "step": 2978 }, { "epoch": 19.344155844155843, "grad_norm": 1.677330493927002, "learning_rate": 3.517792180524677e-05, "loss": 0.3394, "step": 2979 }, { "epoch": 19.350649350649352, "grad_norm": 1.6705995798110962, "learning_rate": 3.5147920921148267e-05, "loss": 0.3225, "step": 2980 }, { "epoch": 19.357142857142858, "grad_norm": 1.663895845413208, "learning_rate": 3.511792590041537e-05, "loss": 0.3093, "step": 2981 }, { "epoch": 19.363636363636363, "grad_norm": 1.728903889656067, "learning_rate": 3.508793675488961e-05, "loss": 0.3399, "step": 2982 }, { "epoch": 19.37012987012987, "grad_norm": 1.7410802841186523, "learning_rate": 3.505795349641029e-05, "loss": 0.3498, "step": 2983 }, { "epoch": 19.376623376623378, "grad_norm": 1.7982991933822632, "learning_rate": 3.502797613681429e-05, "loss": 0.354, "step": 2984 }, { "epoch": 19.383116883116884, "grad_norm": 1.854328989982605, "learning_rate": 3.4998004687936196e-05, "loss": 0.3506, "step": 2985 }, { "epoch": 19.38961038961039, "grad_norm": 1.7890002727508545, "learning_rate": 3.496803916160827e-05, "loss": 0.353, "step": 2986 }, { "epoch": 19.396103896103895, "grad_norm": 1.9270013570785522, "learning_rate": 3.49380795696604e-05, "loss": 0.3728, "step": 2987 }, { "epoch": 19.4025974025974, "grad_norm": 1.7479575872421265, "learning_rate": 3.49081259239202e-05, "loss": 0.308, "step": 2988 }, { "epoch": 19.40909090909091, "grad_norm": 1.9831420183181763, "learning_rate": 3.487817823621288e-05, "loss": 0.3861, "step": 2989 }, { "epoch": 19.415584415584416, "grad_norm": 1.539864420890808, "learning_rate": 3.484823651836131e-05, "loss": 0.2824, "step": 2990 }, { "epoch": 19.42207792207792, "grad_norm": 1.5040315389633179, "learning_rate": 3.4818300782186e-05, "loss": 0.3004, "step": 2991 }, { "epoch": 19.428571428571427, "grad_norm": 1.6518138647079468, "learning_rate": 3.478837103950509e-05, "loss": 0.3054, "step": 2992 }, { "epoch": 19.435064935064936, "grad_norm": 1.6713945865631104, "learning_rate": 3.4758447302134414e-05, "loss": 0.3151, "step": 2993 }, { "epoch": 19.441558441558442, "grad_norm": 1.9557679891586304, "learning_rate": 3.472852958188736e-05, "loss": 0.3748, "step": 2994 }, { "epoch": 19.448051948051948, "grad_norm": 1.636669397354126, "learning_rate": 3.469861789057497e-05, "loss": 0.3074, "step": 2995 }, { "epoch": 19.454545454545453, "grad_norm": 1.7408828735351562, "learning_rate": 3.466871224000591e-05, "loss": 0.3598, "step": 2996 }, { "epoch": 19.461038961038962, "grad_norm": 1.6687767505645752, "learning_rate": 3.4638812641986454e-05, "loss": 0.3314, "step": 2997 }, { "epoch": 19.467532467532468, "grad_norm": 1.8104026317596436, "learning_rate": 3.460891910832049e-05, "loss": 0.3432, "step": 2998 }, { "epoch": 19.474025974025974, "grad_norm": 1.579221487045288, "learning_rate": 3.457903165080952e-05, "loss": 0.3125, "step": 2999 }, { "epoch": 19.48051948051948, "grad_norm": 1.7088932991027832, "learning_rate": 3.4549150281252636e-05, "loss": 0.3223, "step": 3000 }, { "epoch": 19.48701298701299, "grad_norm": 1.7969592809677124, "learning_rate": 3.451927501144653e-05, "loss": 0.3435, "step": 3001 }, { "epoch": 19.493506493506494, "grad_norm": 1.6600782871246338, "learning_rate": 3.4489405853185465e-05, "loss": 0.3023, "step": 3002 }, { "epoch": 19.5, "grad_norm": 1.775125503540039, "learning_rate": 3.445954281826134e-05, "loss": 0.3673, "step": 3003 }, { "epoch": 19.506493506493506, "grad_norm": 1.6707899570465088, "learning_rate": 3.442968591846359e-05, "loss": 0.333, "step": 3004 }, { "epoch": 19.51298701298701, "grad_norm": 1.7193658351898193, "learning_rate": 3.4399835165579266e-05, "loss": 0.3202, "step": 3005 }, { "epoch": 19.51948051948052, "grad_norm": 1.5874290466308594, "learning_rate": 3.436999057139295e-05, "loss": 0.3037, "step": 3006 }, { "epoch": 19.525974025974026, "grad_norm": 1.5743268728256226, "learning_rate": 3.4340152147686824e-05, "loss": 0.3133, "step": 3007 }, { "epoch": 19.532467532467532, "grad_norm": 2.0078327655792236, "learning_rate": 3.4310319906240626e-05, "loss": 0.3793, "step": 3008 }, { "epoch": 19.538961038961038, "grad_norm": 1.4190967082977295, "learning_rate": 3.428049385883166e-05, "loss": 0.2745, "step": 3009 }, { "epoch": 19.545454545454547, "grad_norm": 1.8206357955932617, "learning_rate": 3.425067401723477e-05, "loss": 0.3723, "step": 3010 }, { "epoch": 19.551948051948052, "grad_norm": 1.7096072435379028, "learning_rate": 3.422086039322235e-05, "loss": 0.345, "step": 3011 }, { "epoch": 19.558441558441558, "grad_norm": 1.5544872283935547, "learning_rate": 3.4191052998564344e-05, "loss": 0.3079, "step": 3012 }, { "epoch": 19.564935064935064, "grad_norm": 1.5369852781295776, "learning_rate": 3.4161251845028265e-05, "loss": 0.2774, "step": 3013 }, { "epoch": 19.571428571428573, "grad_norm": 1.5764816999435425, "learning_rate": 3.413145694437912e-05, "loss": 0.3086, "step": 3014 }, { "epoch": 19.57792207792208, "grad_norm": 1.6694865226745605, "learning_rate": 3.4101668308379466e-05, "loss": 0.3302, "step": 3015 }, { "epoch": 19.584415584415584, "grad_norm": 1.5649733543395996, "learning_rate": 3.407188594878938e-05, "loss": 0.303, "step": 3016 }, { "epoch": 19.59090909090909, "grad_norm": 1.5806865692138672, "learning_rate": 3.4042109877366446e-05, "loss": 0.3317, "step": 3017 }, { "epoch": 19.5974025974026, "grad_norm": 1.6395881175994873, "learning_rate": 3.401234010586583e-05, "loss": 0.3287, "step": 3018 }, { "epoch": 19.603896103896105, "grad_norm": 1.7937806844711304, "learning_rate": 3.398257664604015e-05, "loss": 0.3613, "step": 3019 }, { "epoch": 19.61038961038961, "grad_norm": 1.600665807723999, "learning_rate": 3.3952819509639534e-05, "loss": 0.2979, "step": 3020 }, { "epoch": 19.616883116883116, "grad_norm": 1.754226565361023, "learning_rate": 3.392306870841164e-05, "loss": 0.3036, "step": 3021 }, { "epoch": 19.623376623376622, "grad_norm": 1.7161558866500854, "learning_rate": 3.38933242541016e-05, "loss": 0.3383, "step": 3022 }, { "epoch": 19.62987012987013, "grad_norm": 1.8866463899612427, "learning_rate": 3.386358615845207e-05, "loss": 0.3546, "step": 3023 }, { "epoch": 19.636363636363637, "grad_norm": 1.4768173694610596, "learning_rate": 3.3833854433203185e-05, "loss": 0.2707, "step": 3024 }, { "epoch": 19.642857142857142, "grad_norm": 1.6149816513061523, "learning_rate": 3.380412909009254e-05, "loss": 0.3003, "step": 3025 }, { "epoch": 19.649350649350648, "grad_norm": 1.6946154832839966, "learning_rate": 3.377441014085524e-05, "loss": 0.3295, "step": 3026 }, { "epoch": 19.655844155844157, "grad_norm": 1.621476411819458, "learning_rate": 3.374469759722383e-05, "loss": 0.2983, "step": 3027 }, { "epoch": 19.662337662337663, "grad_norm": 1.5453895330429077, "learning_rate": 3.371499147092839e-05, "loss": 0.2905, "step": 3028 }, { "epoch": 19.66883116883117, "grad_norm": 1.684995174407959, "learning_rate": 3.3685291773696424e-05, "loss": 0.3481, "step": 3029 }, { "epoch": 19.675324675324674, "grad_norm": 1.8596428632736206, "learning_rate": 3.3655598517252885e-05, "loss": 0.3543, "step": 3030 }, { "epoch": 19.681818181818183, "grad_norm": 1.7348289489746094, "learning_rate": 3.36259117133202e-05, "loss": 0.3107, "step": 3031 }, { "epoch": 19.68831168831169, "grad_norm": 1.6357989311218262, "learning_rate": 3.359623137361825e-05, "loss": 0.3186, "step": 3032 }, { "epoch": 19.694805194805195, "grad_norm": 1.6792805194854736, "learning_rate": 3.3566557509864375e-05, "loss": 0.3325, "step": 3033 }, { "epoch": 19.7012987012987, "grad_norm": 1.4639986753463745, "learning_rate": 3.353689013377334e-05, "loss": 0.2751, "step": 3034 }, { "epoch": 19.707792207792206, "grad_norm": 1.742875576019287, "learning_rate": 3.350722925705736e-05, "loss": 0.356, "step": 3035 }, { "epoch": 19.714285714285715, "grad_norm": 1.6753621101379395, "learning_rate": 3.3477574891426074e-05, "loss": 0.2926, "step": 3036 }, { "epoch": 19.72077922077922, "grad_norm": 1.6364171504974365, "learning_rate": 3.344792704858654e-05, "loss": 0.3068, "step": 3037 }, { "epoch": 19.727272727272727, "grad_norm": 1.66275954246521, "learning_rate": 3.3418285740243286e-05, "loss": 0.3142, "step": 3038 }, { "epoch": 19.733766233766232, "grad_norm": 1.732170581817627, "learning_rate": 3.3388650978098215e-05, "loss": 0.365, "step": 3039 }, { "epoch": 19.74025974025974, "grad_norm": 1.7453938722610474, "learning_rate": 3.335902277385067e-05, "loss": 0.3281, "step": 3040 }, { "epoch": 19.746753246753247, "grad_norm": 1.7581194639205933, "learning_rate": 3.332940113919739e-05, "loss": 0.361, "step": 3041 }, { "epoch": 19.753246753246753, "grad_norm": 1.916835904121399, "learning_rate": 3.3299786085832516e-05, "loss": 0.3636, "step": 3042 }, { "epoch": 19.75974025974026, "grad_norm": 1.6435821056365967, "learning_rate": 3.3270177625447626e-05, "loss": 0.2857, "step": 3043 }, { "epoch": 19.766233766233768, "grad_norm": 1.5829612016677856, "learning_rate": 3.324057576973166e-05, "loss": 0.3003, "step": 3044 }, { "epoch": 19.772727272727273, "grad_norm": 1.7046080827713013, "learning_rate": 3.321098053037097e-05, "loss": 0.3536, "step": 3045 }, { "epoch": 19.77922077922078, "grad_norm": 1.7147088050842285, "learning_rate": 3.318139191904928e-05, "loss": 0.3475, "step": 3046 }, { "epoch": 19.785714285714285, "grad_norm": 1.7734124660491943, "learning_rate": 3.315180994744769e-05, "loss": 0.3469, "step": 3047 }, { "epoch": 19.792207792207794, "grad_norm": 1.7753067016601562, "learning_rate": 3.312223462724472e-05, "loss": 0.3305, "step": 3048 }, { "epoch": 19.7987012987013, "grad_norm": 1.699244737625122, "learning_rate": 3.309266597011621e-05, "loss": 0.3498, "step": 3049 }, { "epoch": 19.805194805194805, "grad_norm": 1.6749790906906128, "learning_rate": 3.3063103987735433e-05, "loss": 0.3161, "step": 3050 }, { "epoch": 19.81168831168831, "grad_norm": 1.7554203271865845, "learning_rate": 3.303354869177297e-05, "loss": 0.3541, "step": 3051 }, { "epoch": 19.818181818181817, "grad_norm": 1.6937886476516724, "learning_rate": 3.300400009389678e-05, "loss": 0.3389, "step": 3052 }, { "epoch": 19.824675324675326, "grad_norm": 1.8646681308746338, "learning_rate": 3.297445820577219e-05, "loss": 0.3488, "step": 3053 }, { "epoch": 19.83116883116883, "grad_norm": 1.6930698156356812, "learning_rate": 3.294492303906188e-05, "loss": 0.3195, "step": 3054 }, { "epoch": 19.837662337662337, "grad_norm": 1.7790641784667969, "learning_rate": 3.2915394605425835e-05, "loss": 0.3383, "step": 3055 }, { "epoch": 19.844155844155843, "grad_norm": 1.6845293045043945, "learning_rate": 3.288587291652144e-05, "loss": 0.3374, "step": 3056 }, { "epoch": 19.850649350649352, "grad_norm": 1.7920316457748413, "learning_rate": 3.285635798400338e-05, "loss": 0.3212, "step": 3057 }, { "epoch": 19.857142857142858, "grad_norm": 1.9233556985855103, "learning_rate": 3.282684981952369e-05, "loss": 0.3939, "step": 3058 }, { "epoch": 19.863636363636363, "grad_norm": 1.7246575355529785, "learning_rate": 3.279734843473172e-05, "loss": 0.3217, "step": 3059 }, { "epoch": 19.87012987012987, "grad_norm": 1.705918788909912, "learning_rate": 3.276785384127415e-05, "loss": 0.3291, "step": 3060 }, { "epoch": 19.876623376623378, "grad_norm": 1.650768518447876, "learning_rate": 3.2738366050794985e-05, "loss": 0.3034, "step": 3061 }, { "epoch": 19.883116883116884, "grad_norm": 1.6976193189620972, "learning_rate": 3.2708885074935514e-05, "loss": 0.3167, "step": 3062 }, { "epoch": 19.88961038961039, "grad_norm": 1.6339914798736572, "learning_rate": 3.2679410925334385e-05, "loss": 0.3496, "step": 3063 }, { "epoch": 19.896103896103895, "grad_norm": 1.7407292127609253, "learning_rate": 3.2649943613627535e-05, "loss": 0.3595, "step": 3064 }, { "epoch": 19.9025974025974, "grad_norm": 1.7094537019729614, "learning_rate": 3.262048315144815e-05, "loss": 0.3445, "step": 3065 }, { "epoch": 19.90909090909091, "grad_norm": 1.8952274322509766, "learning_rate": 3.2591029550426796e-05, "loss": 0.3644, "step": 3066 }, { "epoch": 19.915584415584416, "grad_norm": 1.609169602394104, "learning_rate": 3.2561582822191274e-05, "loss": 0.2972, "step": 3067 }, { "epoch": 19.92207792207792, "grad_norm": 1.622796893119812, "learning_rate": 3.2532142978366654e-05, "loss": 0.3157, "step": 3068 }, { "epoch": 19.928571428571427, "grad_norm": 1.6345113515853882, "learning_rate": 3.2502710030575366e-05, "loss": 0.3192, "step": 3069 }, { "epoch": 19.935064935064936, "grad_norm": 1.840793490409851, "learning_rate": 3.247328399043706e-05, "loss": 0.3335, "step": 3070 }, { "epoch": 19.941558441558442, "grad_norm": 1.6772702932357788, "learning_rate": 3.244386486956866e-05, "loss": 0.3288, "step": 3071 }, { "epoch": 19.948051948051948, "grad_norm": 1.497800350189209, "learning_rate": 3.241445267958438e-05, "loss": 0.3061, "step": 3072 }, { "epoch": 19.954545454545453, "grad_norm": 1.5990211963653564, "learning_rate": 3.2385047432095655e-05, "loss": 0.2983, "step": 3073 }, { "epoch": 19.961038961038962, "grad_norm": 1.5010451078414917, "learning_rate": 3.235564913871126e-05, "loss": 0.2942, "step": 3074 }, { "epoch": 19.967532467532468, "grad_norm": 1.848536491394043, "learning_rate": 3.2326257811037155e-05, "loss": 0.3412, "step": 3075 }, { "epoch": 19.974025974025974, "grad_norm": 1.7635799646377563, "learning_rate": 3.229687346067656e-05, "loss": 0.3785, "step": 3076 }, { "epoch": 19.98051948051948, "grad_norm": 1.5399929285049438, "learning_rate": 3.226749609922997e-05, "loss": 0.2821, "step": 3077 }, { "epoch": 19.98701298701299, "grad_norm": 1.920938491821289, "learning_rate": 3.223812573829506e-05, "loss": 0.3749, "step": 3078 }, { "epoch": 19.993506493506494, "grad_norm": 1.4005589485168457, "learning_rate": 3.220876238946684e-05, "loss": 0.281, "step": 3079 }, { "epoch": 20.0, "grad_norm": 139.0920867919922, "learning_rate": 3.217940606433747e-05, "loss": 0.3615, "step": 3080 }, { "epoch": 20.006493506493506, "grad_norm": 1.6003315448760986, "learning_rate": 3.215005677449636e-05, "loss": 0.29, "step": 3081 }, { "epoch": 20.01298701298701, "grad_norm": 1.465588927268982, "learning_rate": 3.2120714531530146e-05, "loss": 0.2716, "step": 3082 }, { "epoch": 20.01948051948052, "grad_norm": 1.5164473056793213, "learning_rate": 3.209137934702267e-05, "loss": 0.2801, "step": 3083 }, { "epoch": 20.025974025974026, "grad_norm": 1.6695610284805298, "learning_rate": 3.206205123255502e-05, "loss": 0.2837, "step": 3084 }, { "epoch": 20.032467532467532, "grad_norm": 1.5957876443862915, "learning_rate": 3.203273019970547e-05, "loss": 0.3119, "step": 3085 }, { "epoch": 20.038961038961038, "grad_norm": 1.6930581331253052, "learning_rate": 3.200341626004949e-05, "loss": 0.3149, "step": 3086 }, { "epoch": 20.045454545454547, "grad_norm": 1.5878403186798096, "learning_rate": 3.197410942515975e-05, "loss": 0.2859, "step": 3087 }, { "epoch": 20.051948051948052, "grad_norm": 1.7272661924362183, "learning_rate": 3.1944809706606124e-05, "loss": 0.3317, "step": 3088 }, { "epoch": 20.058441558441558, "grad_norm": 1.3851985931396484, "learning_rate": 3.19155171159557e-05, "loss": 0.2609, "step": 3089 }, { "epoch": 20.064935064935064, "grad_norm": 1.539115309715271, "learning_rate": 3.188623166477272e-05, "loss": 0.2815, "step": 3090 }, { "epoch": 20.071428571428573, "grad_norm": 1.5955266952514648, "learning_rate": 3.185695336461861e-05, "loss": 0.2748, "step": 3091 }, { "epoch": 20.07792207792208, "grad_norm": 1.750322699546814, "learning_rate": 3.1827682227051983e-05, "loss": 0.3197, "step": 3092 }, { "epoch": 20.084415584415584, "grad_norm": 1.4146174192428589, "learning_rate": 3.1798418263628596e-05, "loss": 0.2621, "step": 3093 }, { "epoch": 20.09090909090909, "grad_norm": 1.5428171157836914, "learning_rate": 3.176916148590145e-05, "loss": 0.2995, "step": 3094 }, { "epoch": 20.0974025974026, "grad_norm": 1.576772928237915, "learning_rate": 3.173991190542061e-05, "loss": 0.2757, "step": 3095 }, { "epoch": 20.103896103896105, "grad_norm": 1.5218124389648438, "learning_rate": 3.171066953373338e-05, "loss": 0.2993, "step": 3096 }, { "epoch": 20.11038961038961, "grad_norm": 1.5845181941986084, "learning_rate": 3.1681434382384165e-05, "loss": 0.3067, "step": 3097 }, { "epoch": 20.116883116883116, "grad_norm": 1.631756067276001, "learning_rate": 3.165220646291454e-05, "loss": 0.2921, "step": 3098 }, { "epoch": 20.123376623376622, "grad_norm": 1.5345515012741089, "learning_rate": 3.1622985786863234e-05, "loss": 0.2806, "step": 3099 }, { "epoch": 20.12987012987013, "grad_norm": 1.4022310972213745, "learning_rate": 3.1593772365766105e-05, "loss": 0.2689, "step": 3100 }, { "epoch": 20.136363636363637, "grad_norm": 1.767087697982788, "learning_rate": 3.156456621115615e-05, "loss": 0.3069, "step": 3101 }, { "epoch": 20.142857142857142, "grad_norm": 1.4767444133758545, "learning_rate": 3.153536733456349e-05, "loss": 0.2585, "step": 3102 }, { "epoch": 20.149350649350648, "grad_norm": 1.7268086671829224, "learning_rate": 3.150617574751538e-05, "loss": 0.3118, "step": 3103 }, { "epoch": 20.155844155844157, "grad_norm": 1.523352861404419, "learning_rate": 3.147699146153621e-05, "loss": 0.3094, "step": 3104 }, { "epoch": 20.162337662337663, "grad_norm": 1.6233497858047485, "learning_rate": 3.144781448814746e-05, "loss": 0.2816, "step": 3105 }, { "epoch": 20.16883116883117, "grad_norm": 1.4761881828308105, "learning_rate": 3.141864483886774e-05, "loss": 0.2778, "step": 3106 }, { "epoch": 20.175324675324674, "grad_norm": 1.7515058517456055, "learning_rate": 3.138948252521275e-05, "loss": 0.2918, "step": 3107 }, { "epoch": 20.181818181818183, "grad_norm": 1.7507883310317993, "learning_rate": 3.1360327558695335e-05, "loss": 0.3364, "step": 3108 }, { "epoch": 20.18831168831169, "grad_norm": 1.8343671560287476, "learning_rate": 3.1331179950825415e-05, "loss": 0.3497, "step": 3109 }, { "epoch": 20.194805194805195, "grad_norm": 1.6417983770370483, "learning_rate": 3.130203971310999e-05, "loss": 0.3231, "step": 3110 }, { "epoch": 20.2012987012987, "grad_norm": 1.6743037700653076, "learning_rate": 3.1272906857053164e-05, "loss": 0.327, "step": 3111 }, { "epoch": 20.207792207792206, "grad_norm": 1.629453420639038, "learning_rate": 3.124378139415614e-05, "loss": 0.2926, "step": 3112 }, { "epoch": 20.214285714285715, "grad_norm": 1.5045936107635498, "learning_rate": 3.121466333591715e-05, "loss": 0.2715, "step": 3113 }, { "epoch": 20.22077922077922, "grad_norm": 1.5699928998947144, "learning_rate": 3.118555269383159e-05, "loss": 0.2686, "step": 3114 }, { "epoch": 20.227272727272727, "grad_norm": 1.6248760223388672, "learning_rate": 3.1156449479391876e-05, "loss": 0.3209, "step": 3115 }, { "epoch": 20.233766233766232, "grad_norm": 1.609988808631897, "learning_rate": 3.112735370408748e-05, "loss": 0.3099, "step": 3116 }, { "epoch": 20.24025974025974, "grad_norm": 1.6785662174224854, "learning_rate": 3.1098265379404954e-05, "loss": 0.3035, "step": 3117 }, { "epoch": 20.246753246753247, "grad_norm": 1.6742063760757446, "learning_rate": 3.1069184516827887e-05, "loss": 0.3051, "step": 3118 }, { "epoch": 20.253246753246753, "grad_norm": 1.6288021802902222, "learning_rate": 3.104011112783699e-05, "loss": 0.2984, "step": 3119 }, { "epoch": 20.25974025974026, "grad_norm": 1.6975327730178833, "learning_rate": 3.101104522390995e-05, "loss": 0.3122, "step": 3120 }, { "epoch": 20.266233766233768, "grad_norm": 1.5018330812454224, "learning_rate": 3.098198681652154e-05, "loss": 0.2615, "step": 3121 }, { "epoch": 20.272727272727273, "grad_norm": 1.5475512742996216, "learning_rate": 3.0952935917143533e-05, "loss": 0.2724, "step": 3122 }, { "epoch": 20.27922077922078, "grad_norm": 1.9470924139022827, "learning_rate": 3.092389253724476e-05, "loss": 0.3353, "step": 3123 }, { "epoch": 20.285714285714285, "grad_norm": 1.625825047492981, "learning_rate": 3.089485668829113e-05, "loss": 0.3002, "step": 3124 }, { "epoch": 20.292207792207794, "grad_norm": 1.6080690622329712, "learning_rate": 3.086582838174551e-05, "loss": 0.3152, "step": 3125 }, { "epoch": 20.2987012987013, "grad_norm": 1.4272031784057617, "learning_rate": 3.0836807629067824e-05, "loss": 0.2546, "step": 3126 }, { "epoch": 20.305194805194805, "grad_norm": 1.5931485891342163, "learning_rate": 3.0807794441715e-05, "loss": 0.3063, "step": 3127 }, { "epoch": 20.31168831168831, "grad_norm": 1.5994946956634521, "learning_rate": 3.077878883114096e-05, "loss": 0.295, "step": 3128 }, { "epoch": 20.318181818181817, "grad_norm": 1.5144575834274292, "learning_rate": 3.074979080879671e-05, "loss": 0.2597, "step": 3129 }, { "epoch": 20.324675324675326, "grad_norm": 1.6098295450210571, "learning_rate": 3.072080038613018e-05, "loss": 0.275, "step": 3130 }, { "epoch": 20.33116883116883, "grad_norm": 1.7494981288909912, "learning_rate": 3.069181757458633e-05, "loss": 0.3243, "step": 3131 }, { "epoch": 20.337662337662337, "grad_norm": 1.5973317623138428, "learning_rate": 3.066284238560712e-05, "loss": 0.3091, "step": 3132 }, { "epoch": 20.344155844155843, "grad_norm": 1.608145833015442, "learning_rate": 3.063387483063148e-05, "loss": 0.2948, "step": 3133 }, { "epoch": 20.350649350649352, "grad_norm": 1.7458863258361816, "learning_rate": 3.0604914921095374e-05, "loss": 0.327, "step": 3134 }, { "epoch": 20.357142857142858, "grad_norm": 1.5687710046768188, "learning_rate": 3.05759626684317e-05, "loss": 0.3005, "step": 3135 }, { "epoch": 20.363636363636363, "grad_norm": 1.7417196035385132, "learning_rate": 3.0547018084070344e-05, "loss": 0.3363, "step": 3136 }, { "epoch": 20.37012987012987, "grad_norm": 1.515019416809082, "learning_rate": 3.051808117943817e-05, "loss": 0.2749, "step": 3137 }, { "epoch": 20.376623376623378, "grad_norm": 1.4067978858947754, "learning_rate": 3.0489151965958994e-05, "loss": 0.2447, "step": 3138 }, { "epoch": 20.383116883116884, "grad_norm": 1.425169587135315, "learning_rate": 3.0460230455053657e-05, "loss": 0.276, "step": 3139 }, { "epoch": 20.38961038961039, "grad_norm": 1.6170049905776978, "learning_rate": 3.043131665813988e-05, "loss": 0.2987, "step": 3140 }, { "epoch": 20.396103896103895, "grad_norm": 1.7482763528823853, "learning_rate": 3.040241058663238e-05, "loss": 0.3425, "step": 3141 }, { "epoch": 20.4025974025974, "grad_norm": 1.4901901483535767, "learning_rate": 3.0373512251942814e-05, "loss": 0.274, "step": 3142 }, { "epoch": 20.40909090909091, "grad_norm": 1.4814355373382568, "learning_rate": 3.0344621665479778e-05, "loss": 0.2792, "step": 3143 }, { "epoch": 20.415584415584416, "grad_norm": 1.7409002780914307, "learning_rate": 3.031573883864882e-05, "loss": 0.3229, "step": 3144 }, { "epoch": 20.42207792207792, "grad_norm": 1.6587722301483154, "learning_rate": 3.028686378285245e-05, "loss": 0.3186, "step": 3145 }, { "epoch": 20.428571428571427, "grad_norm": 1.5549224615097046, "learning_rate": 3.025799650949006e-05, "loss": 0.2744, "step": 3146 }, { "epoch": 20.435064935064936, "grad_norm": 1.7462142705917358, "learning_rate": 3.0229137029957993e-05, "loss": 0.3383, "step": 3147 }, { "epoch": 20.441558441558442, "grad_norm": 1.7569427490234375, "learning_rate": 3.0200285355649506e-05, "loss": 0.3343, "step": 3148 }, { "epoch": 20.448051948051948, "grad_norm": 1.7111477851867676, "learning_rate": 3.0171441497954806e-05, "loss": 0.3011, "step": 3149 }, { "epoch": 20.454545454545453, "grad_norm": 1.5983539819717407, "learning_rate": 3.0142605468260978e-05, "loss": 0.2699, "step": 3150 }, { "epoch": 20.461038961038962, "grad_norm": 1.688127875328064, "learning_rate": 3.0113777277952022e-05, "loss": 0.3419, "step": 3151 }, { "epoch": 20.467532467532468, "grad_norm": 1.788126826286316, "learning_rate": 3.008495693840887e-05, "loss": 0.3254, "step": 3152 }, { "epoch": 20.474025974025974, "grad_norm": 1.4928444623947144, "learning_rate": 3.0056144461009312e-05, "loss": 0.3052, "step": 3153 }, { "epoch": 20.48051948051948, "grad_norm": 1.579251766204834, "learning_rate": 3.002733985712808e-05, "loss": 0.2835, "step": 3154 }, { "epoch": 20.48701298701299, "grad_norm": 1.8297576904296875, "learning_rate": 2.999854313813677e-05, "loss": 0.3444, "step": 3155 }, { "epoch": 20.493506493506494, "grad_norm": 1.8977277278900146, "learning_rate": 2.9969754315403865e-05, "loss": 0.3416, "step": 3156 }, { "epoch": 20.5, "grad_norm": 1.5912731885910034, "learning_rate": 2.9940973400294743e-05, "loss": 0.2834, "step": 3157 }, { "epoch": 20.506493506493506, "grad_norm": 1.7094495296478271, "learning_rate": 2.9912200404171618e-05, "loss": 0.3066, "step": 3158 }, { "epoch": 20.51298701298701, "grad_norm": 1.7550628185272217, "learning_rate": 2.9883435338393672e-05, "loss": 0.3339, "step": 3159 }, { "epoch": 20.51948051948052, "grad_norm": 1.4511152505874634, "learning_rate": 2.9854678214316873e-05, "loss": 0.2585, "step": 3160 }, { "epoch": 20.525974025974026, "grad_norm": 1.640790343284607, "learning_rate": 2.982592904329407e-05, "loss": 0.3099, "step": 3161 }, { "epoch": 20.532467532467532, "grad_norm": 1.762873649597168, "learning_rate": 2.979718783667499e-05, "loss": 0.3277, "step": 3162 }, { "epoch": 20.538961038961038, "grad_norm": 1.570123314857483, "learning_rate": 2.9768454605806172e-05, "loss": 0.2913, "step": 3163 }, { "epoch": 20.545454545454547, "grad_norm": 1.359166145324707, "learning_rate": 2.97397293620311e-05, "loss": 0.2445, "step": 3164 }, { "epoch": 20.551948051948052, "grad_norm": 1.5213282108306885, "learning_rate": 2.9711012116690007e-05, "loss": 0.2985, "step": 3165 }, { "epoch": 20.558441558441558, "grad_norm": 1.73888099193573, "learning_rate": 2.9682302881120017e-05, "loss": 0.3245, "step": 3166 }, { "epoch": 20.564935064935064, "grad_norm": 1.564223289489746, "learning_rate": 2.965360166665508e-05, "loss": 0.2877, "step": 3167 }, { "epoch": 20.571428571428573, "grad_norm": 1.7191345691680908, "learning_rate": 2.9624908484625957e-05, "loss": 0.3433, "step": 3168 }, { "epoch": 20.57792207792208, "grad_norm": 1.7902562618255615, "learning_rate": 2.959622334636031e-05, "loss": 0.3117, "step": 3169 }, { "epoch": 20.584415584415584, "grad_norm": 1.6850911378860474, "learning_rate": 2.9567546263182556e-05, "loss": 0.3223, "step": 3170 }, { "epoch": 20.59090909090909, "grad_norm": 1.702717900276184, "learning_rate": 2.9538877246413943e-05, "loss": 0.3348, "step": 3171 }, { "epoch": 20.5974025974026, "grad_norm": 1.683753252029419, "learning_rate": 2.951021630737255e-05, "loss": 0.3228, "step": 3172 }, { "epoch": 20.603896103896105, "grad_norm": 1.705182671546936, "learning_rate": 2.9481563457373246e-05, "loss": 0.3128, "step": 3173 }, { "epoch": 20.61038961038961, "grad_norm": 1.6732240915298462, "learning_rate": 2.945291870772776e-05, "loss": 0.3209, "step": 3174 }, { "epoch": 20.616883116883116, "grad_norm": 1.573097586631775, "learning_rate": 2.9424282069744564e-05, "loss": 0.3009, "step": 3175 }, { "epoch": 20.623376623376622, "grad_norm": 1.6206854581832886, "learning_rate": 2.9395653554728953e-05, "loss": 0.2864, "step": 3176 }, { "epoch": 20.62987012987013, "grad_norm": 1.5243724584579468, "learning_rate": 2.9367033173983006e-05, "loss": 0.2724, "step": 3177 }, { "epoch": 20.636363636363637, "grad_norm": 1.6121240854263306, "learning_rate": 2.9338420938805577e-05, "loss": 0.3002, "step": 3178 }, { "epoch": 20.642857142857142, "grad_norm": 1.5962162017822266, "learning_rate": 2.9309816860492367e-05, "loss": 0.2902, "step": 3179 }, { "epoch": 20.649350649350648, "grad_norm": 1.4771066904067993, "learning_rate": 2.9281220950335796e-05, "loss": 0.291, "step": 3180 }, { "epoch": 20.655844155844157, "grad_norm": 1.651808500289917, "learning_rate": 2.925263321962507e-05, "loss": 0.2956, "step": 3181 }, { "epoch": 20.662337662337663, "grad_norm": 1.7307038307189941, "learning_rate": 2.9224053679646167e-05, "loss": 0.3388, "step": 3182 }, { "epoch": 20.66883116883117, "grad_norm": 1.4165980815887451, "learning_rate": 2.9195482341681828e-05, "loss": 0.2492, "step": 3183 }, { "epoch": 20.675324675324674, "grad_norm": 1.7736002206802368, "learning_rate": 2.9166919217011602e-05, "loss": 0.3755, "step": 3184 }, { "epoch": 20.681818181818183, "grad_norm": 1.51992928981781, "learning_rate": 2.913836431691175e-05, "loss": 0.2761, "step": 3185 }, { "epoch": 20.68831168831169, "grad_norm": 1.4604041576385498, "learning_rate": 2.910981765265525e-05, "loss": 0.266, "step": 3186 }, { "epoch": 20.694805194805195, "grad_norm": 1.6753500699996948, "learning_rate": 2.908127923551194e-05, "loss": 0.3093, "step": 3187 }, { "epoch": 20.7012987012987, "grad_norm": 1.6381313800811768, "learning_rate": 2.9052749076748264e-05, "loss": 0.3326, "step": 3188 }, { "epoch": 20.707792207792206, "grad_norm": 1.5157326459884644, "learning_rate": 2.9024227187627518e-05, "loss": 0.2762, "step": 3189 }, { "epoch": 20.714285714285715, "grad_norm": 1.7049490213394165, "learning_rate": 2.899571357940969e-05, "loss": 0.3202, "step": 3190 }, { "epoch": 20.72077922077922, "grad_norm": 1.63556706905365, "learning_rate": 2.896720826335151e-05, "loss": 0.3337, "step": 3191 }, { "epoch": 20.727272727272727, "grad_norm": 1.7763365507125854, "learning_rate": 2.8938711250706395e-05, "loss": 0.3233, "step": 3192 }, { "epoch": 20.733766233766232, "grad_norm": 1.7170530557632446, "learning_rate": 2.8910222552724553e-05, "loss": 0.3546, "step": 3193 }, { "epoch": 20.74025974025974, "grad_norm": 1.7245872020721436, "learning_rate": 2.888174218065281e-05, "loss": 0.3246, "step": 3194 }, { "epoch": 20.746753246753247, "grad_norm": 1.7832973003387451, "learning_rate": 2.8853270145734846e-05, "loss": 0.3671, "step": 3195 }, { "epoch": 20.753246753246753, "grad_norm": 1.5165952444076538, "learning_rate": 2.8824806459210908e-05, "loss": 0.2947, "step": 3196 }, { "epoch": 20.75974025974026, "grad_norm": 1.9060173034667969, "learning_rate": 2.8796351132318046e-05, "loss": 0.3473, "step": 3197 }, { "epoch": 20.766233766233768, "grad_norm": 1.6652640104293823, "learning_rate": 2.876790417628994e-05, "loss": 0.3302, "step": 3198 }, { "epoch": 20.772727272727273, "grad_norm": 1.6599342823028564, "learning_rate": 2.8739465602357014e-05, "loss": 0.3178, "step": 3199 }, { "epoch": 20.77922077922078, "grad_norm": 1.587645411491394, "learning_rate": 2.8711035421746364e-05, "loss": 0.3202, "step": 3200 }, { "epoch": 20.785714285714285, "grad_norm": 1.5443060398101807, "learning_rate": 2.86826136456818e-05, "loss": 0.2854, "step": 3201 }, { "epoch": 20.792207792207794, "grad_norm": 1.5955126285552979, "learning_rate": 2.865420028538374e-05, "loss": 0.3277, "step": 3202 }, { "epoch": 20.7987012987013, "grad_norm": 1.3673385381698608, "learning_rate": 2.8625795352069386e-05, "loss": 0.2375, "step": 3203 }, { "epoch": 20.805194805194805, "grad_norm": 1.3633562326431274, "learning_rate": 2.859739885695247e-05, "loss": 0.2328, "step": 3204 }, { "epoch": 20.81168831168831, "grad_norm": 1.5961345434188843, "learning_rate": 2.856901081124359e-05, "loss": 0.2904, "step": 3205 }, { "epoch": 20.818181818181817, "grad_norm": 1.7264599800109863, "learning_rate": 2.8540631226149812e-05, "loss": 0.3447, "step": 3206 }, { "epoch": 20.824675324675326, "grad_norm": 1.6234056949615479, "learning_rate": 2.8512260112874996e-05, "loss": 0.3216, "step": 3207 }, { "epoch": 20.83116883116883, "grad_norm": 1.6192063093185425, "learning_rate": 2.8483897482619565e-05, "loss": 0.2879, "step": 3208 }, { "epoch": 20.837662337662337, "grad_norm": 1.531345248222351, "learning_rate": 2.845554334658066e-05, "loss": 0.2979, "step": 3209 }, { "epoch": 20.844155844155843, "grad_norm": 1.5085830688476562, "learning_rate": 2.8427197715952047e-05, "loss": 0.2813, "step": 3210 }, { "epoch": 20.850649350649352, "grad_norm": 1.807420253753662, "learning_rate": 2.8398860601924144e-05, "loss": 0.333, "step": 3211 }, { "epoch": 20.857142857142858, "grad_norm": 1.6554759740829468, "learning_rate": 2.837053201568396e-05, "loss": 0.3105, "step": 3212 }, { "epoch": 20.863636363636363, "grad_norm": 1.5831176042556763, "learning_rate": 2.834221196841521e-05, "loss": 0.2875, "step": 3213 }, { "epoch": 20.87012987012987, "grad_norm": 1.9093105792999268, "learning_rate": 2.8313900471298148e-05, "loss": 0.37, "step": 3214 }, { "epoch": 20.876623376623378, "grad_norm": 1.370474934577942, "learning_rate": 2.828559753550977e-05, "loss": 0.2374, "step": 3215 }, { "epoch": 20.883116883116884, "grad_norm": 1.8838412761688232, "learning_rate": 2.825730317222358e-05, "loss": 0.3512, "step": 3216 }, { "epoch": 20.88961038961039, "grad_norm": 1.686285376548767, "learning_rate": 2.822901739260978e-05, "loss": 0.3121, "step": 3217 }, { "epoch": 20.896103896103895, "grad_norm": 1.6083804368972778, "learning_rate": 2.8200740207835107e-05, "loss": 0.2754, "step": 3218 }, { "epoch": 20.9025974025974, "grad_norm": 1.7600228786468506, "learning_rate": 2.817247162906297e-05, "loss": 0.3419, "step": 3219 }, { "epoch": 20.90909090909091, "grad_norm": 1.6443798542022705, "learning_rate": 2.8144211667453368e-05, "loss": 0.3317, "step": 3220 }, { "epoch": 20.915584415584416, "grad_norm": 1.6973304748535156, "learning_rate": 2.811596033416285e-05, "loss": 0.3489, "step": 3221 }, { "epoch": 20.92207792207792, "grad_norm": 1.5156219005584717, "learning_rate": 2.8087717640344618e-05, "loss": 0.2857, "step": 3222 }, { "epoch": 20.928571428571427, "grad_norm": 1.4208022356033325, "learning_rate": 2.8059483597148455e-05, "loss": 0.2643, "step": 3223 }, { "epoch": 20.935064935064936, "grad_norm": 1.6075962781906128, "learning_rate": 2.803125821572068e-05, "loss": 0.2902, "step": 3224 }, { "epoch": 20.941558441558442, "grad_norm": 1.8121535778045654, "learning_rate": 2.8003041507204242e-05, "loss": 0.3833, "step": 3225 }, { "epoch": 20.948051948051948, "grad_norm": 1.6950867176055908, "learning_rate": 2.797483348273867e-05, "loss": 0.3301, "step": 3226 }, { "epoch": 20.954545454545453, "grad_norm": 1.6453806161880493, "learning_rate": 2.7946634153460016e-05, "loss": 0.3155, "step": 3227 }, { "epoch": 20.961038961038962, "grad_norm": 1.9567476511001587, "learning_rate": 2.7918443530500937e-05, "loss": 0.3548, "step": 3228 }, { "epoch": 20.967532467532468, "grad_norm": 1.6447367668151855, "learning_rate": 2.7890261624990642e-05, "loss": 0.3055, "step": 3229 }, { "epoch": 20.974025974025974, "grad_norm": 1.6823948621749878, "learning_rate": 2.7862088448054936e-05, "loss": 0.3303, "step": 3230 }, { "epoch": 20.98051948051948, "grad_norm": 1.7678556442260742, "learning_rate": 2.7833924010816086e-05, "loss": 0.3294, "step": 3231 }, { "epoch": 20.98701298701299, "grad_norm": 1.5686346292495728, "learning_rate": 2.7805768324393015e-05, "loss": 0.304, "step": 3232 }, { "epoch": 20.993506493506494, "grad_norm": 1.7144767045974731, "learning_rate": 2.77776213999011e-05, "loss": 0.3429, "step": 3233 }, { "epoch": 21.0, "grad_norm": 2284.729248046875, "learning_rate": 2.7749483248452323e-05, "loss": 0.3019, "step": 3234 }, { "epoch": 21.006493506493506, "grad_norm": 1.4807523488998413, "learning_rate": 2.772135388115519e-05, "loss": 0.251, "step": 3235 }, { "epoch": 21.01298701298701, "grad_norm": 1.4363569021224976, "learning_rate": 2.7693233309114718e-05, "loss": 0.2596, "step": 3236 }, { "epoch": 21.01948051948052, "grad_norm": 1.7477468252182007, "learning_rate": 2.766512154343246e-05, "loss": 0.3099, "step": 3237 }, { "epoch": 21.025974025974026, "grad_norm": 1.631587266921997, "learning_rate": 2.7637018595206514e-05, "loss": 0.2752, "step": 3238 }, { "epoch": 21.032467532467532, "grad_norm": 1.6368337869644165, "learning_rate": 2.7608924475531427e-05, "loss": 0.2924, "step": 3239 }, { "epoch": 21.038961038961038, "grad_norm": 1.6427805423736572, "learning_rate": 2.7580839195498398e-05, "loss": 0.2935, "step": 3240 }, { "epoch": 21.045454545454547, "grad_norm": 1.7105602025985718, "learning_rate": 2.7552762766194973e-05, "loss": 0.3064, "step": 3241 }, { "epoch": 21.051948051948052, "grad_norm": 1.4318819046020508, "learning_rate": 2.7524695198705334e-05, "loss": 0.2759, "step": 3242 }, { "epoch": 21.058441558441558, "grad_norm": 1.5050474405288696, "learning_rate": 2.7496636504110075e-05, "loss": 0.2656, "step": 3243 }, { "epoch": 21.064935064935064, "grad_norm": 1.6251636743545532, "learning_rate": 2.7468586693486336e-05, "loss": 0.2969, "step": 3244 }, { "epoch": 21.071428571428573, "grad_norm": 1.5804094076156616, "learning_rate": 2.7440545777907746e-05, "loss": 0.3075, "step": 3245 }, { "epoch": 21.07792207792208, "grad_norm": 1.6606534719467163, "learning_rate": 2.7412513768444425e-05, "loss": 0.2851, "step": 3246 }, { "epoch": 21.084415584415584, "grad_norm": 1.7039629220962524, "learning_rate": 2.7384490676162933e-05, "loss": 0.3088, "step": 3247 }, { "epoch": 21.09090909090909, "grad_norm": 1.4719603061676025, "learning_rate": 2.735647651212638e-05, "loss": 0.2567, "step": 3248 }, { "epoch": 21.0974025974026, "grad_norm": 1.747230052947998, "learning_rate": 2.732847128739426e-05, "loss": 0.3061, "step": 3249 }, { "epoch": 21.103896103896105, "grad_norm": 1.5914639234542847, "learning_rate": 2.7300475013022663e-05, "loss": 0.3096, "step": 3250 }, { "epoch": 21.11038961038961, "grad_norm": 1.5523849725723267, "learning_rate": 2.7272487700064025e-05, "loss": 0.263, "step": 3251 }, { "epoch": 21.116883116883116, "grad_norm": 1.550976276397705, "learning_rate": 2.7244509359567327e-05, "loss": 0.264, "step": 3252 }, { "epoch": 21.123376623376622, "grad_norm": 1.4793239831924438, "learning_rate": 2.7216540002577933e-05, "loss": 0.2254, "step": 3253 }, { "epoch": 21.12987012987013, "grad_norm": 1.660180687904358, "learning_rate": 2.7188579640137728e-05, "loss": 0.324, "step": 3254 }, { "epoch": 21.136363636363637, "grad_norm": 1.3775200843811035, "learning_rate": 2.7160628283285018e-05, "loss": 0.2247, "step": 3255 }, { "epoch": 21.142857142857142, "grad_norm": 1.5850276947021484, "learning_rate": 2.7132685943054575e-05, "loss": 0.3005, "step": 3256 }, { "epoch": 21.149350649350648, "grad_norm": 1.5831350088119507, "learning_rate": 2.710475263047756e-05, "loss": 0.2672, "step": 3257 }, { "epoch": 21.155844155844157, "grad_norm": 1.6370729207992554, "learning_rate": 2.707682835658163e-05, "loss": 0.3144, "step": 3258 }, { "epoch": 21.162337662337663, "grad_norm": 1.8571823835372925, "learning_rate": 2.7048913132390786e-05, "loss": 0.3408, "step": 3259 }, { "epoch": 21.16883116883117, "grad_norm": 1.6455146074295044, "learning_rate": 2.702100696892561e-05, "loss": 0.2812, "step": 3260 }, { "epoch": 21.175324675324674, "grad_norm": 1.5817310810089111, "learning_rate": 2.6993109877202945e-05, "loss": 0.2857, "step": 3261 }, { "epoch": 21.181818181818183, "grad_norm": 1.621423363685608, "learning_rate": 2.6965221868236155e-05, "loss": 0.2775, "step": 3262 }, { "epoch": 21.18831168831169, "grad_norm": 1.530440330505371, "learning_rate": 2.6937342953034962e-05, "loss": 0.269, "step": 3263 }, { "epoch": 21.194805194805195, "grad_norm": 1.6790771484375, "learning_rate": 2.6909473142605525e-05, "loss": 0.2769, "step": 3264 }, { "epoch": 21.2012987012987, "grad_norm": 1.3510956764221191, "learning_rate": 2.6881612447950423e-05, "loss": 0.2324, "step": 3265 }, { "epoch": 21.207792207792206, "grad_norm": 1.5371694564819336, "learning_rate": 2.6853760880068585e-05, "loss": 0.2629, "step": 3266 }, { "epoch": 21.214285714285715, "grad_norm": 1.5438907146453857, "learning_rate": 2.6825918449955383e-05, "loss": 0.2768, "step": 3267 }, { "epoch": 21.22077922077922, "grad_norm": 1.7779414653778076, "learning_rate": 2.6798085168602595e-05, "loss": 0.3273, "step": 3268 }, { "epoch": 21.227272727272727, "grad_norm": 1.5467731952667236, "learning_rate": 2.6770261046998314e-05, "loss": 0.2588, "step": 3269 }, { "epoch": 21.233766233766232, "grad_norm": 1.530750036239624, "learning_rate": 2.674244609612708e-05, "loss": 0.2771, "step": 3270 }, { "epoch": 21.24025974025974, "grad_norm": 1.5975691080093384, "learning_rate": 2.671464032696982e-05, "loss": 0.2634, "step": 3271 }, { "epoch": 21.246753246753247, "grad_norm": 1.7090222835540771, "learning_rate": 2.6686843750503776e-05, "loss": 0.3025, "step": 3272 }, { "epoch": 21.253246753246753, "grad_norm": 1.501075029373169, "learning_rate": 2.6659056377702606e-05, "loss": 0.2607, "step": 3273 }, { "epoch": 21.25974025974026, "grad_norm": 1.5953552722930908, "learning_rate": 2.6631278219536327e-05, "loss": 0.3023, "step": 3274 }, { "epoch": 21.266233766233768, "grad_norm": 1.5634409189224243, "learning_rate": 2.660350928697134e-05, "loss": 0.2826, "step": 3275 }, { "epoch": 21.272727272727273, "grad_norm": 1.5833948850631714, "learning_rate": 2.6575749590970335e-05, "loss": 0.2764, "step": 3276 }, { "epoch": 21.27922077922078, "grad_norm": 1.39997398853302, "learning_rate": 2.6547999142492447e-05, "loss": 0.2492, "step": 3277 }, { "epoch": 21.285714285714285, "grad_norm": 1.541993498802185, "learning_rate": 2.6520257952493065e-05, "loss": 0.2826, "step": 3278 }, { "epoch": 21.292207792207794, "grad_norm": 1.4523844718933105, "learning_rate": 2.6492526031924003e-05, "loss": 0.2347, "step": 3279 }, { "epoch": 21.2987012987013, "grad_norm": 1.721123456954956, "learning_rate": 2.6464803391733374e-05, "loss": 0.2877, "step": 3280 }, { "epoch": 21.305194805194805, "grad_norm": 1.557781457901001, "learning_rate": 2.6437090042865655e-05, "loss": 0.2754, "step": 3281 }, { "epoch": 21.31168831168831, "grad_norm": 1.5026285648345947, "learning_rate": 2.6409385996261603e-05, "loss": 0.2754, "step": 3282 }, { "epoch": 21.318181818181817, "grad_norm": 1.5858430862426758, "learning_rate": 2.6381691262858383e-05, "loss": 0.2649, "step": 3283 }, { "epoch": 21.324675324675326, "grad_norm": 1.6628375053405762, "learning_rate": 2.635400585358937e-05, "loss": 0.2965, "step": 3284 }, { "epoch": 21.33116883116883, "grad_norm": 1.3391214609146118, "learning_rate": 2.6326329779384395e-05, "loss": 0.2213, "step": 3285 }, { "epoch": 21.337662337662337, "grad_norm": 1.727211594581604, "learning_rate": 2.62986630511695e-05, "loss": 0.3203, "step": 3286 }, { "epoch": 21.344155844155843, "grad_norm": 1.3903836011886597, "learning_rate": 2.6271005679867082e-05, "loss": 0.251, "step": 3287 }, { "epoch": 21.350649350649352, "grad_norm": 1.7230497598648071, "learning_rate": 2.6243357676395818e-05, "loss": 0.3016, "step": 3288 }, { "epoch": 21.357142857142858, "grad_norm": 1.4616880416870117, "learning_rate": 2.6215719051670706e-05, "loss": 0.2713, "step": 3289 }, { "epoch": 21.363636363636363, "grad_norm": 1.7390302419662476, "learning_rate": 2.618808981660304e-05, "loss": 0.3169, "step": 3290 }, { "epoch": 21.37012987012987, "grad_norm": 1.577402949333191, "learning_rate": 2.6160469982100428e-05, "loss": 0.3023, "step": 3291 }, { "epoch": 21.376623376623378, "grad_norm": 1.6617189645767212, "learning_rate": 2.6132859559066702e-05, "loss": 0.322, "step": 3292 }, { "epoch": 21.383116883116884, "grad_norm": 1.4321908950805664, "learning_rate": 2.6105258558402056e-05, "loss": 0.2609, "step": 3293 }, { "epoch": 21.38961038961039, "grad_norm": 1.5041757822036743, "learning_rate": 2.607766699100288e-05, "loss": 0.2584, "step": 3294 }, { "epoch": 21.396103896103895, "grad_norm": 1.4955791234970093, "learning_rate": 2.6050084867761954e-05, "loss": 0.2714, "step": 3295 }, { "epoch": 21.4025974025974, "grad_norm": 1.5688844919204712, "learning_rate": 2.602251219956821e-05, "loss": 0.2903, "step": 3296 }, { "epoch": 21.40909090909091, "grad_norm": 1.526567816734314, "learning_rate": 2.5994948997306938e-05, "loss": 0.2691, "step": 3297 }, { "epoch": 21.415584415584416, "grad_norm": 1.5073966979980469, "learning_rate": 2.5967395271859617e-05, "loss": 0.2771, "step": 3298 }, { "epoch": 21.42207792207792, "grad_norm": 1.6727265119552612, "learning_rate": 2.5939851034104035e-05, "loss": 0.3314, "step": 3299 }, { "epoch": 21.428571428571427, "grad_norm": 1.8070220947265625, "learning_rate": 2.591231629491423e-05, "loss": 0.3506, "step": 3300 }, { "epoch": 21.435064935064936, "grad_norm": 1.2890980243682861, "learning_rate": 2.5884791065160495e-05, "loss": 0.2001, "step": 3301 }, { "epoch": 21.441558441558442, "grad_norm": 1.6223267316818237, "learning_rate": 2.585727535570932e-05, "loss": 0.3031, "step": 3302 }, { "epoch": 21.448051948051948, "grad_norm": 1.6252323389053345, "learning_rate": 2.5829769177423502e-05, "loss": 0.2811, "step": 3303 }, { "epoch": 21.454545454545453, "grad_norm": 1.560977578163147, "learning_rate": 2.5802272541161993e-05, "loss": 0.2859, "step": 3304 }, { "epoch": 21.461038961038962, "grad_norm": 1.638126015663147, "learning_rate": 2.5774785457780103e-05, "loss": 0.3133, "step": 3305 }, { "epoch": 21.467532467532468, "grad_norm": 1.6311955451965332, "learning_rate": 2.574730793812925e-05, "loss": 0.2911, "step": 3306 }, { "epoch": 21.474025974025974, "grad_norm": 1.4712984561920166, "learning_rate": 2.5719839993057142e-05, "loss": 0.2918, "step": 3307 }, { "epoch": 21.48051948051948, "grad_norm": 1.688271403312683, "learning_rate": 2.569238163340767e-05, "loss": 0.2899, "step": 3308 }, { "epoch": 21.48701298701299, "grad_norm": 1.462765097618103, "learning_rate": 2.566493287002097e-05, "loss": 0.2799, "step": 3309 }, { "epoch": 21.493506493506494, "grad_norm": 1.738244891166687, "learning_rate": 2.5637493713733374e-05, "loss": 0.3307, "step": 3310 }, { "epoch": 21.5, "grad_norm": 1.8312119245529175, "learning_rate": 2.5610064175377456e-05, "loss": 0.3246, "step": 3311 }, { "epoch": 21.506493506493506, "grad_norm": 1.5559279918670654, "learning_rate": 2.5582644265781925e-05, "loss": 0.2822, "step": 3312 }, { "epoch": 21.51298701298701, "grad_norm": 1.7086284160614014, "learning_rate": 2.5555233995771756e-05, "loss": 0.3218, "step": 3313 }, { "epoch": 21.51948051948052, "grad_norm": 1.3549182415008545, "learning_rate": 2.5527833376168054e-05, "loss": 0.22, "step": 3314 }, { "epoch": 21.525974025974026, "grad_norm": 1.960464596748352, "learning_rate": 2.550044241778817e-05, "loss": 0.3075, "step": 3315 }, { "epoch": 21.532467532467532, "grad_norm": 1.6499247550964355, "learning_rate": 2.547306113144564e-05, "loss": 0.2801, "step": 3316 }, { "epoch": 21.538961038961038, "grad_norm": 1.6525225639343262, "learning_rate": 2.5445689527950133e-05, "loss": 0.3252, "step": 3317 }, { "epoch": 21.545454545454547, "grad_norm": 1.6803456544876099, "learning_rate": 2.541832761810753e-05, "loss": 0.3141, "step": 3318 }, { "epoch": 21.551948051948052, "grad_norm": 1.6995880603790283, "learning_rate": 2.5390975412719903e-05, "loss": 0.3095, "step": 3319 }, { "epoch": 21.558441558441558, "grad_norm": 1.6328798532485962, "learning_rate": 2.5363632922585433e-05, "loss": 0.3153, "step": 3320 }, { "epoch": 21.564935064935064, "grad_norm": 1.492034912109375, "learning_rate": 2.5336300158498516e-05, "loss": 0.2514, "step": 3321 }, { "epoch": 21.571428571428573, "grad_norm": 1.5037617683410645, "learning_rate": 2.5308977131249722e-05, "loss": 0.2665, "step": 3322 }, { "epoch": 21.57792207792208, "grad_norm": 1.6498708724975586, "learning_rate": 2.528166385162571e-05, "loss": 0.3037, "step": 3323 }, { "epoch": 21.584415584415584, "grad_norm": 1.6667795181274414, "learning_rate": 2.5254360330409342e-05, "loss": 0.3015, "step": 3324 }, { "epoch": 21.59090909090909, "grad_norm": 1.7515053749084473, "learning_rate": 2.522706657837962e-05, "loss": 0.3321, "step": 3325 }, { "epoch": 21.5974025974026, "grad_norm": 1.7231420278549194, "learning_rate": 2.5199782606311706e-05, "loss": 0.3399, "step": 3326 }, { "epoch": 21.603896103896105, "grad_norm": 1.5029762983322144, "learning_rate": 2.517250842497684e-05, "loss": 0.2714, "step": 3327 }, { "epoch": 21.61038961038961, "grad_norm": 1.5153870582580566, "learning_rate": 2.514524404514248e-05, "loss": 0.2636, "step": 3328 }, { "epoch": 21.616883116883116, "grad_norm": 1.523094654083252, "learning_rate": 2.5117989477572123e-05, "loss": 0.2917, "step": 3329 }, { "epoch": 21.623376623376622, "grad_norm": 1.616584300994873, "learning_rate": 2.509074473302546e-05, "loss": 0.2649, "step": 3330 }, { "epoch": 21.62987012987013, "grad_norm": 1.612755298614502, "learning_rate": 2.5063509822258304e-05, "loss": 0.2962, "step": 3331 }, { "epoch": 21.636363636363637, "grad_norm": 1.5755430459976196, "learning_rate": 2.5036284756022566e-05, "loss": 0.2661, "step": 3332 }, { "epoch": 21.642857142857142, "grad_norm": 1.6561660766601562, "learning_rate": 2.5009069545066233e-05, "loss": 0.3092, "step": 3333 }, { "epoch": 21.649350649350648, "grad_norm": 1.4235655069351196, "learning_rate": 2.4981864200133486e-05, "loss": 0.2516, "step": 3334 }, { "epoch": 21.655844155844157, "grad_norm": 1.748930811882019, "learning_rate": 2.4954668731964503e-05, "loss": 0.2965, "step": 3335 }, { "epoch": 21.662337662337663, "grad_norm": 1.9241677522659302, "learning_rate": 2.49274831512957e-05, "loss": 0.2824, "step": 3336 }, { "epoch": 21.66883116883117, "grad_norm": 1.5924633741378784, "learning_rate": 2.4900307468859464e-05, "loss": 0.2871, "step": 3337 }, { "epoch": 21.675324675324674, "grad_norm": 1.7249375581741333, "learning_rate": 2.487314169538435e-05, "loss": 0.3015, "step": 3338 }, { "epoch": 21.681818181818183, "grad_norm": 1.593870759010315, "learning_rate": 2.4845985841594943e-05, "loss": 0.2784, "step": 3339 }, { "epoch": 21.68831168831169, "grad_norm": 1.5246846675872803, "learning_rate": 2.4818839918211962e-05, "loss": 0.2634, "step": 3340 }, { "epoch": 21.694805194805195, "grad_norm": 1.7379146814346313, "learning_rate": 2.4791703935952193e-05, "loss": 0.3163, "step": 3341 }, { "epoch": 21.7012987012987, "grad_norm": 1.4411296844482422, "learning_rate": 2.47645779055285e-05, "loss": 0.2793, "step": 3342 }, { "epoch": 21.707792207792206, "grad_norm": 1.5561825037002563, "learning_rate": 2.473746183764979e-05, "loss": 0.2713, "step": 3343 }, { "epoch": 21.714285714285715, "grad_norm": 1.61198091506958, "learning_rate": 2.4710355743021076e-05, "loss": 0.2874, "step": 3344 }, { "epoch": 21.72077922077922, "grad_norm": 1.4368712902069092, "learning_rate": 2.468325963234337e-05, "loss": 0.2586, "step": 3345 }, { "epoch": 21.727272727272727, "grad_norm": 1.3584120273590088, "learning_rate": 2.465617351631385e-05, "loss": 0.2325, "step": 3346 }, { "epoch": 21.733766233766232, "grad_norm": 1.5912816524505615, "learning_rate": 2.462909740562565e-05, "loss": 0.2724, "step": 3347 }, { "epoch": 21.74025974025974, "grad_norm": 1.6549819707870483, "learning_rate": 2.460203131096801e-05, "loss": 0.3279, "step": 3348 }, { "epoch": 21.746753246753247, "grad_norm": 1.8146955966949463, "learning_rate": 2.4574975243026166e-05, "loss": 0.3285, "step": 3349 }, { "epoch": 21.753246753246753, "grad_norm": 1.7132552862167358, "learning_rate": 2.4547929212481435e-05, "loss": 0.2751, "step": 3350 }, { "epoch": 21.75974025974026, "grad_norm": 1.4731333255767822, "learning_rate": 2.452089323001118e-05, "loss": 0.2857, "step": 3351 }, { "epoch": 21.766233766233768, "grad_norm": 1.5654605627059937, "learning_rate": 2.4493867306288776e-05, "loss": 0.2859, "step": 3352 }, { "epoch": 21.772727272727273, "grad_norm": 1.387986421585083, "learning_rate": 2.44668514519836e-05, "loss": 0.2419, "step": 3353 }, { "epoch": 21.77922077922078, "grad_norm": 1.5316569805145264, "learning_rate": 2.4439845677761127e-05, "loss": 0.273, "step": 3354 }, { "epoch": 21.785714285714285, "grad_norm": 1.6250125169754028, "learning_rate": 2.4412849994282742e-05, "loss": 0.2948, "step": 3355 }, { "epoch": 21.792207792207794, "grad_norm": 1.7658638954162598, "learning_rate": 2.4385864412206007e-05, "loss": 0.3022, "step": 3356 }, { "epoch": 21.7987012987013, "grad_norm": 1.6159045696258545, "learning_rate": 2.435888894218432e-05, "loss": 0.2883, "step": 3357 }, { "epoch": 21.805194805194805, "grad_norm": 1.7427952289581299, "learning_rate": 2.433192359486723e-05, "loss": 0.3354, "step": 3358 }, { "epoch": 21.81168831168831, "grad_norm": 1.5414979457855225, "learning_rate": 2.4304968380900177e-05, "loss": 0.2686, "step": 3359 }, { "epoch": 21.818181818181817, "grad_norm": 1.5334300994873047, "learning_rate": 2.427802331092468e-05, "loss": 0.2778, "step": 3360 }, { "epoch": 21.824675324675326, "grad_norm": 1.5901919603347778, "learning_rate": 2.425108839557822e-05, "loss": 0.2846, "step": 3361 }, { "epoch": 21.83116883116883, "grad_norm": 1.5082696676254272, "learning_rate": 2.4224163645494298e-05, "loss": 0.286, "step": 3362 }, { "epoch": 21.837662337662337, "grad_norm": 1.4053430557250977, "learning_rate": 2.4197249071302336e-05, "loss": 0.2703, "step": 3363 }, { "epoch": 21.844155844155843, "grad_norm": 1.6315356492996216, "learning_rate": 2.4170344683627822e-05, "loss": 0.275, "step": 3364 }, { "epoch": 21.850649350649352, "grad_norm": 1.4324266910552979, "learning_rate": 2.414345049309215e-05, "loss": 0.2507, "step": 3365 }, { "epoch": 21.857142857142858, "grad_norm": 1.6574413776397705, "learning_rate": 2.411656651031273e-05, "loss": 0.2825, "step": 3366 }, { "epoch": 21.863636363636363, "grad_norm": 1.7428301572799683, "learning_rate": 2.408969274590296e-05, "loss": 0.2966, "step": 3367 }, { "epoch": 21.87012987012987, "grad_norm": 1.808197021484375, "learning_rate": 2.406282921047213e-05, "loss": 0.3692, "step": 3368 }, { "epoch": 21.876623376623378, "grad_norm": 1.710782766342163, "learning_rate": 2.403597591462557e-05, "loss": 0.3074, "step": 3369 }, { "epoch": 21.883116883116884, "grad_norm": 1.6153148412704468, "learning_rate": 2.4009132868964522e-05, "loss": 0.3067, "step": 3370 }, { "epoch": 21.88961038961039, "grad_norm": 1.7324892282485962, "learning_rate": 2.3982300084086224e-05, "loss": 0.3105, "step": 3371 }, { "epoch": 21.896103896103895, "grad_norm": 1.9062179327011108, "learning_rate": 2.395547757058379e-05, "loss": 0.3646, "step": 3372 }, { "epoch": 21.9025974025974, "grad_norm": 1.7402024269104004, "learning_rate": 2.3928665339046368e-05, "loss": 0.3237, "step": 3373 }, { "epoch": 21.90909090909091, "grad_norm": 1.552284836769104, "learning_rate": 2.390186340005896e-05, "loss": 0.2969, "step": 3374 }, { "epoch": 21.915584415584416, "grad_norm": 1.9513046741485596, "learning_rate": 2.3875071764202563e-05, "loss": 0.3489, "step": 3375 }, { "epoch": 21.92207792207792, "grad_norm": 1.7798665761947632, "learning_rate": 2.38482904420541e-05, "loss": 0.3018, "step": 3376 }, { "epoch": 21.928571428571427, "grad_norm": 1.5989744663238525, "learning_rate": 2.382151944418642e-05, "loss": 0.277, "step": 3377 }, { "epoch": 21.935064935064936, "grad_norm": 1.789979100227356, "learning_rate": 2.3794758781168265e-05, "loss": 0.3357, "step": 3378 }, { "epoch": 21.941558441558442, "grad_norm": 1.5926214456558228, "learning_rate": 2.3768008463564346e-05, "loss": 0.3063, "step": 3379 }, { "epoch": 21.948051948051948, "grad_norm": 1.3948332071304321, "learning_rate": 2.374126850193521e-05, "loss": 0.2458, "step": 3380 }, { "epoch": 21.954545454545453, "grad_norm": 1.5772472620010376, "learning_rate": 2.3714538906837453e-05, "loss": 0.2718, "step": 3381 }, { "epoch": 21.961038961038962, "grad_norm": 1.5779716968536377, "learning_rate": 2.3687819688823438e-05, "loss": 0.2727, "step": 3382 }, { "epoch": 21.967532467532468, "grad_norm": 1.613577127456665, "learning_rate": 2.3661110858441522e-05, "loss": 0.2946, "step": 3383 }, { "epoch": 21.974025974025974, "grad_norm": 1.5281500816345215, "learning_rate": 2.363441242623589e-05, "loss": 0.2866, "step": 3384 }, { "epoch": 21.98051948051948, "grad_norm": 1.5561461448669434, "learning_rate": 2.3607724402746684e-05, "loss": 0.3017, "step": 3385 }, { "epoch": 21.98701298701299, "grad_norm": 1.7536768913269043, "learning_rate": 2.358104679850991e-05, "loss": 0.3051, "step": 3386 }, { "epoch": 21.993506493506494, "grad_norm": 1.5164214372634888, "learning_rate": 2.355437962405749e-05, "loss": 0.2846, "step": 3387 }, { "epoch": 22.0, "grad_norm": 37.286903381347656, "learning_rate": 2.3527722889917147e-05, "loss": 0.3725, "step": 3388 }, { "epoch": 22.006493506493506, "grad_norm": 1.5180093050003052, "learning_rate": 2.3501076606612595e-05, "loss": 0.2771, "step": 3389 }, { "epoch": 22.01298701298701, "grad_norm": 1.5944215059280396, "learning_rate": 2.3474440784663292e-05, "loss": 0.2744, "step": 3390 }, { "epoch": 22.01948051948052, "grad_norm": 1.7063524723052979, "learning_rate": 2.3447815434584725e-05, "loss": 0.2761, "step": 3391 }, { "epoch": 22.025974025974026, "grad_norm": 1.5939115285873413, "learning_rate": 2.3421200566888095e-05, "loss": 0.2648, "step": 3392 }, { "epoch": 22.032467532467532, "grad_norm": 1.6371650695800781, "learning_rate": 2.339459619208058e-05, "loss": 0.2789, "step": 3393 }, { "epoch": 22.038961038961038, "grad_norm": 1.465437650680542, "learning_rate": 2.3368002320665118e-05, "loss": 0.2595, "step": 3394 }, { "epoch": 22.045454545454547, "grad_norm": 1.403302788734436, "learning_rate": 2.334141896314057e-05, "loss": 0.24, "step": 3395 }, { "epoch": 22.051948051948052, "grad_norm": 1.5353994369506836, "learning_rate": 2.331484613000163e-05, "loss": 0.2364, "step": 3396 }, { "epoch": 22.058441558441558, "grad_norm": 1.5051480531692505, "learning_rate": 2.3288283831738838e-05, "loss": 0.2739, "step": 3397 }, { "epoch": 22.064935064935064, "grad_norm": 1.6413733959197998, "learning_rate": 2.326173207883854e-05, "loss": 0.2808, "step": 3398 }, { "epoch": 22.071428571428573, "grad_norm": 1.6410763263702393, "learning_rate": 2.3235190881782986e-05, "loss": 0.2926, "step": 3399 }, { "epoch": 22.07792207792208, "grad_norm": 1.5759512186050415, "learning_rate": 2.320866025105016e-05, "loss": 0.266, "step": 3400 }, { "epoch": 22.084415584415584, "grad_norm": 1.4181909561157227, "learning_rate": 2.3182140197114012e-05, "loss": 0.2634, "step": 3401 }, { "epoch": 22.09090909090909, "grad_norm": 1.507177472114563, "learning_rate": 2.3155630730444183e-05, "loss": 0.2419, "step": 3402 }, { "epoch": 22.0974025974026, "grad_norm": 1.5201656818389893, "learning_rate": 2.3129131861506226e-05, "loss": 0.2495, "step": 3403 }, { "epoch": 22.103896103896105, "grad_norm": 1.568708062171936, "learning_rate": 2.310264360076144e-05, "loss": 0.2544, "step": 3404 }, { "epoch": 22.11038961038961, "grad_norm": 1.7201495170593262, "learning_rate": 2.307616595866699e-05, "loss": 0.2886, "step": 3405 }, { "epoch": 22.116883116883116, "grad_norm": 1.4597588777542114, "learning_rate": 2.304969894567583e-05, "loss": 0.267, "step": 3406 }, { "epoch": 22.123376623376622, "grad_norm": 1.64579439163208, "learning_rate": 2.302324257223673e-05, "loss": 0.2861, "step": 3407 }, { "epoch": 22.12987012987013, "grad_norm": 1.3634487390518188, "learning_rate": 2.2996796848794216e-05, "loss": 0.2262, "step": 3408 }, { "epoch": 22.136363636363637, "grad_norm": 1.5885374546051025, "learning_rate": 2.2970361785788673e-05, "loss": 0.2733, "step": 3409 }, { "epoch": 22.142857142857142, "grad_norm": 1.201250672340393, "learning_rate": 2.2943937393656218e-05, "loss": 0.1921, "step": 3410 }, { "epoch": 22.149350649350648, "grad_norm": 1.379902720451355, "learning_rate": 2.291752368282879e-05, "loss": 0.2472, "step": 3411 }, { "epoch": 22.155844155844157, "grad_norm": 1.5337809324264526, "learning_rate": 2.2891120663734113e-05, "loss": 0.2689, "step": 3412 }, { "epoch": 22.162337662337663, "grad_norm": 1.4945390224456787, "learning_rate": 2.286472834679569e-05, "loss": 0.2661, "step": 3413 }, { "epoch": 22.16883116883117, "grad_norm": 1.5157954692840576, "learning_rate": 2.2838346742432757e-05, "loss": 0.2715, "step": 3414 }, { "epoch": 22.175324675324674, "grad_norm": 1.671298861503601, "learning_rate": 2.2811975861060375e-05, "loss": 0.3162, "step": 3415 }, { "epoch": 22.181818181818183, "grad_norm": 1.7096617221832275, "learning_rate": 2.2785615713089366e-05, "loss": 0.2949, "step": 3416 }, { "epoch": 22.18831168831169, "grad_norm": 1.1825339794158936, "learning_rate": 2.275926630892626e-05, "loss": 0.1835, "step": 3417 }, { "epoch": 22.194805194805195, "grad_norm": 1.5265440940856934, "learning_rate": 2.2732927658973423e-05, "loss": 0.2539, "step": 3418 }, { "epoch": 22.2012987012987, "grad_norm": 1.3746451139450073, "learning_rate": 2.2706599773628907e-05, "loss": 0.2314, "step": 3419 }, { "epoch": 22.207792207792206, "grad_norm": 1.4681334495544434, "learning_rate": 2.2680282663286552e-05, "loss": 0.2553, "step": 3420 }, { "epoch": 22.214285714285715, "grad_norm": 1.3677679300308228, "learning_rate": 2.265397633833594e-05, "loss": 0.2442, "step": 3421 }, { "epoch": 22.22077922077922, "grad_norm": 1.55158269405365, "learning_rate": 2.262768080916241e-05, "loss": 0.2809, "step": 3422 }, { "epoch": 22.227272727272727, "grad_norm": 1.4442514181137085, "learning_rate": 2.2601396086146987e-05, "loss": 0.2648, "step": 3423 }, { "epoch": 22.233766233766232, "grad_norm": 1.1679338216781616, "learning_rate": 2.2575122179666497e-05, "loss": 0.1996, "step": 3424 }, { "epoch": 22.24025974025974, "grad_norm": 1.578581690788269, "learning_rate": 2.254885910009341e-05, "loss": 0.2806, "step": 3425 }, { "epoch": 22.246753246753247, "grad_norm": 1.3819916248321533, "learning_rate": 2.2522606857796042e-05, "loss": 0.2217, "step": 3426 }, { "epoch": 22.253246753246753, "grad_norm": 1.4690115451812744, "learning_rate": 2.2496365463138308e-05, "loss": 0.2809, "step": 3427 }, { "epoch": 22.25974025974026, "grad_norm": 1.7027679681777954, "learning_rate": 2.2470134926479935e-05, "loss": 0.3275, "step": 3428 }, { "epoch": 22.266233766233768, "grad_norm": 1.4241315126419067, "learning_rate": 2.244391525817629e-05, "loss": 0.2482, "step": 3429 }, { "epoch": 22.272727272727273, "grad_norm": 1.3987478017807007, "learning_rate": 2.241770646857849e-05, "loss": 0.2492, "step": 3430 }, { "epoch": 22.27922077922078, "grad_norm": 1.738652229309082, "learning_rate": 2.2391508568033365e-05, "loss": 0.3063, "step": 3431 }, { "epoch": 22.285714285714285, "grad_norm": 1.339930534362793, "learning_rate": 2.2365321566883436e-05, "loss": 0.2264, "step": 3432 }, { "epoch": 22.292207792207794, "grad_norm": 1.5308494567871094, "learning_rate": 2.2339145475466882e-05, "loss": 0.2602, "step": 3433 }, { "epoch": 22.2987012987013, "grad_norm": 1.5097752809524536, "learning_rate": 2.231298030411766e-05, "loss": 0.2528, "step": 3434 }, { "epoch": 22.305194805194805, "grad_norm": 1.6993303298950195, "learning_rate": 2.228682606316529e-05, "loss": 0.3062, "step": 3435 }, { "epoch": 22.31168831168831, "grad_norm": 1.5050712823867798, "learning_rate": 2.2260682762935138e-05, "loss": 0.273, "step": 3436 }, { "epoch": 22.318181818181817, "grad_norm": 1.5746042728424072, "learning_rate": 2.2234550413748106e-05, "loss": 0.2779, "step": 3437 }, { "epoch": 22.324675324675326, "grad_norm": 1.3380743265151978, "learning_rate": 2.2208429025920867e-05, "loss": 0.2386, "step": 3438 }, { "epoch": 22.33116883116883, "grad_norm": 1.6429160833358765, "learning_rate": 2.2182318609765702e-05, "loss": 0.2801, "step": 3439 }, { "epoch": 22.337662337662337, "grad_norm": 1.5324541330337524, "learning_rate": 2.215621917559062e-05, "loss": 0.2732, "step": 3440 }, { "epoch": 22.344155844155843, "grad_norm": 1.5058189630508423, "learning_rate": 2.2130130733699206e-05, "loss": 0.2499, "step": 3441 }, { "epoch": 22.350649350649352, "grad_norm": 1.668092131614685, "learning_rate": 2.2104053294390846e-05, "loss": 0.2976, "step": 3442 }, { "epoch": 22.357142857142858, "grad_norm": 1.6283371448516846, "learning_rate": 2.2077986867960444e-05, "loss": 0.2726, "step": 3443 }, { "epoch": 22.363636363636363, "grad_norm": 1.6393423080444336, "learning_rate": 2.205193146469864e-05, "loss": 0.2883, "step": 3444 }, { "epoch": 22.37012987012987, "grad_norm": 2.137350559234619, "learning_rate": 2.202588709489166e-05, "loss": 0.2726, "step": 3445 }, { "epoch": 22.376623376623378, "grad_norm": 1.5293010473251343, "learning_rate": 2.1999853768821432e-05, "loss": 0.2817, "step": 3446 }, { "epoch": 22.383116883116884, "grad_norm": 1.6385552883148193, "learning_rate": 2.1973831496765506e-05, "loss": 0.2752, "step": 3447 }, { "epoch": 22.38961038961039, "grad_norm": 1.554538607597351, "learning_rate": 2.194782028899707e-05, "loss": 0.2678, "step": 3448 }, { "epoch": 22.396103896103895, "grad_norm": 1.5276954174041748, "learning_rate": 2.1921820155784906e-05, "loss": 0.2631, "step": 3449 }, { "epoch": 22.4025974025974, "grad_norm": 1.3642127513885498, "learning_rate": 2.1895831107393487e-05, "loss": 0.2205, "step": 3450 }, { "epoch": 22.40909090909091, "grad_norm": 1.4631916284561157, "learning_rate": 2.1869853154082832e-05, "loss": 0.2529, "step": 3451 }, { "epoch": 22.415584415584416, "grad_norm": 1.6195316314697266, "learning_rate": 2.1843886306108686e-05, "loss": 0.2917, "step": 3452 }, { "epoch": 22.42207792207792, "grad_norm": 1.424699068069458, "learning_rate": 2.1817930573722305e-05, "loss": 0.2521, "step": 3453 }, { "epoch": 22.428571428571427, "grad_norm": 1.459183692932129, "learning_rate": 2.1791985967170632e-05, "loss": 0.2457, "step": 3454 }, { "epoch": 22.435064935064936, "grad_norm": 1.543350100517273, "learning_rate": 2.1766052496696153e-05, "loss": 0.2855, "step": 3455 }, { "epoch": 22.441558441558442, "grad_norm": 1.5383509397506714, "learning_rate": 2.174013017253701e-05, "loss": 0.2413, "step": 3456 }, { "epoch": 22.448051948051948, "grad_norm": 1.581472396850586, "learning_rate": 2.171421900492692e-05, "loss": 0.2825, "step": 3457 }, { "epoch": 22.454545454545453, "grad_norm": 1.680168628692627, "learning_rate": 2.168831900409523e-05, "loss": 0.281, "step": 3458 }, { "epoch": 22.461038961038962, "grad_norm": 1.4650065898895264, "learning_rate": 2.1662430180266812e-05, "loss": 0.2405, "step": 3459 }, { "epoch": 22.467532467532468, "grad_norm": 1.5870894193649292, "learning_rate": 2.1636552543662193e-05, "loss": 0.2787, "step": 3460 }, { "epoch": 22.474025974025974, "grad_norm": 1.5513888597488403, "learning_rate": 2.161068610449742e-05, "loss": 0.2583, "step": 3461 }, { "epoch": 22.48051948051948, "grad_norm": 1.59364914894104, "learning_rate": 2.158483087298417e-05, "loss": 0.2772, "step": 3462 }, { "epoch": 22.48701298701299, "grad_norm": 1.5559419393539429, "learning_rate": 2.1558986859329704e-05, "loss": 0.2849, "step": 3463 }, { "epoch": 22.493506493506494, "grad_norm": 1.7203205823898315, "learning_rate": 2.153315407373679e-05, "loss": 0.2926, "step": 3464 }, { "epoch": 22.5, "grad_norm": 1.6420910358428955, "learning_rate": 2.1507332526403818e-05, "loss": 0.2962, "step": 3465 }, { "epoch": 22.506493506493506, "grad_norm": 1.7251297235488892, "learning_rate": 2.1481522227524726e-05, "loss": 0.314, "step": 3466 }, { "epoch": 22.51298701298701, "grad_norm": 1.6253340244293213, "learning_rate": 2.1455723187289035e-05, "loss": 0.2871, "step": 3467 }, { "epoch": 22.51948051948052, "grad_norm": 1.5801173448562622, "learning_rate": 2.142993541588175e-05, "loss": 0.2581, "step": 3468 }, { "epoch": 22.525974025974026, "grad_norm": 1.4139834642410278, "learning_rate": 2.1404158923483525e-05, "loss": 0.2425, "step": 3469 }, { "epoch": 22.532467532467532, "grad_norm": 1.5056028366088867, "learning_rate": 2.1378393720270468e-05, "loss": 0.2619, "step": 3470 }, { "epoch": 22.538961038961038, "grad_norm": 1.4539679288864136, "learning_rate": 2.1352639816414295e-05, "loss": 0.2693, "step": 3471 }, { "epoch": 22.545454545454547, "grad_norm": 1.6343995332717896, "learning_rate": 2.1326897222082226e-05, "loss": 0.2906, "step": 3472 }, { "epoch": 22.551948051948052, "grad_norm": 1.6230934858322144, "learning_rate": 2.1301165947437064e-05, "loss": 0.2958, "step": 3473 }, { "epoch": 22.558441558441558, "grad_norm": 1.6073095798492432, "learning_rate": 2.1275446002637066e-05, "loss": 0.2966, "step": 3474 }, { "epoch": 22.564935064935064, "grad_norm": 1.5576633214950562, "learning_rate": 2.1249737397836094e-05, "loss": 0.263, "step": 3475 }, { "epoch": 22.571428571428573, "grad_norm": 1.604946494102478, "learning_rate": 2.1224040143183444e-05, "loss": 0.2964, "step": 3476 }, { "epoch": 22.57792207792208, "grad_norm": 1.6344997882843018, "learning_rate": 2.119835424882406e-05, "loss": 0.3117, "step": 3477 }, { "epoch": 22.584415584415584, "grad_norm": 1.5691205263137817, "learning_rate": 2.117267972489827e-05, "loss": 0.3134, "step": 3478 }, { "epoch": 22.59090909090909, "grad_norm": 1.8421242237091064, "learning_rate": 2.1147016581542e-05, "loss": 0.3309, "step": 3479 }, { "epoch": 22.5974025974026, "grad_norm": 1.4754592180252075, "learning_rate": 2.112136482888663e-05, "loss": 0.276, "step": 3480 }, { "epoch": 22.603896103896105, "grad_norm": 1.3509669303894043, "learning_rate": 2.1095724477059077e-05, "loss": 0.2421, "step": 3481 }, { "epoch": 22.61038961038961, "grad_norm": 1.6093113422393799, "learning_rate": 2.107009553618174e-05, "loss": 0.2811, "step": 3482 }, { "epoch": 22.616883116883116, "grad_norm": 1.4185787439346313, "learning_rate": 2.1044478016372542e-05, "loss": 0.2502, "step": 3483 }, { "epoch": 22.623376623376622, "grad_norm": 1.6224240064620972, "learning_rate": 2.1018871927744843e-05, "loss": 0.2546, "step": 3484 }, { "epoch": 22.62987012987013, "grad_norm": 1.4978384971618652, "learning_rate": 2.099327728040755e-05, "loss": 0.259, "step": 3485 }, { "epoch": 22.636363636363637, "grad_norm": 1.6402332782745361, "learning_rate": 2.0967694084464973e-05, "loss": 0.2675, "step": 3486 }, { "epoch": 22.642857142857142, "grad_norm": 1.6276346445083618, "learning_rate": 2.0942122350017023e-05, "loss": 0.289, "step": 3487 }, { "epoch": 22.649350649350648, "grad_norm": 1.5568166971206665, "learning_rate": 2.0916562087158964e-05, "loss": 0.259, "step": 3488 }, { "epoch": 22.655844155844157, "grad_norm": 1.637975811958313, "learning_rate": 2.0891013305981622e-05, "loss": 0.2992, "step": 3489 }, { "epoch": 22.662337662337663, "grad_norm": 1.479280948638916, "learning_rate": 2.0865476016571207e-05, "loss": 0.2529, "step": 3490 }, { "epoch": 22.66883116883117, "grad_norm": 1.5200845003128052, "learning_rate": 2.083995022900946e-05, "loss": 0.2652, "step": 3491 }, { "epoch": 22.675324675324674, "grad_norm": 1.5632872581481934, "learning_rate": 2.0814435953373557e-05, "loss": 0.2867, "step": 3492 }, { "epoch": 22.681818181818183, "grad_norm": 1.63100266456604, "learning_rate": 2.0788933199736143e-05, "loss": 0.2803, "step": 3493 }, { "epoch": 22.68831168831169, "grad_norm": 1.738909363746643, "learning_rate": 2.0763441978165276e-05, "loss": 0.3017, "step": 3494 }, { "epoch": 22.694805194805195, "grad_norm": 1.545060396194458, "learning_rate": 2.0737962298724512e-05, "loss": 0.2613, "step": 3495 }, { "epoch": 22.7012987012987, "grad_norm": 1.7323980331420898, "learning_rate": 2.071249417147278e-05, "loss": 0.2752, "step": 3496 }, { "epoch": 22.707792207792206, "grad_norm": 1.6065315008163452, "learning_rate": 2.0687037606464553e-05, "loss": 0.2881, "step": 3497 }, { "epoch": 22.714285714285715, "grad_norm": 1.5343070030212402, "learning_rate": 2.0661592613749635e-05, "loss": 0.2492, "step": 3498 }, { "epoch": 22.72077922077922, "grad_norm": 1.3241387605667114, "learning_rate": 2.0636159203373344e-05, "loss": 0.2322, "step": 3499 }, { "epoch": 22.727272727272727, "grad_norm": 1.4830141067504883, "learning_rate": 2.061073738537635e-05, "loss": 0.2737, "step": 3500 }, { "epoch": 22.733766233766232, "grad_norm": 1.7142821550369263, "learning_rate": 2.05853271697948e-05, "loss": 0.3126, "step": 3501 }, { "epoch": 22.74025974025974, "grad_norm": 1.4911231994628906, "learning_rate": 2.055992856666024e-05, "loss": 0.274, "step": 3502 }, { "epoch": 22.746753246753247, "grad_norm": 1.6087110042572021, "learning_rate": 2.053454158599966e-05, "loss": 0.3002, "step": 3503 }, { "epoch": 22.753246753246753, "grad_norm": 1.5740450620651245, "learning_rate": 2.0509166237835404e-05, "loss": 0.2654, "step": 3504 }, { "epoch": 22.75974025974026, "grad_norm": 1.6524783372879028, "learning_rate": 2.0483802532185286e-05, "loss": 0.2835, "step": 3505 }, { "epoch": 22.766233766233768, "grad_norm": 1.8935885429382324, "learning_rate": 2.045845047906247e-05, "loss": 0.3083, "step": 3506 }, { "epoch": 22.772727272727273, "grad_norm": 1.2677043676376343, "learning_rate": 2.043311008847555e-05, "loss": 0.2216, "step": 3507 }, { "epoch": 22.77922077922078, "grad_norm": 1.3806015253067017, "learning_rate": 2.0407781370428524e-05, "loss": 0.2321, "step": 3508 }, { "epoch": 22.785714285714285, "grad_norm": 1.4981380701065063, "learning_rate": 2.0382464334920776e-05, "loss": 0.2793, "step": 3509 }, { "epoch": 22.792207792207794, "grad_norm": 1.2533999681472778, "learning_rate": 2.035715899194704e-05, "loss": 0.2194, "step": 3510 }, { "epoch": 22.7987012987013, "grad_norm": 1.5374542474746704, "learning_rate": 2.0331865351497486e-05, "loss": 0.2712, "step": 3511 }, { "epoch": 22.805194805194805, "grad_norm": 1.42042076587677, "learning_rate": 2.0306583423557653e-05, "loss": 0.2628, "step": 3512 }, { "epoch": 22.81168831168831, "grad_norm": 1.5351003408432007, "learning_rate": 2.0281313218108416e-05, "loss": 0.2701, "step": 3513 }, { "epoch": 22.818181818181817, "grad_norm": 1.7053982019424438, "learning_rate": 2.0256054745126086e-05, "loss": 0.3417, "step": 3514 }, { "epoch": 22.824675324675326, "grad_norm": 1.711934208869934, "learning_rate": 2.0230808014582263e-05, "loss": 0.2967, "step": 3515 }, { "epoch": 22.83116883116883, "grad_norm": 1.590355396270752, "learning_rate": 2.0205573036443994e-05, "loss": 0.2906, "step": 3516 }, { "epoch": 22.837662337662337, "grad_norm": 1.7013020515441895, "learning_rate": 2.018034982067363e-05, "loss": 0.2959, "step": 3517 }, { "epoch": 22.844155844155843, "grad_norm": 1.5259300470352173, "learning_rate": 2.015513837722893e-05, "loss": 0.2573, "step": 3518 }, { "epoch": 22.850649350649352, "grad_norm": 1.4902393817901611, "learning_rate": 2.012993871606292e-05, "loss": 0.2522, "step": 3519 }, { "epoch": 22.857142857142858, "grad_norm": 1.668640375137329, "learning_rate": 2.0104750847124075e-05, "loss": 0.3173, "step": 3520 }, { "epoch": 22.863636363636363, "grad_norm": 1.5937610864639282, "learning_rate": 2.0079574780356115e-05, "loss": 0.2823, "step": 3521 }, { "epoch": 22.87012987012987, "grad_norm": 1.5136908292770386, "learning_rate": 2.0054410525698215e-05, "loss": 0.2576, "step": 3522 }, { "epoch": 22.876623376623378, "grad_norm": 1.65923011302948, "learning_rate": 2.0029258093084774e-05, "loss": 0.2765, "step": 3523 }, { "epoch": 22.883116883116884, "grad_norm": 1.6090705394744873, "learning_rate": 2.0004117492445616e-05, "loss": 0.2561, "step": 3524 }, { "epoch": 22.88961038961039, "grad_norm": 1.497574806213379, "learning_rate": 1.9978988733705807e-05, "loss": 0.2711, "step": 3525 }, { "epoch": 22.896103896103895, "grad_norm": 1.8243398666381836, "learning_rate": 1.9953871826785807e-05, "loss": 0.3382, "step": 3526 }, { "epoch": 22.9025974025974, "grad_norm": 1.5362616777420044, "learning_rate": 1.992876678160137e-05, "loss": 0.2765, "step": 3527 }, { "epoch": 22.90909090909091, "grad_norm": 1.5441484451293945, "learning_rate": 1.9903673608063587e-05, "loss": 0.2827, "step": 3528 }, { "epoch": 22.915584415584416, "grad_norm": 1.6237545013427734, "learning_rate": 1.9878592316078816e-05, "loss": 0.3137, "step": 3529 }, { "epoch": 22.92207792207792, "grad_norm": 1.4968721866607666, "learning_rate": 1.9853522915548784e-05, "loss": 0.2921, "step": 3530 }, { "epoch": 22.928571428571427, "grad_norm": 1.9086956977844238, "learning_rate": 1.982846541637044e-05, "loss": 0.3724, "step": 3531 }, { "epoch": 22.935064935064936, "grad_norm": 1.7461841106414795, "learning_rate": 1.980341982843616e-05, "loss": 0.2938, "step": 3532 }, { "epoch": 22.941558441558442, "grad_norm": 1.7323745489120483, "learning_rate": 1.9778386161633495e-05, "loss": 0.3009, "step": 3533 }, { "epoch": 22.948051948051948, "grad_norm": 1.5889756679534912, "learning_rate": 1.9753364425845368e-05, "loss": 0.2976, "step": 3534 }, { "epoch": 22.954545454545453, "grad_norm": 1.5568944215774536, "learning_rate": 1.9728354630949936e-05, "loss": 0.2702, "step": 3535 }, { "epoch": 22.961038961038962, "grad_norm": 1.6102936267852783, "learning_rate": 1.9703356786820687e-05, "loss": 0.2748, "step": 3536 }, { "epoch": 22.967532467532468, "grad_norm": 1.4768744707107544, "learning_rate": 1.967837090332637e-05, "loss": 0.2667, "step": 3537 }, { "epoch": 22.974025974025974, "grad_norm": 1.4863190650939941, "learning_rate": 1.965339699033104e-05, "loss": 0.2655, "step": 3538 }, { "epoch": 22.98051948051948, "grad_norm": 1.752227783203125, "learning_rate": 1.962843505769396e-05, "loss": 0.3076, "step": 3539 }, { "epoch": 22.98701298701299, "grad_norm": 1.5887079238891602, "learning_rate": 1.9603485115269748e-05, "loss": 0.2852, "step": 3540 }, { "epoch": 22.993506493506494, "grad_norm": 1.5256731510162354, "learning_rate": 1.9578547172908184e-05, "loss": 0.2932, "step": 3541 }, { "epoch": 23.0, "grad_norm": 3.6849677562713623, "learning_rate": 1.9553621240454452e-05, "loss": 0.2778, "step": 3542 }, { "epoch": 23.006493506493506, "grad_norm": 1.3863080739974976, "learning_rate": 1.9528707327748856e-05, "loss": 0.2273, "step": 3543 }, { "epoch": 23.01298701298701, "grad_norm": 1.3826905488967896, "learning_rate": 1.9503805444627054e-05, "loss": 0.2409, "step": 3544 }, { "epoch": 23.01948051948052, "grad_norm": 1.471177101135254, "learning_rate": 1.947891560091988e-05, "loss": 0.2199, "step": 3545 }, { "epoch": 23.025974025974026, "grad_norm": 1.404947280883789, "learning_rate": 1.9454037806453464e-05, "loss": 0.2294, "step": 3546 }, { "epoch": 23.032467532467532, "grad_norm": 1.5946253538131714, "learning_rate": 1.9429172071049168e-05, "loss": 0.2713, "step": 3547 }, { "epoch": 23.038961038961038, "grad_norm": 1.5549907684326172, "learning_rate": 1.940431840452361e-05, "loss": 0.2403, "step": 3548 }, { "epoch": 23.045454545454547, "grad_norm": 1.441696047782898, "learning_rate": 1.937947681668858e-05, "loss": 0.2545, "step": 3549 }, { "epoch": 23.051948051948052, "grad_norm": 1.5007630586624146, "learning_rate": 1.9354647317351188e-05, "loss": 0.2537, "step": 3550 }, { "epoch": 23.058441558441558, "grad_norm": 1.4599555730819702, "learning_rate": 1.932982991631369e-05, "loss": 0.256, "step": 3551 }, { "epoch": 23.064935064935064, "grad_norm": 1.4425921440124512, "learning_rate": 1.9305024623373618e-05, "loss": 0.2357, "step": 3552 }, { "epoch": 23.071428571428573, "grad_norm": 1.4287683963775635, "learning_rate": 1.928023144832371e-05, "loss": 0.243, "step": 3553 }, { "epoch": 23.07792207792208, "grad_norm": 1.5742028951644897, "learning_rate": 1.9255450400951935e-05, "loss": 0.2954, "step": 3554 }, { "epoch": 23.084415584415584, "grad_norm": 1.4811071157455444, "learning_rate": 1.923068149104143e-05, "loss": 0.2604, "step": 3555 }, { "epoch": 23.09090909090909, "grad_norm": 1.4043035507202148, "learning_rate": 1.9205924728370578e-05, "loss": 0.2263, "step": 3556 }, { "epoch": 23.0974025974026, "grad_norm": 1.7453620433807373, "learning_rate": 1.918118012271297e-05, "loss": 0.3124, "step": 3557 }, { "epoch": 23.103896103896105, "grad_norm": 1.2564915418624878, "learning_rate": 1.9156447683837363e-05, "loss": 0.2075, "step": 3558 }, { "epoch": 23.11038961038961, "grad_norm": 1.1455838680267334, "learning_rate": 1.9131727421507744e-05, "loss": 0.1923, "step": 3559 }, { "epoch": 23.116883116883116, "grad_norm": 1.5321933031082153, "learning_rate": 1.9107019345483296e-05, "loss": 0.2561, "step": 3560 }, { "epoch": 23.123376623376622, "grad_norm": 1.546483039855957, "learning_rate": 1.908232346551834e-05, "loss": 0.2422, "step": 3561 }, { "epoch": 23.12987012987013, "grad_norm": 1.4524723291397095, "learning_rate": 1.9057639791362437e-05, "loss": 0.2439, "step": 3562 }, { "epoch": 23.136363636363637, "grad_norm": 1.604983925819397, "learning_rate": 1.903296833276033e-05, "loss": 0.2708, "step": 3563 }, { "epoch": 23.142857142857142, "grad_norm": 1.5314178466796875, "learning_rate": 1.9008309099451887e-05, "loss": 0.2581, "step": 3564 }, { "epoch": 23.149350649350648, "grad_norm": 1.728439211845398, "learning_rate": 1.8983662101172216e-05, "loss": 0.3222, "step": 3565 }, { "epoch": 23.155844155844157, "grad_norm": 1.5534626245498657, "learning_rate": 1.8959027347651527e-05, "loss": 0.2586, "step": 3566 }, { "epoch": 23.162337662337663, "grad_norm": 1.4811235666275024, "learning_rate": 1.8934404848615246e-05, "loss": 0.2545, "step": 3567 }, { "epoch": 23.16883116883117, "grad_norm": 1.5369898080825806, "learning_rate": 1.8909794613783943e-05, "loss": 0.2382, "step": 3568 }, { "epoch": 23.175324675324674, "grad_norm": 1.5151273012161255, "learning_rate": 1.888519665287337e-05, "loss": 0.2654, "step": 3569 }, { "epoch": 23.181818181818183, "grad_norm": 1.5723505020141602, "learning_rate": 1.8860610975594382e-05, "loss": 0.2461, "step": 3570 }, { "epoch": 23.18831168831169, "grad_norm": 1.7091180086135864, "learning_rate": 1.8836037591653044e-05, "loss": 0.2598, "step": 3571 }, { "epoch": 23.194805194805195, "grad_norm": 1.5047938823699951, "learning_rate": 1.8811476510750488e-05, "loss": 0.2558, "step": 3572 }, { "epoch": 23.2012987012987, "grad_norm": 1.529113531112671, "learning_rate": 1.878692774258311e-05, "loss": 0.261, "step": 3573 }, { "epoch": 23.207792207792206, "grad_norm": 1.420833945274353, "learning_rate": 1.8762391296842317e-05, "loss": 0.2363, "step": 3574 }, { "epoch": 23.214285714285715, "grad_norm": 1.4916411638259888, "learning_rate": 1.8737867183214757e-05, "loss": 0.2379, "step": 3575 }, { "epoch": 23.22077922077922, "grad_norm": 1.4184962511062622, "learning_rate": 1.8713355411382116e-05, "loss": 0.2367, "step": 3576 }, { "epoch": 23.227272727272727, "grad_norm": 1.5358425378799438, "learning_rate": 1.8688855991021274e-05, "loss": 0.2458, "step": 3577 }, { "epoch": 23.233766233766232, "grad_norm": 1.5893042087554932, "learning_rate": 1.866436893180421e-05, "loss": 0.2785, "step": 3578 }, { "epoch": 23.24025974025974, "grad_norm": 1.513578176498413, "learning_rate": 1.8639894243398053e-05, "loss": 0.2704, "step": 3579 }, { "epoch": 23.246753246753247, "grad_norm": 1.4020006656646729, "learning_rate": 1.8615431935464982e-05, "loss": 0.2285, "step": 3580 }, { "epoch": 23.253246753246753, "grad_norm": 1.4641797542572021, "learning_rate": 1.8590982017662368e-05, "loss": 0.2562, "step": 3581 }, { "epoch": 23.25974025974026, "grad_norm": 1.4930659532546997, "learning_rate": 1.856654449964259e-05, "loss": 0.2236, "step": 3582 }, { "epoch": 23.266233766233768, "grad_norm": 1.851391315460205, "learning_rate": 1.854211939105327e-05, "loss": 0.2707, "step": 3583 }, { "epoch": 23.272727272727273, "grad_norm": 1.4651583433151245, "learning_rate": 1.8517706701537e-05, "loss": 0.2275, "step": 3584 }, { "epoch": 23.27922077922078, "grad_norm": 1.6360440254211426, "learning_rate": 1.8493306440731562e-05, "loss": 0.2798, "step": 3585 }, { "epoch": 23.285714285714285, "grad_norm": 1.5081583261489868, "learning_rate": 1.846891861826975e-05, "loss": 0.2646, "step": 3586 }, { "epoch": 23.292207792207794, "grad_norm": 1.5869941711425781, "learning_rate": 1.844454324377951e-05, "loss": 0.262, "step": 3587 }, { "epoch": 23.2987012987013, "grad_norm": 1.5664876699447632, "learning_rate": 1.8420180326883857e-05, "loss": 0.2495, "step": 3588 }, { "epoch": 23.305194805194805, "grad_norm": 1.5755020380020142, "learning_rate": 1.8395829877200903e-05, "loss": 0.2655, "step": 3589 }, { "epoch": 23.31168831168831, "grad_norm": 1.5650177001953125, "learning_rate": 1.837149190434378e-05, "loss": 0.2771, "step": 3590 }, { "epoch": 23.318181818181817, "grad_norm": 1.238885760307312, "learning_rate": 1.8347166417920774e-05, "loss": 0.2249, "step": 3591 }, { "epoch": 23.324675324675326, "grad_norm": 1.5093410015106201, "learning_rate": 1.832285342753515e-05, "loss": 0.2335, "step": 3592 }, { "epoch": 23.33116883116883, "grad_norm": 1.4694108963012695, "learning_rate": 1.8298552942785353e-05, "loss": 0.2488, "step": 3593 }, { "epoch": 23.337662337662337, "grad_norm": 1.3553141355514526, "learning_rate": 1.827426497326478e-05, "loss": 0.2283, "step": 3594 }, { "epoch": 23.344155844155843, "grad_norm": 1.5009989738464355, "learning_rate": 1.824998952856198e-05, "loss": 0.2548, "step": 3595 }, { "epoch": 23.350649350649352, "grad_norm": 1.5753271579742432, "learning_rate": 1.8225726618260473e-05, "loss": 0.2551, "step": 3596 }, { "epoch": 23.357142857142858, "grad_norm": 1.6028717756271362, "learning_rate": 1.8201476251938887e-05, "loss": 0.2875, "step": 3597 }, { "epoch": 23.363636363636363, "grad_norm": 1.4757002592086792, "learning_rate": 1.817723843917089e-05, "loss": 0.2481, "step": 3598 }, { "epoch": 23.37012987012987, "grad_norm": 1.582947015762329, "learning_rate": 1.8153013189525193e-05, "loss": 0.2786, "step": 3599 }, { "epoch": 23.376623376623378, "grad_norm": 1.5916093587875366, "learning_rate": 1.812880051256552e-05, "loss": 0.2809, "step": 3600 }, { "epoch": 23.383116883116884, "grad_norm": 1.4278252124786377, "learning_rate": 1.810460041785067e-05, "loss": 0.2306, "step": 3601 }, { "epoch": 23.38961038961039, "grad_norm": 1.3419313430786133, "learning_rate": 1.8080412914934442e-05, "loss": 0.2413, "step": 3602 }, { "epoch": 23.396103896103895, "grad_norm": 1.6411482095718384, "learning_rate": 1.8056238013365677e-05, "loss": 0.2981, "step": 3603 }, { "epoch": 23.4025974025974, "grad_norm": 1.6128722429275513, "learning_rate": 1.8032075722688258e-05, "loss": 0.2728, "step": 3604 }, { "epoch": 23.40909090909091, "grad_norm": 1.6131975650787354, "learning_rate": 1.800792605244109e-05, "loss": 0.2676, "step": 3605 }, { "epoch": 23.415584415584416, "grad_norm": 1.5457934141159058, "learning_rate": 1.7983789012158035e-05, "loss": 0.2864, "step": 3606 }, { "epoch": 23.42207792207792, "grad_norm": 1.3788739442825317, "learning_rate": 1.7959664611368044e-05, "loss": 0.2092, "step": 3607 }, { "epoch": 23.428571428571427, "grad_norm": 1.4640756845474243, "learning_rate": 1.7935552859595063e-05, "loss": 0.2505, "step": 3608 }, { "epoch": 23.435064935064936, "grad_norm": 1.3214120864868164, "learning_rate": 1.791145376635799e-05, "loss": 0.2281, "step": 3609 }, { "epoch": 23.441558441558442, "grad_norm": 1.7398396730422974, "learning_rate": 1.788736734117078e-05, "loss": 0.305, "step": 3610 }, { "epoch": 23.448051948051948, "grad_norm": 1.3596508502960205, "learning_rate": 1.7863293593542403e-05, "loss": 0.2132, "step": 3611 }, { "epoch": 23.454545454545453, "grad_norm": 1.7303870916366577, "learning_rate": 1.7839232532976747e-05, "loss": 0.3175, "step": 3612 }, { "epoch": 23.461038961038962, "grad_norm": 1.5764626264572144, "learning_rate": 1.781518416897276e-05, "loss": 0.2761, "step": 3613 }, { "epoch": 23.467532467532468, "grad_norm": 1.5844157934188843, "learning_rate": 1.779114851102437e-05, "loss": 0.2682, "step": 3614 }, { "epoch": 23.474025974025974, "grad_norm": 1.4731892347335815, "learning_rate": 1.776712556862044e-05, "loss": 0.247, "step": 3615 }, { "epoch": 23.48051948051948, "grad_norm": 1.3179312944412231, "learning_rate": 1.7743115351244882e-05, "loss": 0.2403, "step": 3616 }, { "epoch": 23.48701298701299, "grad_norm": 1.5436668395996094, "learning_rate": 1.77191178683765e-05, "loss": 0.2579, "step": 3617 }, { "epoch": 23.493506493506494, "grad_norm": 1.3392527103424072, "learning_rate": 1.769513312948919e-05, "loss": 0.2262, "step": 3618 }, { "epoch": 23.5, "grad_norm": 1.5829112529754639, "learning_rate": 1.767116114405169e-05, "loss": 0.2678, "step": 3619 }, { "epoch": 23.506493506493506, "grad_norm": 1.4703954458236694, "learning_rate": 1.76472019215278e-05, "loss": 0.2369, "step": 3620 }, { "epoch": 23.51298701298701, "grad_norm": 1.7341636419296265, "learning_rate": 1.7623255471376205e-05, "loss": 0.3001, "step": 3621 }, { "epoch": 23.51948051948052, "grad_norm": 1.3890771865844727, "learning_rate": 1.7599321803050596e-05, "loss": 0.2375, "step": 3622 }, { "epoch": 23.525974025974026, "grad_norm": 1.7448186874389648, "learning_rate": 1.7575400925999614e-05, "loss": 0.2915, "step": 3623 }, { "epoch": 23.532467532467532, "grad_norm": 1.479318618774414, "learning_rate": 1.7551492849666855e-05, "loss": 0.2642, "step": 3624 }, { "epoch": 23.538961038961038, "grad_norm": 1.5565717220306396, "learning_rate": 1.7527597583490822e-05, "loss": 0.2544, "step": 3625 }, { "epoch": 23.545454545454547, "grad_norm": 1.459172010421753, "learning_rate": 1.7503715136905015e-05, "loss": 0.2641, "step": 3626 }, { "epoch": 23.551948051948052, "grad_norm": 1.4650789499282837, "learning_rate": 1.7479845519337795e-05, "loss": 0.227, "step": 3627 }, { "epoch": 23.558441558441558, "grad_norm": 1.7029366493225098, "learning_rate": 1.7455988740212576e-05, "loss": 0.3056, "step": 3628 }, { "epoch": 23.564935064935064, "grad_norm": 1.4796315431594849, "learning_rate": 1.7432144808947594e-05, "loss": 0.2184, "step": 3629 }, { "epoch": 23.571428571428573, "grad_norm": 1.584101676940918, "learning_rate": 1.7408313734956078e-05, "loss": 0.2864, "step": 3630 }, { "epoch": 23.57792207792208, "grad_norm": 1.5924060344696045, "learning_rate": 1.7384495527646126e-05, "loss": 0.2593, "step": 3631 }, { "epoch": 23.584415584415584, "grad_norm": 1.669854998588562, "learning_rate": 1.7360690196420815e-05, "loss": 0.279, "step": 3632 }, { "epoch": 23.59090909090909, "grad_norm": 1.5048229694366455, "learning_rate": 1.7336897750678106e-05, "loss": 0.2527, "step": 3633 }, { "epoch": 23.5974025974026, "grad_norm": 1.6479854583740234, "learning_rate": 1.7313118199810903e-05, "loss": 0.2895, "step": 3634 }, { "epoch": 23.603896103896105, "grad_norm": 1.448838472366333, "learning_rate": 1.7289351553206952e-05, "loss": 0.2602, "step": 3635 }, { "epoch": 23.61038961038961, "grad_norm": 1.5668859481811523, "learning_rate": 1.7265597820248984e-05, "loss": 0.2847, "step": 3636 }, { "epoch": 23.616883116883116, "grad_norm": 1.4269754886627197, "learning_rate": 1.724185701031456e-05, "loss": 0.2527, "step": 3637 }, { "epoch": 23.623376623376622, "grad_norm": 1.6819483041763306, "learning_rate": 1.721812913277623e-05, "loss": 0.2863, "step": 3638 }, { "epoch": 23.62987012987013, "grad_norm": 1.5328335762023926, "learning_rate": 1.7194414197001335e-05, "loss": 0.2537, "step": 3639 }, { "epoch": 23.636363636363637, "grad_norm": 1.4612891674041748, "learning_rate": 1.717071221235219e-05, "loss": 0.2474, "step": 3640 }, { "epoch": 23.642857142857142, "grad_norm": 1.2880675792694092, "learning_rate": 1.7147023188185933e-05, "loss": 0.2234, "step": 3641 }, { "epoch": 23.649350649350648, "grad_norm": 1.3966264724731445, "learning_rate": 1.712334713385463e-05, "loss": 0.2399, "step": 3642 }, { "epoch": 23.655844155844157, "grad_norm": 1.5062140226364136, "learning_rate": 1.7099684058705212e-05, "loss": 0.294, "step": 3643 }, { "epoch": 23.662337662337663, "grad_norm": 1.4420065879821777, "learning_rate": 1.7076033972079507e-05, "loss": 0.276, "step": 3644 }, { "epoch": 23.66883116883117, "grad_norm": 1.4636797904968262, "learning_rate": 1.7052396883314152e-05, "loss": 0.2376, "step": 3645 }, { "epoch": 23.675324675324674, "grad_norm": 1.6941367387771606, "learning_rate": 1.7028772801740746e-05, "loss": 0.2942, "step": 3646 }, { "epoch": 23.681818181818183, "grad_norm": 1.530224323272705, "learning_rate": 1.700516173668565e-05, "loss": 0.2678, "step": 3647 }, { "epoch": 23.68831168831169, "grad_norm": 1.529497742652893, "learning_rate": 1.698156369747016e-05, "loss": 0.2702, "step": 3648 }, { "epoch": 23.694805194805195, "grad_norm": 1.4436651468276978, "learning_rate": 1.6957978693410416e-05, "loss": 0.2435, "step": 3649 }, { "epoch": 23.7012987012987, "grad_norm": 1.7091460227966309, "learning_rate": 1.6934406733817414e-05, "loss": 0.2794, "step": 3650 }, { "epoch": 23.707792207792206, "grad_norm": 1.4090582132339478, "learning_rate": 1.6910847827996962e-05, "loss": 0.2426, "step": 3651 }, { "epoch": 23.714285714285715, "grad_norm": 1.5994294881820679, "learning_rate": 1.6887301985249754e-05, "loss": 0.2659, "step": 3652 }, { "epoch": 23.72077922077922, "grad_norm": 1.2122873067855835, "learning_rate": 1.6863769214871335e-05, "loss": 0.1927, "step": 3653 }, { "epoch": 23.727272727272727, "grad_norm": 1.5995254516601562, "learning_rate": 1.6840249526152034e-05, "loss": 0.256, "step": 3654 }, { "epoch": 23.733766233766232, "grad_norm": 1.2082469463348389, "learning_rate": 1.6816742928377078e-05, "loss": 0.1918, "step": 3655 }, { "epoch": 23.74025974025974, "grad_norm": 1.4470820426940918, "learning_rate": 1.67932494308265e-05, "loss": 0.2331, "step": 3656 }, { "epoch": 23.746753246753247, "grad_norm": 1.4353268146514893, "learning_rate": 1.6769769042775142e-05, "loss": 0.2403, "step": 3657 }, { "epoch": 23.753246753246753, "grad_norm": 1.5231980085372925, "learning_rate": 1.6746301773492702e-05, "loss": 0.2693, "step": 3658 }, { "epoch": 23.75974025974026, "grad_norm": 1.4293920993804932, "learning_rate": 1.67228476322437e-05, "loss": 0.2604, "step": 3659 }, { "epoch": 23.766233766233768, "grad_norm": 1.3395967483520508, "learning_rate": 1.669940662828743e-05, "loss": 0.2294, "step": 3660 }, { "epoch": 23.772727272727273, "grad_norm": 1.5245810747146606, "learning_rate": 1.6675978770878052e-05, "loss": 0.2656, "step": 3661 }, { "epoch": 23.77922077922078, "grad_norm": 2.2698071002960205, "learning_rate": 1.6652564069264475e-05, "loss": 0.2688, "step": 3662 }, { "epoch": 23.785714285714285, "grad_norm": 1.658073902130127, "learning_rate": 1.662916253269052e-05, "loss": 0.2921, "step": 3663 }, { "epoch": 23.792207792207794, "grad_norm": 1.393870234489441, "learning_rate": 1.6605774170394682e-05, "loss": 0.2401, "step": 3664 }, { "epoch": 23.7987012987013, "grad_norm": 1.4478249549865723, "learning_rate": 1.658239899161036e-05, "loss": 0.2508, "step": 3665 }, { "epoch": 23.805194805194805, "grad_norm": 1.2878296375274658, "learning_rate": 1.655903700556567e-05, "loss": 0.2191, "step": 3666 }, { "epoch": 23.81168831168831, "grad_norm": 1.5081398487091064, "learning_rate": 1.653568822148356e-05, "loss": 0.2592, "step": 3667 }, { "epoch": 23.818181818181817, "grad_norm": 1.5049258470535278, "learning_rate": 1.651235264858177e-05, "loss": 0.2396, "step": 3668 }, { "epoch": 23.824675324675326, "grad_norm": 1.6963160037994385, "learning_rate": 1.648903029607283e-05, "loss": 0.2868, "step": 3669 }, { "epoch": 23.83116883116883, "grad_norm": 1.810282588005066, "learning_rate": 1.6465721173164005e-05, "loss": 0.2948, "step": 3670 }, { "epoch": 23.837662337662337, "grad_norm": 1.5936979055404663, "learning_rate": 1.6442425289057388e-05, "loss": 0.2531, "step": 3671 }, { "epoch": 23.844155844155843, "grad_norm": 1.6848145723342896, "learning_rate": 1.6419142652949793e-05, "loss": 0.2784, "step": 3672 }, { "epoch": 23.850649350649352, "grad_norm": 1.502152442932129, "learning_rate": 1.639587327403289e-05, "loss": 0.2553, "step": 3673 }, { "epoch": 23.857142857142858, "grad_norm": 1.4996960163116455, "learning_rate": 1.6372617161493014e-05, "loss": 0.2596, "step": 3674 }, { "epoch": 23.863636363636363, "grad_norm": 1.3415074348449707, "learning_rate": 1.6349374324511345e-05, "loss": 0.2331, "step": 3675 }, { "epoch": 23.87012987012987, "grad_norm": 1.458927869796753, "learning_rate": 1.6326144772263753e-05, "loss": 0.2774, "step": 3676 }, { "epoch": 23.876623376623378, "grad_norm": 1.5150805711746216, "learning_rate": 1.6302928513920913e-05, "loss": 0.2661, "step": 3677 }, { "epoch": 23.883116883116884, "grad_norm": 1.4769573211669922, "learning_rate": 1.627972555864824e-05, "loss": 0.2477, "step": 3678 }, { "epoch": 23.88961038961039, "grad_norm": 1.5924259424209595, "learning_rate": 1.6256535915605907e-05, "loss": 0.2724, "step": 3679 }, { "epoch": 23.896103896103895, "grad_norm": 1.4722639322280884, "learning_rate": 1.6233359593948783e-05, "loss": 0.2645, "step": 3680 }, { "epoch": 23.9025974025974, "grad_norm": 1.3167918920516968, "learning_rate": 1.6210196602826544e-05, "loss": 0.2148, "step": 3681 }, { "epoch": 23.90909090909091, "grad_norm": 1.5795596837997437, "learning_rate": 1.6187046951383532e-05, "loss": 0.2921, "step": 3682 }, { "epoch": 23.915584415584416, "grad_norm": 1.4929596185684204, "learning_rate": 1.6163910648758916e-05, "loss": 0.2413, "step": 3683 }, { "epoch": 23.92207792207792, "grad_norm": 1.5382635593414307, "learning_rate": 1.6140787704086506e-05, "loss": 0.2658, "step": 3684 }, { "epoch": 23.928571428571427, "grad_norm": 1.551076889038086, "learning_rate": 1.6117678126494894e-05, "loss": 0.2713, "step": 3685 }, { "epoch": 23.935064935064936, "grad_norm": 1.6375598907470703, "learning_rate": 1.6094581925107353e-05, "loss": 0.256, "step": 3686 }, { "epoch": 23.941558441558442, "grad_norm": 1.6615065336227417, "learning_rate": 1.607149910904191e-05, "loss": 0.3001, "step": 3687 }, { "epoch": 23.948051948051948, "grad_norm": 1.4583709239959717, "learning_rate": 1.6048429687411292e-05, "loss": 0.2406, "step": 3688 }, { "epoch": 23.954545454545453, "grad_norm": 1.6075941324234009, "learning_rate": 1.6025373669322962e-05, "loss": 0.291, "step": 3689 }, { "epoch": 23.961038961038962, "grad_norm": 1.4560933113098145, "learning_rate": 1.600233106387904e-05, "loss": 0.2548, "step": 3690 }, { "epoch": 23.967532467532468, "grad_norm": 1.340819239616394, "learning_rate": 1.5979301880176412e-05, "loss": 0.2218, "step": 3691 }, { "epoch": 23.974025974025974, "grad_norm": 1.4247775077819824, "learning_rate": 1.5956286127306596e-05, "loss": 0.2292, "step": 3692 }, { "epoch": 23.98051948051948, "grad_norm": 1.4783180952072144, "learning_rate": 1.5933283814355872e-05, "loss": 0.2592, "step": 3693 }, { "epoch": 23.98701298701299, "grad_norm": 1.2897588014602661, "learning_rate": 1.591029495040518e-05, "loss": 0.2068, "step": 3694 }, { "epoch": 23.993506493506494, "grad_norm": 1.708559274673462, "learning_rate": 1.588731954453019e-05, "loss": 0.2903, "step": 3695 }, { "epoch": 24.0, "grad_norm": 555.4655151367188, "learning_rate": 1.586435760580118e-05, "loss": 0.3237, "step": 3696 }, { "epoch": 24.006493506493506, "grad_norm": 1.4743223190307617, "learning_rate": 1.5841409143283197e-05, "loss": 0.2401, "step": 3697 }, { "epoch": 24.01298701298701, "grad_norm": 1.3983312845230103, "learning_rate": 1.5818474166035906e-05, "loss": 0.2394, "step": 3698 }, { "epoch": 24.01948051948052, "grad_norm": 1.5984793901443481, "learning_rate": 1.5795552683113685e-05, "loss": 0.2686, "step": 3699 }, { "epoch": 24.025974025974026, "grad_norm": 1.5668600797653198, "learning_rate": 1.5772644703565565e-05, "loss": 0.2515, "step": 3700 }, { "epoch": 24.032467532467532, "grad_norm": 1.4743406772613525, "learning_rate": 1.5749750236435277e-05, "loss": 0.2497, "step": 3701 }, { "epoch": 24.038961038961038, "grad_norm": 1.571355938911438, "learning_rate": 1.5726869290761164e-05, "loss": 0.2552, "step": 3702 }, { "epoch": 24.045454545454547, "grad_norm": 1.4651532173156738, "learning_rate": 1.5704001875576267e-05, "loss": 0.2313, "step": 3703 }, { "epoch": 24.051948051948052, "grad_norm": 1.5145294666290283, "learning_rate": 1.5681147999908306e-05, "loss": 0.2496, "step": 3704 }, { "epoch": 24.058441558441558, "grad_norm": 1.4692890644073486, "learning_rate": 1.5658307672779593e-05, "loss": 0.2447, "step": 3705 }, { "epoch": 24.064935064935064, "grad_norm": 1.3533949851989746, "learning_rate": 1.5635480903207144e-05, "loss": 0.2198, "step": 3706 }, { "epoch": 24.071428571428573, "grad_norm": 1.4457982778549194, "learning_rate": 1.5612667700202622e-05, "loss": 0.2217, "step": 3707 }, { "epoch": 24.07792207792208, "grad_norm": 1.3959927558898926, "learning_rate": 1.5589868072772284e-05, "loss": 0.2436, "step": 3708 }, { "epoch": 24.084415584415584, "grad_norm": 1.291892409324646, "learning_rate": 1.5567082029917078e-05, "loss": 0.2101, "step": 3709 }, { "epoch": 24.09090909090909, "grad_norm": 1.4876279830932617, "learning_rate": 1.554430958063259e-05, "loss": 0.2237, "step": 3710 }, { "epoch": 24.0974025974026, "grad_norm": 1.59600031375885, "learning_rate": 1.552155073390899e-05, "loss": 0.2803, "step": 3711 }, { "epoch": 24.103896103896105, "grad_norm": 1.4498419761657715, "learning_rate": 1.549880549873115e-05, "loss": 0.2451, "step": 3712 }, { "epoch": 24.11038961038961, "grad_norm": 1.4382352828979492, "learning_rate": 1.5476073884078467e-05, "loss": 0.2505, "step": 3713 }, { "epoch": 24.116883116883116, "grad_norm": 1.2461518049240112, "learning_rate": 1.5453355898925093e-05, "loss": 0.2216, "step": 3714 }, { "epoch": 24.123376623376622, "grad_norm": 1.4990081787109375, "learning_rate": 1.5430651552239685e-05, "loss": 0.2283, "step": 3715 }, { "epoch": 24.12987012987013, "grad_norm": 1.3354995250701904, "learning_rate": 1.5407960852985583e-05, "loss": 0.2338, "step": 3716 }, { "epoch": 24.136363636363637, "grad_norm": 1.5128660202026367, "learning_rate": 1.538528381012069e-05, "loss": 0.2454, "step": 3717 }, { "epoch": 24.142857142857142, "grad_norm": 1.5596075057983398, "learning_rate": 1.5362620432597557e-05, "loss": 0.2545, "step": 3718 }, { "epoch": 24.149350649350648, "grad_norm": 1.5532145500183105, "learning_rate": 1.533997072936333e-05, "loss": 0.2572, "step": 3719 }, { "epoch": 24.155844155844157, "grad_norm": 1.504630208015442, "learning_rate": 1.5317334709359766e-05, "loss": 0.2545, "step": 3720 }, { "epoch": 24.162337662337663, "grad_norm": 1.1753491163253784, "learning_rate": 1.529471238152317e-05, "loss": 0.1929, "step": 3721 }, { "epoch": 24.16883116883117, "grad_norm": 1.445791482925415, "learning_rate": 1.5272103754784517e-05, "loss": 0.2438, "step": 3722 }, { "epoch": 24.175324675324674, "grad_norm": 1.497458577156067, "learning_rate": 1.5249508838069293e-05, "loss": 0.2727, "step": 3723 }, { "epoch": 24.181818181818183, "grad_norm": 1.389278769493103, "learning_rate": 1.5226927640297662e-05, "loss": 0.2349, "step": 3724 }, { "epoch": 24.18831168831169, "grad_norm": 1.4731794595718384, "learning_rate": 1.5204360170384286e-05, "loss": 0.2519, "step": 3725 }, { "epoch": 24.194805194805195, "grad_norm": 1.4574756622314453, "learning_rate": 1.5181806437238477e-05, "loss": 0.2649, "step": 3726 }, { "epoch": 24.2012987012987, "grad_norm": 1.3568106889724731, "learning_rate": 1.5159266449764054e-05, "loss": 0.2346, "step": 3727 }, { "epoch": 24.207792207792206, "grad_norm": 1.6920344829559326, "learning_rate": 1.5136740216859464e-05, "loss": 0.2951, "step": 3728 }, { "epoch": 24.214285714285715, "grad_norm": 1.3407113552093506, "learning_rate": 1.511422774741771e-05, "loss": 0.2187, "step": 3729 }, { "epoch": 24.22077922077922, "grad_norm": 1.4222047328948975, "learning_rate": 1.5091729050326375e-05, "loss": 0.2388, "step": 3730 }, { "epoch": 24.227272727272727, "grad_norm": 1.591163158416748, "learning_rate": 1.5069244134467553e-05, "loss": 0.279, "step": 3731 }, { "epoch": 24.233766233766232, "grad_norm": 1.5508114099502563, "learning_rate": 1.5046773008717969e-05, "loss": 0.2713, "step": 3732 }, { "epoch": 24.24025974025974, "grad_norm": 1.5003843307495117, "learning_rate": 1.5024315681948813e-05, "loss": 0.2584, "step": 3733 }, { "epoch": 24.246753246753247, "grad_norm": 1.465012788772583, "learning_rate": 1.5001872163025954e-05, "loss": 0.2551, "step": 3734 }, { "epoch": 24.253246753246753, "grad_norm": 1.4430475234985352, "learning_rate": 1.4979442460809683e-05, "loss": 0.2817, "step": 3735 }, { "epoch": 24.25974025974026, "grad_norm": 1.4548296928405762, "learning_rate": 1.4957026584154926e-05, "loss": 0.2389, "step": 3736 }, { "epoch": 24.266233766233768, "grad_norm": 1.5435104370117188, "learning_rate": 1.4934624541911085e-05, "loss": 0.2559, "step": 3737 }, { "epoch": 24.272727272727273, "grad_norm": 1.4787144660949707, "learning_rate": 1.4912236342922149e-05, "loss": 0.265, "step": 3738 }, { "epoch": 24.27922077922078, "grad_norm": 1.5224908590316772, "learning_rate": 1.4889861996026616e-05, "loss": 0.2473, "step": 3739 }, { "epoch": 24.285714285714285, "grad_norm": 1.4455149173736572, "learning_rate": 1.4867501510057546e-05, "loss": 0.2417, "step": 3740 }, { "epoch": 24.292207792207794, "grad_norm": 1.3495597839355469, "learning_rate": 1.4845154893842477e-05, "loss": 0.2211, "step": 3741 }, { "epoch": 24.2987012987013, "grad_norm": 1.3956032991409302, "learning_rate": 1.4822822156203526e-05, "loss": 0.2461, "step": 3742 }, { "epoch": 24.305194805194805, "grad_norm": 1.511701226234436, "learning_rate": 1.4800503305957269e-05, "loss": 0.2532, "step": 3743 }, { "epoch": 24.31168831168831, "grad_norm": 1.4901893138885498, "learning_rate": 1.4778198351914857e-05, "loss": 0.2625, "step": 3744 }, { "epoch": 24.318181818181817, "grad_norm": 1.4659007787704468, "learning_rate": 1.4755907302881927e-05, "loss": 0.2726, "step": 3745 }, { "epoch": 24.324675324675326, "grad_norm": 1.445360779762268, "learning_rate": 1.4733630167658652e-05, "loss": 0.2332, "step": 3746 }, { "epoch": 24.33116883116883, "grad_norm": 1.2270129919052124, "learning_rate": 1.4711366955039663e-05, "loss": 0.2129, "step": 3747 }, { "epoch": 24.337662337662337, "grad_norm": 1.613595962524414, "learning_rate": 1.4689117673814135e-05, "loss": 0.244, "step": 3748 }, { "epoch": 24.344155844155843, "grad_norm": 1.2541364431381226, "learning_rate": 1.4666882332765747e-05, "loss": 0.1987, "step": 3749 }, { "epoch": 24.350649350649352, "grad_norm": 1.3972169160842896, "learning_rate": 1.4644660940672627e-05, "loss": 0.2327, "step": 3750 }, { "epoch": 24.357142857142858, "grad_norm": 1.587989091873169, "learning_rate": 1.4622453506307453e-05, "loss": 0.2674, "step": 3751 }, { "epoch": 24.363636363636363, "grad_norm": 1.7443116903305054, "learning_rate": 1.4600260038437375e-05, "loss": 0.3176, "step": 3752 }, { "epoch": 24.37012987012987, "grad_norm": 1.3819708824157715, "learning_rate": 1.4578080545823991e-05, "loss": 0.2442, "step": 3753 }, { "epoch": 24.376623376623378, "grad_norm": 1.4685349464416504, "learning_rate": 1.4555915037223439e-05, "loss": 0.2485, "step": 3754 }, { "epoch": 24.383116883116884, "grad_norm": 1.2664926052093506, "learning_rate": 1.4533763521386318e-05, "loss": 0.2141, "step": 3755 }, { "epoch": 24.38961038961039, "grad_norm": 1.5902550220489502, "learning_rate": 1.4511626007057666e-05, "loss": 0.2606, "step": 3756 }, { "epoch": 24.396103896103895, "grad_norm": 1.7585757970809937, "learning_rate": 1.4489502502977037e-05, "loss": 0.2953, "step": 3757 }, { "epoch": 24.4025974025974, "grad_norm": 1.2820913791656494, "learning_rate": 1.4467393017878445e-05, "loss": 0.2251, "step": 3758 }, { "epoch": 24.40909090909091, "grad_norm": 1.4327855110168457, "learning_rate": 1.4445297560490374e-05, "loss": 0.2521, "step": 3759 }, { "epoch": 24.415584415584416, "grad_norm": 1.6517313718795776, "learning_rate": 1.4423216139535734e-05, "loss": 0.2788, "step": 3760 }, { "epoch": 24.42207792207792, "grad_norm": 1.5322458744049072, "learning_rate": 1.4401148763731953e-05, "loss": 0.2345, "step": 3761 }, { "epoch": 24.428571428571427, "grad_norm": 1.5027211904525757, "learning_rate": 1.4379095441790846e-05, "loss": 0.2567, "step": 3762 }, { "epoch": 24.435064935064936, "grad_norm": 1.4352718591690063, "learning_rate": 1.4357056182418726e-05, "loss": 0.2709, "step": 3763 }, { "epoch": 24.441558441558442, "grad_norm": 1.4367806911468506, "learning_rate": 1.4335030994316357e-05, "loss": 0.2323, "step": 3764 }, { "epoch": 24.448051948051948, "grad_norm": 1.5347613096237183, "learning_rate": 1.4313019886178941e-05, "loss": 0.2547, "step": 3765 }, { "epoch": 24.454545454545453, "grad_norm": 1.6051234006881714, "learning_rate": 1.4291022866696085e-05, "loss": 0.2697, "step": 3766 }, { "epoch": 24.461038961038962, "grad_norm": 1.4305423498153687, "learning_rate": 1.4269039944551898e-05, "loss": 0.2227, "step": 3767 }, { "epoch": 24.467532467532468, "grad_norm": 1.4778579473495483, "learning_rate": 1.4247071128424838e-05, "loss": 0.2478, "step": 3768 }, { "epoch": 24.474025974025974, "grad_norm": 1.611335039138794, "learning_rate": 1.4225116426987917e-05, "loss": 0.2764, "step": 3769 }, { "epoch": 24.48051948051948, "grad_norm": 1.7121034860610962, "learning_rate": 1.420317584890844e-05, "loss": 0.3018, "step": 3770 }, { "epoch": 24.48701298701299, "grad_norm": 1.340805172920227, "learning_rate": 1.4181249402848246e-05, "loss": 0.1958, "step": 3771 }, { "epoch": 24.493506493506494, "grad_norm": 1.3698137998580933, "learning_rate": 1.4159337097463515e-05, "loss": 0.2062, "step": 3772 }, { "epoch": 24.5, "grad_norm": 1.6280251741409302, "learning_rate": 1.413743894140489e-05, "loss": 0.2837, "step": 3773 }, { "epoch": 24.506493506493506, "grad_norm": 1.5984694957733154, "learning_rate": 1.411555494331741e-05, "loss": 0.2679, "step": 3774 }, { "epoch": 24.51298701298701, "grad_norm": 1.6274757385253906, "learning_rate": 1.4093685111840566e-05, "loss": 0.2703, "step": 3775 }, { "epoch": 24.51948051948052, "grad_norm": 1.6693423986434937, "learning_rate": 1.407182945560817e-05, "loss": 0.2746, "step": 3776 }, { "epoch": 24.525974025974026, "grad_norm": 1.6024433374404907, "learning_rate": 1.4049987983248536e-05, "loss": 0.2988, "step": 3777 }, { "epoch": 24.532467532467532, "grad_norm": 1.5039241313934326, "learning_rate": 1.4028160703384263e-05, "loss": 0.2401, "step": 3778 }, { "epoch": 24.538961038961038, "grad_norm": 1.5956799983978271, "learning_rate": 1.4006347624632504e-05, "loss": 0.2796, "step": 3779 }, { "epoch": 24.545454545454547, "grad_norm": 1.2181111574172974, "learning_rate": 1.3984548755604654e-05, "loss": 0.1943, "step": 3780 }, { "epoch": 24.551948051948052, "grad_norm": 1.4724570512771606, "learning_rate": 1.3962764104906595e-05, "loss": 0.2597, "step": 3781 }, { "epoch": 24.558441558441558, "grad_norm": 1.7387564182281494, "learning_rate": 1.3940993681138536e-05, "loss": 0.2879, "step": 3782 }, { "epoch": 24.564935064935064, "grad_norm": 1.489858627319336, "learning_rate": 1.39192374928951e-05, "loss": 0.2499, "step": 3783 }, { "epoch": 24.571428571428573, "grad_norm": 1.685354232788086, "learning_rate": 1.3897495548765294e-05, "loss": 0.2888, "step": 3784 }, { "epoch": 24.57792207792208, "grad_norm": 1.4370073080062866, "learning_rate": 1.387576785733251e-05, "loss": 0.2467, "step": 3785 }, { "epoch": 24.584415584415584, "grad_norm": 1.5947544574737549, "learning_rate": 1.3854054427174468e-05, "loss": 0.2739, "step": 3786 }, { "epoch": 24.59090909090909, "grad_norm": 1.6878124475479126, "learning_rate": 1.3832355266863305e-05, "loss": 0.2584, "step": 3787 }, { "epoch": 24.5974025974026, "grad_norm": 1.5307517051696777, "learning_rate": 1.3810670384965474e-05, "loss": 0.2547, "step": 3788 }, { "epoch": 24.603896103896105, "grad_norm": 1.2758972644805908, "learning_rate": 1.3788999790041867e-05, "loss": 0.2046, "step": 3789 }, { "epoch": 24.61038961038961, "grad_norm": 1.4588029384613037, "learning_rate": 1.3767343490647665e-05, "loss": 0.224, "step": 3790 }, { "epoch": 24.616883116883116, "grad_norm": 1.424414873123169, "learning_rate": 1.3745701495332452e-05, "loss": 0.242, "step": 3791 }, { "epoch": 24.623376623376622, "grad_norm": 1.412399172782898, "learning_rate": 1.3724073812640115e-05, "loss": 0.2428, "step": 3792 }, { "epoch": 24.62987012987013, "grad_norm": 1.6432981491088867, "learning_rate": 1.3702460451108935e-05, "loss": 0.2778, "step": 3793 }, { "epoch": 24.636363636363637, "grad_norm": 1.4610412120819092, "learning_rate": 1.368086141927154e-05, "loss": 0.2623, "step": 3794 }, { "epoch": 24.642857142857142, "grad_norm": 1.3785265684127808, "learning_rate": 1.3659276725654863e-05, "loss": 0.2217, "step": 3795 }, { "epoch": 24.649350649350648, "grad_norm": 1.633029818534851, "learning_rate": 1.3637706378780208e-05, "loss": 0.2934, "step": 3796 }, { "epoch": 24.655844155844157, "grad_norm": 1.4184614419937134, "learning_rate": 1.3616150387163217e-05, "loss": 0.2449, "step": 3797 }, { "epoch": 24.662337662337663, "grad_norm": 1.411084771156311, "learning_rate": 1.3594608759313831e-05, "loss": 0.2554, "step": 3798 }, { "epoch": 24.66883116883117, "grad_norm": 1.3232630491256714, "learning_rate": 1.3573081503736362e-05, "loss": 0.2171, "step": 3799 }, { "epoch": 24.675324675324674, "grad_norm": 1.4922369718551636, "learning_rate": 1.3551568628929434e-05, "loss": 0.2463, "step": 3800 }, { "epoch": 24.681818181818183, "grad_norm": 1.4725935459136963, "learning_rate": 1.3530070143385965e-05, "loss": 0.2614, "step": 3801 }, { "epoch": 24.68831168831169, "grad_norm": 1.4515880346298218, "learning_rate": 1.350858605559323e-05, "loss": 0.2533, "step": 3802 }, { "epoch": 24.694805194805195, "grad_norm": 1.4999488592147827, "learning_rate": 1.3487116374032816e-05, "loss": 0.2648, "step": 3803 }, { "epoch": 24.7012987012987, "grad_norm": 1.5419762134552002, "learning_rate": 1.346566110718061e-05, "loss": 0.2507, "step": 3804 }, { "epoch": 24.707792207792206, "grad_norm": 1.4331741333007812, "learning_rate": 1.3444220263506795e-05, "loss": 0.2688, "step": 3805 }, { "epoch": 24.714285714285715, "grad_norm": 1.4227244853973389, "learning_rate": 1.3422793851475907e-05, "loss": 0.2533, "step": 3806 }, { "epoch": 24.72077922077922, "grad_norm": 1.4616429805755615, "learning_rate": 1.3401381879546715e-05, "loss": 0.2635, "step": 3807 }, { "epoch": 24.727272727272727, "grad_norm": 1.5894221067428589, "learning_rate": 1.3379984356172349e-05, "loss": 0.2791, "step": 3808 }, { "epoch": 24.733766233766232, "grad_norm": 1.5892449617385864, "learning_rate": 1.335860128980021e-05, "loss": 0.2745, "step": 3809 }, { "epoch": 24.74025974025974, "grad_norm": 1.3794944286346436, "learning_rate": 1.3337232688872003e-05, "loss": 0.2275, "step": 3810 }, { "epoch": 24.746753246753247, "grad_norm": 1.5496991872787476, "learning_rate": 1.3315878561823697e-05, "loss": 0.2256, "step": 3811 }, { "epoch": 24.753246753246753, "grad_norm": 1.5657767057418823, "learning_rate": 1.3294538917085586e-05, "loss": 0.2379, "step": 3812 }, { "epoch": 24.75974025974026, "grad_norm": 1.241636872291565, "learning_rate": 1.3273213763082193e-05, "loss": 0.1942, "step": 3813 }, { "epoch": 24.766233766233768, "grad_norm": 1.2433438301086426, "learning_rate": 1.3251903108232361e-05, "loss": 0.2174, "step": 3814 }, { "epoch": 24.772727272727273, "grad_norm": 1.3848848342895508, "learning_rate": 1.3230606960949205e-05, "loss": 0.2303, "step": 3815 }, { "epoch": 24.77922077922078, "grad_norm": 1.3921412229537964, "learning_rate": 1.3209325329640121e-05, "loss": 0.2406, "step": 3816 }, { "epoch": 24.785714285714285, "grad_norm": 1.5151231288909912, "learning_rate": 1.3188058222706734e-05, "loss": 0.2587, "step": 3817 }, { "epoch": 24.792207792207794, "grad_norm": 1.4199899435043335, "learning_rate": 1.3166805648544989e-05, "loss": 0.237, "step": 3818 }, { "epoch": 24.7987012987013, "grad_norm": 1.4588905572891235, "learning_rate": 1.3145567615545012e-05, "loss": 0.2569, "step": 3819 }, { "epoch": 24.805194805194805, "grad_norm": 1.398755431175232, "learning_rate": 1.3124344132091315e-05, "loss": 0.2233, "step": 3820 }, { "epoch": 24.81168831168831, "grad_norm": 1.4625189304351807, "learning_rate": 1.3103135206562534e-05, "loss": 0.2445, "step": 3821 }, { "epoch": 24.818181818181817, "grad_norm": 1.6369975805282593, "learning_rate": 1.3081940847331659e-05, "loss": 0.2887, "step": 3822 }, { "epoch": 24.824675324675326, "grad_norm": 1.460815191268921, "learning_rate": 1.3060761062765853e-05, "loss": 0.2501, "step": 3823 }, { "epoch": 24.83116883116883, "grad_norm": 1.5518665313720703, "learning_rate": 1.3039595861226573e-05, "loss": 0.2626, "step": 3824 }, { "epoch": 24.837662337662337, "grad_norm": 1.6659070253372192, "learning_rate": 1.301844525106951e-05, "loss": 0.3129, "step": 3825 }, { "epoch": 24.844155844155843, "grad_norm": 1.4165908098220825, "learning_rate": 1.2997309240644606e-05, "loss": 0.2336, "step": 3826 }, { "epoch": 24.850649350649352, "grad_norm": 1.5086363554000854, "learning_rate": 1.2976187838295983e-05, "loss": 0.2477, "step": 3827 }, { "epoch": 24.857142857142858, "grad_norm": 1.461747407913208, "learning_rate": 1.2955081052362073e-05, "loss": 0.2679, "step": 3828 }, { "epoch": 24.863636363636363, "grad_norm": 1.6059125661849976, "learning_rate": 1.2933988891175458e-05, "loss": 0.2917, "step": 3829 }, { "epoch": 24.87012987012987, "grad_norm": 1.497239112854004, "learning_rate": 1.291291136306304e-05, "loss": 0.2279, "step": 3830 }, { "epoch": 24.876623376623378, "grad_norm": 1.3064825534820557, "learning_rate": 1.2891848476345863e-05, "loss": 0.2167, "step": 3831 }, { "epoch": 24.883116883116884, "grad_norm": 1.4591797590255737, "learning_rate": 1.2870800239339236e-05, "loss": 0.2542, "step": 3832 }, { "epoch": 24.88961038961039, "grad_norm": 1.496483564376831, "learning_rate": 1.2849766660352653e-05, "loss": 0.2442, "step": 3833 }, { "epoch": 24.896103896103895, "grad_norm": 1.7861509323120117, "learning_rate": 1.282874774768984e-05, "loss": 0.34, "step": 3834 }, { "epoch": 24.9025974025974, "grad_norm": 1.6444364786148071, "learning_rate": 1.280774350964874e-05, "loss": 0.2567, "step": 3835 }, { "epoch": 24.90909090909091, "grad_norm": 1.6426337957382202, "learning_rate": 1.2786753954521507e-05, "loss": 0.2643, "step": 3836 }, { "epoch": 24.915584415584416, "grad_norm": 1.6060869693756104, "learning_rate": 1.2765779090594454e-05, "loss": 0.2623, "step": 3837 }, { "epoch": 24.92207792207792, "grad_norm": 1.4153776168823242, "learning_rate": 1.274481892614815e-05, "loss": 0.2177, "step": 3838 }, { "epoch": 24.928571428571427, "grad_norm": 1.6109665632247925, "learning_rate": 1.2723873469457304e-05, "loss": 0.2762, "step": 3839 }, { "epoch": 24.935064935064936, "grad_norm": 1.6159919500350952, "learning_rate": 1.2702942728790895e-05, "loss": 0.253, "step": 3840 }, { "epoch": 24.941558441558442, "grad_norm": 1.375745177268982, "learning_rate": 1.268202671241201e-05, "loss": 0.2242, "step": 3841 }, { "epoch": 24.948051948051948, "grad_norm": 1.4708601236343384, "learning_rate": 1.2661125428577997e-05, "loss": 0.2596, "step": 3842 }, { "epoch": 24.954545454545453, "grad_norm": 1.4296232461929321, "learning_rate": 1.2640238885540312e-05, "loss": 0.2634, "step": 3843 }, { "epoch": 24.961038961038962, "grad_norm": 1.3737279176712036, "learning_rate": 1.2619367091544654e-05, "loss": 0.242, "step": 3844 }, { "epoch": 24.967532467532468, "grad_norm": 1.6131162643432617, "learning_rate": 1.2598510054830881e-05, "loss": 0.2828, "step": 3845 }, { "epoch": 24.974025974025974, "grad_norm": 1.5378482341766357, "learning_rate": 1.2577667783633007e-05, "loss": 0.265, "step": 3846 }, { "epoch": 24.98051948051948, "grad_norm": 1.4757226705551147, "learning_rate": 1.2556840286179228e-05, "loss": 0.2856, "step": 3847 }, { "epoch": 24.98701298701299, "grad_norm": 1.4571387767791748, "learning_rate": 1.2536027570691944e-05, "loss": 0.2415, "step": 3848 }, { "epoch": 24.993506493506494, "grad_norm": 1.618838906288147, "learning_rate": 1.2515229645387638e-05, "loss": 0.2557, "step": 3849 }, { "epoch": 25.0, "grad_norm": 4.126040935516357, "learning_rate": 1.2494446518477022e-05, "loss": 0.2599, "step": 3850 }, { "epoch": 25.006493506493506, "grad_norm": 1.547411561012268, "learning_rate": 1.2473678198164968e-05, "loss": 0.2525, "step": 3851 }, { "epoch": 25.01298701298701, "grad_norm": 1.3735060691833496, "learning_rate": 1.2452924692650442e-05, "loss": 0.2263, "step": 3852 }, { "epoch": 25.01948051948052, "grad_norm": 1.2571289539337158, "learning_rate": 1.2432186010126612e-05, "loss": 0.2019, "step": 3853 }, { "epoch": 25.025974025974026, "grad_norm": 1.442159652709961, "learning_rate": 1.241146215878079e-05, "loss": 0.2374, "step": 3854 }, { "epoch": 25.032467532467532, "grad_norm": 1.4531927108764648, "learning_rate": 1.2390753146794437e-05, "loss": 0.2124, "step": 3855 }, { "epoch": 25.038961038961038, "grad_norm": 1.5387581586837769, "learning_rate": 1.2370058982343109e-05, "loss": 0.2725, "step": 3856 }, { "epoch": 25.045454545454547, "grad_norm": 1.38837730884552, "learning_rate": 1.2349379673596561e-05, "loss": 0.208, "step": 3857 }, { "epoch": 25.051948051948052, "grad_norm": 1.7436175346374512, "learning_rate": 1.232871522871864e-05, "loss": 0.2568, "step": 3858 }, { "epoch": 25.058441558441558, "grad_norm": 1.37714684009552, "learning_rate": 1.2308065655867351e-05, "loss": 0.2208, "step": 3859 }, { "epoch": 25.064935064935064, "grad_norm": 1.583754301071167, "learning_rate": 1.2287430963194807e-05, "loss": 0.2643, "step": 3860 }, { "epoch": 25.071428571428573, "grad_norm": 1.5096653699874878, "learning_rate": 1.2266811158847286e-05, "loss": 0.217, "step": 3861 }, { "epoch": 25.07792207792208, "grad_norm": 1.4023994207382202, "learning_rate": 1.2246206250965125e-05, "loss": 0.2418, "step": 3862 }, { "epoch": 25.084415584415584, "grad_norm": 1.5400210618972778, "learning_rate": 1.2225616247682848e-05, "loss": 0.271, "step": 3863 }, { "epoch": 25.09090909090909, "grad_norm": 1.6497278213500977, "learning_rate": 1.2205041157129016e-05, "loss": 0.3044, "step": 3864 }, { "epoch": 25.0974025974026, "grad_norm": 1.3827998638153076, "learning_rate": 1.2184480987426405e-05, "loss": 0.2287, "step": 3865 }, { "epoch": 25.103896103896105, "grad_norm": 1.3452811241149902, "learning_rate": 1.2163935746691806e-05, "loss": 0.2148, "step": 3866 }, { "epoch": 25.11038961038961, "grad_norm": 1.5052608251571655, "learning_rate": 1.2143405443036182e-05, "loss": 0.2355, "step": 3867 }, { "epoch": 25.116883116883116, "grad_norm": 1.4265750646591187, "learning_rate": 1.2122890084564542e-05, "loss": 0.2626, "step": 3868 }, { "epoch": 25.123376623376622, "grad_norm": 1.376091480255127, "learning_rate": 1.2102389679376036e-05, "loss": 0.2224, "step": 3869 }, { "epoch": 25.12987012987013, "grad_norm": 1.4407280683517456, "learning_rate": 1.2081904235563901e-05, "loss": 0.2321, "step": 3870 }, { "epoch": 25.136363636363637, "grad_norm": 1.2167425155639648, "learning_rate": 1.206143376121549e-05, "loss": 0.2015, "step": 3871 }, { "epoch": 25.142857142857142, "grad_norm": 1.4684216976165771, "learning_rate": 1.2040978264412178e-05, "loss": 0.2573, "step": 3872 }, { "epoch": 25.149350649350648, "grad_norm": 1.3168730735778809, "learning_rate": 1.2020537753229506e-05, "loss": 0.2023, "step": 3873 }, { "epoch": 25.155844155844157, "grad_norm": 1.4566622972488403, "learning_rate": 1.2000112235737026e-05, "loss": 0.2771, "step": 3874 }, { "epoch": 25.162337662337663, "grad_norm": 1.5811680555343628, "learning_rate": 1.1979701719998453e-05, "loss": 0.2762, "step": 3875 }, { "epoch": 25.16883116883117, "grad_norm": 1.5613210201263428, "learning_rate": 1.1959306214071508e-05, "loss": 0.2833, "step": 3876 }, { "epoch": 25.175324675324674, "grad_norm": 1.3008835315704346, "learning_rate": 1.1938925726008038e-05, "loss": 0.2079, "step": 3877 }, { "epoch": 25.181818181818183, "grad_norm": 1.446663498878479, "learning_rate": 1.19185602638539e-05, "loss": 0.2383, "step": 3878 }, { "epoch": 25.18831168831169, "grad_norm": 1.580613374710083, "learning_rate": 1.1898209835649082e-05, "loss": 0.2537, "step": 3879 }, { "epoch": 25.194805194805195, "grad_norm": 1.199026346206665, "learning_rate": 1.18778744494276e-05, "loss": 0.1829, "step": 3880 }, { "epoch": 25.2012987012987, "grad_norm": 1.642932415008545, "learning_rate": 1.1857554113217567e-05, "loss": 0.2481, "step": 3881 }, { "epoch": 25.207792207792206, "grad_norm": 1.4312461614608765, "learning_rate": 1.1837248835041092e-05, "loss": 0.2408, "step": 3882 }, { "epoch": 25.214285714285715, "grad_norm": 1.3826494216918945, "learning_rate": 1.181695862291441e-05, "loss": 0.2207, "step": 3883 }, { "epoch": 25.22077922077922, "grad_norm": 1.4738268852233887, "learning_rate": 1.179668348484773e-05, "loss": 0.2347, "step": 3884 }, { "epoch": 25.227272727272727, "grad_norm": 1.5864123106002808, "learning_rate": 1.1776423428845423e-05, "loss": 0.2581, "step": 3885 }, { "epoch": 25.233766233766232, "grad_norm": 1.4101697206497192, "learning_rate": 1.1756178462905782e-05, "loss": 0.2542, "step": 3886 }, { "epoch": 25.24025974025974, "grad_norm": 1.4642064571380615, "learning_rate": 1.1735948595021228e-05, "loss": 0.2238, "step": 3887 }, { "epoch": 25.246753246753247, "grad_norm": 1.542807698249817, "learning_rate": 1.1715733833178177e-05, "loss": 0.2461, "step": 3888 }, { "epoch": 25.253246753246753, "grad_norm": 1.3182908296585083, "learning_rate": 1.1695534185357099e-05, "loss": 0.2077, "step": 3889 }, { "epoch": 25.25974025974026, "grad_norm": 1.4858134984970093, "learning_rate": 1.1675349659532513e-05, "loss": 0.2386, "step": 3890 }, { "epoch": 25.266233766233768, "grad_norm": 1.4680992364883423, "learning_rate": 1.1655180263672927e-05, "loss": 0.2392, "step": 3891 }, { "epoch": 25.272727272727273, "grad_norm": 1.404128909111023, "learning_rate": 1.16350260057409e-05, "loss": 0.2343, "step": 3892 }, { "epoch": 25.27922077922078, "grad_norm": 1.5189787149429321, "learning_rate": 1.1614886893693044e-05, "loss": 0.251, "step": 3893 }, { "epoch": 25.285714285714285, "grad_norm": 1.460323452949524, "learning_rate": 1.159476293547992e-05, "loss": 0.2491, "step": 3894 }, { "epoch": 25.292207792207794, "grad_norm": 1.4811824560165405, "learning_rate": 1.1574654139046171e-05, "loss": 0.2355, "step": 3895 }, { "epoch": 25.2987012987013, "grad_norm": 1.612228274345398, "learning_rate": 1.1554560512330436e-05, "loss": 0.2813, "step": 3896 }, { "epoch": 25.305194805194805, "grad_norm": 1.4109948873519897, "learning_rate": 1.1534482063265345e-05, "loss": 0.2404, "step": 3897 }, { "epoch": 25.31168831168831, "grad_norm": 1.4630529880523682, "learning_rate": 1.1514418799777554e-05, "loss": 0.2311, "step": 3898 }, { "epoch": 25.318181818181817, "grad_norm": 1.425007700920105, "learning_rate": 1.1494370729787728e-05, "loss": 0.2218, "step": 3899 }, { "epoch": 25.324675324675326, "grad_norm": 1.4541804790496826, "learning_rate": 1.1474337861210543e-05, "loss": 0.2484, "step": 3900 }, { "epoch": 25.33116883116883, "grad_norm": 1.4659674167633057, "learning_rate": 1.1454320201954626e-05, "loss": 0.2246, "step": 3901 }, { "epoch": 25.337662337662337, "grad_norm": 1.3819206953048706, "learning_rate": 1.1434317759922664e-05, "loss": 0.2265, "step": 3902 }, { "epoch": 25.344155844155843, "grad_norm": 1.4699912071228027, "learning_rate": 1.141433054301127e-05, "loss": 0.2661, "step": 3903 }, { "epoch": 25.350649350649352, "grad_norm": 1.4448765516281128, "learning_rate": 1.1394358559111101e-05, "loss": 0.2366, "step": 3904 }, { "epoch": 25.357142857142858, "grad_norm": 1.3506455421447754, "learning_rate": 1.1374401816106778e-05, "loss": 0.2186, "step": 3905 }, { "epoch": 25.363636363636363, "grad_norm": 1.2490218877792358, "learning_rate": 1.135446032187692e-05, "loss": 0.2241, "step": 3906 }, { "epoch": 25.37012987012987, "grad_norm": 1.3988432884216309, "learning_rate": 1.1334534084294079e-05, "loss": 0.238, "step": 3907 }, { "epoch": 25.376623376623378, "grad_norm": 1.5286494493484497, "learning_rate": 1.1314623111224865e-05, "loss": 0.2629, "step": 3908 }, { "epoch": 25.383116883116884, "grad_norm": 1.4078693389892578, "learning_rate": 1.1294727410529749e-05, "loss": 0.2273, "step": 3909 }, { "epoch": 25.38961038961039, "grad_norm": 1.4063057899475098, "learning_rate": 1.1274846990063315e-05, "loss": 0.2447, "step": 3910 }, { "epoch": 25.396103896103895, "grad_norm": 1.359681248664856, "learning_rate": 1.1254981857673985e-05, "loss": 0.228, "step": 3911 }, { "epoch": 25.4025974025974, "grad_norm": 1.2989537715911865, "learning_rate": 1.123513202120422e-05, "loss": 0.2096, "step": 3912 }, { "epoch": 25.40909090909091, "grad_norm": 1.611515760421753, "learning_rate": 1.1215297488490412e-05, "loss": 0.2688, "step": 3913 }, { "epoch": 25.415584415584416, "grad_norm": 1.4984757900238037, "learning_rate": 1.1195478267362924e-05, "loss": 0.2342, "step": 3914 }, { "epoch": 25.42207792207792, "grad_norm": 1.4901241064071655, "learning_rate": 1.1175674365646066e-05, "loss": 0.2412, "step": 3915 }, { "epoch": 25.428571428571427, "grad_norm": 1.2154924869537354, "learning_rate": 1.1155885791158128e-05, "loss": 0.1905, "step": 3916 }, { "epoch": 25.435064935064936, "grad_norm": 1.417709469795227, "learning_rate": 1.1136112551711291e-05, "loss": 0.2331, "step": 3917 }, { "epoch": 25.441558441558442, "grad_norm": 1.5889604091644287, "learning_rate": 1.111635465511175e-05, "loss": 0.2699, "step": 3918 }, { "epoch": 25.448051948051948, "grad_norm": 1.397789716720581, "learning_rate": 1.1096612109159566e-05, "loss": 0.238, "step": 3919 }, { "epoch": 25.454545454545453, "grad_norm": 1.5090736150741577, "learning_rate": 1.1076884921648834e-05, "loss": 0.2615, "step": 3920 }, { "epoch": 25.461038961038962, "grad_norm": 1.356040358543396, "learning_rate": 1.1057173100367496e-05, "loss": 0.2151, "step": 3921 }, { "epoch": 25.467532467532468, "grad_norm": 1.3909372091293335, "learning_rate": 1.10374766530975e-05, "loss": 0.2258, "step": 3922 }, { "epoch": 25.474025974025974, "grad_norm": 1.419009804725647, "learning_rate": 1.101779558761466e-05, "loss": 0.2345, "step": 3923 }, { "epoch": 25.48051948051948, "grad_norm": 1.5796552896499634, "learning_rate": 1.0998129911688764e-05, "loss": 0.2752, "step": 3924 }, { "epoch": 25.48701298701299, "grad_norm": 1.4663971662521362, "learning_rate": 1.0978479633083505e-05, "loss": 0.2403, "step": 3925 }, { "epoch": 25.493506493506494, "grad_norm": 1.5179322957992554, "learning_rate": 1.0958844759556525e-05, "loss": 0.251, "step": 3926 }, { "epoch": 25.5, "grad_norm": 1.5957598686218262, "learning_rate": 1.0939225298859323e-05, "loss": 0.2607, "step": 3927 }, { "epoch": 25.506493506493506, "grad_norm": 1.6642088890075684, "learning_rate": 1.0919621258737383e-05, "loss": 0.2632, "step": 3928 }, { "epoch": 25.51298701298701, "grad_norm": 1.5881017446517944, "learning_rate": 1.0900032646930036e-05, "loss": 0.2629, "step": 3929 }, { "epoch": 25.51948051948052, "grad_norm": 1.7285717725753784, "learning_rate": 1.0880459471170596e-05, "loss": 0.2964, "step": 3930 }, { "epoch": 25.525974025974026, "grad_norm": 1.5745680332183838, "learning_rate": 1.0860901739186208e-05, "loss": 0.2749, "step": 3931 }, { "epoch": 25.532467532467532, "grad_norm": 1.2794368267059326, "learning_rate": 1.0841359458697986e-05, "loss": 0.2167, "step": 3932 }, { "epoch": 25.538961038961038, "grad_norm": 1.5602290630340576, "learning_rate": 1.0821832637420881e-05, "loss": 0.2772, "step": 3933 }, { "epoch": 25.545454545454547, "grad_norm": 1.6337082386016846, "learning_rate": 1.0802321283063794e-05, "loss": 0.2746, "step": 3934 }, { "epoch": 25.551948051948052, "grad_norm": 1.4723968505859375, "learning_rate": 1.0782825403329488e-05, "loss": 0.2414, "step": 3935 }, { "epoch": 25.558441558441558, "grad_norm": 1.4009954929351807, "learning_rate": 1.0763345005914649e-05, "loss": 0.2385, "step": 3936 }, { "epoch": 25.564935064935064, "grad_norm": 1.483906865119934, "learning_rate": 1.0743880098509795e-05, "loss": 0.2547, "step": 3937 }, { "epoch": 25.571428571428573, "grad_norm": 1.4557583332061768, "learning_rate": 1.0724430688799402e-05, "loss": 0.23, "step": 3938 }, { "epoch": 25.57792207792208, "grad_norm": 1.2362334728240967, "learning_rate": 1.0704996784461752e-05, "loss": 0.2084, "step": 3939 }, { "epoch": 25.584415584415584, "grad_norm": 1.6001020669937134, "learning_rate": 1.0685578393169055e-05, "loss": 0.2592, "step": 3940 }, { "epoch": 25.59090909090909, "grad_norm": 1.4896738529205322, "learning_rate": 1.0666175522587402e-05, "loss": 0.2716, "step": 3941 }, { "epoch": 25.5974025974026, "grad_norm": 1.2849098443984985, "learning_rate": 1.0646788180376716e-05, "loss": 0.2057, "step": 3942 }, { "epoch": 25.603896103896105, "grad_norm": 1.3597465753555298, "learning_rate": 1.0627416374190812e-05, "loss": 0.2279, "step": 3943 }, { "epoch": 25.61038961038961, "grad_norm": 1.373323917388916, "learning_rate": 1.060806011167741e-05, "loss": 0.2316, "step": 3944 }, { "epoch": 25.616883116883116, "grad_norm": 1.518390417098999, "learning_rate": 1.0588719400478004e-05, "loss": 0.2373, "step": 3945 }, { "epoch": 25.623376623376622, "grad_norm": 1.3952455520629883, "learning_rate": 1.0569394248228026e-05, "loss": 0.2473, "step": 3946 }, { "epoch": 25.62987012987013, "grad_norm": 1.397457480430603, "learning_rate": 1.0550084662556753e-05, "loss": 0.2262, "step": 3947 }, { "epoch": 25.636363636363637, "grad_norm": 1.5089877843856812, "learning_rate": 1.0530790651087274e-05, "loss": 0.2609, "step": 3948 }, { "epoch": 25.642857142857142, "grad_norm": 1.569136142730713, "learning_rate": 1.0511512221436581e-05, "loss": 0.2408, "step": 3949 }, { "epoch": 25.649350649350648, "grad_norm": 1.5139870643615723, "learning_rate": 1.049224938121548e-05, "loss": 0.2553, "step": 3950 }, { "epoch": 25.655844155844157, "grad_norm": 1.6584709882736206, "learning_rate": 1.0473002138028654e-05, "loss": 0.2851, "step": 3951 }, { "epoch": 25.662337662337663, "grad_norm": 1.4925132989883423, "learning_rate": 1.0453770499474586e-05, "loss": 0.2364, "step": 3952 }, { "epoch": 25.66883116883117, "grad_norm": 1.4464969635009766, "learning_rate": 1.0434554473145648e-05, "loss": 0.2209, "step": 3953 }, { "epoch": 25.675324675324674, "grad_norm": 1.4543795585632324, "learning_rate": 1.0415354066627992e-05, "loss": 0.2299, "step": 3954 }, { "epoch": 25.681818181818183, "grad_norm": 1.5420897006988525, "learning_rate": 1.039616928750165e-05, "loss": 0.2576, "step": 3955 }, { "epoch": 25.68831168831169, "grad_norm": 1.4393539428710938, "learning_rate": 1.0377000143340471e-05, "loss": 0.2618, "step": 3956 }, { "epoch": 25.694805194805195, "grad_norm": 1.6070607900619507, "learning_rate": 1.0357846641712144e-05, "loss": 0.2927, "step": 3957 }, { "epoch": 25.7012987012987, "grad_norm": 1.559675693511963, "learning_rate": 1.0338708790178136e-05, "loss": 0.2696, "step": 3958 }, { "epoch": 25.707792207792206, "grad_norm": 1.412699818611145, "learning_rate": 1.03195865962938e-05, "loss": 0.2521, "step": 3959 }, { "epoch": 25.714285714285715, "grad_norm": 1.3377207517623901, "learning_rate": 1.0300480067608231e-05, "loss": 0.2103, "step": 3960 }, { "epoch": 25.72077922077922, "grad_norm": 1.233055591583252, "learning_rate": 1.028138921166444e-05, "loss": 0.1781, "step": 3961 }, { "epoch": 25.727272727272727, "grad_norm": 1.316535234451294, "learning_rate": 1.026231403599915e-05, "loss": 0.216, "step": 3962 }, { "epoch": 25.733766233766232, "grad_norm": 1.3665781021118164, "learning_rate": 1.0243254548142967e-05, "loss": 0.242, "step": 3963 }, { "epoch": 25.74025974025974, "grad_norm": 1.623543381690979, "learning_rate": 1.0224210755620256e-05, "loss": 0.2459, "step": 3964 }, { "epoch": 25.746753246753247, "grad_norm": 1.4611620903015137, "learning_rate": 1.0205182665949208e-05, "loss": 0.2412, "step": 3965 }, { "epoch": 25.753246753246753, "grad_norm": 1.716041088104248, "learning_rate": 1.0186170286641816e-05, "loss": 0.2478, "step": 3966 }, { "epoch": 25.75974025974026, "grad_norm": 1.4261060953140259, "learning_rate": 1.016717362520388e-05, "loss": 0.2266, "step": 3967 }, { "epoch": 25.766233766233768, "grad_norm": 1.207743763923645, "learning_rate": 1.014819268913495e-05, "loss": 0.1878, "step": 3968 }, { "epoch": 25.772727272727273, "grad_norm": 1.6146961450576782, "learning_rate": 1.0129227485928433e-05, "loss": 0.2691, "step": 3969 }, { "epoch": 25.77922077922078, "grad_norm": 1.3689700365066528, "learning_rate": 1.0110278023071446e-05, "loss": 0.2068, "step": 3970 }, { "epoch": 25.785714285714285, "grad_norm": 1.5973433256149292, "learning_rate": 1.0091344308044986e-05, "loss": 0.266, "step": 3971 }, { "epoch": 25.792207792207794, "grad_norm": 1.407936453819275, "learning_rate": 1.0072426348323754e-05, "loss": 0.2537, "step": 3972 }, { "epoch": 25.7987012987013, "grad_norm": 1.4229836463928223, "learning_rate": 1.005352415137628e-05, "loss": 0.232, "step": 3973 }, { "epoch": 25.805194805194805, "grad_norm": 1.326256513595581, "learning_rate": 1.003463772466483e-05, "loss": 0.2184, "step": 3974 }, { "epoch": 25.81168831168831, "grad_norm": 1.5608634948730469, "learning_rate": 1.0015767075645471e-05, "loss": 0.271, "step": 3975 }, { "epoch": 25.818181818181817, "grad_norm": 1.5370579957962036, "learning_rate": 9.99691221176805e-06, "loss": 0.2628, "step": 3976 }, { "epoch": 25.824675324675326, "grad_norm": 1.5833702087402344, "learning_rate": 9.978073140476169e-06, "loss": 0.2695, "step": 3977 }, { "epoch": 25.83116883116883, "grad_norm": 1.282939076423645, "learning_rate": 9.959249869207177e-06, "loss": 0.1994, "step": 3978 }, { "epoch": 25.837662337662337, "grad_norm": 1.60276460647583, "learning_rate": 9.940442405392226e-06, "loss": 0.2679, "step": 3979 }, { "epoch": 25.844155844155843, "grad_norm": 1.4950294494628906, "learning_rate": 9.921650756456164e-06, "loss": 0.2748, "step": 3980 }, { "epoch": 25.850649350649352, "grad_norm": 1.303539514541626, "learning_rate": 9.902874929817696e-06, "loss": 0.2305, "step": 3981 }, { "epoch": 25.857142857142858, "grad_norm": 1.7113896608352661, "learning_rate": 9.884114932889171e-06, "loss": 0.2895, "step": 3982 }, { "epoch": 25.863636363636363, "grad_norm": 1.3244531154632568, "learning_rate": 9.865370773076771e-06, "loss": 0.2274, "step": 3983 }, { "epoch": 25.87012987012987, "grad_norm": 1.6126139163970947, "learning_rate": 9.846642457780369e-06, "loss": 0.2334, "step": 3984 }, { "epoch": 25.876623376623378, "grad_norm": 1.5527100563049316, "learning_rate": 9.82792999439362e-06, "loss": 0.2803, "step": 3985 }, { "epoch": 25.883116883116884, "grad_norm": 1.1837865114212036, "learning_rate": 9.809233390303901e-06, "loss": 0.1785, "step": 3986 }, { "epoch": 25.88961038961039, "grad_norm": 1.6036617755889893, "learning_rate": 9.79055265289236e-06, "loss": 0.2846, "step": 3987 }, { "epoch": 25.896103896103895, "grad_norm": 1.4677743911743164, "learning_rate": 9.771887789533818e-06, "loss": 0.2398, "step": 3988 }, { "epoch": 25.9025974025974, "grad_norm": 1.6012471914291382, "learning_rate": 9.753238807596903e-06, "loss": 0.2752, "step": 3989 }, { "epoch": 25.90909090909091, "grad_norm": 1.5253721475601196, "learning_rate": 9.734605714443906e-06, "loss": 0.2424, "step": 3990 }, { "epoch": 25.915584415584416, "grad_norm": 1.2777734994888306, "learning_rate": 9.715988517430896e-06, "loss": 0.1913, "step": 3991 }, { "epoch": 25.92207792207792, "grad_norm": 1.486307144165039, "learning_rate": 9.69738722390765e-06, "loss": 0.2496, "step": 3992 }, { "epoch": 25.928571428571427, "grad_norm": 1.363534688949585, "learning_rate": 9.67880184121765e-06, "loss": 0.2435, "step": 3993 }, { "epoch": 25.935064935064936, "grad_norm": 1.6092283725738525, "learning_rate": 9.66023237669812e-06, "loss": 0.2837, "step": 3994 }, { "epoch": 25.941558441558442, "grad_norm": 1.4709584712982178, "learning_rate": 9.641678837679985e-06, "loss": 0.2336, "step": 3995 }, { "epoch": 25.948051948051948, "grad_norm": 1.4642581939697266, "learning_rate": 9.623141231487904e-06, "loss": 0.2377, "step": 3996 }, { "epoch": 25.954545454545453, "grad_norm": 1.448259949684143, "learning_rate": 9.60461956544021e-06, "loss": 0.2461, "step": 3997 }, { "epoch": 25.961038961038962, "grad_norm": 1.3671592473983765, "learning_rate": 9.586113846848982e-06, "loss": 0.2306, "step": 3998 }, { "epoch": 25.967532467532468, "grad_norm": 1.414479374885559, "learning_rate": 9.567624083019966e-06, "loss": 0.2298, "step": 3999 }, { "epoch": 25.974025974025974, "grad_norm": 1.5224496126174927, "learning_rate": 9.549150281252633e-06, "loss": 0.2419, "step": 4000 }, { "epoch": 25.98051948051948, "grad_norm": 1.6472277641296387, "learning_rate": 9.53069244884015e-06, "loss": 0.2956, "step": 4001 }, { "epoch": 25.98701298701299, "grad_norm": 1.5626544952392578, "learning_rate": 9.512250593069393e-06, "loss": 0.2523, "step": 4002 }, { "epoch": 25.993506493506494, "grad_norm": 1.4781023263931274, "learning_rate": 9.49382472122089e-06, "loss": 0.2529, "step": 4003 }, { "epoch": 26.0, "grad_norm": 196.38731384277344, "learning_rate": 9.475414840568897e-06, "loss": 0.2654, "step": 4004 }, { "epoch": 26.006493506493506, "grad_norm": 1.47327721118927, "learning_rate": 9.457020958381324e-06, "loss": 0.2437, "step": 4005 }, { "epoch": 26.01298701298701, "grad_norm": 1.4836572408676147, "learning_rate": 9.438643081919818e-06, "loss": 0.2329, "step": 4006 }, { "epoch": 26.01948051948052, "grad_norm": 1.374118447303772, "learning_rate": 9.420281218439648e-06, "loss": 0.2402, "step": 4007 }, { "epoch": 26.025974025974026, "grad_norm": 1.3078553676605225, "learning_rate": 9.401935375189801e-06, "loss": 0.2055, "step": 4008 }, { "epoch": 26.032467532467532, "grad_norm": 1.5128684043884277, "learning_rate": 9.383605559412911e-06, "loss": 0.2596, "step": 4009 }, { "epoch": 26.038961038961038, "grad_norm": 1.337986707687378, "learning_rate": 9.365291778345303e-06, "loss": 0.2274, "step": 4010 }, { "epoch": 26.045454545454547, "grad_norm": 1.3085969686508179, "learning_rate": 9.346994039216972e-06, "loss": 0.2242, "step": 4011 }, { "epoch": 26.051948051948052, "grad_norm": 1.3607500791549683, "learning_rate": 9.32871234925159e-06, "loss": 0.2243, "step": 4012 }, { "epoch": 26.058441558441558, "grad_norm": 1.471604824066162, "learning_rate": 9.310446715666449e-06, "loss": 0.2227, "step": 4013 }, { "epoch": 26.064935064935064, "grad_norm": 1.252761721611023, "learning_rate": 9.29219714567256e-06, "loss": 0.2129, "step": 4014 }, { "epoch": 26.071428571428573, "grad_norm": 1.451578974723816, "learning_rate": 9.273963646474526e-06, "loss": 0.2071, "step": 4015 }, { "epoch": 26.07792207792208, "grad_norm": 1.514508605003357, "learning_rate": 9.25574622527069e-06, "loss": 0.234, "step": 4016 }, { "epoch": 26.084415584415584, "grad_norm": 1.3731317520141602, "learning_rate": 9.237544889252969e-06, "loss": 0.2252, "step": 4017 }, { "epoch": 26.09090909090909, "grad_norm": 1.4656537771224976, "learning_rate": 9.219359645606989e-06, "loss": 0.2408, "step": 4018 }, { "epoch": 26.0974025974026, "grad_norm": 1.3962067365646362, "learning_rate": 9.201190501511964e-06, "loss": 0.2093, "step": 4019 }, { "epoch": 26.103896103896105, "grad_norm": 1.4254004955291748, "learning_rate": 9.183037464140798e-06, "loss": 0.2474, "step": 4020 }, { "epoch": 26.11038961038961, "grad_norm": 1.3458423614501953, "learning_rate": 9.164900540660032e-06, "loss": 0.2279, "step": 4021 }, { "epoch": 26.116883116883116, "grad_norm": 1.3686282634735107, "learning_rate": 9.146779738229838e-06, "loss": 0.2343, "step": 4022 }, { "epoch": 26.123376623376622, "grad_norm": 1.2849996089935303, "learning_rate": 9.128675064004e-06, "loss": 0.2022, "step": 4023 }, { "epoch": 26.12987012987013, "grad_norm": 1.3870078325271606, "learning_rate": 9.110586525129989e-06, "loss": 0.2232, "step": 4024 }, { "epoch": 26.136363636363637, "grad_norm": 1.4612501859664917, "learning_rate": 9.09251412874882e-06, "loss": 0.2491, "step": 4025 }, { "epoch": 26.142857142857142, "grad_norm": 1.5044692754745483, "learning_rate": 9.074457881995257e-06, "loss": 0.2268, "step": 4026 }, { "epoch": 26.149350649350648, "grad_norm": 1.1951324939727783, "learning_rate": 9.056417791997568e-06, "loss": 0.1935, "step": 4027 }, { "epoch": 26.155844155844157, "grad_norm": 1.4967836141586304, "learning_rate": 9.038393865877725e-06, "loss": 0.2409, "step": 4028 }, { "epoch": 26.162337662337663, "grad_norm": 1.5848519802093506, "learning_rate": 9.020386110751266e-06, "loss": 0.2603, "step": 4029 }, { "epoch": 26.16883116883117, "grad_norm": 1.2820090055465698, "learning_rate": 9.002394533727382e-06, "loss": 0.2294, "step": 4030 }, { "epoch": 26.175324675324674, "grad_norm": 1.4881724119186401, "learning_rate": 8.984419141908856e-06, "loss": 0.2225, "step": 4031 }, { "epoch": 26.181818181818183, "grad_norm": 1.5716921091079712, "learning_rate": 8.966459942392109e-06, "loss": 0.2483, "step": 4032 }, { "epoch": 26.18831168831169, "grad_norm": 1.3901420831680298, "learning_rate": 8.94851694226711e-06, "loss": 0.2472, "step": 4033 }, { "epoch": 26.194805194805195, "grad_norm": 1.4638257026672363, "learning_rate": 8.930590148617512e-06, "loss": 0.2355, "step": 4034 }, { "epoch": 26.2012987012987, "grad_norm": 1.3153793811798096, "learning_rate": 8.912679568520494e-06, "loss": 0.2211, "step": 4035 }, { "epoch": 26.207792207792206, "grad_norm": 1.4137687683105469, "learning_rate": 8.894785209046885e-06, "loss": 0.2216, "step": 4036 }, { "epoch": 26.214285714285715, "grad_norm": 1.4965516328811646, "learning_rate": 8.876907077261093e-06, "loss": 0.2555, "step": 4037 }, { "epoch": 26.22077922077922, "grad_norm": 1.3542468547821045, "learning_rate": 8.859045180221138e-06, "loss": 0.2145, "step": 4038 }, { "epoch": 26.227272727272727, "grad_norm": 1.3804082870483398, "learning_rate": 8.841199524978583e-06, "loss": 0.2303, "step": 4039 }, { "epoch": 26.233766233766232, "grad_norm": 1.5143622159957886, "learning_rate": 8.823370118578623e-06, "loss": 0.2305, "step": 4040 }, { "epoch": 26.24025974025974, "grad_norm": 1.3844674825668335, "learning_rate": 8.805556968060048e-06, "loss": 0.2264, "step": 4041 }, { "epoch": 26.246753246753247, "grad_norm": 1.2946738004684448, "learning_rate": 8.78776008045517e-06, "loss": 0.1973, "step": 4042 }, { "epoch": 26.253246753246753, "grad_norm": 1.3786653280258179, "learning_rate": 8.769979462789957e-06, "loss": 0.2361, "step": 4043 }, { "epoch": 26.25974025974026, "grad_norm": 1.5263012647628784, "learning_rate": 8.752215122083873e-06, "loss": 0.2525, "step": 4044 }, { "epoch": 26.266233766233768, "grad_norm": 1.4955079555511475, "learning_rate": 8.734467065350022e-06, "loss": 0.2487, "step": 4045 }, { "epoch": 26.272727272727273, "grad_norm": 1.5463720560073853, "learning_rate": 8.716735299595059e-06, "loss": 0.2551, "step": 4046 }, { "epoch": 26.27922077922078, "grad_norm": 1.4916661977767944, "learning_rate": 8.699019831819205e-06, "loss": 0.2526, "step": 4047 }, { "epoch": 26.285714285714285, "grad_norm": 1.565500020980835, "learning_rate": 8.68132066901623e-06, "loss": 0.2726, "step": 4048 }, { "epoch": 26.292207792207794, "grad_norm": 1.425245761871338, "learning_rate": 8.663637818173504e-06, "loss": 0.2256, "step": 4049 }, { "epoch": 26.2987012987013, "grad_norm": 1.3212052583694458, "learning_rate": 8.645971286271904e-06, "loss": 0.204, "step": 4050 }, { "epoch": 26.305194805194805, "grad_norm": 1.4682427644729614, "learning_rate": 8.628321080285939e-06, "loss": 0.2575, "step": 4051 }, { "epoch": 26.31168831168831, "grad_norm": 1.4816293716430664, "learning_rate": 8.610687207183605e-06, "loss": 0.2284, "step": 4052 }, { "epoch": 26.318181818181817, "grad_norm": 1.5263566970825195, "learning_rate": 8.59306967392649e-06, "loss": 0.2819, "step": 4053 }, { "epoch": 26.324675324675326, "grad_norm": 1.5160341262817383, "learning_rate": 8.575468487469696e-06, "loss": 0.231, "step": 4054 }, { "epoch": 26.33116883116883, "grad_norm": 1.5796668529510498, "learning_rate": 8.557883654761906e-06, "loss": 0.2755, "step": 4055 }, { "epoch": 26.337662337662337, "grad_norm": 1.5855549573898315, "learning_rate": 8.540315182745328e-06, "loss": 0.2666, "step": 4056 }, { "epoch": 26.344155844155843, "grad_norm": 1.5372071266174316, "learning_rate": 8.52276307835574e-06, "loss": 0.2274, "step": 4057 }, { "epoch": 26.350649350649352, "grad_norm": 1.536356806755066, "learning_rate": 8.505227348522404e-06, "loss": 0.2361, "step": 4058 }, { "epoch": 26.357142857142858, "grad_norm": 1.5033609867095947, "learning_rate": 8.48770800016817e-06, "loss": 0.2787, "step": 4059 }, { "epoch": 26.363636363636363, "grad_norm": 1.4871630668640137, "learning_rate": 8.470205040209361e-06, "loss": 0.2722, "step": 4060 }, { "epoch": 26.37012987012987, "grad_norm": 1.4464274644851685, "learning_rate": 8.452718475555927e-06, "loss": 0.2347, "step": 4061 }, { "epoch": 26.376623376623378, "grad_norm": 1.3340239524841309, "learning_rate": 8.435248313111243e-06, "loss": 0.2189, "step": 4062 }, { "epoch": 26.383116883116884, "grad_norm": 1.2462677955627441, "learning_rate": 8.417794559772273e-06, "loss": 0.2077, "step": 4063 }, { "epoch": 26.38961038961039, "grad_norm": 1.5940028429031372, "learning_rate": 8.400357222429472e-06, "loss": 0.2459, "step": 4064 }, { "epoch": 26.396103896103895, "grad_norm": 1.2997297048568726, "learning_rate": 8.382936307966838e-06, "loss": 0.2293, "step": 4065 }, { "epoch": 26.4025974025974, "grad_norm": 1.4336843490600586, "learning_rate": 8.365531823261841e-06, "loss": 0.2216, "step": 4066 }, { "epoch": 26.40909090909091, "grad_norm": 1.4503132104873657, "learning_rate": 8.348143775185536e-06, "loss": 0.2101, "step": 4067 }, { "epoch": 26.415584415584416, "grad_norm": 1.2731194496154785, "learning_rate": 8.330772170602419e-06, "loss": 0.1976, "step": 4068 }, { "epoch": 26.42207792207792, "grad_norm": 1.4676200151443481, "learning_rate": 8.31341701637055e-06, "loss": 0.2473, "step": 4069 }, { "epoch": 26.428571428571427, "grad_norm": 1.5888464450836182, "learning_rate": 8.296078319341443e-06, "loss": 0.2453, "step": 4070 }, { "epoch": 26.435064935064936, "grad_norm": 1.3592251539230347, "learning_rate": 8.278756086360156e-06, "loss": 0.2113, "step": 4071 }, { "epoch": 26.441558441558442, "grad_norm": 1.4380148649215698, "learning_rate": 8.261450324265224e-06, "loss": 0.2346, "step": 4072 }, { "epoch": 26.448051948051948, "grad_norm": 1.3315917253494263, "learning_rate": 8.244161039888709e-06, "loss": 0.2124, "step": 4073 }, { "epoch": 26.454545454545453, "grad_norm": 1.4123307466506958, "learning_rate": 8.226888240056113e-06, "loss": 0.2057, "step": 4074 }, { "epoch": 26.461038961038962, "grad_norm": 1.4434168338775635, "learning_rate": 8.209631931586498e-06, "loss": 0.2281, "step": 4075 }, { "epoch": 26.467532467532468, "grad_norm": 1.5822036266326904, "learning_rate": 8.192392121292336e-06, "loss": 0.2631, "step": 4076 }, { "epoch": 26.474025974025974, "grad_norm": 1.4746819734573364, "learning_rate": 8.17516881597969e-06, "loss": 0.2368, "step": 4077 }, { "epoch": 26.48051948051948, "grad_norm": 1.4489593505859375, "learning_rate": 8.157962022448001e-06, "loss": 0.2325, "step": 4078 }, { "epoch": 26.48701298701299, "grad_norm": 1.5484001636505127, "learning_rate": 8.140771747490273e-06, "loss": 0.2521, "step": 4079 }, { "epoch": 26.493506493506494, "grad_norm": 1.397491693496704, "learning_rate": 8.123597997892918e-06, "loss": 0.2303, "step": 4080 }, { "epoch": 26.5, "grad_norm": 1.4493643045425415, "learning_rate": 8.106440780435882e-06, "loss": 0.2396, "step": 4081 }, { "epoch": 26.506493506493506, "grad_norm": 1.4584861993789673, "learning_rate": 8.08930010189256e-06, "loss": 0.2357, "step": 4082 }, { "epoch": 26.51298701298701, "grad_norm": 1.4024105072021484, "learning_rate": 8.072175969029832e-06, "loss": 0.239, "step": 4083 }, { "epoch": 26.51948051948052, "grad_norm": 1.2045546770095825, "learning_rate": 8.05506838860801e-06, "loss": 0.1859, "step": 4084 }, { "epoch": 26.525974025974026, "grad_norm": 1.5479137897491455, "learning_rate": 8.037977367380922e-06, "loss": 0.2464, "step": 4085 }, { "epoch": 26.532467532467532, "grad_norm": 1.3793410062789917, "learning_rate": 8.020902912095806e-06, "loss": 0.2231, "step": 4086 }, { "epoch": 26.538961038961038, "grad_norm": 1.6801488399505615, "learning_rate": 8.003845029493407e-06, "loss": 0.2728, "step": 4087 }, { "epoch": 26.545454545454547, "grad_norm": 1.472559928894043, "learning_rate": 7.9868037263079e-06, "loss": 0.2354, "step": 4088 }, { "epoch": 26.551948051948052, "grad_norm": 1.493683934211731, "learning_rate": 7.969779009266915e-06, "loss": 0.2599, "step": 4089 }, { "epoch": 26.558441558441558, "grad_norm": 1.5684465169906616, "learning_rate": 7.952770885091548e-06, "loss": 0.2432, "step": 4090 }, { "epoch": 26.564935064935064, "grad_norm": 1.5496026277542114, "learning_rate": 7.935779360496332e-06, "loss": 0.247, "step": 4091 }, { "epoch": 26.571428571428573, "grad_norm": 1.4261456727981567, "learning_rate": 7.91880444218927e-06, "loss": 0.2347, "step": 4092 }, { "epoch": 26.57792207792208, "grad_norm": 1.5223562717437744, "learning_rate": 7.901846136871766e-06, "loss": 0.2506, "step": 4093 }, { "epoch": 26.584415584415584, "grad_norm": 1.4474468231201172, "learning_rate": 7.884904451238712e-06, "loss": 0.2331, "step": 4094 }, { "epoch": 26.59090909090909, "grad_norm": 1.5187584161758423, "learning_rate": 7.867979391978397e-06, "loss": 0.242, "step": 4095 }, { "epoch": 26.5974025974026, "grad_norm": 1.517674446105957, "learning_rate": 7.851070965772572e-06, "loss": 0.2318, "step": 4096 }, { "epoch": 26.603896103896105, "grad_norm": 1.4315662384033203, "learning_rate": 7.834179179296419e-06, "loss": 0.2203, "step": 4097 }, { "epoch": 26.61038961038961, "grad_norm": 1.5745152235031128, "learning_rate": 7.817304039218559e-06, "loss": 0.2702, "step": 4098 }, { "epoch": 26.616883116883116, "grad_norm": 1.1795282363891602, "learning_rate": 7.800445552201008e-06, "loss": 0.1923, "step": 4099 }, { "epoch": 26.623376623376622, "grad_norm": 1.4204246997833252, "learning_rate": 7.783603724899257e-06, "loss": 0.2169, "step": 4100 }, { "epoch": 26.62987012987013, "grad_norm": 1.317226767539978, "learning_rate": 7.76677856396215e-06, "loss": 0.2133, "step": 4101 }, { "epoch": 26.636363636363637, "grad_norm": 1.1443227529525757, "learning_rate": 7.749970076032049e-06, "loss": 0.1632, "step": 4102 }, { "epoch": 26.642857142857142, "grad_norm": 1.461870551109314, "learning_rate": 7.733178267744633e-06, "loss": 0.2531, "step": 4103 }, { "epoch": 26.649350649350648, "grad_norm": 1.4633764028549194, "learning_rate": 7.716403145729074e-06, "loss": 0.2172, "step": 4104 }, { "epoch": 26.655844155844157, "grad_norm": 1.5518317222595215, "learning_rate": 7.699644716607895e-06, "loss": 0.2432, "step": 4105 }, { "epoch": 26.662337662337663, "grad_norm": 1.4457675218582153, "learning_rate": 7.682902986997076e-06, "loss": 0.2307, "step": 4106 }, { "epoch": 26.66883116883117, "grad_norm": 1.2934722900390625, "learning_rate": 7.666177963505988e-06, "loss": 0.206, "step": 4107 }, { "epoch": 26.675324675324674, "grad_norm": 1.6599860191345215, "learning_rate": 7.649469652737406e-06, "loss": 0.2682, "step": 4108 }, { "epoch": 26.681818181818183, "grad_norm": 1.4232659339904785, "learning_rate": 7.632778061287493e-06, "loss": 0.228, "step": 4109 }, { "epoch": 26.68831168831169, "grad_norm": 1.3405925035476685, "learning_rate": 7.6161031957458494e-06, "loss": 0.2201, "step": 4110 }, { "epoch": 26.694805194805195, "grad_norm": 1.3626002073287964, "learning_rate": 7.59944506269541e-06, "loss": 0.2202, "step": 4111 }, { "epoch": 26.7012987012987, "grad_norm": 1.4566465616226196, "learning_rate": 7.582803668712579e-06, "loss": 0.2343, "step": 4112 }, { "epoch": 26.707792207792206, "grad_norm": 1.165060043334961, "learning_rate": 7.5661790203670975e-06, "loss": 0.1737, "step": 4113 }, { "epoch": 26.714285714285715, "grad_norm": 1.4785093069076538, "learning_rate": 7.549571124222127e-06, "loss": 0.2332, "step": 4114 }, { "epoch": 26.72077922077922, "grad_norm": 1.4225431680679321, "learning_rate": 7.532979986834176e-06, "loss": 0.2329, "step": 4115 }, { "epoch": 26.727272727272727, "grad_norm": 1.4431058168411255, "learning_rate": 7.51640561475318e-06, "loss": 0.2587, "step": 4116 }, { "epoch": 26.733766233766232, "grad_norm": 1.2011971473693848, "learning_rate": 7.499848014522437e-06, "loss": 0.1974, "step": 4117 }, { "epoch": 26.74025974025974, "grad_norm": 1.5450186729431152, "learning_rate": 7.48330719267864e-06, "loss": 0.2428, "step": 4118 }, { "epoch": 26.746753246753247, "grad_norm": 1.4603906869888306, "learning_rate": 7.4667831557518165e-06, "loss": 0.2333, "step": 4119 }, { "epoch": 26.753246753246753, "grad_norm": 1.54874587059021, "learning_rate": 7.450275910265414e-06, "loss": 0.2402, "step": 4120 }, { "epoch": 26.75974025974026, "grad_norm": 1.496936321258545, "learning_rate": 7.433785462736209e-06, "loss": 0.2517, "step": 4121 }, { "epoch": 26.766233766233768, "grad_norm": 1.3329963684082031, "learning_rate": 7.4173118196744e-06, "loss": 0.2257, "step": 4122 }, { "epoch": 26.772727272727273, "grad_norm": 1.6605186462402344, "learning_rate": 7.4008549875835e-06, "loss": 0.273, "step": 4123 }, { "epoch": 26.77922077922078, "grad_norm": 1.4919471740722656, "learning_rate": 7.384414972960418e-06, "loss": 0.245, "step": 4124 }, { "epoch": 26.785714285714285, "grad_norm": 1.5525104999542236, "learning_rate": 7.367991782295391e-06, "loss": 0.2396, "step": 4125 }, { "epoch": 26.792207792207794, "grad_norm": 1.4189823865890503, "learning_rate": 7.351585422072049e-06, "loss": 0.2278, "step": 4126 }, { "epoch": 26.7987012987013, "grad_norm": 1.3449676036834717, "learning_rate": 7.335195898767366e-06, "loss": 0.2282, "step": 4127 }, { "epoch": 26.805194805194805, "grad_norm": 1.3542174100875854, "learning_rate": 7.318823218851667e-06, "loss": 0.2207, "step": 4128 }, { "epoch": 26.81168831168831, "grad_norm": 1.6442121267318726, "learning_rate": 7.302467388788614e-06, "loss": 0.2679, "step": 4129 }, { "epoch": 26.818181818181817, "grad_norm": 1.5656448602676392, "learning_rate": 7.286128415035248e-06, "loss": 0.304, "step": 4130 }, { "epoch": 26.824675324675326, "grad_norm": 1.4578937292099, "learning_rate": 7.269806304041915e-06, "loss": 0.2385, "step": 4131 }, { "epoch": 26.83116883116883, "grad_norm": 1.3822400569915771, "learning_rate": 7.253501062252338e-06, "loss": 0.2303, "step": 4132 }, { "epoch": 26.837662337662337, "grad_norm": 1.3357703685760498, "learning_rate": 7.237212696103568e-06, "loss": 0.1941, "step": 4133 }, { "epoch": 26.844155844155843, "grad_norm": 1.5666699409484863, "learning_rate": 7.220941212026005e-06, "loss": 0.2635, "step": 4134 }, { "epoch": 26.850649350649352, "grad_norm": 1.4458389282226562, "learning_rate": 7.204686616443351e-06, "loss": 0.2462, "step": 4135 }, { "epoch": 26.857142857142858, "grad_norm": 1.5157705545425415, "learning_rate": 7.188448915772672e-06, "loss": 0.2285, "step": 4136 }, { "epoch": 26.863636363636363, "grad_norm": 1.595637559890747, "learning_rate": 7.172228116424373e-06, "loss": 0.2601, "step": 4137 }, { "epoch": 26.87012987012987, "grad_norm": 1.531641960144043, "learning_rate": 7.156024224802138e-06, "loss": 0.2627, "step": 4138 }, { "epoch": 26.876623376623378, "grad_norm": 1.3616324663162231, "learning_rate": 7.139837247303028e-06, "loss": 0.2064, "step": 4139 }, { "epoch": 26.883116883116884, "grad_norm": 1.6820498704910278, "learning_rate": 7.123667190317396e-06, "loss": 0.2835, "step": 4140 }, { "epoch": 26.88961038961039, "grad_norm": 1.753391981124878, "learning_rate": 7.107514060228921e-06, "loss": 0.286, "step": 4141 }, { "epoch": 26.896103896103895, "grad_norm": 1.2197531461715698, "learning_rate": 7.091377863414611e-06, "loss": 0.1978, "step": 4142 }, { "epoch": 26.9025974025974, "grad_norm": 1.5755234956741333, "learning_rate": 7.075258606244789e-06, "loss": 0.2461, "step": 4143 }, { "epoch": 26.90909090909091, "grad_norm": 1.485494613647461, "learning_rate": 7.059156295083064e-06, "loss": 0.2232, "step": 4144 }, { "epoch": 26.915584415584416, "grad_norm": 1.684638500213623, "learning_rate": 7.043070936286389e-06, "loss": 0.2505, "step": 4145 }, { "epoch": 26.92207792207792, "grad_norm": 1.4940664768218994, "learning_rate": 7.027002536204985e-06, "loss": 0.2687, "step": 4146 }, { "epoch": 26.928571428571427, "grad_norm": 1.4867305755615234, "learning_rate": 7.010951101182439e-06, "loss": 0.2554, "step": 4147 }, { "epoch": 26.935064935064936, "grad_norm": 1.4521554708480835, "learning_rate": 6.9949166375555704e-06, "loss": 0.2519, "step": 4148 }, { "epoch": 26.941558441558442, "grad_norm": 1.4833617210388184, "learning_rate": 6.978899151654555e-06, "loss": 0.2298, "step": 4149 }, { "epoch": 26.948051948051948, "grad_norm": 1.4452197551727295, "learning_rate": 6.962898649802823e-06, "loss": 0.2371, "step": 4150 }, { "epoch": 26.954545454545453, "grad_norm": 1.5047880411148071, "learning_rate": 6.946915138317122e-06, "loss": 0.239, "step": 4151 }, { "epoch": 26.961038961038962, "grad_norm": 1.5090023279190063, "learning_rate": 6.930948623507505e-06, "loss": 0.2293, "step": 4152 }, { "epoch": 26.967532467532468, "grad_norm": 1.3914098739624023, "learning_rate": 6.914999111677295e-06, "loss": 0.2276, "step": 4153 }, { "epoch": 26.974025974025974, "grad_norm": 1.3756853342056274, "learning_rate": 6.899066609123089e-06, "loss": 0.22, "step": 4154 }, { "epoch": 26.98051948051948, "grad_norm": 1.4306117296218872, "learning_rate": 6.883151122134812e-06, "loss": 0.2568, "step": 4155 }, { "epoch": 26.98701298701299, "grad_norm": 1.4045994281768799, "learning_rate": 6.86725265699561e-06, "loss": 0.2124, "step": 4156 }, { "epoch": 26.993506493506494, "grad_norm": 1.639349341392517, "learning_rate": 6.851371219981989e-06, "loss": 0.2644, "step": 4157 }, { "epoch": 27.0, "grad_norm": 1257.550537109375, "learning_rate": 6.835506817363657e-06, "loss": 0.1975, "step": 4158 }, { "epoch": 27.006493506493506, "grad_norm": 1.4366843700408936, "learning_rate": 6.8196594554036486e-06, "loss": 0.251, "step": 4159 }, { "epoch": 27.01298701298701, "grad_norm": 1.478812336921692, "learning_rate": 6.803829140358237e-06, "loss": 0.2371, "step": 4160 }, { "epoch": 27.01948051948052, "grad_norm": 1.4175927639007568, "learning_rate": 6.788015878476983e-06, "loss": 0.2397, "step": 4161 }, { "epoch": 27.025974025974026, "grad_norm": 1.3861377239227295, "learning_rate": 6.772219676002711e-06, "loss": 0.2271, "step": 4162 }, { "epoch": 27.032467532467532, "grad_norm": 1.503459095954895, "learning_rate": 6.756440539171533e-06, "loss": 0.248, "step": 4163 }, { "epoch": 27.038961038961038, "grad_norm": 2.0790321826934814, "learning_rate": 6.740678474212769e-06, "loss": 0.2654, "step": 4164 }, { "epoch": 27.045454545454547, "grad_norm": 1.4396727085113525, "learning_rate": 6.724933487349061e-06, "loss": 0.2225, "step": 4165 }, { "epoch": 27.051948051948052, "grad_norm": 1.1916790008544922, "learning_rate": 6.709205584796241e-06, "loss": 0.2038, "step": 4166 }, { "epoch": 27.058441558441558, "grad_norm": 1.3912105560302734, "learning_rate": 6.693494772763486e-06, "loss": 0.2284, "step": 4167 }, { "epoch": 27.064935064935064, "grad_norm": 1.3174680471420288, "learning_rate": 6.677801057453142e-06, "loss": 0.2022, "step": 4168 }, { "epoch": 27.071428571428573, "grad_norm": 1.5365556478500366, "learning_rate": 6.662124445060863e-06, "loss": 0.2601, "step": 4169 }, { "epoch": 27.07792207792208, "grad_norm": 1.1626392602920532, "learning_rate": 6.646464941775499e-06, "loss": 0.1607, "step": 4170 }, { "epoch": 27.084415584415584, "grad_norm": 1.5728877782821655, "learning_rate": 6.6308225537791925e-06, "loss": 0.275, "step": 4171 }, { "epoch": 27.09090909090909, "grad_norm": 1.5004632472991943, "learning_rate": 6.615197287247299e-06, "loss": 0.2425, "step": 4172 }, { "epoch": 27.0974025974026, "grad_norm": 1.347186803817749, "learning_rate": 6.599589148348451e-06, "loss": 0.1976, "step": 4173 }, { "epoch": 27.103896103896105, "grad_norm": 1.270532488822937, "learning_rate": 6.583998143244463e-06, "loss": 0.1878, "step": 4174 }, { "epoch": 27.11038961038961, "grad_norm": 1.3854254484176636, "learning_rate": 6.568424278090446e-06, "loss": 0.2287, "step": 4175 }, { "epoch": 27.116883116883116, "grad_norm": 1.5932155847549438, "learning_rate": 6.552867559034687e-06, "loss": 0.2539, "step": 4176 }, { "epoch": 27.123376623376622, "grad_norm": 1.193563461303711, "learning_rate": 6.5373279922187445e-06, "loss": 0.1794, "step": 4177 }, { "epoch": 27.12987012987013, "grad_norm": 1.4257055521011353, "learning_rate": 6.521805583777396e-06, "loss": 0.2197, "step": 4178 }, { "epoch": 27.136363636363637, "grad_norm": 1.4581280946731567, "learning_rate": 6.506300339838656e-06, "loss": 0.2366, "step": 4179 }, { "epoch": 27.142857142857142, "grad_norm": 1.4068759679794312, "learning_rate": 6.490812266523716e-06, "loss": 0.2424, "step": 4180 }, { "epoch": 27.149350649350648, "grad_norm": 1.4506574869155884, "learning_rate": 6.4753413699470465e-06, "loss": 0.211, "step": 4181 }, { "epoch": 27.155844155844157, "grad_norm": 1.4119222164154053, "learning_rate": 6.459887656216313e-06, "loss": 0.2362, "step": 4182 }, { "epoch": 27.162337662337663, "grad_norm": 1.5075432062149048, "learning_rate": 6.444451131432383e-06, "loss": 0.2343, "step": 4183 }, { "epoch": 27.16883116883117, "grad_norm": 1.2462363243103027, "learning_rate": 6.429031801689361e-06, "loss": 0.2094, "step": 4184 }, { "epoch": 27.175324675324674, "grad_norm": 1.1130222082138062, "learning_rate": 6.413629673074561e-06, "loss": 0.1708, "step": 4185 }, { "epoch": 27.181818181818183, "grad_norm": 1.4629265069961548, "learning_rate": 6.39824475166848e-06, "loss": 0.2161, "step": 4186 }, { "epoch": 27.18831168831169, "grad_norm": 1.495139479637146, "learning_rate": 6.382877043544855e-06, "loss": 0.2555, "step": 4187 }, { "epoch": 27.194805194805195, "grad_norm": 1.5857653617858887, "learning_rate": 6.3675265547706196e-06, "loss": 0.2585, "step": 4188 }, { "epoch": 27.2012987012987, "grad_norm": 1.3717056512832642, "learning_rate": 6.352193291405883e-06, "loss": 0.2212, "step": 4189 }, { "epoch": 27.207792207792206, "grad_norm": 1.3991684913635254, "learning_rate": 6.336877259504004e-06, "loss": 0.225, "step": 4190 }, { "epoch": 27.214285714285715, "grad_norm": 1.4601714611053467, "learning_rate": 6.321578465111477e-06, "loss": 0.2309, "step": 4191 }, { "epoch": 27.22077922077922, "grad_norm": 1.4891407489776611, "learning_rate": 6.30629691426804e-06, "loss": 0.2352, "step": 4192 }, { "epoch": 27.227272727272727, "grad_norm": 1.3236773014068604, "learning_rate": 6.291032613006609e-06, "loss": 0.2115, "step": 4193 }, { "epoch": 27.233766233766232, "grad_norm": 1.4177298545837402, "learning_rate": 6.275785567353293e-06, "loss": 0.2406, "step": 4194 }, { "epoch": 27.24025974025974, "grad_norm": 1.5722953081130981, "learning_rate": 6.2605557833273656e-06, "loss": 0.2491, "step": 4195 }, { "epoch": 27.246753246753247, "grad_norm": 1.5723475217819214, "learning_rate": 6.245343266941328e-06, "loss": 0.2653, "step": 4196 }, { "epoch": 27.253246753246753, "grad_norm": 1.2392525672912598, "learning_rate": 6.23014802420081e-06, "loss": 0.2075, "step": 4197 }, { "epoch": 27.25974025974026, "grad_norm": 1.4983159303665161, "learning_rate": 6.214970061104686e-06, "loss": 0.2411, "step": 4198 }, { "epoch": 27.266233766233768, "grad_norm": 1.4145365953445435, "learning_rate": 6.199809383644956e-06, "loss": 0.2347, "step": 4199 }, { "epoch": 27.272727272727273, "grad_norm": 1.4473834037780762, "learning_rate": 6.1846659978068265e-06, "loss": 0.2172, "step": 4200 }, { "epoch": 27.27922077922078, "grad_norm": 1.5161728858947754, "learning_rate": 6.169539909568655e-06, "loss": 0.2466, "step": 4201 }, { "epoch": 27.285714285714285, "grad_norm": 1.3642016649246216, "learning_rate": 6.154431124901982e-06, "loss": 0.2237, "step": 4202 }, { "epoch": 27.292207792207794, "grad_norm": 1.4526147842407227, "learning_rate": 6.139339649771525e-06, "loss": 0.2387, "step": 4203 }, { "epoch": 27.2987012987013, "grad_norm": 1.40420401096344, "learning_rate": 6.124265490135162e-06, "loss": 0.1868, "step": 4204 }, { "epoch": 27.305194805194805, "grad_norm": 1.252336859703064, "learning_rate": 6.109208651943921e-06, "loss": 0.199, "step": 4205 }, { "epoch": 27.31168831168831, "grad_norm": 1.6298810243606567, "learning_rate": 6.094169141142014e-06, "loss": 0.2625, "step": 4206 }, { "epoch": 27.318181818181817, "grad_norm": 1.3172500133514404, "learning_rate": 6.079146963666776e-06, "loss": 0.2197, "step": 4207 }, { "epoch": 27.324675324675326, "grad_norm": 1.5007563829421997, "learning_rate": 6.064142125448763e-06, "loss": 0.2578, "step": 4208 }, { "epoch": 27.33116883116883, "grad_norm": 1.408240556716919, "learning_rate": 6.049154632411624e-06, "loss": 0.2144, "step": 4209 }, { "epoch": 27.337662337662337, "grad_norm": 1.496727466583252, "learning_rate": 6.034184490472195e-06, "loss": 0.2401, "step": 4210 }, { "epoch": 27.344155844155843, "grad_norm": 1.3175632953643799, "learning_rate": 6.019231705540435e-06, "loss": 0.1992, "step": 4211 }, { "epoch": 27.350649350649352, "grad_norm": 1.5040570497512817, "learning_rate": 6.004296283519478e-06, "loss": 0.2293, "step": 4212 }, { "epoch": 27.357142857142858, "grad_norm": 1.269888997077942, "learning_rate": 5.989378230305592e-06, "loss": 0.2087, "step": 4213 }, { "epoch": 27.363636363636363, "grad_norm": 1.3803386688232422, "learning_rate": 5.9744775517881935e-06, "loss": 0.2262, "step": 4214 }, { "epoch": 27.37012987012987, "grad_norm": 1.578514814376831, "learning_rate": 5.95959425384982e-06, "loss": 0.2664, "step": 4215 }, { "epoch": 27.376623376623378, "grad_norm": 1.2101290225982666, "learning_rate": 5.94472834236618e-06, "loss": 0.1941, "step": 4216 }, { "epoch": 27.383116883116884, "grad_norm": 1.572113037109375, "learning_rate": 5.9298798232060695e-06, "loss": 0.2476, "step": 4217 }, { "epoch": 27.38961038961039, "grad_norm": 1.4231563806533813, "learning_rate": 5.915048702231491e-06, "loss": 0.2136, "step": 4218 }, { "epoch": 27.396103896103895, "grad_norm": 1.2820154428482056, "learning_rate": 5.900234985297498e-06, "loss": 0.2212, "step": 4219 }, { "epoch": 27.4025974025974, "grad_norm": 1.4885966777801514, "learning_rate": 5.885438678252342e-06, "loss": 0.2564, "step": 4220 }, { "epoch": 27.40909090909091, "grad_norm": 1.4494009017944336, "learning_rate": 5.870659786937344e-06, "loss": 0.2328, "step": 4221 }, { "epoch": 27.415584415584416, "grad_norm": 1.4441783428192139, "learning_rate": 5.855898317186992e-06, "loss": 0.2177, "step": 4222 }, { "epoch": 27.42207792207792, "grad_norm": 1.5828957557678223, "learning_rate": 5.841154274828869e-06, "loss": 0.2633, "step": 4223 }, { "epoch": 27.428571428571427, "grad_norm": 1.3022863864898682, "learning_rate": 5.8264276656837145e-06, "loss": 0.1958, "step": 4224 }, { "epoch": 27.435064935064936, "grad_norm": 1.3509620428085327, "learning_rate": 5.811718495565332e-06, "loss": 0.2157, "step": 4225 }, { "epoch": 27.441558441558442, "grad_norm": 1.5641427040100098, "learning_rate": 5.797026770280684e-06, "loss": 0.2353, "step": 4226 }, { "epoch": 27.448051948051948, "grad_norm": 1.5348894596099854, "learning_rate": 5.782352495629822e-06, "loss": 0.2824, "step": 4227 }, { "epoch": 27.454545454545453, "grad_norm": 1.5248255729675293, "learning_rate": 5.767695677405921e-06, "loss": 0.2258, "step": 4228 }, { "epoch": 27.461038961038962, "grad_norm": 1.5213544368743896, "learning_rate": 5.753056321395267e-06, "loss": 0.2271, "step": 4229 }, { "epoch": 27.467532467532468, "grad_norm": 1.5472891330718994, "learning_rate": 5.738434433377243e-06, "loss": 0.2791, "step": 4230 }, { "epoch": 27.474025974025974, "grad_norm": 1.357235312461853, "learning_rate": 5.7238300191243325e-06, "loss": 0.2052, "step": 4231 }, { "epoch": 27.48051948051948, "grad_norm": 1.4257265329360962, "learning_rate": 5.7092430844021275e-06, "loss": 0.2553, "step": 4232 }, { "epoch": 27.48701298701299, "grad_norm": 1.4562528133392334, "learning_rate": 5.694673634969333e-06, "loss": 0.2293, "step": 4233 }, { "epoch": 27.493506493506494, "grad_norm": 1.4705281257629395, "learning_rate": 5.680121676577721e-06, "loss": 0.2269, "step": 4234 }, { "epoch": 27.5, "grad_norm": 1.3913674354553223, "learning_rate": 5.665587214972174e-06, "loss": 0.2317, "step": 4235 }, { "epoch": 27.506493506493506, "grad_norm": 1.409739375114441, "learning_rate": 5.651070255890689e-06, "loss": 0.2269, "step": 4236 }, { "epoch": 27.51298701298701, "grad_norm": 1.4227676391601562, "learning_rate": 5.636570805064301e-06, "loss": 0.2367, "step": 4237 }, { "epoch": 27.51948051948052, "grad_norm": 1.3669805526733398, "learning_rate": 5.622088868217179e-06, "loss": 0.2257, "step": 4238 }, { "epoch": 27.525974025974026, "grad_norm": 1.4420340061187744, "learning_rate": 5.607624451066568e-06, "loss": 0.2213, "step": 4239 }, { "epoch": 27.532467532467532, "grad_norm": 1.453864336013794, "learning_rate": 5.593177559322777e-06, "loss": 0.2111, "step": 4240 }, { "epoch": 27.538961038961038, "grad_norm": 1.3545842170715332, "learning_rate": 5.578748198689227e-06, "loss": 0.2362, "step": 4241 }, { "epoch": 27.545454545454547, "grad_norm": 1.378858208656311, "learning_rate": 5.564336374862372e-06, "loss": 0.2314, "step": 4242 }, { "epoch": 27.551948051948052, "grad_norm": 1.4109467267990112, "learning_rate": 5.549942093531812e-06, "loss": 0.2153, "step": 4243 }, { "epoch": 27.558441558441558, "grad_norm": 1.7690722942352295, "learning_rate": 5.535565360380146e-06, "loss": 0.2401, "step": 4244 }, { "epoch": 27.564935064935064, "grad_norm": 1.4343832731246948, "learning_rate": 5.521206181083111e-06, "loss": 0.2222, "step": 4245 }, { "epoch": 27.571428571428573, "grad_norm": 1.3798093795776367, "learning_rate": 5.506864561309455e-06, "loss": 0.2345, "step": 4246 }, { "epoch": 27.57792207792208, "grad_norm": 1.456672191619873, "learning_rate": 5.492540506721033e-06, "loss": 0.2305, "step": 4247 }, { "epoch": 27.584415584415584, "grad_norm": 1.3595104217529297, "learning_rate": 5.478234022972756e-06, "loss": 0.2282, "step": 4248 }, { "epoch": 27.59090909090909, "grad_norm": 1.461184024810791, "learning_rate": 5.463945115712609e-06, "loss": 0.2632, "step": 4249 }, { "epoch": 27.5974025974026, "grad_norm": 1.4280775785446167, "learning_rate": 5.449673790581611e-06, "loss": 0.2325, "step": 4250 }, { "epoch": 27.603896103896105, "grad_norm": 1.387702226638794, "learning_rate": 5.435420053213863e-06, "loss": 0.2118, "step": 4251 }, { "epoch": 27.61038961038961, "grad_norm": 1.3455487489700317, "learning_rate": 5.421183909236494e-06, "loss": 0.2186, "step": 4252 }, { "epoch": 27.616883116883116, "grad_norm": 1.4924322366714478, "learning_rate": 5.406965364269745e-06, "loss": 0.2352, "step": 4253 }, { "epoch": 27.623376623376622, "grad_norm": 1.5834050178527832, "learning_rate": 5.392764423926838e-06, "loss": 0.2643, "step": 4254 }, { "epoch": 27.62987012987013, "grad_norm": 1.2884026765823364, "learning_rate": 5.378581093814111e-06, "loss": 0.2001, "step": 4255 }, { "epoch": 27.636363636363637, "grad_norm": 1.5274333953857422, "learning_rate": 5.3644153795308904e-06, "loss": 0.2662, "step": 4256 }, { "epoch": 27.642857142857142, "grad_norm": 1.392540693283081, "learning_rate": 5.350267286669586e-06, "loss": 0.2296, "step": 4257 }, { "epoch": 27.649350649350648, "grad_norm": 1.4900485277175903, "learning_rate": 5.33613682081564e-06, "loss": 0.2257, "step": 4258 }, { "epoch": 27.655844155844157, "grad_norm": 1.609121561050415, "learning_rate": 5.322023987547547e-06, "loss": 0.2453, "step": 4259 }, { "epoch": 27.662337662337663, "grad_norm": 1.369093418121338, "learning_rate": 5.307928792436811e-06, "loss": 0.219, "step": 4260 }, { "epoch": 27.66883116883117, "grad_norm": 1.3919950723648071, "learning_rate": 5.293851241048015e-06, "loss": 0.2148, "step": 4261 }, { "epoch": 27.675324675324674, "grad_norm": 1.6031920909881592, "learning_rate": 5.279791338938716e-06, "loss": 0.233, "step": 4262 }, { "epoch": 27.681818181818183, "grad_norm": 1.5094611644744873, "learning_rate": 5.2657490916595885e-06, "loss": 0.2426, "step": 4263 }, { "epoch": 27.68831168831169, "grad_norm": 1.6092923879623413, "learning_rate": 5.2517245047542574e-06, "loss": 0.2555, "step": 4264 }, { "epoch": 27.694805194805195, "grad_norm": 1.3829764127731323, "learning_rate": 5.2377175837594216e-06, "loss": 0.2243, "step": 4265 }, { "epoch": 27.7012987012987, "grad_norm": 1.148871660232544, "learning_rate": 5.2237283342047805e-06, "loss": 0.1816, "step": 4266 }, { "epoch": 27.707792207792206, "grad_norm": 1.383902668952942, "learning_rate": 5.209756761613071e-06, "loss": 0.2377, "step": 4267 }, { "epoch": 27.714285714285715, "grad_norm": 1.4102824926376343, "learning_rate": 5.1958028715000495e-06, "loss": 0.2228, "step": 4268 }, { "epoch": 27.72077922077922, "grad_norm": 1.5818957090377808, "learning_rate": 5.1818666693745025e-06, "loss": 0.2567, "step": 4269 }, { "epoch": 27.727272727272727, "grad_norm": 1.2153655290603638, "learning_rate": 5.1679481607382065e-06, "loss": 0.2015, "step": 4270 }, { "epoch": 27.733766233766232, "grad_norm": 1.4225279092788696, "learning_rate": 5.154047351085984e-06, "loss": 0.232, "step": 4271 }, { "epoch": 27.74025974025974, "grad_norm": 1.492111086845398, "learning_rate": 5.140164245905632e-06, "loss": 0.2465, "step": 4272 }, { "epoch": 27.746753246753247, "grad_norm": 1.376865029335022, "learning_rate": 5.126298850677991e-06, "loss": 0.2429, "step": 4273 }, { "epoch": 27.753246753246753, "grad_norm": 1.7007191181182861, "learning_rate": 5.112451170876903e-06, "loss": 0.2756, "step": 4274 }, { "epoch": 27.75974025974026, "grad_norm": 1.382163166999817, "learning_rate": 5.098621211969223e-06, "loss": 0.2461, "step": 4275 }, { "epoch": 27.766233766233768, "grad_norm": 1.1419545412063599, "learning_rate": 5.084808979414779e-06, "loss": 0.1856, "step": 4276 }, { "epoch": 27.772727272727273, "grad_norm": 1.4656493663787842, "learning_rate": 5.071014478666425e-06, "loss": 0.25, "step": 4277 }, { "epoch": 27.77922077922078, "grad_norm": 1.5065422058105469, "learning_rate": 5.057237715170033e-06, "loss": 0.2613, "step": 4278 }, { "epoch": 27.785714285714285, "grad_norm": 1.4432060718536377, "learning_rate": 5.043478694364417e-06, "loss": 0.219, "step": 4279 }, { "epoch": 27.792207792207794, "grad_norm": 1.7624403238296509, "learning_rate": 5.029737421681446e-06, "loss": 0.2528, "step": 4280 }, { "epoch": 27.7987012987013, "grad_norm": 1.4036396741867065, "learning_rate": 5.016013902545957e-06, "loss": 0.2383, "step": 4281 }, { "epoch": 27.805194805194805, "grad_norm": 1.166783332824707, "learning_rate": 5.002308142375761e-06, "loss": 0.1715, "step": 4282 }, { "epoch": 27.81168831168831, "grad_norm": 1.4475295543670654, "learning_rate": 4.9886201465816855e-06, "loss": 0.2154, "step": 4283 }, { "epoch": 27.818181818181817, "grad_norm": 1.3192903995513916, "learning_rate": 4.9749499205675394e-06, "loss": 0.2208, "step": 4284 }, { "epoch": 27.824675324675326, "grad_norm": 1.5346260070800781, "learning_rate": 4.961297469730097e-06, "loss": 0.2782, "step": 4285 }, { "epoch": 27.83116883116883, "grad_norm": 1.6296072006225586, "learning_rate": 4.9476627994591515e-06, "loss": 0.2453, "step": 4286 }, { "epoch": 27.837662337662337, "grad_norm": 1.415259599685669, "learning_rate": 4.9340459151374196e-06, "loss": 0.2421, "step": 4287 }, { "epoch": 27.844155844155843, "grad_norm": 1.6008384227752686, "learning_rate": 4.920446822140673e-06, "loss": 0.2688, "step": 4288 }, { "epoch": 27.850649350649352, "grad_norm": 1.40635085105896, "learning_rate": 4.906865525837589e-06, "loss": 0.2337, "step": 4289 }, { "epoch": 27.857142857142858, "grad_norm": 1.4448304176330566, "learning_rate": 4.893302031589864e-06, "loss": 0.2323, "step": 4290 }, { "epoch": 27.863636363636363, "grad_norm": 1.1303738355636597, "learning_rate": 4.87975634475214e-06, "loss": 0.1624, "step": 4291 }, { "epoch": 27.87012987012987, "grad_norm": 1.4003924131393433, "learning_rate": 4.86622847067204e-06, "loss": 0.2141, "step": 4292 }, { "epoch": 27.876623376623378, "grad_norm": 1.42268705368042, "learning_rate": 4.85271841469016e-06, "loss": 0.2299, "step": 4293 }, { "epoch": 27.883116883116884, "grad_norm": 1.3648524284362793, "learning_rate": 4.839226182140067e-06, "loss": 0.2344, "step": 4294 }, { "epoch": 27.88961038961039, "grad_norm": 1.4869227409362793, "learning_rate": 4.825751778348258e-06, "loss": 0.2436, "step": 4295 }, { "epoch": 27.896103896103895, "grad_norm": 1.3129571676254272, "learning_rate": 4.812295208634238e-06, "loss": 0.2139, "step": 4296 }, { "epoch": 27.9025974025974, "grad_norm": 1.2178107500076294, "learning_rate": 4.798856478310409e-06, "loss": 0.2068, "step": 4297 }, { "epoch": 27.90909090909091, "grad_norm": 1.5232648849487305, "learning_rate": 4.785435592682219e-06, "loss": 0.2658, "step": 4298 }, { "epoch": 27.915584415584416, "grad_norm": 1.3810973167419434, "learning_rate": 4.772032557047984e-06, "loss": 0.217, "step": 4299 }, { "epoch": 27.92207792207792, "grad_norm": 1.5051517486572266, "learning_rate": 4.758647376699032e-06, "loss": 0.2382, "step": 4300 }, { "epoch": 27.928571428571427, "grad_norm": 1.5706090927124023, "learning_rate": 4.745280056919599e-06, "loss": 0.2753, "step": 4301 }, { "epoch": 27.935064935064936, "grad_norm": 1.3974881172180176, "learning_rate": 4.731930602986906e-06, "loss": 0.2267, "step": 4302 }, { "epoch": 27.941558441558442, "grad_norm": 1.5711486339569092, "learning_rate": 4.718599020171099e-06, "loss": 0.2533, "step": 4303 }, { "epoch": 27.948051948051948, "grad_norm": 1.5940574407577515, "learning_rate": 4.705285313735297e-06, "loss": 0.2652, "step": 4304 }, { "epoch": 27.954545454545453, "grad_norm": 1.4611643552780151, "learning_rate": 4.691989488935511e-06, "loss": 0.2349, "step": 4305 }, { "epoch": 27.961038961038962, "grad_norm": 1.6703907251358032, "learning_rate": 4.678711551020743e-06, "loss": 0.2832, "step": 4306 }, { "epoch": 27.967532467532468, "grad_norm": 1.378297209739685, "learning_rate": 4.665451505232882e-06, "loss": 0.2181, "step": 4307 }, { "epoch": 27.974025974025974, "grad_norm": 1.49268639087677, "learning_rate": 4.652209356806825e-06, "loss": 0.2404, "step": 4308 }, { "epoch": 27.98051948051948, "grad_norm": 1.4238439798355103, "learning_rate": 4.638985110970339e-06, "loss": 0.2252, "step": 4309 }, { "epoch": 27.98701298701299, "grad_norm": 1.506311297416687, "learning_rate": 4.625778772944156e-06, "loss": 0.2482, "step": 4310 }, { "epoch": 27.993506493506494, "grad_norm": 1.440933108329773, "learning_rate": 4.61259034794192e-06, "loss": 0.2311, "step": 4311 }, { "epoch": 28.0, "grad_norm": 804.62646484375, "learning_rate": 4.599419841170216e-06, "loss": 0.2961, "step": 4312 }, { "epoch": 28.006493506493506, "grad_norm": 1.4886484146118164, "learning_rate": 4.586267257828547e-06, "loss": 0.2396, "step": 4313 }, { "epoch": 28.01298701298701, "grad_norm": 1.4247444868087769, "learning_rate": 4.573132603109365e-06, "loss": 0.224, "step": 4314 }, { "epoch": 28.01948051948052, "grad_norm": 1.4735891819000244, "learning_rate": 4.560015882197993e-06, "loss": 0.2437, "step": 4315 }, { "epoch": 28.025974025974026, "grad_norm": 1.425214171409607, "learning_rate": 4.546917100272735e-06, "loss": 0.2431, "step": 4316 }, { "epoch": 28.032467532467532, "grad_norm": 1.2438082695007324, "learning_rate": 4.533836262504754e-06, "loss": 0.21, "step": 4317 }, { "epoch": 28.038961038961038, "grad_norm": 1.405678391456604, "learning_rate": 4.5207733740581735e-06, "loss": 0.2375, "step": 4318 }, { "epoch": 28.045454545454547, "grad_norm": 1.517043113708496, "learning_rate": 4.507728440090014e-06, "loss": 0.2561, "step": 4319 }, { "epoch": 28.051948051948052, "grad_norm": 1.4188717603683472, "learning_rate": 4.494701465750217e-06, "loss": 0.224, "step": 4320 }, { "epoch": 28.058441558441558, "grad_norm": 1.4755464792251587, "learning_rate": 4.481692456181608e-06, "loss": 0.2385, "step": 4321 }, { "epoch": 28.064935064935064, "grad_norm": 1.3436827659606934, "learning_rate": 4.468701416519955e-06, "loss": 0.2076, "step": 4322 }, { "epoch": 28.071428571428573, "grad_norm": 1.3644148111343384, "learning_rate": 4.4557283518938955e-06, "loss": 0.2247, "step": 4323 }, { "epoch": 28.07792207792208, "grad_norm": 1.4109776020050049, "learning_rate": 4.4427732674250045e-06, "loss": 0.2204, "step": 4324 }, { "epoch": 28.084415584415584, "grad_norm": 1.3625751733779907, "learning_rate": 4.429836168227735e-06, "loss": 0.2124, "step": 4325 }, { "epoch": 28.09090909090909, "grad_norm": 1.2502995729446411, "learning_rate": 4.416917059409464e-06, "loss": 0.1974, "step": 4326 }, { "epoch": 28.0974025974026, "grad_norm": 1.3760490417480469, "learning_rate": 4.404015946070439e-06, "loss": 0.224, "step": 4327 }, { "epoch": 28.103896103896105, "grad_norm": 1.3126150369644165, "learning_rate": 4.391132833303807e-06, "loss": 0.2152, "step": 4328 }, { "epoch": 28.11038961038961, "grad_norm": 1.2602272033691406, "learning_rate": 4.378267726195645e-06, "loss": 0.2045, "step": 4329 }, { "epoch": 28.116883116883116, "grad_norm": 1.5098934173583984, "learning_rate": 4.365420629824863e-06, "loss": 0.2345, "step": 4330 }, { "epoch": 28.123376623376622, "grad_norm": 1.4111462831497192, "learning_rate": 4.352591549263302e-06, "loss": 0.2511, "step": 4331 }, { "epoch": 28.12987012987013, "grad_norm": 1.4122276306152344, "learning_rate": 4.3397804895756955e-06, "loss": 0.2278, "step": 4332 }, { "epoch": 28.136363636363637, "grad_norm": 1.4028385877609253, "learning_rate": 4.32698745581962e-06, "loss": 0.2283, "step": 4333 }, { "epoch": 28.142857142857142, "grad_norm": 1.6334865093231201, "learning_rate": 4.31421245304558e-06, "loss": 0.2649, "step": 4334 }, { "epoch": 28.149350649350648, "grad_norm": 1.4974746704101562, "learning_rate": 4.301455486296946e-06, "loss": 0.2159, "step": 4335 }, { "epoch": 28.155844155844157, "grad_norm": 1.431597113609314, "learning_rate": 4.288716560609951e-06, "loss": 0.2256, "step": 4336 }, { "epoch": 28.162337662337663, "grad_norm": 1.5479998588562012, "learning_rate": 4.275995681013745e-06, "loss": 0.2721, "step": 4337 }, { "epoch": 28.16883116883117, "grad_norm": 1.4098697900772095, "learning_rate": 4.263292852530293e-06, "loss": 0.2275, "step": 4338 }, { "epoch": 28.175324675324674, "grad_norm": 1.450825810432434, "learning_rate": 4.250608080174512e-06, "loss": 0.234, "step": 4339 }, { "epoch": 28.181818181818183, "grad_norm": 1.4986686706542969, "learning_rate": 4.237941368954124e-06, "loss": 0.256, "step": 4340 }, { "epoch": 28.18831168831169, "grad_norm": 1.358237862586975, "learning_rate": 4.225292723869756e-06, "loss": 0.2103, "step": 4341 }, { "epoch": 28.194805194805195, "grad_norm": 1.3350050449371338, "learning_rate": 4.212662149914886e-06, "loss": 0.1989, "step": 4342 }, { "epoch": 28.2012987012987, "grad_norm": 1.3782732486724854, "learning_rate": 4.200049652075866e-06, "loss": 0.2283, "step": 4343 }, { "epoch": 28.207792207792206, "grad_norm": 1.6267609596252441, "learning_rate": 4.18745523533191e-06, "loss": 0.2498, "step": 4344 }, { "epoch": 28.214285714285715, "grad_norm": 1.3791561126708984, "learning_rate": 4.174878904655105e-06, "loss": 0.2484, "step": 4345 }, { "epoch": 28.22077922077922, "grad_norm": 1.3116323947906494, "learning_rate": 4.162320665010372e-06, "loss": 0.2062, "step": 4346 }, { "epoch": 28.227272727272727, "grad_norm": 1.465994119644165, "learning_rate": 4.149780521355523e-06, "loss": 0.2309, "step": 4347 }, { "epoch": 28.233766233766232, "grad_norm": 1.3930613994598389, "learning_rate": 4.137258478641171e-06, "loss": 0.2159, "step": 4348 }, { "epoch": 28.24025974025974, "grad_norm": 1.5446178913116455, "learning_rate": 4.1247545418108715e-06, "loss": 0.2402, "step": 4349 }, { "epoch": 28.246753246753247, "grad_norm": 1.6220091581344604, "learning_rate": 4.112268715800943e-06, "loss": 0.2771, "step": 4350 }, { "epoch": 28.253246753246753, "grad_norm": 1.2821210622787476, "learning_rate": 4.099801005540616e-06, "loss": 0.2056, "step": 4351 }, { "epoch": 28.25974025974026, "grad_norm": 1.5242184400558472, "learning_rate": 4.087351415951918e-06, "loss": 0.2167, "step": 4352 }, { "epoch": 28.266233766233768, "grad_norm": 1.2646162509918213, "learning_rate": 4.0749199519497685e-06, "loss": 0.194, "step": 4353 }, { "epoch": 28.272727272727273, "grad_norm": 1.4229185581207275, "learning_rate": 4.062506618441908e-06, "loss": 0.2252, "step": 4354 }, { "epoch": 28.27922077922078, "grad_norm": 1.3415851593017578, "learning_rate": 4.050111420328939e-06, "loss": 0.2092, "step": 4355 }, { "epoch": 28.285714285714285, "grad_norm": 1.38569176197052, "learning_rate": 4.0377343625042584e-06, "loss": 0.2049, "step": 4356 }, { "epoch": 28.292207792207794, "grad_norm": 1.3419746160507202, "learning_rate": 4.025375449854163e-06, "loss": 0.195, "step": 4357 }, { "epoch": 28.2987012987013, "grad_norm": 1.4296602010726929, "learning_rate": 4.013034687257727e-06, "loss": 0.227, "step": 4358 }, { "epoch": 28.305194805194805, "grad_norm": 1.827120065689087, "learning_rate": 4.000712079586916e-06, "loss": 0.2408, "step": 4359 }, { "epoch": 28.31168831168831, "grad_norm": 1.4306057691574097, "learning_rate": 3.9884076317064814e-06, "loss": 0.24, "step": 4360 }, { "epoch": 28.318181818181817, "grad_norm": 1.327657699584961, "learning_rate": 3.976121348474038e-06, "loss": 0.2004, "step": 4361 }, { "epoch": 28.324675324675326, "grad_norm": 1.5038615465164185, "learning_rate": 3.963853234740006e-06, "loss": 0.231, "step": 4362 }, { "epoch": 28.33116883116883, "grad_norm": 1.4577782154083252, "learning_rate": 3.951603295347639e-06, "loss": 0.2491, "step": 4363 }, { "epoch": 28.337662337662337, "grad_norm": 1.4464967250823975, "learning_rate": 3.939371535133024e-06, "loss": 0.2218, "step": 4364 }, { "epoch": 28.344155844155843, "grad_norm": 1.3318097591400146, "learning_rate": 3.9271579589250816e-06, "loss": 0.2362, "step": 4365 }, { "epoch": 28.350649350649352, "grad_norm": 1.5567820072174072, "learning_rate": 3.914962571545511e-06, "loss": 0.2522, "step": 4366 }, { "epoch": 28.357142857142858, "grad_norm": 1.2841646671295166, "learning_rate": 3.902785377808882e-06, "loss": 0.1985, "step": 4367 }, { "epoch": 28.363636363636363, "grad_norm": 1.3579140901565552, "learning_rate": 3.890626382522533e-06, "loss": 0.2265, "step": 4368 }, { "epoch": 28.37012987012987, "grad_norm": 1.3064169883728027, "learning_rate": 3.8784855904866635e-06, "loss": 0.2024, "step": 4369 }, { "epoch": 28.376623376623378, "grad_norm": 1.3977599143981934, "learning_rate": 3.866363006494255e-06, "loss": 0.2358, "step": 4370 }, { "epoch": 28.383116883116884, "grad_norm": 1.4794952869415283, "learning_rate": 3.854258635331126e-06, "loss": 0.2237, "step": 4371 }, { "epoch": 28.38961038961039, "grad_norm": 1.472916841506958, "learning_rate": 3.842172481775874e-06, "loss": 0.2384, "step": 4372 }, { "epoch": 28.396103896103895, "grad_norm": 1.320885181427002, "learning_rate": 3.830104550599922e-06, "loss": 0.2127, "step": 4373 }, { "epoch": 28.4025974025974, "grad_norm": 1.390451192855835, "learning_rate": 3.8180548465675144e-06, "loss": 0.2151, "step": 4374 }, { "epoch": 28.40909090909091, "grad_norm": 1.4826058149337769, "learning_rate": 3.8060233744356633e-06, "loss": 0.2509, "step": 4375 }, { "epoch": 28.415584415584416, "grad_norm": 1.45282781124115, "learning_rate": 3.794010138954213e-06, "loss": 0.2159, "step": 4376 }, { "epoch": 28.42207792207792, "grad_norm": 1.3463776111602783, "learning_rate": 3.782015144865808e-06, "loss": 0.2144, "step": 4377 }, { "epoch": 28.428571428571427, "grad_norm": 1.4938033819198608, "learning_rate": 3.7700383969058618e-06, "loss": 0.219, "step": 4378 }, { "epoch": 28.435064935064936, "grad_norm": 1.527669072151184, "learning_rate": 3.7580798998026134e-06, "loss": 0.2545, "step": 4379 }, { "epoch": 28.441558441558442, "grad_norm": 1.5726501941680908, "learning_rate": 3.7461396582771035e-06, "loss": 0.2507, "step": 4380 }, { "epoch": 28.448051948051948, "grad_norm": 1.3895041942596436, "learning_rate": 3.734217677043128e-06, "loss": 0.1974, "step": 4381 }, { "epoch": 28.454545454545453, "grad_norm": 1.5021185874938965, "learning_rate": 3.7223139608073e-06, "loss": 0.2508, "step": 4382 }, { "epoch": 28.461038961038962, "grad_norm": 1.4821114540100098, "learning_rate": 3.7104285142690265e-06, "loss": 0.2233, "step": 4383 }, { "epoch": 28.467532467532468, "grad_norm": 1.531602144241333, "learning_rate": 3.698561342120499e-06, "loss": 0.2483, "step": 4384 }, { "epoch": 28.474025974025974, "grad_norm": 1.3615492582321167, "learning_rate": 3.6867124490466698e-06, "loss": 0.2159, "step": 4385 }, { "epoch": 28.48051948051948, "grad_norm": 1.2236193418502808, "learning_rate": 3.674881839725314e-06, "loss": 0.1817, "step": 4386 }, { "epoch": 28.48701298701299, "grad_norm": 1.285338044166565, "learning_rate": 3.6630695188269505e-06, "loss": 0.1989, "step": 4387 }, { "epoch": 28.493506493506494, "grad_norm": 1.3873265981674194, "learning_rate": 3.651275491014905e-06, "loss": 0.2256, "step": 4388 }, { "epoch": 28.5, "grad_norm": 1.285491943359375, "learning_rate": 3.6394997609452752e-06, "loss": 0.2337, "step": 4389 }, { "epoch": 28.506493506493506, "grad_norm": 1.4612563848495483, "learning_rate": 3.627742333266937e-06, "loss": 0.2646, "step": 4390 }, { "epoch": 28.51298701298701, "grad_norm": 1.4174251556396484, "learning_rate": 3.616003212621527e-06, "loss": 0.2393, "step": 4391 }, { "epoch": 28.51948051948052, "grad_norm": 1.2974154949188232, "learning_rate": 3.604282403643472e-06, "loss": 0.2061, "step": 4392 }, { "epoch": 28.525974025974026, "grad_norm": 1.4877636432647705, "learning_rate": 3.5925799109599423e-06, "loss": 0.2264, "step": 4393 }, { "epoch": 28.532467532467532, "grad_norm": 1.4763554334640503, "learning_rate": 3.5808957391909316e-06, "loss": 0.2617, "step": 4394 }, { "epoch": 28.538961038961038, "grad_norm": 1.4987128973007202, "learning_rate": 3.569229892949133e-06, "loss": 0.2275, "step": 4395 }, { "epoch": 28.545454545454547, "grad_norm": 1.4981915950775146, "learning_rate": 3.5575823768400628e-06, "loss": 0.2208, "step": 4396 }, { "epoch": 28.551948051948052, "grad_norm": 1.3527140617370605, "learning_rate": 3.545953195461954e-06, "loss": 0.2051, "step": 4397 }, { "epoch": 28.558441558441558, "grad_norm": 1.5411646366119385, "learning_rate": 3.534342353405834e-06, "loss": 0.2475, "step": 4398 }, { "epoch": 28.564935064935064, "grad_norm": 1.470456838607788, "learning_rate": 3.5227498552554807e-06, "loss": 0.247, "step": 4399 }, { "epoch": 28.571428571428573, "grad_norm": 1.231394648551941, "learning_rate": 3.511175705587433e-06, "loss": 0.1922, "step": 4400 }, { "epoch": 28.57792207792208, "grad_norm": 1.2981582880020142, "learning_rate": 3.4996199089709692e-06, "loss": 0.1959, "step": 4401 }, { "epoch": 28.584415584415584, "grad_norm": 1.4794974327087402, "learning_rate": 3.488082469968146e-06, "loss": 0.2368, "step": 4402 }, { "epoch": 28.59090909090909, "grad_norm": 1.2699427604675293, "learning_rate": 3.4765633931337473e-06, "loss": 0.2046, "step": 4403 }, { "epoch": 28.5974025974026, "grad_norm": 1.2251280546188354, "learning_rate": 3.465062683015341e-06, "loss": 0.1815, "step": 4404 }, { "epoch": 28.603896103896105, "grad_norm": 1.4747943878173828, "learning_rate": 3.4535803441532123e-06, "loss": 0.2601, "step": 4405 }, { "epoch": 28.61038961038961, "grad_norm": 1.3116390705108643, "learning_rate": 3.442116381080418e-06, "loss": 0.1962, "step": 4406 }, { "epoch": 28.616883116883116, "grad_norm": 1.2822507619857788, "learning_rate": 3.4306707983227325e-06, "loss": 0.1899, "step": 4407 }, { "epoch": 28.623376623376622, "grad_norm": 1.3936786651611328, "learning_rate": 3.4192436003987026e-06, "loss": 0.2258, "step": 4408 }, { "epoch": 28.62987012987013, "grad_norm": 1.2836315631866455, "learning_rate": 3.407834791819603e-06, "loss": 0.2201, "step": 4409 }, { "epoch": 28.636363636363637, "grad_norm": 1.6127647161483765, "learning_rate": 3.3964443770894528e-06, "loss": 0.2753, "step": 4410 }, { "epoch": 28.642857142857142, "grad_norm": 1.6660722494125366, "learning_rate": 3.3850723607049996e-06, "loss": 0.2444, "step": 4411 }, { "epoch": 28.649350649350648, "grad_norm": 1.2967348098754883, "learning_rate": 3.373718747155752e-06, "loss": 0.1949, "step": 4412 }, { "epoch": 28.655844155844157, "grad_norm": 1.4003551006317139, "learning_rate": 3.3623835409239022e-06, "loss": 0.2314, "step": 4413 }, { "epoch": 28.662337662337663, "grad_norm": 1.4544432163238525, "learning_rate": 3.351066746484455e-06, "loss": 0.2443, "step": 4414 }, { "epoch": 28.66883116883117, "grad_norm": 1.4031589031219482, "learning_rate": 3.3397683683050685e-06, "loss": 0.221, "step": 4415 }, { "epoch": 28.675324675324674, "grad_norm": 1.3709231615066528, "learning_rate": 3.328488410846187e-06, "loss": 0.1965, "step": 4416 }, { "epoch": 28.681818181818183, "grad_norm": 1.2183549404144287, "learning_rate": 3.3172268785609307e-06, "loss": 0.1784, "step": 4417 }, { "epoch": 28.68831168831169, "grad_norm": 1.2859083414077759, "learning_rate": 3.3059837758951994e-06, "loss": 0.2083, "step": 4418 }, { "epoch": 28.694805194805195, "grad_norm": 1.3288540840148926, "learning_rate": 3.2947591072875814e-06, "loss": 0.2168, "step": 4419 }, { "epoch": 28.7012987012987, "grad_norm": 1.417007327079773, "learning_rate": 3.2835528771693992e-06, "loss": 0.2411, "step": 4420 }, { "epoch": 28.707792207792206, "grad_norm": 1.3782029151916504, "learning_rate": 3.2723650899646906e-06, "loss": 0.215, "step": 4421 }, { "epoch": 28.714285714285715, "grad_norm": 1.5411187410354614, "learning_rate": 3.2611957500902347e-06, "loss": 0.2177, "step": 4422 }, { "epoch": 28.72077922077922, "grad_norm": 1.4956077337265015, "learning_rate": 3.250044861955487e-06, "loss": 0.239, "step": 4423 }, { "epoch": 28.727272727272727, "grad_norm": 1.3691248893737793, "learning_rate": 3.2389124299626484e-06, "loss": 0.2197, "step": 4424 }, { "epoch": 28.733766233766232, "grad_norm": 1.54719078540802, "learning_rate": 3.2277984585066366e-06, "loss": 0.2354, "step": 4425 }, { "epoch": 28.74025974025974, "grad_norm": 1.331629991531372, "learning_rate": 3.2167029519750593e-06, "loss": 0.2274, "step": 4426 }, { "epoch": 28.746753246753247, "grad_norm": 1.4866575002670288, "learning_rate": 3.205625914748256e-06, "loss": 0.2655, "step": 4427 }, { "epoch": 28.753246753246753, "grad_norm": 1.4166159629821777, "learning_rate": 3.194567351199257e-06, "loss": 0.2245, "step": 4428 }, { "epoch": 28.75974025974026, "grad_norm": 1.4901286363601685, "learning_rate": 3.1835272656938197e-06, "loss": 0.245, "step": 4429 }, { "epoch": 28.766233766233768, "grad_norm": 1.398005485534668, "learning_rate": 3.172505662590386e-06, "loss": 0.2313, "step": 4430 }, { "epoch": 28.772727272727273, "grad_norm": 1.574500322341919, "learning_rate": 3.1615025462401138e-06, "loss": 0.2593, "step": 4431 }, { "epoch": 28.77922077922078, "grad_norm": 1.452019214630127, "learning_rate": 3.150517920986851e-06, "loss": 0.2306, "step": 4432 }, { "epoch": 28.785714285714285, "grad_norm": 1.3880548477172852, "learning_rate": 3.1395517911671612e-06, "loss": 0.22, "step": 4433 }, { "epoch": 28.792207792207794, "grad_norm": 1.4887290000915527, "learning_rate": 3.128604161110299e-06, "loss": 0.2492, "step": 4434 }, { "epoch": 28.7987012987013, "grad_norm": 1.56515371799469, "learning_rate": 3.1176750351382235e-06, "loss": 0.2419, "step": 4435 }, { "epoch": 28.805194805194805, "grad_norm": 1.4204707145690918, "learning_rate": 3.106764417565561e-06, "loss": 0.2278, "step": 4436 }, { "epoch": 28.81168831168831, "grad_norm": 1.1363461017608643, "learning_rate": 3.095872312699666e-06, "loss": 0.1776, "step": 4437 }, { "epoch": 28.818181818181817, "grad_norm": 1.4058977365493774, "learning_rate": 3.08499872484056e-06, "loss": 0.2211, "step": 4438 }, { "epoch": 28.824675324675326, "grad_norm": 1.3269935846328735, "learning_rate": 3.0741436582809703e-06, "loss": 0.2134, "step": 4439 }, { "epoch": 28.83116883116883, "grad_norm": 1.4511215686798096, "learning_rate": 3.0633071173062967e-06, "loss": 0.2111, "step": 4440 }, { "epoch": 28.837662337662337, "grad_norm": 1.5990904569625854, "learning_rate": 3.052489106194645e-06, "loss": 0.2739, "step": 4441 }, { "epoch": 28.844155844155843, "grad_norm": 1.3339288234710693, "learning_rate": 3.0416896292167875e-06, "loss": 0.215, "step": 4442 }, { "epoch": 28.850649350649352, "grad_norm": 1.3713160753250122, "learning_rate": 3.0309086906361917e-06, "loss": 0.227, "step": 4443 }, { "epoch": 28.857142857142858, "grad_norm": 1.3854676485061646, "learning_rate": 3.0201462947089866e-06, "loss": 0.2156, "step": 4444 }, { "epoch": 28.863636363636363, "grad_norm": 1.4092707633972168, "learning_rate": 3.0094024456840174e-06, "loss": 0.2189, "step": 4445 }, { "epoch": 28.87012987012987, "grad_norm": 1.410773515701294, "learning_rate": 2.9986771478027755e-06, "loss": 0.224, "step": 4446 }, { "epoch": 28.876623376623378, "grad_norm": 1.446549654006958, "learning_rate": 2.9879704052994394e-06, "loss": 0.2332, "step": 4447 }, { "epoch": 28.883116883116884, "grad_norm": 1.2421294450759888, "learning_rate": 2.977282222400851e-06, "loss": 0.2064, "step": 4448 }, { "epoch": 28.88961038961039, "grad_norm": 1.5260447263717651, "learning_rate": 2.9666126033265462e-06, "loss": 0.2277, "step": 4449 }, { "epoch": 28.896103896103895, "grad_norm": 1.358296275138855, "learning_rate": 2.9559615522887273e-06, "loss": 0.2262, "step": 4450 }, { "epoch": 28.9025974025974, "grad_norm": 1.250861644744873, "learning_rate": 2.9453290734922535e-06, "loss": 0.1886, "step": 4451 }, { "epoch": 28.90909090909091, "grad_norm": 1.429858684539795, "learning_rate": 2.9347151711346555e-06, "loss": 0.2489, "step": 4452 }, { "epoch": 28.915584415584416, "grad_norm": 1.235266089439392, "learning_rate": 2.924119849406143e-06, "loss": 0.2047, "step": 4453 }, { "epoch": 28.92207792207792, "grad_norm": 1.3401710987091064, "learning_rate": 2.913543112489564e-06, "loss": 0.2216, "step": 4454 }, { "epoch": 28.928571428571427, "grad_norm": 1.3335764408111572, "learning_rate": 2.9029849645604733e-06, "loss": 0.2161, "step": 4455 }, { "epoch": 28.935064935064936, "grad_norm": 1.3277149200439453, "learning_rate": 2.8924454097870367e-06, "loss": 0.2178, "step": 4456 }, { "epoch": 28.941558441558442, "grad_norm": 1.5716547966003418, "learning_rate": 2.8819244523301146e-06, "loss": 0.2366, "step": 4457 }, { "epoch": 28.948051948051948, "grad_norm": 1.5386325120925903, "learning_rate": 2.8714220963432125e-06, "loss": 0.2599, "step": 4458 }, { "epoch": 28.954545454545453, "grad_norm": 1.5311278104782104, "learning_rate": 2.8609383459724913e-06, "loss": 0.2379, "step": 4459 }, { "epoch": 28.961038961038962, "grad_norm": 1.5145843029022217, "learning_rate": 2.850473205356774e-06, "loss": 0.2601, "step": 4460 }, { "epoch": 28.967532467532468, "grad_norm": 1.3857287168502808, "learning_rate": 2.8400266786275387e-06, "loss": 0.2091, "step": 4461 }, { "epoch": 28.974025974025974, "grad_norm": 1.2279174327850342, "learning_rate": 2.8295987699088923e-06, "loss": 0.1991, "step": 4462 }, { "epoch": 28.98051948051948, "grad_norm": 1.4894819259643555, "learning_rate": 2.8191894833176248e-06, "loss": 0.2461, "step": 4463 }, { "epoch": 28.98701298701299, "grad_norm": 1.359235167503357, "learning_rate": 2.8087988229631322e-06, "loss": 0.1935, "step": 4464 }, { "epoch": 28.993506493506494, "grad_norm": 1.4306108951568604, "learning_rate": 2.7984267929475173e-06, "loss": 0.23, "step": 4465 }, { "epoch": 29.0, "grad_norm": 242.78746032714844, "learning_rate": 2.788073397365465e-06, "loss": 0.2494, "step": 4466 }, { "epoch": 29.006493506493506, "grad_norm": 1.5637943744659424, "learning_rate": 2.7777386403043505e-06, "loss": 0.2541, "step": 4467 }, { "epoch": 29.01298701298701, "grad_norm": 1.5055761337280273, "learning_rate": 2.76742252584416e-06, "loss": 0.2396, "step": 4468 }, { "epoch": 29.01948051948052, "grad_norm": 1.3281155824661255, "learning_rate": 2.757125058057536e-06, "loss": 0.2005, "step": 4469 }, { "epoch": 29.025974025974026, "grad_norm": 1.4867539405822754, "learning_rate": 2.746846241009765e-06, "loss": 0.2133, "step": 4470 }, { "epoch": 29.032467532467532, "grad_norm": 1.487004041671753, "learning_rate": 2.7365860787587407e-06, "loss": 0.2498, "step": 4471 }, { "epoch": 29.038961038961038, "grad_norm": 1.1922616958618164, "learning_rate": 2.7263445753550276e-06, "loss": 0.1845, "step": 4472 }, { "epoch": 29.045454545454547, "grad_norm": 1.5154821872711182, "learning_rate": 2.7161217348418144e-06, "loss": 0.2467, "step": 4473 }, { "epoch": 29.051948051948052, "grad_norm": 1.4144598245620728, "learning_rate": 2.705917561254895e-06, "loss": 0.2205, "step": 4474 }, { "epoch": 29.058441558441558, "grad_norm": 1.4396140575408936, "learning_rate": 2.695732058622735e-06, "loss": 0.2288, "step": 4475 }, { "epoch": 29.064935064935064, "grad_norm": 1.1640795469284058, "learning_rate": 2.6855652309664083e-06, "loss": 0.1649, "step": 4476 }, { "epoch": 29.071428571428573, "grad_norm": 1.3204982280731201, "learning_rate": 2.675417082299603e-06, "loss": 0.1935, "step": 4477 }, { "epoch": 29.07792207792208, "grad_norm": 1.2430561780929565, "learning_rate": 2.665287616628659e-06, "loss": 0.2019, "step": 4478 }, { "epoch": 29.084415584415584, "grad_norm": 1.480852484703064, "learning_rate": 2.6551768379525277e-06, "loss": 0.2517, "step": 4479 }, { "epoch": 29.09090909090909, "grad_norm": 1.4226365089416504, "learning_rate": 2.6450847502627884e-06, "loss": 0.2275, "step": 4480 }, { "epoch": 29.0974025974026, "grad_norm": 1.397377610206604, "learning_rate": 2.6350113575436264e-06, "loss": 0.2215, "step": 4481 }, { "epoch": 29.103896103896105, "grad_norm": 1.6053948402404785, "learning_rate": 2.6249566637718714e-06, "loss": 0.2668, "step": 4482 }, { "epoch": 29.11038961038961, "grad_norm": 1.4192824363708496, "learning_rate": 2.614920672916943e-06, "loss": 0.2285, "step": 4483 }, { "epoch": 29.116883116883116, "grad_norm": 1.321874737739563, "learning_rate": 2.6049033889408938e-06, "loss": 0.2202, "step": 4484 }, { "epoch": 29.123376623376622, "grad_norm": 1.6679133176803589, "learning_rate": 2.594904815798399e-06, "loss": 0.2279, "step": 4485 }, { "epoch": 29.12987012987013, "grad_norm": 1.5913492441177368, "learning_rate": 2.584924957436735e-06, "loss": 0.244, "step": 4486 }, { "epoch": 29.136363636363637, "grad_norm": 1.34376060962677, "learning_rate": 2.5749638177957835e-06, "loss": 0.2163, "step": 4487 }, { "epoch": 29.142857142857142, "grad_norm": 1.5198384523391724, "learning_rate": 2.5650214008080543e-06, "loss": 0.212, "step": 4488 }, { "epoch": 29.149350649350648, "grad_norm": 1.4052996635437012, "learning_rate": 2.555097710398635e-06, "loss": 0.2488, "step": 4489 }, { "epoch": 29.155844155844157, "grad_norm": 1.5139129161834717, "learning_rate": 2.5451927504852757e-06, "loss": 0.2397, "step": 4490 }, { "epoch": 29.162337662337663, "grad_norm": 1.4797667264938354, "learning_rate": 2.5353065249782647e-06, "loss": 0.2302, "step": 4491 }, { "epoch": 29.16883116883117, "grad_norm": 1.3339955806732178, "learning_rate": 2.5254390377805583e-06, "loss": 0.1987, "step": 4492 }, { "epoch": 29.175324675324674, "grad_norm": 1.436172366142273, "learning_rate": 2.5155902927876564e-06, "loss": 0.2332, "step": 4493 }, { "epoch": 29.181818181818183, "grad_norm": 1.276912808418274, "learning_rate": 2.505760293887699e-06, "loss": 0.199, "step": 4494 }, { "epoch": 29.18831168831169, "grad_norm": 1.3921757936477661, "learning_rate": 2.4959490449614096e-06, "loss": 0.2244, "step": 4495 }, { "epoch": 29.194805194805195, "grad_norm": 1.43412446975708, "learning_rate": 2.4861565498821347e-06, "loss": 0.2274, "step": 4496 }, { "epoch": 29.2012987012987, "grad_norm": 1.4300919771194458, "learning_rate": 2.476382812515765e-06, "loss": 0.2244, "step": 4497 }, { "epoch": 29.207792207792206, "grad_norm": 1.1662312746047974, "learning_rate": 2.4666278367208415e-06, "loss": 0.1711, "step": 4498 }, { "epoch": 29.214285714285715, "grad_norm": 1.3270200490951538, "learning_rate": 2.4568916263484508e-06, "loss": 0.2002, "step": 4499 }, { "epoch": 29.22077922077922, "grad_norm": 1.4422167539596558, "learning_rate": 2.4471741852423237e-06, "loss": 0.24, "step": 4500 }, { "epoch": 29.227272727272727, "grad_norm": 1.3519929647445679, "learning_rate": 2.43747551723873e-06, "loss": 0.2109, "step": 4501 }, { "epoch": 29.233766233766232, "grad_norm": 1.4732192754745483, "learning_rate": 2.4277956261665626e-06, "loss": 0.2334, "step": 4502 }, { "epoch": 29.24025974025974, "grad_norm": 1.5626745223999023, "learning_rate": 2.4181345158472758e-06, "loss": 0.2138, "step": 4503 }, { "epoch": 29.246753246753247, "grad_norm": 1.4250954389572144, "learning_rate": 2.4084921900949297e-06, "loss": 0.219, "step": 4504 }, { "epoch": 29.253246753246753, "grad_norm": 1.271789789199829, "learning_rate": 2.3988686527161687e-06, "loss": 0.1985, "step": 4505 }, { "epoch": 29.25974025974026, "grad_norm": 1.478391170501709, "learning_rate": 2.3892639075102084e-06, "loss": 0.2247, "step": 4506 }, { "epoch": 29.266233766233768, "grad_norm": 1.1864556074142456, "learning_rate": 2.3796779582688446e-06, "loss": 0.1889, "step": 4507 }, { "epoch": 29.272727272727273, "grad_norm": 1.4443172216415405, "learning_rate": 2.3701108087764656e-06, "loss": 0.2309, "step": 4508 }, { "epoch": 29.27922077922078, "grad_norm": 1.3897244930267334, "learning_rate": 2.3605624628100175e-06, "loss": 0.2345, "step": 4509 }, { "epoch": 29.285714285714285, "grad_norm": 2.757894277572632, "learning_rate": 2.351032924139063e-06, "loss": 0.228, "step": 4510 }, { "epoch": 29.292207792207794, "grad_norm": 1.3442329168319702, "learning_rate": 2.3415221965256805e-06, "loss": 0.212, "step": 4511 }, { "epoch": 29.2987012987013, "grad_norm": 1.554976463317871, "learning_rate": 2.3320302837245843e-06, "loss": 0.2497, "step": 4512 }, { "epoch": 29.305194805194805, "grad_norm": 1.3681670427322388, "learning_rate": 2.3225571894830045e-06, "loss": 0.2158, "step": 4513 }, { "epoch": 29.31168831168831, "grad_norm": 1.4626275300979614, "learning_rate": 2.313102917540788e-06, "loss": 0.2177, "step": 4514 }, { "epoch": 29.318181818181817, "grad_norm": 1.2011229991912842, "learning_rate": 2.3036674716303275e-06, "loss": 0.2027, "step": 4515 }, { "epoch": 29.324675324675326, "grad_norm": 1.4060630798339844, "learning_rate": 2.2942508554765762e-06, "loss": 0.2301, "step": 4516 }, { "epoch": 29.33116883116883, "grad_norm": 1.5239795446395874, "learning_rate": 2.2848530727970774e-06, "loss": 0.2214, "step": 4517 }, { "epoch": 29.337662337662337, "grad_norm": 1.338597297668457, "learning_rate": 2.27547412730193e-06, "loss": 0.2135, "step": 4518 }, { "epoch": 29.344155844155843, "grad_norm": 1.4340932369232178, "learning_rate": 2.266114022693777e-06, "loss": 0.2317, "step": 4519 }, { "epoch": 29.350649350649352, "grad_norm": 1.417336344718933, "learning_rate": 2.2567727626678527e-06, "loss": 0.2321, "step": 4520 }, { "epoch": 29.357142857142858, "grad_norm": 1.4359439611434937, "learning_rate": 2.247450350911939e-06, "loss": 0.225, "step": 4521 }, { "epoch": 29.363636363636363, "grad_norm": 1.3348804712295532, "learning_rate": 2.238146791106366e-06, "loss": 0.2325, "step": 4522 }, { "epoch": 29.37012987012987, "grad_norm": 1.5960094928741455, "learning_rate": 2.2288620869240383e-06, "loss": 0.2483, "step": 4523 }, { "epoch": 29.376623376623378, "grad_norm": 1.449878215789795, "learning_rate": 2.2195962420304085e-06, "loss": 0.2513, "step": 4524 }, { "epoch": 29.383116883116884, "grad_norm": 1.6213632822036743, "learning_rate": 2.210349260083494e-06, "loss": 0.2795, "step": 4525 }, { "epoch": 29.38961038961039, "grad_norm": 1.3410649299621582, "learning_rate": 2.2011211447338478e-06, "loss": 0.2085, "step": 4526 }, { "epoch": 29.396103896103895, "grad_norm": 1.6169847249984741, "learning_rate": 2.191911899624588e-06, "loss": 0.2571, "step": 4527 }, { "epoch": 29.4025974025974, "grad_norm": 1.3275034427642822, "learning_rate": 2.1827215283913682e-06, "loss": 0.2023, "step": 4528 }, { "epoch": 29.40909090909091, "grad_norm": 1.4694947004318237, "learning_rate": 2.173550034662408e-06, "loss": 0.2425, "step": 4529 }, { "epoch": 29.415584415584416, "grad_norm": 1.3936761617660522, "learning_rate": 2.164397422058473e-06, "loss": 0.2138, "step": 4530 }, { "epoch": 29.42207792207792, "grad_norm": 1.5484139919281006, "learning_rate": 2.1552636941928717e-06, "loss": 0.2623, "step": 4531 }, { "epoch": 29.428571428571427, "grad_norm": 1.461364984512329, "learning_rate": 2.1461488546714427e-06, "loss": 0.2331, "step": 4532 }, { "epoch": 29.435064935064936, "grad_norm": 1.4459589719772339, "learning_rate": 2.137052907092596e-06, "loss": 0.2351, "step": 4533 }, { "epoch": 29.441558441558442, "grad_norm": 1.3712208271026611, "learning_rate": 2.127975855047243e-06, "loss": 0.2234, "step": 4534 }, { "epoch": 29.448051948051948, "grad_norm": 1.3484586477279663, "learning_rate": 2.118917702118889e-06, "loss": 0.1998, "step": 4535 }, { "epoch": 29.454545454545453, "grad_norm": 1.3767493963241577, "learning_rate": 2.1098784518835293e-06, "loss": 0.2219, "step": 4536 }, { "epoch": 29.461038961038962, "grad_norm": 1.4202708005905151, "learning_rate": 2.100858107909731e-06, "loss": 0.2197, "step": 4537 }, { "epoch": 29.467532467532468, "grad_norm": 1.2947967052459717, "learning_rate": 2.091856673758569e-06, "loss": 0.2121, "step": 4538 }, { "epoch": 29.474025974025974, "grad_norm": 1.4303934574127197, "learning_rate": 2.082874152983677e-06, "loss": 0.2335, "step": 4539 }, { "epoch": 29.48051948051948, "grad_norm": 1.4804555177688599, "learning_rate": 2.0739105491312027e-06, "loss": 0.2313, "step": 4540 }, { "epoch": 29.48701298701299, "grad_norm": 1.4761794805526733, "learning_rate": 2.064965865739854e-06, "loss": 0.2501, "step": 4541 }, { "epoch": 29.493506493506494, "grad_norm": 1.2567049264907837, "learning_rate": 2.056040106340823e-06, "loss": 0.1992, "step": 4542 }, { "epoch": 29.5, "grad_norm": 1.5460628271102905, "learning_rate": 2.04713327445788e-06, "loss": 0.2518, "step": 4543 }, { "epoch": 29.506493506493506, "grad_norm": 1.4742612838745117, "learning_rate": 2.0382453736072836e-06, "loss": 0.2436, "step": 4544 }, { "epoch": 29.51298701298701, "grad_norm": 1.6358933448791504, "learning_rate": 2.0293764072978618e-06, "loss": 0.2303, "step": 4545 }, { "epoch": 29.51948051948052, "grad_norm": 1.5824604034423828, "learning_rate": 2.0205263790309125e-06, "loss": 0.2511, "step": 4546 }, { "epoch": 29.525974025974026, "grad_norm": 1.197434902191162, "learning_rate": 2.0116952923003083e-06, "loss": 0.1883, "step": 4547 }, { "epoch": 29.532467532467532, "grad_norm": 1.39491605758667, "learning_rate": 2.002883150592416e-06, "loss": 0.2027, "step": 4548 }, { "epoch": 29.538961038961038, "grad_norm": 1.4976357221603394, "learning_rate": 1.9940899573861195e-06, "loss": 0.2493, "step": 4549 }, { "epoch": 29.545454545454547, "grad_norm": 1.3756825923919678, "learning_rate": 1.985315716152847e-06, "loss": 0.215, "step": 4550 }, { "epoch": 29.551948051948052, "grad_norm": 1.4004231691360474, "learning_rate": 1.976560430356522e-06, "loss": 0.2059, "step": 4551 }, { "epoch": 29.558441558441558, "grad_norm": 1.4448028802871704, "learning_rate": 1.967824103453597e-06, "loss": 0.2363, "step": 4552 }, { "epoch": 29.564935064935064, "grad_norm": 1.5783758163452148, "learning_rate": 1.959106738893035e-06, "loss": 0.262, "step": 4553 }, { "epoch": 29.571428571428573, "grad_norm": 1.450778841972351, "learning_rate": 1.9504083401163e-06, "loss": 0.2496, "step": 4554 }, { "epoch": 29.57792207792208, "grad_norm": 1.3727223873138428, "learning_rate": 1.9417289105574053e-06, "loss": 0.2141, "step": 4555 }, { "epoch": 29.584415584415584, "grad_norm": 1.3032400608062744, "learning_rate": 1.933068453642833e-06, "loss": 0.2102, "step": 4556 }, { "epoch": 29.59090909090909, "grad_norm": 1.4764373302459717, "learning_rate": 1.9244269727916096e-06, "loss": 0.2461, "step": 4557 }, { "epoch": 29.5974025974026, "grad_norm": 1.3464124202728271, "learning_rate": 1.9158044714152445e-06, "loss": 0.2094, "step": 4558 }, { "epoch": 29.603896103896105, "grad_norm": 1.329056978225708, "learning_rate": 1.9072009529177648e-06, "loss": 0.2142, "step": 4559 }, { "epoch": 29.61038961038961, "grad_norm": 1.4573792219161987, "learning_rate": 1.8986164206957035e-06, "loss": 0.2528, "step": 4560 }, { "epoch": 29.616883116883116, "grad_norm": 1.4189461469650269, "learning_rate": 1.8900508781381054e-06, "loss": 0.2216, "step": 4561 }, { "epoch": 29.623376623376622, "grad_norm": 1.234270453453064, "learning_rate": 1.8815043286265045e-06, "loss": 0.2121, "step": 4562 }, { "epoch": 29.62987012987013, "grad_norm": 1.5275107622146606, "learning_rate": 1.8729767755349515e-06, "loss": 0.2504, "step": 4563 }, { "epoch": 29.636363636363637, "grad_norm": 1.3555330038070679, "learning_rate": 1.8644682222299703e-06, "loss": 0.2129, "step": 4564 }, { "epoch": 29.642857142857142, "grad_norm": 1.5257729291915894, "learning_rate": 1.8559786720706184e-06, "loss": 0.232, "step": 4565 }, { "epoch": 29.649350649350648, "grad_norm": 1.280228614807129, "learning_rate": 1.8475081284084428e-06, "loss": 0.1934, "step": 4566 }, { "epoch": 29.655844155844157, "grad_norm": 1.5212812423706055, "learning_rate": 1.8390565945874571e-06, "loss": 0.228, "step": 4567 }, { "epoch": 29.662337662337663, "grad_norm": 1.3626916408538818, "learning_rate": 1.8306240739442093e-06, "loss": 0.2292, "step": 4568 }, { "epoch": 29.66883116883117, "grad_norm": 1.4685323238372803, "learning_rate": 1.8222105698077251e-06, "loss": 0.23, "step": 4569 }, { "epoch": 29.675324675324674, "grad_norm": 1.3952929973602295, "learning_rate": 1.8138160854995145e-06, "loss": 0.2172, "step": 4570 }, { "epoch": 29.681818181818183, "grad_norm": 1.5327379703521729, "learning_rate": 1.805440624333593e-06, "loss": 0.2669, "step": 4571 }, { "epoch": 29.68831168831169, "grad_norm": 1.5147796869277954, "learning_rate": 1.7970841896164658e-06, "loss": 0.243, "step": 4572 }, { "epoch": 29.694805194805195, "grad_norm": 1.345550298690796, "learning_rate": 1.788746784647105e-06, "loss": 0.2038, "step": 4573 }, { "epoch": 29.7012987012987, "grad_norm": 1.663666844367981, "learning_rate": 1.7804284127169946e-06, "loss": 0.2817, "step": 4574 }, { "epoch": 29.707792207792206, "grad_norm": 1.3431698083877563, "learning_rate": 1.7721290771100961e-06, "loss": 0.2148, "step": 4575 }, { "epoch": 29.714285714285715, "grad_norm": 1.4743592739105225, "learning_rate": 1.7638487811028615e-06, "loss": 0.2363, "step": 4576 }, { "epoch": 29.72077922077922, "grad_norm": 1.347269058227539, "learning_rate": 1.7555875279642087e-06, "loss": 0.2232, "step": 4577 }, { "epoch": 29.727272727272727, "grad_norm": 1.4954112768173218, "learning_rate": 1.7473453209555623e-06, "loss": 0.2509, "step": 4578 }, { "epoch": 29.733766233766232, "grad_norm": 1.5747220516204834, "learning_rate": 1.739122163330803e-06, "loss": 0.251, "step": 4579 }, { "epoch": 29.74025974025974, "grad_norm": 1.4564876556396484, "learning_rate": 1.730918058336306e-06, "loss": 0.2465, "step": 4580 }, { "epoch": 29.746753246753247, "grad_norm": 1.3976736068725586, "learning_rate": 1.7227330092109307e-06, "loss": 0.2247, "step": 4581 }, { "epoch": 29.753246753246753, "grad_norm": 1.4252331256866455, "learning_rate": 1.7145670191859974e-06, "loss": 0.2222, "step": 4582 }, { "epoch": 29.75974025974026, "grad_norm": 1.4407166242599487, "learning_rate": 1.7064200914853111e-06, "loss": 0.2049, "step": 4583 }, { "epoch": 29.766233766233768, "grad_norm": 1.2483116388320923, "learning_rate": 1.6982922293251545e-06, "loss": 0.2049, "step": 4584 }, { "epoch": 29.772727272727273, "grad_norm": 1.1946392059326172, "learning_rate": 1.690183435914261e-06, "loss": 0.1834, "step": 4585 }, { "epoch": 29.77922077922078, "grad_norm": 1.4287269115447998, "learning_rate": 1.6820937144538807e-06, "loss": 0.2269, "step": 4586 }, { "epoch": 29.785714285714285, "grad_norm": 1.231776237487793, "learning_rate": 1.6740230681376867e-06, "loss": 0.1824, "step": 4587 }, { "epoch": 29.792207792207794, "grad_norm": 1.3937435150146484, "learning_rate": 1.6659715001518584e-06, "loss": 0.2269, "step": 4588 }, { "epoch": 29.7987012987013, "grad_norm": 1.4768844842910767, "learning_rate": 1.6579390136750084e-06, "loss": 0.2429, "step": 4589 }, { "epoch": 29.805194805194805, "grad_norm": 1.379350185394287, "learning_rate": 1.6499256118782503e-06, "loss": 0.2176, "step": 4590 }, { "epoch": 29.81168831168831, "grad_norm": 1.4761512279510498, "learning_rate": 1.6419312979251366e-06, "loss": 0.2283, "step": 4591 }, { "epoch": 29.818181818181817, "grad_norm": 1.3178898096084595, "learning_rate": 1.6339560749717154e-06, "loss": 0.2005, "step": 4592 }, { "epoch": 29.824675324675326, "grad_norm": 1.4400105476379395, "learning_rate": 1.6259999461664566e-06, "loss": 0.2102, "step": 4593 }, { "epoch": 29.83116883116883, "grad_norm": 1.6487106084823608, "learning_rate": 1.6180629146503256e-06, "loss": 0.2613, "step": 4594 }, { "epoch": 29.837662337662337, "grad_norm": 1.5849696397781372, "learning_rate": 1.6101449835567273e-06, "loss": 0.2682, "step": 4595 }, { "epoch": 29.844155844155843, "grad_norm": 1.5418355464935303, "learning_rate": 1.6022461560115497e-06, "loss": 0.2491, "step": 4596 }, { "epoch": 29.850649350649352, "grad_norm": 1.5607249736785889, "learning_rate": 1.59436643513311e-06, "loss": 0.2448, "step": 4597 }, { "epoch": 29.857142857142858, "grad_norm": 1.5290418863296509, "learning_rate": 1.5865058240322139e-06, "loss": 0.2329, "step": 4598 }, { "epoch": 29.863636363636363, "grad_norm": 1.2339144945144653, "learning_rate": 1.5786643258120904e-06, "loss": 0.203, "step": 4599 }, { "epoch": 29.87012987012987, "grad_norm": 1.4262816905975342, "learning_rate": 1.5708419435684462e-06, "loss": 0.2084, "step": 4600 }, { "epoch": 29.876623376623378, "grad_norm": 1.3901803493499756, "learning_rate": 1.563038680389428e-06, "loss": 0.2151, "step": 4601 }, { "epoch": 29.883116883116884, "grad_norm": 1.1629455089569092, "learning_rate": 1.555254539355655e-06, "loss": 0.1764, "step": 4602 }, { "epoch": 29.88961038961039, "grad_norm": 1.4925212860107422, "learning_rate": 1.5474895235401688e-06, "loss": 0.2571, "step": 4603 }, { "epoch": 29.896103896103895, "grad_norm": 1.4131487607955933, "learning_rate": 1.5397436360084783e-06, "loss": 0.2345, "step": 4604 }, { "epoch": 29.9025974025974, "grad_norm": 1.2736296653747559, "learning_rate": 1.532016879818532e-06, "loss": 0.1843, "step": 4605 }, { "epoch": 29.90909090909091, "grad_norm": 1.3848419189453125, "learning_rate": 1.5243092580207507e-06, "loss": 0.2376, "step": 4606 }, { "epoch": 29.915584415584416, "grad_norm": 1.4516197443008423, "learning_rate": 1.5166207736579562e-06, "loss": 0.2211, "step": 4607 }, { "epoch": 29.92207792207792, "grad_norm": 1.3948246240615845, "learning_rate": 1.5089514297654594e-06, "loss": 0.2248, "step": 4608 }, { "epoch": 29.928571428571427, "grad_norm": 1.4730905294418335, "learning_rate": 1.5013012293709827e-06, "loss": 0.2295, "step": 4609 }, { "epoch": 29.935064935064936, "grad_norm": 1.6055909395217896, "learning_rate": 1.4936701754947101e-06, "loss": 0.2508, "step": 4610 }, { "epoch": 29.941558441558442, "grad_norm": 1.4886142015457153, "learning_rate": 1.4860582711492544e-06, "loss": 0.2457, "step": 4611 }, { "epoch": 29.948051948051948, "grad_norm": 1.6967780590057373, "learning_rate": 1.4784655193396946e-06, "loss": 0.2837, "step": 4612 }, { "epoch": 29.954545454545453, "grad_norm": 1.4026026725769043, "learning_rate": 1.4708919230635054e-06, "loss": 0.1926, "step": 4613 }, { "epoch": 29.961038961038962, "grad_norm": 1.4642516374588013, "learning_rate": 1.463337485310634e-06, "loss": 0.2359, "step": 4614 }, { "epoch": 29.967532467532468, "grad_norm": 1.307497262954712, "learning_rate": 1.4558022090634504e-06, "loss": 0.215, "step": 4615 }, { "epoch": 29.974025974025974, "grad_norm": 1.401755928993225, "learning_rate": 1.4482860972967637e-06, "loss": 0.2257, "step": 4616 }, { "epoch": 29.98051948051948, "grad_norm": 1.4976235628128052, "learning_rate": 1.4407891529778172e-06, "loss": 0.2339, "step": 4617 }, { "epoch": 29.98701298701299, "grad_norm": 1.2983542680740356, "learning_rate": 1.4333113790662822e-06, "loss": 0.2079, "step": 4618 }, { "epoch": 29.993506493506494, "grad_norm": 1.3633376359939575, "learning_rate": 1.425852778514264e-06, "loss": 0.2267, "step": 4619 }, { "epoch": 30.0, "grad_norm": 1067.601806640625, "learning_rate": 1.4184133542663014e-06, "loss": 0.3013, "step": 4620 }, { "epoch": 30.006493506493506, "grad_norm": 1.2011126279830933, "learning_rate": 1.4109931092593731e-06, "loss": 0.1896, "step": 4621 }, { "epoch": 30.01298701298701, "grad_norm": 1.3121923208236694, "learning_rate": 1.4035920464228526e-06, "loss": 0.2097, "step": 4622 }, { "epoch": 30.01948051948052, "grad_norm": 1.3485032320022583, "learning_rate": 1.39621016867858e-06, "loss": 0.2358, "step": 4623 }, { "epoch": 30.025974025974026, "grad_norm": 1.3204777240753174, "learning_rate": 1.3888474789407968e-06, "loss": 0.2173, "step": 4624 }, { "epoch": 30.032467532467532, "grad_norm": 1.3863506317138672, "learning_rate": 1.3815039801161721e-06, "loss": 0.2248, "step": 4625 }, { "epoch": 30.038961038961038, "grad_norm": 1.270582675933838, "learning_rate": 1.3741796751038094e-06, "loss": 0.197, "step": 4626 }, { "epoch": 30.045454545454547, "grad_norm": 1.7072105407714844, "learning_rate": 1.3668745667952287e-06, "loss": 0.2556, "step": 4627 }, { "epoch": 30.051948051948052, "grad_norm": 1.5396653413772583, "learning_rate": 1.3595886580743678e-06, "loss": 0.2302, "step": 4628 }, { "epoch": 30.058441558441558, "grad_norm": 1.5195355415344238, "learning_rate": 1.3523219518175923e-06, "loss": 0.2321, "step": 4629 }, { "epoch": 30.064935064935064, "grad_norm": 1.5370792150497437, "learning_rate": 1.3450744508936686e-06, "loss": 0.2464, "step": 4630 }, { "epoch": 30.071428571428573, "grad_norm": 1.4256504774093628, "learning_rate": 1.337846158163819e-06, "loss": 0.2151, "step": 4631 }, { "epoch": 30.07792207792208, "grad_norm": 1.4232343435287476, "learning_rate": 1.3306370764816389e-06, "loss": 0.1732, "step": 4632 }, { "epoch": 30.084415584415584, "grad_norm": 1.3586199283599854, "learning_rate": 1.3234472086931737e-06, "loss": 0.216, "step": 4633 }, { "epoch": 30.09090909090909, "grad_norm": 1.5035972595214844, "learning_rate": 1.3162765576368584e-06, "loss": 0.2278, "step": 4634 }, { "epoch": 30.0974025974026, "grad_norm": 1.3979326486587524, "learning_rate": 1.3091251261435566e-06, "loss": 0.2123, "step": 4635 }, { "epoch": 30.103896103896105, "grad_norm": 1.4777899980545044, "learning_rate": 1.3019929170365374e-06, "loss": 0.2278, "step": 4636 }, { "epoch": 30.11038961038961, "grad_norm": 1.4044756889343262, "learning_rate": 1.2948799331314932e-06, "loss": 0.2278, "step": 4637 }, { "epoch": 30.116883116883116, "grad_norm": 1.4233686923980713, "learning_rate": 1.2877861772365108e-06, "loss": 0.2131, "step": 4638 }, { "epoch": 30.123376623376622, "grad_norm": 1.5021657943725586, "learning_rate": 1.2807116521520945e-06, "loss": 0.2395, "step": 4639 }, { "epoch": 30.12987012987013, "grad_norm": 1.6399335861206055, "learning_rate": 1.2736563606711382e-06, "loss": 0.2632, "step": 4640 }, { "epoch": 30.136363636363637, "grad_norm": 1.4309699535369873, "learning_rate": 1.2666203055789916e-06, "loss": 0.2285, "step": 4641 }, { "epoch": 30.142857142857142, "grad_norm": 1.4695476293563843, "learning_rate": 1.2596034896533549e-06, "loss": 0.225, "step": 4642 }, { "epoch": 30.149350649350648, "grad_norm": 1.3788657188415527, "learning_rate": 1.252605915664362e-06, "loss": 0.2037, "step": 4643 }, { "epoch": 30.155844155844157, "grad_norm": 1.459823489189148, "learning_rate": 1.2456275863745426e-06, "loss": 0.236, "step": 4644 }, { "epoch": 30.162337662337663, "grad_norm": 1.4960808753967285, "learning_rate": 1.2386685045388313e-06, "loss": 0.2365, "step": 4645 }, { "epoch": 30.16883116883117, "grad_norm": 1.5328145027160645, "learning_rate": 1.2317286729045586e-06, "loss": 0.2489, "step": 4646 }, { "epoch": 30.175324675324674, "grad_norm": 1.333230972290039, "learning_rate": 1.224808094211477e-06, "loss": 0.1936, "step": 4647 }, { "epoch": 30.181818181818183, "grad_norm": 1.367729663848877, "learning_rate": 1.2179067711917015e-06, "loss": 0.2374, "step": 4648 }, { "epoch": 30.18831168831169, "grad_norm": 1.3551483154296875, "learning_rate": 1.21102470656978e-06, "loss": 0.2143, "step": 4649 }, { "epoch": 30.194805194805195, "grad_norm": 1.5022242069244385, "learning_rate": 1.2041619030626284e-06, "loss": 0.2476, "step": 4650 }, { "epoch": 30.2012987012987, "grad_norm": 1.3777878284454346, "learning_rate": 1.1973183633795849e-06, "loss": 0.2105, "step": 4651 }, { "epoch": 30.207792207792206, "grad_norm": 1.443731427192688, "learning_rate": 1.190494090222366e-06, "loss": 0.2297, "step": 4652 }, { "epoch": 30.214285714285715, "grad_norm": 1.5264109373092651, "learning_rate": 1.183689086285089e-06, "loss": 0.2657, "step": 4653 }, { "epoch": 30.22077922077922, "grad_norm": 1.5993235111236572, "learning_rate": 1.1769033542542552e-06, "loss": 0.252, "step": 4654 }, { "epoch": 30.227272727272727, "grad_norm": 1.4382950067520142, "learning_rate": 1.1701368968087712e-06, "loss": 0.2287, "step": 4655 }, { "epoch": 30.233766233766232, "grad_norm": 1.447139024734497, "learning_rate": 1.1633897166199226e-06, "loss": 0.2327, "step": 4656 }, { "epoch": 30.24025974025974, "grad_norm": 1.3230527639389038, "learning_rate": 1.1566618163513953e-06, "loss": 0.204, "step": 4657 }, { "epoch": 30.246753246753247, "grad_norm": 1.4629783630371094, "learning_rate": 1.1499531986592481e-06, "loss": 0.2263, "step": 4658 }, { "epoch": 30.253246753246753, "grad_norm": 1.5198249816894531, "learning_rate": 1.1432638661919458e-06, "loss": 0.2361, "step": 4659 }, { "epoch": 30.25974025974026, "grad_norm": 1.508629560470581, "learning_rate": 1.136593821590326e-06, "loss": 0.2234, "step": 4660 }, { "epoch": 30.266233766233768, "grad_norm": 1.2462126016616821, "learning_rate": 1.129943067487621e-06, "loss": 0.1983, "step": 4661 }, { "epoch": 30.272727272727273, "grad_norm": 1.3352580070495605, "learning_rate": 1.1233116065094362e-06, "loss": 0.2147, "step": 4662 }, { "epoch": 30.27922077922078, "grad_norm": 1.3788704872131348, "learning_rate": 1.1166994412737774e-06, "loss": 0.2341, "step": 4663 }, { "epoch": 30.285714285714285, "grad_norm": 1.43989098072052, "learning_rate": 1.1101065743910121e-06, "loss": 0.2345, "step": 4664 }, { "epoch": 30.292207792207794, "grad_norm": 1.3213152885437012, "learning_rate": 1.1035330084639083e-06, "loss": 0.2135, "step": 4665 }, { "epoch": 30.2987012987013, "grad_norm": 1.3354477882385254, "learning_rate": 1.0969787460876012e-06, "loss": 0.2233, "step": 4666 }, { "epoch": 30.305194805194805, "grad_norm": 1.3998503684997559, "learning_rate": 1.0904437898496101e-06, "loss": 0.2165, "step": 4667 }, { "epoch": 30.31168831168831, "grad_norm": 1.4371103048324585, "learning_rate": 1.0839281423298375e-06, "loss": 0.2334, "step": 4668 }, { "epoch": 30.318181818181817, "grad_norm": 1.2368924617767334, "learning_rate": 1.0774318061005483e-06, "loss": 0.1959, "step": 4669 }, { "epoch": 30.324675324675326, "grad_norm": 1.555105209350586, "learning_rate": 1.0709547837263966e-06, "loss": 0.2526, "step": 4670 }, { "epoch": 30.33116883116883, "grad_norm": 1.3962255716323853, "learning_rate": 1.0644970777644091e-06, "loss": 0.2398, "step": 4671 }, { "epoch": 30.337662337662337, "grad_norm": 1.3587555885314941, "learning_rate": 1.058058690763991e-06, "loss": 0.2242, "step": 4672 }, { "epoch": 30.344155844155843, "grad_norm": 1.5387598276138306, "learning_rate": 1.0516396252669093e-06, "loss": 0.2537, "step": 4673 }, { "epoch": 30.350649350649352, "grad_norm": 1.541693925857544, "learning_rate": 1.045239883807314e-06, "loss": 0.2837, "step": 4674 }, { "epoch": 30.357142857142858, "grad_norm": 1.5055370330810547, "learning_rate": 1.0388594689117071e-06, "loss": 0.2453, "step": 4675 }, { "epoch": 30.363636363636363, "grad_norm": 1.2910610437393188, "learning_rate": 1.032498383099001e-06, "loss": 0.1859, "step": 4676 }, { "epoch": 30.37012987012987, "grad_norm": 1.371899127960205, "learning_rate": 1.0261566288804315e-06, "loss": 0.2048, "step": 4677 }, { "epoch": 30.376623376623378, "grad_norm": 1.380179762840271, "learning_rate": 1.019834208759629e-06, "loss": 0.2025, "step": 4678 }, { "epoch": 30.383116883116884, "grad_norm": 1.419575572013855, "learning_rate": 1.0135311252325864e-06, "loss": 0.221, "step": 4679 }, { "epoch": 30.38961038961039, "grad_norm": 1.4421792030334473, "learning_rate": 1.007247380787657e-06, "loss": 0.2277, "step": 4680 }, { "epoch": 30.396103896103895, "grad_norm": 1.671634316444397, "learning_rate": 1.000982977905568e-06, "loss": 0.2718, "step": 4681 }, { "epoch": 30.4025974025974, "grad_norm": 1.3425742387771606, "learning_rate": 9.947379190594075e-07, "loss": 0.2042, "step": 4682 }, { "epoch": 30.40909090909091, "grad_norm": 1.416672945022583, "learning_rate": 9.885122067146146e-07, "loss": 0.2283, "step": 4683 }, { "epoch": 30.415584415584416, "grad_norm": 1.4166967868804932, "learning_rate": 9.823058433290177e-07, "loss": 0.2166, "step": 4684 }, { "epoch": 30.42207792207792, "grad_norm": 1.5779871940612793, "learning_rate": 9.761188313527791e-07, "loss": 0.2546, "step": 4685 }, { "epoch": 30.428571428571427, "grad_norm": 1.496155858039856, "learning_rate": 9.699511732284393e-07, "loss": 0.2462, "step": 4686 }, { "epoch": 30.435064935064936, "grad_norm": 1.4519951343536377, "learning_rate": 9.638028713908897e-07, "loss": 0.2251, "step": 4687 }, { "epoch": 30.441558441558442, "grad_norm": 1.561194658279419, "learning_rate": 9.576739282673886e-07, "loss": 0.259, "step": 4688 }, { "epoch": 30.448051948051948, "grad_norm": 1.4109373092651367, "learning_rate": 9.515643462775337e-07, "loss": 0.2215, "step": 4689 }, { "epoch": 30.454545454545453, "grad_norm": 1.4258251190185547, "learning_rate": 9.454741278333012e-07, "loss": 0.2347, "step": 4690 }, { "epoch": 30.461038961038962, "grad_norm": 1.4444737434387207, "learning_rate": 9.394032753390014e-07, "loss": 0.2425, "step": 4691 }, { "epoch": 30.467532467532468, "grad_norm": 1.3838112354278564, "learning_rate": 9.333517911913226e-07, "loss": 0.2143, "step": 4692 }, { "epoch": 30.474025974025974, "grad_norm": 1.428850531578064, "learning_rate": 9.273196777792926e-07, "loss": 0.2193, "step": 4693 }, { "epoch": 30.48051948051948, "grad_norm": 1.4568206071853638, "learning_rate": 9.213069374842953e-07, "loss": 0.214, "step": 4694 }, { "epoch": 30.48701298701299, "grad_norm": 1.3723341226577759, "learning_rate": 9.153135726800599e-07, "loss": 0.1755, "step": 4695 }, { "epoch": 30.493506493506494, "grad_norm": 1.272161602973938, "learning_rate": 9.093395857326714e-07, "loss": 0.1828, "step": 4696 }, { "epoch": 30.5, "grad_norm": 1.2683285474777222, "learning_rate": 9.03384979000571e-07, "loss": 0.1972, "step": 4697 }, { "epoch": 30.506493506493506, "grad_norm": 1.7095534801483154, "learning_rate": 8.974497548345396e-07, "loss": 0.258, "step": 4698 }, { "epoch": 30.51298701298701, "grad_norm": 1.5242334604263306, "learning_rate": 8.915339155777136e-07, "loss": 0.252, "step": 4699 }, { "epoch": 30.51948051948052, "grad_norm": 1.3591055870056152, "learning_rate": 8.856374635655695e-07, "loss": 0.2128, "step": 4700 }, { "epoch": 30.525974025974026, "grad_norm": 1.2505815029144287, "learning_rate": 8.797604011259287e-07, "loss": 0.19, "step": 4701 }, { "epoch": 30.532467532467532, "grad_norm": 1.3938148021697998, "learning_rate": 8.739027305789683e-07, "loss": 0.2186, "step": 4702 }, { "epoch": 30.538961038961038, "grad_norm": 1.522039532661438, "learning_rate": 8.680644542372051e-07, "loss": 0.2401, "step": 4703 }, { "epoch": 30.545454545454547, "grad_norm": 1.4690169095993042, "learning_rate": 8.622455744054958e-07, "loss": 0.2382, "step": 4704 }, { "epoch": 30.551948051948052, "grad_norm": 1.5384557247161865, "learning_rate": 8.564460933810415e-07, "loss": 0.2358, "step": 4705 }, { "epoch": 30.558441558441558, "grad_norm": 1.2483651638031006, "learning_rate": 8.506660134533828e-07, "loss": 0.1947, "step": 4706 }, { "epoch": 30.564935064935064, "grad_norm": 1.153969645500183, "learning_rate": 8.449053369044058e-07, "loss": 0.1698, "step": 4707 }, { "epoch": 30.571428571428573, "grad_norm": 1.3502869606018066, "learning_rate": 8.391640660083411e-07, "loss": 0.2255, "step": 4708 }, { "epoch": 30.57792207792208, "grad_norm": 1.375724196434021, "learning_rate": 8.334422030317424e-07, "loss": 0.2033, "step": 4709 }, { "epoch": 30.584415584415584, "grad_norm": 1.3740615844726562, "learning_rate": 8.277397502335194e-07, "loss": 0.2165, "step": 4710 }, { "epoch": 30.59090909090909, "grad_norm": 1.3953828811645508, "learning_rate": 8.22056709864899e-07, "loss": 0.2228, "step": 4711 }, { "epoch": 30.5974025974026, "grad_norm": 1.4169948101043701, "learning_rate": 8.163930841694589e-07, "loss": 0.2095, "step": 4712 }, { "epoch": 30.603896103896105, "grad_norm": 1.4881272315979004, "learning_rate": 8.10748875383116e-07, "loss": 0.24, "step": 4713 }, { "epoch": 30.61038961038961, "grad_norm": 1.5430946350097656, "learning_rate": 8.051240857341103e-07, "loss": 0.2359, "step": 4714 }, { "epoch": 30.616883116883116, "grad_norm": 1.370585322380066, "learning_rate": 7.995187174430152e-07, "loss": 0.2032, "step": 4715 }, { "epoch": 30.623376623376622, "grad_norm": 1.427137851715088, "learning_rate": 7.939327727227441e-07, "loss": 0.2354, "step": 4716 }, { "epoch": 30.62987012987013, "grad_norm": 1.3055822849273682, "learning_rate": 7.883662537785441e-07, "loss": 0.2118, "step": 4717 }, { "epoch": 30.636363636363637, "grad_norm": 1.4892123937606812, "learning_rate": 7.82819162807985e-07, "loss": 0.2494, "step": 4718 }, { "epoch": 30.642857142857142, "grad_norm": 1.3692604303359985, "learning_rate": 7.772915020009707e-07, "loss": 0.2256, "step": 4719 }, { "epoch": 30.649350649350648, "grad_norm": 1.2689048051834106, "learning_rate": 7.717832735397335e-07, "loss": 0.1942, "step": 4720 }, { "epoch": 30.655844155844157, "grad_norm": 1.3880759477615356, "learning_rate": 7.662944795988336e-07, "loss": 0.2263, "step": 4721 }, { "epoch": 30.662337662337663, "grad_norm": 1.2648857831954956, "learning_rate": 7.608251223451602e-07, "loss": 0.2215, "step": 4722 }, { "epoch": 30.66883116883117, "grad_norm": 1.5660555362701416, "learning_rate": 7.553752039379358e-07, "loss": 0.241, "step": 4723 }, { "epoch": 30.675324675324674, "grad_norm": 1.0508201122283936, "learning_rate": 7.499447265286951e-07, "loss": 0.1307, "step": 4724 }, { "epoch": 30.681818181818183, "grad_norm": 1.4483400583267212, "learning_rate": 7.445336922613067e-07, "loss": 0.2207, "step": 4725 }, { "epoch": 30.68831168831169, "grad_norm": 1.3860273361206055, "learning_rate": 7.391421032719559e-07, "loss": 0.2009, "step": 4726 }, { "epoch": 30.694805194805195, "grad_norm": 1.433535099029541, "learning_rate": 7.33769961689168e-07, "loss": 0.2274, "step": 4727 }, { "epoch": 30.7012987012987, "grad_norm": 1.4920105934143066, "learning_rate": 7.284172696337632e-07, "loss": 0.2377, "step": 4728 }, { "epoch": 30.707792207792206, "grad_norm": 1.2424510717391968, "learning_rate": 7.230840292189178e-07, "loss": 0.1983, "step": 4729 }, { "epoch": 30.714285714285715, "grad_norm": 1.491529107093811, "learning_rate": 7.177702425500976e-07, "loss": 0.2518, "step": 4730 }, { "epoch": 30.72077922077922, "grad_norm": 1.5076813697814941, "learning_rate": 7.124759117251079e-07, "loss": 0.235, "step": 4731 }, { "epoch": 30.727272727272727, "grad_norm": 1.4388847351074219, "learning_rate": 7.072010388340655e-07, "loss": 0.2133, "step": 4732 }, { "epoch": 30.733766233766232, "grad_norm": 1.516015887260437, "learning_rate": 7.019456259594048e-07, "loss": 0.2409, "step": 4733 }, { "epoch": 30.74025974025974, "grad_norm": 1.1933428049087524, "learning_rate": 6.967096751758773e-07, "loss": 0.1927, "step": 4734 }, { "epoch": 30.746753246753247, "grad_norm": 1.3725800514221191, "learning_rate": 6.914931885505627e-07, "loss": 0.2259, "step": 4735 }, { "epoch": 30.753246753246753, "grad_norm": 1.3572255373001099, "learning_rate": 6.862961681428304e-07, "loss": 0.2171, "step": 4736 }, { "epoch": 30.75974025974026, "grad_norm": 1.4173394441604614, "learning_rate": 6.811186160044003e-07, "loss": 0.2225, "step": 4737 }, { "epoch": 30.766233766233768, "grad_norm": 1.260750651359558, "learning_rate": 6.759605341792819e-07, "loss": 0.1916, "step": 4738 }, { "epoch": 30.772727272727273, "grad_norm": 1.5509462356567383, "learning_rate": 6.708219247038017e-07, "loss": 0.2558, "step": 4739 }, { "epoch": 30.77922077922078, "grad_norm": 1.276381254196167, "learning_rate": 6.657027896065982e-07, "loss": 0.2192, "step": 4740 }, { "epoch": 30.785714285714285, "grad_norm": 1.519776701927185, "learning_rate": 6.60603130908627e-07, "loss": 0.2401, "step": 4741 }, { "epoch": 30.792207792207794, "grad_norm": 1.467542290687561, "learning_rate": 6.555229506231608e-07, "loss": 0.2128, "step": 4742 }, { "epoch": 30.7987012987013, "grad_norm": 1.3514313697814941, "learning_rate": 6.504622507557678e-07, "loss": 0.2334, "step": 4743 }, { "epoch": 30.805194805194805, "grad_norm": 1.4580824375152588, "learning_rate": 6.454210333043276e-07, "loss": 0.2248, "step": 4744 }, { "epoch": 30.81168831168831, "grad_norm": 1.262427568435669, "learning_rate": 6.403993002590425e-07, "loss": 0.2161, "step": 4745 }, { "epoch": 30.818181818181817, "grad_norm": 1.2508149147033691, "learning_rate": 6.353970536024045e-07, "loss": 0.1908, "step": 4746 }, { "epoch": 30.824675324675326, "grad_norm": 1.3874893188476562, "learning_rate": 6.304142953092284e-07, "loss": 0.2094, "step": 4747 }, { "epoch": 30.83116883116883, "grad_norm": 1.4068247079849243, "learning_rate": 6.254510273466186e-07, "loss": 0.2152, "step": 4748 }, { "epoch": 30.837662337662337, "grad_norm": 1.5314143896102905, "learning_rate": 6.205072516740129e-07, "loss": 0.2364, "step": 4749 }, { "epoch": 30.844155844155843, "grad_norm": 1.439985752105713, "learning_rate": 6.15582970243117e-07, "loss": 0.2354, "step": 4750 }, { "epoch": 30.850649350649352, "grad_norm": 1.2261022329330444, "learning_rate": 6.106781849979648e-07, "loss": 0.1718, "step": 4751 }, { "epoch": 30.857142857142858, "grad_norm": 1.3499640226364136, "learning_rate": 6.057928978748905e-07, "loss": 0.2252, "step": 4752 }, { "epoch": 30.863636363636363, "grad_norm": 1.624617099761963, "learning_rate": 6.009271108025294e-07, "loss": 0.2659, "step": 4753 }, { "epoch": 30.87012987012987, "grad_norm": 1.5444817543029785, "learning_rate": 5.960808257018114e-07, "loss": 0.243, "step": 4754 }, { "epoch": 30.876623376623378, "grad_norm": 1.5056469440460205, "learning_rate": 5.912540444859782e-07, "loss": 0.2578, "step": 4755 }, { "epoch": 30.883116883116884, "grad_norm": 1.4643126726150513, "learning_rate": 5.864467690605613e-07, "loss": 0.2529, "step": 4756 }, { "epoch": 30.88961038961039, "grad_norm": 1.4542299509048462, "learning_rate": 5.816590013234035e-07, "loss": 0.2227, "step": 4757 }, { "epoch": 30.896103896103895, "grad_norm": 1.3155229091644287, "learning_rate": 5.76890743164632e-07, "loss": 0.1812, "step": 4758 }, { "epoch": 30.9025974025974, "grad_norm": 1.4375791549682617, "learning_rate": 5.721419964666908e-07, "loss": 0.2258, "step": 4759 }, { "epoch": 30.90909090909091, "grad_norm": 1.5131149291992188, "learning_rate": 5.674127631043025e-07, "loss": 0.2352, "step": 4760 }, { "epoch": 30.915584415584416, "grad_norm": 1.3214205503463745, "learning_rate": 5.627030449444903e-07, "loss": 0.1971, "step": 4761 }, { "epoch": 30.92207792207792, "grad_norm": 1.3359191417694092, "learning_rate": 5.580128438465837e-07, "loss": 0.2231, "step": 4762 }, { "epoch": 30.928571428571427, "grad_norm": 1.4121034145355225, "learning_rate": 5.533421616621904e-07, "loss": 0.2187, "step": 4763 }, { "epoch": 30.935064935064936, "grad_norm": 1.6365861892700195, "learning_rate": 5.486910002352352e-07, "loss": 0.2724, "step": 4764 }, { "epoch": 30.941558441558442, "grad_norm": 1.4394079446792603, "learning_rate": 5.440593614019108e-07, "loss": 0.2298, "step": 4765 }, { "epoch": 30.948051948051948, "grad_norm": 1.4614982604980469, "learning_rate": 5.394472469907208e-07, "loss": 0.2258, "step": 4766 }, { "epoch": 30.954545454545453, "grad_norm": 1.4309954643249512, "learning_rate": 5.348546588224534e-07, "loss": 0.2147, "step": 4767 }, { "epoch": 30.961038961038962, "grad_norm": 1.6089102029800415, "learning_rate": 5.302815987101917e-07, "loss": 0.2734, "step": 4768 }, { "epoch": 30.967532467532468, "grad_norm": 1.3566418886184692, "learning_rate": 5.257280684593024e-07, "loss": 0.2306, "step": 4769 }, { "epoch": 30.974025974025974, "grad_norm": 1.295680046081543, "learning_rate": 5.211940698674534e-07, "loss": 0.2238, "step": 4770 }, { "epoch": 30.98051948051948, "grad_norm": 1.4230977296829224, "learning_rate": 5.166796047245903e-07, "loss": 0.2237, "step": 4771 }, { "epoch": 30.98701298701299, "grad_norm": 1.5614374876022339, "learning_rate": 5.121846748129544e-07, "loss": 0.277, "step": 4772 }, { "epoch": 30.993506493506494, "grad_norm": 1.3989629745483398, "learning_rate": 5.077092819070761e-07, "loss": 0.2207, "step": 4773 }, { "epoch": 31.0, "grad_norm": 1641.1507568359375, "learning_rate": 5.032534277737643e-07, "loss": 0.229, "step": 4774 }, { "epoch": 31.006493506493506, "grad_norm": 1.3975937366485596, "learning_rate": 4.988171141721232e-07, "loss": 0.2268, "step": 4775 }, { "epoch": 31.01298701298701, "grad_norm": 1.2979328632354736, "learning_rate": 4.944003428535348e-07, "loss": 0.2207, "step": 4776 }, { "epoch": 31.01948051948052, "grad_norm": 1.447725772857666, "learning_rate": 4.900031155616769e-07, "loss": 0.2243, "step": 4777 }, { "epoch": 31.025974025974026, "grad_norm": 1.3527864217758179, "learning_rate": 4.856254340325051e-07, "loss": 0.2381, "step": 4778 }, { "epoch": 31.032467532467532, "grad_norm": 1.3849494457244873, "learning_rate": 4.81267299994248e-07, "loss": 0.2238, "step": 4779 }, { "epoch": 31.038961038961038, "grad_norm": 1.2796428203582764, "learning_rate": 4.769287151674406e-07, "loss": 0.1949, "step": 4780 }, { "epoch": 31.045454545454547, "grad_norm": 1.475150227546692, "learning_rate": 4.726096812648795e-07, "loss": 0.2343, "step": 4781 }, { "epoch": 31.051948051948052, "grad_norm": 1.4981529712677002, "learning_rate": 4.6831019999165617e-07, "loss": 0.2498, "step": 4782 }, { "epoch": 31.058441558441558, "grad_norm": 1.5081206560134888, "learning_rate": 4.6403027304513515e-07, "loss": 0.2456, "step": 4783 }, { "epoch": 31.064935064935064, "grad_norm": 1.5308469533920288, "learning_rate": 4.597699021149648e-07, "loss": 0.2383, "step": 4784 }, { "epoch": 31.071428571428573, "grad_norm": 1.3245189189910889, "learning_rate": 4.5552908888306655e-07, "loss": 0.2112, "step": 4785 }, { "epoch": 31.07792207792208, "grad_norm": 1.5210825204849243, "learning_rate": 4.5130783502365103e-07, "loss": 0.2325, "step": 4786 }, { "epoch": 31.084415584415584, "grad_norm": 1.4164528846740723, "learning_rate": 4.4710614220320746e-07, "loss": 0.2138, "step": 4787 }, { "epoch": 31.09090909090909, "grad_norm": 1.566509485244751, "learning_rate": 4.4292401208049226e-07, "loss": 0.2377, "step": 4788 }, { "epoch": 31.0974025974026, "grad_norm": 1.561910629272461, "learning_rate": 4.387614463065404e-07, "loss": 0.2383, "step": 4789 }, { "epoch": 31.103896103896105, "grad_norm": 1.4738324880599976, "learning_rate": 4.3461844652467607e-07, "loss": 0.2347, "step": 4790 }, { "epoch": 31.11038961038961, "grad_norm": 1.645966649055481, "learning_rate": 4.3049501437047444e-07, "loss": 0.2527, "step": 4791 }, { "epoch": 31.116883116883116, "grad_norm": 1.1387031078338623, "learning_rate": 4.2639115147182217e-07, "loss": 0.1782, "step": 4792 }, { "epoch": 31.123376623376622, "grad_norm": 1.3942773342132568, "learning_rate": 4.2230685944884553e-07, "loss": 0.217, "step": 4793 }, { "epoch": 31.12987012987013, "grad_norm": 1.3566133975982666, "learning_rate": 4.182421399139602e-07, "loss": 0.2262, "step": 4794 }, { "epoch": 31.136363636363637, "grad_norm": 1.4231529235839844, "learning_rate": 4.141969944718549e-07, "loss": 0.2327, "step": 4795 }, { "epoch": 31.142857142857142, "grad_norm": 1.2249860763549805, "learning_rate": 4.10171424719491e-07, "loss": 0.1981, "step": 4796 }, { "epoch": 31.149350649350648, "grad_norm": 1.2686113119125366, "learning_rate": 4.0616543224609726e-07, "loss": 0.1888, "step": 4797 }, { "epoch": 31.155844155844157, "grad_norm": 1.226015329360962, "learning_rate": 4.021790186331753e-07, "loss": 0.1767, "step": 4798 }, { "epoch": 31.162337662337663, "grad_norm": 1.3818334341049194, "learning_rate": 3.9821218545449956e-07, "loss": 0.225, "step": 4799 }, { "epoch": 31.16883116883117, "grad_norm": 1.3977967500686646, "learning_rate": 3.9426493427611177e-07, "loss": 0.2356, "step": 4800 }, { "epoch": 31.175324675324674, "grad_norm": 1.2903099060058594, "learning_rate": 3.9033726665632096e-07, "loss": 0.2012, "step": 4801 }, { "epoch": 31.181818181818183, "grad_norm": 1.3344894647598267, "learning_rate": 3.864291841457146e-07, "loss": 0.2128, "step": 4802 }, { "epoch": 31.18831168831169, "grad_norm": 1.300842523574829, "learning_rate": 3.8254068828713627e-07, "loss": 0.2039, "step": 4803 }, { "epoch": 31.194805194805195, "grad_norm": 1.76253080368042, "learning_rate": 3.7867178061571364e-07, "loss": 0.2529, "step": 4804 }, { "epoch": 31.2012987012987, "grad_norm": 1.4582929611206055, "learning_rate": 3.748224626588137e-07, "loss": 0.2649, "step": 4805 }, { "epoch": 31.207792207792206, "grad_norm": 1.21044921875, "learning_rate": 3.709927359360932e-07, "loss": 0.1872, "step": 4806 }, { "epoch": 31.214285714285715, "grad_norm": 1.3666410446166992, "learning_rate": 3.67182601959476e-07, "loss": 0.214, "step": 4807 }, { "epoch": 31.22077922077922, "grad_norm": 1.4099982976913452, "learning_rate": 3.6339206223313104e-07, "loss": 0.2329, "step": 4808 }, { "epoch": 31.227272727272727, "grad_norm": 1.4051538705825806, "learning_rate": 3.5962111825350586e-07, "loss": 0.2148, "step": 4809 }, { "epoch": 31.233766233766232, "grad_norm": 1.5412423610687256, "learning_rate": 3.558697715093207e-07, "loss": 0.25, "step": 4810 }, { "epoch": 31.24025974025974, "grad_norm": 1.4066494703292847, "learning_rate": 3.521380234815297e-07, "loss": 0.217, "step": 4811 }, { "epoch": 31.246753246753247, "grad_norm": 1.3286962509155273, "learning_rate": 3.484258756433767e-07, "loss": 0.2103, "step": 4812 }, { "epoch": 31.253246753246753, "grad_norm": 1.4361311197280884, "learning_rate": 3.447333294603616e-07, "loss": 0.2375, "step": 4813 }, { "epoch": 31.25974025974026, "grad_norm": 1.6048885583877563, "learning_rate": 3.410603863902406e-07, "loss": 0.2602, "step": 4814 }, { "epoch": 31.266233766233768, "grad_norm": 1.4700747728347778, "learning_rate": 3.3740704788303157e-07, "loss": 0.235, "step": 4815 }, { "epoch": 31.272727272727273, "grad_norm": 1.4577665328979492, "learning_rate": 3.3377331538101407e-07, "loss": 0.2598, "step": 4816 }, { "epoch": 31.27922077922078, "grad_norm": 1.4110027551651, "learning_rate": 3.301591903187351e-07, "loss": 0.2266, "step": 4817 }, { "epoch": 31.285714285714285, "grad_norm": 1.331971526145935, "learning_rate": 3.2656467412298664e-07, "loss": 0.2144, "step": 4818 }, { "epoch": 31.292207792207794, "grad_norm": 1.5475175380706787, "learning_rate": 3.22989768212828e-07, "loss": 0.2351, "step": 4819 }, { "epoch": 31.2987012987013, "grad_norm": 1.4858325719833374, "learning_rate": 3.1943447399958027e-07, "loss": 0.2341, "step": 4820 }, { "epoch": 31.305194805194805, "grad_norm": 1.4014004468917847, "learning_rate": 3.158987928868151e-07, "loss": 0.2293, "step": 4821 }, { "epoch": 31.31168831168831, "grad_norm": 1.5771843194961548, "learning_rate": 3.123827262703549e-07, "loss": 0.2349, "step": 4822 }, { "epoch": 31.318181818181817, "grad_norm": 1.5923490524291992, "learning_rate": 3.088862755383004e-07, "loss": 0.247, "step": 4823 }, { "epoch": 31.324675324675326, "grad_norm": 1.6026840209960938, "learning_rate": 3.054094420709863e-07, "loss": 0.2728, "step": 4824 }, { "epoch": 31.33116883116883, "grad_norm": 1.3792734146118164, "learning_rate": 3.019522272410202e-07, "loss": 0.2144, "step": 4825 }, { "epoch": 31.337662337662337, "grad_norm": 1.3602818250656128, "learning_rate": 2.985146324132437e-07, "loss": 0.2243, "step": 4826 }, { "epoch": 31.344155844155843, "grad_norm": 1.5236181020736694, "learning_rate": 2.9509665894476566e-07, "loss": 0.2331, "step": 4827 }, { "epoch": 31.350649350649352, "grad_norm": 1.4916106462478638, "learning_rate": 2.9169830818496225e-07, "loss": 0.2573, "step": 4828 }, { "epoch": 31.357142857142858, "grad_norm": 1.3621578216552734, "learning_rate": 2.8831958147543805e-07, "loss": 0.2462, "step": 4829 }, { "epoch": 31.363636363636363, "grad_norm": 1.2556349039077759, "learning_rate": 2.849604801500538e-07, "loss": 0.1778, "step": 4830 }, { "epoch": 31.37012987012987, "grad_norm": 1.4829384088516235, "learning_rate": 2.8162100553494884e-07, "loss": 0.2381, "step": 4831 }, { "epoch": 31.376623376623378, "grad_norm": 1.315341830253601, "learning_rate": 2.783011589484741e-07, "loss": 0.2076, "step": 4832 }, { "epoch": 31.383116883116884, "grad_norm": 1.3404558897018433, "learning_rate": 2.7500094170126444e-07, "loss": 0.1952, "step": 4833 }, { "epoch": 31.38961038961039, "grad_norm": 1.5623887777328491, "learning_rate": 2.717203550961944e-07, "loss": 0.2606, "step": 4834 }, { "epoch": 31.396103896103895, "grad_norm": 1.562747836112976, "learning_rate": 2.684594004283836e-07, "loss": 0.256, "step": 4835 }, { "epoch": 31.4025974025974, "grad_norm": 1.148888349533081, "learning_rate": 2.6521807898520213e-07, "loss": 0.177, "step": 4836 }, { "epoch": 31.40909090909091, "grad_norm": 1.5755507946014404, "learning_rate": 2.61996392046282e-07, "loss": 0.2467, "step": 4837 }, { "epoch": 31.415584415584416, "grad_norm": 1.3029296398162842, "learning_rate": 2.5879434088348366e-07, "loss": 0.2049, "step": 4838 }, { "epoch": 31.42207792207792, "grad_norm": 1.3719687461853027, "learning_rate": 2.556119267609347e-07, "loss": 0.2161, "step": 4839 }, { "epoch": 31.428571428571427, "grad_norm": 1.2144418954849243, "learning_rate": 2.5244915093499134e-07, "loss": 0.1754, "step": 4840 }, { "epoch": 31.435064935064936, "grad_norm": 1.399951696395874, "learning_rate": 2.493060146542825e-07, "loss": 0.2196, "step": 4841 }, { "epoch": 31.441558441558442, "grad_norm": 1.3518197536468506, "learning_rate": 2.4618251915964896e-07, "loss": 0.2049, "step": 4842 }, { "epoch": 31.448051948051948, "grad_norm": 1.4507516622543335, "learning_rate": 2.43078665684221e-07, "loss": 0.2254, "step": 4843 }, { "epoch": 31.454545454545453, "grad_norm": 1.432663083076477, "learning_rate": 2.399944554533295e-07, "loss": 0.2401, "step": 4844 }, { "epoch": 31.461038961038962, "grad_norm": 1.4511537551879883, "learning_rate": 2.3692988968458395e-07, "loss": 0.2428, "step": 4845 }, { "epoch": 31.467532467532468, "grad_norm": 1.5356429815292358, "learning_rate": 2.3388496958782202e-07, "loss": 0.2392, "step": 4846 }, { "epoch": 31.474025974025974, "grad_norm": 1.1476186513900757, "learning_rate": 2.3085969636513216e-07, "loss": 0.1762, "step": 4847 }, { "epoch": 31.48051948051948, "grad_norm": 1.3980841636657715, "learning_rate": 2.2785407121084235e-07, "loss": 0.2054, "step": 4848 }, { "epoch": 31.48701298701299, "grad_norm": 1.3446372747421265, "learning_rate": 2.2486809531152564e-07, "loss": 0.1852, "step": 4849 }, { "epoch": 31.493506493506494, "grad_norm": 1.5585228204727173, "learning_rate": 2.219017698460002e-07, "loss": 0.2423, "step": 4850 }, { "epoch": 31.5, "grad_norm": 1.2824307680130005, "learning_rate": 2.1895509598532372e-07, "loss": 0.1971, "step": 4851 }, { "epoch": 31.506493506493506, "grad_norm": 1.5058579444885254, "learning_rate": 2.160280748927934e-07, "loss": 0.2385, "step": 4852 }, { "epoch": 31.51298701298701, "grad_norm": 1.1482194662094116, "learning_rate": 2.1312070772395166e-07, "loss": 0.1762, "step": 4853 }, { "epoch": 31.51948051948052, "grad_norm": 1.5301028490066528, "learning_rate": 2.1023299562658583e-07, "loss": 0.2466, "step": 4854 }, { "epoch": 31.525974025974026, "grad_norm": 1.3849236965179443, "learning_rate": 2.0736493974071736e-07, "loss": 0.2383, "step": 4855 }, { "epoch": 31.532467532467532, "grad_norm": 1.3211804628372192, "learning_rate": 2.0451654119860165e-07, "loss": 0.2063, "step": 4856 }, { "epoch": 31.538961038961038, "grad_norm": 1.458785057067871, "learning_rate": 2.0168780112475026e-07, "loss": 0.24, "step": 4857 }, { "epoch": 31.545454545454547, "grad_norm": 1.304371953010559, "learning_rate": 1.988787206359033e-07, "loss": 0.2081, "step": 4858 }, { "epoch": 31.551948051948052, "grad_norm": 1.4084887504577637, "learning_rate": 1.9608930084104027e-07, "loss": 0.2424, "step": 4859 }, { "epoch": 31.558441558441558, "grad_norm": 1.3972254991531372, "learning_rate": 1.9331954284137476e-07, "loss": 0.2153, "step": 4860 }, { "epoch": 31.564935064935064, "grad_norm": 1.369728446006775, "learning_rate": 1.9056944773037655e-07, "loss": 0.2236, "step": 4861 }, { "epoch": 31.571428571428573, "grad_norm": 1.3551347255706787, "learning_rate": 1.878390165937216e-07, "loss": 0.2139, "step": 4862 }, { "epoch": 31.57792207792208, "grad_norm": 1.4082741737365723, "learning_rate": 1.8512825050935323e-07, "loss": 0.1973, "step": 4863 }, { "epoch": 31.584415584415584, "grad_norm": 1.441206455230713, "learning_rate": 1.8243715054744316e-07, "loss": 0.2249, "step": 4864 }, { "epoch": 31.59090909090909, "grad_norm": 1.2138317823410034, "learning_rate": 1.7976571777038044e-07, "loss": 0.2035, "step": 4865 }, { "epoch": 31.5974025974026, "grad_norm": 1.385532259941101, "learning_rate": 1.7711395323281587e-07, "loss": 0.2316, "step": 4866 }, { "epoch": 31.603896103896105, "grad_norm": 1.2439817190170288, "learning_rate": 1.7448185798161764e-07, "loss": 0.2028, "step": 4867 }, { "epoch": 31.61038961038961, "grad_norm": 1.3761063814163208, "learning_rate": 1.7186943305589898e-07, "loss": 0.2083, "step": 4868 }, { "epoch": 31.616883116883116, "grad_norm": 1.4757968187332153, "learning_rate": 1.6927667948700155e-07, "loss": 0.2181, "step": 4869 }, { "epoch": 31.623376623376622, "grad_norm": 1.2799804210662842, "learning_rate": 1.6670359829850657e-07, "loss": 0.2032, "step": 4870 }, { "epoch": 31.62987012987013, "grad_norm": 1.4994889497756958, "learning_rate": 1.641501905062237e-07, "loss": 0.2366, "step": 4871 }, { "epoch": 31.636363636363637, "grad_norm": 1.5027410984039307, "learning_rate": 1.6161645711819662e-07, "loss": 0.2509, "step": 4872 }, { "epoch": 31.642857142857142, "grad_norm": 1.6694434881210327, "learning_rate": 1.5910239913470292e-07, "loss": 0.2724, "step": 4873 }, { "epoch": 31.649350649350648, "grad_norm": 1.2930212020874023, "learning_rate": 1.5660801754825981e-07, "loss": 0.1978, "step": 4874 }, { "epoch": 31.655844155844157, "grad_norm": 1.4607458114624023, "learning_rate": 1.5413331334360182e-07, "loss": 0.2112, "step": 4875 }, { "epoch": 31.662337662337663, "grad_norm": 1.4452189207077026, "learning_rate": 1.516782874977085e-07, "loss": 0.2296, "step": 4876 }, { "epoch": 31.66883116883117, "grad_norm": 1.4618563652038574, "learning_rate": 1.4924294097977686e-07, "loss": 0.2181, "step": 4877 }, { "epoch": 31.675324675324674, "grad_norm": 1.3591219186782837, "learning_rate": 1.468272747512489e-07, "loss": 0.2116, "step": 4878 }, { "epoch": 31.681818181818183, "grad_norm": 1.4753000736236572, "learning_rate": 1.4443128976579512e-07, "loss": 0.2272, "step": 4879 }, { "epoch": 31.68831168831169, "grad_norm": 1.435119867324829, "learning_rate": 1.420549869693033e-07, "loss": 0.2215, "step": 4880 }, { "epoch": 31.694805194805195, "grad_norm": 1.5033072233200073, "learning_rate": 1.3969836729990638e-07, "loss": 0.251, "step": 4881 }, { "epoch": 31.7012987012987, "grad_norm": 1.28299081325531, "learning_rate": 1.3736143168796012e-07, "loss": 0.2033, "step": 4882 }, { "epoch": 31.707792207792206, "grad_norm": 1.4719812870025635, "learning_rate": 1.3504418105604878e-07, "loss": 0.2438, "step": 4883 }, { "epoch": 31.714285714285715, "grad_norm": 1.3661195039749146, "learning_rate": 1.3274661631899054e-07, "loss": 0.2304, "step": 4884 }, { "epoch": 31.72077922077922, "grad_norm": 1.3205084800720215, "learning_rate": 1.3046873838381546e-07, "loss": 0.2096, "step": 4885 }, { "epoch": 31.727272727272727, "grad_norm": 1.3279850482940674, "learning_rate": 1.282105481498097e-07, "loss": 0.2267, "step": 4886 }, { "epoch": 31.733766233766232, "grad_norm": 1.449994683265686, "learning_rate": 1.2597204650845463e-07, "loss": 0.2211, "step": 4887 }, { "epoch": 31.74025974025974, "grad_norm": 1.6140695810317993, "learning_rate": 1.2375323434348773e-07, "loss": 0.2366, "step": 4888 }, { "epoch": 31.746753246753247, "grad_norm": 1.4511138200759888, "learning_rate": 1.2155411253085835e-07, "loss": 0.2424, "step": 4889 }, { "epoch": 31.753246753246753, "grad_norm": 1.468500018119812, "learning_rate": 1.193746819387387e-07, "loss": 0.2285, "step": 4890 }, { "epoch": 31.75974025974026, "grad_norm": 1.163522720336914, "learning_rate": 1.1721494342754048e-07, "loss": 0.1957, "step": 4891 }, { "epoch": 31.766233766233768, "grad_norm": 1.4025077819824219, "learning_rate": 1.1507489784989278e-07, "loss": 0.2193, "step": 4892 }, { "epoch": 31.772727272727273, "grad_norm": 1.3328723907470703, "learning_rate": 1.129545460506476e-07, "loss": 0.2129, "step": 4893 }, { "epoch": 31.77922077922078, "grad_norm": 1.572609782218933, "learning_rate": 1.1085388886689085e-07, "loss": 0.2756, "step": 4894 }, { "epoch": 31.785714285714285, "grad_norm": 1.3879311084747314, "learning_rate": 1.0877292712792585e-07, "loss": 0.2316, "step": 4895 }, { "epoch": 31.792207792207794, "grad_norm": 1.4279173612594604, "learning_rate": 1.067116616552899e-07, "loss": 0.2202, "step": 4896 }, { "epoch": 31.7987012987013, "grad_norm": 1.4845510721206665, "learning_rate": 1.0467009326272648e-07, "loss": 0.205, "step": 4897 }, { "epoch": 31.805194805194805, "grad_norm": 1.3270889520645142, "learning_rate": 1.026482227562242e-07, "loss": 0.1823, "step": 4898 }, { "epoch": 31.81168831168831, "grad_norm": 1.3610776662826538, "learning_rate": 1.0064605093397794e-07, "loss": 0.2092, "step": 4899 }, { "epoch": 31.818181818181817, "grad_norm": 1.4318331480026245, "learning_rate": 9.866357858642205e-08, "loss": 0.2369, "step": 4900 }, { "epoch": 31.824675324675326, "grad_norm": 1.2095530033111572, "learning_rate": 9.670080649619717e-08, "loss": 0.1779, "step": 4901 }, { "epoch": 31.83116883116883, "grad_norm": 1.463456630706787, "learning_rate": 9.475773543818344e-08, "loss": 0.2293, "step": 4902 }, { "epoch": 31.837662337662337, "grad_norm": 1.2570163011550903, "learning_rate": 9.283436617946173e-08, "loss": 0.1899, "step": 4903 }, { "epoch": 31.844155844155843, "grad_norm": 1.3969603776931763, "learning_rate": 9.09306994793635e-08, "loss": 0.2263, "step": 4904 }, { "epoch": 31.850649350649352, "grad_norm": 1.4223355054855347, "learning_rate": 8.904673608940983e-08, "loss": 0.2233, "step": 4905 }, { "epoch": 31.857142857142858, "grad_norm": 1.2923760414123535, "learning_rate": 8.718247675337244e-08, "loss": 0.194, "step": 4906 }, { "epoch": 31.863636363636363, "grad_norm": 1.602528691291809, "learning_rate": 8.53379222072237e-08, "loss": 0.2611, "step": 4907 }, { "epoch": 31.87012987012987, "grad_norm": 1.3463027477264404, "learning_rate": 8.351307317917001e-08, "loss": 0.2122, "step": 4908 }, { "epoch": 31.876623376623378, "grad_norm": 1.4832775592803955, "learning_rate": 8.170793038963509e-08, "loss": 0.252, "step": 4909 }, { "epoch": 31.883116883116884, "grad_norm": 1.3249467611312866, "learning_rate": 7.99224945512489e-08, "loss": 0.221, "step": 4910 }, { "epoch": 31.88961038961039, "grad_norm": 1.21480131149292, "learning_rate": 7.815676636888092e-08, "loss": 0.2021, "step": 4911 }, { "epoch": 31.896103896103895, "grad_norm": 1.4285305738449097, "learning_rate": 7.641074653961244e-08, "loss": 0.2492, "step": 4912 }, { "epoch": 31.9025974025974, "grad_norm": 1.4393455982208252, "learning_rate": 7.468443575274764e-08, "loss": 0.2354, "step": 4913 }, { "epoch": 31.90909090909091, "grad_norm": 1.4678078889846802, "learning_rate": 7.297783468980246e-08, "loss": 0.2133, "step": 4914 }, { "epoch": 31.915584415584416, "grad_norm": 1.384844422340393, "learning_rate": 7.129094402451575e-08, "loss": 0.2234, "step": 4915 }, { "epoch": 31.92207792207792, "grad_norm": 1.4255553483963013, "learning_rate": 6.962376442284368e-08, "loss": 0.2188, "step": 4916 }, { "epoch": 31.928571428571427, "grad_norm": 1.3805224895477295, "learning_rate": 6.797629654296534e-08, "loss": 0.2333, "step": 4917 }, { "epoch": 31.935064935064936, "grad_norm": 1.3608981370925903, "learning_rate": 6.63485410352771e-08, "loss": 0.2142, "step": 4918 }, { "epoch": 31.941558441558442, "grad_norm": 1.5125210285186768, "learning_rate": 6.474049854238717e-08, "loss": 0.2543, "step": 4919 }, { "epoch": 31.948051948051948, "grad_norm": 1.4834202527999878, "learning_rate": 6.315216969912663e-08, "loss": 0.2548, "step": 4920 }, { "epoch": 31.954545454545453, "grad_norm": 1.5085341930389404, "learning_rate": 6.158355513254388e-08, "loss": 0.2376, "step": 4921 }, { "epoch": 31.961038961038962, "grad_norm": 1.5183302164077759, "learning_rate": 6.003465546189358e-08, "loss": 0.2256, "step": 4922 }, { "epoch": 31.967532467532468, "grad_norm": 1.2665663957595825, "learning_rate": 5.850547129867545e-08, "loss": 0.1964, "step": 4923 }, { "epoch": 31.974025974025974, "grad_norm": 1.3507740497589111, "learning_rate": 5.6996003246573283e-08, "loss": 0.2296, "step": 4924 }, { "epoch": 31.98051948051948, "grad_norm": 1.4157506227493286, "learning_rate": 5.550625190150483e-08, "loss": 0.2283, "step": 4925 }, { "epoch": 31.98701298701299, "grad_norm": 1.5225261449813843, "learning_rate": 5.4036217851594075e-08, "loss": 0.2339, "step": 4926 }, { "epoch": 31.993506493506494, "grad_norm": 1.3280245065689087, "learning_rate": 5.258590167719901e-08, "loss": 0.1991, "step": 4927 }, { "epoch": 32.0, "grad_norm": 1743.5234375, "learning_rate": 5.115530395087276e-08, "loss": 0.2862, "step": 4928 }, { "epoch": 32.006493506493506, "grad_norm": 1.511450171470642, "learning_rate": 4.9744425237396864e-08, "loss": 0.2637, "step": 4929 }, { "epoch": 32.01298701298701, "grad_norm": 1.445878267288208, "learning_rate": 4.835326609376467e-08, "loss": 0.2284, "step": 4930 }, { "epoch": 32.01948051948052, "grad_norm": 1.5547192096710205, "learning_rate": 4.698182706918131e-08, "loss": 0.27, "step": 4931 }, { "epoch": 32.02597402597402, "grad_norm": 1.3976191282272339, "learning_rate": 4.563010870506368e-08, "loss": 0.2337, "step": 4932 }, { "epoch": 32.032467532467535, "grad_norm": 1.529395341873169, "learning_rate": 4.4298111535057143e-08, "loss": 0.256, "step": 4933 }, { "epoch": 32.03896103896104, "grad_norm": 1.3919570446014404, "learning_rate": 4.2985836085013275e-08, "loss": 0.2207, "step": 4934 }, { "epoch": 32.04545454545455, "grad_norm": 1.3812589645385742, "learning_rate": 4.169328287299545e-08, "loss": 0.2127, "step": 4935 }, { "epoch": 32.05194805194805, "grad_norm": 1.3727024793624878, "learning_rate": 4.042045240927883e-08, "loss": 0.2172, "step": 4936 }, { "epoch": 32.05844155844156, "grad_norm": 1.4353840351104736, "learning_rate": 3.916734519636145e-08, "loss": 0.2555, "step": 4937 }, { "epoch": 32.064935064935064, "grad_norm": 1.4604127407073975, "learning_rate": 3.793396172895314e-08, "loss": 0.2309, "step": 4938 }, { "epoch": 32.07142857142857, "grad_norm": 1.415607213973999, "learning_rate": 3.6720302493964407e-08, "loss": 0.23, "step": 4939 }, { "epoch": 32.077922077922075, "grad_norm": 1.5463145971298218, "learning_rate": 3.5526367970539765e-08, "loss": 0.2513, "step": 4940 }, { "epoch": 32.08441558441559, "grad_norm": 1.4481228590011597, "learning_rate": 3.435215863001884e-08, "loss": 0.221, "step": 4941 }, { "epoch": 32.09090909090909, "grad_norm": 1.2926489114761353, "learning_rate": 3.31976749359586e-08, "loss": 0.1986, "step": 4942 }, { "epoch": 32.0974025974026, "grad_norm": 1.549300193786621, "learning_rate": 3.206291734413891e-08, "loss": 0.2416, "step": 4943 }, { "epoch": 32.103896103896105, "grad_norm": 1.3677971363067627, "learning_rate": 3.094788630254031e-08, "loss": 0.2296, "step": 4944 }, { "epoch": 32.11038961038961, "grad_norm": 1.3318811655044556, "learning_rate": 2.985258225135512e-08, "loss": 0.2222, "step": 4945 }, { "epoch": 32.116883116883116, "grad_norm": 1.4566057920455933, "learning_rate": 2.8777005622998564e-08, "loss": 0.2342, "step": 4946 }, { "epoch": 32.12337662337662, "grad_norm": 1.1788028478622437, "learning_rate": 2.772115684209209e-08, "loss": 0.1866, "step": 4947 }, { "epoch": 32.12987012987013, "grad_norm": 1.322718858718872, "learning_rate": 2.6685036325457824e-08, "loss": 0.2167, "step": 4948 }, { "epoch": 32.13636363636363, "grad_norm": 1.3604342937469482, "learning_rate": 2.5668644482151895e-08, "loss": 0.1996, "step": 4949 }, { "epoch": 32.142857142857146, "grad_norm": 1.3962228298187256, "learning_rate": 2.467198171342e-08, "loss": 0.2153, "step": 4950 }, { "epoch": 32.14935064935065, "grad_norm": 1.388566017150879, "learning_rate": 2.3695048412736285e-08, "loss": 0.2058, "step": 4951 }, { "epoch": 32.15584415584416, "grad_norm": 1.5011347532272339, "learning_rate": 2.273784496577558e-08, "loss": 0.2285, "step": 4952 }, { "epoch": 32.16233766233766, "grad_norm": 1.313040852546692, "learning_rate": 2.1800371750430037e-08, "loss": 0.2114, "step": 4953 }, { "epoch": 32.16883116883117, "grad_norm": 1.3303238153457642, "learning_rate": 2.088262913679251e-08, "loss": 0.2329, "step": 4954 }, { "epoch": 32.175324675324674, "grad_norm": 1.1089547872543335, "learning_rate": 1.9984617487173174e-08, "loss": 0.1526, "step": 4955 }, { "epoch": 32.18181818181818, "grad_norm": 1.4295611381530762, "learning_rate": 1.910633715609955e-08, "loss": 0.2257, "step": 4956 }, { "epoch": 32.188311688311686, "grad_norm": 1.2369484901428223, "learning_rate": 1.8247788490299844e-08, "loss": 0.1771, "step": 4957 }, { "epoch": 32.1948051948052, "grad_norm": 1.599870204925537, "learning_rate": 1.7408971828714038e-08, "loss": 0.2658, "step": 4958 }, { "epoch": 32.201298701298704, "grad_norm": 1.4956282377243042, "learning_rate": 1.6589887502493907e-08, "loss": 0.2283, "step": 4959 }, { "epoch": 32.20779220779221, "grad_norm": 1.2589012384414673, "learning_rate": 1.5790535835003008e-08, "loss": 0.1999, "step": 4960 }, { "epoch": 32.214285714285715, "grad_norm": 1.191191554069519, "learning_rate": 1.501091714181113e-08, "loss": 0.1906, "step": 4961 }, { "epoch": 32.22077922077922, "grad_norm": 1.5597410202026367, "learning_rate": 1.425103173069986e-08, "loss": 0.2247, "step": 4962 }, { "epoch": 32.22727272727273, "grad_norm": 1.5198577642440796, "learning_rate": 1.3510879901657002e-08, "loss": 0.2283, "step": 4963 }, { "epoch": 32.23376623376623, "grad_norm": 1.2728452682495117, "learning_rate": 1.2790461946887711e-08, "loss": 0.2018, "step": 4964 }, { "epoch": 32.24025974025974, "grad_norm": 1.4071130752563477, "learning_rate": 1.2089778150797815e-08, "loss": 0.2046, "step": 4965 }, { "epoch": 32.246753246753244, "grad_norm": 1.5618197917938232, "learning_rate": 1.1408828790010484e-08, "loss": 0.2427, "step": 4966 }, { "epoch": 32.253246753246756, "grad_norm": 1.270912766456604, "learning_rate": 1.074761413334957e-08, "loss": 0.1943, "step": 4967 }, { "epoch": 32.25974025974026, "grad_norm": 1.4742865562438965, "learning_rate": 1.0106134441850711e-08, "loss": 0.2619, "step": 4968 }, { "epoch": 32.26623376623377, "grad_norm": 1.4048014879226685, "learning_rate": 9.484389968766882e-09, "loss": 0.2179, "step": 4969 }, { "epoch": 32.27272727272727, "grad_norm": 1.4007841348648071, "learning_rate": 8.882380959551739e-09, "loss": 0.2157, "step": 4970 }, { "epoch": 32.27922077922078, "grad_norm": 1.3311463594436646, "learning_rate": 8.300107651859623e-09, "loss": 0.2035, "step": 4971 }, { "epoch": 32.285714285714285, "grad_norm": 1.3603016138076782, "learning_rate": 7.737570275573313e-09, "loss": 0.2044, "step": 4972 }, { "epoch": 32.29220779220779, "grad_norm": 1.4253227710723877, "learning_rate": 7.194769052765171e-09, "loss": 0.2119, "step": 4973 }, { "epoch": 32.298701298701296, "grad_norm": 1.522019624710083, "learning_rate": 6.671704197735995e-09, "loss": 0.2564, "step": 4974 }, { "epoch": 32.3051948051948, "grad_norm": 1.505655288696289, "learning_rate": 6.1683759169706146e-09, "loss": 0.2319, "step": 4975 }, { "epoch": 32.311688311688314, "grad_norm": 1.3795942068099976, "learning_rate": 5.684784409182298e-09, "loss": 0.2096, "step": 4976 }, { "epoch": 32.31818181818182, "grad_norm": 1.4211721420288086, "learning_rate": 5.220929865284996e-09, "loss": 0.2039, "step": 4977 }, { "epoch": 32.324675324675326, "grad_norm": 1.4269096851348877, "learning_rate": 4.776812468398895e-09, "loss": 0.2144, "step": 4978 }, { "epoch": 32.33116883116883, "grad_norm": 1.3960460424423218, "learning_rate": 4.352432393855965e-09, "loss": 0.2339, "step": 4979 }, { "epoch": 32.33766233766234, "grad_norm": 1.3636562824249268, "learning_rate": 3.947789809194414e-09, "loss": 0.2468, "step": 4980 }, { "epoch": 32.34415584415584, "grad_norm": 1.2369745969772339, "learning_rate": 3.5628848741586786e-09, "loss": 0.1807, "step": 4981 }, { "epoch": 32.35064935064935, "grad_norm": 1.5180789232254028, "learning_rate": 3.1977177407105372e-09, "loss": 0.2392, "step": 4982 }, { "epoch": 32.357142857142854, "grad_norm": 1.3790357112884521, "learning_rate": 2.8522885530013478e-09, "loss": 0.2329, "step": 4983 }, { "epoch": 32.36363636363637, "grad_norm": 1.2963364124298096, "learning_rate": 2.5265974474109056e-09, "loss": 0.2098, "step": 4984 }, { "epoch": 32.37012987012987, "grad_norm": 1.5217865705490112, "learning_rate": 2.2206445525085883e-09, "loss": 0.2461, "step": 4985 }, { "epoch": 32.37662337662338, "grad_norm": 1.3449853658676147, "learning_rate": 1.934429989086661e-09, "loss": 0.2107, "step": 4986 }, { "epoch": 32.383116883116884, "grad_norm": 1.436370849609375, "learning_rate": 1.6679538701325215e-09, "loss": 0.2223, "step": 4987 }, { "epoch": 32.38961038961039, "grad_norm": 1.5517100095748901, "learning_rate": 1.4212163008509027e-09, "loss": 0.2565, "step": 4988 }, { "epoch": 32.396103896103895, "grad_norm": 1.4848861694335938, "learning_rate": 1.1942173786527734e-09, "loss": 0.2274, "step": 4989 }, { "epoch": 32.4025974025974, "grad_norm": 1.5431606769561768, "learning_rate": 9.869571931442334e-10, "loss": 0.2457, "step": 4990 }, { "epoch": 32.40909090909091, "grad_norm": 1.2045327425003052, "learning_rate": 7.994358261542712e-10, "loss": 0.1994, "step": 4991 }, { "epoch": 32.41558441558441, "grad_norm": 1.2429924011230469, "learning_rate": 6.316533517125578e-10, "loss": 0.2063, "step": 4992 }, { "epoch": 32.422077922077925, "grad_norm": 1.2943520545959473, "learning_rate": 4.83609836054999e-10, "loss": 0.21, "step": 4993 }, { "epoch": 32.42857142857143, "grad_norm": 1.496764063835144, "learning_rate": 3.5530533763483695e-10, "loss": 0.2549, "step": 4994 }, { "epoch": 32.435064935064936, "grad_norm": 1.4615414142608643, "learning_rate": 2.4673990708934393e-10, "loss": 0.2347, "step": 4995 }, { "epoch": 32.44155844155844, "grad_norm": 1.5383694171905518, "learning_rate": 1.57913587295333e-10, "loss": 0.2497, "step": 4996 }, { "epoch": 32.44805194805195, "grad_norm": 1.3649652004241943, "learning_rate": 8.882641330809626e-11, "loss": 0.2061, "step": 4997 }, { "epoch": 32.45454545454545, "grad_norm": 1.3758985996246338, "learning_rate": 3.9478412411364515e-11, "loss": 0.2243, "step": 4998 }, { "epoch": 32.46103896103896, "grad_norm": 1.3766566514968872, "learning_rate": 9.869604078449612e-12, "loss": 0.2276, "step": 4999 }, { "epoch": 32.467532467532465, "grad_norm": 1.3104761838912964, "learning_rate": 0.0, "loss": 0.218, "step": 5000 } ], "logging_steps": 1, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 33, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3878204530176000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }