diff --git "a/pft/trainer_state.json" "b/pft/trainer_state.json" new file mode 100644--- /dev/null +++ "b/pft/trainer_state.json" @@ -0,0 +1,51674 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999322171761675, + "eval_steps": 500, + "global_step": 7376, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00013556564766488172, + "grad_norm": 24.01125176394445, + "learning_rate": 9.009009009009009e-09, + "loss": 2.2843, + "step": 1 + }, + { + "epoch": 0.00027113129532976344, + "grad_norm": 27.137206284884098, + "learning_rate": 1.8018018018018017e-08, + "loss": 2.2688, + "step": 2 + }, + { + "epoch": 0.00040669694299464516, + "grad_norm": 34.16040946460364, + "learning_rate": 2.7027027027027028e-08, + "loss": 2.226, + "step": 3 + }, + { + "epoch": 0.0005422625906595269, + "grad_norm": 23.041414640295088, + "learning_rate": 3.6036036036036035e-08, + "loss": 2.259, + "step": 4 + }, + { + "epoch": 0.0006778282383244086, + "grad_norm": 35.76648158675801, + "learning_rate": 4.504504504504504e-08, + "loss": 2.261, + "step": 5 + }, + { + "epoch": 0.0008133938859892903, + "grad_norm": 23.793048352504638, + "learning_rate": 5.4054054054054056e-08, + "loss": 2.303, + "step": 6 + }, + { + "epoch": 0.000948959533654172, + "grad_norm": 22.406009774776706, + "learning_rate": 6.306306306306305e-08, + "loss": 2.2587, + "step": 7 + }, + { + "epoch": 0.0010845251813190538, + "grad_norm": 24.790605494588608, + "learning_rate": 7.207207207207207e-08, + "loss": 2.2673, + "step": 8 + }, + { + "epoch": 0.0012200908289839354, + "grad_norm": 23.11765029125426, + "learning_rate": 8.108108108108108e-08, + "loss": 2.3013, + "step": 9 + }, + { + "epoch": 0.0013556564766488172, + "grad_norm": 23.860144064651642, + "learning_rate": 9.009009009009008e-08, + "loss": 2.2535, + "step": 10 + }, + { + "epoch": 0.0014912221243136988, + "grad_norm": 21.27813010690102, + "learning_rate": 9.909909909909909e-08, + "loss": 2.2516, + "step": 11 + }, + { + "epoch": 0.0016267877719785807, + "grad_norm": 24.298444248027746, + "learning_rate": 1.0810810810810811e-07, + "loss": 2.3147, + "step": 12 + }, + { + "epoch": 0.0017623534196434623, + "grad_norm": 24.62640787232673, + "learning_rate": 1.171171171171171e-07, + "loss": 2.2336, + "step": 13 + }, + { + "epoch": 0.001897919067308344, + "grad_norm": 21.205643408404896, + "learning_rate": 1.261261261261261e-07, + "loss": 2.2735, + "step": 14 + }, + { + "epoch": 0.002033484714973226, + "grad_norm": 24.148365328131433, + "learning_rate": 1.3513513513513515e-07, + "loss": 2.3197, + "step": 15 + }, + { + "epoch": 0.0021690503626381075, + "grad_norm": 65.48312734536941, + "learning_rate": 1.4414414414414414e-07, + "loss": 2.2656, + "step": 16 + }, + { + "epoch": 0.002304616010302989, + "grad_norm": 26.64491409959724, + "learning_rate": 1.5315315315315313e-07, + "loss": 2.2645, + "step": 17 + }, + { + "epoch": 0.0024401816579678708, + "grad_norm": 25.32834477081852, + "learning_rate": 1.6216216216216215e-07, + "loss": 2.2704, + "step": 18 + }, + { + "epoch": 0.002575747305632753, + "grad_norm": 21.741745975502226, + "learning_rate": 1.7117117117117117e-07, + "loss": 2.2487, + "step": 19 + }, + { + "epoch": 0.0027113129532976344, + "grad_norm": 24.90418662219934, + "learning_rate": 1.8018018018018017e-07, + "loss": 2.2524, + "step": 20 + }, + { + "epoch": 0.002846878600962516, + "grad_norm": 21.914489208817287, + "learning_rate": 1.891891891891892e-07, + "loss": 2.2463, + "step": 21 
+ }, + { + "epoch": 0.0029824442486273976, + "grad_norm": 21.736438294974157, + "learning_rate": 1.9819819819819818e-07, + "loss": 2.2364, + "step": 22 + }, + { + "epoch": 0.0031180098962922797, + "grad_norm": 21.033330604470763, + "learning_rate": 2.072072072072072e-07, + "loss": 2.2604, + "step": 23 + }, + { + "epoch": 0.0032535755439571613, + "grad_norm": 23.170892369159443, + "learning_rate": 2.1621621621621622e-07, + "loss": 2.2427, + "step": 24 + }, + { + "epoch": 0.003389141191622043, + "grad_norm": 72.34972833773223, + "learning_rate": 2.2522522522522522e-07, + "loss": 2.2719, + "step": 25 + }, + { + "epoch": 0.0035247068392869245, + "grad_norm": 24.39608646774881, + "learning_rate": 2.342342342342342e-07, + "loss": 2.2872, + "step": 26 + }, + { + "epoch": 0.0036602724869518066, + "grad_norm": 22.73633857232502, + "learning_rate": 2.4324324324324326e-07, + "loss": 2.2433, + "step": 27 + }, + { + "epoch": 0.003795838134616688, + "grad_norm": 26.05715673510671, + "learning_rate": 2.522522522522522e-07, + "loss": 2.2355, + "step": 28 + }, + { + "epoch": 0.00393140378228157, + "grad_norm": 21.24663540193212, + "learning_rate": 2.6126126126126124e-07, + "loss": 2.283, + "step": 29 + }, + { + "epoch": 0.004066969429946452, + "grad_norm": 24.382687222633567, + "learning_rate": 2.702702702702703e-07, + "loss": 2.2606, + "step": 30 + }, + { + "epoch": 0.0042025350776113335, + "grad_norm": 19.47123060316013, + "learning_rate": 2.7927927927927923e-07, + "loss": 2.2016, + "step": 31 + }, + { + "epoch": 0.004338100725276215, + "grad_norm": 19.209290329542796, + "learning_rate": 2.882882882882883e-07, + "loss": 2.2062, + "step": 32 + }, + { + "epoch": 0.004473666372941097, + "grad_norm": 19.87352435371915, + "learning_rate": 2.972972972972973e-07, + "loss": 2.1751, + "step": 33 + }, + { + "epoch": 0.004609232020605978, + "grad_norm": 24.355594514020556, + "learning_rate": 3.0630630630630627e-07, + "loss": 2.1891, + "step": 34 + }, + { + "epoch": 0.00474479766827086, + "grad_norm": 17.603159845539142, + "learning_rate": 3.153153153153153e-07, + "loss": 2.1484, + "step": 35 + }, + { + "epoch": 0.0048803633159357415, + "grad_norm": 25.802652096559957, + "learning_rate": 3.243243243243243e-07, + "loss": 2.1932, + "step": 36 + }, + { + "epoch": 0.005015928963600624, + "grad_norm": 18.874346321837198, + "learning_rate": 3.333333333333333e-07, + "loss": 2.1933, + "step": 37 + }, + { + "epoch": 0.005151494611265506, + "grad_norm": 18.123751439648085, + "learning_rate": 3.4234234234234235e-07, + "loss": 2.2033, + "step": 38 + }, + { + "epoch": 0.005287060258930387, + "grad_norm": 18.617829450223937, + "learning_rate": 3.5135135135135134e-07, + "loss": 2.2274, + "step": 39 + }, + { + "epoch": 0.005422625906595269, + "grad_norm": 72.7934496519844, + "learning_rate": 3.6036036036036033e-07, + "loss": 2.167, + "step": 40 + }, + { + "epoch": 0.0055581915542601504, + "grad_norm": 17.56929602123319, + "learning_rate": 3.6936936936936933e-07, + "loss": 2.2286, + "step": 41 + }, + { + "epoch": 0.005693757201925032, + "grad_norm": 27.90141611533808, + "learning_rate": 3.783783783783784e-07, + "loss": 2.1981, + "step": 42 + }, + { + "epoch": 0.005829322849589914, + "grad_norm": 19.357835927514877, + "learning_rate": 3.8738738738738737e-07, + "loss": 2.1085, + "step": 43 + }, + { + "epoch": 0.005964888497254795, + "grad_norm": 14.520291527787796, + "learning_rate": 3.9639639639639636e-07, + "loss": 2.0406, + "step": 44 + }, + { + "epoch": 0.006100454144919678, + "grad_norm": 15.457213377398524, + "learning_rate": 
4.054054054054054e-07, + "loss": 2.0425, + "step": 45 + }, + { + "epoch": 0.006236019792584559, + "grad_norm": 16.742927911309263, + "learning_rate": 4.144144144144144e-07, + "loss": 1.9975, + "step": 46 + }, + { + "epoch": 0.006371585440249441, + "grad_norm": 13.3170519770282, + "learning_rate": 4.234234234234234e-07, + "loss": 2.0186, + "step": 47 + }, + { + "epoch": 0.006507151087914323, + "grad_norm": 20.559143434975574, + "learning_rate": 4.3243243243243244e-07, + "loss": 2.0225, + "step": 48 + }, + { + "epoch": 0.006642716735579204, + "grad_norm": 25.105346000098887, + "learning_rate": 4.414414414414414e-07, + "loss": 2.0111, + "step": 49 + }, + { + "epoch": 0.006778282383244086, + "grad_norm": 16.778841876262163, + "learning_rate": 4.5045045045045043e-07, + "loss": 2.0322, + "step": 50 + }, + { + "epoch": 0.0069138480309089674, + "grad_norm": 14.749320817231402, + "learning_rate": 4.594594594594595e-07, + "loss": 2.0035, + "step": 51 + }, + { + "epoch": 0.007049413678573849, + "grad_norm": 18.853480688401966, + "learning_rate": 4.684684684684684e-07, + "loss": 1.9946, + "step": 52 + }, + { + "epoch": 0.0071849793262387315, + "grad_norm": 14.089638696156065, + "learning_rate": 4.774774774774775e-07, + "loss": 2.0095, + "step": 53 + }, + { + "epoch": 0.007320544973903613, + "grad_norm": 16.658578053739742, + "learning_rate": 4.864864864864865e-07, + "loss": 2.0074, + "step": 54 + }, + { + "epoch": 0.007456110621568495, + "grad_norm": 131.2740434419637, + "learning_rate": 4.954954954954955e-07, + "loss": 1.9902, + "step": 55 + }, + { + "epoch": 0.007591676269233376, + "grad_norm": 12.959425189597502, + "learning_rate": 5.045045045045044e-07, + "loss": 2.0016, + "step": 56 + }, + { + "epoch": 0.007727241916898258, + "grad_norm": 11.904745441907268, + "learning_rate": 5.135135135135134e-07, + "loss": 1.9575, + "step": 57 + }, + { + "epoch": 0.00786280756456314, + "grad_norm": 14.538530384970057, + "learning_rate": 5.225225225225225e-07, + "loss": 1.9443, + "step": 58 + }, + { + "epoch": 0.007998373212228021, + "grad_norm": 36.482046151787955, + "learning_rate": 5.315315315315315e-07, + "loss": 1.943, + "step": 59 + }, + { + "epoch": 0.008133938859892904, + "grad_norm": 12.49546245571636, + "learning_rate": 5.405405405405406e-07, + "loss": 1.929, + "step": 60 + }, + { + "epoch": 0.008269504507557784, + "grad_norm": 9.627990348143992, + "learning_rate": 5.495495495495495e-07, + "loss": 1.875, + "step": 61 + }, + { + "epoch": 0.008405070155222667, + "grad_norm": 18.77479728563107, + "learning_rate": 5.585585585585585e-07, + "loss": 1.8573, + "step": 62 + }, + { + "epoch": 0.008540635802887548, + "grad_norm": 8.127064213560146, + "learning_rate": 5.675675675675675e-07, + "loss": 1.8018, + "step": 63 + }, + { + "epoch": 0.00867620145055243, + "grad_norm": 9.440020649542799, + "learning_rate": 5.765765765765766e-07, + "loss": 1.7906, + "step": 64 + }, + { + "epoch": 0.008811767098217311, + "grad_norm": 6.771958379170247, + "learning_rate": 5.855855855855856e-07, + "loss": 1.7934, + "step": 65 + }, + { + "epoch": 0.008947332745882193, + "grad_norm": 6.199974983973236, + "learning_rate": 5.945945945945947e-07, + "loss": 1.7367, + "step": 66 + }, + { + "epoch": 0.009082898393547076, + "grad_norm": 6.619897894626267, + "learning_rate": 6.036036036036036e-07, + "loss": 1.7407, + "step": 67 + }, + { + "epoch": 0.009218464041211957, + "grad_norm": 6.745392419812889, + "learning_rate": 6.126126126126125e-07, + "loss": 1.7284, + "step": 68 + }, + { + "epoch": 0.009354029688876839, + "grad_norm": 
6.830663434952967, + "learning_rate": 6.216216216216216e-07, + "loss": 1.7505, + "step": 69 + }, + { + "epoch": 0.00948959533654172, + "grad_norm": 6.406896633153945, + "learning_rate": 6.306306306306306e-07, + "loss": 1.6682, + "step": 70 + }, + { + "epoch": 0.009625160984206602, + "grad_norm": 10.172140767329372, + "learning_rate": 6.396396396396397e-07, + "loss": 1.7478, + "step": 71 + }, + { + "epoch": 0.009760726631871483, + "grad_norm": 6.392964294314573, + "learning_rate": 6.486486486486486e-07, + "loss": 1.7038, + "step": 72 + }, + { + "epoch": 0.009896292279536366, + "grad_norm": 5.363629099651644, + "learning_rate": 6.576576576576577e-07, + "loss": 1.7295, + "step": 73 + }, + { + "epoch": 0.010031857927201248, + "grad_norm": 5.215695606240297, + "learning_rate": 6.666666666666666e-07, + "loss": 1.6904, + "step": 74 + }, + { + "epoch": 0.010167423574866129, + "grad_norm": 5.240603707255642, + "learning_rate": 6.756756756756756e-07, + "loss": 1.6937, + "step": 75 + }, + { + "epoch": 0.010302989222531011, + "grad_norm": 5.794367850143765, + "learning_rate": 6.846846846846847e-07, + "loss": 1.6958, + "step": 76 + }, + { + "epoch": 0.010438554870195892, + "grad_norm": 5.1755168616280685, + "learning_rate": 6.936936936936936e-07, + "loss": 1.6623, + "step": 77 + }, + { + "epoch": 0.010574120517860774, + "grad_norm": 5.404536294704342, + "learning_rate": 7.027027027027027e-07, + "loss": 1.6829, + "step": 78 + }, + { + "epoch": 0.010709686165525655, + "grad_norm": 4.936418045672427, + "learning_rate": 7.117117117117116e-07, + "loss": 1.6736, + "step": 79 + }, + { + "epoch": 0.010845251813190538, + "grad_norm": 5.687766451156374, + "learning_rate": 7.207207207207207e-07, + "loss": 1.6955, + "step": 80 + }, + { + "epoch": 0.010980817460855418, + "grad_norm": 8.726685702313473, + "learning_rate": 7.297297297297297e-07, + "loss": 1.6719, + "step": 81 + }, + { + "epoch": 0.011116383108520301, + "grad_norm": 5.260493112644015, + "learning_rate": 7.387387387387387e-07, + "loss": 1.6484, + "step": 82 + }, + { + "epoch": 0.011251948756185183, + "grad_norm": 4.414797517290715, + "learning_rate": 7.477477477477477e-07, + "loss": 1.6448, + "step": 83 + }, + { + "epoch": 0.011387514403850064, + "grad_norm": 27.310739760689955, + "learning_rate": 7.567567567567568e-07, + "loss": 1.6582, + "step": 84 + }, + { + "epoch": 0.011523080051514947, + "grad_norm": 4.03019339681888, + "learning_rate": 7.657657657657657e-07, + "loss": 1.6317, + "step": 85 + }, + { + "epoch": 0.011658645699179827, + "grad_norm": 11.785404103992619, + "learning_rate": 7.747747747747747e-07, + "loss": 1.5986, + "step": 86 + }, + { + "epoch": 0.01179421134684471, + "grad_norm": 4.398622000286691, + "learning_rate": 7.837837837837838e-07, + "loss": 1.6082, + "step": 87 + }, + { + "epoch": 0.01192977699450959, + "grad_norm": 3.571859863801341, + "learning_rate": 7.927927927927927e-07, + "loss": 1.6094, + "step": 88 + }, + { + "epoch": 0.012065342642174473, + "grad_norm": 3.7513073193139586, + "learning_rate": 8.018018018018018e-07, + "loss": 1.5419, + "step": 89 + }, + { + "epoch": 0.012200908289839356, + "grad_norm": 13.353200685100488, + "learning_rate": 8.108108108108108e-07, + "loss": 1.5774, + "step": 90 + }, + { + "epoch": 0.012336473937504236, + "grad_norm": 3.132189819159853, + "learning_rate": 8.198198198198198e-07, + "loss": 1.6223, + "step": 91 + }, + { + "epoch": 0.012472039585169119, + "grad_norm": 3.0509509708344713, + "learning_rate": 8.288288288288288e-07, + "loss": 1.568, + "step": 92 + }, + { + "epoch": 
0.012607605232834, + "grad_norm": 4.078577857874112, + "learning_rate": 8.378378378378377e-07, + "loss": 1.5891, + "step": 93 + }, + { + "epoch": 0.012743170880498882, + "grad_norm": 7.855077276998569, + "learning_rate": 8.468468468468468e-07, + "loss": 1.6068, + "step": 94 + }, + { + "epoch": 0.012878736528163763, + "grad_norm": 2.7267936566758957, + "learning_rate": 8.558558558558558e-07, + "loss": 1.5608, + "step": 95 + }, + { + "epoch": 0.013014302175828645, + "grad_norm": 2.972661710521432, + "learning_rate": 8.648648648648649e-07, + "loss": 1.563, + "step": 96 + }, + { + "epoch": 0.013149867823493526, + "grad_norm": 2.7641739174193107, + "learning_rate": 8.738738738738738e-07, + "loss": 1.5656, + "step": 97 + }, + { + "epoch": 0.013285433471158408, + "grad_norm": 4.458879266902286, + "learning_rate": 8.828828828828828e-07, + "loss": 1.5265, + "step": 98 + }, + { + "epoch": 0.013420999118823291, + "grad_norm": 2.626771757816332, + "learning_rate": 8.918918918918918e-07, + "loss": 1.5183, + "step": 99 + }, + { + "epoch": 0.013556564766488172, + "grad_norm": 3.485237608205376, + "learning_rate": 9.009009009009009e-07, + "loss": 1.5461, + "step": 100 + }, + { + "epoch": 0.013692130414153054, + "grad_norm": 2.7300767631589085, + "learning_rate": 9.099099099099099e-07, + "loss": 1.5536, + "step": 101 + }, + { + "epoch": 0.013827696061817935, + "grad_norm": 2.6262841921214912, + "learning_rate": 9.18918918918919e-07, + "loss": 1.5203, + "step": 102 + }, + { + "epoch": 0.013963261709482817, + "grad_norm": 4.081210868286267, + "learning_rate": 9.279279279279278e-07, + "loss": 1.5566, + "step": 103 + }, + { + "epoch": 0.014098827357147698, + "grad_norm": 2.2836414826241573, + "learning_rate": 9.369369369369368e-07, + "loss": 1.5457, + "step": 104 + }, + { + "epoch": 0.01423439300481258, + "grad_norm": 2.349007870096232, + "learning_rate": 9.459459459459459e-07, + "loss": 1.5534, + "step": 105 + }, + { + "epoch": 0.014369958652477463, + "grad_norm": 2.696926865464839, + "learning_rate": 9.54954954954955e-07, + "loss": 1.5262, + "step": 106 + }, + { + "epoch": 0.014505524300142344, + "grad_norm": 2.5244907209231275, + "learning_rate": 9.63963963963964e-07, + "loss": 1.4915, + "step": 107 + }, + { + "epoch": 0.014641089947807226, + "grad_norm": 3.2824729004984996, + "learning_rate": 9.72972972972973e-07, + "loss": 1.5455, + "step": 108 + }, + { + "epoch": 0.014776655595472107, + "grad_norm": 2.2934927793674067, + "learning_rate": 9.819819819819819e-07, + "loss": 1.5034, + "step": 109 + }, + { + "epoch": 0.01491222124313699, + "grad_norm": 2.5298734771696867, + "learning_rate": 9.90990990990991e-07, + "loss": 1.4979, + "step": 110 + }, + { + "epoch": 0.01504778689080187, + "grad_norm": 2.250437238649095, + "learning_rate": 1e-06, + "loss": 1.5396, + "step": 111 + }, + { + "epoch": 0.015183352538466753, + "grad_norm": 2.842564306476503, + "learning_rate": 1.0090090090090088e-06, + "loss": 1.5013, + "step": 112 + }, + { + "epoch": 0.015318918186131633, + "grad_norm": 2.0362868563283953, + "learning_rate": 1.018018018018018e-06, + "loss": 1.5415, + "step": 113 + }, + { + "epoch": 0.015454483833796516, + "grad_norm": 1.862416670875627, + "learning_rate": 1.0270270270270269e-06, + "loss": 1.5315, + "step": 114 + }, + { + "epoch": 0.015590049481461398, + "grad_norm": 1.78330793922286, + "learning_rate": 1.0360360360360361e-06, + "loss": 1.5362, + "step": 115 + }, + { + "epoch": 0.01572561512912628, + "grad_norm": 2.00752841860934, + "learning_rate": 1.045045045045045e-06, + "loss": 1.5405, + "step": 116 
+ }, + { + "epoch": 0.01586118077679116, + "grad_norm": 1.920368262887671, + "learning_rate": 1.0540540540540538e-06, + "loss": 1.5513, + "step": 117 + }, + { + "epoch": 0.015996746424456042, + "grad_norm": 3.099799903020496, + "learning_rate": 1.063063063063063e-06, + "loss": 1.4785, + "step": 118 + }, + { + "epoch": 0.016132312072120923, + "grad_norm": 2.0703378678971425, + "learning_rate": 1.072072072072072e-06, + "loss": 1.5581, + "step": 119 + }, + { + "epoch": 0.016267877719785807, + "grad_norm": 2.569465764543604, + "learning_rate": 1.0810810810810812e-06, + "loss": 1.5025, + "step": 120 + }, + { + "epoch": 0.016403443367450688, + "grad_norm": 11.849216155055109, + "learning_rate": 1.09009009009009e-06, + "loss": 1.5024, + "step": 121 + }, + { + "epoch": 0.01653900901511557, + "grad_norm": 2.8756800145843515, + "learning_rate": 1.099099099099099e-06, + "loss": 1.5178, + "step": 122 + }, + { + "epoch": 0.016674574662780453, + "grad_norm": 1.66613415955462, + "learning_rate": 1.108108108108108e-06, + "loss": 1.4854, + "step": 123 + }, + { + "epoch": 0.016810140310445334, + "grad_norm": 1.7174600790995793, + "learning_rate": 1.117117117117117e-06, + "loss": 1.5155, + "step": 124 + }, + { + "epoch": 0.016945705958110215, + "grad_norm": 3.631407585916671, + "learning_rate": 1.1261261261261262e-06, + "loss": 1.4951, + "step": 125 + }, + { + "epoch": 0.017081271605775095, + "grad_norm": 1.946337580541867, + "learning_rate": 1.135135135135135e-06, + "loss": 1.4927, + "step": 126 + }, + { + "epoch": 0.01721683725343998, + "grad_norm": 1.769982447453932, + "learning_rate": 1.1441441441441443e-06, + "loss": 1.496, + "step": 127 + }, + { + "epoch": 0.01735240290110486, + "grad_norm": 1.6640975192377936, + "learning_rate": 1.1531531531531531e-06, + "loss": 1.4807, + "step": 128 + }, + { + "epoch": 0.01748796854876974, + "grad_norm": 1.7802641327269135, + "learning_rate": 1.162162162162162e-06, + "loss": 1.4822, + "step": 129 + }, + { + "epoch": 0.017623534196434622, + "grad_norm": 11.380976146866304, + "learning_rate": 1.1711711711711712e-06, + "loss": 1.4622, + "step": 130 + }, + { + "epoch": 0.017759099844099506, + "grad_norm": 1.6350376122633814, + "learning_rate": 1.18018018018018e-06, + "loss": 1.4923, + "step": 131 + }, + { + "epoch": 0.017894665491764387, + "grad_norm": 1.6940944001799714, + "learning_rate": 1.1891891891891893e-06, + "loss": 1.532, + "step": 132 + }, + { + "epoch": 0.018030231139429267, + "grad_norm": 6.061899848948961, + "learning_rate": 1.1981981981981981e-06, + "loss": 1.4817, + "step": 133 + }, + { + "epoch": 0.01816579678709415, + "grad_norm": 1.7734805296617122, + "learning_rate": 1.2072072072072072e-06, + "loss": 1.4851, + "step": 134 + }, + { + "epoch": 0.018301362434759032, + "grad_norm": 1.7192809825510058, + "learning_rate": 1.2162162162162162e-06, + "loss": 1.5193, + "step": 135 + }, + { + "epoch": 0.018436928082423913, + "grad_norm": 1.9843754926424206, + "learning_rate": 1.225225225225225e-06, + "loss": 1.5016, + "step": 136 + }, + { + "epoch": 0.018572493730088794, + "grad_norm": 2.07441447559389, + "learning_rate": 1.2342342342342343e-06, + "loss": 1.4896, + "step": 137 + }, + { + "epoch": 0.018708059377753678, + "grad_norm": 1.9554379892102682, + "learning_rate": 1.2432432432432432e-06, + "loss": 1.5067, + "step": 138 + }, + { + "epoch": 0.01884362502541856, + "grad_norm": 5.735711866674401, + "learning_rate": 1.2522522522522522e-06, + "loss": 1.471, + "step": 139 + }, + { + "epoch": 0.01897919067308344, + "grad_norm": 3.6681931332568385, + 
"learning_rate": 1.2612612612612613e-06, + "loss": 1.487, + "step": 140 + }, + { + "epoch": 0.019114756320748324, + "grad_norm": 3.6967973345859897, + "learning_rate": 1.27027027027027e-06, + "loss": 1.4416, + "step": 141 + }, + { + "epoch": 0.019250321968413205, + "grad_norm": 1.6526554118443435, + "learning_rate": 1.2792792792792793e-06, + "loss": 1.4671, + "step": 142 + }, + { + "epoch": 0.019385887616078085, + "grad_norm": 2.353162344043656, + "learning_rate": 1.2882882882882882e-06, + "loss": 1.4997, + "step": 143 + }, + { + "epoch": 0.019521453263742966, + "grad_norm": 7.16126122960063, + "learning_rate": 1.2972972972972972e-06, + "loss": 1.5045, + "step": 144 + }, + { + "epoch": 0.01965701891140785, + "grad_norm": 1.5263188654237, + "learning_rate": 1.3063063063063063e-06, + "loss": 1.4783, + "step": 145 + }, + { + "epoch": 0.01979258455907273, + "grad_norm": 1.6150898747346438, + "learning_rate": 1.3153153153153153e-06, + "loss": 1.4678, + "step": 146 + }, + { + "epoch": 0.019928150206737612, + "grad_norm": 2.6167684199124674, + "learning_rate": 1.3243243243243244e-06, + "loss": 1.4985, + "step": 147 + }, + { + "epoch": 0.020063715854402496, + "grad_norm": 2.157803505095716, + "learning_rate": 1.3333333333333332e-06, + "loss": 1.4607, + "step": 148 + }, + { + "epoch": 0.020199281502067377, + "grad_norm": 1.5084792715771218, + "learning_rate": 1.3423423423423422e-06, + "loss": 1.4885, + "step": 149 + }, + { + "epoch": 0.020334847149732257, + "grad_norm": 1.615135866208239, + "learning_rate": 1.3513513513513513e-06, + "loss": 1.4559, + "step": 150 + }, + { + "epoch": 0.020470412797397138, + "grad_norm": 2.5646910514430097, + "learning_rate": 1.3603603603603603e-06, + "loss": 1.5156, + "step": 151 + }, + { + "epoch": 0.020605978445062022, + "grad_norm": 8.227069728279288, + "learning_rate": 1.3693693693693694e-06, + "loss": 1.4706, + "step": 152 + }, + { + "epoch": 0.020741544092726903, + "grad_norm": 1.643806904326497, + "learning_rate": 1.3783783783783782e-06, + "loss": 1.4426, + "step": 153 + }, + { + "epoch": 0.020877109740391784, + "grad_norm": 4.909692516101664, + "learning_rate": 1.3873873873873873e-06, + "loss": 1.4459, + "step": 154 + }, + { + "epoch": 0.021012675388056668, + "grad_norm": 1.6611799283957167, + "learning_rate": 1.3963963963963963e-06, + "loss": 1.4584, + "step": 155 + }, + { + "epoch": 0.02114824103572155, + "grad_norm": 1.5312277977471949, + "learning_rate": 1.4054054054054054e-06, + "loss": 1.4667, + "step": 156 + }, + { + "epoch": 0.02128380668338643, + "grad_norm": 1.6434536553321057, + "learning_rate": 1.4144144144144144e-06, + "loss": 1.4389, + "step": 157 + }, + { + "epoch": 0.02141937233105131, + "grad_norm": 1.6392096012644777, + "learning_rate": 1.4234234234234232e-06, + "loss": 1.4849, + "step": 158 + }, + { + "epoch": 0.021554937978716195, + "grad_norm": 1.7319763741049772, + "learning_rate": 1.4324324324324323e-06, + "loss": 1.5019, + "step": 159 + }, + { + "epoch": 0.021690503626381075, + "grad_norm": 1.8334846660101116, + "learning_rate": 1.4414414414414413e-06, + "loss": 1.4391, + "step": 160 + }, + { + "epoch": 0.021826069274045956, + "grad_norm": 1.6290102306872545, + "learning_rate": 1.4504504504504504e-06, + "loss": 1.4497, + "step": 161 + }, + { + "epoch": 0.021961634921710837, + "grad_norm": 2.085340658323169, + "learning_rate": 1.4594594594594594e-06, + "loss": 1.4686, + "step": 162 + }, + { + "epoch": 0.02209720056937572, + "grad_norm": 2.0464844150108306, + "learning_rate": 1.4684684684684685e-06, + "loss": 1.4416, + "step": 163 + }, + 
{ + "epoch": 0.022232766217040602, + "grad_norm": 2.6571662459574585, + "learning_rate": 1.4774774774774773e-06, + "loss": 1.4324, + "step": 164 + }, + { + "epoch": 0.022368331864705483, + "grad_norm": 1.9224545957658028, + "learning_rate": 1.4864864864864864e-06, + "loss": 1.454, + "step": 165 + }, + { + "epoch": 0.022503897512370367, + "grad_norm": 1.8447210705726331, + "learning_rate": 1.4954954954954954e-06, + "loss": 1.4539, + "step": 166 + }, + { + "epoch": 0.022639463160035248, + "grad_norm": 2.2349403809846367, + "learning_rate": 1.5045045045045045e-06, + "loss": 1.4697, + "step": 167 + }, + { + "epoch": 0.022775028807700128, + "grad_norm": 2.007458706414246, + "learning_rate": 1.5135135135135135e-06, + "loss": 1.4756, + "step": 168 + }, + { + "epoch": 0.02291059445536501, + "grad_norm": 2.4047001312034064, + "learning_rate": 1.5225225225225225e-06, + "loss": 1.449, + "step": 169 + }, + { + "epoch": 0.023046160103029893, + "grad_norm": 1.8807673757774008, + "learning_rate": 1.5315315315315314e-06, + "loss": 1.4907, + "step": 170 + }, + { + "epoch": 0.023181725750694774, + "grad_norm": 1.6761368869472493, + "learning_rate": 1.5405405405405404e-06, + "loss": 1.4357, + "step": 171 + }, + { + "epoch": 0.023317291398359655, + "grad_norm": 1.4678113506101014, + "learning_rate": 1.5495495495495495e-06, + "loss": 1.4629, + "step": 172 + }, + { + "epoch": 0.02345285704602454, + "grad_norm": 2.315861853488389, + "learning_rate": 1.5585585585585585e-06, + "loss": 1.4395, + "step": 173 + }, + { + "epoch": 0.02358842269368942, + "grad_norm": 1.5915565278319124, + "learning_rate": 1.5675675675675676e-06, + "loss": 1.4331, + "step": 174 + }, + { + "epoch": 0.0237239883413543, + "grad_norm": 1.4655443819622014, + "learning_rate": 1.5765765765765766e-06, + "loss": 1.4262, + "step": 175 + }, + { + "epoch": 0.02385955398901918, + "grad_norm": 6.23630069816874, + "learning_rate": 1.5855855855855855e-06, + "loss": 1.446, + "step": 176 + }, + { + "epoch": 0.023995119636684065, + "grad_norm": 1.5904899367457077, + "learning_rate": 1.5945945945945945e-06, + "loss": 1.4296, + "step": 177 + }, + { + "epoch": 0.024130685284348946, + "grad_norm": 2.759976776335535, + "learning_rate": 1.6036036036036035e-06, + "loss": 1.4627, + "step": 178 + }, + { + "epoch": 0.024266250932013827, + "grad_norm": 1.57501364187711, + "learning_rate": 1.6126126126126126e-06, + "loss": 1.4494, + "step": 179 + }, + { + "epoch": 0.02440181657967871, + "grad_norm": 1.70462076761471, + "learning_rate": 1.6216216216216216e-06, + "loss": 1.4478, + "step": 180 + }, + { + "epoch": 0.024537382227343592, + "grad_norm": 2.172767891588567, + "learning_rate": 1.6306306306306305e-06, + "loss": 1.4713, + "step": 181 + }, + { + "epoch": 0.024672947875008473, + "grad_norm": 3.1959466901228035, + "learning_rate": 1.6396396396396395e-06, + "loss": 1.4563, + "step": 182 + }, + { + "epoch": 0.024808513522673353, + "grad_norm": 2.047619908889512, + "learning_rate": 1.6486486486486486e-06, + "loss": 1.4914, + "step": 183 + }, + { + "epoch": 0.024944079170338238, + "grad_norm": 1.7630016938917448, + "learning_rate": 1.6576576576576576e-06, + "loss": 1.4371, + "step": 184 + }, + { + "epoch": 0.025079644818003118, + "grad_norm": 1.6836108111279398, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.4464, + "step": 185 + }, + { + "epoch": 0.025215210465668, + "grad_norm": 2.5202836894168046, + "learning_rate": 1.6756756756756755e-06, + "loss": 1.4187, + "step": 186 + }, + { + "epoch": 0.02535077611333288, + "grad_norm": 1.53089895363344, + 
"learning_rate": 1.6846846846846845e-06, + "loss": 1.4528, + "step": 187 + }, + { + "epoch": 0.025486341760997764, + "grad_norm": 1.8143824818660468, + "learning_rate": 1.6936936936936936e-06, + "loss": 1.432, + "step": 188 + }, + { + "epoch": 0.025621907408662645, + "grad_norm": 1.7576199202312712, + "learning_rate": 1.7027027027027026e-06, + "loss": 1.4676, + "step": 189 + }, + { + "epoch": 0.025757473056327525, + "grad_norm": 1.7480417807021291, + "learning_rate": 1.7117117117117117e-06, + "loss": 1.4136, + "step": 190 + }, + { + "epoch": 0.02589303870399241, + "grad_norm": 1.6906427002878124, + "learning_rate": 1.7207207207207205e-06, + "loss": 1.3976, + "step": 191 + }, + { + "epoch": 0.02602860435165729, + "grad_norm": 1.595665275235212, + "learning_rate": 1.7297297297297298e-06, + "loss": 1.488, + "step": 192 + }, + { + "epoch": 0.02616416999932217, + "grad_norm": 1.892271645352398, + "learning_rate": 1.7387387387387386e-06, + "loss": 1.4823, + "step": 193 + }, + { + "epoch": 0.026299735646987052, + "grad_norm": 2.061930532856341, + "learning_rate": 1.7477477477477477e-06, + "loss": 1.4528, + "step": 194 + }, + { + "epoch": 0.026435301294651936, + "grad_norm": 1.5582148354764205, + "learning_rate": 1.7567567567567567e-06, + "loss": 1.4333, + "step": 195 + }, + { + "epoch": 0.026570866942316817, + "grad_norm": 1.5724564204171239, + "learning_rate": 1.7657657657657655e-06, + "loss": 1.4153, + "step": 196 + }, + { + "epoch": 0.026706432589981698, + "grad_norm": 1.7498958432994374, + "learning_rate": 1.7747747747747748e-06, + "loss": 1.4052, + "step": 197 + }, + { + "epoch": 0.026841998237646582, + "grad_norm": 1.4445934807542893, + "learning_rate": 1.7837837837837836e-06, + "loss": 1.4229, + "step": 198 + }, + { + "epoch": 0.026977563885311463, + "grad_norm": 1.9360394769538924, + "learning_rate": 1.7927927927927927e-06, + "loss": 1.4084, + "step": 199 + }, + { + "epoch": 0.027113129532976343, + "grad_norm": 1.4068943485983287, + "learning_rate": 1.8018018018018017e-06, + "loss": 1.4065, + "step": 200 + }, + { + "epoch": 0.027248695180641224, + "grad_norm": 1.4189606819785174, + "learning_rate": 1.8108108108108106e-06, + "loss": 1.4297, + "step": 201 + }, + { + "epoch": 0.02738426082830611, + "grad_norm": 2.0822996667476645, + "learning_rate": 1.8198198198198198e-06, + "loss": 1.4633, + "step": 202 + }, + { + "epoch": 0.02751982647597099, + "grad_norm": 2.1957851198776766, + "learning_rate": 1.8288288288288287e-06, + "loss": 1.4443, + "step": 203 + }, + { + "epoch": 0.02765539212363587, + "grad_norm": 2.1255486066116838, + "learning_rate": 1.837837837837838e-06, + "loss": 1.4657, + "step": 204 + }, + { + "epoch": 0.027790957771300754, + "grad_norm": 1.84628284232362, + "learning_rate": 1.8468468468468467e-06, + "loss": 1.4181, + "step": 205 + }, + { + "epoch": 0.027926523418965635, + "grad_norm": 2.773287667084954, + "learning_rate": 1.8558558558558556e-06, + "loss": 1.4367, + "step": 206 + }, + { + "epoch": 0.028062089066630515, + "grad_norm": 1.9262118713819927, + "learning_rate": 1.8648648648648648e-06, + "loss": 1.4385, + "step": 207 + }, + { + "epoch": 0.028197654714295396, + "grad_norm": 2.6698271271873812, + "learning_rate": 1.8738738738738737e-06, + "loss": 1.3991, + "step": 208 + }, + { + "epoch": 0.02833322036196028, + "grad_norm": 1.9299907001335062, + "learning_rate": 1.882882882882883e-06, + "loss": 1.4366, + "step": 209 + }, + { + "epoch": 0.02846878600962516, + "grad_norm": 1.4853311852940358, + "learning_rate": 1.8918918918918918e-06, + "loss": 1.4306, + "step": 210 + 
}, + { + "epoch": 0.028604351657290042, + "grad_norm": 1.5131310444678263, + "learning_rate": 1.9009009009009008e-06, + "loss": 1.4164, + "step": 211 + }, + { + "epoch": 0.028739917304954926, + "grad_norm": 3.0044187985625888, + "learning_rate": 1.90990990990991e-06, + "loss": 1.4201, + "step": 212 + }, + { + "epoch": 0.028875482952619807, + "grad_norm": 1.626239215180113, + "learning_rate": 1.9189189189189187e-06, + "loss": 1.4107, + "step": 213 + }, + { + "epoch": 0.029011048600284688, + "grad_norm": 2.056201852131565, + "learning_rate": 1.927927927927928e-06, + "loss": 1.4575, + "step": 214 + }, + { + "epoch": 0.02914661424794957, + "grad_norm": 1.5386570171102998, + "learning_rate": 1.936936936936937e-06, + "loss": 1.4271, + "step": 215 + }, + { + "epoch": 0.029282179895614453, + "grad_norm": 1.4438826320478255, + "learning_rate": 1.945945945945946e-06, + "loss": 1.4295, + "step": 216 + }, + { + "epoch": 0.029417745543279333, + "grad_norm": 1.561840166399376, + "learning_rate": 1.954954954954955e-06, + "loss": 1.4418, + "step": 217 + }, + { + "epoch": 0.029553311190944214, + "grad_norm": 3.6648625849051997, + "learning_rate": 1.9639639639639637e-06, + "loss": 1.4535, + "step": 218 + }, + { + "epoch": 0.029688876838609095, + "grad_norm": 1.847830961001093, + "learning_rate": 1.972972972972973e-06, + "loss": 1.4131, + "step": 219 + }, + { + "epoch": 0.02982444248627398, + "grad_norm": 2.434717490244357, + "learning_rate": 1.981981981981982e-06, + "loss": 1.4091, + "step": 220 + }, + { + "epoch": 0.02996000813393886, + "grad_norm": 2.688892478909532, + "learning_rate": 1.990990990990991e-06, + "loss": 1.357, + "step": 221 + }, + { + "epoch": 0.03009557378160374, + "grad_norm": 1.5548686505551659, + "learning_rate": 2e-06, + "loss": 1.4188, + "step": 222 + }, + { + "epoch": 0.030231139429268625, + "grad_norm": 1.4953380788257233, + "learning_rate": 1.9999999035789467e-06, + "loss": 1.4278, + "step": 223 + }, + { + "epoch": 0.030366705076933505, + "grad_norm": 1.7120980499638292, + "learning_rate": 1.9999996143158056e-06, + "loss": 1.3979, + "step": 224 + }, + { + "epoch": 0.030502270724598386, + "grad_norm": 1.7180871266547544, + "learning_rate": 1.9999991322106323e-06, + "loss": 1.4302, + "step": 225 + }, + { + "epoch": 0.030637836372263267, + "grad_norm": 2.8526735112207224, + "learning_rate": 1.99999845726352e-06, + "loss": 1.4125, + "step": 226 + }, + { + "epoch": 0.03077340201992815, + "grad_norm": 1.6150233383682473, + "learning_rate": 1.9999975894745984e-06, + "loss": 1.4303, + "step": 227 + }, + { + "epoch": 0.030908967667593032, + "grad_norm": 1.5299696343681943, + "learning_rate": 1.9999965288440357e-06, + "loss": 1.4078, + "step": 228 + }, + { + "epoch": 0.031044533315257913, + "grad_norm": 1.5513114961172367, + "learning_rate": 1.9999952753720353e-06, + "loss": 1.3979, + "step": 229 + }, + { + "epoch": 0.031180098962922797, + "grad_norm": 1.918235306830325, + "learning_rate": 1.99999382905884e-06, + "loss": 1.4138, + "step": 230 + }, + { + "epoch": 0.03131566461058768, + "grad_norm": 2.0090948675465636, + "learning_rate": 1.9999921899047284e-06, + "loss": 1.4223, + "step": 231 + }, + { + "epoch": 0.03145123025825256, + "grad_norm": 2.6669749828346894, + "learning_rate": 1.999990357910016e-06, + "loss": 1.3652, + "step": 232 + }, + { + "epoch": 0.03158679590591744, + "grad_norm": 1.541728090197999, + "learning_rate": 1.9999883330750567e-06, + "loss": 1.3798, + "step": 233 + }, + { + "epoch": 0.03172236155358232, + "grad_norm": 2.121057917703474, + "learning_rate": 
1.9999861154002405e-06, + "loss": 1.4514, + "step": 234 + }, + { + "epoch": 0.0318579272012472, + "grad_norm": 1.8452865754986856, + "learning_rate": 1.9999837048859957e-06, + "loss": 1.3811, + "step": 235 + }, + { + "epoch": 0.031993492848912085, + "grad_norm": 1.4774486597848988, + "learning_rate": 1.999981101532787e-06, + "loss": 1.4134, + "step": 236 + }, + { + "epoch": 0.03212905849657697, + "grad_norm": 7.432021527611812, + "learning_rate": 1.9999783053411157e-06, + "loss": 1.3901, + "step": 237 + }, + { + "epoch": 0.032264624144241846, + "grad_norm": 2.6746197439382176, + "learning_rate": 1.999975316311522e-06, + "loss": 1.3862, + "step": 238 + }, + { + "epoch": 0.03240018979190673, + "grad_norm": 1.8641305734301594, + "learning_rate": 1.9999721344445816e-06, + "loss": 1.4516, + "step": 239 + }, + { + "epoch": 0.032535755439571615, + "grad_norm": 2.1615306009925512, + "learning_rate": 1.9999687597409084e-06, + "loss": 1.4406, + "step": 240 + }, + { + "epoch": 0.03267132108723649, + "grad_norm": 2.0731583815869152, + "learning_rate": 1.9999651922011532e-06, + "loss": 1.439, + "step": 241 + }, + { + "epoch": 0.032806886734901376, + "grad_norm": 2.36110597557446, + "learning_rate": 1.999961431826004e-06, + "loss": 1.3966, + "step": 242 + }, + { + "epoch": 0.03294245238256626, + "grad_norm": 2.1205783878442968, + "learning_rate": 1.999957478616186e-06, + "loss": 1.4089, + "step": 243 + }, + { + "epoch": 0.03307801803023114, + "grad_norm": 1.8669269966980822, + "learning_rate": 1.9999533325724613e-06, + "loss": 1.4467, + "step": 244 + }, + { + "epoch": 0.03321358367789602, + "grad_norm": 1.6413169350948584, + "learning_rate": 1.9999489936956295e-06, + "loss": 1.376, + "step": 245 + }, + { + "epoch": 0.033349149325560906, + "grad_norm": 2.148676646213094, + "learning_rate": 1.9999444619865273e-06, + "loss": 1.4013, + "step": 246 + }, + { + "epoch": 0.03348471497322578, + "grad_norm": 3.3840377673809163, + "learning_rate": 1.999939737446029e-06, + "loss": 1.3857, + "step": 247 + }, + { + "epoch": 0.03362028062089067, + "grad_norm": 1.6672992036754588, + "learning_rate": 1.999934820075045e-06, + "loss": 1.4163, + "step": 248 + }, + { + "epoch": 0.033755846268555545, + "grad_norm": 1.7035014219036255, + "learning_rate": 1.9999297098745245e-06, + "loss": 1.4228, + "step": 249 + }, + { + "epoch": 0.03389141191622043, + "grad_norm": 2.6934676291703465, + "learning_rate": 1.999924406845452e-06, + "loss": 1.4167, + "step": 250 + }, + { + "epoch": 0.03402697756388531, + "grad_norm": 1.7522907457460484, + "learning_rate": 1.9999189109888503e-06, + "loss": 1.3889, + "step": 251 + }, + { + "epoch": 0.03416254321155019, + "grad_norm": 1.6233690589164709, + "learning_rate": 1.9999132223057797e-06, + "loss": 1.428, + "step": 252 + }, + { + "epoch": 0.034298108859215075, + "grad_norm": 4.678654516414119, + "learning_rate": 1.999907340797337e-06, + "loss": 1.3917, + "step": 253 + }, + { + "epoch": 0.03443367450687996, + "grad_norm": 1.7906191946149932, + "learning_rate": 1.9999012664646567e-06, + "loss": 1.4126, + "step": 254 + }, + { + "epoch": 0.034569240154544836, + "grad_norm": 1.909959733670534, + "learning_rate": 1.99989499930891e-06, + "loss": 1.4078, + "step": 255 + }, + { + "epoch": 0.03470480580220972, + "grad_norm": 1.5968016442113955, + "learning_rate": 1.999888539331305e-06, + "loss": 1.3624, + "step": 256 + }, + { + "epoch": 0.034840371449874605, + "grad_norm": 1.4808023801476928, + "learning_rate": 1.999881886533088e-06, + "loss": 1.4235, + "step": 257 + }, + { + "epoch": 
0.03497593709753948, + "grad_norm": 2.2008012921678106, + "learning_rate": 1.9998750409155416e-06, + "loss": 1.3531, + "step": 258 + }, + { + "epoch": 0.035111502745204366, + "grad_norm": 2.0007879285599492, + "learning_rate": 1.999868002479986e-06, + "loss": 1.4266, + "step": 259 + }, + { + "epoch": 0.035247068392869244, + "grad_norm": 1.5372176743179662, + "learning_rate": 1.9998607712277792e-06, + "loss": 1.398, + "step": 260 + }, + { + "epoch": 0.03538263404053413, + "grad_norm": 1.7551828928515532, + "learning_rate": 1.9998533471603145e-06, + "loss": 1.3996, + "step": 261 + }, + { + "epoch": 0.03551819968819901, + "grad_norm": 1.4951610226660987, + "learning_rate": 1.9998457302790245e-06, + "loss": 1.3749, + "step": 262 + }, + { + "epoch": 0.03565376533586389, + "grad_norm": 2.0734149543346634, + "learning_rate": 1.9998379205853775e-06, + "loss": 1.3864, + "step": 263 + }, + { + "epoch": 0.03578933098352877, + "grad_norm": 2.054184667772265, + "learning_rate": 1.9998299180808796e-06, + "loss": 1.3971, + "step": 264 + }, + { + "epoch": 0.03592489663119366, + "grad_norm": 2.657311418949087, + "learning_rate": 1.999821722767075e-06, + "loss": 1.3656, + "step": 265 + }, + { + "epoch": 0.036060462278858535, + "grad_norm": 1.795795009620089, + "learning_rate": 1.9998133346455422e-06, + "loss": 1.3853, + "step": 266 + }, + { + "epoch": 0.03619602792652342, + "grad_norm": 2.097507138281453, + "learning_rate": 1.9998047537179007e-06, + "loss": 1.386, + "step": 267 + }, + { + "epoch": 0.0363315935741883, + "grad_norm": 1.785024814758915, + "learning_rate": 1.999795979985804e-06, + "loss": 1.3784, + "step": 268 + }, + { + "epoch": 0.03646715922185318, + "grad_norm": 1.5455486537726475, + "learning_rate": 1.9997870134509444e-06, + "loss": 1.3855, + "step": 269 + }, + { + "epoch": 0.036602724869518065, + "grad_norm": 1.5462769419382232, + "learning_rate": 1.9997778541150515e-06, + "loss": 1.3626, + "step": 270 + }, + { + "epoch": 0.03673829051718295, + "grad_norm": 1.7859011974254393, + "learning_rate": 1.9997685019798908e-06, + "loss": 1.3647, + "step": 271 + }, + { + "epoch": 0.036873856164847826, + "grad_norm": 1.6109085417913906, + "learning_rate": 1.999758957047266e-06, + "loss": 1.4239, + "step": 272 + }, + { + "epoch": 0.03700942181251271, + "grad_norm": 1.7856583369435595, + "learning_rate": 1.9997492193190185e-06, + "loss": 1.3727, + "step": 273 + }, + { + "epoch": 0.03714498746017759, + "grad_norm": 2.6454228641803446, + "learning_rate": 1.9997392887970253e-06, + "loss": 1.3635, + "step": 274 + }, + { + "epoch": 0.03728055310784247, + "grad_norm": 1.6497445339633086, + "learning_rate": 1.999729165483202e-06, + "loss": 1.3744, + "step": 275 + }, + { + "epoch": 0.037416118755507356, + "grad_norm": 2.7164900407472437, + "learning_rate": 1.9997188493795e-06, + "loss": 1.4027, + "step": 276 + }, + { + "epoch": 0.037551684403172234, + "grad_norm": 1.6016254707762736, + "learning_rate": 1.99970834048791e-06, + "loss": 1.3982, + "step": 277 + }, + { + "epoch": 0.03768725005083712, + "grad_norm": 4.040154685626384, + "learning_rate": 1.999697638810457e-06, + "loss": 1.3754, + "step": 278 + }, + { + "epoch": 0.037822815698502, + "grad_norm": 1.8233751148061572, + "learning_rate": 1.9996867443492057e-06, + "loss": 1.4241, + "step": 279 + }, + { + "epoch": 0.03795838134616688, + "grad_norm": 1.5955700482259814, + "learning_rate": 1.999675657106257e-06, + "loss": 1.4022, + "step": 280 + }, + { + "epoch": 0.038093946993831763, + "grad_norm": 1.7880389643996633, + "learning_rate": 
1.9996643770837486e-06, + "loss": 1.3831, + "step": 281 + }, + { + "epoch": 0.03822951264149665, + "grad_norm": 4.631067613559033, + "learning_rate": 1.999652904283856e-06, + "loss": 1.4063, + "step": 282 + }, + { + "epoch": 0.038365078289161525, + "grad_norm": 2.1179589560514462, + "learning_rate": 1.9996412387087914e-06, + "loss": 1.3859, + "step": 283 + }, + { + "epoch": 0.03850064393682641, + "grad_norm": 1.7067494433054944, + "learning_rate": 1.9996293803608053e-06, + "loss": 1.3277, + "step": 284 + }, + { + "epoch": 0.038636209584491286, + "grad_norm": 1.4315973921468204, + "learning_rate": 1.9996173292421828e-06, + "loss": 1.3776, + "step": 285 + }, + { + "epoch": 0.03877177523215617, + "grad_norm": 2.815122479572097, + "learning_rate": 1.9996050853552494e-06, + "loss": 1.4142, + "step": 286 + }, + { + "epoch": 0.038907340879821055, + "grad_norm": 1.4872788621547943, + "learning_rate": 1.999592648702366e-06, + "loss": 1.4064, + "step": 287 + }, + { + "epoch": 0.03904290652748593, + "grad_norm": 3.430510616413572, + "learning_rate": 1.99958001928593e-06, + "loss": 1.44, + "step": 288 + }, + { + "epoch": 0.039178472175150816, + "grad_norm": 1.6911609845512268, + "learning_rate": 1.9995671971083777e-06, + "loss": 1.382, + "step": 289 + }, + { + "epoch": 0.0393140378228157, + "grad_norm": 3.7168209009680253, + "learning_rate": 1.9995541821721814e-06, + "loss": 1.378, + "step": 290 + }, + { + "epoch": 0.03944960347048058, + "grad_norm": 1.8750241074768454, + "learning_rate": 1.9995409744798512e-06, + "loss": 1.4002, + "step": 291 + }, + { + "epoch": 0.03958516911814546, + "grad_norm": 1.7178571989003566, + "learning_rate": 1.999527574033934e-06, + "loss": 1.333, + "step": 292 + }, + { + "epoch": 0.039720734765810346, + "grad_norm": 1.7082441045308787, + "learning_rate": 1.9995139808370142e-06, + "loss": 1.3798, + "step": 293 + }, + { + "epoch": 0.039856300413475224, + "grad_norm": 3.359373389778896, + "learning_rate": 1.9995001948917124e-06, + "loss": 1.355, + "step": 294 + }, + { + "epoch": 0.03999186606114011, + "grad_norm": 1.7423392966373235, + "learning_rate": 1.999486216200688e-06, + "loss": 1.4181, + "step": 295 + }, + { + "epoch": 0.04012743170880499, + "grad_norm": 2.655311150367046, + "learning_rate": 1.999472044766636e-06, + "loss": 1.3985, + "step": 296 + }, + { + "epoch": 0.04026299735646987, + "grad_norm": 2.7373726267225313, + "learning_rate": 1.9994576805922898e-06, + "loss": 1.3648, + "step": 297 + }, + { + "epoch": 0.040398563004134753, + "grad_norm": 1.5443917777036618, + "learning_rate": 1.9994431236804187e-06, + "loss": 1.3776, + "step": 298 + }, + { + "epoch": 0.04053412865179963, + "grad_norm": 1.87053864054108, + "learning_rate": 1.9994283740338306e-06, + "loss": 1.4082, + "step": 299 + }, + { + "epoch": 0.040669694299464515, + "grad_norm": 1.609364660906859, + "learning_rate": 1.9994134316553693e-06, + "loss": 1.3972, + "step": 300 + }, + { + "epoch": 0.0408052599471294, + "grad_norm": 1.8332046614096695, + "learning_rate": 1.999398296547917e-06, + "loss": 1.3685, + "step": 301 + }, + { + "epoch": 0.040940825594794276, + "grad_norm": 1.8944934849687547, + "learning_rate": 1.9993829687143913e-06, + "loss": 1.398, + "step": 302 + }, + { + "epoch": 0.04107639124245916, + "grad_norm": 5.761227330744534, + "learning_rate": 1.9993674481577497e-06, + "loss": 1.3841, + "step": 303 + }, + { + "epoch": 0.041211956890124045, + "grad_norm": 1.7867896862570118, + "learning_rate": 1.9993517348809836e-06, + "loss": 1.3604, + "step": 304 + }, + { + "epoch": 0.04134752253778892, 
+ "grad_norm": 1.625435789903539, + "learning_rate": 1.999335828887124e-06, + "loss": 1.3698, + "step": 305 + }, + { + "epoch": 0.041483088185453806, + "grad_norm": 1.4650410127498437, + "learning_rate": 1.999319730179238e-06, + "loss": 1.3869, + "step": 306 + }, + { + "epoch": 0.04161865383311869, + "grad_norm": 1.435460931772477, + "learning_rate": 1.9993034387604302e-06, + "loss": 1.3791, + "step": 307 + }, + { + "epoch": 0.04175421948078357, + "grad_norm": 1.4961397946018833, + "learning_rate": 1.9992869546338428e-06, + "loss": 1.375, + "step": 308 + }, + { + "epoch": 0.04188978512844845, + "grad_norm": 3.171430204404736, + "learning_rate": 1.9992702778026532e-06, + "loss": 1.3503, + "step": 309 + }, + { + "epoch": 0.042025350776113336, + "grad_norm": 3.80216439123467, + "learning_rate": 1.999253408270079e-06, + "loss": 1.3514, + "step": 310 + }, + { + "epoch": 0.042160916423778214, + "grad_norm": 1.7029640792765957, + "learning_rate": 1.9992363460393724e-06, + "loss": 1.3612, + "step": 311 + }, + { + "epoch": 0.0422964820714431, + "grad_norm": 1.5188783955183822, + "learning_rate": 1.9992190911138236e-06, + "loss": 1.3645, + "step": 312 + }, + { + "epoch": 0.042432047719107975, + "grad_norm": 1.8455493078066156, + "learning_rate": 1.999201643496761e-06, + "loss": 1.3753, + "step": 313 + }, + { + "epoch": 0.04256761336677286, + "grad_norm": 2.4180437109708417, + "learning_rate": 1.9991840031915484e-06, + "loss": 1.3702, + "step": 314 + }, + { + "epoch": 0.042703179014437743, + "grad_norm": 1.8791510489007646, + "learning_rate": 1.9991661702015877e-06, + "loss": 1.3781, + "step": 315 + }, + { + "epoch": 0.04283874466210262, + "grad_norm": 1.7991919032170818, + "learning_rate": 1.9991481445303182e-06, + "loss": 1.3647, + "step": 316 + }, + { + "epoch": 0.042974310309767505, + "grad_norm": 1.7632955113072728, + "learning_rate": 1.999129926181216e-06, + "loss": 1.353, + "step": 317 + }, + { + "epoch": 0.04310987595743239, + "grad_norm": 1.7963819751485304, + "learning_rate": 1.9991115151577938e-06, + "loss": 1.3596, + "step": 318 + }, + { + "epoch": 0.043245441605097266, + "grad_norm": 8.921840404938333, + "learning_rate": 1.999092911463603e-06, + "loss": 1.4082, + "step": 319 + }, + { + "epoch": 0.04338100725276215, + "grad_norm": 2.00928275734412, + "learning_rate": 1.99907411510223e-06, + "loss": 1.3966, + "step": 320 + }, + { + "epoch": 0.043516572900427035, + "grad_norm": 1.8271936465017902, + "learning_rate": 1.9990551260773003e-06, + "loss": 1.3403, + "step": 321 + }, + { + "epoch": 0.04365213854809191, + "grad_norm": 2.7782555785270997, + "learning_rate": 1.9990359443924755e-06, + "loss": 1.3796, + "step": 322 + }, + { + "epoch": 0.043787704195756796, + "grad_norm": 2.0181306379935475, + "learning_rate": 1.999016570051455e-06, + "loss": 1.3691, + "step": 323 + }, + { + "epoch": 0.043923269843421674, + "grad_norm": 3.090482039607639, + "learning_rate": 1.9989970030579744e-06, + "loss": 1.3578, + "step": 324 + }, + { + "epoch": 0.04405883549108656, + "grad_norm": 1.487002626116209, + "learning_rate": 1.9989772434158076e-06, + "loss": 1.3743, + "step": 325 + }, + { + "epoch": 0.04419440113875144, + "grad_norm": 1.5946887859470575, + "learning_rate": 1.9989572911287647e-06, + "loss": 1.3653, + "step": 326 + }, + { + "epoch": 0.04432996678641632, + "grad_norm": 1.7696908632724295, + "learning_rate": 1.9989371462006938e-06, + "loss": 1.353, + "step": 327 + }, + { + "epoch": 0.044465532434081204, + "grad_norm": 1.444090974387985, + "learning_rate": 1.998916808635479e-06, + "loss": 1.3689, 
+ "step": 328 + }, + { + "epoch": 0.04460109808174609, + "grad_norm": 1.5906753532749278, + "learning_rate": 1.998896278437043e-06, + "loss": 1.369, + "step": 329 + }, + { + "epoch": 0.044736663729410965, + "grad_norm": 3.4850686829824555, + "learning_rate": 1.998875555609344e-06, + "loss": 1.3745, + "step": 330 + }, + { + "epoch": 0.04487222937707585, + "grad_norm": 1.4947674485205928, + "learning_rate": 1.998854640156379e-06, + "loss": 1.3635, + "step": 331 + }, + { + "epoch": 0.045007795024740734, + "grad_norm": 2.317490776077818, + "learning_rate": 1.998833532082181e-06, + "loss": 1.2992, + "step": 332 + }, + { + "epoch": 0.04514336067240561, + "grad_norm": 1.9416763977213127, + "learning_rate": 1.9988122313908212e-06, + "loss": 1.3952, + "step": 333 + }, + { + "epoch": 0.045278926320070495, + "grad_norm": 2.7494587734989637, + "learning_rate": 1.998790738086406e-06, + "loss": 1.4035, + "step": 334 + }, + { + "epoch": 0.04541449196773538, + "grad_norm": 2.413302791299032, + "learning_rate": 1.9987690521730817e-06, + "loss": 1.3757, + "step": 335 + }, + { + "epoch": 0.045550057615400256, + "grad_norm": 1.5989028512335826, + "learning_rate": 1.9987471736550287e-06, + "loss": 1.399, + "step": 336 + }, + { + "epoch": 0.04568562326306514, + "grad_norm": 1.5736310572763854, + "learning_rate": 1.9987251025364677e-06, + "loss": 1.4135, + "step": 337 + }, + { + "epoch": 0.04582118891073002, + "grad_norm": 1.8401285955892226, + "learning_rate": 1.9987028388216532e-06, + "loss": 1.367, + "step": 338 + }, + { + "epoch": 0.0459567545583949, + "grad_norm": 3.4141212947568156, + "learning_rate": 1.99868038251488e-06, + "loss": 1.3684, + "step": 339 + }, + { + "epoch": 0.046092320206059786, + "grad_norm": 1.4875369168328567, + "learning_rate": 1.9986577336204782e-06, + "loss": 1.3428, + "step": 340 + }, + { + "epoch": 0.046227885853724664, + "grad_norm": 1.8064418713427566, + "learning_rate": 1.9986348921428154e-06, + "loss": 1.3882, + "step": 341 + }, + { + "epoch": 0.04636345150138955, + "grad_norm": 1.4850778190226952, + "learning_rate": 1.9986118580862964e-06, + "loss": 1.4262, + "step": 342 + }, + { + "epoch": 0.04649901714905443, + "grad_norm": 1.776900412499991, + "learning_rate": 1.998588631455363e-06, + "loss": 1.3544, + "step": 343 + }, + { + "epoch": 0.04663458279671931, + "grad_norm": 1.8509359555507916, + "learning_rate": 1.9985652122544947e-06, + "loss": 1.323, + "step": 344 + }, + { + "epoch": 0.046770148444384194, + "grad_norm": 2.1964607659462674, + "learning_rate": 1.998541600488207e-06, + "loss": 1.3325, + "step": 345 + }, + { + "epoch": 0.04690571409204908, + "grad_norm": 1.71178728628039, + "learning_rate": 1.998517796161054e-06, + "loss": 1.3937, + "step": 346 + }, + { + "epoch": 0.047041279739713955, + "grad_norm": 2.4784396220120697, + "learning_rate": 1.9984937992776257e-06, + "loss": 1.3923, + "step": 347 + }, + { + "epoch": 0.04717684538737884, + "grad_norm": 2.0851828784612016, + "learning_rate": 1.99846960984255e-06, + "loss": 1.3944, + "step": 348 + }, + { + "epoch": 0.04731241103504372, + "grad_norm": 1.6139327972967603, + "learning_rate": 1.9984452278604907e-06, + "loss": 1.3968, + "step": 349 + }, + { + "epoch": 0.0474479766827086, + "grad_norm": 1.5722185810612594, + "learning_rate": 1.998420653336151e-06, + "loss": 1.3484, + "step": 350 + }, + { + "epoch": 0.047583542330373485, + "grad_norm": 2.011221595006952, + "learning_rate": 1.99839588627427e-06, + "loss": 1.3445, + "step": 351 + }, + { + "epoch": 0.04771910797803836, + "grad_norm": 1.6051910121117103, + 
"learning_rate": 1.9983709266796224e-06, + "loss": 1.3248, + "step": 352 + }, + { + "epoch": 0.047854673625703247, + "grad_norm": 1.7966225148145154, + "learning_rate": 1.9983457745570222e-06, + "loss": 1.3602, + "step": 353 + }, + { + "epoch": 0.04799023927336813, + "grad_norm": 1.8802854550594112, + "learning_rate": 1.99832042991132e-06, + "loss": 1.3609, + "step": 354 + }, + { + "epoch": 0.04812580492103301, + "grad_norm": 1.6932533743452551, + "learning_rate": 1.9982948927474033e-06, + "loss": 1.3864, + "step": 355 + }, + { + "epoch": 0.04826137056869789, + "grad_norm": 1.57079307022146, + "learning_rate": 1.9982691630701966e-06, + "loss": 1.3711, + "step": 356 + }, + { + "epoch": 0.048396936216362776, + "grad_norm": 2.821896672029162, + "learning_rate": 1.9982432408846615e-06, + "loss": 1.3841, + "step": 357 + }, + { + "epoch": 0.048532501864027654, + "grad_norm": 2.083575810944987, + "learning_rate": 1.998217126195797e-06, + "loss": 1.3787, + "step": 358 + }, + { + "epoch": 0.04866806751169254, + "grad_norm": 1.7028902498317742, + "learning_rate": 1.9981908190086398e-06, + "loss": 1.3695, + "step": 359 + }, + { + "epoch": 0.04880363315935742, + "grad_norm": 1.511472007211898, + "learning_rate": 1.9981643193282617e-06, + "loss": 1.3433, + "step": 360 + }, + { + "epoch": 0.0489391988070223, + "grad_norm": 1.847515335441604, + "learning_rate": 1.9981376271597735e-06, + "loss": 1.4109, + "step": 361 + }, + { + "epoch": 0.049074764454687184, + "grad_norm": 2.0977058540865854, + "learning_rate": 1.9981107425083233e-06, + "loss": 1.3486, + "step": 362 + }, + { + "epoch": 0.04921033010235206, + "grad_norm": 2.0117266034314825, + "learning_rate": 1.9980836653790946e-06, + "loss": 1.3841, + "step": 363 + }, + { + "epoch": 0.049345895750016945, + "grad_norm": 1.833170045100589, + "learning_rate": 1.9980563957773097e-06, + "loss": 1.3536, + "step": 364 + }, + { + "epoch": 0.04948146139768183, + "grad_norm": 1.9420242599252318, + "learning_rate": 1.998028933708227e-06, + "loss": 1.3347, + "step": 365 + }, + { + "epoch": 0.04961702704534671, + "grad_norm": 1.8152629437789516, + "learning_rate": 1.9980012791771424e-06, + "loss": 1.3906, + "step": 366 + }, + { + "epoch": 0.04975259269301159, + "grad_norm": 7.28880671962065, + "learning_rate": 1.9979734321893885e-06, + "loss": 1.3332, + "step": 367 + }, + { + "epoch": 0.049888158340676475, + "grad_norm": 2.4733300676285395, + "learning_rate": 1.9979453927503364e-06, + "loss": 1.3872, + "step": 368 + }, + { + "epoch": 0.05002372398834135, + "grad_norm": 1.5610173579686677, + "learning_rate": 1.9979171608653923e-06, + "loss": 1.3452, + "step": 369 + }, + { + "epoch": 0.050159289636006237, + "grad_norm": 2.7907080589985136, + "learning_rate": 1.9978887365400006e-06, + "loss": 1.3579, + "step": 370 + }, + { + "epoch": 0.05029485528367112, + "grad_norm": 1.87535182737447, + "learning_rate": 1.997860119779643e-06, + "loss": 1.3511, + "step": 371 + }, + { + "epoch": 0.050430420931336, + "grad_norm": 1.572861286817152, + "learning_rate": 1.9978313105898378e-06, + "loss": 1.3577, + "step": 372 + }, + { + "epoch": 0.05056598657900088, + "grad_norm": 2.198483295547023, + "learning_rate": 1.997802308976141e-06, + "loss": 1.3899, + "step": 373 + }, + { + "epoch": 0.05070155222666576, + "grad_norm": 2.1857867923160406, + "learning_rate": 1.997773114944145e-06, + "loss": 1.366, + "step": 374 + }, + { + "epoch": 0.050837117874330644, + "grad_norm": 1.917445107759119, + "learning_rate": 1.99774372849948e-06, + "loss": 1.3071, + "step": 375 + }, + { + "epoch": 
0.05097268352199553, + "grad_norm": 2.696528505634222, + "learning_rate": 1.9977141496478124e-06, + "loss": 1.3519, + "step": 376 + }, + { + "epoch": 0.051108249169660405, + "grad_norm": 2.1771422341390116, + "learning_rate": 1.9976843783948463e-06, + "loss": 1.3925, + "step": 377 + }, + { + "epoch": 0.05124381481732529, + "grad_norm": 2.1930313198169573, + "learning_rate": 1.9976544147463237e-06, + "loss": 1.3456, + "step": 378 + }, + { + "epoch": 0.051379380464990174, + "grad_norm": 16.44643581750224, + "learning_rate": 1.9976242587080216e-06, + "loss": 1.3857, + "step": 379 + }, + { + "epoch": 0.05151494611265505, + "grad_norm": 8.838252097643538, + "learning_rate": 1.997593910285756e-06, + "loss": 1.3421, + "step": 380 + }, + { + "epoch": 0.051650511760319935, + "grad_norm": 1.498712396306132, + "learning_rate": 1.9975633694853797e-06, + "loss": 1.3719, + "step": 381 + }, + { + "epoch": 0.05178607740798482, + "grad_norm": 2.249634962117406, + "learning_rate": 1.9975326363127815e-06, + "loss": 1.3529, + "step": 382 + }, + { + "epoch": 0.0519216430556497, + "grad_norm": 2.040296058250955, + "learning_rate": 1.9975017107738887e-06, + "loss": 1.368, + "step": 383 + }, + { + "epoch": 0.05205720870331458, + "grad_norm": 1.6603728223985388, + "learning_rate": 1.997470592874665e-06, + "loss": 1.3666, + "step": 384 + }, + { + "epoch": 0.052192774350979465, + "grad_norm": 1.8374628809103548, + "learning_rate": 1.9974392826211107e-06, + "loss": 1.3951, + "step": 385 + }, + { + "epoch": 0.05232833999864434, + "grad_norm": 2.917121686952167, + "learning_rate": 1.997407780019264e-06, + "loss": 1.3261, + "step": 386 + }, + { + "epoch": 0.05246390564630923, + "grad_norm": 2.7868934908063316, + "learning_rate": 1.9973760850752e-06, + "loss": 1.3375, + "step": 387 + }, + { + "epoch": 0.052599471293974104, + "grad_norm": 1.6462466170532575, + "learning_rate": 1.997344197795031e-06, + "loss": 1.3846, + "step": 388 + }, + { + "epoch": 0.05273503694163899, + "grad_norm": 2.167955744211204, + "learning_rate": 1.9973121181849056e-06, + "loss": 1.37, + "step": 389 + }, + { + "epoch": 0.05287060258930387, + "grad_norm": 2.68895188232664, + "learning_rate": 1.997279846251011e-06, + "loss": 1.3525, + "step": 390 + }, + { + "epoch": 0.05300616823696875, + "grad_norm": 1.8260080628679796, + "learning_rate": 1.99724738199957e-06, + "loss": 1.3726, + "step": 391 + }, + { + "epoch": 0.053141733884633634, + "grad_norm": 1.8794881762534281, + "learning_rate": 1.997214725436843e-06, + "loss": 1.3838, + "step": 392 + }, + { + "epoch": 0.05327729953229852, + "grad_norm": 1.4909446230686645, + "learning_rate": 1.997181876569128e-06, + "loss": 1.3425, + "step": 393 + }, + { + "epoch": 0.053412865179963395, + "grad_norm": 7.500650003331918, + "learning_rate": 1.9971488354027592e-06, + "loss": 1.3509, + "step": 394 + }, + { + "epoch": 0.05354843082762828, + "grad_norm": 2.016303206876096, + "learning_rate": 1.997115601944108e-06, + "loss": 1.321, + "step": 395 + }, + { + "epoch": 0.053683996475293164, + "grad_norm": 1.9318860795218582, + "learning_rate": 1.9970821761995843e-06, + "loss": 1.3222, + "step": 396 + }, + { + "epoch": 0.05381956212295804, + "grad_norm": 1.8120709273226159, + "learning_rate": 1.9970485581756334e-06, + "loss": 1.3527, + "step": 397 + }, + { + "epoch": 0.053955127770622925, + "grad_norm": 1.5458854510615796, + "learning_rate": 1.997014747878738e-06, + "loss": 1.335, + "step": 398 + }, + { + "epoch": 0.0540906934182878, + "grad_norm": 1.8671478883653112, + "learning_rate": 1.996980745315419e-06, + 
"loss": 1.3632, + "step": 399 + }, + { + "epoch": 0.05422625906595269, + "grad_norm": 1.50924497364322, + "learning_rate": 1.9969465504922324e-06, + "loss": 1.3911, + "step": 400 + }, + { + "epoch": 0.05436182471361757, + "grad_norm": 2.9952255403144674, + "learning_rate": 1.9969121634157734e-06, + "loss": 1.3661, + "step": 401 + }, + { + "epoch": 0.05449739036128245, + "grad_norm": 1.7875867337870743, + "learning_rate": 1.9968775840926725e-06, + "loss": 1.3468, + "step": 402 + }, + { + "epoch": 0.05463295600894733, + "grad_norm": 1.5332105617135894, + "learning_rate": 1.996842812529598e-06, + "loss": 1.3075, + "step": 403 + }, + { + "epoch": 0.05476852165661222, + "grad_norm": 1.8097497286329118, + "learning_rate": 1.9968078487332563e-06, + "loss": 1.3541, + "step": 404 + }, + { + "epoch": 0.054904087304277094, + "grad_norm": 2.113591854498792, + "learning_rate": 1.9967726927103893e-06, + "loss": 1.3399, + "step": 405 + }, + { + "epoch": 0.05503965295194198, + "grad_norm": 1.5827227353247677, + "learning_rate": 1.9967373444677763e-06, + "loss": 1.3178, + "step": 406 + }, + { + "epoch": 0.05517521859960686, + "grad_norm": 1.7248819491406286, + "learning_rate": 1.996701804012234e-06, + "loss": 1.3462, + "step": 407 + }, + { + "epoch": 0.05531078424727174, + "grad_norm": 1.6909821886217677, + "learning_rate": 1.9966660713506167e-06, + "loss": 1.3816, + "step": 408 + }, + { + "epoch": 0.055446349894936624, + "grad_norm": 2.559193175741804, + "learning_rate": 1.996630146489815e-06, + "loss": 1.342, + "step": 409 + }, + { + "epoch": 0.05558191554260151, + "grad_norm": 1.7585051473790494, + "learning_rate": 1.996594029436756e-06, + "loss": 1.3598, + "step": 410 + }, + { + "epoch": 0.055717481190266385, + "grad_norm": 1.9881554639572703, + "learning_rate": 1.9965577201984048e-06, + "loss": 1.3079, + "step": 411 + }, + { + "epoch": 0.05585304683793127, + "grad_norm": 1.5868097228204503, + "learning_rate": 1.9965212187817644e-06, + "loss": 1.3466, + "step": 412 + }, + { + "epoch": 0.05598861248559615, + "grad_norm": 2.353792133692654, + "learning_rate": 1.9964845251938722e-06, + "loss": 1.3744, + "step": 413 + }, + { + "epoch": 0.05612417813326103, + "grad_norm": 2.222473820400585, + "learning_rate": 1.9964476394418054e-06, + "loss": 1.3272, + "step": 414 + }, + { + "epoch": 0.056259743780925915, + "grad_norm": 2.1970092682986793, + "learning_rate": 1.996410561532677e-06, + "loss": 1.3436, + "step": 415 + }, + { + "epoch": 0.05639530942859079, + "grad_norm": 1.6394823897164519, + "learning_rate": 1.996373291473637e-06, + "loss": 1.3325, + "step": 416 + }, + { + "epoch": 0.05653087507625568, + "grad_norm": 1.7216281091072123, + "learning_rate": 1.9963358292718723e-06, + "loss": 1.395, + "step": 417 + }, + { + "epoch": 0.05666644072392056, + "grad_norm": 1.619662329340508, + "learning_rate": 1.996298174934608e-06, + "loss": 1.3347, + "step": 418 + }, + { + "epoch": 0.05680200637158544, + "grad_norm": 1.7173869693838835, + "learning_rate": 1.996260328469104e-06, + "loss": 1.3313, + "step": 419 + }, + { + "epoch": 0.05693757201925032, + "grad_norm": 2.2583250226715355, + "learning_rate": 1.9962222898826608e-06, + "loss": 1.3532, + "step": 420 + }, + { + "epoch": 0.05707313766691521, + "grad_norm": 2.7431590490279114, + "learning_rate": 1.996184059182612e-06, + "loss": 1.3377, + "step": 421 + }, + { + "epoch": 0.057208703314580084, + "grad_norm": 1.743520838197008, + "learning_rate": 1.996145636376331e-06, + "loss": 1.3682, + "step": 422 + }, + { + "epoch": 0.05734426896224497, + "grad_norm": 
3.6329148413603805, + "learning_rate": 1.996107021471227e-06, + "loss": 1.3626, + "step": 423 + }, + { + "epoch": 0.05747983460990985, + "grad_norm": 2.90841716100241, + "learning_rate": 1.996068214474747e-06, + "loss": 1.3564, + "step": 424 + }, + { + "epoch": 0.05761540025757473, + "grad_norm": 1.8144982978771513, + "learning_rate": 1.996029215394374e-06, + "loss": 1.336, + "step": 425 + }, + { + "epoch": 0.057750965905239614, + "grad_norm": 1.7067043320145194, + "learning_rate": 1.9959900242376294e-06, + "loss": 1.3178, + "step": 426 + }, + { + "epoch": 0.05788653155290449, + "grad_norm": 2.3564278783384887, + "learning_rate": 1.9959506410120702e-06, + "loss": 1.332, + "step": 427 + }, + { + "epoch": 0.058022097200569375, + "grad_norm": 1.7497293489466554, + "learning_rate": 1.9959110657252915e-06, + "loss": 1.3447, + "step": 428 + }, + { + "epoch": 0.05815766284823426, + "grad_norm": 3.0974425815630786, + "learning_rate": 1.995871298384925e-06, + "loss": 1.3508, + "step": 429 + }, + { + "epoch": 0.05829322849589914, + "grad_norm": 1.9721707749922412, + "learning_rate": 1.9958313389986395e-06, + "loss": 1.3182, + "step": 430 + }, + { + "epoch": 0.05842879414356402, + "grad_norm": 1.7311048334111603, + "learning_rate": 1.995791187574141e-06, + "loss": 1.3479, + "step": 431 + }, + { + "epoch": 0.058564359791228905, + "grad_norm": 1.501708088865736, + "learning_rate": 1.995750844119172e-06, + "loss": 1.3291, + "step": 432 + }, + { + "epoch": 0.05869992543889378, + "grad_norm": 1.5776418825476481, + "learning_rate": 1.995710308641513e-06, + "loss": 1.3471, + "step": 433 + }, + { + "epoch": 0.05883549108655867, + "grad_norm": 3.0735207626815395, + "learning_rate": 1.9956695811489803e-06, + "loss": 1.3295, + "step": 434 + }, + { + "epoch": 0.05897105673422355, + "grad_norm": 1.7493578249586563, + "learning_rate": 1.9956286616494287e-06, + "loss": 1.3123, + "step": 435 + }, + { + "epoch": 0.05910662238188843, + "grad_norm": 1.6021438706106559, + "learning_rate": 1.9955875501507485e-06, + "loss": 1.3387, + "step": 436 + }, + { + "epoch": 0.05924218802955331, + "grad_norm": 1.785712094352974, + "learning_rate": 1.995546246660868e-06, + "loss": 1.3089, + "step": 437 + }, + { + "epoch": 0.05937775367721819, + "grad_norm": 2.3124003486256077, + "learning_rate": 1.995504751187752e-06, + "loss": 1.3633, + "step": 438 + }, + { + "epoch": 0.059513319324883074, + "grad_norm": 1.569989717867369, + "learning_rate": 1.9954630637394027e-06, + "loss": 1.3052, + "step": 439 + }, + { + "epoch": 0.05964888497254796, + "grad_norm": 1.9498817648603564, + "learning_rate": 1.9954211843238594e-06, + "loss": 1.333, + "step": 440 + }, + { + "epoch": 0.059784450620212835, + "grad_norm": 2.0977219147792474, + "learning_rate": 1.9953791129491983e-06, + "loss": 1.3559, + "step": 441 + }, + { + "epoch": 0.05992001626787772, + "grad_norm": 1.6232724592082064, + "learning_rate": 1.995336849623532e-06, + "loss": 1.364, + "step": 442 + }, + { + "epoch": 0.060055581915542604, + "grad_norm": 1.7701646929365478, + "learning_rate": 1.995294394355011e-06, + "loss": 1.3017, + "step": 443 + }, + { + "epoch": 0.06019114756320748, + "grad_norm": 2.1834042549224697, + "learning_rate": 1.9952517471518228e-06, + "loss": 1.3329, + "step": 444 + }, + { + "epoch": 0.060326713210872365, + "grad_norm": 1.831495472613614, + "learning_rate": 1.9952089080221907e-06, + "loss": 1.311, + "step": 445 + }, + { + "epoch": 0.06046227885853725, + "grad_norm": 2.7179925923803467, + "learning_rate": 1.9951658769743766e-06, + "loss": 1.3535, + "step": 446 + 
}, + { + "epoch": 0.06059784450620213, + "grad_norm": 1.6609289025423724, + "learning_rate": 1.9951226540166785e-06, + "loss": 1.3314, + "step": 447 + }, + { + "epoch": 0.06073341015386701, + "grad_norm": 1.3791677755130247, + "learning_rate": 1.9950792391574316e-06, + "loss": 1.343, + "step": 448 + }, + { + "epoch": 0.060868975801531895, + "grad_norm": 5.48584609986471, + "learning_rate": 1.995035632405008e-06, + "loss": 1.3562, + "step": 449 + }, + { + "epoch": 0.06100454144919677, + "grad_norm": 1.6531928755178391, + "learning_rate": 1.994991833767817e-06, + "loss": 1.3341, + "step": 450 + }, + { + "epoch": 0.06114010709686166, + "grad_norm": 1.6991055472441285, + "learning_rate": 1.994947843254305e-06, + "loss": 1.3308, + "step": 451 + }, + { + "epoch": 0.061275672744526534, + "grad_norm": 1.5319317366627059, + "learning_rate": 1.994903660872955e-06, + "loss": 1.3211, + "step": 452 + }, + { + "epoch": 0.06141123839219142, + "grad_norm": 1.7593571100555643, + "learning_rate": 1.9948592866322873e-06, + "loss": 1.2922, + "step": 453 + }, + { + "epoch": 0.0615468040398563, + "grad_norm": 5.357783537661789, + "learning_rate": 1.9948147205408593e-06, + "loss": 1.3631, + "step": 454 + }, + { + "epoch": 0.06168236968752118, + "grad_norm": 3.4523456453408006, + "learning_rate": 1.9947699626072646e-06, + "loss": 1.3463, + "step": 455 + }, + { + "epoch": 0.061817935335186064, + "grad_norm": 1.8352315386304954, + "learning_rate": 1.9947250128401354e-06, + "loss": 1.3027, + "step": 456 + }, + { + "epoch": 0.06195350098285095, + "grad_norm": 1.7888043485574892, + "learning_rate": 1.994679871248139e-06, + "loss": 1.3357, + "step": 457 + }, + { + "epoch": 0.062089066630515825, + "grad_norm": 1.6423321033893716, + "learning_rate": 1.9946345378399807e-06, + "loss": 1.3617, + "step": 458 + }, + { + "epoch": 0.06222463227818071, + "grad_norm": 1.602565428228716, + "learning_rate": 1.9945890126244038e-06, + "loss": 1.3848, + "step": 459 + }, + { + "epoch": 0.062360197925845594, + "grad_norm": 1.803049787317197, + "learning_rate": 1.9945432956101858e-06, + "loss": 1.3382, + "step": 460 + }, + { + "epoch": 0.06249576357351047, + "grad_norm": 4.852185140696661, + "learning_rate": 1.994497386806144e-06, + "loss": 1.324, + "step": 461 + }, + { + "epoch": 0.06263132922117536, + "grad_norm": 2.154217941403144, + "learning_rate": 1.9944512862211313e-06, + "loss": 1.2844, + "step": 462 + }, + { + "epoch": 0.06276689486884024, + "grad_norm": 1.9207082028299844, + "learning_rate": 1.9944049938640377e-06, + "loss": 1.3106, + "step": 463 + }, + { + "epoch": 0.06290246051650512, + "grad_norm": 1.7427302683286936, + "learning_rate": 1.9943585097437903e-06, + "loss": 1.3323, + "step": 464 + }, + { + "epoch": 0.06303802616417, + "grad_norm": 2.3598326664543023, + "learning_rate": 1.9943118338693533e-06, + "loss": 1.2918, + "step": 465 + }, + { + "epoch": 0.06317359181183488, + "grad_norm": 4.141861023795289, + "learning_rate": 1.994264966249728e-06, + "loss": 1.3557, + "step": 466 + }, + { + "epoch": 0.06330915745949976, + "grad_norm": 1.6342800862948985, + "learning_rate": 1.9942179068939516e-06, + "loss": 1.3262, + "step": 467 + }, + { + "epoch": 0.06344472310716465, + "grad_norm": 2.034576580657811, + "learning_rate": 1.9941706558111004e-06, + "loss": 1.3321, + "step": 468 + }, + { + "epoch": 0.06358028875482953, + "grad_norm": 1.814432034966728, + "learning_rate": 1.9941232130102854e-06, + "loss": 1.3209, + "step": 469 + }, + { + "epoch": 0.0637158544024944, + "grad_norm": 1.6103366336618314, + "learning_rate": 
1.9940755785006564e-06, + "loss": 1.3234, + "step": 470 + }, + { + "epoch": 0.06385142005015929, + "grad_norm": 1.9646120803731146, + "learning_rate": 1.994027752291398e-06, + "loss": 1.3196, + "step": 471 + }, + { + "epoch": 0.06398698569782417, + "grad_norm": 2.7788368820337657, + "learning_rate": 1.9939797343917344e-06, + "loss": 1.3299, + "step": 472 + }, + { + "epoch": 0.06412255134548905, + "grad_norm": 1.7860528713959174, + "learning_rate": 1.9939315248109253e-06, + "loss": 1.3461, + "step": 473 + }, + { + "epoch": 0.06425811699315394, + "grad_norm": 1.5251856080563881, + "learning_rate": 1.993883123558267e-06, + "loss": 1.3259, + "step": 474 + }, + { + "epoch": 0.06439368264081882, + "grad_norm": 4.019726498725399, + "learning_rate": 1.9938345306430936e-06, + "loss": 1.3193, + "step": 475 + }, + { + "epoch": 0.06452924828848369, + "grad_norm": 6.34356083477445, + "learning_rate": 1.9937857460747757e-06, + "loss": 1.3374, + "step": 476 + }, + { + "epoch": 0.06466481393614858, + "grad_norm": 1.7651690972279994, + "learning_rate": 1.9937367698627208e-06, + "loss": 1.3488, + "step": 477 + }, + { + "epoch": 0.06480037958381346, + "grad_norm": 1.8890972537513047, + "learning_rate": 1.9936876020163746e-06, + "loss": 1.3323, + "step": 478 + }, + { + "epoch": 0.06493594523147835, + "grad_norm": 2.5100821939779006, + "learning_rate": 1.9936382425452176e-06, + "loss": 1.3678, + "step": 479 + }, + { + "epoch": 0.06507151087914323, + "grad_norm": 1.9159238770594769, + "learning_rate": 1.993588691458769e-06, + "loss": 1.3208, + "step": 480 + }, + { + "epoch": 0.06520707652680811, + "grad_norm": 1.9035113870977798, + "learning_rate": 1.993538948766584e-06, + "loss": 1.3428, + "step": 481 + }, + { + "epoch": 0.06534264217447298, + "grad_norm": 1.6743181791550203, + "learning_rate": 1.9934890144782558e-06, + "loss": 1.3294, + "step": 482 + }, + { + "epoch": 0.06547820782213787, + "grad_norm": 2.486096891486843, + "learning_rate": 1.9934388886034126e-06, + "loss": 1.2827, + "step": 483 + }, + { + "epoch": 0.06561377346980275, + "grad_norm": 1.550160409764198, + "learning_rate": 1.993388571151722e-06, + "loss": 1.3323, + "step": 484 + }, + { + "epoch": 0.06574933911746764, + "grad_norm": 2.6248487600688, + "learning_rate": 1.993338062132886e-06, + "loss": 1.3812, + "step": 485 + }, + { + "epoch": 0.06588490476513252, + "grad_norm": 2.3226975414535547, + "learning_rate": 1.993287361556646e-06, + "loss": 1.3388, + "step": 486 + }, + { + "epoch": 0.06602047041279739, + "grad_norm": 2.375750328089479, + "learning_rate": 1.9932364694327795e-06, + "loss": 1.3151, + "step": 487 + }, + { + "epoch": 0.06615603606046228, + "grad_norm": 1.6141105704557646, + "learning_rate": 1.9931853857710995e-06, + "loss": 1.3438, + "step": 488 + }, + { + "epoch": 0.06629160170812716, + "grad_norm": 1.9641086744611485, + "learning_rate": 1.9931341105814575e-06, + "loss": 1.356, + "step": 489 + }, + { + "epoch": 0.06642716735579204, + "grad_norm": 3.2052038369559814, + "learning_rate": 1.993082643873742e-06, + "loss": 1.3659, + "step": 490 + }, + { + "epoch": 0.06656273300345693, + "grad_norm": 2.147944737433154, + "learning_rate": 1.9930309856578772e-06, + "loss": 1.3105, + "step": 491 + }, + { + "epoch": 0.06669829865112181, + "grad_norm": 1.9060888838370746, + "learning_rate": 1.992979135943825e-06, + "loss": 1.32, + "step": 492 + }, + { + "epoch": 0.06683386429878668, + "grad_norm": 1.923574187437612, + "learning_rate": 1.9929270947415852e-06, + "loss": 1.3189, + "step": 493 + }, + { + "epoch": 0.06696942994645157, + 
"grad_norm": 1.7173732779161024, + "learning_rate": 1.9928748620611927e-06, + "loss": 1.3432, + "step": 494 + }, + { + "epoch": 0.06710499559411645, + "grad_norm": 1.9002622129011457, + "learning_rate": 1.99282243791272e-06, + "loss": 1.3949, + "step": 495 + }, + { + "epoch": 0.06724056124178134, + "grad_norm": 2.8682037802734914, + "learning_rate": 1.992769822306277e-06, + "loss": 1.3565, + "step": 496 + }, + { + "epoch": 0.06737612688944622, + "grad_norm": 1.7735245706446674, + "learning_rate": 1.992717015252011e-06, + "loss": 1.3175, + "step": 497 + }, + { + "epoch": 0.06751169253711109, + "grad_norm": 4.300188490047065, + "learning_rate": 1.992664016760104e-06, + "loss": 1.3382, + "step": 498 + }, + { + "epoch": 0.06764725818477597, + "grad_norm": 1.733437269007982, + "learning_rate": 1.992610826840777e-06, + "loss": 1.3161, + "step": 499 + }, + { + "epoch": 0.06778282383244086, + "grad_norm": 1.8598956202276442, + "learning_rate": 1.9925574455042873e-06, + "loss": 1.3648, + "step": 500 + }, + { + "epoch": 0.06791838948010574, + "grad_norm": 1.8237990764923873, + "learning_rate": 1.9925038727609287e-06, + "loss": 1.3047, + "step": 501 + }, + { + "epoch": 0.06805395512777063, + "grad_norm": 1.6220362437703266, + "learning_rate": 1.9924501086210334e-06, + "loss": 1.3581, + "step": 502 + }, + { + "epoch": 0.06818952077543551, + "grad_norm": 1.4784628504493948, + "learning_rate": 1.9923961530949677e-06, + "loss": 1.3104, + "step": 503 + }, + { + "epoch": 0.06832508642310038, + "grad_norm": 2.54261669430464, + "learning_rate": 1.9923420061931376e-06, + "loss": 1.3089, + "step": 504 + }, + { + "epoch": 0.06846065207076527, + "grad_norm": 2.3377716646806967, + "learning_rate": 1.992287667925985e-06, + "loss": 1.2861, + "step": 505 + }, + { + "epoch": 0.06859621771843015, + "grad_norm": 14.921605586307779, + "learning_rate": 1.992233138303988e-06, + "loss": 1.2977, + "step": 506 + }, + { + "epoch": 0.06873178336609503, + "grad_norm": 1.5612635741771355, + "learning_rate": 1.9921784173376626e-06, + "loss": 1.3055, + "step": 507 + }, + { + "epoch": 0.06886734901375992, + "grad_norm": 1.514090128089839, + "learning_rate": 1.9921235050375612e-06, + "loss": 1.3109, + "step": 508 + }, + { + "epoch": 0.06900291466142479, + "grad_norm": 1.8025066724198695, + "learning_rate": 1.9920684014142736e-06, + "loss": 1.3123, + "step": 509 + }, + { + "epoch": 0.06913848030908967, + "grad_norm": 1.8269452582929346, + "learning_rate": 1.992013106478425e-06, + "loss": 1.3327, + "step": 510 + }, + { + "epoch": 0.06927404595675456, + "grad_norm": 2.06920981995799, + "learning_rate": 1.9919576202406795e-06, + "loss": 1.2832, + "step": 511 + }, + { + "epoch": 0.06940961160441944, + "grad_norm": 1.6533718267630848, + "learning_rate": 1.9919019427117372e-06, + "loss": 1.3251, + "step": 512 + }, + { + "epoch": 0.06954517725208433, + "grad_norm": 2.301158079703493, + "learning_rate": 1.9918460739023348e-06, + "loss": 1.3494, + "step": 513 + }, + { + "epoch": 0.06968074289974921, + "grad_norm": 4.190633099128733, + "learning_rate": 1.991790013823246e-06, + "loss": 1.3432, + "step": 514 + }, + { + "epoch": 0.06981630854741408, + "grad_norm": 14.099683251820618, + "learning_rate": 1.991733762485282e-06, + "loss": 1.3197, + "step": 515 + }, + { + "epoch": 0.06995187419507896, + "grad_norm": 1.558325941349923, + "learning_rate": 1.9916773198992897e-06, + "loss": 1.322, + "step": 516 + }, + { + "epoch": 0.07008743984274385, + "grad_norm": 2.5727031924049455, + "learning_rate": 1.9916206860761546e-06, + "loss": 1.3128, + "step": 
517 + }, + { + "epoch": 0.07022300549040873, + "grad_norm": 2.1000688914381707, + "learning_rate": 1.9915638610267974e-06, + "loss": 1.3464, + "step": 518 + }, + { + "epoch": 0.07035857113807362, + "grad_norm": 2.2689424408500023, + "learning_rate": 1.9915068447621765e-06, + "loss": 1.345, + "step": 519 + }, + { + "epoch": 0.07049413678573849, + "grad_norm": 1.693115899446775, + "learning_rate": 1.9914496372932873e-06, + "loss": 1.3164, + "step": 520 + }, + { + "epoch": 0.07062970243340337, + "grad_norm": 3.2350321324544735, + "learning_rate": 1.9913922386311612e-06, + "loss": 1.3377, + "step": 521 + }, + { + "epoch": 0.07076526808106826, + "grad_norm": 2.0117797278686536, + "learning_rate": 1.9913346487868676e-06, + "loss": 1.3397, + "step": 522 + }, + { + "epoch": 0.07090083372873314, + "grad_norm": 1.5877666662335417, + "learning_rate": 1.9912768677715123e-06, + "loss": 1.341, + "step": 523 + }, + { + "epoch": 0.07103639937639802, + "grad_norm": 4.194390060465099, + "learning_rate": 1.9912188955962376e-06, + "loss": 1.3418, + "step": 524 + }, + { + "epoch": 0.07117196502406291, + "grad_norm": 1.8087961687097784, + "learning_rate": 1.991160732272223e-06, + "loss": 1.3407, + "step": 525 + }, + { + "epoch": 0.07130753067172778, + "grad_norm": 3.0434007062621244, + "learning_rate": 1.9911023778106846e-06, + "loss": 1.3243, + "step": 526 + }, + { + "epoch": 0.07144309631939266, + "grad_norm": 6.762005180696664, + "learning_rate": 1.9910438322228762e-06, + "loss": 1.327, + "step": 527 + }, + { + "epoch": 0.07157866196705755, + "grad_norm": 1.7987520234593297, + "learning_rate": 1.990985095520088e-06, + "loss": 1.3082, + "step": 528 + }, + { + "epoch": 0.07171422761472243, + "grad_norm": 1.9843970751246542, + "learning_rate": 1.990926167713646e-06, + "loss": 1.2591, + "step": 529 + }, + { + "epoch": 0.07184979326238732, + "grad_norm": 1.6048786777917532, + "learning_rate": 1.9908670488149145e-06, + "loss": 1.3046, + "step": 530 + }, + { + "epoch": 0.0719853589100522, + "grad_norm": 2.506551143487652, + "learning_rate": 1.9908077388352943e-06, + "loss": 1.2919, + "step": 531 + }, + { + "epoch": 0.07212092455771707, + "grad_norm": 2.131397381769379, + "learning_rate": 1.9907482377862226e-06, + "loss": 1.332, + "step": 532 + }, + { + "epoch": 0.07225649020538195, + "grad_norm": 3.0152997959082115, + "learning_rate": 1.990688545679173e-06, + "loss": 1.3152, + "step": 533 + }, + { + "epoch": 0.07239205585304684, + "grad_norm": 3.0705330782537708, + "learning_rate": 1.990628662525658e-06, + "loss": 1.3085, + "step": 534 + }, + { + "epoch": 0.07252762150071172, + "grad_norm": 1.9235047923565862, + "learning_rate": 1.9905685883372254e-06, + "loss": 1.3117, + "step": 535 + }, + { + "epoch": 0.0726631871483766, + "grad_norm": 2.219619618171178, + "learning_rate": 1.990508323125459e-06, + "loss": 1.294, + "step": 536 + }, + { + "epoch": 0.07279875279604148, + "grad_norm": 3.041791263360748, + "learning_rate": 1.9904478669019815e-06, + "loss": 1.2573, + "step": 537 + }, + { + "epoch": 0.07293431844370636, + "grad_norm": 3.109293115463507, + "learning_rate": 1.990387219678451e-06, + "loss": 1.3286, + "step": 538 + }, + { + "epoch": 0.07306988409137125, + "grad_norm": 1.6920038850324108, + "learning_rate": 1.9903263814665624e-06, + "loss": 1.2864, + "step": 539 + }, + { + "epoch": 0.07320544973903613, + "grad_norm": 2.5419485660635623, + "learning_rate": 1.9902653522780482e-06, + "loss": 1.309, + "step": 540 + }, + { + "epoch": 0.07334101538670101, + "grad_norm": 2.216268665667187, + "learning_rate": 
1.990204132124678e-06, + "loss": 1.3001, + "step": 541 + }, + { + "epoch": 0.0734765810343659, + "grad_norm": 1.8711771912659352, + "learning_rate": 1.990142721018257e-06, + "loss": 1.2844, + "step": 542 + }, + { + "epoch": 0.07361214668203077, + "grad_norm": 1.7573405562982112, + "learning_rate": 1.990081118970628e-06, + "loss": 1.2953, + "step": 543 + }, + { + "epoch": 0.07374771232969565, + "grad_norm": 9.335363504054285, + "learning_rate": 1.99001932599367e-06, + "loss": 1.3134, + "step": 544 + }, + { + "epoch": 0.07388327797736054, + "grad_norm": 1.8002838338197442, + "learning_rate": 1.9899573420993003e-06, + "loss": 1.3349, + "step": 545 + }, + { + "epoch": 0.07401884362502542, + "grad_norm": 1.6317468069588128, + "learning_rate": 1.9898951672994708e-06, + "loss": 1.3222, + "step": 546 + }, + { + "epoch": 0.0741544092726903, + "grad_norm": 1.4963994012962076, + "learning_rate": 1.9898328016061726e-06, + "loss": 1.3006, + "step": 547 + }, + { + "epoch": 0.07428997492035518, + "grad_norm": 2.3373365976976817, + "learning_rate": 1.9897702450314316e-06, + "loss": 1.3146, + "step": 548 + }, + { + "epoch": 0.07442554056802006, + "grad_norm": 2.300562922092513, + "learning_rate": 1.9897074975873116e-06, + "loss": 1.3234, + "step": 549 + }, + { + "epoch": 0.07456110621568494, + "grad_norm": 2.149842800652685, + "learning_rate": 1.9896445592859134e-06, + "loss": 1.3114, + "step": 550 + }, + { + "epoch": 0.07469667186334983, + "grad_norm": 1.5821678127224938, + "learning_rate": 1.989581430139373e-06, + "loss": 1.3346, + "step": 551 + }, + { + "epoch": 0.07483223751101471, + "grad_norm": 2.5670894364364543, + "learning_rate": 1.9895181101598656e-06, + "loss": 1.3083, + "step": 552 + }, + { + "epoch": 0.0749678031586796, + "grad_norm": 1.6859935789096443, + "learning_rate": 1.9894545993596014e-06, + "loss": 1.285, + "step": 553 + }, + { + "epoch": 0.07510336880634447, + "grad_norm": 1.5184562623104128, + "learning_rate": 1.9893908977508277e-06, + "loss": 1.3025, + "step": 554 + }, + { + "epoch": 0.07523893445400935, + "grad_norm": 2.6996891922957382, + "learning_rate": 1.9893270053458293e-06, + "loss": 1.3299, + "step": 555 + }, + { + "epoch": 0.07537450010167424, + "grad_norm": 16.516304347727278, + "learning_rate": 1.9892629221569274e-06, + "loss": 1.3384, + "step": 556 + }, + { + "epoch": 0.07551006574933912, + "grad_norm": 13.213647281269282, + "learning_rate": 1.989198648196479e-06, + "loss": 1.334, + "step": 557 + }, + { + "epoch": 0.075645631397004, + "grad_norm": 1.3924150410496505, + "learning_rate": 1.9891341834768806e-06, + "loss": 1.3222, + "step": 558 + }, + { + "epoch": 0.07578119704466887, + "grad_norm": 1.7299978252252144, + "learning_rate": 1.9890695280105622e-06, + "loss": 1.3135, + "step": 559 + }, + { + "epoch": 0.07591676269233376, + "grad_norm": 2.2912281840601083, + "learning_rate": 1.9890046818099925e-06, + "loss": 1.2973, + "step": 560 + }, + { + "epoch": 0.07605232833999864, + "grad_norm": 1.644818756516707, + "learning_rate": 1.9889396448876765e-06, + "loss": 1.3705, + "step": 561 + }, + { + "epoch": 0.07618789398766353, + "grad_norm": 9.08861555619498, + "learning_rate": 1.9888744172561563e-06, + "loss": 1.3442, + "step": 562 + }, + { + "epoch": 0.07632345963532841, + "grad_norm": 1.8841781017943136, + "learning_rate": 1.9888089989280107e-06, + "loss": 1.3032, + "step": 563 + }, + { + "epoch": 0.0764590252829933, + "grad_norm": 1.76595004738289, + "learning_rate": 1.9887433899158547e-06, + "loss": 1.3359, + "step": 564 + }, + { + "epoch": 0.07659459093065817, + 
"grad_norm": 1.811378737253356, + "learning_rate": 1.9886775902323402e-06, + "loss": 1.3389, + "step": 565 + }, + { + "epoch": 0.07673015657832305, + "grad_norm": 1.846512038408247, + "learning_rate": 1.9886115998901572e-06, + "loss": 1.3476, + "step": 566 + }, + { + "epoch": 0.07686572222598793, + "grad_norm": 1.696442680567805, + "learning_rate": 1.9885454189020303e-06, + "loss": 1.3373, + "step": 567 + }, + { + "epoch": 0.07700128787365282, + "grad_norm": 1.9154313166023091, + "learning_rate": 1.988479047280723e-06, + "loss": 1.3417, + "step": 568 + }, + { + "epoch": 0.0771368535213177, + "grad_norm": 1.8235657810591313, + "learning_rate": 1.9884124850390336e-06, + "loss": 1.3175, + "step": 569 + }, + { + "epoch": 0.07727241916898257, + "grad_norm": 2.4242289904845973, + "learning_rate": 1.9883457321897984e-06, + "loss": 1.3445, + "step": 570 + }, + { + "epoch": 0.07740798481664746, + "grad_norm": 3.555719887458753, + "learning_rate": 1.9882787887458907e-06, + "loss": 1.3167, + "step": 571 + }, + { + "epoch": 0.07754355046431234, + "grad_norm": 2.669508777850099, + "learning_rate": 1.988211654720219e-06, + "loss": 1.363, + "step": 572 + }, + { + "epoch": 0.07767911611197723, + "grad_norm": 1.8815902059867413, + "learning_rate": 1.9881443301257308e-06, + "loss": 1.2985, + "step": 573 + }, + { + "epoch": 0.07781468175964211, + "grad_norm": 25.71447319624708, + "learning_rate": 1.988076814975408e-06, + "loss": 1.3131, + "step": 574 + }, + { + "epoch": 0.077950247407307, + "grad_norm": 1.7573693927022893, + "learning_rate": 1.988009109282271e-06, + "loss": 1.332, + "step": 575 + }, + { + "epoch": 0.07808581305497186, + "grad_norm": 1.6348102745109545, + "learning_rate": 1.9879412130593765e-06, + "loss": 1.3468, + "step": 576 + }, + { + "epoch": 0.07822137870263675, + "grad_norm": 1.701145292280483, + "learning_rate": 1.9878731263198165e-06, + "loss": 1.2994, + "step": 577 + }, + { + "epoch": 0.07835694435030163, + "grad_norm": 1.8725805658805585, + "learning_rate": 1.987804849076723e-06, + "loss": 1.2879, + "step": 578 + }, + { + "epoch": 0.07849250999796652, + "grad_norm": 2.264829991294664, + "learning_rate": 1.9877363813432607e-06, + "loss": 1.35, + "step": 579 + }, + { + "epoch": 0.0786280756456314, + "grad_norm": 1.945957926942763, + "learning_rate": 1.9876677231326347e-06, + "loss": 1.3064, + "step": 580 + }, + { + "epoch": 0.07876364129329629, + "grad_norm": 2.327411422070427, + "learning_rate": 1.9875988744580837e-06, + "loss": 1.3047, + "step": 581 + }, + { + "epoch": 0.07889920694096116, + "grad_norm": 1.665435267231894, + "learning_rate": 1.987529835332886e-06, + "loss": 1.2773, + "step": 582 + }, + { + "epoch": 0.07903477258862604, + "grad_norm": 1.6575112955776419, + "learning_rate": 1.9874606057703546e-06, + "loss": 1.3295, + "step": 583 + }, + { + "epoch": 0.07917033823629092, + "grad_norm": 3.226961937632259, + "learning_rate": 1.9873911857838395e-06, + "loss": 1.3032, + "step": 584 + }, + { + "epoch": 0.07930590388395581, + "grad_norm": 1.7486366325043978, + "learning_rate": 1.9873215753867286e-06, + "loss": 1.3373, + "step": 585 + }, + { + "epoch": 0.07944146953162069, + "grad_norm": 2.1368362517881883, + "learning_rate": 1.987251774592445e-06, + "loss": 1.3287, + "step": 586 + }, + { + "epoch": 0.07957703517928556, + "grad_norm": 2.0595432398623705, + "learning_rate": 1.98718178341445e-06, + "loss": 1.3262, + "step": 587 + }, + { + "epoch": 0.07971260082695045, + "grad_norm": 1.6285923797271558, + "learning_rate": 1.9871116018662403e-06, + "loss": 1.3154, + "step": 588 + 
}, + { + "epoch": 0.07984816647461533, + "grad_norm": 2.6241331988164336, + "learning_rate": 1.98704122996135e-06, + "loss": 1.3141, + "step": 589 + }, + { + "epoch": 0.07998373212228022, + "grad_norm": 4.23393662979764, + "learning_rate": 1.9869706677133493e-06, + "loss": 1.3215, + "step": 590 + }, + { + "epoch": 0.0801192977699451, + "grad_norm": 2.99604988776911, + "learning_rate": 1.9868999151358465e-06, + "loss": 1.3472, + "step": 591 + }, + { + "epoch": 0.08025486341760998, + "grad_norm": 2.6833323847670068, + "learning_rate": 1.9868289722424846e-06, + "loss": 1.3142, + "step": 592 + }, + { + "epoch": 0.08039042906527485, + "grad_norm": 2.0107233164231775, + "learning_rate": 1.9867578390469454e-06, + "loss": 1.3189, + "step": 593 + }, + { + "epoch": 0.08052599471293974, + "grad_norm": 1.5800274099821496, + "learning_rate": 1.986686515562946e-06, + "loss": 1.3053, + "step": 594 + }, + { + "epoch": 0.08066156036060462, + "grad_norm": 1.673358445482914, + "learning_rate": 1.9866150018042403e-06, + "loss": 1.3282, + "step": 595 + }, + { + "epoch": 0.08079712600826951, + "grad_norm": 1.6658494829270096, + "learning_rate": 1.986543297784619e-06, + "loss": 1.3365, + "step": 596 + }, + { + "epoch": 0.08093269165593439, + "grad_norm": 1.7362271437421009, + "learning_rate": 1.9864714035179108e-06, + "loss": 1.3152, + "step": 597 + }, + { + "epoch": 0.08106825730359926, + "grad_norm": 1.9905961523160542, + "learning_rate": 1.986399319017979e-06, + "loss": 1.2946, + "step": 598 + }, + { + "epoch": 0.08120382295126415, + "grad_norm": 1.8897185468511393, + "learning_rate": 1.986327044298724e-06, + "loss": 1.3204, + "step": 599 + }, + { + "epoch": 0.08133938859892903, + "grad_norm": 2.4019853695910687, + "learning_rate": 1.986254579374085e-06, + "loss": 1.2745, + "step": 600 + }, + { + "epoch": 0.08147495424659391, + "grad_norm": 2.3432565523531297, + "learning_rate": 1.9861819242580353e-06, + "loss": 1.2969, + "step": 601 + }, + { + "epoch": 0.0816105198942588, + "grad_norm": 1.926296112664522, + "learning_rate": 1.9861090789645855e-06, + "loss": 1.293, + "step": 602 + }, + { + "epoch": 0.08174608554192368, + "grad_norm": 2.59410056691364, + "learning_rate": 1.9860360435077837e-06, + "loss": 1.304, + "step": 603 + }, + { + "epoch": 0.08188165118958855, + "grad_norm": 1.3981913062724594, + "learning_rate": 1.9859628179017142e-06, + "loss": 1.2963, + "step": 604 + }, + { + "epoch": 0.08201721683725344, + "grad_norm": 1.5701208165434493, + "learning_rate": 1.985889402160498e-06, + "loss": 1.3189, + "step": 605 + }, + { + "epoch": 0.08215278248491832, + "grad_norm": 1.9405410445921778, + "learning_rate": 1.985815796298293e-06, + "loss": 1.3173, + "step": 606 + }, + { + "epoch": 0.0822883481325832, + "grad_norm": 1.685927753793799, + "learning_rate": 1.985742000329293e-06, + "loss": 1.3216, + "step": 607 + }, + { + "epoch": 0.08242391378024809, + "grad_norm": 1.6328487559416636, + "learning_rate": 1.9856680142677294e-06, + "loss": 1.2797, + "step": 608 + }, + { + "epoch": 0.08255947942791296, + "grad_norm": 2.149721559747031, + "learning_rate": 1.9855938381278698e-06, + "loss": 1.307, + "step": 609 + }, + { + "epoch": 0.08269504507557784, + "grad_norm": 1.6690622513774969, + "learning_rate": 1.985519471924018e-06, + "loss": 1.2972, + "step": 610 + }, + { + "epoch": 0.08283061072324273, + "grad_norm": 1.943602776666496, + "learning_rate": 1.985444915670515e-06, + "loss": 1.3098, + "step": 611 + }, + { + "epoch": 0.08296617637090761, + "grad_norm": 1.6234049666292307, + "learning_rate": 
1.9853701693817393e-06, + "loss": 1.2694, + "step": 612 + }, + { + "epoch": 0.0831017420185725, + "grad_norm": 1.9559771686270697, + "learning_rate": 1.985295233072104e-06, + "loss": 1.2986, + "step": 613 + }, + { + "epoch": 0.08323730766623738, + "grad_norm": 1.5644976164509021, + "learning_rate": 1.9852201067560607e-06, + "loss": 1.332, + "step": 614 + }, + { + "epoch": 0.08337287331390225, + "grad_norm": 1.8936209555710235, + "learning_rate": 1.9851447904480964e-06, + "loss": 1.308, + "step": 615 + }, + { + "epoch": 0.08350843896156714, + "grad_norm": 2.1119806571030755, + "learning_rate": 1.9850692841627356e-06, + "loss": 1.326, + "step": 616 + }, + { + "epoch": 0.08364400460923202, + "grad_norm": 2.7432599637263375, + "learning_rate": 1.984993587914539e-06, + "loss": 1.2748, + "step": 617 + }, + { + "epoch": 0.0837795702568969, + "grad_norm": 2.955570350223664, + "learning_rate": 1.9849177017181044e-06, + "loss": 1.2843, + "step": 618 + }, + { + "epoch": 0.08391513590456179, + "grad_norm": 1.763788951291223, + "learning_rate": 1.984841625588065e-06, + "loss": 1.2887, + "step": 619 + }, + { + "epoch": 0.08405070155222667, + "grad_norm": 1.9063709899540533, + "learning_rate": 1.9847653595390923e-06, + "loss": 1.2955, + "step": 620 + }, + { + "epoch": 0.08418626719989154, + "grad_norm": 1.7814882378794943, + "learning_rate": 1.984688903585893e-06, + "loss": 1.2943, + "step": 621 + }, + { + "epoch": 0.08432183284755643, + "grad_norm": 1.6538040341025528, + "learning_rate": 1.9846122577432116e-06, + "loss": 1.289, + "step": 622 + }, + { + "epoch": 0.08445739849522131, + "grad_norm": 1.626509929450783, + "learning_rate": 1.9845354220258283e-06, + "loss": 1.2669, + "step": 623 + }, + { + "epoch": 0.0845929641428862, + "grad_norm": 2.4129110106206206, + "learning_rate": 1.9844583964485604e-06, + "loss": 1.297, + "step": 624 + }, + { + "epoch": 0.08472852979055108, + "grad_norm": 1.5904855008857377, + "learning_rate": 1.9843811810262612e-06, + "loss": 1.3068, + "step": 625 + }, + { + "epoch": 0.08486409543821595, + "grad_norm": 1.6307229101444674, + "learning_rate": 1.984303775773822e-06, + "loss": 1.3083, + "step": 626 + }, + { + "epoch": 0.08499966108588083, + "grad_norm": 2.0429567886254807, + "learning_rate": 1.9842261807061685e-06, + "loss": 1.3056, + "step": 627 + }, + { + "epoch": 0.08513522673354572, + "grad_norm": 1.719497046669397, + "learning_rate": 1.984148395838266e-06, + "loss": 1.3128, + "step": 628 + }, + { + "epoch": 0.0852707923812106, + "grad_norm": 3.7124516685084967, + "learning_rate": 1.984070421185113e-06, + "loss": 1.296, + "step": 629 + }, + { + "epoch": 0.08540635802887549, + "grad_norm": 1.6999314916102728, + "learning_rate": 1.983992256761747e-06, + "loss": 1.2888, + "step": 630 + }, + { + "epoch": 0.08554192367654037, + "grad_norm": 1.8007454592754037, + "learning_rate": 1.983913902583242e-06, + "loss": 1.2886, + "step": 631 + }, + { + "epoch": 0.08567748932420524, + "grad_norm": 2.9241146857282274, + "learning_rate": 1.983835358664707e-06, + "loss": 1.3426, + "step": 632 + }, + { + "epoch": 0.08581305497187013, + "grad_norm": 2.638871536182331, + "learning_rate": 1.9837566250212894e-06, + "loss": 1.3161, + "step": 633 + }, + { + "epoch": 0.08594862061953501, + "grad_norm": 10.18730257844879, + "learning_rate": 1.9836777016681723e-06, + "loss": 1.2516, + "step": 634 + }, + { + "epoch": 0.0860841862671999, + "grad_norm": 1.8278166272597558, + "learning_rate": 1.9835985886205744e-06, + "loss": 1.3002, + "step": 635 + }, + { + "epoch": 0.08621975191486478, + 
"grad_norm": 2.2503685246305514, + "learning_rate": 1.983519285893753e-06, + "loss": 1.3357, + "step": 636 + }, + { + "epoch": 0.08635531756252965, + "grad_norm": 1.8171874129718046, + "learning_rate": 1.983439793503e-06, + "loss": 1.2891, + "step": 637 + }, + { + "epoch": 0.08649088321019453, + "grad_norm": 1.6615401794775384, + "learning_rate": 1.9833601114636465e-06, + "loss": 1.2887, + "step": 638 + }, + { + "epoch": 0.08662644885785942, + "grad_norm": 1.5722956969740767, + "learning_rate": 1.9832802397910578e-06, + "loss": 1.3002, + "step": 639 + }, + { + "epoch": 0.0867620145055243, + "grad_norm": 1.853607668694328, + "learning_rate": 1.983200178500636e-06, + "loss": 1.3179, + "step": 640 + }, + { + "epoch": 0.08689758015318919, + "grad_norm": 1.977864103054011, + "learning_rate": 1.9831199276078208e-06, + "loss": 1.3163, + "step": 641 + }, + { + "epoch": 0.08703314580085407, + "grad_norm": 1.6883626458669663, + "learning_rate": 1.9830394871280876e-06, + "loss": 1.2633, + "step": 642 + }, + { + "epoch": 0.08716871144851894, + "grad_norm": 2.0902928480757486, + "learning_rate": 1.982958857076949e-06, + "loss": 1.3156, + "step": 643 + }, + { + "epoch": 0.08730427709618382, + "grad_norm": 1.9964026488196194, + "learning_rate": 1.982878037469954e-06, + "loss": 1.3076, + "step": 644 + }, + { + "epoch": 0.08743984274384871, + "grad_norm": 1.957809606861147, + "learning_rate": 1.9827970283226883e-06, + "loss": 1.325, + "step": 645 + }, + { + "epoch": 0.08757540839151359, + "grad_norm": 2.5994458860087404, + "learning_rate": 1.9827158296507727e-06, + "loss": 1.3125, + "step": 646 + }, + { + "epoch": 0.08771097403917848, + "grad_norm": 2.268235462168707, + "learning_rate": 1.9826344414698667e-06, + "loss": 1.2847, + "step": 647 + }, + { + "epoch": 0.08784653968684335, + "grad_norm": 2.817579289188683, + "learning_rate": 1.982552863795665e-06, + "loss": 1.2827, + "step": 648 + }, + { + "epoch": 0.08798210533450823, + "grad_norm": 2.1051394767711793, + "learning_rate": 1.9824710966438995e-06, + "loss": 1.3095, + "step": 649 + }, + { + "epoch": 0.08811767098217312, + "grad_norm": 2.8910620938692984, + "learning_rate": 1.982389140030338e-06, + "loss": 1.2507, + "step": 650 + }, + { + "epoch": 0.088253236629838, + "grad_norm": 2.082467176207069, + "learning_rate": 1.9823069939707856e-06, + "loss": 1.3002, + "step": 651 + }, + { + "epoch": 0.08838880227750288, + "grad_norm": 1.5986042189962644, + "learning_rate": 1.982224658481083e-06, + "loss": 1.2903, + "step": 652 + }, + { + "epoch": 0.08852436792516777, + "grad_norm": 2.23290234400066, + "learning_rate": 1.9821421335771084e-06, + "loss": 1.2983, + "step": 653 + }, + { + "epoch": 0.08865993357283264, + "grad_norm": 8.889289178200807, + "learning_rate": 1.9820594192747757e-06, + "loss": 1.3132, + "step": 654 + }, + { + "epoch": 0.08879549922049752, + "grad_norm": 2.4479876304236416, + "learning_rate": 1.981976515590036e-06, + "loss": 1.3198, + "step": 655 + }, + { + "epoch": 0.08893106486816241, + "grad_norm": 2.5584256796461555, + "learning_rate": 1.9818934225388765e-06, + "loss": 1.3321, + "step": 656 + }, + { + "epoch": 0.08906663051582729, + "grad_norm": 1.8145730966039297, + "learning_rate": 1.981810140137321e-06, + "loss": 1.3008, + "step": 657 + }, + { + "epoch": 0.08920219616349218, + "grad_norm": 1.5726055890786308, + "learning_rate": 1.9817266684014303e-06, + "loss": 1.2683, + "step": 658 + }, + { + "epoch": 0.08933776181115705, + "grad_norm": 1.7183945586698774, + "learning_rate": 1.9816430073473005e-06, + "loss": 1.2958, + "step": 659 
+ }, + { + "epoch": 0.08947332745882193, + "grad_norm": 3.4342620071039622, + "learning_rate": 1.9815591569910653e-06, + "loss": 1.3111, + "step": 660 + }, + { + "epoch": 0.08960889310648681, + "grad_norm": 2.8110185848172176, + "learning_rate": 1.9814751173488944e-06, + "loss": 1.316, + "step": 661 + }, + { + "epoch": 0.0897444587541517, + "grad_norm": 2.615068571474921, + "learning_rate": 1.981390888436995e-06, + "loss": 1.352, + "step": 662 + }, + { + "epoch": 0.08988002440181658, + "grad_norm": 1.8585778258557208, + "learning_rate": 1.981306470271609e-06, + "loss": 1.3029, + "step": 663 + }, + { + "epoch": 0.09001559004948147, + "grad_norm": 1.7389218139815166, + "learning_rate": 1.9812218628690165e-06, + "loss": 1.2966, + "step": 664 + }, + { + "epoch": 0.09015115569714634, + "grad_norm": 4.952540461760087, + "learning_rate": 1.981137066245533e-06, + "loss": 1.2745, + "step": 665 + }, + { + "epoch": 0.09028672134481122, + "grad_norm": 1.9291163935026163, + "learning_rate": 1.981052080417511e-06, + "loss": 1.3078, + "step": 666 + }, + { + "epoch": 0.0904222869924761, + "grad_norm": 2.146206080448223, + "learning_rate": 1.980966905401339e-06, + "loss": 1.2654, + "step": 667 + }, + { + "epoch": 0.09055785264014099, + "grad_norm": 1.77300256338064, + "learning_rate": 1.9808815412134424e-06, + "loss": 1.2879, + "step": 668 + }, + { + "epoch": 0.09069341828780587, + "grad_norm": 3.149868833764891, + "learning_rate": 1.9807959878702833e-06, + "loss": 1.2743, + "step": 669 + }, + { + "epoch": 0.09082898393547076, + "grad_norm": 2.257259974548039, + "learning_rate": 1.98071024538836e-06, + "loss": 1.3172, + "step": 670 + }, + { + "epoch": 0.09096454958313563, + "grad_norm": 3.729463163635312, + "learning_rate": 1.980624313784207e-06, + "loss": 1.2847, + "step": 671 + }, + { + "epoch": 0.09110011523080051, + "grad_norm": 2.1716582740178567, + "learning_rate": 1.980538193074396e-06, + "loss": 1.277, + "step": 672 + }, + { + "epoch": 0.0912356808784654, + "grad_norm": 2.890143951414565, + "learning_rate": 1.980451883275534e-06, + "loss": 1.2812, + "step": 673 + }, + { + "epoch": 0.09137124652613028, + "grad_norm": 2.981498260989165, + "learning_rate": 1.9803653844042655e-06, + "loss": 1.3035, + "step": 674 + }, + { + "epoch": 0.09150681217379517, + "grad_norm": 1.6551007449515491, + "learning_rate": 1.9802786964772714e-06, + "loss": 1.2927, + "step": 675 + }, + { + "epoch": 0.09164237782146004, + "grad_norm": 1.7370072475846503, + "learning_rate": 1.9801918195112684e-06, + "loss": 1.3014, + "step": 676 + }, + { + "epoch": 0.09177794346912492, + "grad_norm": 1.7835657373679847, + "learning_rate": 1.9801047535230103e-06, + "loss": 1.3107, + "step": 677 + }, + { + "epoch": 0.0919135091167898, + "grad_norm": 1.6517496957835933, + "learning_rate": 1.9800174985292866e-06, + "loss": 1.3089, + "step": 678 + }, + { + "epoch": 0.09204907476445469, + "grad_norm": 1.7582910370251583, + "learning_rate": 1.9799300545469248e-06, + "loss": 1.3054, + "step": 679 + }, + { + "epoch": 0.09218464041211957, + "grad_norm": 1.738141114286796, + "learning_rate": 1.9798424215927864e-06, + "loss": 1.2509, + "step": 680 + }, + { + "epoch": 0.09232020605978446, + "grad_norm": 2.669283246147495, + "learning_rate": 1.979754599683772e-06, + "loss": 1.3169, + "step": 681 + }, + { + "epoch": 0.09245577170744933, + "grad_norm": 1.7425270464921725, + "learning_rate": 1.979666588836816e-06, + "loss": 1.2787, + "step": 682 + }, + { + "epoch": 0.09259133735511421, + "grad_norm": 2.688752471351209, + "learning_rate": 
1.9795783890688917e-06, + "loss": 1.3168, + "step": 683 + }, + { + "epoch": 0.0927269030027791, + "grad_norm": 4.1258731213151005, + "learning_rate": 1.9794900003970073e-06, + "loss": 1.2992, + "step": 684 + }, + { + "epoch": 0.09286246865044398, + "grad_norm": 2.461363204750844, + "learning_rate": 1.9794014228382085e-06, + "loss": 1.2795, + "step": 685 + }, + { + "epoch": 0.09299803429810886, + "grad_norm": 2.285789521495444, + "learning_rate": 1.9793126564095756e-06, + "loss": 1.3117, + "step": 686 + }, + { + "epoch": 0.09313359994577373, + "grad_norm": 1.8570426717555448, + "learning_rate": 1.979223701128227e-06, + "loss": 1.3341, + "step": 687 + }, + { + "epoch": 0.09326916559343862, + "grad_norm": 14.683498484451093, + "learning_rate": 1.979134557011318e-06, + "loss": 1.2935, + "step": 688 + }, + { + "epoch": 0.0934047312411035, + "grad_norm": 8.748145918903639, + "learning_rate": 1.979045224076038e-06, + "loss": 1.3257, + "step": 689 + }, + { + "epoch": 0.09354029688876839, + "grad_norm": 2.2387249410332806, + "learning_rate": 1.9789557023396145e-06, + "loss": 1.2935, + "step": 690 + }, + { + "epoch": 0.09367586253643327, + "grad_norm": 2.157844367263473, + "learning_rate": 1.9788659918193115e-06, + "loss": 1.3164, + "step": 691 + }, + { + "epoch": 0.09381142818409816, + "grad_norm": 1.5988376430575066, + "learning_rate": 1.9787760925324285e-06, + "loss": 1.3099, + "step": 692 + }, + { + "epoch": 0.09394699383176303, + "grad_norm": 1.812210649901409, + "learning_rate": 1.9786860044963023e-06, + "loss": 1.2925, + "step": 693 + }, + { + "epoch": 0.09408255947942791, + "grad_norm": 1.8853281664040753, + "learning_rate": 1.978595727728305e-06, + "loss": 1.3099, + "step": 694 + }, + { + "epoch": 0.0942181251270928, + "grad_norm": 2.140280857061623, + "learning_rate": 1.9785052622458467e-06, + "loss": 1.292, + "step": 695 + }, + { + "epoch": 0.09435369077475768, + "grad_norm": 2.9562521207178305, + "learning_rate": 1.978414608066372e-06, + "loss": 1.3156, + "step": 696 + }, + { + "epoch": 0.09448925642242256, + "grad_norm": 2.9781537670570426, + "learning_rate": 1.9783237652073633e-06, + "loss": 1.2613, + "step": 697 + }, + { + "epoch": 0.09462482207008743, + "grad_norm": 1.9774813064062517, + "learning_rate": 1.978232733686339e-06, + "loss": 1.2681, + "step": 698 + }, + { + "epoch": 0.09476038771775232, + "grad_norm": 2.626513427003976, + "learning_rate": 1.9781415135208536e-06, + "loss": 1.2904, + "step": 699 + }, + { + "epoch": 0.0948959533654172, + "grad_norm": 2.3535325902070685, + "learning_rate": 1.9780501047284983e-06, + "loss": 1.2931, + "step": 700 + }, + { + "epoch": 0.09503151901308209, + "grad_norm": 2.298242009912923, + "learning_rate": 1.977958507326901e-06, + "loss": 1.2777, + "step": 701 + }, + { + "epoch": 0.09516708466074697, + "grad_norm": 2.693102603517135, + "learning_rate": 1.9778667213337242e-06, + "loss": 1.3003, + "step": 702 + }, + { + "epoch": 0.09530265030841185, + "grad_norm": 2.1315610040774, + "learning_rate": 1.97777474676667e-06, + "loss": 1.2832, + "step": 703 + }, + { + "epoch": 0.09543821595607672, + "grad_norm": 2.3447838639744174, + "learning_rate": 1.9776825836434733e-06, + "loss": 1.2793, + "step": 704 + }, + { + "epoch": 0.09557378160374161, + "grad_norm": 2.049338367582136, + "learning_rate": 1.977590231981908e-06, + "loss": 1.2853, + "step": 705 + }, + { + "epoch": 0.09570934725140649, + "grad_norm": 1.5606387354423124, + "learning_rate": 1.977497691799783e-06, + "loss": 1.2851, + "step": 706 + }, + { + "epoch": 0.09584491289907138, + 
"grad_norm": 2.067604430670879, + "learning_rate": 1.9774049631149443e-06, + "loss": 1.2953, + "step": 707 + }, + { + "epoch": 0.09598047854673626, + "grad_norm": 1.802172800723547, + "learning_rate": 1.977312045945273e-06, + "loss": 1.2929, + "step": 708 + }, + { + "epoch": 0.09611604419440115, + "grad_norm": 2.5943794192930403, + "learning_rate": 1.9772189403086884e-06, + "loss": 1.2819, + "step": 709 + }, + { + "epoch": 0.09625160984206602, + "grad_norm": 2.632061215492049, + "learning_rate": 1.977125646223145e-06, + "loss": 1.2842, + "step": 710 + }, + { + "epoch": 0.0963871754897309, + "grad_norm": 2.0232351366733514, + "learning_rate": 1.977032163706633e-06, + "loss": 1.339, + "step": 711 + }, + { + "epoch": 0.09652274113739578, + "grad_norm": 2.096220216458923, + "learning_rate": 1.976938492777182e-06, + "loss": 1.2635, + "step": 712 + }, + { + "epoch": 0.09665830678506067, + "grad_norm": 2.1278270479308805, + "learning_rate": 1.976844633452853e-06, + "loss": 1.2894, + "step": 713 + }, + { + "epoch": 0.09679387243272555, + "grad_norm": 3.0226344196280897, + "learning_rate": 1.976750585751747e-06, + "loss": 1.2707, + "step": 714 + }, + { + "epoch": 0.09692943808039042, + "grad_norm": 2.127170909366041, + "learning_rate": 1.9766563496920014e-06, + "loss": 1.307, + "step": 715 + }, + { + "epoch": 0.09706500372805531, + "grad_norm": 2.273999974148821, + "learning_rate": 1.9765619252917873e-06, + "loss": 1.2934, + "step": 716 + }, + { + "epoch": 0.09720056937572019, + "grad_norm": 1.6880811693370674, + "learning_rate": 1.9764673125693146e-06, + "loss": 1.3059, + "step": 717 + }, + { + "epoch": 0.09733613502338508, + "grad_norm": 1.6825723680314157, + "learning_rate": 1.9763725115428284e-06, + "loss": 1.2697, + "step": 718 + }, + { + "epoch": 0.09747170067104996, + "grad_norm": 1.8328608028971138, + "learning_rate": 1.9762775222306107e-06, + "loss": 1.2984, + "step": 719 + }, + { + "epoch": 0.09760726631871484, + "grad_norm": 1.9521389861601184, + "learning_rate": 1.976182344650979e-06, + "loss": 1.2845, + "step": 720 + }, + { + "epoch": 0.09774283196637971, + "grad_norm": 1.9141365752315207, + "learning_rate": 1.9760869788222873e-06, + "loss": 1.273, + "step": 721 + }, + { + "epoch": 0.0978783976140446, + "grad_norm": 3.46228714113412, + "learning_rate": 1.9759914247629264e-06, + "loss": 1.3164, + "step": 722 + }, + { + "epoch": 0.09801396326170948, + "grad_norm": 2.636105200779348, + "learning_rate": 1.975895682491324e-06, + "loss": 1.2659, + "step": 723 + }, + { + "epoch": 0.09814952890937437, + "grad_norm": 1.8191098875995824, + "learning_rate": 1.975799752025942e-06, + "loss": 1.3207, + "step": 724 + }, + { + "epoch": 0.09828509455703925, + "grad_norm": 1.6287885328419887, + "learning_rate": 1.97570363338528e-06, + "loss": 1.2816, + "step": 725 + }, + { + "epoch": 0.09842066020470412, + "grad_norm": 2.195093097793652, + "learning_rate": 1.9756073265878746e-06, + "loss": 1.2932, + "step": 726 + }, + { + "epoch": 0.098556225852369, + "grad_norm": 3.383177433433686, + "learning_rate": 1.9755108316522967e-06, + "loss": 1.295, + "step": 727 + }, + { + "epoch": 0.09869179150003389, + "grad_norm": 1.7483507298621606, + "learning_rate": 1.9754141485971555e-06, + "loss": 1.2927, + "step": 728 + }, + { + "epoch": 0.09882735714769877, + "grad_norm": 2.1535280717785517, + "learning_rate": 1.9753172774410952e-06, + "loss": 1.3052, + "step": 729 + }, + { + "epoch": 0.09896292279536366, + "grad_norm": 2.879176741086381, + "learning_rate": 1.9752202182027967e-06, + "loss": 1.2871, + "step": 730 + }, 
+ { + "epoch": 0.09909848844302854, + "grad_norm": 1.6547755739745607, + "learning_rate": 1.9751229709009767e-06, + "loss": 1.2689, + "step": 731 + }, + { + "epoch": 0.09923405409069341, + "grad_norm": 2.6778763115096047, + "learning_rate": 1.975025535554389e-06, + "loss": 1.3304, + "step": 732 + }, + { + "epoch": 0.0993696197383583, + "grad_norm": 1.9554611877554566, + "learning_rate": 1.9749279121818236e-06, + "loss": 1.2629, + "step": 733 + }, + { + "epoch": 0.09950518538602318, + "grad_norm": 2.5895213079441564, + "learning_rate": 1.9748301008021055e-06, + "loss": 1.2951, + "step": 734 + }, + { + "epoch": 0.09964075103368807, + "grad_norm": 1.8830710133844804, + "learning_rate": 1.9747321014340974e-06, + "loss": 1.2463, + "step": 735 + }, + { + "epoch": 0.09977631668135295, + "grad_norm": 2.995163491828962, + "learning_rate": 1.974633914096698e-06, + "loss": 1.2704, + "step": 736 + }, + { + "epoch": 0.09991188232901782, + "grad_norm": 2.5598629608688963, + "learning_rate": 1.974535538808841e-06, + "loss": 1.2846, + "step": 737 + }, + { + "epoch": 0.1000474479766827, + "grad_norm": 2.0734350228706346, + "learning_rate": 1.9744369755894977e-06, + "loss": 1.3146, + "step": 738 + }, + { + "epoch": 0.10018301362434759, + "grad_norm": 1.6781249405011838, + "learning_rate": 1.974338224457676e-06, + "loss": 1.2777, + "step": 739 + }, + { + "epoch": 0.10031857927201247, + "grad_norm": 2.1380895070761885, + "learning_rate": 1.9742392854324186e-06, + "loss": 1.3122, + "step": 740 + }, + { + "epoch": 0.10045414491967736, + "grad_norm": 1.6501329069153798, + "learning_rate": 1.974140158532805e-06, + "loss": 1.2992, + "step": 741 + }, + { + "epoch": 0.10058971056734224, + "grad_norm": 1.8136359814814544, + "learning_rate": 1.974040843777951e-06, + "loss": 1.2417, + "step": 742 + }, + { + "epoch": 0.10072527621500711, + "grad_norm": 1.9076074296978016, + "learning_rate": 1.973941341187009e-06, + "loss": 1.2978, + "step": 743 + }, + { + "epoch": 0.100860841862672, + "grad_norm": 1.9186272858910556, + "learning_rate": 1.9738416507791676e-06, + "loss": 1.3084, + "step": 744 + }, + { + "epoch": 0.10099640751033688, + "grad_norm": 1.9102528585686884, + "learning_rate": 1.9737417725736507e-06, + "loss": 1.254, + "step": 745 + }, + { + "epoch": 0.10113197315800176, + "grad_norm": 2.9505095817437352, + "learning_rate": 1.9736417065897187e-06, + "loss": 1.3389, + "step": 746 + }, + { + "epoch": 0.10126753880566665, + "grad_norm": 1.5148047973207992, + "learning_rate": 1.9735414528466694e-06, + "loss": 1.31, + "step": 747 + }, + { + "epoch": 0.10140310445333152, + "grad_norm": 2.137014205899074, + "learning_rate": 1.9734410113638356e-06, + "loss": 1.2898, + "step": 748 + }, + { + "epoch": 0.1015386701009964, + "grad_norm": 1.8516330350598567, + "learning_rate": 1.973340382160587e-06, + "loss": 1.2945, + "step": 749 + }, + { + "epoch": 0.10167423574866129, + "grad_norm": 1.6422561587601099, + "learning_rate": 1.973239565256328e-06, + "loss": 1.3342, + "step": 750 + }, + { + "epoch": 0.10180980139632617, + "grad_norm": 2.0669350804760547, + "learning_rate": 1.973138560670502e-06, + "loss": 1.3171, + "step": 751 + }, + { + "epoch": 0.10194536704399106, + "grad_norm": 1.7629218459705216, + "learning_rate": 1.973037368422585e-06, + "loss": 1.2604, + "step": 752 + }, + { + "epoch": 0.10208093269165594, + "grad_norm": 5.104597459694944, + "learning_rate": 1.9729359885320933e-06, + "loss": 1.2927, + "step": 753 + }, + { + "epoch": 0.10221649833932081, + "grad_norm": 2.656792544547903, + "learning_rate": 
1.9728344210185757e-06, + "loss": 1.2615, + "step": 754 + }, + { + "epoch": 0.1023520639869857, + "grad_norm": 1.7706690342256055, + "learning_rate": 1.9727326659016187e-06, + "loss": 1.3039, + "step": 755 + }, + { + "epoch": 0.10248762963465058, + "grad_norm": 2.3888653875761166, + "learning_rate": 1.972630723200846e-06, + "loss": 1.2685, + "step": 756 + }, + { + "epoch": 0.10262319528231546, + "grad_norm": 1.7093242838517937, + "learning_rate": 1.9725285929359156e-06, + "loss": 1.2772, + "step": 757 + }, + { + "epoch": 0.10275876092998035, + "grad_norm": 1.931207042036317, + "learning_rate": 1.9724262751265222e-06, + "loss": 1.3107, + "step": 758 + }, + { + "epoch": 0.10289432657764523, + "grad_norm": 1.9313959284375775, + "learning_rate": 1.972323769792398e-06, + "loss": 1.3248, + "step": 759 + }, + { + "epoch": 0.1030298922253101, + "grad_norm": 1.7470420632287533, + "learning_rate": 1.97222107695331e-06, + "loss": 1.3145, + "step": 760 + }, + { + "epoch": 0.10316545787297499, + "grad_norm": 1.7506833596889058, + "learning_rate": 1.9721181966290614e-06, + "loss": 1.2725, + "step": 761 + }, + { + "epoch": 0.10330102352063987, + "grad_norm": 2.2227765844322525, + "learning_rate": 1.9720151288394916e-06, + "loss": 1.2934, + "step": 762 + }, + { + "epoch": 0.10343658916830475, + "grad_norm": 8.923500857782122, + "learning_rate": 1.9719118736044773e-06, + "loss": 1.2634, + "step": 763 + }, + { + "epoch": 0.10357215481596964, + "grad_norm": 2.4815429227425034, + "learning_rate": 1.97180843094393e-06, + "loss": 1.2691, + "step": 764 + }, + { + "epoch": 0.10370772046363451, + "grad_norm": 1.8965949437234808, + "learning_rate": 1.9717048008777978e-06, + "loss": 1.3001, + "step": 765 + }, + { + "epoch": 0.1038432861112994, + "grad_norm": 1.6970523853494521, + "learning_rate": 1.9716009834260645e-06, + "loss": 1.3101, + "step": 766 + }, + { + "epoch": 0.10397885175896428, + "grad_norm": 2.1287892034127855, + "learning_rate": 1.971496978608751e-06, + "loss": 1.2856, + "step": 767 + }, + { + "epoch": 0.10411441740662916, + "grad_norm": 2.272158635826368, + "learning_rate": 1.971392786445914e-06, + "loss": 1.2843, + "step": 768 + }, + { + "epoch": 0.10424998305429405, + "grad_norm": 2.858989385698721, + "learning_rate": 1.9712884069576455e-06, + "loss": 1.2594, + "step": 769 + }, + { + "epoch": 0.10438554870195893, + "grad_norm": 1.7109885472915194, + "learning_rate": 1.971183840164075e-06, + "loss": 1.2659, + "step": 770 + }, + { + "epoch": 0.1045211143496238, + "grad_norm": 2.4416709667903858, + "learning_rate": 1.9710790860853667e-06, + "loss": 1.2695, + "step": 771 + }, + { + "epoch": 0.10465667999728868, + "grad_norm": 2.155973359840443, + "learning_rate": 1.9709741447417223e-06, + "loss": 1.2706, + "step": 772 + }, + { + "epoch": 0.10479224564495357, + "grad_norm": 2.699368698716781, + "learning_rate": 1.970869016153378e-06, + "loss": 1.3103, + "step": 773 + }, + { + "epoch": 0.10492781129261845, + "grad_norm": 1.930118773911834, + "learning_rate": 1.9707637003406075e-06, + "loss": 1.2767, + "step": 774 + }, + { + "epoch": 0.10506337694028334, + "grad_norm": 1.6551656384785587, + "learning_rate": 1.9706581973237202e-06, + "loss": 1.2723, + "step": 775 + }, + { + "epoch": 0.10519894258794821, + "grad_norm": 2.8802458808796363, + "learning_rate": 1.9705525071230616e-06, + "loss": 1.2948, + "step": 776 + }, + { + "epoch": 0.10533450823561309, + "grad_norm": 3.240076030853301, + "learning_rate": 1.9704466297590134e-06, + "loss": 1.308, + "step": 777 + }, + { + "epoch": 0.10547007388327798, + 
"grad_norm": 2.199123744097051, + "learning_rate": 1.9703405652519924e-06, + "loss": 1.3226, + "step": 778 + }, + { + "epoch": 0.10560563953094286, + "grad_norm": 2.2457806684071002, + "learning_rate": 1.970234313622453e-06, + "loss": 1.2992, + "step": 779 + }, + { + "epoch": 0.10574120517860774, + "grad_norm": 1.9635550010958975, + "learning_rate": 1.9701278748908844e-06, + "loss": 1.2933, + "step": 780 + }, + { + "epoch": 0.10587677082627263, + "grad_norm": 1.6839729520313818, + "learning_rate": 1.9700212490778136e-06, + "loss": 1.2894, + "step": 781 + }, + { + "epoch": 0.1060123364739375, + "grad_norm": 1.6921055821930304, + "learning_rate": 1.969914436203801e-06, + "loss": 1.2973, + "step": 782 + }, + { + "epoch": 0.10614790212160238, + "grad_norm": 1.9363497053664738, + "learning_rate": 1.9698074362894456e-06, + "loss": 1.311, + "step": 783 + }, + { + "epoch": 0.10628346776926727, + "grad_norm": 2.023254173610856, + "learning_rate": 1.9697002493553815e-06, + "loss": 1.2932, + "step": 784 + }, + { + "epoch": 0.10641903341693215, + "grad_norm": 2.6100216261421747, + "learning_rate": 1.969592875422279e-06, + "loss": 1.2993, + "step": 785 + }, + { + "epoch": 0.10655459906459704, + "grad_norm": 2.1215275608968374, + "learning_rate": 1.9694853145108433e-06, + "loss": 1.2782, + "step": 786 + }, + { + "epoch": 0.1066901647122619, + "grad_norm": 1.6238878608630825, + "learning_rate": 1.969377566641818e-06, + "loss": 1.2838, + "step": 787 + }, + { + "epoch": 0.10682573035992679, + "grad_norm": 4.119280870035211, + "learning_rate": 1.96926963183598e-06, + "loss": 1.301, + "step": 788 + }, + { + "epoch": 0.10696129600759167, + "grad_norm": 1.9355701890231012, + "learning_rate": 1.9691615101141454e-06, + "loss": 1.305, + "step": 789 + }, + { + "epoch": 0.10709686165525656, + "grad_norm": 1.6474418838576739, + "learning_rate": 1.969053201497163e-06, + "loss": 1.2882, + "step": 790 + }, + { + "epoch": 0.10723242730292144, + "grad_norm": 1.6839631639786914, + "learning_rate": 1.96894470600592e-06, + "loss": 1.3116, + "step": 791 + }, + { + "epoch": 0.10736799295058633, + "grad_norm": 4.985632852224656, + "learning_rate": 1.9688360236613388e-06, + "loss": 1.2458, + "step": 792 + }, + { + "epoch": 0.1075035585982512, + "grad_norm": 2.1887392198334625, + "learning_rate": 1.968727154484378e-06, + "loss": 1.3292, + "step": 793 + }, + { + "epoch": 0.10763912424591608, + "grad_norm": 2.308492227569492, + "learning_rate": 1.968618098496032e-06, + "loss": 1.3102, + "step": 794 + }, + { + "epoch": 0.10777468989358097, + "grad_norm": 1.6699178218170494, + "learning_rate": 1.9685088557173318e-06, + "loss": 1.2945, + "step": 795 + }, + { + "epoch": 0.10791025554124585, + "grad_norm": 3.6436982939378835, + "learning_rate": 1.968399426169344e-06, + "loss": 1.2867, + "step": 796 + }, + { + "epoch": 0.10804582118891073, + "grad_norm": 1.9535116332632858, + "learning_rate": 1.9682898098731707e-06, + "loss": 1.262, + "step": 797 + }, + { + "epoch": 0.1081813868365756, + "grad_norm": 1.7396898412854924, + "learning_rate": 1.9681800068499507e-06, + "loss": 1.2865, + "step": 798 + }, + { + "epoch": 0.10831695248424049, + "grad_norm": 2.0637756042332516, + "learning_rate": 1.9680700171208583e-06, + "loss": 1.3389, + "step": 799 + }, + { + "epoch": 0.10845251813190537, + "grad_norm": 2.0990274365213737, + "learning_rate": 1.9679598407071053e-06, + "loss": 1.3072, + "step": 800 + }, + { + "epoch": 0.10858808377957026, + "grad_norm": 2.3830515994777244, + "learning_rate": 1.967849477629937e-06, + "loss": 1.2987, + "step": 801 
+ }, + { + "epoch": 0.10872364942723514, + "grad_norm": 4.349090010698838, + "learning_rate": 1.9677389279106367e-06, + "loss": 1.2903, + "step": 802 + }, + { + "epoch": 0.10885921507490003, + "grad_norm": 2.149092819962164, + "learning_rate": 1.9676281915705236e-06, + "loss": 1.2683, + "step": 803 + }, + { + "epoch": 0.1089947807225649, + "grad_norm": 2.0840672298681366, + "learning_rate": 1.9675172686309516e-06, + "loss": 1.2984, + "step": 804 + }, + { + "epoch": 0.10913034637022978, + "grad_norm": 6.602235811414823, + "learning_rate": 1.9674061591133114e-06, + "loss": 1.2867, + "step": 805 + }, + { + "epoch": 0.10926591201789466, + "grad_norm": 1.939530970256452, + "learning_rate": 1.9672948630390295e-06, + "loss": 1.2606, + "step": 806 + }, + { + "epoch": 0.10940147766555955, + "grad_norm": 1.9495269303440934, + "learning_rate": 1.9671833804295684e-06, + "loss": 1.3018, + "step": 807 + }, + { + "epoch": 0.10953704331322443, + "grad_norm": 1.886758488314711, + "learning_rate": 1.967071711306427e-06, + "loss": 1.3009, + "step": 808 + }, + { + "epoch": 0.10967260896088932, + "grad_norm": 2.0105100247409085, + "learning_rate": 1.96695985569114e-06, + "loss": 1.2937, + "step": 809 + }, + { + "epoch": 0.10980817460855419, + "grad_norm": 1.7572302806629154, + "learning_rate": 1.966847813605277e-06, + "loss": 1.2811, + "step": 810 + }, + { + "epoch": 0.10994374025621907, + "grad_norm": 1.8030651454846838, + "learning_rate": 1.9667355850704456e-06, + "loss": 1.3069, + "step": 811 + }, + { + "epoch": 0.11007930590388396, + "grad_norm": 9.199452857719917, + "learning_rate": 1.9666231701082876e-06, + "loss": 1.2324, + "step": 812 + }, + { + "epoch": 0.11021487155154884, + "grad_norm": 2.074544549899018, + "learning_rate": 1.966510568740481e-06, + "loss": 1.2815, + "step": 813 + }, + { + "epoch": 0.11035043719921372, + "grad_norm": 1.6667479784602792, + "learning_rate": 1.9663977809887406e-06, + "loss": 1.2636, + "step": 814 + }, + { + "epoch": 0.1104860028468786, + "grad_norm": 3.5852841375065716, + "learning_rate": 1.966284806874816e-06, + "loss": 1.2506, + "step": 815 + }, + { + "epoch": 0.11062156849454348, + "grad_norm": 2.2235006736130334, + "learning_rate": 1.966171646420494e-06, + "loss": 1.2695, + "step": 816 + }, + { + "epoch": 0.11075713414220836, + "grad_norm": 2.0287989301548746, + "learning_rate": 1.9660582996475962e-06, + "loss": 1.3164, + "step": 817 + }, + { + "epoch": 0.11089269978987325, + "grad_norm": 2.9817085187786536, + "learning_rate": 1.9659447665779815e-06, + "loss": 1.2497, + "step": 818 + }, + { + "epoch": 0.11102826543753813, + "grad_norm": 1.98866328476989, + "learning_rate": 1.965831047233543e-06, + "loss": 1.2714, + "step": 819 + }, + { + "epoch": 0.11116383108520302, + "grad_norm": 2.0527871928141117, + "learning_rate": 1.965717141636211e-06, + "loss": 1.2984, + "step": 820 + }, + { + "epoch": 0.11129939673286789, + "grad_norm": 1.8755726048148478, + "learning_rate": 1.9656030498079507e-06, + "loss": 1.2861, + "step": 821 + }, + { + "epoch": 0.11143496238053277, + "grad_norm": 4.013263567186776, + "learning_rate": 1.9654887717707645e-06, + "loss": 1.2754, + "step": 822 + }, + { + "epoch": 0.11157052802819765, + "grad_norm": 1.4565672481651146, + "learning_rate": 1.96537430754669e-06, + "loss": 1.3106, + "step": 823 + }, + { + "epoch": 0.11170609367586254, + "grad_norm": 1.621835575727714, + "learning_rate": 1.9652596571578003e-06, + "loss": 1.313, + "step": 824 + }, + { + "epoch": 0.11184165932352742, + "grad_norm": 2.973950594387097, + "learning_rate": 
1.9651448206262047e-06, + "loss": 1.2667, + "step": 825 + }, + { + "epoch": 0.1119772249711923, + "grad_norm": 1.7763998062155748, + "learning_rate": 1.965029797974049e-06, + "loss": 1.2821, + "step": 826 + }, + { + "epoch": 0.11211279061885718, + "grad_norm": 2.670453608161941, + "learning_rate": 1.9649145892235145e-06, + "loss": 1.2608, + "step": 827 + }, + { + "epoch": 0.11224835626652206, + "grad_norm": 2.0262883362644724, + "learning_rate": 1.964799194396818e-06, + "loss": 1.2728, + "step": 828 + }, + { + "epoch": 0.11238392191418695, + "grad_norm": 1.6520496777737694, + "learning_rate": 1.9646836135162125e-06, + "loss": 1.281, + "step": 829 + }, + { + "epoch": 0.11251948756185183, + "grad_norm": 1.595695523534776, + "learning_rate": 1.9645678466039864e-06, + "loss": 1.2536, + "step": 830 + }, + { + "epoch": 0.11265505320951671, + "grad_norm": 5.411731882666048, + "learning_rate": 1.9644518936824658e-06, + "loss": 1.3031, + "step": 831 + }, + { + "epoch": 0.11279061885718158, + "grad_norm": 2.6547023358117494, + "learning_rate": 1.9643357547740097e-06, + "loss": 1.2688, + "step": 832 + }, + { + "epoch": 0.11292618450484647, + "grad_norm": 12.843931013957842, + "learning_rate": 1.9642194299010155e-06, + "loss": 1.2255, + "step": 833 + }, + { + "epoch": 0.11306175015251135, + "grad_norm": 1.76670258545355, + "learning_rate": 1.9641029190859155e-06, + "loss": 1.266, + "step": 834 + }, + { + "epoch": 0.11319731580017624, + "grad_norm": 1.8845052873869277, + "learning_rate": 1.9639862223511777e-06, + "loss": 1.2608, + "step": 835 + }, + { + "epoch": 0.11333288144784112, + "grad_norm": 1.7063028481598366, + "learning_rate": 1.9638693397193057e-06, + "loss": 1.248, + "step": 836 + }, + { + "epoch": 0.11346844709550599, + "grad_norm": 3.4496142001699583, + "learning_rate": 1.9637522712128407e-06, + "loss": 1.2549, + "step": 837 + }, + { + "epoch": 0.11360401274317088, + "grad_norm": 2.9907423891345335, + "learning_rate": 1.963635016854357e-06, + "loss": 1.2898, + "step": 838 + }, + { + "epoch": 0.11373957839083576, + "grad_norm": 1.7291594197658193, + "learning_rate": 1.963517576666467e-06, + "loss": 1.2753, + "step": 839 + }, + { + "epoch": 0.11387514403850064, + "grad_norm": 1.7468045441908089, + "learning_rate": 1.9633999506718176e-06, + "loss": 1.2905, + "step": 840 + }, + { + "epoch": 0.11401070968616553, + "grad_norm": 1.9370862818401682, + "learning_rate": 1.9632821388930926e-06, + "loss": 1.2608, + "step": 841 + }, + { + "epoch": 0.11414627533383041, + "grad_norm": 1.5922048193226943, + "learning_rate": 1.9631641413530102e-06, + "loss": 1.2422, + "step": 842 + }, + { + "epoch": 0.11428184098149528, + "grad_norm": 1.7257862217288342, + "learning_rate": 1.9630459580743264e-06, + "loss": 1.3112, + "step": 843 + }, + { + "epoch": 0.11441740662916017, + "grad_norm": 1.9990011673422183, + "learning_rate": 1.9629275890798315e-06, + "loss": 1.2734, + "step": 844 + }, + { + "epoch": 0.11455297227682505, + "grad_norm": 2.152026658500898, + "learning_rate": 1.962809034392352e-06, + "loss": 1.2815, + "step": 845 + }, + { + "epoch": 0.11468853792448994, + "grad_norm": 4.258337228593757, + "learning_rate": 1.96269029403475e-06, + "loss": 1.2958, + "step": 846 + }, + { + "epoch": 0.11482410357215482, + "grad_norm": 1.9291050880426774, + "learning_rate": 1.962571368029924e-06, + "loss": 1.285, + "step": 847 + }, + { + "epoch": 0.1149596692198197, + "grad_norm": 1.6724379046449602, + "learning_rate": 1.9624522564008074e-06, + "loss": 1.2763, + "step": 848 + }, + { + "epoch": 0.11509523486748457, + 
"grad_norm": 3.1651152850917863, + "learning_rate": 1.9623329591703706e-06, + "loss": 1.2681, + "step": 849 + }, + { + "epoch": 0.11523080051514946, + "grad_norm": 2.7343420437346064, + "learning_rate": 1.962213476361619e-06, + "loss": 1.292, + "step": 850 + }, + { + "epoch": 0.11536636616281434, + "grad_norm": 1.695267138694282, + "learning_rate": 1.962093807997593e-06, + "loss": 1.2771, + "step": 851 + }, + { + "epoch": 0.11550193181047923, + "grad_norm": 1.709075453498453, + "learning_rate": 1.961973954101371e-06, + "loss": 1.2996, + "step": 852 + }, + { + "epoch": 0.11563749745814411, + "grad_norm": 1.5452712494597403, + "learning_rate": 1.961853914696065e-06, + "loss": 1.2941, + "step": 853 + }, + { + "epoch": 0.11577306310580898, + "grad_norm": 1.8496641152540467, + "learning_rate": 1.961733689804824e-06, + "loss": 1.2632, + "step": 854 + }, + { + "epoch": 0.11590862875347387, + "grad_norm": 2.0591135988978353, + "learning_rate": 1.961613279450833e-06, + "loss": 1.2929, + "step": 855 + }, + { + "epoch": 0.11604419440113875, + "grad_norm": 3.148864375452961, + "learning_rate": 1.9614926836573107e-06, + "loss": 1.3012, + "step": 856 + }, + { + "epoch": 0.11617976004880363, + "grad_norm": 2.1658524502033467, + "learning_rate": 1.9613719024475145e-06, + "loss": 1.2759, + "step": 857 + }, + { + "epoch": 0.11631532569646852, + "grad_norm": 1.94764636098015, + "learning_rate": 1.961250935844735e-06, + "loss": 1.3165, + "step": 858 + }, + { + "epoch": 0.1164508913441334, + "grad_norm": 1.8942292246021384, + "learning_rate": 1.9611297838723007e-06, + "loss": 1.2841, + "step": 859 + }, + { + "epoch": 0.11658645699179827, + "grad_norm": 2.4321116562313687, + "learning_rate": 1.961008446553574e-06, + "loss": 1.3274, + "step": 860 + }, + { + "epoch": 0.11672202263946316, + "grad_norm": 1.980741206435538, + "learning_rate": 1.9608869239119545e-06, + "loss": 1.3034, + "step": 861 + }, + { + "epoch": 0.11685758828712804, + "grad_norm": 1.8300223814611152, + "learning_rate": 1.960765215970876e-06, + "loss": 1.2518, + "step": 862 + }, + { + "epoch": 0.11699315393479293, + "grad_norm": 1.999290540014251, + "learning_rate": 1.9606433227538095e-06, + "loss": 1.3021, + "step": 863 + }, + { + "epoch": 0.11712871958245781, + "grad_norm": 1.9433421987123738, + "learning_rate": 1.960521244284261e-06, + "loss": 1.3126, + "step": 864 + }, + { + "epoch": 0.11726428523012268, + "grad_norm": 2.922677324291142, + "learning_rate": 1.960398980585773e-06, + "loss": 1.2659, + "step": 865 + }, + { + "epoch": 0.11739985087778756, + "grad_norm": 2.6106317509618444, + "learning_rate": 1.960276531681922e-06, + "loss": 1.2667, + "step": 866 + }, + { + "epoch": 0.11753541652545245, + "grad_norm": 2.2035710898836802, + "learning_rate": 1.960153897596322e-06, + "loss": 1.2634, + "step": 867 + }, + { + "epoch": 0.11767098217311733, + "grad_norm": 1.8457443220820786, + "learning_rate": 1.960031078352622e-06, + "loss": 1.2751, + "step": 868 + }, + { + "epoch": 0.11780654782078222, + "grad_norm": 2.7417630347189803, + "learning_rate": 1.9599080739745064e-06, + "loss": 1.2657, + "step": 869 + }, + { + "epoch": 0.1179421134684471, + "grad_norm": 2.8792219879296304, + "learning_rate": 1.9597848844856955e-06, + "loss": 1.3047, + "step": 870 + }, + { + "epoch": 0.11807767911611197, + "grad_norm": 2.660781613462646, + "learning_rate": 1.959661509909946e-06, + "loss": 1.2685, + "step": 871 + }, + { + "epoch": 0.11821324476377686, + "grad_norm": 1.7573908242731435, + "learning_rate": 1.9595379502710495e-06, + "loss": 1.2793, + "step": 872 
+ }, + { + "epoch": 0.11834881041144174, + "grad_norm": 1.6085608949520571, + "learning_rate": 1.9594142055928333e-06, + "loss": 1.2519, + "step": 873 + }, + { + "epoch": 0.11848437605910662, + "grad_norm": 2.8299108385855636, + "learning_rate": 1.9592902758991606e-06, + "loss": 1.2835, + "step": 874 + }, + { + "epoch": 0.11861994170677151, + "grad_norm": 1.6412070958979725, + "learning_rate": 1.9591661612139306e-06, + "loss": 1.2926, + "step": 875 + }, + { + "epoch": 0.11875550735443638, + "grad_norm": 2.090974818087307, + "learning_rate": 1.9590418615610775e-06, + "loss": 1.2911, + "step": 876 + }, + { + "epoch": 0.11889107300210126, + "grad_norm": 1.9034483823698836, + "learning_rate": 1.9589173769645714e-06, + "loss": 1.29, + "step": 877 + }, + { + "epoch": 0.11902663864976615, + "grad_norm": 1.5743943509767588, + "learning_rate": 1.958792707448419e-06, + "loss": 1.2853, + "step": 878 + }, + { + "epoch": 0.11916220429743103, + "grad_norm": 4.716083430014237, + "learning_rate": 1.9586678530366606e-06, + "loss": 1.26, + "step": 879 + }, + { + "epoch": 0.11929776994509592, + "grad_norm": 1.7466513412658076, + "learning_rate": 1.958542813753374e-06, + "loss": 1.3245, + "step": 880 + }, + { + "epoch": 0.1194333355927608, + "grad_norm": 2.0779451602717796, + "learning_rate": 1.9584175896226725e-06, + "loss": 1.2903, + "step": 881 + }, + { + "epoch": 0.11956890124042567, + "grad_norm": 1.8622467873014983, + "learning_rate": 1.9582921806687037e-06, + "loss": 1.2846, + "step": 882 + }, + { + "epoch": 0.11970446688809055, + "grad_norm": 1.7881606068964897, + "learning_rate": 1.9581665869156526e-06, + "loss": 1.2576, + "step": 883 + }, + { + "epoch": 0.11984003253575544, + "grad_norm": 6.624315091158297, + "learning_rate": 1.958040808387738e-06, + "loss": 1.2587, + "step": 884 + }, + { + "epoch": 0.11997559818342032, + "grad_norm": 2.013378502046489, + "learning_rate": 1.9579148451092163e-06, + "loss": 1.2644, + "step": 885 + }, + { + "epoch": 0.12011116383108521, + "grad_norm": 3.042492362486355, + "learning_rate": 1.957788697104378e-06, + "loss": 1.3255, + "step": 886 + }, + { + "epoch": 0.12024672947875008, + "grad_norm": 3.322074589893828, + "learning_rate": 1.9576623643975496e-06, + "loss": 1.2778, + "step": 887 + }, + { + "epoch": 0.12038229512641496, + "grad_norm": 2.365116412987425, + "learning_rate": 1.9575358470130934e-06, + "loss": 1.2571, + "step": 888 + }, + { + "epoch": 0.12051786077407985, + "grad_norm": 2.047040810435255, + "learning_rate": 1.9574091449754074e-06, + "loss": 1.2821, + "step": 889 + }, + { + "epoch": 0.12065342642174473, + "grad_norm": 1.610757779140362, + "learning_rate": 1.9572822583089253e-06, + "loss": 1.2956, + "step": 890 + }, + { + "epoch": 0.12078899206940961, + "grad_norm": 1.990129121934713, + "learning_rate": 1.9571551870381163e-06, + "loss": 1.2736, + "step": 891 + }, + { + "epoch": 0.1209245577170745, + "grad_norm": 1.8682483862161126, + "learning_rate": 1.9570279311874842e-06, + "loss": 1.2716, + "step": 892 + }, + { + "epoch": 0.12106012336473937, + "grad_norm": 2.6860639613158197, + "learning_rate": 1.9569004907815706e-06, + "loss": 1.2724, + "step": 893 + }, + { + "epoch": 0.12119568901240425, + "grad_norm": 2.149085553934019, + "learning_rate": 1.9567728658449503e-06, + "loss": 1.2606, + "step": 894 + }, + { + "epoch": 0.12133125466006914, + "grad_norm": 3.020374374777928, + "learning_rate": 1.956645056402235e-06, + "loss": 1.2492, + "step": 895 + }, + { + "epoch": 0.12146682030773402, + "grad_norm": 1.9028471802296274, + "learning_rate": 
1.956517062478072e-06, + "loss": 1.2741, + "step": 896 + }, + { + "epoch": 0.1216023859553989, + "grad_norm": 1.728449278759905, + "learning_rate": 1.956388884097144e-06, + "loss": 1.2943, + "step": 897 + }, + { + "epoch": 0.12173795160306379, + "grad_norm": 1.900866637621388, + "learning_rate": 1.9562605212841686e-06, + "loss": 1.2783, + "step": 898 + }, + { + "epoch": 0.12187351725072866, + "grad_norm": 2.0371819983765747, + "learning_rate": 1.9561319740639e-06, + "loss": 1.2875, + "step": 899 + }, + { + "epoch": 0.12200908289839354, + "grad_norm": 2.3094887316450885, + "learning_rate": 1.9560032424611274e-06, + "loss": 1.2739, + "step": 900 + }, + { + "epoch": 0.12214464854605843, + "grad_norm": 1.9071500886455282, + "learning_rate": 1.955874326500676e-06, + "loss": 1.287, + "step": 901 + }, + { + "epoch": 0.12228021419372331, + "grad_norm": 5.977670056691222, + "learning_rate": 1.955745226207406e-06, + "loss": 1.2504, + "step": 902 + }, + { + "epoch": 0.1224157798413882, + "grad_norm": 1.6115797242092216, + "learning_rate": 1.9556159416062127e-06, + "loss": 1.2397, + "step": 903 + }, + { + "epoch": 0.12255134548905307, + "grad_norm": 11.68755160476082, + "learning_rate": 1.955486472722029e-06, + "loss": 1.306, + "step": 904 + }, + { + "epoch": 0.12268691113671795, + "grad_norm": 2.3776527209937637, + "learning_rate": 1.955356819579821e-06, + "loss": 1.2542, + "step": 905 + }, + { + "epoch": 0.12282247678438284, + "grad_norm": 1.8460948106958528, + "learning_rate": 1.955226982204591e-06, + "loss": 1.2757, + "step": 906 + }, + { + "epoch": 0.12295804243204772, + "grad_norm": 1.687308337090163, + "learning_rate": 1.955096960621378e-06, + "loss": 1.2803, + "step": 907 + }, + { + "epoch": 0.1230936080797126, + "grad_norm": 1.9701603844200375, + "learning_rate": 1.9549667548552553e-06, + "loss": 1.2639, + "step": 908 + }, + { + "epoch": 0.12322917372737749, + "grad_norm": 1.6165267146528128, + "learning_rate": 1.9548363649313315e-06, + "loss": 1.2717, + "step": 909 + }, + { + "epoch": 0.12336473937504236, + "grad_norm": 2.685771826512924, + "learning_rate": 1.9547057908747522e-06, + "loss": 1.2815, + "step": 910 + }, + { + "epoch": 0.12350030502270724, + "grad_norm": 1.7161008806082334, + "learning_rate": 1.954575032710697e-06, + "loss": 1.2718, + "step": 911 + }, + { + "epoch": 0.12363587067037213, + "grad_norm": 1.7799917308870339, + "learning_rate": 1.954444090464382e-06, + "loss": 1.288, + "step": 912 + }, + { + "epoch": 0.12377143631803701, + "grad_norm": 2.077800181566304, + "learning_rate": 1.9543129641610575e-06, + "loss": 1.2569, + "step": 913 + }, + { + "epoch": 0.1239070019657019, + "grad_norm": 1.828497474070491, + "learning_rate": 1.9541816538260105e-06, + "loss": 1.2878, + "step": 914 + }, + { + "epoch": 0.12404256761336677, + "grad_norm": 2.110604915070212, + "learning_rate": 1.954050159484564e-06, + "loss": 1.2901, + "step": 915 + }, + { + "epoch": 0.12417813326103165, + "grad_norm": 2.6039326678884027, + "learning_rate": 1.953918481162075e-06, + "loss": 1.255, + "step": 916 + }, + { + "epoch": 0.12431369890869653, + "grad_norm": 2.0586413032411435, + "learning_rate": 1.953786618883937e-06, + "loss": 1.2562, + "step": 917 + }, + { + "epoch": 0.12444926455636142, + "grad_norm": 3.3038852896242377, + "learning_rate": 1.953654572675578e-06, + "loss": 1.328, + "step": 918 + }, + { + "epoch": 0.1245848302040263, + "grad_norm": 3.8540791241910526, + "learning_rate": 1.953522342562462e-06, + "loss": 1.2582, + "step": 919 + }, + { + "epoch": 0.12472039585169119, + "grad_norm": 
1.8592129904264205, + "learning_rate": 1.9533899285700893e-06, + "loss": 1.2293, + "step": 920 + }, + { + "epoch": 0.12485596149935606, + "grad_norm": 1.778800349483944, + "learning_rate": 1.9532573307239942e-06, + "loss": 1.2324, + "step": 921 + }, + { + "epoch": 0.12499152714702094, + "grad_norm": 1.5890798493214113, + "learning_rate": 1.9531245490497475e-06, + "loss": 1.285, + "step": 922 + }, + { + "epoch": 0.12512709279468584, + "grad_norm": 1.8445448384851697, + "learning_rate": 1.952991583572955e-06, + "loss": 1.2805, + "step": 923 + }, + { + "epoch": 0.1252626584423507, + "grad_norm": 2.263734807473622, + "learning_rate": 1.9528584343192583e-06, + "loss": 1.282, + "step": 924 + }, + { + "epoch": 0.12539822409001558, + "grad_norm": 1.7285843078715861, + "learning_rate": 1.9527251013143338e-06, + "loss": 1.262, + "step": 925 + }, + { + "epoch": 0.12553378973768048, + "grad_norm": 2.3697259017010266, + "learning_rate": 1.9525915845838942e-06, + "loss": 1.2778, + "step": 926 + }, + { + "epoch": 0.12566935538534535, + "grad_norm": 1.6084205611746412, + "learning_rate": 1.952457884153686e-06, + "loss": 1.2504, + "step": 927 + }, + { + "epoch": 0.12580492103301025, + "grad_norm": 1.744684458761882, + "learning_rate": 1.952324000049494e-06, + "loss": 1.2876, + "step": 928 + }, + { + "epoch": 0.12594048668067512, + "grad_norm": 1.7702267058221501, + "learning_rate": 1.952189932297135e-06, + "loss": 1.2685, + "step": 929 + }, + { + "epoch": 0.12607605232834, + "grad_norm": 2.464546491612376, + "learning_rate": 1.9520556809224643e-06, + "loss": 1.2901, + "step": 930 + }, + { + "epoch": 0.1262116179760049, + "grad_norm": 2.2963837304134063, + "learning_rate": 1.9519212459513702e-06, + "loss": 1.3313, + "step": 931 + }, + { + "epoch": 0.12634718362366976, + "grad_norm": 1.5643685811898735, + "learning_rate": 1.951786627409778e-06, + "loss": 1.246, + "step": 932 + }, + { + "epoch": 0.12648274927133465, + "grad_norm": 2.555889367816986, + "learning_rate": 1.9516518253236474e-06, + "loss": 1.2801, + "step": 933 + }, + { + "epoch": 0.12661831491899952, + "grad_norm": 1.7585611493460396, + "learning_rate": 1.9515168397189743e-06, + "loss": 1.2425, + "step": 934 + }, + { + "epoch": 0.1267538805666644, + "grad_norm": 2.3463190585970595, + "learning_rate": 1.95138167062179e-06, + "loss": 1.26, + "step": 935 + }, + { + "epoch": 0.1268894462143293, + "grad_norm": 1.934864542863584, + "learning_rate": 1.9512463180581595e-06, + "loss": 1.2647, + "step": 936 + }, + { + "epoch": 0.12702501186199416, + "grad_norm": 2.905765605223124, + "learning_rate": 1.9511107820541857e-06, + "loss": 1.2495, + "step": 937 + }, + { + "epoch": 0.12716057750965906, + "grad_norm": 1.8018080536628949, + "learning_rate": 1.9509750626360053e-06, + "loss": 1.2485, + "step": 938 + }, + { + "epoch": 0.12729614315732393, + "grad_norm": 1.9551329707331615, + "learning_rate": 1.95083915982979e-06, + "loss": 1.2632, + "step": 939 + }, + { + "epoch": 0.1274317088049888, + "grad_norm": 2.480543476273984, + "learning_rate": 1.950703073661749e-06, + "loss": 1.3012, + "step": 940 + }, + { + "epoch": 0.1275672744526537, + "grad_norm": 1.9139112482406608, + "learning_rate": 1.950566804158124e-06, + "loss": 1.2776, + "step": 941 + }, + { + "epoch": 0.12770284010031857, + "grad_norm": 2.4431794118367214, + "learning_rate": 1.9504303513451944e-06, + "loss": 1.2506, + "step": 942 + }, + { + "epoch": 0.12783840574798347, + "grad_norm": 2.039821486014558, + "learning_rate": 1.9502937152492737e-06, + "loss": 1.2496, + "step": 943 + }, + { + "epoch": 
0.12797397139564834, + "grad_norm": 2.0045555915657864, + "learning_rate": 1.950156895896711e-06, + "loss": 1.2786, + "step": 944 + }, + { + "epoch": 0.12810953704331324, + "grad_norm": 1.8463000642739062, + "learning_rate": 1.9500198933138914e-06, + "loss": 1.2516, + "step": 945 + }, + { + "epoch": 0.1282451026909781, + "grad_norm": 1.7824171471324581, + "learning_rate": 1.949882707527234e-06, + "loss": 1.2568, + "step": 946 + }, + { + "epoch": 0.12838066833864298, + "grad_norm": 2.8677504525837216, + "learning_rate": 1.949745338563195e-06, + "loss": 1.2564, + "step": 947 + }, + { + "epoch": 0.12851623398630788, + "grad_norm": 6.049607799845932, + "learning_rate": 1.949607786448264e-06, + "loss": 1.2547, + "step": 948 + }, + { + "epoch": 0.12865179963397275, + "grad_norm": 5.896790181818982, + "learning_rate": 1.9494700512089664e-06, + "loss": 1.2973, + "step": 949 + }, + { + "epoch": 0.12878736528163764, + "grad_norm": 2.0391729437695183, + "learning_rate": 1.949332132871865e-06, + "loss": 1.2365, + "step": 950 + }, + { + "epoch": 0.12892293092930251, + "grad_norm": 2.4519658696922897, + "learning_rate": 1.9491940314635553e-06, + "loss": 1.2435, + "step": 951 + }, + { + "epoch": 0.12905849657696739, + "grad_norm": 2.458020073349322, + "learning_rate": 1.9490557470106686e-06, + "loss": 1.2985, + "step": 952 + }, + { + "epoch": 0.12919406222463228, + "grad_norm": 2.1084736950923038, + "learning_rate": 1.9489172795398727e-06, + "loss": 1.2707, + "step": 953 + }, + { + "epoch": 0.12932962787229715, + "grad_norm": 1.7955195996153375, + "learning_rate": 1.9487786290778696e-06, + "loss": 1.2666, + "step": 954 + }, + { + "epoch": 0.12946519351996205, + "grad_norm": 1.8524766840737081, + "learning_rate": 1.9486397956513975e-06, + "loss": 1.2593, + "step": 955 + }, + { + "epoch": 0.12960075916762692, + "grad_norm": 1.9963189969800377, + "learning_rate": 1.9485007792872285e-06, + "loss": 1.3022, + "step": 956 + }, + { + "epoch": 0.1297363248152918, + "grad_norm": 4.065383413404805, + "learning_rate": 1.9483615800121713e-06, + "loss": 1.255, + "step": 957 + }, + { + "epoch": 0.1298718904629567, + "grad_norm": 1.9371286764710955, + "learning_rate": 1.9482221978530695e-06, + "loss": 1.2579, + "step": 958 + }, + { + "epoch": 0.13000745611062156, + "grad_norm": 5.655794180748263, + "learning_rate": 1.9480826328368018e-06, + "loss": 1.2704, + "step": 959 + }, + { + "epoch": 0.13014302175828646, + "grad_norm": 2.6418589245187456, + "learning_rate": 1.9479428849902816e-06, + "loss": 1.25, + "step": 960 + }, + { + "epoch": 0.13027858740595133, + "grad_norm": 2.362411365518207, + "learning_rate": 1.9478029543404587e-06, + "loss": 1.2867, + "step": 961 + }, + { + "epoch": 0.13041415305361623, + "grad_norm": 3.1694409290040095, + "learning_rate": 1.9476628409143177e-06, + "loss": 1.2909, + "step": 962 + }, + { + "epoch": 0.1305497187012811, + "grad_norm": 3.9922658749242768, + "learning_rate": 1.9475225447388787e-06, + "loss": 1.2824, + "step": 963 + }, + { + "epoch": 0.13068528434894597, + "grad_norm": 1.8290169079328702, + "learning_rate": 1.9473820658411954e-06, + "loss": 1.2675, + "step": 964 + }, + { + "epoch": 0.13082084999661087, + "grad_norm": 1.7606532295793598, + "learning_rate": 1.9472414042483594e-06, + "loss": 1.2495, + "step": 965 + }, + { + "epoch": 0.13095641564427574, + "grad_norm": 1.8736821763119056, + "learning_rate": 1.9471005599874955e-06, + "loss": 1.27, + "step": 966 + }, + { + "epoch": 0.13109198129194063, + "grad_norm": 3.531761641186994, + "learning_rate": 1.9469595330857644e-06, + 
"loss": 1.255, + "step": 967 + }, + { + "epoch": 0.1312275469396055, + "grad_norm": 1.590411047843772, + "learning_rate": 1.946818323570362e-06, + "loss": 1.2526, + "step": 968 + }, + { + "epoch": 0.13136311258727038, + "grad_norm": 1.7390131261915274, + "learning_rate": 1.9466769314685204e-06, + "loss": 1.2517, + "step": 969 + }, + { + "epoch": 0.13149867823493527, + "grad_norm": 3.7343285505524704, + "learning_rate": 1.9465353568075047e-06, + "loss": 1.267, + "step": 970 + }, + { + "epoch": 0.13163424388260014, + "grad_norm": 1.8182807113557136, + "learning_rate": 1.946393599614617e-06, + "loss": 1.2849, + "step": 971 + }, + { + "epoch": 0.13176980953026504, + "grad_norm": 2.2654890611835903, + "learning_rate": 1.9462516599171944e-06, + "loss": 1.3018, + "step": 972 + }, + { + "epoch": 0.1319053751779299, + "grad_norm": 2.6529790032292477, + "learning_rate": 1.946109537742608e-06, + "loss": 1.2228, + "step": 973 + }, + { + "epoch": 0.13204094082559478, + "grad_norm": 2.691856107204863, + "learning_rate": 1.945967233118265e-06, + "loss": 1.2382, + "step": 974 + }, + { + "epoch": 0.13217650647325968, + "grad_norm": 3.6614761057464382, + "learning_rate": 1.945824746071609e-06, + "loss": 1.2449, + "step": 975 + }, + { + "epoch": 0.13231207212092455, + "grad_norm": 1.9182952213577693, + "learning_rate": 1.945682076630116e-06, + "loss": 1.2728, + "step": 976 + }, + { + "epoch": 0.13244763776858945, + "grad_norm": 1.934055840877942, + "learning_rate": 1.9455392248212995e-06, + "loss": 1.245, + "step": 977 + }, + { + "epoch": 0.13258320341625432, + "grad_norm": 3.814035962380871, + "learning_rate": 1.945396190672707e-06, + "loss": 1.2577, + "step": 978 + }, + { + "epoch": 0.1327187690639192, + "grad_norm": 6.37260124365308, + "learning_rate": 1.9452529742119214e-06, + "loss": 1.275, + "step": 979 + }, + { + "epoch": 0.1328543347115841, + "grad_norm": 3.1514958808105877, + "learning_rate": 1.9451095754665613e-06, + "loss": 1.2499, + "step": 980 + }, + { + "epoch": 0.13298990035924896, + "grad_norm": 1.9217418306378384, + "learning_rate": 1.94496599446428e-06, + "loss": 1.223, + "step": 981 + }, + { + "epoch": 0.13312546600691386, + "grad_norm": 2.271776375476813, + "learning_rate": 1.9448222312327654e-06, + "loss": 1.2545, + "step": 982 + }, + { + "epoch": 0.13326103165457873, + "grad_norm": 1.9968127009004972, + "learning_rate": 1.944678285799742e-06, + "loss": 1.2483, + "step": 983 + }, + { + "epoch": 0.13339659730224362, + "grad_norm": 5.347930909622141, + "learning_rate": 1.944534158192968e-06, + "loss": 1.237, + "step": 984 + }, + { + "epoch": 0.1335321629499085, + "grad_norm": 1.5885306894083076, + "learning_rate": 1.944389848440237e-06, + "loss": 1.2648, + "step": 985 + }, + { + "epoch": 0.13366772859757337, + "grad_norm": 1.9062517849812082, + "learning_rate": 1.9442453565693782e-06, + "loss": 1.2409, + "step": 986 + }, + { + "epoch": 0.13380329424523826, + "grad_norm": 2.5802578096928785, + "learning_rate": 1.944100682608256e-06, + "loss": 1.2379, + "step": 987 + }, + { + "epoch": 0.13393885989290313, + "grad_norm": 1.752581431056972, + "learning_rate": 1.943955826584769e-06, + "loss": 1.2248, + "step": 988 + }, + { + "epoch": 0.13407442554056803, + "grad_norm": 1.767623430918167, + "learning_rate": 1.9438107885268525e-06, + "loss": 1.2614, + "step": 989 + }, + { + "epoch": 0.1342099911882329, + "grad_norm": 1.6197814722948483, + "learning_rate": 1.9436655684624755e-06, + "loss": 1.306, + "step": 990 + }, + { + "epoch": 0.13434555683589777, + "grad_norm": 2.0744897125338264, + 
"learning_rate": 1.9435201664196424e-06, + "loss": 1.2428, + "step": 991 + }, + { + "epoch": 0.13448112248356267, + "grad_norm": 1.7620646784960532, + "learning_rate": 1.9433745824263924e-06, + "loss": 1.2189, + "step": 992 + }, + { + "epoch": 0.13461668813122754, + "grad_norm": 1.966836434496043, + "learning_rate": 1.943228816510801e-06, + "loss": 1.2867, + "step": 993 + }, + { + "epoch": 0.13475225377889244, + "grad_norm": 4.420807178223552, + "learning_rate": 1.943082868700978e-06, + "loss": 1.2591, + "step": 994 + }, + { + "epoch": 0.1348878194265573, + "grad_norm": 1.9143814227039089, + "learning_rate": 1.9429367390250676e-06, + "loss": 1.2476, + "step": 995 + }, + { + "epoch": 0.13502338507422218, + "grad_norm": 1.7270092761884366, + "learning_rate": 1.942790427511251e-06, + "loss": 1.2249, + "step": 996 + }, + { + "epoch": 0.13515895072188708, + "grad_norm": 2.2099561339306804, + "learning_rate": 1.9426439341877412e-06, + "loss": 1.2633, + "step": 997 + }, + { + "epoch": 0.13529451636955195, + "grad_norm": 1.7423735129141322, + "learning_rate": 1.94249725908279e-06, + "loss": 1.2419, + "step": 998 + }, + { + "epoch": 0.13543008201721685, + "grad_norm": 3.442225229593552, + "learning_rate": 1.942350402224682e-06, + "loss": 1.2861, + "step": 999 + }, + { + "epoch": 0.13556564766488172, + "grad_norm": 1.5008231049802845, + "learning_rate": 1.942203363641738e-06, + "loss": 1.2366, + "step": 1000 + }, + { + "epoch": 0.1357012133125466, + "grad_norm": 2.375621572396239, + "learning_rate": 1.942056143362312e-06, + "loss": 1.249, + "step": 1001 + }, + { + "epoch": 0.13583677896021149, + "grad_norm": 1.941385091937466, + "learning_rate": 1.941908741414795e-06, + "loss": 1.2553, + "step": 1002 + }, + { + "epoch": 0.13597234460787636, + "grad_norm": 2.152870950701802, + "learning_rate": 1.941761157827612e-06, + "loss": 1.2343, + "step": 1003 + }, + { + "epoch": 0.13610791025554125, + "grad_norm": 2.0491459538953634, + "learning_rate": 1.9416133926292236e-06, + "loss": 1.2391, + "step": 1004 + }, + { + "epoch": 0.13624347590320612, + "grad_norm": 1.9264318341690134, + "learning_rate": 1.941465445848125e-06, + "loss": 1.2753, + "step": 1005 + }, + { + "epoch": 0.13637904155087102, + "grad_norm": 2.9796846685714136, + "learning_rate": 1.941317317512847e-06, + "loss": 1.24, + "step": 1006 + }, + { + "epoch": 0.1365146071985359, + "grad_norm": 1.727158795467942, + "learning_rate": 1.9411690076519545e-06, + "loss": 1.2103, + "step": 1007 + }, + { + "epoch": 0.13665017284620076, + "grad_norm": 2.9517006907843566, + "learning_rate": 1.941020516294048e-06, + "loss": 1.2702, + "step": 1008 + }, + { + "epoch": 0.13678573849386566, + "grad_norm": 1.917167591024721, + "learning_rate": 1.9408718434677625e-06, + "loss": 1.281, + "step": 1009 + }, + { + "epoch": 0.13692130414153053, + "grad_norm": 1.8034632581271544, + "learning_rate": 1.9407229892017694e-06, + "loss": 1.2673, + "step": 1010 + }, + { + "epoch": 0.13705686978919543, + "grad_norm": 1.7886874573825269, + "learning_rate": 1.940573953524773e-06, + "loss": 1.2641, + "step": 1011 + }, + { + "epoch": 0.1371924354368603, + "grad_norm": 1.6844815828047737, + "learning_rate": 1.9404247364655145e-06, + "loss": 1.2395, + "step": 1012 + }, + { + "epoch": 0.13732800108452517, + "grad_norm": 5.839167889524332, + "learning_rate": 1.9402753380527684e-06, + "loss": 1.2402, + "step": 1013 + }, + { + "epoch": 0.13746356673219007, + "grad_norm": 2.1140650483834964, + "learning_rate": 1.9401257583153456e-06, + "loss": 1.2305, + "step": 1014 + }, + { + "epoch": 
0.13759913237985494, + "grad_norm": 4.604756935920825, + "learning_rate": 1.9399759972820913e-06, + "loss": 1.2616, + "step": 1015 + }, + { + "epoch": 0.13773469802751984, + "grad_norm": 2.436046568523522, + "learning_rate": 1.9398260549818856e-06, + "loss": 1.2555, + "step": 1016 + }, + { + "epoch": 0.1378702636751847, + "grad_norm": 1.6761835046973006, + "learning_rate": 1.9396759314436435e-06, + "loss": 1.248, + "step": 1017 + }, + { + "epoch": 0.13800582932284958, + "grad_norm": 1.9285417833531677, + "learning_rate": 1.939525626696316e-06, + "loss": 1.2624, + "step": 1018 + }, + { + "epoch": 0.13814139497051448, + "grad_norm": 1.6162883189931259, + "learning_rate": 1.9393751407688866e-06, + "loss": 1.2285, + "step": 1019 + }, + { + "epoch": 0.13827696061817935, + "grad_norm": 2.5536137342324388, + "learning_rate": 1.9392244736903773e-06, + "loss": 1.3028, + "step": 1020 + }, + { + "epoch": 0.13841252626584424, + "grad_norm": 4.458985341148804, + "learning_rate": 1.9390736254898414e-06, + "loss": 1.2652, + "step": 1021 + }, + { + "epoch": 0.1385480919135091, + "grad_norm": 2.188308146030955, + "learning_rate": 1.9389225961963698e-06, + "loss": 1.311, + "step": 1022 + }, + { + "epoch": 0.138683657561174, + "grad_norm": 1.7893399073483414, + "learning_rate": 1.9387713858390863e-06, + "loss": 1.2695, + "step": 1023 + }, + { + "epoch": 0.13881922320883888, + "grad_norm": 2.024626988074003, + "learning_rate": 1.938619994447152e-06, + "loss": 1.2262, + "step": 1024 + }, + { + "epoch": 0.13895478885650375, + "grad_norm": 2.2616041272489307, + "learning_rate": 1.9384684220497604e-06, + "loss": 1.2671, + "step": 1025 + }, + { + "epoch": 0.13909035450416865, + "grad_norm": 1.8614932599133023, + "learning_rate": 1.9383166686761416e-06, + "loss": 1.2976, + "step": 1026 + }, + { + "epoch": 0.13922592015183352, + "grad_norm": 2.0680452877173896, + "learning_rate": 1.9381647343555596e-06, + "loss": 1.2903, + "step": 1027 + }, + { + "epoch": 0.13936148579949842, + "grad_norm": 1.8879233258030381, + "learning_rate": 1.938012619117314e-06, + "loss": 1.2486, + "step": 1028 + }, + { + "epoch": 0.1394970514471633, + "grad_norm": 1.9642313198126646, + "learning_rate": 1.9378603229907393e-06, + "loss": 1.2284, + "step": 1029 + }, + { + "epoch": 0.13963261709482816, + "grad_norm": 2.2832025306690777, + "learning_rate": 1.937707846005204e-06, + "loss": 1.2313, + "step": 1030 + }, + { + "epoch": 0.13976818274249306, + "grad_norm": 2.4223342530612757, + "learning_rate": 1.9375551881901127e-06, + "loss": 1.2405, + "step": 1031 + }, + { + "epoch": 0.13990374839015793, + "grad_norm": 1.5593956664216542, + "learning_rate": 1.937402349574904e-06, + "loss": 1.2232, + "step": 1032 + }, + { + "epoch": 0.14003931403782283, + "grad_norm": 2.1479351028482236, + "learning_rate": 1.9372493301890517e-06, + "loss": 1.2471, + "step": 1033 + }, + { + "epoch": 0.1401748796854877, + "grad_norm": 2.2187447786042447, + "learning_rate": 1.9370961300620636e-06, + "loss": 1.2796, + "step": 1034 + }, + { + "epoch": 0.14031044533315257, + "grad_norm": 2.147624520107726, + "learning_rate": 1.9369427492234846e-06, + "loss": 1.2088, + "step": 1035 + }, + { + "epoch": 0.14044601098081747, + "grad_norm": 1.6665371512462652, + "learning_rate": 1.9367891877028917e-06, + "loss": 1.2674, + "step": 1036 + }, + { + "epoch": 0.14058157662848234, + "grad_norm": 4.112677723103257, + "learning_rate": 1.9366354455298987e-06, + "loss": 1.2884, + "step": 1037 + }, + { + "epoch": 0.14071714227614723, + "grad_norm": 2.2897652907239734, + "learning_rate": 
1.936481522734153e-06, + "loss": 1.2707, + "step": 1038 + }, + { + "epoch": 0.1408527079238121, + "grad_norm": 1.5098528288815618, + "learning_rate": 1.9363274193453383e-06, + "loss": 1.249, + "step": 1039 + }, + { + "epoch": 0.14098827357147697, + "grad_norm": 2.041956600881612, + "learning_rate": 1.9361731353931714e-06, + "loss": 1.2468, + "step": 1040 + }, + { + "epoch": 0.14112383921914187, + "grad_norm": 1.7081579739426331, + "learning_rate": 1.936018670907405e-06, + "loss": 1.2568, + "step": 1041 + }, + { + "epoch": 0.14125940486680674, + "grad_norm": 2.126205387465136, + "learning_rate": 1.935864025917827e-06, + "loss": 1.2741, + "step": 1042 + }, + { + "epoch": 0.14139497051447164, + "grad_norm": 1.6322177362135466, + "learning_rate": 1.935709200454258e-06, + "loss": 1.2397, + "step": 1043 + }, + { + "epoch": 0.1415305361621365, + "grad_norm": 2.3385203305721416, + "learning_rate": 1.9355541945465563e-06, + "loss": 1.3131, + "step": 1044 + }, + { + "epoch": 0.1416661018098014, + "grad_norm": 1.6496413284915905, + "learning_rate": 1.9353990082246127e-06, + "loss": 1.3298, + "step": 1045 + }, + { + "epoch": 0.14180166745746628, + "grad_norm": 1.6814351806496979, + "learning_rate": 1.935243641518354e-06, + "loss": 1.2946, + "step": 1046 + }, + { + "epoch": 0.14193723310513115, + "grad_norm": 2.005187158627368, + "learning_rate": 1.935088094457742e-06, + "loss": 1.277, + "step": 1047 + }, + { + "epoch": 0.14207279875279605, + "grad_norm": 2.726888899853686, + "learning_rate": 1.9349323670727717e-06, + "loss": 1.2238, + "step": 1048 + }, + { + "epoch": 0.14220836440046092, + "grad_norm": 2.6000636104146255, + "learning_rate": 1.9347764593934743e-06, + "loss": 1.3011, + "step": 1049 + }, + { + "epoch": 0.14234393004812582, + "grad_norm": 1.8348896866717568, + "learning_rate": 1.934620371449915e-06, + "loss": 1.2917, + "step": 1050 + }, + { + "epoch": 0.1424794956957907, + "grad_norm": 1.720356827674451, + "learning_rate": 1.934464103272195e-06, + "loss": 1.2733, + "step": 1051 + }, + { + "epoch": 0.14261506134345556, + "grad_norm": 2.7549339640640382, + "learning_rate": 1.9343076548904483e-06, + "loss": 1.2704, + "step": 1052 + }, + { + "epoch": 0.14275062699112046, + "grad_norm": 1.7280030770329609, + "learning_rate": 1.9341510263348457e-06, + "loss": 1.2402, + "step": 1053 + }, + { + "epoch": 0.14288619263878533, + "grad_norm": 1.9645611207960683, + "learning_rate": 1.9339942176355916e-06, + "loss": 1.3046, + "step": 1054 + }, + { + "epoch": 0.14302175828645022, + "grad_norm": 2.799006857508258, + "learning_rate": 1.933837228822925e-06, + "loss": 1.277, + "step": 1055 + }, + { + "epoch": 0.1431573239341151, + "grad_norm": 1.9651341032167007, + "learning_rate": 1.9336800599271203e-06, + "loss": 1.2741, + "step": 1056 + }, + { + "epoch": 0.14329288958177996, + "grad_norm": 1.90758817294976, + "learning_rate": 1.933522710978486e-06, + "loss": 1.265, + "step": 1057 + }, + { + "epoch": 0.14342845522944486, + "grad_norm": 2.133307746784619, + "learning_rate": 1.9333651820073655e-06, + "loss": 1.2759, + "step": 1058 + }, + { + "epoch": 0.14356402087710973, + "grad_norm": 2.293216267436857, + "learning_rate": 1.933207473044137e-06, + "loss": 1.2317, + "step": 1059 + }, + { + "epoch": 0.14369958652477463, + "grad_norm": 2.1695658841951126, + "learning_rate": 1.9330495841192138e-06, + "loss": 1.2814, + "step": 1060 + }, + { + "epoch": 0.1438351521724395, + "grad_norm": 2.3501704078120023, + "learning_rate": 1.9328915152630435e-06, + "loss": 1.2428, + "step": 1061 + }, + { + "epoch": 
0.1439707178201044, + "grad_norm": 1.9367523686090293, + "learning_rate": 1.932733266506108e-06, + "loss": 1.2487, + "step": 1062 + }, + { + "epoch": 0.14410628346776927, + "grad_norm": 2.600646747276015, + "learning_rate": 1.9325748378789246e-06, + "loss": 1.2634, + "step": 1063 + }, + { + "epoch": 0.14424184911543414, + "grad_norm": 1.8907308213270988, + "learning_rate": 1.9324162294120453e-06, + "loss": 1.2987, + "step": 1064 + }, + { + "epoch": 0.14437741476309904, + "grad_norm": 1.690470547641784, + "learning_rate": 1.9322574411360557e-06, + "loss": 1.2586, + "step": 1065 + }, + { + "epoch": 0.1445129804107639, + "grad_norm": 1.9177200370258056, + "learning_rate": 1.932098473081578e-06, + "loss": 1.2781, + "step": 1066 + }, + { + "epoch": 0.1446485460584288, + "grad_norm": 3.013080826203071, + "learning_rate": 1.931939325279267e-06, + "loss": 1.2765, + "step": 1067 + }, + { + "epoch": 0.14478411170609368, + "grad_norm": 1.8211636934298407, + "learning_rate": 1.9317799977598136e-06, + "loss": 1.2361, + "step": 1068 + }, + { + "epoch": 0.14491967735375855, + "grad_norm": 3.7874782561826885, + "learning_rate": 1.9316204905539425e-06, + "loss": 1.2833, + "step": 1069 + }, + { + "epoch": 0.14505524300142345, + "grad_norm": 1.8125535851231689, + "learning_rate": 1.9314608036924133e-06, + "loss": 1.2883, + "step": 1070 + }, + { + "epoch": 0.14519080864908832, + "grad_norm": 1.9527672347996141, + "learning_rate": 1.931300937206021e-06, + "loss": 1.2826, + "step": 1071 + }, + { + "epoch": 0.1453263742967532, + "grad_norm": 1.7086962807798551, + "learning_rate": 1.931140891125594e-06, + "loss": 1.2757, + "step": 1072 + }, + { + "epoch": 0.14546193994441808, + "grad_norm": 3.3607405931970056, + "learning_rate": 1.9309806654819963e-06, + "loss": 1.3111, + "step": 1073 + }, + { + "epoch": 0.14559750559208295, + "grad_norm": 2.8453793579309026, + "learning_rate": 1.9308202603061258e-06, + "loss": 1.2487, + "step": 1074 + }, + { + "epoch": 0.14573307123974785, + "grad_norm": 1.947828234878255, + "learning_rate": 1.9306596756289155e-06, + "loss": 1.2818, + "step": 1075 + }, + { + "epoch": 0.14586863688741272, + "grad_norm": 2.4609096189862, + "learning_rate": 1.930498911481333e-06, + "loss": 1.2364, + "step": 1076 + }, + { + "epoch": 0.14600420253507762, + "grad_norm": 2.254273645238766, + "learning_rate": 1.9303379678943805e-06, + "loss": 1.2757, + "step": 1077 + }, + { + "epoch": 0.1461397681827425, + "grad_norm": 1.9542754412042782, + "learning_rate": 1.9301768448990946e-06, + "loss": 1.2398, + "step": 1078 + }, + { + "epoch": 0.14627533383040736, + "grad_norm": 2.159071972141311, + "learning_rate": 1.930015542526546e-06, + "loss": 1.2525, + "step": 1079 + }, + { + "epoch": 0.14641089947807226, + "grad_norm": 1.7120121987993278, + "learning_rate": 1.9298540608078417e-06, + "loss": 1.2689, + "step": 1080 + }, + { + "epoch": 0.14654646512573713, + "grad_norm": 2.904423541157732, + "learning_rate": 1.9296923997741216e-06, + "loss": 1.2765, + "step": 1081 + }, + { + "epoch": 0.14668203077340203, + "grad_norm": 3.02778081044767, + "learning_rate": 1.9295305594565604e-06, + "loss": 1.2531, + "step": 1082 + }, + { + "epoch": 0.1468175964210669, + "grad_norm": 1.892513176131049, + "learning_rate": 1.9293685398863683e-06, + "loss": 1.2442, + "step": 1083 + }, + { + "epoch": 0.1469531620687318, + "grad_norm": 1.7186818911047013, + "learning_rate": 1.929206341094789e-06, + "loss": 1.2687, + "step": 1084 + }, + { + "epoch": 0.14708872771639667, + "grad_norm": 2.2436322525513357, + "learning_rate": 
1.9290439631131018e-06, + "loss": 1.2769, + "step": 1085 + }, + { + "epoch": 0.14722429336406154, + "grad_norm": 1.9280067199288429, + "learning_rate": 1.9288814059726196e-06, + "loss": 1.2605, + "step": 1086 + }, + { + "epoch": 0.14735985901172644, + "grad_norm": 1.7863580143814248, + "learning_rate": 1.92871866970469e-06, + "loss": 1.2307, + "step": 1087 + }, + { + "epoch": 0.1474954246593913, + "grad_norm": 2.040537796722911, + "learning_rate": 1.9285557543406964e-06, + "loss": 1.3059, + "step": 1088 + }, + { + "epoch": 0.1476309903070562, + "grad_norm": 1.9129962402570386, + "learning_rate": 1.928392659912055e-06, + "loss": 1.281, + "step": 1089 + }, + { + "epoch": 0.14776655595472107, + "grad_norm": 1.6777876240127207, + "learning_rate": 1.9282293864502176e-06, + "loss": 1.2466, + "step": 1090 + }, + { + "epoch": 0.14790212160238594, + "grad_norm": 1.8361603229002628, + "learning_rate": 1.92806593398667e-06, + "loss": 1.2301, + "step": 1091 + }, + { + "epoch": 0.14803768725005084, + "grad_norm": 2.8500887961128596, + "learning_rate": 1.9279023025529324e-06, + "loss": 1.2596, + "step": 1092 + }, + { + "epoch": 0.1481732528977157, + "grad_norm": 1.642774944652343, + "learning_rate": 1.9277384921805604e-06, + "loss": 1.2576, + "step": 1093 + }, + { + "epoch": 0.1483088185453806, + "grad_norm": 1.7619094148799013, + "learning_rate": 1.927574502901143e-06, + "loss": 1.2405, + "step": 1094 + }, + { + "epoch": 0.14844438419304548, + "grad_norm": 1.8403042604491278, + "learning_rate": 1.927410334746305e-06, + "loss": 1.2803, + "step": 1095 + }, + { + "epoch": 0.14857994984071035, + "grad_norm": 2.0268106707197453, + "learning_rate": 1.927245987747704e-06, + "loss": 1.2673, + "step": 1096 + }, + { + "epoch": 0.14871551548837525, + "grad_norm": 2.7419313416259428, + "learning_rate": 1.9270814619370337e-06, + "loss": 1.274, + "step": 1097 + }, + { + "epoch": 0.14885108113604012, + "grad_norm": 2.2732548134774198, + "learning_rate": 1.9269167573460217e-06, + "loss": 1.2754, + "step": 1098 + }, + { + "epoch": 0.14898664678370502, + "grad_norm": 1.6961019568599836, + "learning_rate": 1.9267518740064294e-06, + "loss": 1.2664, + "step": 1099 + }, + { + "epoch": 0.1491222124313699, + "grad_norm": 2.5064779893977125, + "learning_rate": 1.9265868119500538e-06, + "loss": 1.2273, + "step": 1100 + }, + { + "epoch": 0.1492577780790348, + "grad_norm": 1.9442064191365542, + "learning_rate": 1.926421571208725e-06, + "loss": 1.2208, + "step": 1101 + }, + { + "epoch": 0.14939334372669966, + "grad_norm": 1.9997372625971903, + "learning_rate": 1.9262561518143095e-06, + "loss": 1.2929, + "step": 1102 + }, + { + "epoch": 0.14952890937436453, + "grad_norm": 1.8612221484376499, + "learning_rate": 1.9260905537987063e-06, + "loss": 1.2649, + "step": 1103 + }, + { + "epoch": 0.14966447502202943, + "grad_norm": 3.0739432717807835, + "learning_rate": 1.92592477719385e-06, + "loss": 1.2191, + "step": 1104 + }, + { + "epoch": 0.1498000406696943, + "grad_norm": 1.7124066650451648, + "learning_rate": 1.925758822031709e-06, + "loss": 1.2528, + "step": 1105 + }, + { + "epoch": 0.1499356063173592, + "grad_norm": 3.0930633572018835, + "learning_rate": 1.9255926883442867e-06, + "loss": 1.2779, + "step": 1106 + }, + { + "epoch": 0.15007117196502406, + "grad_norm": 5.872224944438453, + "learning_rate": 1.9254263761636207e-06, + "loss": 1.2557, + "step": 1107 + }, + { + "epoch": 0.15020673761268893, + "grad_norm": 2.454547607375999, + "learning_rate": 1.925259885521783e-06, + "loss": 1.2528, + "step": 1108 + }, + { + "epoch": 
0.15034230326035383, + "grad_norm": 2.3683529510686547, + "learning_rate": 1.92509321645088e-06, + "loss": 1.2884, + "step": 1109 + }, + { + "epoch": 0.1504778689080187, + "grad_norm": 1.777761465797449, + "learning_rate": 1.924926368983052e-06, + "loss": 1.2474, + "step": 1110 + }, + { + "epoch": 0.1506134345556836, + "grad_norm": 1.4660465842811472, + "learning_rate": 1.9247593431504756e-06, + "loss": 1.2278, + "step": 1111 + }, + { + "epoch": 0.15074900020334847, + "grad_norm": 2.0035999766862997, + "learning_rate": 1.9245921389853588e-06, + "loss": 1.2415, + "step": 1112 + }, + { + "epoch": 0.15088456585101334, + "grad_norm": 2.8616612795447014, + "learning_rate": 1.9244247565199463e-06, + "loss": 1.2383, + "step": 1113 + }, + { + "epoch": 0.15102013149867824, + "grad_norm": 2.431894104062967, + "learning_rate": 1.9242571957865165e-06, + "loss": 1.2947, + "step": 1114 + }, + { + "epoch": 0.1511556971463431, + "grad_norm": 1.731280584000765, + "learning_rate": 1.924089456817382e-06, + "loss": 1.2434, + "step": 1115 + }, + { + "epoch": 0.151291262794008, + "grad_norm": 1.6986188014931154, + "learning_rate": 1.92392153964489e-06, + "loss": 1.2594, + "step": 1116 + }, + { + "epoch": 0.15142682844167288, + "grad_norm": 1.907903224925178, + "learning_rate": 1.923753444301423e-06, + "loss": 1.2704, + "step": 1117 + }, + { + "epoch": 0.15156239408933775, + "grad_norm": 1.7943400116098407, + "learning_rate": 1.923585170819395e-06, + "loss": 1.2627, + "step": 1118 + }, + { + "epoch": 0.15169795973700265, + "grad_norm": 6.890993480438287, + "learning_rate": 1.923416719231257e-06, + "loss": 1.2605, + "step": 1119 + }, + { + "epoch": 0.15183352538466752, + "grad_norm": 2.2534888724982802, + "learning_rate": 1.9232480895694945e-06, + "loss": 1.226, + "step": 1120 + }, + { + "epoch": 0.15196909103233242, + "grad_norm": 1.8310670068847934, + "learning_rate": 1.9230792818666252e-06, + "loss": 1.2428, + "step": 1121 + }, + { + "epoch": 0.15210465667999729, + "grad_norm": 1.9399996223919724, + "learning_rate": 1.9229102961552026e-06, + "loss": 1.2849, + "step": 1122 + }, + { + "epoch": 0.15224022232766218, + "grad_norm": 1.7764676308281302, + "learning_rate": 1.9227411324678146e-06, + "loss": 1.2429, + "step": 1123 + }, + { + "epoch": 0.15237578797532705, + "grad_norm": 3.422407475554076, + "learning_rate": 1.922571790837083e-06, + "loss": 1.2764, + "step": 1124 + }, + { + "epoch": 0.15251135362299192, + "grad_norm": 3.084153351463619, + "learning_rate": 1.9224022712956635e-06, + "loss": 1.2261, + "step": 1125 + }, + { + "epoch": 0.15264691927065682, + "grad_norm": 1.8010198978039424, + "learning_rate": 1.922232573876247e-06, + "loss": 1.2423, + "step": 1126 + }, + { + "epoch": 0.1527824849183217, + "grad_norm": 2.698077798650513, + "learning_rate": 1.922062698611559e-06, + "loss": 1.2619, + "step": 1127 + }, + { + "epoch": 0.1529180505659866, + "grad_norm": 2.581597090165482, + "learning_rate": 1.921892645534357e-06, + "loss": 1.3031, + "step": 1128 + }, + { + "epoch": 0.15305361621365146, + "grad_norm": 2.1489050459892423, + "learning_rate": 1.9217224146774357e-06, + "loss": 1.2484, + "step": 1129 + }, + { + "epoch": 0.15318918186131633, + "grad_norm": 1.9048084671878447, + "learning_rate": 1.921552006073622e-06, + "loss": 1.2521, + "step": 1130 + }, + { + "epoch": 0.15332474750898123, + "grad_norm": 1.7122685463864484, + "learning_rate": 1.9213814197557787e-06, + "loss": 1.2651, + "step": 1131 + }, + { + "epoch": 0.1534603131566461, + "grad_norm": 2.5374596121843176, + "learning_rate": 
1.9212106557568016e-06, + "loss": 1.2343, + "step": 1132 + }, + { + "epoch": 0.153595878804311, + "grad_norm": 2.1764866611949634, + "learning_rate": 1.9210397141096206e-06, + "loss": 1.2358, + "step": 1133 + }, + { + "epoch": 0.15373144445197587, + "grad_norm": 2.3063405489951396, + "learning_rate": 1.9208685948472014e-06, + "loss": 1.2634, + "step": 1134 + }, + { + "epoch": 0.15386701009964074, + "grad_norm": 2.2111999098519806, + "learning_rate": 1.9206972980025426e-06, + "loss": 1.2259, + "step": 1135 + }, + { + "epoch": 0.15400257574730564, + "grad_norm": 1.8554795227037624, + "learning_rate": 1.9205258236086773e-06, + "loss": 1.2669, + "step": 1136 + }, + { + "epoch": 0.1541381413949705, + "grad_norm": 1.6895463326777165, + "learning_rate": 1.920354171698673e-06, + "loss": 1.2427, + "step": 1137 + }, + { + "epoch": 0.1542737070426354, + "grad_norm": 2.6937289303755043, + "learning_rate": 1.9201823423056315e-06, + "loss": 1.2638, + "step": 1138 + }, + { + "epoch": 0.15440927269030028, + "grad_norm": 1.8714901884444735, + "learning_rate": 1.920010335462689e-06, + "loss": 1.2665, + "step": 1139 + }, + { + "epoch": 0.15454483833796515, + "grad_norm": 1.6583832721454732, + "learning_rate": 1.9198381512030154e-06, + "loss": 1.2829, + "step": 1140 + }, + { + "epoch": 0.15468040398563004, + "grad_norm": 1.927186484512076, + "learning_rate": 1.919665789559815e-06, + "loss": 1.2138, + "step": 1141 + }, + { + "epoch": 0.15481596963329491, + "grad_norm": 3.453721525322463, + "learning_rate": 1.9194932505663265e-06, + "loss": 1.2392, + "step": 1142 + }, + { + "epoch": 0.1549515352809598, + "grad_norm": 2.02194933010778, + "learning_rate": 1.9193205342558227e-06, + "loss": 1.2537, + "step": 1143 + }, + { + "epoch": 0.15508710092862468, + "grad_norm": 2.0764832616944457, + "learning_rate": 1.9191476406616107e-06, + "loss": 1.2681, + "step": 1144 + }, + { + "epoch": 0.15522266657628958, + "grad_norm": 1.8286730893505452, + "learning_rate": 1.918974569817031e-06, + "loss": 1.2262, + "step": 1145 + }, + { + "epoch": 0.15535823222395445, + "grad_norm": 1.984804654947209, + "learning_rate": 1.9188013217554596e-06, + "loss": 1.2292, + "step": 1146 + }, + { + "epoch": 0.15549379787161932, + "grad_norm": 1.9718001199882969, + "learning_rate": 1.918627896510306e-06, + "loss": 1.2728, + "step": 1147 + }, + { + "epoch": 0.15562936351928422, + "grad_norm": 1.9920273336775247, + "learning_rate": 1.9184542941150143e-06, + "loss": 1.2667, + "step": 1148 + }, + { + "epoch": 0.1557649291669491, + "grad_norm": 2.054915919192572, + "learning_rate": 1.9182805146030614e-06, + "loss": 1.2418, + "step": 1149 + }, + { + "epoch": 0.155900494814614, + "grad_norm": 2.006914917687457, + "learning_rate": 1.9181065580079593e-06, + "loss": 1.2622, + "step": 1150 + }, + { + "epoch": 0.15603606046227886, + "grad_norm": 1.909139729143758, + "learning_rate": 1.917932424363255e-06, + "loss": 1.2457, + "step": 1151 + }, + { + "epoch": 0.15617162610994373, + "grad_norm": 2.577223145851878, + "learning_rate": 1.9177581137025284e-06, + "loss": 1.2557, + "step": 1152 + }, + { + "epoch": 0.15630719175760863, + "grad_norm": 9.345050193918164, + "learning_rate": 1.9175836260593937e-06, + "loss": 1.2263, + "step": 1153 + }, + { + "epoch": 0.1564427574052735, + "grad_norm": 1.6621028260581119, + "learning_rate": 1.9174089614674998e-06, + "loss": 1.2617, + "step": 1154 + }, + { + "epoch": 0.1565783230529384, + "grad_norm": 2.382859861439827, + "learning_rate": 1.9172341199605293e-06, + "loss": 1.2338, + "step": 1155 + }, + { + "epoch": 
0.15671388870060327, + "grad_norm": 2.1875524286205446, + "learning_rate": 1.9170591015721987e-06, + "loss": 1.2299, + "step": 1156 + }, + { + "epoch": 0.15684945434826814, + "grad_norm": 1.955500368390389, + "learning_rate": 1.9168839063362595e-06, + "loss": 1.2401, + "step": 1157 + }, + { + "epoch": 0.15698501999593303, + "grad_norm": 2.058148840632306, + "learning_rate": 1.9167085342864962e-06, + "loss": 1.2032, + "step": 1158 + }, + { + "epoch": 0.1571205856435979, + "grad_norm": 1.648607568026541, + "learning_rate": 1.9165329854567285e-06, + "loss": 1.2306, + "step": 1159 + }, + { + "epoch": 0.1572561512912628, + "grad_norm": 1.5972844701967848, + "learning_rate": 1.916357259880809e-06, + "loss": 1.2414, + "step": 1160 + }, + { + "epoch": 0.15739171693892767, + "grad_norm": 2.053951065894212, + "learning_rate": 1.916181357592625e-06, + "loss": 1.2802, + "step": 1161 + }, + { + "epoch": 0.15752728258659257, + "grad_norm": 1.634264991741333, + "learning_rate": 1.916005278626098e-06, + "loss": 1.2668, + "step": 1162 + }, + { + "epoch": 0.15766284823425744, + "grad_norm": 2.381700181649914, + "learning_rate": 1.915829023015184e-06, + "loss": 1.2835, + "step": 1163 + }, + { + "epoch": 0.1577984138819223, + "grad_norm": 3.0769289071550237, + "learning_rate": 1.915652590793872e-06, + "loss": 1.2502, + "step": 1164 + }, + { + "epoch": 0.1579339795295872, + "grad_norm": 1.9529072128745848, + "learning_rate": 1.9154759819961854e-06, + "loss": 1.2578, + "step": 1165 + }, + { + "epoch": 0.15806954517725208, + "grad_norm": 3.5212653178031292, + "learning_rate": 1.915299196656182e-06, + "loss": 1.2573, + "step": 1166 + }, + { + "epoch": 0.15820511082491698, + "grad_norm": 2.289248006791021, + "learning_rate": 1.9151222348079535e-06, + "loss": 1.2792, + "step": 1167 + }, + { + "epoch": 0.15834067647258185, + "grad_norm": 1.9759160607613335, + "learning_rate": 1.9149450964856254e-06, + "loss": 1.3018, + "step": 1168 + }, + { + "epoch": 0.15847624212024672, + "grad_norm": 2.021787377005143, + "learning_rate": 1.914767781723358e-06, + "loss": 1.2394, + "step": 1169 + }, + { + "epoch": 0.15861180776791162, + "grad_norm": 2.0785578887955887, + "learning_rate": 1.914590290555344e-06, + "loss": 1.2701, + "step": 1170 + }, + { + "epoch": 0.1587473734155765, + "grad_norm": 2.0789260742757607, + "learning_rate": 1.9144126230158124e-06, + "loss": 1.2439, + "step": 1171 + }, + { + "epoch": 0.15888293906324139, + "grad_norm": 2.2609307959354887, + "learning_rate": 1.9142347791390242e-06, + "loss": 1.3141, + "step": 1172 + }, + { + "epoch": 0.15901850471090626, + "grad_norm": 1.9895013963962778, + "learning_rate": 1.9140567589592755e-06, + "loss": 1.1962, + "step": 1173 + }, + { + "epoch": 0.15915407035857113, + "grad_norm": 1.8270229833764349, + "learning_rate": 1.9138785625108955e-06, + "loss": 1.2995, + "step": 1174 + }, + { + "epoch": 0.15928963600623602, + "grad_norm": 1.857341433669138, + "learning_rate": 1.9137001898282484e-06, + "loss": 1.2307, + "step": 1175 + }, + { + "epoch": 0.1594252016539009, + "grad_norm": 1.8283918420171297, + "learning_rate": 1.9135216409457327e-06, + "loss": 1.2793, + "step": 1176 + }, + { + "epoch": 0.1595607673015658, + "grad_norm": 1.9792794213105314, + "learning_rate": 1.913342915897779e-06, + "loss": 1.2515, + "step": 1177 + }, + { + "epoch": 0.15969633294923066, + "grad_norm": 3.3863171441327453, + "learning_rate": 1.9131640147188534e-06, + "loss": 1.2422, + "step": 1178 + }, + { + "epoch": 0.15983189859689553, + "grad_norm": 2.2723755110348507, + "learning_rate": 
1.912984937443456e-06, + "loss": 1.2628, + "step": 1179 + }, + { + "epoch": 0.15996746424456043, + "grad_norm": 1.6230873189050743, + "learning_rate": 1.9128056841061197e-06, + "loss": 1.2867, + "step": 1180 + }, + { + "epoch": 0.1601030298922253, + "grad_norm": 1.6785450109562188, + "learning_rate": 1.912626254741413e-06, + "loss": 1.2513, + "step": 1181 + }, + { + "epoch": 0.1602385955398902, + "grad_norm": 2.4209704373720786, + "learning_rate": 1.912446649383936e-06, + "loss": 1.2506, + "step": 1182 + }, + { + "epoch": 0.16037416118755507, + "grad_norm": 3.4734204703164746, + "learning_rate": 1.9122668680683255e-06, + "loss": 1.2589, + "step": 1183 + }, + { + "epoch": 0.16050972683521997, + "grad_norm": 2.3115194807538386, + "learning_rate": 1.9120869108292504e-06, + "loss": 1.2242, + "step": 1184 + }, + { + "epoch": 0.16064529248288484, + "grad_norm": 2.023035573141384, + "learning_rate": 1.9119067777014146e-06, + "loss": 1.2415, + "step": 1185 + }, + { + "epoch": 0.1607808581305497, + "grad_norm": 2.276360736634274, + "learning_rate": 1.9117264687195546e-06, + "loss": 1.2885, + "step": 1186 + }, + { + "epoch": 0.1609164237782146, + "grad_norm": 1.6036690831264686, + "learning_rate": 1.911545983918442e-06, + "loss": 1.2543, + "step": 1187 + }, + { + "epoch": 0.16105198942587948, + "grad_norm": 2.134972517626086, + "learning_rate": 1.911365323332881e-06, + "loss": 1.2406, + "step": 1188 + }, + { + "epoch": 0.16118755507354438, + "grad_norm": 2.0999086823035356, + "learning_rate": 1.9111844869977123e-06, + "loss": 1.2954, + "step": 1189 + }, + { + "epoch": 0.16132312072120925, + "grad_norm": 1.5759312087558497, + "learning_rate": 1.911003474947807e-06, + "loss": 1.2432, + "step": 1190 + }, + { + "epoch": 0.16145868636887412, + "grad_norm": 1.8792367960289744, + "learning_rate": 1.910822287218073e-06, + "loss": 1.2211, + "step": 1191 + }, + { + "epoch": 0.16159425201653901, + "grad_norm": 1.887600968904665, + "learning_rate": 1.9106409238434503e-06, + "loss": 1.2151, + "step": 1192 + }, + { + "epoch": 0.16172981766420388, + "grad_norm": 1.9356282798168938, + "learning_rate": 1.9104593848589137e-06, + "loss": 1.2545, + "step": 1193 + }, + { + "epoch": 0.16186538331186878, + "grad_norm": 2.3438057103395575, + "learning_rate": 1.9102776702994713e-06, + "loss": 1.2729, + "step": 1194 + }, + { + "epoch": 0.16200094895953365, + "grad_norm": 5.4219320678647716, + "learning_rate": 1.9100957802001654e-06, + "loss": 1.2376, + "step": 1195 + }, + { + "epoch": 0.16213651460719852, + "grad_norm": 2.4596704014058774, + "learning_rate": 1.9099137145960724e-06, + "loss": 1.2574, + "step": 1196 + }, + { + "epoch": 0.16227208025486342, + "grad_norm": 1.6662193423901304, + "learning_rate": 1.909731473522302e-06, + "loss": 1.2497, + "step": 1197 + }, + { + "epoch": 0.1624076459025283, + "grad_norm": 1.8693589305065212, + "learning_rate": 1.9095490570139977e-06, + "loss": 1.2295, + "step": 1198 + }, + { + "epoch": 0.1625432115501932, + "grad_norm": 1.8120999981096741, + "learning_rate": 1.9093664651063375e-06, + "loss": 1.2513, + "step": 1199 + }, + { + "epoch": 0.16267877719785806, + "grad_norm": 1.9892159454585756, + "learning_rate": 1.9091836978345323e-06, + "loss": 1.2331, + "step": 1200 + }, + { + "epoch": 0.16281434284552296, + "grad_norm": 2.260863277170653, + "learning_rate": 1.909000755233828e-06, + "loss": 1.2655, + "step": 1201 + }, + { + "epoch": 0.16294990849318783, + "grad_norm": 2.329678727137876, + "learning_rate": 1.908817637339503e-06, + "loss": 1.2197, + "step": 1202 + }, + { + "epoch": 
0.1630854741408527, + "grad_norm": 4.808585552805517, + "learning_rate": 1.9086343441868706e-06, + "loss": 1.2357, + "step": 1203 + }, + { + "epoch": 0.1632210397885176, + "grad_norm": 3.021799546855328, + "learning_rate": 1.908450875811277e-06, + "loss": 1.2898, + "step": 1204 + }, + { + "epoch": 0.16335660543618247, + "grad_norm": 1.6952036655132472, + "learning_rate": 1.908267232248103e-06, + "loss": 1.2663, + "step": 1205 + }, + { + "epoch": 0.16349217108384737, + "grad_norm": 16.46331071059676, + "learning_rate": 1.9080834135327624e-06, + "loss": 1.2606, + "step": 1206 + }, + { + "epoch": 0.16362773673151224, + "grad_norm": 2.8965467636615005, + "learning_rate": 1.907899419700704e-06, + "loss": 1.279, + "step": 1207 + }, + { + "epoch": 0.1637633023791771, + "grad_norm": 2.3247335210497866, + "learning_rate": 1.9077152507874086e-06, + "loss": 1.2083, + "step": 1208 + }, + { + "epoch": 0.163898868026842, + "grad_norm": 3.778782987630236, + "learning_rate": 1.9075309068283928e-06, + "loss": 1.2504, + "step": 1209 + }, + { + "epoch": 0.16403443367450687, + "grad_norm": 1.9462293983449062, + "learning_rate": 1.9073463878592046e-06, + "loss": 1.2114, + "step": 1210 + }, + { + "epoch": 0.16416999932217177, + "grad_norm": 1.733259838430761, + "learning_rate": 1.9071616939154279e-06, + "loss": 1.263, + "step": 1211 + }, + { + "epoch": 0.16430556496983664, + "grad_norm": 8.710323782394541, + "learning_rate": 1.9069768250326792e-06, + "loss": 1.2837, + "step": 1212 + }, + { + "epoch": 0.1644411306175015, + "grad_norm": 2.6293117944376894, + "learning_rate": 1.9067917812466088e-06, + "loss": 1.231, + "step": 1213 + }, + { + "epoch": 0.1645766962651664, + "grad_norm": 2.0639311883860576, + "learning_rate": 1.9066065625929014e-06, + "loss": 1.2469, + "step": 1214 + }, + { + "epoch": 0.16471226191283128, + "grad_norm": 4.757362896981926, + "learning_rate": 1.9064211691072747e-06, + "loss": 1.2371, + "step": 1215 + }, + { + "epoch": 0.16484782756049618, + "grad_norm": 1.6660732692709748, + "learning_rate": 1.9062356008254804e-06, + "loss": 1.2629, + "step": 1216 + }, + { + "epoch": 0.16498339320816105, + "grad_norm": 1.728783214877443, + "learning_rate": 1.906049857783304e-06, + "loss": 1.2373, + "step": 1217 + }, + { + "epoch": 0.16511895885582592, + "grad_norm": 2.00670765978545, + "learning_rate": 1.905863940016564e-06, + "loss": 1.2532, + "step": 1218 + }, + { + "epoch": 0.16525452450349082, + "grad_norm": 2.17273324459327, + "learning_rate": 1.9056778475611143e-06, + "loss": 1.2376, + "step": 1219 + }, + { + "epoch": 0.1653900901511557, + "grad_norm": 1.9710770439619354, + "learning_rate": 1.9054915804528403e-06, + "loss": 1.247, + "step": 1220 + }, + { + "epoch": 0.1655256557988206, + "grad_norm": 2.20993389055933, + "learning_rate": 1.9053051387276625e-06, + "loss": 1.2821, + "step": 1221 + }, + { + "epoch": 0.16566122144648546, + "grad_norm": 1.7266629770819328, + "learning_rate": 1.9051185224215347e-06, + "loss": 1.2298, + "step": 1222 + }, + { + "epoch": 0.16579678709415036, + "grad_norm": 1.8137766161851852, + "learning_rate": 1.9049317315704445e-06, + "loss": 1.2219, + "step": 1223 + }, + { + "epoch": 0.16593235274181523, + "grad_norm": 1.82386010141255, + "learning_rate": 1.904744766210413e-06, + "loss": 1.2424, + "step": 1224 + }, + { + "epoch": 0.1660679183894801, + "grad_norm": 2.062747441031642, + "learning_rate": 1.904557626377495e-06, + "loss": 1.2593, + "step": 1225 + }, + { + "epoch": 0.166203484037145, + "grad_norm": 2.3180952045968564, + "learning_rate": 1.9043703121077788e-06, 
+ "loss": 1.2305, + "step": 1226 + }, + { + "epoch": 0.16633904968480986, + "grad_norm": 1.6832116729962618, + "learning_rate": 1.9041828234373866e-06, + "loss": 1.2597, + "step": 1227 + }, + { + "epoch": 0.16647461533247476, + "grad_norm": 1.5626880404110246, + "learning_rate": 1.903995160402474e-06, + "loss": 1.2084, + "step": 1228 + }, + { + "epoch": 0.16661018098013963, + "grad_norm": 2.934195351732801, + "learning_rate": 1.9038073230392306e-06, + "loss": 1.2476, + "step": 1229 + }, + { + "epoch": 0.1667457466278045, + "grad_norm": 1.873988566910776, + "learning_rate": 1.903619311383879e-06, + "loss": 1.2676, + "step": 1230 + }, + { + "epoch": 0.1668813122754694, + "grad_norm": 2.065563697273787, + "learning_rate": 1.903431125472676e-06, + "loss": 1.2622, + "step": 1231 + }, + { + "epoch": 0.16701687792313427, + "grad_norm": 3.2576635863265753, + "learning_rate": 1.903242765341912e-06, + "loss": 1.2769, + "step": 1232 + }, + { + "epoch": 0.16715244357079917, + "grad_norm": 2.2511533008652482, + "learning_rate": 1.90305423102791e-06, + "loss": 1.225, + "step": 1233 + }, + { + "epoch": 0.16728800921846404, + "grad_norm": 2.117777158745222, + "learning_rate": 1.902865522567028e-06, + "loss": 1.2334, + "step": 1234 + }, + { + "epoch": 0.1674235748661289, + "grad_norm": 2.0561153977766553, + "learning_rate": 1.9026766399956568e-06, + "loss": 1.2482, + "step": 1235 + }, + { + "epoch": 0.1675591405137938, + "grad_norm": 1.8875140596034072, + "learning_rate": 1.9024875833502208e-06, + "loss": 1.2361, + "step": 1236 + }, + { + "epoch": 0.16769470616145868, + "grad_norm": 1.9265939315281828, + "learning_rate": 1.9022983526671784e-06, + "loss": 1.185, + "step": 1237 + }, + { + "epoch": 0.16783027180912358, + "grad_norm": 2.1662106729684942, + "learning_rate": 1.9021089479830206e-06, + "loss": 1.2515, + "step": 1238 + }, + { + "epoch": 0.16796583745678845, + "grad_norm": 2.4328850383629113, + "learning_rate": 1.9019193693342733e-06, + "loss": 1.2467, + "step": 1239 + }, + { + "epoch": 0.16810140310445335, + "grad_norm": 2.4283735905730923, + "learning_rate": 1.9017296167574948e-06, + "loss": 1.216, + "step": 1240 + }, + { + "epoch": 0.16823696875211822, + "grad_norm": 10.526921643277241, + "learning_rate": 1.9015396902892775e-06, + "loss": 1.2513, + "step": 1241 + }, + { + "epoch": 0.16837253439978309, + "grad_norm": 2.149074427911466, + "learning_rate": 1.9013495899662474e-06, + "loss": 1.2973, + "step": 1242 + }, + { + "epoch": 0.16850810004744798, + "grad_norm": 4.145047767405199, + "learning_rate": 1.9011593158250637e-06, + "loss": 1.25, + "step": 1243 + }, + { + "epoch": 0.16864366569511285, + "grad_norm": 1.6979930603686928, + "learning_rate": 1.9009688679024189e-06, + "loss": 1.209, + "step": 1244 + }, + { + "epoch": 0.16877923134277775, + "grad_norm": 2.2492924337411995, + "learning_rate": 1.9007782462350401e-06, + "loss": 1.2584, + "step": 1245 + }, + { + "epoch": 0.16891479699044262, + "grad_norm": 4.386421498885255, + "learning_rate": 1.9005874508596868e-06, + "loss": 1.2561, + "step": 1246 + }, + { + "epoch": 0.1690503626381075, + "grad_norm": 1.758005425891091, + "learning_rate": 1.9003964818131524e-06, + "loss": 1.2752, + "step": 1247 + }, + { + "epoch": 0.1691859282857724, + "grad_norm": 2.0511360047503095, + "learning_rate": 1.9002053391322636e-06, + "loss": 1.2198, + "step": 1248 + }, + { + "epoch": 0.16932149393343726, + "grad_norm": 1.8357532315626086, + "learning_rate": 1.900014022853881e-06, + "loss": 1.2548, + "step": 1249 + }, + { + "epoch": 0.16945705958110216, + 
"grad_norm": 1.8827168876778904, + "learning_rate": 1.8998225330148988e-06, + "loss": 1.2246, + "step": 1250 + }, + { + "epoch": 0.16959262522876703, + "grad_norm": 2.1255577804221812, + "learning_rate": 1.8996308696522432e-06, + "loss": 1.2517, + "step": 1251 + }, + { + "epoch": 0.1697281908764319, + "grad_norm": 1.855094506731317, + "learning_rate": 1.899439032802876e-06, + "loss": 1.2558, + "step": 1252 + }, + { + "epoch": 0.1698637565240968, + "grad_norm": 4.0897795082154556, + "learning_rate": 1.8992470225037911e-06, + "loss": 1.3051, + "step": 1253 + }, + { + "epoch": 0.16999932217176167, + "grad_norm": 1.7125280486486834, + "learning_rate": 1.899054838792016e-06, + "loss": 1.2342, + "step": 1254 + }, + { + "epoch": 0.17013488781942657, + "grad_norm": 2.4620289904022026, + "learning_rate": 1.8988624817046119e-06, + "loss": 1.2089, + "step": 1255 + }, + { + "epoch": 0.17027045346709144, + "grad_norm": 1.867105147517223, + "learning_rate": 1.8986699512786735e-06, + "loss": 1.2502, + "step": 1256 + }, + { + "epoch": 0.1704060191147563, + "grad_norm": 1.8331956430372207, + "learning_rate": 1.898477247551329e-06, + "loss": 1.2147, + "step": 1257 + }, + { + "epoch": 0.1705415847624212, + "grad_norm": 2.0787136367804426, + "learning_rate": 1.8982843705597388e-06, + "loss": 1.2626, + "step": 1258 + }, + { + "epoch": 0.17067715041008608, + "grad_norm": 3.9411070645004633, + "learning_rate": 1.8980913203410988e-06, + "loss": 1.2578, + "step": 1259 + }, + { + "epoch": 0.17081271605775097, + "grad_norm": 1.8465323936601505, + "learning_rate": 1.8978980969326366e-06, + "loss": 1.2684, + "step": 1260 + }, + { + "epoch": 0.17094828170541584, + "grad_norm": 1.8963873093748131, + "learning_rate": 1.897704700371614e-06, + "loss": 1.2391, + "step": 1261 + }, + { + "epoch": 0.17108384735308074, + "grad_norm": 2.103507282578768, + "learning_rate": 1.8975111306953261e-06, + "loss": 1.2888, + "step": 1262 + }, + { + "epoch": 0.1712194130007456, + "grad_norm": 2.5345605725408524, + "learning_rate": 1.8973173879411011e-06, + "loss": 1.2762, + "step": 1263 + }, + { + "epoch": 0.17135497864841048, + "grad_norm": 2.182412855272182, + "learning_rate": 1.8971234721463008e-06, + "loss": 1.278, + "step": 1264 + }, + { + "epoch": 0.17149054429607538, + "grad_norm": 2.201817617763733, + "learning_rate": 1.8969293833483202e-06, + "loss": 1.2188, + "step": 1265 + }, + { + "epoch": 0.17162610994374025, + "grad_norm": 1.5381631737895332, + "learning_rate": 1.896735121584588e-06, + "loss": 1.1882, + "step": 1266 + }, + { + "epoch": 0.17176167559140515, + "grad_norm": 3.267574645870925, + "learning_rate": 1.8965406868925664e-06, + "loss": 1.2184, + "step": 1267 + }, + { + "epoch": 0.17189724123907002, + "grad_norm": 2.2466154500850037, + "learning_rate": 1.89634607930975e-06, + "loss": 1.2503, + "step": 1268 + }, + { + "epoch": 0.1720328068867349, + "grad_norm": 1.9282163980221756, + "learning_rate": 1.8961512988736671e-06, + "loss": 1.2546, + "step": 1269 + }, + { + "epoch": 0.1721683725343998, + "grad_norm": 1.8789030229094394, + "learning_rate": 1.8959563456218807e-06, + "loss": 1.245, + "step": 1270 + }, + { + "epoch": 0.17230393818206466, + "grad_norm": 1.701443176294545, + "learning_rate": 1.8957612195919847e-06, + "loss": 1.2477, + "step": 1271 + }, + { + "epoch": 0.17243950382972956, + "grad_norm": 1.9414482036003369, + "learning_rate": 1.8955659208216086e-06, + "loss": 1.2562, + "step": 1272 + }, + { + "epoch": 0.17257506947739443, + "grad_norm": 1.9867259426042838, + "learning_rate": 1.8953704493484138e-06, + 
"loss": 1.241, + "step": 1273 + }, + { + "epoch": 0.1727106351250593, + "grad_norm": 2.150248756467047, + "learning_rate": 1.8951748052100954e-06, + "loss": 1.3109, + "step": 1274 + }, + { + "epoch": 0.1728462007727242, + "grad_norm": 33.66405582425212, + "learning_rate": 1.894978988444382e-06, + "loss": 1.2207, + "step": 1275 + }, + { + "epoch": 0.17298176642038907, + "grad_norm": 3.6039796501162096, + "learning_rate": 1.8947829990890347e-06, + "loss": 1.2774, + "step": 1276 + }, + { + "epoch": 0.17311733206805396, + "grad_norm": 1.8191164684485566, + "learning_rate": 1.8945868371818493e-06, + "loss": 1.2486, + "step": 1277 + }, + { + "epoch": 0.17325289771571883, + "grad_norm": 4.4904843615860575, + "learning_rate": 1.8943905027606539e-06, + "loss": 1.2489, + "step": 1278 + }, + { + "epoch": 0.17338846336338373, + "grad_norm": 1.7610091632587277, + "learning_rate": 1.8941939958633099e-06, + "loss": 1.2765, + "step": 1279 + }, + { + "epoch": 0.1735240290110486, + "grad_norm": 2.0175768717037688, + "learning_rate": 1.8939973165277123e-06, + "loss": 1.2521, + "step": 1280 + }, + { + "epoch": 0.17365959465871347, + "grad_norm": 2.57153808376016, + "learning_rate": 1.8938004647917886e-06, + "loss": 1.2351, + "step": 1281 + }, + { + "epoch": 0.17379516030637837, + "grad_norm": 1.8284291755977893, + "learning_rate": 1.8936034406935008e-06, + "loss": 1.1983, + "step": 1282 + }, + { + "epoch": 0.17393072595404324, + "grad_norm": 1.6363080275819666, + "learning_rate": 1.8934062442708432e-06, + "loss": 1.211, + "step": 1283 + }, + { + "epoch": 0.17406629160170814, + "grad_norm": 2.511864662156205, + "learning_rate": 1.8932088755618434e-06, + "loss": 1.2634, + "step": 1284 + }, + { + "epoch": 0.174201857249373, + "grad_norm": 2.1187161601552247, + "learning_rate": 1.8930113346045627e-06, + "loss": 1.232, + "step": 1285 + }, + { + "epoch": 0.17433742289703788, + "grad_norm": 2.2748951730595914, + "learning_rate": 1.892813621437095e-06, + "loss": 1.2375, + "step": 1286 + }, + { + "epoch": 0.17447298854470278, + "grad_norm": 5.446369975862091, + "learning_rate": 1.8926157360975674e-06, + "loss": 1.2328, + "step": 1287 + }, + { + "epoch": 0.17460855419236765, + "grad_norm": 2.317454286679598, + "learning_rate": 1.8924176786241416e-06, + "loss": 1.239, + "step": 1288 + }, + { + "epoch": 0.17474411984003255, + "grad_norm": 3.399809227589041, + "learning_rate": 1.8922194490550103e-06, + "loss": 1.2455, + "step": 1289 + }, + { + "epoch": 0.17487968548769742, + "grad_norm": 2.0788853839855865, + "learning_rate": 1.8920210474284014e-06, + "loss": 1.2734, + "step": 1290 + }, + { + "epoch": 0.1750152511353623, + "grad_norm": 3.5590003752818538, + "learning_rate": 1.8918224737825743e-06, + "loss": 1.218, + "step": 1291 + }, + { + "epoch": 0.17515081678302719, + "grad_norm": 2.3695914923288988, + "learning_rate": 1.891623728155823e-06, + "loss": 1.2317, + "step": 1292 + }, + { + "epoch": 0.17528638243069206, + "grad_norm": 1.7349002567820526, + "learning_rate": 1.8914248105864738e-06, + "loss": 1.2373, + "step": 1293 + }, + { + "epoch": 0.17542194807835695, + "grad_norm": 5.004592158112508, + "learning_rate": 1.8912257211128864e-06, + "loss": 1.2634, + "step": 1294 + }, + { + "epoch": 0.17555751372602182, + "grad_norm": 14.517004706237461, + "learning_rate": 1.8910264597734535e-06, + "loss": 1.2106, + "step": 1295 + }, + { + "epoch": 0.1756930793736867, + "grad_norm": 2.477688051179709, + "learning_rate": 1.8908270266066011e-06, + "loss": 1.2614, + "step": 1296 + }, + { + "epoch": 0.1758286450213516, + 
"grad_norm": 1.9568908044243714, + "learning_rate": 1.8906274216507885e-06, + "loss": 1.2575, + "step": 1297 + }, + { + "epoch": 0.17596421066901646, + "grad_norm": 2.7201233028688523, + "learning_rate": 1.8904276449445079e-06, + "loss": 1.2256, + "step": 1298 + }, + { + "epoch": 0.17609977631668136, + "grad_norm": 1.7850312165550921, + "learning_rate": 1.8902276965262845e-06, + "loss": 1.2262, + "step": 1299 + }, + { + "epoch": 0.17623534196434623, + "grad_norm": 1.8568556192180439, + "learning_rate": 1.8900275764346768e-06, + "loss": 1.2335, + "step": 1300 + }, + { + "epoch": 0.17637090761201113, + "grad_norm": 2.3979322970153976, + "learning_rate": 1.8898272847082764e-06, + "loss": 1.2517, + "step": 1301 + }, + { + "epoch": 0.176506473259676, + "grad_norm": 1.6531919972674793, + "learning_rate": 1.8896268213857078e-06, + "loss": 1.2751, + "step": 1302 + }, + { + "epoch": 0.17664203890734087, + "grad_norm": 2.5934435176158432, + "learning_rate": 1.8894261865056293e-06, + "loss": 1.2348, + "step": 1303 + }, + { + "epoch": 0.17677760455500577, + "grad_norm": 2.514287026259883, + "learning_rate": 1.8892253801067315e-06, + "loss": 1.2462, + "step": 1304 + }, + { + "epoch": 0.17691317020267064, + "grad_norm": 1.8011203092497423, + "learning_rate": 1.889024402227738e-06, + "loss": 1.2266, + "step": 1305 + }, + { + "epoch": 0.17704873585033554, + "grad_norm": 2.818902533449304, + "learning_rate": 1.8888232529074062e-06, + "loss": 1.2152, + "step": 1306 + }, + { + "epoch": 0.1771843014980004, + "grad_norm": 3.843841382601721, + "learning_rate": 1.888621932184526e-06, + "loss": 1.3004, + "step": 1307 + }, + { + "epoch": 0.17731986714566528, + "grad_norm": 1.74351312168936, + "learning_rate": 1.8884204400979206e-06, + "loss": 1.2641, + "step": 1308 + }, + { + "epoch": 0.17745543279333018, + "grad_norm": 1.9974642297256635, + "learning_rate": 1.888218776686446e-06, + "loss": 1.2208, + "step": 1309 + }, + { + "epoch": 0.17759099844099505, + "grad_norm": 1.658196975328815, + "learning_rate": 1.8880169419889915e-06, + "loss": 1.2198, + "step": 1310 + }, + { + "epoch": 0.17772656408865994, + "grad_norm": 3.2100409063235835, + "learning_rate": 1.8878149360444793e-06, + "loss": 1.2159, + "step": 1311 + }, + { + "epoch": 0.17786212973632481, + "grad_norm": 2.362138221712709, + "learning_rate": 1.8876127588918648e-06, + "loss": 1.2598, + "step": 1312 + }, + { + "epoch": 0.17799769538398968, + "grad_norm": 2.045939205157097, + "learning_rate": 1.887410410570136e-06, + "loss": 1.2458, + "step": 1313 + }, + { + "epoch": 0.17813326103165458, + "grad_norm": 2.003861478678199, + "learning_rate": 1.8872078911183145e-06, + "loss": 1.263, + "step": 1314 + }, + { + "epoch": 0.17826882667931945, + "grad_norm": 1.9130726486081804, + "learning_rate": 1.8870052005754542e-06, + "loss": 1.2062, + "step": 1315 + }, + { + "epoch": 0.17840439232698435, + "grad_norm": 1.9693162746966746, + "learning_rate": 1.8868023389806428e-06, + "loss": 1.211, + "step": 1316 + }, + { + "epoch": 0.17853995797464922, + "grad_norm": 2.317802932679959, + "learning_rate": 1.8865993063730002e-06, + "loss": 1.246, + "step": 1317 + }, + { + "epoch": 0.1786755236223141, + "grad_norm": 2.1072665856639827, + "learning_rate": 1.8863961027916794e-06, + "loss": 1.2643, + "step": 1318 + }, + { + "epoch": 0.178811089269979, + "grad_norm": 3.7954728295169193, + "learning_rate": 1.8861927282758673e-06, + "loss": 1.2514, + "step": 1319 + }, + { + "epoch": 0.17894665491764386, + "grad_norm": 2.2806138106364124, + "learning_rate": 1.8859891828647827e-06, + 
"loss": 1.2126, + "step": 1320 + }, + { + "epoch": 0.17908222056530876, + "grad_norm": 1.781114739659256, + "learning_rate": 1.8857854665976777e-06, + "loss": 1.2467, + "step": 1321 + }, + { + "epoch": 0.17921778621297363, + "grad_norm": 1.998732587584506, + "learning_rate": 1.8855815795138375e-06, + "loss": 1.2669, + "step": 1322 + }, + { + "epoch": 0.17935335186063853, + "grad_norm": 1.4911448986540055, + "learning_rate": 1.8853775216525803e-06, + "loss": 1.2737, + "step": 1323 + }, + { + "epoch": 0.1794889175083034, + "grad_norm": 2.1565068045301645, + "learning_rate": 1.8851732930532563e-06, + "loss": 1.2245, + "step": 1324 + }, + { + "epoch": 0.17962448315596827, + "grad_norm": 1.58315809535004, + "learning_rate": 1.8849688937552502e-06, + "loss": 1.2348, + "step": 1325 + }, + { + "epoch": 0.17976004880363317, + "grad_norm": 1.7864614878359628, + "learning_rate": 1.8847643237979783e-06, + "loss": 1.2298, + "step": 1326 + }, + { + "epoch": 0.17989561445129804, + "grad_norm": 1.8555759192088939, + "learning_rate": 1.8845595832208905e-06, + "loss": 1.2154, + "step": 1327 + }, + { + "epoch": 0.18003118009896293, + "grad_norm": 1.8954776718043866, + "learning_rate": 1.8843546720634693e-06, + "loss": 1.2712, + "step": 1328 + }, + { + "epoch": 0.1801667457466278, + "grad_norm": 1.910524523423986, + "learning_rate": 1.8841495903652302e-06, + "loss": 1.2227, + "step": 1329 + }, + { + "epoch": 0.18030231139429267, + "grad_norm": 2.0092512126577255, + "learning_rate": 1.883944338165722e-06, + "loss": 1.2625, + "step": 1330 + }, + { + "epoch": 0.18043787704195757, + "grad_norm": 1.7422050684137507, + "learning_rate": 1.8837389155045253e-06, + "loss": 1.223, + "step": 1331 + }, + { + "epoch": 0.18057344268962244, + "grad_norm": 2.0668161223984214, + "learning_rate": 1.883533322421255e-06, + "loss": 1.238, + "step": 1332 + }, + { + "epoch": 0.18070900833728734, + "grad_norm": 2.1423873901283645, + "learning_rate": 1.883327558955557e-06, + "loss": 1.255, + "step": 1333 + }, + { + "epoch": 0.1808445739849522, + "grad_norm": 1.6270784771946751, + "learning_rate": 1.8831216251471123e-06, + "loss": 1.2647, + "step": 1334 + }, + { + "epoch": 0.18098013963261708, + "grad_norm": 1.9770402254843376, + "learning_rate": 1.8829155210356329e-06, + "loss": 1.2147, + "step": 1335 + }, + { + "epoch": 0.18111570528028198, + "grad_norm": 5.913778622046024, + "learning_rate": 1.8827092466608647e-06, + "loss": 1.2221, + "step": 1336 + }, + { + "epoch": 0.18125127092794685, + "grad_norm": 1.8301567901651519, + "learning_rate": 1.8825028020625858e-06, + "loss": 1.1991, + "step": 1337 + }, + { + "epoch": 0.18138683657561175, + "grad_norm": 1.718471334390489, + "learning_rate": 1.8822961872806076e-06, + "loss": 1.2383, + "step": 1338 + }, + { + "epoch": 0.18152240222327662, + "grad_norm": 1.7423020478183011, + "learning_rate": 1.8820894023547745e-06, + "loss": 1.2196, + "step": 1339 + }, + { + "epoch": 0.18165796787094152, + "grad_norm": 1.4975714273348792, + "learning_rate": 1.8818824473249624e-06, + "loss": 1.2305, + "step": 1340 + }, + { + "epoch": 0.1817935335186064, + "grad_norm": 4.92023567970986, + "learning_rate": 1.8816753222310818e-06, + "loss": 1.2621, + "step": 1341 + }, + { + "epoch": 0.18192909916627126, + "grad_norm": 1.7377991006813884, + "learning_rate": 1.8814680271130747e-06, + "loss": 1.2279, + "step": 1342 + }, + { + "epoch": 0.18206466481393616, + "grad_norm": 1.8721612635153688, + "learning_rate": 1.8812605620109165e-06, + "loss": 1.2072, + "step": 1343 + }, + { + "epoch": 0.18220023046160103, + 
"grad_norm": 1.6938126667379068, + "learning_rate": 1.881052926964615e-06, + "loss": 1.266, + "step": 1344 + }, + { + "epoch": 0.18233579610926592, + "grad_norm": 2.7970830488229828, + "learning_rate": 1.8808451220142114e-06, + "loss": 1.253, + "step": 1345 + }, + { + "epoch": 0.1824713617569308, + "grad_norm": 1.830079505336092, + "learning_rate": 1.880637147199779e-06, + "loss": 1.2645, + "step": 1346 + }, + { + "epoch": 0.18260692740459566, + "grad_norm": 2.0826468145792822, + "learning_rate": 1.8804290025614242e-06, + "loss": 1.2742, + "step": 1347 + }, + { + "epoch": 0.18274249305226056, + "grad_norm": 1.8237964055426592, + "learning_rate": 1.8802206881392858e-06, + "loss": 1.241, + "step": 1348 + }, + { + "epoch": 0.18287805869992543, + "grad_norm": 2.1512697448315565, + "learning_rate": 1.8800122039735355e-06, + "loss": 1.2658, + "step": 1349 + }, + { + "epoch": 0.18301362434759033, + "grad_norm": 2.879962233604458, + "learning_rate": 1.8798035501043783e-06, + "loss": 1.2102, + "step": 1350 + }, + { + "epoch": 0.1831491899952552, + "grad_norm": 2.729844659515945, + "learning_rate": 1.879594726572051e-06, + "loss": 1.2175, + "step": 1351 + }, + { + "epoch": 0.18328475564292007, + "grad_norm": 1.9883237457914327, + "learning_rate": 1.8793857334168243e-06, + "loss": 1.27, + "step": 1352 + }, + { + "epoch": 0.18342032129058497, + "grad_norm": 3.064261648600981, + "learning_rate": 1.8791765706789997e-06, + "loss": 1.2153, + "step": 1353 + }, + { + "epoch": 0.18355588693824984, + "grad_norm": 2.02601408287937, + "learning_rate": 1.8789672383989134e-06, + "loss": 1.228, + "step": 1354 + }, + { + "epoch": 0.18369145258591474, + "grad_norm": 1.7870502420349206, + "learning_rate": 1.8787577366169336e-06, + "loss": 1.2659, + "step": 1355 + }, + { + "epoch": 0.1838270182335796, + "grad_norm": 2.1669747995444797, + "learning_rate": 1.8785480653734607e-06, + "loss": 1.2536, + "step": 1356 + }, + { + "epoch": 0.18396258388124448, + "grad_norm": 1.978782117044531, + "learning_rate": 1.878338224708928e-06, + "loss": 1.2561, + "step": 1357 + }, + { + "epoch": 0.18409814952890938, + "grad_norm": 1.9735216517404681, + "learning_rate": 1.878128214663802e-06, + "loss": 1.23, + "step": 1358 + }, + { + "epoch": 0.18423371517657425, + "grad_norm": 2.368038945026116, + "learning_rate": 1.8779180352785814e-06, + "loss": 1.2433, + "step": 1359 + }, + { + "epoch": 0.18436928082423915, + "grad_norm": 2.101925630741658, + "learning_rate": 1.8777076865937976e-06, + "loss": 1.2448, + "step": 1360 + }, + { + "epoch": 0.18450484647190402, + "grad_norm": 2.6485165686400514, + "learning_rate": 1.8774971686500143e-06, + "loss": 1.2296, + "step": 1361 + }, + { + "epoch": 0.18464041211956891, + "grad_norm": 1.565115142634395, + "learning_rate": 1.877286481487829e-06, + "loss": 1.2105, + "step": 1362 + }, + { + "epoch": 0.18477597776723378, + "grad_norm": 1.7412163266206433, + "learning_rate": 1.8770756251478703e-06, + "loss": 1.2617, + "step": 1363 + }, + { + "epoch": 0.18491154341489865, + "grad_norm": 2.033986549140736, + "learning_rate": 1.8768645996708007e-06, + "loss": 1.2457, + "step": 1364 + }, + { + "epoch": 0.18504710906256355, + "grad_norm": 2.125563934200084, + "learning_rate": 1.8766534050973144e-06, + "loss": 1.2007, + "step": 1365 + }, + { + "epoch": 0.18518267471022842, + "grad_norm": 2.5182056631801872, + "learning_rate": 1.876442041468139e-06, + "loss": 1.2286, + "step": 1366 + }, + { + "epoch": 0.18531824035789332, + "grad_norm": 2.424740508772263, + "learning_rate": 1.876230508824034e-06, + "loss": 
1.238, + "step": 1367 + }, + { + "epoch": 0.1854538060055582, + "grad_norm": 3.653515584832514, + "learning_rate": 1.876018807205792e-06, + "loss": 1.2113, + "step": 1368 + }, + { + "epoch": 0.18558937165322306, + "grad_norm": 1.9310584794306935, + "learning_rate": 1.875806936654238e-06, + "loss": 1.2164, + "step": 1369 + }, + { + "epoch": 0.18572493730088796, + "grad_norm": 1.735855066740508, + "learning_rate": 1.8755948972102292e-06, + "loss": 1.243, + "step": 1370 + }, + { + "epoch": 0.18586050294855283, + "grad_norm": 2.344604517336858, + "learning_rate": 1.8753826889146562e-06, + "loss": 1.2384, + "step": 1371 + }, + { + "epoch": 0.18599606859621773, + "grad_norm": 4.050618675453687, + "learning_rate": 1.8751703118084413e-06, + "loss": 1.2297, + "step": 1372 + }, + { + "epoch": 0.1861316342438826, + "grad_norm": 1.8231958073426695, + "learning_rate": 1.8749577659325401e-06, + "loss": 1.2103, + "step": 1373 + }, + { + "epoch": 0.18626719989154747, + "grad_norm": 2.7712569428555045, + "learning_rate": 1.8747450513279403e-06, + "loss": 1.2145, + "step": 1374 + }, + { + "epoch": 0.18640276553921237, + "grad_norm": 2.6133615608636633, + "learning_rate": 1.874532168035662e-06, + "loss": 1.2862, + "step": 1375 + }, + { + "epoch": 0.18653833118687724, + "grad_norm": 4.853878572476156, + "learning_rate": 1.8743191160967584e-06, + "loss": 1.2527, + "step": 1376 + }, + { + "epoch": 0.18667389683454214, + "grad_norm": 2.2257626705765015, + "learning_rate": 1.8741058955523145e-06, + "loss": 1.2645, + "step": 1377 + }, + { + "epoch": 0.186809462482207, + "grad_norm": 1.965016507948669, + "learning_rate": 1.8738925064434485e-06, + "loss": 1.2494, + "step": 1378 + }, + { + "epoch": 0.1869450281298719, + "grad_norm": 2.6376501704949753, + "learning_rate": 1.8736789488113108e-06, + "loss": 1.2327, + "step": 1379 + }, + { + "epoch": 0.18708059377753677, + "grad_norm": 1.6681181105582434, + "learning_rate": 1.8734652226970844e-06, + "loss": 1.2187, + "step": 1380 + }, + { + "epoch": 0.18721615942520164, + "grad_norm": 4.453271591023285, + "learning_rate": 1.8732513281419843e-06, + "loss": 1.2467, + "step": 1381 + }, + { + "epoch": 0.18735172507286654, + "grad_norm": 1.82192326282396, + "learning_rate": 1.8730372651872585e-06, + "loss": 1.2282, + "step": 1382 + }, + { + "epoch": 0.1874872907205314, + "grad_norm": 1.7760547889762313, + "learning_rate": 1.8728230338741877e-06, + "loss": 1.2048, + "step": 1383 + }, + { + "epoch": 0.1876228563681963, + "grad_norm": 1.9449066605474612, + "learning_rate": 1.8726086342440842e-06, + "loss": 1.221, + "step": 1384 + }, + { + "epoch": 0.18775842201586118, + "grad_norm": 2.0575462744792032, + "learning_rate": 1.8723940663382939e-06, + "loss": 1.2245, + "step": 1385 + }, + { + "epoch": 0.18789398766352605, + "grad_norm": 1.8376050405781321, + "learning_rate": 1.8721793301981937e-06, + "loss": 1.2156, + "step": 1386 + }, + { + "epoch": 0.18802955331119095, + "grad_norm": 2.5674756825821716, + "learning_rate": 1.8719644258651942e-06, + "loss": 1.2471, + "step": 1387 + }, + { + "epoch": 0.18816511895885582, + "grad_norm": 1.7384528422862326, + "learning_rate": 1.8717493533807386e-06, + "loss": 1.2203, + "step": 1388 + }, + { + "epoch": 0.18830068460652072, + "grad_norm": 1.9010580396857433, + "learning_rate": 1.871534112786301e-06, + "loss": 1.2137, + "step": 1389 + }, + { + "epoch": 0.1884362502541856, + "grad_norm": 1.5676027131858916, + "learning_rate": 1.8713187041233893e-06, + "loss": 1.2217, + "step": 1390 + }, + { + "epoch": 0.18857181590185046, + "grad_norm": 
2.078791592996095, + "learning_rate": 1.8711031274335434e-06, + "loss": 1.1732, + "step": 1391 + }, + { + "epoch": 0.18870738154951536, + "grad_norm": 2.211315060832647, + "learning_rate": 1.8708873827583352e-06, + "loss": 1.2635, + "step": 1392 + }, + { + "epoch": 0.18884294719718023, + "grad_norm": 3.095669071180061, + "learning_rate": 1.8706714701393697e-06, + "loss": 1.2545, + "step": 1393 + }, + { + "epoch": 0.18897851284484513, + "grad_norm": 2.066381751225512, + "learning_rate": 1.8704553896182838e-06, + "loss": 1.2239, + "step": 1394 + }, + { + "epoch": 0.18911407849251, + "grad_norm": 2.471262188422591, + "learning_rate": 1.870239141236747e-06, + "loss": 1.2676, + "step": 1395 + }, + { + "epoch": 0.18924964414017487, + "grad_norm": 1.979213967202975, + "learning_rate": 1.870022725036461e-06, + "loss": 1.2002, + "step": 1396 + }, + { + "epoch": 0.18938520978783976, + "grad_norm": 1.8959321097753667, + "learning_rate": 1.8698061410591604e-06, + "loss": 1.226, + "step": 1397 + }, + { + "epoch": 0.18952077543550463, + "grad_norm": 1.9139942826450869, + "learning_rate": 1.8695893893466108e-06, + "loss": 1.2251, + "step": 1398 + }, + { + "epoch": 0.18965634108316953, + "grad_norm": 1.7408387118445907, + "learning_rate": 1.869372469940612e-06, + "loss": 1.2568, + "step": 1399 + }, + { + "epoch": 0.1897919067308344, + "grad_norm": 2.5092586642671653, + "learning_rate": 1.8691553828829948e-06, + "loss": 1.2445, + "step": 1400 + }, + { + "epoch": 0.1899274723784993, + "grad_norm": 1.9571635876662816, + "learning_rate": 1.8689381282156222e-06, + "loss": 1.2149, + "step": 1401 + }, + { + "epoch": 0.19006303802616417, + "grad_norm": 1.748433046311791, + "learning_rate": 1.868720705980391e-06, + "loss": 1.2487, + "step": 1402 + }, + { + "epoch": 0.19019860367382904, + "grad_norm": 1.7380479606896249, + "learning_rate": 1.8685031162192287e-06, + "loss": 1.2445, + "step": 1403 + }, + { + "epoch": 0.19033416932149394, + "grad_norm": 2.796998361056928, + "learning_rate": 1.8682853589740962e-06, + "loss": 1.2188, + "step": 1404 + }, + { + "epoch": 0.1904697349691588, + "grad_norm": 1.7378255094489052, + "learning_rate": 1.8680674342869858e-06, + "loss": 1.2288, + "step": 1405 + }, + { + "epoch": 0.1906053006168237, + "grad_norm": 1.6450667141061586, + "learning_rate": 1.867849342199923e-06, + "loss": 1.214, + "step": 1406 + }, + { + "epoch": 0.19074086626448858, + "grad_norm": 1.7319615476079662, + "learning_rate": 1.867631082754965e-06, + "loss": 1.2381, + "step": 1407 + }, + { + "epoch": 0.19087643191215345, + "grad_norm": 1.664362673020585, + "learning_rate": 1.8674126559942009e-06, + "loss": 1.2252, + "step": 1408 + }, + { + "epoch": 0.19101199755981835, + "grad_norm": 2.21317367961526, + "learning_rate": 1.8671940619597532e-06, + "loss": 1.2116, + "step": 1409 + }, + { + "epoch": 0.19114756320748322, + "grad_norm": 2.334620661079541, + "learning_rate": 1.8669753006937762e-06, + "loss": 1.2612, + "step": 1410 + }, + { + "epoch": 0.19128312885514812, + "grad_norm": 2.0501869296895547, + "learning_rate": 1.8667563722384559e-06, + "loss": 1.1933, + "step": 1411 + }, + { + "epoch": 0.19141869450281299, + "grad_norm": 2.334827067720466, + "learning_rate": 1.8665372766360107e-06, + "loss": 1.2694, + "step": 1412 + }, + { + "epoch": 0.19155426015047786, + "grad_norm": 1.7265016761434941, + "learning_rate": 1.866318013928692e-06, + "loss": 1.1998, + "step": 1413 + }, + { + "epoch": 0.19168982579814275, + "grad_norm": 1.90351532251785, + "learning_rate": 1.8660985841587824e-06, + "loss": 1.1922, + 
"step": 1414 + }, + { + "epoch": 0.19182539144580762, + "grad_norm": 2.4833523065002363, + "learning_rate": 1.8658789873685973e-06, + "loss": 1.2544, + "step": 1415 + }, + { + "epoch": 0.19196095709347252, + "grad_norm": 2.1835516064663194, + "learning_rate": 1.8656592236004847e-06, + "loss": 1.2262, + "step": 1416 + }, + { + "epoch": 0.1920965227411374, + "grad_norm": 1.6534277652493135, + "learning_rate": 1.8654392928968239e-06, + "loss": 1.2298, + "step": 1417 + }, + { + "epoch": 0.1922320883888023, + "grad_norm": 1.9855274873164925, + "learning_rate": 1.8652191953000265e-06, + "loss": 1.206, + "step": 1418 + }, + { + "epoch": 0.19236765403646716, + "grad_norm": 1.7585166176954679, + "learning_rate": 1.864998930852537e-06, + "loss": 1.2135, + "step": 1419 + }, + { + "epoch": 0.19250321968413203, + "grad_norm": 1.858943527265771, + "learning_rate": 1.8647784995968317e-06, + "loss": 1.1902, + "step": 1420 + }, + { + "epoch": 0.19263878533179693, + "grad_norm": 1.5706226563897898, + "learning_rate": 1.8645579015754189e-06, + "loss": 1.2035, + "step": 1421 + }, + { + "epoch": 0.1927743509794618, + "grad_norm": 1.6222241610837251, + "learning_rate": 1.8643371368308389e-06, + "loss": 1.2647, + "step": 1422 + }, + { + "epoch": 0.1929099166271267, + "grad_norm": 7.354176798105414, + "learning_rate": 1.8641162054056651e-06, + "loss": 1.2494, + "step": 1423 + }, + { + "epoch": 0.19304548227479157, + "grad_norm": 1.9832198580133713, + "learning_rate": 1.8638951073425018e-06, + "loss": 1.2127, + "step": 1424 + }, + { + "epoch": 0.19318104792245644, + "grad_norm": 1.9379630007680735, + "learning_rate": 1.8636738426839863e-06, + "loss": 1.2191, + "step": 1425 + }, + { + "epoch": 0.19331661357012134, + "grad_norm": 2.233360644239882, + "learning_rate": 1.8634524114727878e-06, + "loss": 1.2185, + "step": 1426 + }, + { + "epoch": 0.1934521792177862, + "grad_norm": 2.3823765226698144, + "learning_rate": 1.8632308137516071e-06, + "loss": 1.2355, + "step": 1427 + }, + { + "epoch": 0.1935877448654511, + "grad_norm": 4.696002475809125, + "learning_rate": 1.8630090495631783e-06, + "loss": 1.2698, + "step": 1428 + }, + { + "epoch": 0.19372331051311598, + "grad_norm": 3.2917200945931717, + "learning_rate": 1.8627871189502662e-06, + "loss": 1.2334, + "step": 1429 + }, + { + "epoch": 0.19385887616078085, + "grad_norm": 2.4413388761427974, + "learning_rate": 1.8625650219556688e-06, + "loss": 1.2386, + "step": 1430 + }, + { + "epoch": 0.19399444180844574, + "grad_norm": 2.1938123420733087, + "learning_rate": 1.8623427586222154e-06, + "loss": 1.2276, + "step": 1431 + }, + { + "epoch": 0.19413000745611061, + "grad_norm": 1.5750111679413, + "learning_rate": 1.8621203289927681e-06, + "loss": 1.2775, + "step": 1432 + }, + { + "epoch": 0.1942655731037755, + "grad_norm": 1.8329821855954802, + "learning_rate": 1.8618977331102204e-06, + "loss": 1.2326, + "step": 1433 + }, + { + "epoch": 0.19440113875144038, + "grad_norm": 1.6377728765378137, + "learning_rate": 1.861674971017498e-06, + "loss": 1.223, + "step": 1434 + }, + { + "epoch": 0.19453670439910525, + "grad_norm": 2.064871814407531, + "learning_rate": 1.8614520427575596e-06, + "loss": 1.2244, + "step": 1435 + }, + { + "epoch": 0.19467227004677015, + "grad_norm": 1.8610966097769803, + "learning_rate": 1.8612289483733942e-06, + "loss": 1.2271, + "step": 1436 + }, + { + "epoch": 0.19480783569443502, + "grad_norm": 2.362537915699557, + "learning_rate": 1.8610056879080247e-06, + "loss": 1.1972, + "step": 1437 + }, + { + "epoch": 0.19494340134209992, + "grad_norm": 
2.0618684309804887, + "learning_rate": 1.8607822614045041e-06, + "loss": 1.2864, + "step": 1438 + }, + { + "epoch": 0.1950789669897648, + "grad_norm": 3.045009379693363, + "learning_rate": 1.8605586689059195e-06, + "loss": 1.1923, + "step": 1439 + }, + { + "epoch": 0.1952145326374297, + "grad_norm": 2.495084602772215, + "learning_rate": 1.8603349104553882e-06, + "loss": 1.1767, + "step": 1440 + }, + { + "epoch": 0.19535009828509456, + "grad_norm": 1.7083014826021572, + "learning_rate": 1.8601109860960603e-06, + "loss": 1.1699, + "step": 1441 + }, + { + "epoch": 0.19548566393275943, + "grad_norm": 1.5192476508833623, + "learning_rate": 1.8598868958711185e-06, + "loss": 1.2154, + "step": 1442 + }, + { + "epoch": 0.19562122958042433, + "grad_norm": 1.933061167732601, + "learning_rate": 1.8596626398237762e-06, + "loss": 1.2184, + "step": 1443 + }, + { + "epoch": 0.1957567952280892, + "grad_norm": 3.29439420826514, + "learning_rate": 1.8594382179972794e-06, + "loss": 1.2226, + "step": 1444 + }, + { + "epoch": 0.1958923608757541, + "grad_norm": 4.755164245133784, + "learning_rate": 1.8592136304349063e-06, + "loss": 1.2569, + "step": 1445 + }, + { + "epoch": 0.19602792652341897, + "grad_norm": 3.203056490314853, + "learning_rate": 1.8589888771799669e-06, + "loss": 1.2273, + "step": 1446 + }, + { + "epoch": 0.19616349217108384, + "grad_norm": 1.700285900481202, + "learning_rate": 1.858763958275803e-06, + "loss": 1.2215, + "step": 1447 + }, + { + "epoch": 0.19629905781874873, + "grad_norm": 2.277590358046841, + "learning_rate": 1.8585388737657883e-06, + "loss": 1.2877, + "step": 1448 + }, + { + "epoch": 0.1964346234664136, + "grad_norm": 1.818890023757658, + "learning_rate": 1.8583136236933287e-06, + "loss": 1.2675, + "step": 1449 + }, + { + "epoch": 0.1965701891140785, + "grad_norm": 1.9823370751809102, + "learning_rate": 1.858088208101862e-06, + "loss": 1.2203, + "step": 1450 + }, + { + "epoch": 0.19670575476174337, + "grad_norm": 2.396488047106841, + "learning_rate": 1.8578626270348576e-06, + "loss": 1.1951, + "step": 1451 + }, + { + "epoch": 0.19684132040940824, + "grad_norm": 2.0435638225214103, + "learning_rate": 1.8576368805358171e-06, + "loss": 1.2819, + "step": 1452 + }, + { + "epoch": 0.19697688605707314, + "grad_norm": 2.125032299119229, + "learning_rate": 1.857410968648274e-06, + "loss": 1.2239, + "step": 1453 + }, + { + "epoch": 0.197112451704738, + "grad_norm": 2.2865350357914687, + "learning_rate": 1.8571848914157938e-06, + "loss": 1.2163, + "step": 1454 + }, + { + "epoch": 0.1972480173524029, + "grad_norm": 1.6797774985089622, + "learning_rate": 1.8569586488819732e-06, + "loss": 1.2225, + "step": 1455 + }, + { + "epoch": 0.19738358300006778, + "grad_norm": 2.5915084961668087, + "learning_rate": 1.8567322410904416e-06, + "loss": 1.2149, + "step": 1456 + }, + { + "epoch": 0.19751914864773265, + "grad_norm": 1.9606010449882147, + "learning_rate": 1.8565056680848602e-06, + "loss": 1.2588, + "step": 1457 + }, + { + "epoch": 0.19765471429539755, + "grad_norm": 1.6077504867023953, + "learning_rate": 1.8562789299089212e-06, + "loss": 1.2068, + "step": 1458 + }, + { + "epoch": 0.19779027994306242, + "grad_norm": 2.438235103524109, + "learning_rate": 1.8560520266063497e-06, + "loss": 1.2142, + "step": 1459 + }, + { + "epoch": 0.19792584559072732, + "grad_norm": 1.9492163998545224, + "learning_rate": 1.8558249582209022e-06, + "loss": 1.2121, + "step": 1460 + }, + { + "epoch": 0.1980614112383922, + "grad_norm": 1.9563596398498837, + "learning_rate": 1.8555977247963673e-06, + "loss": 1.2133, + 
"step": 1461 + }, + { + "epoch": 0.19819697688605709, + "grad_norm": 1.965489980234663, + "learning_rate": 1.8553703263765646e-06, + "loss": 1.2216, + "step": 1462 + }, + { + "epoch": 0.19833254253372196, + "grad_norm": 2.095081584358577, + "learning_rate": 1.8551427630053463e-06, + "loss": 1.2118, + "step": 1463 + }, + { + "epoch": 0.19846810818138683, + "grad_norm": 1.6935925625772559, + "learning_rate": 1.854915034726596e-06, + "loss": 1.1899, + "step": 1464 + }, + { + "epoch": 0.19860367382905172, + "grad_norm": 1.64825136461562, + "learning_rate": 1.8546871415842298e-06, + "loss": 1.2741, + "step": 1465 + }, + { + "epoch": 0.1987392394767166, + "grad_norm": 2.1412189991642743, + "learning_rate": 1.8544590836221947e-06, + "loss": 1.1743, + "step": 1466 + }, + { + "epoch": 0.1988748051243815, + "grad_norm": 2.3524711636818734, + "learning_rate": 1.8542308608844704e-06, + "loss": 1.234, + "step": 1467 + }, + { + "epoch": 0.19901037077204636, + "grad_norm": 6.054342822134298, + "learning_rate": 1.854002473415067e-06, + "loss": 1.2364, + "step": 1468 + }, + { + "epoch": 0.19914593641971123, + "grad_norm": 1.8224864585296763, + "learning_rate": 1.853773921258028e-06, + "loss": 1.2468, + "step": 1469 + }, + { + "epoch": 0.19928150206737613, + "grad_norm": 2.5000746241011558, + "learning_rate": 1.8535452044574274e-06, + "loss": 1.2451, + "step": 1470 + }, + { + "epoch": 0.199417067715041, + "grad_norm": 1.7448415476228685, + "learning_rate": 1.8533163230573716e-06, + "loss": 1.2518, + "step": 1471 + }, + { + "epoch": 0.1995526333627059, + "grad_norm": 2.0796429152418954, + "learning_rate": 1.8530872771019984e-06, + "loss": 1.2385, + "step": 1472 + }, + { + "epoch": 0.19968819901037077, + "grad_norm": 2.139740557998274, + "learning_rate": 1.8528580666354782e-06, + "loss": 1.2368, + "step": 1473 + }, + { + "epoch": 0.19982376465803564, + "grad_norm": 1.866008216000854, + "learning_rate": 1.8526286917020114e-06, + "loss": 1.2391, + "step": 1474 + }, + { + "epoch": 0.19995933030570054, + "grad_norm": 2.1367948947212905, + "learning_rate": 1.852399152345832e-06, + "loss": 1.1992, + "step": 1475 + }, + { + "epoch": 0.2000948959533654, + "grad_norm": 1.6636477943684067, + "learning_rate": 1.8521694486112045e-06, + "loss": 1.2185, + "step": 1476 + }, + { + "epoch": 0.2002304616010303, + "grad_norm": 1.730564729745614, + "learning_rate": 1.851939580542425e-06, + "loss": 1.2166, + "step": 1477 + }, + { + "epoch": 0.20036602724869518, + "grad_norm": 1.634151116899306, + "learning_rate": 1.8517095481838228e-06, + "loss": 1.228, + "step": 1478 + }, + { + "epoch": 0.20050159289636008, + "grad_norm": 2.5392594550939998, + "learning_rate": 1.8514793515797567e-06, + "loss": 1.258, + "step": 1479 + }, + { + "epoch": 0.20063715854402495, + "grad_norm": 2.323378600851342, + "learning_rate": 1.8512489907746193e-06, + "loss": 1.251, + "step": 1480 + }, + { + "epoch": 0.20077272419168982, + "grad_norm": 3.387468834479324, + "learning_rate": 1.851018465812833e-06, + "loss": 1.2086, + "step": 1481 + }, + { + "epoch": 0.20090828983935471, + "grad_norm": 1.8493897601152416, + "learning_rate": 1.8507877767388531e-06, + "loss": 1.2396, + "step": 1482 + }, + { + "epoch": 0.20104385548701958, + "grad_norm": 2.615246813817561, + "learning_rate": 1.8505569235971663e-06, + "loss": 1.2294, + "step": 1483 + }, + { + "epoch": 0.20117942113468448, + "grad_norm": 1.8834267110762362, + "learning_rate": 1.8503259064322907e-06, + "loss": 1.2598, + "step": 1484 + }, + { + "epoch": 0.20131498678234935, + "grad_norm": 
1.6673690515506954, + "learning_rate": 1.8500947252887759e-06, + "loss": 1.2413, + "step": 1485 + }, + { + "epoch": 0.20145055243001422, + "grad_norm": 1.6647313838463893, + "learning_rate": 1.8498633802112039e-06, + "loss": 1.2428, + "step": 1486 + }, + { + "epoch": 0.20158611807767912, + "grad_norm": 1.731607415711226, + "learning_rate": 1.849631871244187e-06, + "loss": 1.2397, + "step": 1487 + }, + { + "epoch": 0.201721683725344, + "grad_norm": 2.6541490527666567, + "learning_rate": 1.8494001984323706e-06, + "loss": 1.2085, + "step": 1488 + }, + { + "epoch": 0.2018572493730089, + "grad_norm": 1.773144079031115, + "learning_rate": 1.8491683618204307e-06, + "loss": 1.236, + "step": 1489 + }, + { + "epoch": 0.20199281502067376, + "grad_norm": 1.6392176274058283, + "learning_rate": 1.848936361453075e-06, + "loss": 1.2173, + "step": 1490 + }, + { + "epoch": 0.20212838066833863, + "grad_norm": 3.2899371762542473, + "learning_rate": 1.8487041973750434e-06, + "loss": 1.2379, + "step": 1491 + }, + { + "epoch": 0.20226394631600353, + "grad_norm": 2.347163676394992, + "learning_rate": 1.8484718696311063e-06, + "loss": 1.2637, + "step": 1492 + }, + { + "epoch": 0.2023995119636684, + "grad_norm": 2.6439932163578987, + "learning_rate": 1.8482393782660669e-06, + "loss": 1.2454, + "step": 1493 + }, + { + "epoch": 0.2025350776113333, + "grad_norm": 1.6830410490547583, + "learning_rate": 1.8480067233247584e-06, + "loss": 1.1858, + "step": 1494 + }, + { + "epoch": 0.20267064325899817, + "grad_norm": 1.9281910801538564, + "learning_rate": 1.8477739048520475e-06, + "loss": 1.1961, + "step": 1495 + }, + { + "epoch": 0.20280620890666304, + "grad_norm": 2.1135821683912432, + "learning_rate": 1.847540922892831e-06, + "loss": 1.2465, + "step": 1496 + }, + { + "epoch": 0.20294177455432794, + "grad_norm": 2.205918980979365, + "learning_rate": 1.8473077774920377e-06, + "loss": 1.2296, + "step": 1497 + }, + { + "epoch": 0.2030773402019928, + "grad_norm": 2.0002849685459636, + "learning_rate": 1.8470744686946276e-06, + "loss": 1.2161, + "step": 1498 + }, + { + "epoch": 0.2032129058496577, + "grad_norm": 2.307169558326574, + "learning_rate": 1.8468409965455924e-06, + "loss": 1.2155, + "step": 1499 + }, + { + "epoch": 0.20334847149732257, + "grad_norm": 1.5976426410129458, + "learning_rate": 1.8466073610899557e-06, + "loss": 1.2097, + "step": 1500 + }, + { + "epoch": 0.20348403714498747, + "grad_norm": 4.8690852724875935, + "learning_rate": 1.846373562372772e-06, + "loss": 1.2002, + "step": 1501 + }, + { + "epoch": 0.20361960279265234, + "grad_norm": 1.6990391325392835, + "learning_rate": 1.846139600439128e-06, + "loss": 1.273, + "step": 1502 + }, + { + "epoch": 0.2037551684403172, + "grad_norm": 1.6365264363104979, + "learning_rate": 1.845905475334141e-06, + "loss": 1.2457, + "step": 1503 + }, + { + "epoch": 0.2038907340879821, + "grad_norm": 2.345754306348919, + "learning_rate": 1.84567118710296e-06, + "loss": 1.2368, + "step": 1504 + }, + { + "epoch": 0.20402629973564698, + "grad_norm": 1.6719618549147712, + "learning_rate": 1.8454367357907663e-06, + "loss": 1.2779, + "step": 1505 + }, + { + "epoch": 0.20416186538331188, + "grad_norm": 2.253555832686218, + "learning_rate": 1.8452021214427713e-06, + "loss": 1.2377, + "step": 1506 + }, + { + "epoch": 0.20429743103097675, + "grad_norm": 2.5113474928829422, + "learning_rate": 1.8449673441042188e-06, + "loss": 1.2468, + "step": 1507 + }, + { + "epoch": 0.20443299667864162, + "grad_norm": 2.0177617160211727, + "learning_rate": 1.8447324038203838e-06, + "loss": 1.2585, + 
"step": 1508 + }, + { + "epoch": 0.20456856232630652, + "grad_norm": 1.9839348685339944, + "learning_rate": 1.8444973006365724e-06, + "loss": 1.2221, + "step": 1509 + }, + { + "epoch": 0.2047041279739714, + "grad_norm": 3.1089557299441575, + "learning_rate": 1.844262034598123e-06, + "loss": 1.2456, + "step": 1510 + }, + { + "epoch": 0.2048396936216363, + "grad_norm": 3.3728170532315223, + "learning_rate": 1.8440266057504044e-06, + "loss": 1.2357, + "step": 1511 + }, + { + "epoch": 0.20497525926930116, + "grad_norm": 1.571068870112634, + "learning_rate": 1.843791014138817e-06, + "loss": 1.1792, + "step": 1512 + }, + { + "epoch": 0.20511082491696603, + "grad_norm": 2.3402859886136795, + "learning_rate": 1.843555259808793e-06, + "loss": 1.2512, + "step": 1513 + }, + { + "epoch": 0.20524639056463093, + "grad_norm": 4.540455494550336, + "learning_rate": 1.8433193428057958e-06, + "loss": 1.2335, + "step": 1514 + }, + { + "epoch": 0.2053819562122958, + "grad_norm": 1.7243032624503525, + "learning_rate": 1.84308326317532e-06, + "loss": 1.2302, + "step": 1515 + }, + { + "epoch": 0.2055175218599607, + "grad_norm": 2.083516136098412, + "learning_rate": 1.842847020962892e-06, + "loss": 1.2277, + "step": 1516 + }, + { + "epoch": 0.20565308750762556, + "grad_norm": 5.6633473749555465, + "learning_rate": 1.842610616214069e-06, + "loss": 1.2498, + "step": 1517 + }, + { + "epoch": 0.20578865315529046, + "grad_norm": 2.7356586256693385, + "learning_rate": 1.8423740489744399e-06, + "loss": 1.2354, + "step": 1518 + }, + { + "epoch": 0.20592421880295533, + "grad_norm": 4.032944141138344, + "learning_rate": 1.8421373192896248e-06, + "loss": 1.2264, + "step": 1519 + }, + { + "epoch": 0.2060597844506202, + "grad_norm": 1.8220362352352324, + "learning_rate": 1.841900427205275e-06, + "loss": 1.2416, + "step": 1520 + }, + { + "epoch": 0.2061953500982851, + "grad_norm": 2.1639680009902453, + "learning_rate": 1.8416633727670732e-06, + "loss": 1.2262, + "step": 1521 + }, + { + "epoch": 0.20633091574594997, + "grad_norm": 2.796946182096788, + "learning_rate": 1.8414261560207337e-06, + "loss": 1.2422, + "step": 1522 + }, + { + "epoch": 0.20646648139361487, + "grad_norm": 2.2154615217506657, + "learning_rate": 1.8411887770120021e-06, + "loss": 1.2151, + "step": 1523 + }, + { + "epoch": 0.20660204704127974, + "grad_norm": 1.9844225421294477, + "learning_rate": 1.8409512357866546e-06, + "loss": 1.2463, + "step": 1524 + }, + { + "epoch": 0.2067376126889446, + "grad_norm": 1.9530332425762869, + "learning_rate": 1.8407135323904995e-06, + "loss": 1.1871, + "step": 1525 + }, + { + "epoch": 0.2068731783366095, + "grad_norm": 2.196517216537945, + "learning_rate": 1.8404756668693758e-06, + "loss": 1.1868, + "step": 1526 + }, + { + "epoch": 0.20700874398427438, + "grad_norm": 2.808843387395895, + "learning_rate": 1.8402376392691539e-06, + "loss": 1.2302, + "step": 1527 + }, + { + "epoch": 0.20714430963193928, + "grad_norm": 1.8037321139196023, + "learning_rate": 1.8399994496357359e-06, + "loss": 1.2435, + "step": 1528 + }, + { + "epoch": 0.20727987527960415, + "grad_norm": 3.801133080039018, + "learning_rate": 1.8397610980150544e-06, + "loss": 1.2335, + "step": 1529 + }, + { + "epoch": 0.20741544092726902, + "grad_norm": 3.9262097981413455, + "learning_rate": 1.8395225844530738e-06, + "loss": 1.209, + "step": 1530 + }, + { + "epoch": 0.20755100657493392, + "grad_norm": 1.896141486023541, + "learning_rate": 1.8392839089957897e-06, + "loss": 1.2297, + "step": 1531 + }, + { + "epoch": 0.2076865722225988, + "grad_norm": 
3.331949726821893, + "learning_rate": 1.8390450716892288e-06, + "loss": 1.2275, + "step": 1532 + }, + { + "epoch": 0.20782213787026368, + "grad_norm": 2.322312629940393, + "learning_rate": 1.8388060725794485e-06, + "loss": 1.2708, + "step": 1533 + }, + { + "epoch": 0.20795770351792855, + "grad_norm": 3.4358258123858723, + "learning_rate": 1.8385669117125385e-06, + "loss": 1.2191, + "step": 1534 + }, + { + "epoch": 0.20809326916559343, + "grad_norm": 2.31701019616967, + "learning_rate": 1.8383275891346186e-06, + "loss": 1.261, + "step": 1535 + }, + { + "epoch": 0.20822883481325832, + "grad_norm": 1.753988864328173, + "learning_rate": 1.8380881048918404e-06, + "loss": 1.2146, + "step": 1536 + }, + { + "epoch": 0.2083644004609232, + "grad_norm": 2.641608479597289, + "learning_rate": 1.837848459030387e-06, + "loss": 1.2046, + "step": 1537 + }, + { + "epoch": 0.2084999661085881, + "grad_norm": 2.2565171090375755, + "learning_rate": 1.8376086515964716e-06, + "loss": 1.2307, + "step": 1538 + }, + { + "epoch": 0.20863553175625296, + "grad_norm": 2.5163393771408598, + "learning_rate": 1.8373686826363397e-06, + "loss": 1.2228, + "step": 1539 + }, + { + "epoch": 0.20877109740391786, + "grad_norm": 1.9406955712461926, + "learning_rate": 1.837128552196267e-06, + "loss": 1.1936, + "step": 1540 + }, + { + "epoch": 0.20890666305158273, + "grad_norm": 1.8465452055677958, + "learning_rate": 1.8368882603225609e-06, + "loss": 1.2524, + "step": 1541 + }, + { + "epoch": 0.2090422286992476, + "grad_norm": 1.8782163403136807, + "learning_rate": 1.8366478070615596e-06, + "loss": 1.235, + "step": 1542 + }, + { + "epoch": 0.2091777943469125, + "grad_norm": 1.6617710471549823, + "learning_rate": 1.8364071924596328e-06, + "loss": 1.2329, + "step": 1543 + }, + { + "epoch": 0.20931335999457737, + "grad_norm": 1.8792007187385216, + "learning_rate": 1.8361664165631817e-06, + "loss": 1.2066, + "step": 1544 + }, + { + "epoch": 0.20944892564224227, + "grad_norm": 2.0303290806190732, + "learning_rate": 1.8359254794186368e-06, + "loss": 1.2279, + "step": 1545 + }, + { + "epoch": 0.20958449128990714, + "grad_norm": 1.7344827508185707, + "learning_rate": 1.835684381072462e-06, + "loss": 1.246, + "step": 1546 + }, + { + "epoch": 0.209720056937572, + "grad_norm": 1.7775065766133262, + "learning_rate": 1.8354431215711506e-06, + "loss": 1.1732, + "step": 1547 + }, + { + "epoch": 0.2098556225852369, + "grad_norm": 3.5040188207728855, + "learning_rate": 1.8352017009612276e-06, + "loss": 1.1941, + "step": 1548 + }, + { + "epoch": 0.20999118823290178, + "grad_norm": 2.2128998210845365, + "learning_rate": 1.8349601192892498e-06, + "loss": 1.2118, + "step": 1549 + }, + { + "epoch": 0.21012675388056667, + "grad_norm": 1.7002740916537666, + "learning_rate": 1.8347183766018033e-06, + "loss": 1.2369, + "step": 1550 + }, + { + "epoch": 0.21026231952823154, + "grad_norm": 2.9169279472508096, + "learning_rate": 1.8344764729455066e-06, + "loss": 1.2666, + "step": 1551 + }, + { + "epoch": 0.21039788517589642, + "grad_norm": 2.911779398277658, + "learning_rate": 1.8342344083670097e-06, + "loss": 1.2263, + "step": 1552 + }, + { + "epoch": 0.2105334508235613, + "grad_norm": 2.5772787566611592, + "learning_rate": 1.8339921829129916e-06, + "loss": 1.2266, + "step": 1553 + }, + { + "epoch": 0.21066901647122618, + "grad_norm": 2.4513429818347525, + "learning_rate": 1.8337497966301645e-06, + "loss": 1.2595, + "step": 1554 + }, + { + "epoch": 0.21080458211889108, + "grad_norm": 2.18131939994078, + "learning_rate": 1.8335072495652702e-06, + "loss": 1.1545, 
+ "step": 1555 + }, + { + "epoch": 0.21094014776655595, + "grad_norm": 1.7573853996108497, + "learning_rate": 1.8332645417650822e-06, + "loss": 1.2434, + "step": 1556 + }, + { + "epoch": 0.21107571341422085, + "grad_norm": 2.3688698008193376, + "learning_rate": 1.8330216732764049e-06, + "loss": 1.2308, + "step": 1557 + }, + { + "epoch": 0.21121127906188572, + "grad_norm": 2.1158772410382967, + "learning_rate": 1.832778644146073e-06, + "loss": 1.2179, + "step": 1558 + }, + { + "epoch": 0.2113468447095506, + "grad_norm": 2.0934354568878115, + "learning_rate": 1.8325354544209532e-06, + "loss": 1.1834, + "step": 1559 + }, + { + "epoch": 0.2114824103572155, + "grad_norm": 2.090335406399481, + "learning_rate": 1.832292104147943e-06, + "loss": 1.2122, + "step": 1560 + }, + { + "epoch": 0.21161797600488036, + "grad_norm": 1.8221538549267577, + "learning_rate": 1.8320485933739697e-06, + "loss": 1.2534, + "step": 1561 + }, + { + "epoch": 0.21175354165254526, + "grad_norm": 2.140022768721683, + "learning_rate": 1.8318049221459932e-06, + "loss": 1.2178, + "step": 1562 + }, + { + "epoch": 0.21188910730021013, + "grad_norm": 1.7680895167972566, + "learning_rate": 1.8315610905110032e-06, + "loss": 1.241, + "step": 1563 + }, + { + "epoch": 0.212024672947875, + "grad_norm": 1.8952480366264837, + "learning_rate": 1.8313170985160213e-06, + "loss": 1.2223, + "step": 1564 + }, + { + "epoch": 0.2121602385955399, + "grad_norm": 1.8716248479883888, + "learning_rate": 1.8310729462080987e-06, + "loss": 1.2563, + "step": 1565 + }, + { + "epoch": 0.21229580424320477, + "grad_norm": 4.034880007002983, + "learning_rate": 1.8308286336343183e-06, + "loss": 1.2369, + "step": 1566 + }, + { + "epoch": 0.21243136989086966, + "grad_norm": 1.672030984458911, + "learning_rate": 1.8305841608417945e-06, + "loss": 1.2089, + "step": 1567 + }, + { + "epoch": 0.21256693553853453, + "grad_norm": 1.8597341903255542, + "learning_rate": 1.8303395278776712e-06, + "loss": 1.2484, + "step": 1568 + }, + { + "epoch": 0.2127025011861994, + "grad_norm": 1.8220803562346692, + "learning_rate": 1.830094734789124e-06, + "loss": 1.2438, + "step": 1569 + }, + { + "epoch": 0.2128380668338643, + "grad_norm": 1.7507920821546223, + "learning_rate": 1.82984978162336e-06, + "loss": 1.2523, + "step": 1570 + }, + { + "epoch": 0.21297363248152917, + "grad_norm": 1.7936254199696389, + "learning_rate": 1.8296046684276161e-06, + "loss": 1.2606, + "step": 1571 + }, + { + "epoch": 0.21310919812919407, + "grad_norm": 1.876560175228161, + "learning_rate": 1.8293593952491602e-06, + "loss": 1.2139, + "step": 1572 + }, + { + "epoch": 0.21324476377685894, + "grad_norm": 2.2316182412119807, + "learning_rate": 1.8291139621352913e-06, + "loss": 1.2001, + "step": 1573 + }, + { + "epoch": 0.2133803294245238, + "grad_norm": 1.677965833849725, + "learning_rate": 1.8288683691333398e-06, + "loss": 1.2119, + "step": 1574 + }, + { + "epoch": 0.2135158950721887, + "grad_norm": 1.8259410965638891, + "learning_rate": 1.8286226162906657e-06, + "loss": 1.2248, + "step": 1575 + }, + { + "epoch": 0.21365146071985358, + "grad_norm": 2.3803144076120395, + "learning_rate": 1.8283767036546612e-06, + "loss": 1.2697, + "step": 1576 + }, + { + "epoch": 0.21378702636751848, + "grad_norm": 1.9435579499586995, + "learning_rate": 1.8281306312727477e-06, + "loss": 1.2535, + "step": 1577 + }, + { + "epoch": 0.21392259201518335, + "grad_norm": 1.8276166653774233, + "learning_rate": 1.8278843991923791e-06, + "loss": 1.2576, + "step": 1578 + }, + { + "epoch": 0.21405815766284825, + "grad_norm": 
1.999270629488657, + "learning_rate": 1.8276380074610392e-06, + "loss": 1.1875, + "step": 1579 + }, + { + "epoch": 0.21419372331051312, + "grad_norm": 1.8093394272556758, + "learning_rate": 1.8273914561262422e-06, + "loss": 1.2432, + "step": 1580 + }, + { + "epoch": 0.214329288958178, + "grad_norm": 1.6993062771352767, + "learning_rate": 1.8271447452355343e-06, + "loss": 1.2315, + "step": 1581 + }, + { + "epoch": 0.2144648546058429, + "grad_norm": 1.998915144015159, + "learning_rate": 1.826897874836491e-06, + "loss": 1.2488, + "step": 1582 + }, + { + "epoch": 0.21460042025350776, + "grad_norm": 3.6425715327915373, + "learning_rate": 1.8266508449767196e-06, + "loss": 1.217, + "step": 1583 + }, + { + "epoch": 0.21473598590117265, + "grad_norm": 2.3890441273106227, + "learning_rate": 1.8264036557038581e-06, + "loss": 1.2302, + "step": 1584 + }, + { + "epoch": 0.21487155154883752, + "grad_norm": 2.107781673148063, + "learning_rate": 1.826156307065575e-06, + "loss": 1.2188, + "step": 1585 + }, + { + "epoch": 0.2150071171965024, + "grad_norm": 4.150412169490622, + "learning_rate": 1.8259087991095692e-06, + "loss": 1.224, + "step": 1586 + }, + { + "epoch": 0.2151426828441673, + "grad_norm": 1.5598360198985148, + "learning_rate": 1.8256611318835709e-06, + "loss": 1.2023, + "step": 1587 + }, + { + "epoch": 0.21527824849183216, + "grad_norm": 1.7429227631204978, + "learning_rate": 1.8254133054353406e-06, + "loss": 1.198, + "step": 1588 + }, + { + "epoch": 0.21541381413949706, + "grad_norm": 3.521582192134665, + "learning_rate": 1.8251653198126697e-06, + "loss": 1.2128, + "step": 1589 + }, + { + "epoch": 0.21554937978716193, + "grad_norm": 1.5532383559593292, + "learning_rate": 1.8249171750633808e-06, + "loss": 1.1649, + "step": 1590 + }, + { + "epoch": 0.2156849454348268, + "grad_norm": 1.8862352601235688, + "learning_rate": 1.8246688712353256e-06, + "loss": 1.242, + "step": 1591 + }, + { + "epoch": 0.2158205110824917, + "grad_norm": 1.7359864559962264, + "learning_rate": 1.8244204083763886e-06, + "loss": 1.2355, + "step": 1592 + }, + { + "epoch": 0.21595607673015657, + "grad_norm": 2.3834491954078567, + "learning_rate": 1.824171786534483e-06, + "loss": 1.2149, + "step": 1593 + }, + { + "epoch": 0.21609164237782147, + "grad_norm": 2.1149875910065052, + "learning_rate": 1.823923005757554e-06, + "loss": 1.2296, + "step": 1594 + }, + { + "epoch": 0.21622720802548634, + "grad_norm": 1.6031657450131804, + "learning_rate": 1.8236740660935772e-06, + "loss": 1.1885, + "step": 1595 + }, + { + "epoch": 0.2163627736731512, + "grad_norm": 4.912848830749999, + "learning_rate": 1.8234249675905584e-06, + "loss": 1.2673, + "step": 1596 + }, + { + "epoch": 0.2164983393208161, + "grad_norm": 1.5896760854947578, + "learning_rate": 1.8231757102965343e-06, + "loss": 1.2093, + "step": 1597 + }, + { + "epoch": 0.21663390496848098, + "grad_norm": 2.9892977833635155, + "learning_rate": 1.8229262942595724e-06, + "loss": 1.2181, + "step": 1598 + }, + { + "epoch": 0.21676947061614588, + "grad_norm": 2.092691860020802, + "learning_rate": 1.8226767195277702e-06, + "loss": 1.2244, + "step": 1599 + }, + { + "epoch": 0.21690503626381075, + "grad_norm": 2.906247921957392, + "learning_rate": 1.8224269861492565e-06, + "loss": 1.2158, + "step": 1600 + }, + { + "epoch": 0.21704060191147564, + "grad_norm": 1.9123422940973551, + "learning_rate": 1.8221770941721904e-06, + "loss": 1.2162, + "step": 1601 + }, + { + "epoch": 0.21717616755914051, + "grad_norm": 3.323369126941001, + "learning_rate": 1.8219270436447615e-06, + "loss": 1.2258, + 
"step": 1602 + }, + { + "epoch": 0.21731173320680539, + "grad_norm": 2.236600206768039, + "learning_rate": 1.8216768346151904e-06, + "loss": 1.2147, + "step": 1603 + }, + { + "epoch": 0.21744729885447028, + "grad_norm": 1.7044004239352908, + "learning_rate": 1.8214264671317272e-06, + "loss": 1.198, + "step": 1604 + }, + { + "epoch": 0.21758286450213515, + "grad_norm": 2.1425779907116236, + "learning_rate": 1.821175941242654e-06, + "loss": 1.2365, + "step": 1605 + }, + { + "epoch": 0.21771843014980005, + "grad_norm": 1.5452992492651696, + "learning_rate": 1.8209252569962828e-06, + "loss": 1.2532, + "step": 1606 + }, + { + "epoch": 0.21785399579746492, + "grad_norm": 2.167535028325463, + "learning_rate": 1.8206744144409553e-06, + "loss": 1.2287, + "step": 1607 + }, + { + "epoch": 0.2179895614451298, + "grad_norm": 2.363263532476731, + "learning_rate": 1.8204234136250452e-06, + "loss": 1.246, + "step": 1608 + }, + { + "epoch": 0.2181251270927947, + "grad_norm": 3.84392758961811, + "learning_rate": 1.8201722545969557e-06, + "loss": 1.2142, + "step": 1609 + }, + { + "epoch": 0.21826069274045956, + "grad_norm": 3.57812967631636, + "learning_rate": 1.8199209374051212e-06, + "loss": 1.2082, + "step": 1610 + }, + { + "epoch": 0.21839625838812446, + "grad_norm": 10.363720287496543, + "learning_rate": 1.8196694620980058e-06, + "loss": 1.2251, + "step": 1611 + }, + { + "epoch": 0.21853182403578933, + "grad_norm": 2.2432505681135, + "learning_rate": 1.8194178287241047e-06, + "loss": 1.2468, + "step": 1612 + }, + { + "epoch": 0.2186673896834542, + "grad_norm": 1.9654140914069222, + "learning_rate": 1.8191660373319433e-06, + "loss": 1.2192, + "step": 1613 + }, + { + "epoch": 0.2188029553311191, + "grad_norm": 2.210268113670963, + "learning_rate": 1.8189140879700779e-06, + "loss": 1.1928, + "step": 1614 + }, + { + "epoch": 0.21893852097878397, + "grad_norm": 2.4518671974357424, + "learning_rate": 1.818661980687095e-06, + "loss": 1.2092, + "step": 1615 + }, + { + "epoch": 0.21907408662644887, + "grad_norm": 2.304254135925195, + "learning_rate": 1.8184097155316108e-06, + "loss": 1.2525, + "step": 1616 + }, + { + "epoch": 0.21920965227411374, + "grad_norm": 3.047945058728287, + "learning_rate": 1.8181572925522732e-06, + "loss": 1.2305, + "step": 1617 + }, + { + "epoch": 0.21934521792177863, + "grad_norm": 1.8008639188454099, + "learning_rate": 1.81790471179776e-06, + "loss": 1.1937, + "step": 1618 + }, + { + "epoch": 0.2194807835694435, + "grad_norm": 3.109497173292771, + "learning_rate": 1.8176519733167792e-06, + "loss": 1.2028, + "step": 1619 + }, + { + "epoch": 0.21961634921710838, + "grad_norm": 1.8427294952051576, + "learning_rate": 1.8173990771580694e-06, + "loss": 1.2268, + "step": 1620 + }, + { + "epoch": 0.21975191486477327, + "grad_norm": 4.464701938569533, + "learning_rate": 1.8171460233704e-06, + "loss": 1.2347, + "step": 1621 + }, + { + "epoch": 0.21988748051243814, + "grad_norm": 2.1521893775758656, + "learning_rate": 1.8168928120025698e-06, + "loss": 1.231, + "step": 1622 + }, + { + "epoch": 0.22002304616010304, + "grad_norm": 1.7986123953829776, + "learning_rate": 1.816639443103409e-06, + "loss": 1.247, + "step": 1623 + }, + { + "epoch": 0.2201586118077679, + "grad_norm": 2.67063233917694, + "learning_rate": 1.8163859167217778e-06, + "loss": 1.2487, + "step": 1624 + }, + { + "epoch": 0.22029417745543278, + "grad_norm": 2.201355176621884, + "learning_rate": 1.816132232906567e-06, + "loss": 1.2073, + "step": 1625 + }, + { + "epoch": 0.22042974310309768, + "grad_norm": 2.4471251251766435, + 
"learning_rate": 1.815878391706697e-06, + "loss": 1.2051, + "step": 1626 + }, + { + "epoch": 0.22056530875076255, + "grad_norm": 2.3059630391185832, + "learning_rate": 1.8156243931711194e-06, + "loss": 1.2042, + "step": 1627 + }, + { + "epoch": 0.22070087439842745, + "grad_norm": 2.030898458146953, + "learning_rate": 1.8153702373488157e-06, + "loss": 1.2029, + "step": 1628 + }, + { + "epoch": 0.22083644004609232, + "grad_norm": 2.033831322562416, + "learning_rate": 1.815115924288798e-06, + "loss": 1.2224, + "step": 1629 + }, + { + "epoch": 0.2209720056937572, + "grad_norm": 2.3314235071547316, + "learning_rate": 1.8148614540401082e-06, + "loss": 1.2008, + "step": 1630 + }, + { + "epoch": 0.2211075713414221, + "grad_norm": 1.8270523721783505, + "learning_rate": 1.8146068266518193e-06, + "loss": 1.2028, + "step": 1631 + }, + { + "epoch": 0.22124313698908696, + "grad_norm": 2.055886265598465, + "learning_rate": 1.8143520421730338e-06, + "loss": 1.2343, + "step": 1632 + }, + { + "epoch": 0.22137870263675186, + "grad_norm": 1.9342763928619409, + "learning_rate": 1.8140971006528854e-06, + "loss": 1.2295, + "step": 1633 + }, + { + "epoch": 0.22151426828441673, + "grad_norm": 1.8908036365552194, + "learning_rate": 1.8138420021405367e-06, + "loss": 1.1983, + "step": 1634 + }, + { + "epoch": 0.2216498339320816, + "grad_norm": 2.6316042589029114, + "learning_rate": 1.8135867466851824e-06, + "loss": 1.2494, + "step": 1635 + }, + { + "epoch": 0.2217853995797465, + "grad_norm": 2.2942866412997125, + "learning_rate": 1.813331334336046e-06, + "loss": 1.2038, + "step": 1636 + }, + { + "epoch": 0.22192096522741137, + "grad_norm": 1.972104769025066, + "learning_rate": 1.8130757651423817e-06, + "loss": 1.205, + "step": 1637 + }, + { + "epoch": 0.22205653087507626, + "grad_norm": 1.7534912412057828, + "learning_rate": 1.812820039153474e-06, + "loss": 1.1777, + "step": 1638 + }, + { + "epoch": 0.22219209652274113, + "grad_norm": 1.6483571089152242, + "learning_rate": 1.812564156418638e-06, + "loss": 1.2133, + "step": 1639 + }, + { + "epoch": 0.22232766217040603, + "grad_norm": 1.8920512838995667, + "learning_rate": 1.8123081169872184e-06, + "loss": 1.2602, + "step": 1640 + }, + { + "epoch": 0.2224632278180709, + "grad_norm": 1.6778700793473054, + "learning_rate": 1.8120519209085905e-06, + "loss": 1.2094, + "step": 1641 + }, + { + "epoch": 0.22259879346573577, + "grad_norm": 2.115399911917058, + "learning_rate": 1.8117955682321594e-06, + "loss": 1.2274, + "step": 1642 + }, + { + "epoch": 0.22273435911340067, + "grad_norm": 2.191589986719069, + "learning_rate": 1.811539059007361e-06, + "loss": 1.2114, + "step": 1643 + }, + { + "epoch": 0.22286992476106554, + "grad_norm": 2.0893379286165743, + "learning_rate": 1.8112823932836609e-06, + "loss": 1.2139, + "step": 1644 + }, + { + "epoch": 0.22300549040873044, + "grad_norm": 2.8213999312462184, + "learning_rate": 1.8110255711105552e-06, + "loss": 1.2315, + "step": 1645 + }, + { + "epoch": 0.2231410560563953, + "grad_norm": 2.6945635545251596, + "learning_rate": 1.81076859253757e-06, + "loss": 1.1931, + "step": 1646 + }, + { + "epoch": 0.22327662170406018, + "grad_norm": 2.4361251424839976, + "learning_rate": 1.8105114576142615e-06, + "loss": 1.2666, + "step": 1647 + }, + { + "epoch": 0.22341218735172508, + "grad_norm": 43.442693879817604, + "learning_rate": 1.810254166390216e-06, + "loss": 1.2294, + "step": 1648 + }, + { + "epoch": 0.22354775299938995, + "grad_norm": 1.8756723697245736, + "learning_rate": 1.8099967189150505e-06, + "loss": 1.2199, + "step": 1649 + }, + 
{ + "epoch": 0.22368331864705485, + "grad_norm": 2.14815085211523, + "learning_rate": 1.8097391152384113e-06, + "loss": 1.2288, + "step": 1650 + }, + { + "epoch": 0.22381888429471972, + "grad_norm": 1.7608061599616647, + "learning_rate": 1.8094813554099754e-06, + "loss": 1.2298, + "step": 1651 + }, + { + "epoch": 0.2239544499423846, + "grad_norm": 1.613470764609859, + "learning_rate": 1.80922343947945e-06, + "loss": 1.2121, + "step": 1652 + }, + { + "epoch": 0.22409001559004949, + "grad_norm": 2.69989183914245, + "learning_rate": 1.808965367496572e-06, + "loss": 1.205, + "step": 1653 + }, + { + "epoch": 0.22422558123771436, + "grad_norm": 1.9688944887542812, + "learning_rate": 1.808707139511108e-06, + "loss": 1.2552, + "step": 1654 + }, + { + "epoch": 0.22436114688537925, + "grad_norm": 1.787262774090861, + "learning_rate": 1.808448755572856e-06, + "loss": 1.2313, + "step": 1655 + }, + { + "epoch": 0.22449671253304412, + "grad_norm": 1.6999305009751313, + "learning_rate": 1.808190215731643e-06, + "loss": 1.2234, + "step": 1656 + }, + { + "epoch": 0.22463227818070902, + "grad_norm": 1.7578539976179455, + "learning_rate": 1.8079315200373265e-06, + "loss": 1.2446, + "step": 1657 + }, + { + "epoch": 0.2247678438283739, + "grad_norm": 1.960402409432757, + "learning_rate": 1.8076726685397934e-06, + "loss": 1.2707, + "step": 1658 + }, + { + "epoch": 0.22490340947603876, + "grad_norm": 1.647488714189101, + "learning_rate": 1.8074136612889619e-06, + "loss": 1.2145, + "step": 1659 + }, + { + "epoch": 0.22503897512370366, + "grad_norm": 1.946600856482562, + "learning_rate": 1.8071544983347791e-06, + "loss": 1.2438, + "step": 1660 + }, + { + "epoch": 0.22517454077136853, + "grad_norm": 2.3172044624465826, + "learning_rate": 1.8068951797272222e-06, + "loss": 1.2164, + "step": 1661 + }, + { + "epoch": 0.22531010641903343, + "grad_norm": 1.7890403630344356, + "learning_rate": 1.8066357055162994e-06, + "loss": 1.2254, + "step": 1662 + }, + { + "epoch": 0.2254456720666983, + "grad_norm": 2.2291065913693875, + "learning_rate": 1.8063760757520483e-06, + "loss": 1.2144, + "step": 1663 + }, + { + "epoch": 0.22558123771436317, + "grad_norm": 2.3026423483807523, + "learning_rate": 1.8061162904845356e-06, + "loss": 1.1819, + "step": 1664 + }, + { + "epoch": 0.22571680336202807, + "grad_norm": 1.865219830779375, + "learning_rate": 1.80585634976386e-06, + "loss": 1.2185, + "step": 1665 + }, + { + "epoch": 0.22585236900969294, + "grad_norm": 1.976248086712234, + "learning_rate": 1.8055962536401479e-06, + "loss": 1.2418, + "step": 1666 + }, + { + "epoch": 0.22598793465735784, + "grad_norm": 1.949720629339714, + "learning_rate": 1.8053360021635572e-06, + "loss": 1.2434, + "step": 1667 + }, + { + "epoch": 0.2261235003050227, + "grad_norm": 2.34551020454707, + "learning_rate": 1.8050755953842757e-06, + "loss": 1.2511, + "step": 1668 + }, + { + "epoch": 0.22625906595268758, + "grad_norm": 1.6498528079976893, + "learning_rate": 1.8048150333525206e-06, + "loss": 1.2142, + "step": 1669 + }, + { + "epoch": 0.22639463160035248, + "grad_norm": 2.4666169497521557, + "learning_rate": 1.8045543161185388e-06, + "loss": 1.2358, + "step": 1670 + }, + { + "epoch": 0.22653019724801735, + "grad_norm": 2.1200473546178067, + "learning_rate": 1.8042934437326082e-06, + "loss": 1.2247, + "step": 1671 + }, + { + "epoch": 0.22666576289568224, + "grad_norm": 2.1791532418651673, + "learning_rate": 1.8040324162450355e-06, + "loss": 1.2275, + "step": 1672 + }, + { + "epoch": 0.2268013285433471, + "grad_norm": 1.7753749968333419, + 
"learning_rate": 1.8037712337061582e-06, + "loss": 1.2484, + "step": 1673 + }, + { + "epoch": 0.22693689419101198, + "grad_norm": 2.7275477089526747, + "learning_rate": 1.803509896166343e-06, + "loss": 1.2487, + "step": 1674 + }, + { + "epoch": 0.22707245983867688, + "grad_norm": 1.852821235081209, + "learning_rate": 1.8032484036759866e-06, + "loss": 1.2017, + "step": 1675 + }, + { + "epoch": 0.22720802548634175, + "grad_norm": 2.437506682380557, + "learning_rate": 1.8029867562855161e-06, + "loss": 1.2058, + "step": 1676 + }, + { + "epoch": 0.22734359113400665, + "grad_norm": 3.608087502045267, + "learning_rate": 1.8027249540453878e-06, + "loss": 1.2123, + "step": 1677 + }, + { + "epoch": 0.22747915678167152, + "grad_norm": 2.1151459892811033, + "learning_rate": 1.802462997006089e-06, + "loss": 1.2689, + "step": 1678 + }, + { + "epoch": 0.22761472242933642, + "grad_norm": 4.103457187451408, + "learning_rate": 1.8022008852181351e-06, + "loss": 1.2361, + "step": 1679 + }, + { + "epoch": 0.2277502880770013, + "grad_norm": 2.589435323052379, + "learning_rate": 1.801938618732073e-06, + "loss": 1.2196, + "step": 1680 + }, + { + "epoch": 0.22788585372466616, + "grad_norm": 2.311996064253026, + "learning_rate": 1.801676197598478e-06, + "loss": 1.2251, + "step": 1681 + }, + { + "epoch": 0.22802141937233106, + "grad_norm": 1.8058351828976018, + "learning_rate": 1.8014136218679566e-06, + "loss": 1.2531, + "step": 1682 + }, + { + "epoch": 0.22815698501999593, + "grad_norm": 1.8560590674686666, + "learning_rate": 1.8011508915911441e-06, + "loss": 1.1735, + "step": 1683 + }, + { + "epoch": 0.22829255066766083, + "grad_norm": 4.473886138093167, + "learning_rate": 1.800888006818706e-06, + "loss": 1.2278, + "step": 1684 + }, + { + "epoch": 0.2284281163153257, + "grad_norm": 1.7118184627425872, + "learning_rate": 1.8006249676013377e-06, + "loss": 1.231, + "step": 1685 + }, + { + "epoch": 0.22856368196299057, + "grad_norm": 1.7393038190313483, + "learning_rate": 1.8003617739897642e-06, + "loss": 1.1805, + "step": 1686 + }, + { + "epoch": 0.22869924761065547, + "grad_norm": 2.0286960147799715, + "learning_rate": 1.8000984260347401e-06, + "loss": 1.2217, + "step": 1687 + }, + { + "epoch": 0.22883481325832034, + "grad_norm": 1.6856459969802307, + "learning_rate": 1.7998349237870506e-06, + "loss": 1.209, + "step": 1688 + }, + { + "epoch": 0.22897037890598523, + "grad_norm": 1.8167464616743634, + "learning_rate": 1.7995712672975088e-06, + "loss": 1.2273, + "step": 1689 + }, + { + "epoch": 0.2291059445536501, + "grad_norm": 2.3996578989588433, + "learning_rate": 1.79930745661696e-06, + "loss": 1.228, + "step": 1690 + }, + { + "epoch": 0.22924151020131497, + "grad_norm": 2.5282542196362123, + "learning_rate": 1.7990434917962776e-06, + "loss": 1.2312, + "step": 1691 + }, + { + "epoch": 0.22937707584897987, + "grad_norm": 1.9604944728984535, + "learning_rate": 1.7987793728863649e-06, + "loss": 1.1939, + "step": 1692 + }, + { + "epoch": 0.22951264149664474, + "grad_norm": 1.9886215642355671, + "learning_rate": 1.7985150999381553e-06, + "loss": 1.202, + "step": 1693 + }, + { + "epoch": 0.22964820714430964, + "grad_norm": 3.435788071826878, + "learning_rate": 1.798250673002612e-06, + "loss": 1.2066, + "step": 1694 + }, + { + "epoch": 0.2297837727919745, + "grad_norm": 1.5837382600384915, + "learning_rate": 1.797986092130727e-06, + "loss": 1.2285, + "step": 1695 + }, + { + "epoch": 0.2299193384396394, + "grad_norm": 2.374979807470129, + "learning_rate": 1.7977213573735234e-06, + "loss": 1.172, + "step": 1696 + }, + { + 
"epoch": 0.23005490408730428, + "grad_norm": 2.9319357604031366, + "learning_rate": 1.7974564687820526e-06, + "loss": 1.2505, + "step": 1697 + }, + { + "epoch": 0.23019046973496915, + "grad_norm": 2.290574505520387, + "learning_rate": 1.7971914264073967e-06, + "loss": 1.2352, + "step": 1698 + }, + { + "epoch": 0.23032603538263405, + "grad_norm": 2.20613494233487, + "learning_rate": 1.796926230300667e-06, + "loss": 1.2275, + "step": 1699 + }, + { + "epoch": 0.23046160103029892, + "grad_norm": 1.8410387728425028, + "learning_rate": 1.7966608805130043e-06, + "loss": 1.2062, + "step": 1700 + }, + { + "epoch": 0.23059716667796382, + "grad_norm": 2.0124298439869706, + "learning_rate": 1.7963953770955791e-06, + "loss": 1.1919, + "step": 1701 + }, + { + "epoch": 0.2307327323256287, + "grad_norm": 2.060969208075314, + "learning_rate": 1.7961297200995917e-06, + "loss": 1.1906, + "step": 1702 + }, + { + "epoch": 0.23086829797329356, + "grad_norm": 1.8790727212711922, + "learning_rate": 1.7958639095762722e-06, + "loss": 1.187, + "step": 1703 + }, + { + "epoch": 0.23100386362095846, + "grad_norm": 2.1484707028886967, + "learning_rate": 1.79559794557688e-06, + "loss": 1.2204, + "step": 1704 + }, + { + "epoch": 0.23113942926862333, + "grad_norm": 2.9309946793290376, + "learning_rate": 1.795331828152704e-06, + "loss": 1.1847, + "step": 1705 + }, + { + "epoch": 0.23127499491628822, + "grad_norm": 2.1846093566120874, + "learning_rate": 1.7950655573550627e-06, + "loss": 1.2375, + "step": 1706 + }, + { + "epoch": 0.2314105605639531, + "grad_norm": 2.0491753693760564, + "learning_rate": 1.7947991332353048e-06, + "loss": 1.2388, + "step": 1707 + }, + { + "epoch": 0.23154612621161796, + "grad_norm": 1.6218436393604738, + "learning_rate": 1.7945325558448078e-06, + "loss": 1.1886, + "step": 1708 + }, + { + "epoch": 0.23168169185928286, + "grad_norm": 2.2308445586552, + "learning_rate": 1.7942658252349787e-06, + "loss": 1.2074, + "step": 1709 + }, + { + "epoch": 0.23181725750694773, + "grad_norm": 3.6376905490835703, + "learning_rate": 1.7939989414572552e-06, + "loss": 1.2138, + "step": 1710 + }, + { + "epoch": 0.23195282315461263, + "grad_norm": 1.9514173853005117, + "learning_rate": 1.7937319045631032e-06, + "loss": 1.2035, + "step": 1711 + }, + { + "epoch": 0.2320883888022775, + "grad_norm": 4.7560171652967735, + "learning_rate": 1.7934647146040185e-06, + "loss": 1.1997, + "step": 1712 + }, + { + "epoch": 0.23222395444994237, + "grad_norm": 3.0289140009530224, + "learning_rate": 1.793197371631527e-06, + "loss": 1.1906, + "step": 1713 + }, + { + "epoch": 0.23235952009760727, + "grad_norm": 2.0273438705628677, + "learning_rate": 1.7929298756971836e-06, + "loss": 1.2391, + "step": 1714 + }, + { + "epoch": 0.23249508574527214, + "grad_norm": 1.7101337706841748, + "learning_rate": 1.7926622268525725e-06, + "loss": 1.2092, + "step": 1715 + }, + { + "epoch": 0.23263065139293704, + "grad_norm": 2.889853948020089, + "learning_rate": 1.792394425149308e-06, + "loss": 1.1861, + "step": 1716 + }, + { + "epoch": 0.2327662170406019, + "grad_norm": 2.0176222039813725, + "learning_rate": 1.792126470639033e-06, + "loss": 1.2085, + "step": 1717 + }, + { + "epoch": 0.2329017826882668, + "grad_norm": 2.298152986356252, + "learning_rate": 1.7918583633734212e-06, + "loss": 1.1889, + "step": 1718 + }, + { + "epoch": 0.23303734833593168, + "grad_norm": 1.79462806464228, + "learning_rate": 1.7915901034041744e-06, + "loss": 1.2209, + "step": 1719 + }, + { + "epoch": 0.23317291398359655, + "grad_norm": 2.112724718753545, + "learning_rate": 
1.7913216907830248e-06, + "loss": 1.2353, + "step": 1720 + }, + { + "epoch": 0.23330847963126145, + "grad_norm": 1.812081646786715, + "learning_rate": 1.7910531255617332e-06, + "loss": 1.2159, + "step": 1721 + }, + { + "epoch": 0.23344404527892632, + "grad_norm": 1.9007486534492724, + "learning_rate": 1.7907844077920905e-06, + "loss": 1.2597, + "step": 1722 + }, + { + "epoch": 0.2335796109265912, + "grad_norm": 1.8814685658971446, + "learning_rate": 1.790515537525917e-06, + "loss": 1.2176, + "step": 1723 + }, + { + "epoch": 0.23371517657425608, + "grad_norm": 1.952218218369193, + "learning_rate": 1.7902465148150623e-06, + "loss": 1.1954, + "step": 1724 + }, + { + "epoch": 0.23385074222192095, + "grad_norm": 3.045192339892041, + "learning_rate": 1.7899773397114046e-06, + "loss": 1.2169, + "step": 1725 + }, + { + "epoch": 0.23398630786958585, + "grad_norm": 2.0200646455448656, + "learning_rate": 1.789708012266853e-06, + "loss": 1.2456, + "step": 1726 + }, + { + "epoch": 0.23412187351725072, + "grad_norm": 3.5313315104336787, + "learning_rate": 1.7894385325333444e-06, + "loss": 1.1846, + "step": 1727 + }, + { + "epoch": 0.23425743916491562, + "grad_norm": 2.0067396929374013, + "learning_rate": 1.7891689005628466e-06, + "loss": 1.2438, + "step": 1728 + }, + { + "epoch": 0.2343930048125805, + "grad_norm": 3.09648820794417, + "learning_rate": 1.7888991164073554e-06, + "loss": 1.2385, + "step": 1729 + }, + { + "epoch": 0.23452857046024536, + "grad_norm": 1.8527098625499323, + "learning_rate": 1.7886291801188968e-06, + "loss": 1.245, + "step": 1730 + }, + { + "epoch": 0.23466413610791026, + "grad_norm": 2.03693790128315, + "learning_rate": 1.788359091749526e-06, + "loss": 1.2785, + "step": 1731 + }, + { + "epoch": 0.23479970175557513, + "grad_norm": 2.255674813648073, + "learning_rate": 1.7880888513513272e-06, + "loss": 1.2402, + "step": 1732 + }, + { + "epoch": 0.23493526740324003, + "grad_norm": 2.172121348692921, + "learning_rate": 1.7878184589764142e-06, + "loss": 1.2523, + "step": 1733 + }, + { + "epoch": 0.2350708330509049, + "grad_norm": 1.8827688116055237, + "learning_rate": 1.7875479146769303e-06, + "loss": 1.2006, + "step": 1734 + }, + { + "epoch": 0.23520639869856977, + "grad_norm": 1.9066402945004794, + "learning_rate": 1.7872772185050474e-06, + "loss": 1.216, + "step": 1735 + }, + { + "epoch": 0.23534196434623467, + "grad_norm": 1.6176642533381071, + "learning_rate": 1.7870063705129672e-06, + "loss": 1.2253, + "step": 1736 + }, + { + "epoch": 0.23547752999389954, + "grad_norm": 1.7693499404451594, + "learning_rate": 1.786735370752921e-06, + "loss": 1.2446, + "step": 1737 + }, + { + "epoch": 0.23561309564156444, + "grad_norm": 1.8484908215174158, + "learning_rate": 1.7864642192771683e-06, + "loss": 1.2293, + "step": 1738 + }, + { + "epoch": 0.2357486612892293, + "grad_norm": 2.7143201384214466, + "learning_rate": 1.786192916137999e-06, + "loss": 1.2199, + "step": 1739 + }, + { + "epoch": 0.2358842269368942, + "grad_norm": 2.162787491365201, + "learning_rate": 1.7859214613877316e-06, + "loss": 1.1867, + "step": 1740 + }, + { + "epoch": 0.23601979258455907, + "grad_norm": 2.016061191761289, + "learning_rate": 1.7856498550787141e-06, + "loss": 1.2088, + "step": 1741 + }, + { + "epoch": 0.23615535823222394, + "grad_norm": 2.3062982001431305, + "learning_rate": 1.7853780972633239e-06, + "loss": 1.1908, + "step": 1742 + }, + { + "epoch": 0.23629092387988884, + "grad_norm": 1.9927303871939965, + "learning_rate": 1.7851061879939669e-06, + "loss": 1.2269, + "step": 1743 + }, + { + "epoch": 
0.2364264895275537, + "grad_norm": 1.8109703614978818, + "learning_rate": 1.7848341273230786e-06, + "loss": 1.2483, + "step": 1744 + }, + { + "epoch": 0.2365620551752186, + "grad_norm": 3.0059426266512927, + "learning_rate": 1.784561915303124e-06, + "loss": 1.1956, + "step": 1745 + }, + { + "epoch": 0.23669762082288348, + "grad_norm": 2.0353730569967325, + "learning_rate": 1.784289551986597e-06, + "loss": 1.2125, + "step": 1746 + }, + { + "epoch": 0.23683318647054835, + "grad_norm": 1.7261084324939484, + "learning_rate": 1.7840170374260206e-06, + "loss": 1.2155, + "step": 1747 + }, + { + "epoch": 0.23696875211821325, + "grad_norm": 2.7933725858564475, + "learning_rate": 1.7837443716739474e-06, + "loss": 1.2096, + "step": 1748 + }, + { + "epoch": 0.23710431776587812, + "grad_norm": 1.849307405187441, + "learning_rate": 1.7834715547829584e-06, + "loss": 1.2186, + "step": 1749 + }, + { + "epoch": 0.23723988341354302, + "grad_norm": 1.6773179066794217, + "learning_rate": 1.7831985868056646e-06, + "loss": 1.1919, + "step": 1750 + }, + { + "epoch": 0.2373754490612079, + "grad_norm": 2.3957026225572773, + "learning_rate": 1.7829254677947054e-06, + "loss": 1.2195, + "step": 1751 + }, + { + "epoch": 0.23751101470887276, + "grad_norm": 2.012842907814488, + "learning_rate": 1.7826521978027499e-06, + "loss": 1.2086, + "step": 1752 + }, + { + "epoch": 0.23764658035653766, + "grad_norm": 1.788099565825616, + "learning_rate": 1.7823787768824958e-06, + "loss": 1.2454, + "step": 1753 + }, + { + "epoch": 0.23778214600420253, + "grad_norm": 1.8006862229365828, + "learning_rate": 1.7821052050866703e-06, + "loss": 1.2295, + "step": 1754 + }, + { + "epoch": 0.23791771165186743, + "grad_norm": 1.5707272038919249, + "learning_rate": 1.7818314824680298e-06, + "loss": 1.247, + "step": 1755 + }, + { + "epoch": 0.2380532772995323, + "grad_norm": 2.5069987361668655, + "learning_rate": 1.7815576090793592e-06, + "loss": 1.1929, + "step": 1756 + }, + { + "epoch": 0.2381888429471972, + "grad_norm": 2.9590543342186435, + "learning_rate": 1.781283584973473e-06, + "loss": 1.2741, + "step": 1757 + }, + { + "epoch": 0.23832440859486206, + "grad_norm": 1.9139747412921384, + "learning_rate": 1.781009410203214e-06, + "loss": 1.1767, + "step": 1758 + }, + { + "epoch": 0.23845997424252693, + "grad_norm": 1.7772700181363925, + "learning_rate": 1.7807350848214557e-06, + "loss": 1.2211, + "step": 1759 + }, + { + "epoch": 0.23859553989019183, + "grad_norm": 1.722584302198514, + "learning_rate": 1.780460608881099e-06, + "loss": 1.2321, + "step": 1760 + }, + { + "epoch": 0.2387311055378567, + "grad_norm": 4.1560769117191985, + "learning_rate": 1.7801859824350743e-06, + "loss": 1.2244, + "step": 1761 + }, + { + "epoch": 0.2388666711855216, + "grad_norm": 2.0950915217126145, + "learning_rate": 1.7799112055363415e-06, + "loss": 1.1758, + "step": 1762 + }, + { + "epoch": 0.23900223683318647, + "grad_norm": 2.0458446809516895, + "learning_rate": 1.7796362782378887e-06, + "loss": 1.2021, + "step": 1763 + }, + { + "epoch": 0.23913780248085134, + "grad_norm": 1.9487604790881143, + "learning_rate": 1.7793612005927337e-06, + "loss": 1.2354, + "step": 1764 + }, + { + "epoch": 0.23927336812851624, + "grad_norm": 2.123235297803601, + "learning_rate": 1.7790859726539232e-06, + "loss": 1.2024, + "step": 1765 + }, + { + "epoch": 0.2394089337761811, + "grad_norm": 2.07154259382699, + "learning_rate": 1.7788105944745325e-06, + "loss": 1.2146, + "step": 1766 + }, + { + "epoch": 0.239544499423846, + "grad_norm": 2.4033813639055857, + "learning_rate": 
1.7785350661076663e-06, + "loss": 1.2041, + "step": 1767 + }, + { + "epoch": 0.23968006507151088, + "grad_norm": 1.8217765652184839, + "learning_rate": 1.778259387606458e-06, + "loss": 1.2279, + "step": 1768 + }, + { + "epoch": 0.23981563071917575, + "grad_norm": 3.0003930428516323, + "learning_rate": 1.7779835590240699e-06, + "loss": 1.1906, + "step": 1769 + }, + { + "epoch": 0.23995119636684065, + "grad_norm": 1.635389168348073, + "learning_rate": 1.7777075804136938e-06, + "loss": 1.1839, + "step": 1770 + }, + { + "epoch": 0.24008676201450552, + "grad_norm": 3.587024424025778, + "learning_rate": 1.7774314518285492e-06, + "loss": 1.2085, + "step": 1771 + }, + { + "epoch": 0.24022232766217042, + "grad_norm": 3.267593859266708, + "learning_rate": 1.777155173321886e-06, + "loss": 1.227, + "step": 1772 + }, + { + "epoch": 0.24035789330983529, + "grad_norm": 1.8614362114256373, + "learning_rate": 1.7768787449469823e-06, + "loss": 1.1861, + "step": 1773 + }, + { + "epoch": 0.24049345895750016, + "grad_norm": 2.6878737220569078, + "learning_rate": 1.7766021667571448e-06, + "loss": 1.2199, + "step": 1774 + }, + { + "epoch": 0.24062902460516505, + "grad_norm": 1.9336287123309444, + "learning_rate": 1.7763254388057094e-06, + "loss": 1.2403, + "step": 1775 + }, + { + "epoch": 0.24076459025282992, + "grad_norm": 9.41310292900186, + "learning_rate": 1.7760485611460415e-06, + "loss": 1.2067, + "step": 1776 + }, + { + "epoch": 0.24090015590049482, + "grad_norm": 3.492751338361507, + "learning_rate": 1.7757715338315337e-06, + "loss": 1.2479, + "step": 1777 + }, + { + "epoch": 0.2410357215481597, + "grad_norm": 2.5519861524800835, + "learning_rate": 1.7754943569156096e-06, + "loss": 1.2345, + "step": 1778 + }, + { + "epoch": 0.2411712871958246, + "grad_norm": 1.914238794144076, + "learning_rate": 1.7752170304517202e-06, + "loss": 1.2612, + "step": 1779 + }, + { + "epoch": 0.24130685284348946, + "grad_norm": 2.011727626787541, + "learning_rate": 1.7749395544933455e-06, + "loss": 1.2279, + "step": 1780 + }, + { + "epoch": 0.24144241849115433, + "grad_norm": 2.428832691966606, + "learning_rate": 1.7746619290939946e-06, + "loss": 1.1967, + "step": 1781 + }, + { + "epoch": 0.24157798413881923, + "grad_norm": 2.218806313415624, + "learning_rate": 1.7743841543072055e-06, + "loss": 1.2002, + "step": 1782 + }, + { + "epoch": 0.2417135497864841, + "grad_norm": 1.643653384538718, + "learning_rate": 1.7741062301865453e-06, + "loss": 1.174, + "step": 1783 + }, + { + "epoch": 0.241849115434149, + "grad_norm": 1.889580186683379, + "learning_rate": 1.7738281567856088e-06, + "loss": 1.222, + "step": 1784 + }, + { + "epoch": 0.24198468108181387, + "grad_norm": 1.7842835145237965, + "learning_rate": 1.7735499341580203e-06, + "loss": 1.2326, + "step": 1785 + }, + { + "epoch": 0.24212024672947874, + "grad_norm": 1.6899969667843595, + "learning_rate": 1.7732715623574333e-06, + "loss": 1.1873, + "step": 1786 + }, + { + "epoch": 0.24225581237714364, + "grad_norm": 1.6724212689164863, + "learning_rate": 1.772993041437529e-06, + "loss": 1.1837, + "step": 1787 + }, + { + "epoch": 0.2423913780248085, + "grad_norm": 1.7154310009564073, + "learning_rate": 1.7727143714520184e-06, + "loss": 1.2289, + "step": 1788 + }, + { + "epoch": 0.2425269436724734, + "grad_norm": 2.0575060256530047, + "learning_rate": 1.7724355524546409e-06, + "loss": 1.1953, + "step": 1789 + }, + { + "epoch": 0.24266250932013828, + "grad_norm": 2.108579829556193, + "learning_rate": 1.7721565844991641e-06, + "loss": 1.2163, + "step": 1790 + }, + { + "epoch": 
0.24279807496780315, + "grad_norm": 2.4075354264512296, + "learning_rate": 1.7718774676393852e-06, + "loss": 1.2215, + "step": 1791 + }, + { + "epoch": 0.24293364061546804, + "grad_norm": 1.8629845646848435, + "learning_rate": 1.7715982019291293e-06, + "loss": 1.2082, + "step": 1792 + }, + { + "epoch": 0.24306920626313291, + "grad_norm": 2.213475985930192, + "learning_rate": 1.771318787422251e-06, + "loss": 1.2287, + "step": 1793 + }, + { + "epoch": 0.2432047719107978, + "grad_norm": 2.5642624086147383, + "learning_rate": 1.7710392241726328e-06, + "loss": 1.2133, + "step": 1794 + }, + { + "epoch": 0.24334033755846268, + "grad_norm": 2.0691671025120404, + "learning_rate": 1.7707595122341865e-06, + "loss": 1.2566, + "step": 1795 + }, + { + "epoch": 0.24347590320612758, + "grad_norm": 2.157887664457546, + "learning_rate": 1.7704796516608524e-06, + "loss": 1.2248, + "step": 1796 + }, + { + "epoch": 0.24361146885379245, + "grad_norm": 1.7343968496258475, + "learning_rate": 1.7701996425065992e-06, + "loss": 1.1888, + "step": 1797 + }, + { + "epoch": 0.24374703450145732, + "grad_norm": 3.9949059883037368, + "learning_rate": 1.7699194848254244e-06, + "loss": 1.2169, + "step": 1798 + }, + { + "epoch": 0.24388260014912222, + "grad_norm": 2.8316613685279854, + "learning_rate": 1.7696391786713545e-06, + "loss": 1.2168, + "step": 1799 + }, + { + "epoch": 0.2440181657967871, + "grad_norm": 4.036048185836938, + "learning_rate": 1.769358724098444e-06, + "loss": 1.2125, + "step": 1800 + }, + { + "epoch": 0.244153731444452, + "grad_norm": 1.6428148176711614, + "learning_rate": 1.7690781211607767e-06, + "loss": 1.2123, + "step": 1801 + }, + { + "epoch": 0.24428929709211686, + "grad_norm": 6.094270938359351, + "learning_rate": 1.7687973699124643e-06, + "loss": 1.1983, + "step": 1802 + }, + { + "epoch": 0.24442486273978173, + "grad_norm": 2.042003790911632, + "learning_rate": 1.7685164704076476e-06, + "loss": 1.1859, + "step": 1803 + }, + { + "epoch": 0.24456042838744663, + "grad_norm": 2.2111113357599987, + "learning_rate": 1.768235422700496e-06, + "loss": 1.2733, + "step": 1804 + }, + { + "epoch": 0.2446959940351115, + "grad_norm": 1.873699153848952, + "learning_rate": 1.767954226845207e-06, + "loss": 1.224, + "step": 1805 + }, + { + "epoch": 0.2448315596827764, + "grad_norm": 1.8110920696114001, + "learning_rate": 1.7676728828960075e-06, + "loss": 1.1761, + "step": 1806 + }, + { + "epoch": 0.24496712533044127, + "grad_norm": 2.1564126202252876, + "learning_rate": 1.7673913909071523e-06, + "loss": 1.2269, + "step": 1807 + }, + { + "epoch": 0.24510269097810614, + "grad_norm": 1.899040026557861, + "learning_rate": 1.7671097509329242e-06, + "loss": 1.1823, + "step": 1808 + }, + { + "epoch": 0.24523825662577103, + "grad_norm": 1.6972703153076651, + "learning_rate": 1.7668279630276364e-06, + "loss": 1.2207, + "step": 1809 + }, + { + "epoch": 0.2453738222734359, + "grad_norm": 2.0552851600436037, + "learning_rate": 1.7665460272456287e-06, + "loss": 1.222, + "step": 1810 + }, + { + "epoch": 0.2455093879211008, + "grad_norm": 1.809909984117463, + "learning_rate": 1.7662639436412703e-06, + "loss": 1.2113, + "step": 1811 + }, + { + "epoch": 0.24564495356876567, + "grad_norm": 1.944253536621921, + "learning_rate": 1.7659817122689589e-06, + "loss": 1.1798, + "step": 1812 + }, + { + "epoch": 0.24578051921643054, + "grad_norm": 1.9661239756187894, + "learning_rate": 1.7656993331831208e-06, + "loss": 1.2229, + "step": 1813 + }, + { + "epoch": 0.24591608486409544, + "grad_norm": 3.9425095795433673, + "learning_rate": 
1.76541680643821e-06, + "loss": 1.2323, + "step": 1814 + }, + { + "epoch": 0.2460516505117603, + "grad_norm": 1.9084167850744362, + "learning_rate": 1.7651341320887102e-06, + "loss": 1.1865, + "step": 1815 + }, + { + "epoch": 0.2461872161594252, + "grad_norm": 2.033462882849893, + "learning_rate": 1.7648513101891325e-06, + "loss": 1.2203, + "step": 1816 + }, + { + "epoch": 0.24632278180709008, + "grad_norm": 2.3147325375960333, + "learning_rate": 1.764568340794017e-06, + "loss": 1.2666, + "step": 1817 + }, + { + "epoch": 0.24645834745475498, + "grad_norm": 4.0825982239941645, + "learning_rate": 1.7642852239579323e-06, + "loss": 1.2121, + "step": 1818 + }, + { + "epoch": 0.24659391310241985, + "grad_norm": 2.1160892863300655, + "learning_rate": 1.7640019597354747e-06, + "loss": 1.212, + "step": 1819 + }, + { + "epoch": 0.24672947875008472, + "grad_norm": 4.171573892319886, + "learning_rate": 1.76371854818127e-06, + "loss": 1.2051, + "step": 1820 + }, + { + "epoch": 0.24686504439774962, + "grad_norm": 2.1404473767737207, + "learning_rate": 1.7634349893499719e-06, + "loss": 1.2338, + "step": 1821 + }, + { + "epoch": 0.2470006100454145, + "grad_norm": 2.152295264776574, + "learning_rate": 1.7631512832962622e-06, + "loss": 1.2056, + "step": 1822 + }, + { + "epoch": 0.24713617569307939, + "grad_norm": 1.901009506224045, + "learning_rate": 1.7628674300748511e-06, + "loss": 1.2173, + "step": 1823 + }, + { + "epoch": 0.24727174134074426, + "grad_norm": 9.58232076385533, + "learning_rate": 1.7625834297404783e-06, + "loss": 1.2022, + "step": 1824 + }, + { + "epoch": 0.24740730698840913, + "grad_norm": 4.587183639669272, + "learning_rate": 1.7622992823479103e-06, + "loss": 1.2217, + "step": 1825 + }, + { + "epoch": 0.24754287263607402, + "grad_norm": 2.0596374176981636, + "learning_rate": 1.7620149879519431e-06, + "loss": 1.1698, + "step": 1826 + }, + { + "epoch": 0.2476784382837389, + "grad_norm": 2.0078285711861548, + "learning_rate": 1.7617305466074002e-06, + "loss": 1.2034, + "step": 1827 + }, + { + "epoch": 0.2478140039314038, + "grad_norm": 2.279077270921636, + "learning_rate": 1.7614459583691342e-06, + "loss": 1.2469, + "step": 1828 + }, + { + "epoch": 0.24794956957906866, + "grad_norm": 3.1852599085984936, + "learning_rate": 1.7611612232920258e-06, + "loss": 1.2039, + "step": 1829 + }, + { + "epoch": 0.24808513522673353, + "grad_norm": 2.144142879543449, + "learning_rate": 1.7608763414309835e-06, + "loss": 1.2142, + "step": 1830 + }, + { + "epoch": 0.24822070087439843, + "grad_norm": 4.551598707393487, + "learning_rate": 1.7605913128409449e-06, + "loss": 1.1602, + "step": 1831 + }, + { + "epoch": 0.2483562665220633, + "grad_norm": 1.8561869642479543, + "learning_rate": 1.7603061375768754e-06, + "loss": 1.1896, + "step": 1832 + }, + { + "epoch": 0.2484918321697282, + "grad_norm": 1.895115605911324, + "learning_rate": 1.7600208156937688e-06, + "loss": 1.2062, + "step": 1833 + }, + { + "epoch": 0.24862739781739307, + "grad_norm": 4.596377052219914, + "learning_rate": 1.759735347246647e-06, + "loss": 1.1764, + "step": 1834 + }, + { + "epoch": 0.24876296346505797, + "grad_norm": 2.2584280804199497, + "learning_rate": 1.7594497322905603e-06, + "loss": 1.2139, + "step": 1835 + }, + { + "epoch": 0.24889852911272284, + "grad_norm": 2.6702792803111763, + "learning_rate": 1.759163970880588e-06, + "loss": 1.1757, + "step": 1836 + }, + { + "epoch": 0.2490340947603877, + "grad_norm": 1.8581761192713404, + "learning_rate": 1.7588780630718358e-06, + "loss": 1.2075, + "step": 1837 + }, + { + "epoch": 
0.2491696604080526, + "grad_norm": 1.7357276111004396, + "learning_rate": 1.7585920089194394e-06, + "loss": 1.2016, + "step": 1838 + }, + { + "epoch": 0.24930522605571748, + "grad_norm": 3.036904318609174, + "learning_rate": 1.7583058084785625e-06, + "loss": 1.2346, + "step": 1839 + }, + { + "epoch": 0.24944079170338238, + "grad_norm": 3.7443055315331524, + "learning_rate": 1.758019461804396e-06, + "loss": 1.1948, + "step": 1840 + }, + { + "epoch": 0.24957635735104725, + "grad_norm": 1.9164522221591054, + "learning_rate": 1.7577329689521596e-06, + "loss": 1.2004, + "step": 1841 + }, + { + "epoch": 0.24971192299871212, + "grad_norm": 1.8054549206871935, + "learning_rate": 1.7574463299771011e-06, + "loss": 1.186, + "step": 1842 + }, + { + "epoch": 0.24984748864637701, + "grad_norm": 1.8120831656218659, + "learning_rate": 1.7571595449344972e-06, + "loss": 1.2161, + "step": 1843 + }, + { + "epoch": 0.24998305429404188, + "grad_norm": 2.08005100124003, + "learning_rate": 1.7568726138796515e-06, + "loss": 1.2286, + "step": 1844 + }, + { + "epoch": 0.25011861994170675, + "grad_norm": 2.9318665897419454, + "learning_rate": 1.7565855368678965e-06, + "loss": 1.2164, + "step": 1845 + }, + { + "epoch": 0.2502541855893717, + "grad_norm": 10.941398898165891, + "learning_rate": 1.756298313954593e-06, + "loss": 1.2529, + "step": 1846 + }, + { + "epoch": 0.25038975123703655, + "grad_norm": 1.9479244575224257, + "learning_rate": 1.7560109451951295e-06, + "loss": 1.1867, + "step": 1847 + }, + { + "epoch": 0.2505253168847014, + "grad_norm": 2.0098063145829634, + "learning_rate": 1.7557234306449227e-06, + "loss": 1.2041, + "step": 1848 + }, + { + "epoch": 0.2506608825323663, + "grad_norm": 2.8897557322364933, + "learning_rate": 1.7554357703594178e-06, + "loss": 1.2326, + "step": 1849 + }, + { + "epoch": 0.25079644818003116, + "grad_norm": 1.8596811431822187, + "learning_rate": 1.7551479643940874e-06, + "loss": 1.2054, + "step": 1850 + }, + { + "epoch": 0.2509320138276961, + "grad_norm": 1.6587849257953666, + "learning_rate": 1.7548600128044328e-06, + "loss": 1.2079, + "step": 1851 + }, + { + "epoch": 0.25106757947536096, + "grad_norm": 2.3146652553188694, + "learning_rate": 1.7545719156459835e-06, + "loss": 1.2023, + "step": 1852 + }, + { + "epoch": 0.25120314512302583, + "grad_norm": 1.8613271067951414, + "learning_rate": 1.7542836729742964e-06, + "loss": 1.2044, + "step": 1853 + }, + { + "epoch": 0.2513387107706907, + "grad_norm": 1.6119564254695464, + "learning_rate": 1.753995284844957e-06, + "loss": 1.2229, + "step": 1854 + }, + { + "epoch": 0.25147427641835557, + "grad_norm": 1.6640143293864529, + "learning_rate": 1.7537067513135787e-06, + "loss": 1.2512, + "step": 1855 + }, + { + "epoch": 0.2516098420660205, + "grad_norm": 1.9354993662583853, + "learning_rate": 1.7534180724358026e-06, + "loss": 1.1942, + "step": 1856 + }, + { + "epoch": 0.25174540771368537, + "grad_norm": 2.4354010739555862, + "learning_rate": 1.7531292482672982e-06, + "loss": 1.2496, + "step": 1857 + }, + { + "epoch": 0.25188097336135024, + "grad_norm": 1.7104907560159721, + "learning_rate": 1.7528402788637633e-06, + "loss": 1.202, + "step": 1858 + }, + { + "epoch": 0.2520165390090151, + "grad_norm": 1.7947054365741593, + "learning_rate": 1.7525511642809232e-06, + "loss": 1.1962, + "step": 1859 + }, + { + "epoch": 0.25215210465668, + "grad_norm": 4.236197446182214, + "learning_rate": 1.7522619045745312e-06, + "loss": 1.2071, + "step": 1860 + }, + { + "epoch": 0.2522876703043449, + "grad_norm": 1.638404894471569, + "learning_rate": 
1.751972499800369e-06, + "loss": 1.1698, + "step": 1861 + }, + { + "epoch": 0.2524232359520098, + "grad_norm": 2.7375240820995, + "learning_rate": 1.7516829500142461e-06, + "loss": 1.1933, + "step": 1862 + }, + { + "epoch": 0.25255880159967464, + "grad_norm": 2.032852951727809, + "learning_rate": 1.7513932552719995e-06, + "loss": 1.1927, + "step": 1863 + }, + { + "epoch": 0.2526943672473395, + "grad_norm": 1.7053202860496837, + "learning_rate": 1.7511034156294948e-06, + "loss": 1.2093, + "step": 1864 + }, + { + "epoch": 0.2528299328950044, + "grad_norm": 2.1139214895118474, + "learning_rate": 1.7508134311426253e-06, + "loss": 1.1901, + "step": 1865 + }, + { + "epoch": 0.2529654985426693, + "grad_norm": 1.9642911595692327, + "learning_rate": 1.750523301867312e-06, + "loss": 1.2091, + "step": 1866 + }, + { + "epoch": 0.2531010641903342, + "grad_norm": 1.7358501855282837, + "learning_rate": 1.7502330278595043e-06, + "loss": 1.2501, + "step": 1867 + }, + { + "epoch": 0.25323662983799905, + "grad_norm": 2.5572966577383354, + "learning_rate": 1.7499426091751792e-06, + "loss": 1.2039, + "step": 1868 + }, + { + "epoch": 0.2533721954856639, + "grad_norm": 2.503743013457755, + "learning_rate": 1.7496520458703416e-06, + "loss": 1.266, + "step": 1869 + }, + { + "epoch": 0.2535077611333288, + "grad_norm": 1.9009614683700904, + "learning_rate": 1.7493613380010244e-06, + "loss": 1.1972, + "step": 1870 + }, + { + "epoch": 0.2536433267809937, + "grad_norm": 1.9542804251511516, + "learning_rate": 1.7490704856232882e-06, + "loss": 1.2172, + "step": 1871 + }, + { + "epoch": 0.2537788924286586, + "grad_norm": 2.2095045524660972, + "learning_rate": 1.7487794887932216e-06, + "loss": 1.2133, + "step": 1872 + }, + { + "epoch": 0.25391445807632346, + "grad_norm": 1.8062812851699177, + "learning_rate": 1.7484883475669412e-06, + "loss": 1.2277, + "step": 1873 + }, + { + "epoch": 0.2540500237239883, + "grad_norm": 1.9693021273457196, + "learning_rate": 1.748197062000591e-06, + "loss": 1.2075, + "step": 1874 + }, + { + "epoch": 0.2541855893716532, + "grad_norm": 1.9841218888101602, + "learning_rate": 1.7479056321503436e-06, + "loss": 1.1621, + "step": 1875 + }, + { + "epoch": 0.2543211550193181, + "grad_norm": 1.5712242027626608, + "learning_rate": 1.7476140580723984e-06, + "loss": 1.1753, + "step": 1876 + }, + { + "epoch": 0.254456720666983, + "grad_norm": 2.214796226277904, + "learning_rate": 1.7473223398229836e-06, + "loss": 1.2193, + "step": 1877 + }, + { + "epoch": 0.25459228631464786, + "grad_norm": 2.230255014268556, + "learning_rate": 1.7470304774583542e-06, + "loss": 1.2238, + "step": 1878 + }, + { + "epoch": 0.25472785196231273, + "grad_norm": 1.8224038819490866, + "learning_rate": 1.7467384710347943e-06, + "loss": 1.1851, + "step": 1879 + }, + { + "epoch": 0.2548634176099776, + "grad_norm": 1.7333177180290142, + "learning_rate": 1.7464463206086144e-06, + "loss": 1.2149, + "step": 1880 + }, + { + "epoch": 0.25499898325764253, + "grad_norm": 2.513963337490863, + "learning_rate": 1.7461540262361538e-06, + "loss": 1.2225, + "step": 1881 + }, + { + "epoch": 0.2551345489053074, + "grad_norm": 2.1794133428352978, + "learning_rate": 1.7458615879737791e-06, + "loss": 1.26, + "step": 1882 + }, + { + "epoch": 0.25527011455297227, + "grad_norm": 1.7561893242519422, + "learning_rate": 1.7455690058778844e-06, + "loss": 1.2174, + "step": 1883 + }, + { + "epoch": 0.25540568020063714, + "grad_norm": 2.3537274326176334, + "learning_rate": 1.7452762800048924e-06, + "loss": 1.1957, + "step": 1884 + }, + { + "epoch": 
0.25554124584830207, + "grad_norm": 1.6474782853381682, + "learning_rate": 1.7449834104112525e-06, + "loss": 1.2273, + "step": 1885 + }, + { + "epoch": 0.25567681149596694, + "grad_norm": 2.624708373938763, + "learning_rate": 1.7446903971534423e-06, + "loss": 1.1965, + "step": 1886 + }, + { + "epoch": 0.2558123771436318, + "grad_norm": 2.025228186377415, + "learning_rate": 1.7443972402879674e-06, + "loss": 1.2124, + "step": 1887 + }, + { + "epoch": 0.2559479427912967, + "grad_norm": 1.7429472680025364, + "learning_rate": 1.7441039398713605e-06, + "loss": 1.2103, + "step": 1888 + }, + { + "epoch": 0.25608350843896155, + "grad_norm": 1.793623582437906, + "learning_rate": 1.7438104959601826e-06, + "loss": 1.1604, + "step": 1889 + }, + { + "epoch": 0.2562190740866265, + "grad_norm": 2.320222409638612, + "learning_rate": 1.7435169086110217e-06, + "loss": 1.1984, + "step": 1890 + }, + { + "epoch": 0.25635463973429135, + "grad_norm": 2.1960289648740634, + "learning_rate": 1.743223177880494e-06, + "loss": 1.2044, + "step": 1891 + }, + { + "epoch": 0.2564902053819562, + "grad_norm": 1.6611496009863616, + "learning_rate": 1.742929303825243e-06, + "loss": 1.2472, + "step": 1892 + }, + { + "epoch": 0.2566257710296211, + "grad_norm": 2.0242789641249246, + "learning_rate": 1.7426352865019402e-06, + "loss": 1.1835, + "step": 1893 + }, + { + "epoch": 0.25676133667728596, + "grad_norm": 1.6748353870422994, + "learning_rate": 1.7423411259672841e-06, + "loss": 1.2057, + "step": 1894 + }, + { + "epoch": 0.2568969023249509, + "grad_norm": 1.974414131495983, + "learning_rate": 1.7420468222780017e-06, + "loss": 1.1937, + "step": 1895 + }, + { + "epoch": 0.25703246797261575, + "grad_norm": 1.6332445580756587, + "learning_rate": 1.7417523754908473e-06, + "loss": 1.241, + "step": 1896 + }, + { + "epoch": 0.2571680336202806, + "grad_norm": 3.0780022335145634, + "learning_rate": 1.741457785662602e-06, + "loss": 1.2072, + "step": 1897 + }, + { + "epoch": 0.2573035992679455, + "grad_norm": 2.424970344117456, + "learning_rate": 1.7411630528500757e-06, + "loss": 1.1964, + "step": 1898 + }, + { + "epoch": 0.25743916491561036, + "grad_norm": 1.8908048509347088, + "learning_rate": 1.7408681771101048e-06, + "loss": 1.258, + "step": 1899 + }, + { + "epoch": 0.2575747305632753, + "grad_norm": 1.8351011574802956, + "learning_rate": 1.740573158499554e-06, + "loss": 1.2568, + "step": 1900 + }, + { + "epoch": 0.25771029621094016, + "grad_norm": 1.8685381493371467, + "learning_rate": 1.7402779970753154e-06, + "loss": 1.2191, + "step": 1901 + }, + { + "epoch": 0.25784586185860503, + "grad_norm": 2.501796821869135, + "learning_rate": 1.7399826928943084e-06, + "loss": 1.2149, + "step": 1902 + }, + { + "epoch": 0.2579814275062699, + "grad_norm": 1.9956151913080984, + "learning_rate": 1.7396872460134805e-06, + "loss": 1.2247, + "step": 1903 + }, + { + "epoch": 0.25811699315393477, + "grad_norm": 27.605688913511877, + "learning_rate": 1.7393916564898055e-06, + "loss": 1.2294, + "step": 1904 + }, + { + "epoch": 0.2582525588015997, + "grad_norm": 3.8526278461661208, + "learning_rate": 1.739095924380286e-06, + "loss": 1.1909, + "step": 1905 + }, + { + "epoch": 0.25838812444926457, + "grad_norm": 2.0782491215265804, + "learning_rate": 1.7388000497419518e-06, + "loss": 1.2514, + "step": 1906 + }, + { + "epoch": 0.25852369009692944, + "grad_norm": 1.6274032994034127, + "learning_rate": 1.7385040326318597e-06, + "loss": 1.1892, + "step": 1907 + }, + { + "epoch": 0.2586592557445943, + "grad_norm": 1.7569017353470198, + "learning_rate": 
1.738207873107094e-06, + "loss": 1.1895, + "step": 1908 + }, + { + "epoch": 0.2587948213922592, + "grad_norm": 2.0159984324733777, + "learning_rate": 1.7379115712247675e-06, + "loss": 1.2204, + "step": 1909 + }, + { + "epoch": 0.2589303870399241, + "grad_norm": 2.1409272322401622, + "learning_rate": 1.7376151270420186e-06, + "loss": 1.1899, + "step": 1910 + }, + { + "epoch": 0.259065952687589, + "grad_norm": 1.5718820999188443, + "learning_rate": 1.737318540616015e-06, + "loss": 1.191, + "step": 1911 + }, + { + "epoch": 0.25920151833525384, + "grad_norm": 1.800888994925895, + "learning_rate": 1.7370218120039512e-06, + "loss": 1.179, + "step": 1912 + }, + { + "epoch": 0.2593370839829187, + "grad_norm": 1.9653172272111485, + "learning_rate": 1.7367249412630484e-06, + "loss": 1.1991, + "step": 1913 + }, + { + "epoch": 0.2594726496305836, + "grad_norm": 1.8333776653433949, + "learning_rate": 1.7364279284505564e-06, + "loss": 1.2134, + "step": 1914 + }, + { + "epoch": 0.2596082152782485, + "grad_norm": 1.6981562796395744, + "learning_rate": 1.736130773623751e-06, + "loss": 1.1596, + "step": 1915 + }, + { + "epoch": 0.2597437809259134, + "grad_norm": 2.085760020229843, + "learning_rate": 1.7358334768399368e-06, + "loss": 1.2242, + "step": 1916 + }, + { + "epoch": 0.25987934657357825, + "grad_norm": 2.889000069539263, + "learning_rate": 1.7355360381564449e-06, + "loss": 1.2159, + "step": 1917 + }, + { + "epoch": 0.2600149122212431, + "grad_norm": 1.5078783161153055, + "learning_rate": 1.7352384576306336e-06, + "loss": 1.205, + "step": 1918 + }, + { + "epoch": 0.260150477868908, + "grad_norm": 2.095098802919073, + "learning_rate": 1.7349407353198898e-06, + "loss": 1.2207, + "step": 1919 + }, + { + "epoch": 0.2602860435165729, + "grad_norm": 2.257038989785114, + "learning_rate": 1.7346428712816262e-06, + "loss": 1.1807, + "step": 1920 + }, + { + "epoch": 0.2604216091642378, + "grad_norm": 2.1798939598729925, + "learning_rate": 1.734344865573284e-06, + "loss": 1.2282, + "step": 1921 + }, + { + "epoch": 0.26055717481190266, + "grad_norm": 4.609500391297694, + "learning_rate": 1.734046718252331e-06, + "loss": 1.1956, + "step": 1922 + }, + { + "epoch": 0.26069274045956753, + "grad_norm": 2.0088523177785635, + "learning_rate": 1.7337484293762627e-06, + "loss": 1.1779, + "step": 1923 + }, + { + "epoch": 0.26082830610723245, + "grad_norm": 2.8406929789522137, + "learning_rate": 1.7334499990026014e-06, + "loss": 1.1862, + "step": 1924 + }, + { + "epoch": 0.2609638717548973, + "grad_norm": 2.440033528998893, + "learning_rate": 1.7331514271888973e-06, + "loss": 1.2022, + "step": 1925 + }, + { + "epoch": 0.2610994374025622, + "grad_norm": 3.3572512677177784, + "learning_rate": 1.7328527139927278e-06, + "loss": 1.1838, + "step": 1926 + }, + { + "epoch": 0.26123500305022707, + "grad_norm": 2.958170313305445, + "learning_rate": 1.7325538594716971e-06, + "loss": 1.2162, + "step": 1927 + }, + { + "epoch": 0.26137056869789194, + "grad_norm": 1.9358018815091595, + "learning_rate": 1.7322548636834372e-06, + "loss": 1.2862, + "step": 1928 + }, + { + "epoch": 0.26150613434555686, + "grad_norm": 1.9324468892018107, + "learning_rate": 1.7319557266856067e-06, + "loss": 1.202, + "step": 1929 + }, + { + "epoch": 0.26164169999322173, + "grad_norm": 2.0524125990665674, + "learning_rate": 1.731656448535892e-06, + "loss": 1.1983, + "step": 1930 + }, + { + "epoch": 0.2617772656408866, + "grad_norm": 2.2894761949409377, + "learning_rate": 1.7313570292920065e-06, + "loss": 1.1653, + "step": 1931 + }, + { + "epoch": 
0.2619128312885515, + "grad_norm": 1.8897561477560558, + "learning_rate": 1.731057469011691e-06, + "loss": 1.2059, + "step": 1932 + }, + { + "epoch": 0.26204839693621634, + "grad_norm": 1.6973529035252515, + "learning_rate": 1.7307577677527135e-06, + "loss": 1.222, + "step": 1933 + }, + { + "epoch": 0.26218396258388127, + "grad_norm": 2.5441111939454224, + "learning_rate": 1.7304579255728684e-06, + "loss": 1.219, + "step": 1934 + }, + { + "epoch": 0.26231952823154614, + "grad_norm": 1.8112073938748308, + "learning_rate": 1.7301579425299782e-06, + "loss": 1.1832, + "step": 1935 + }, + { + "epoch": 0.262455093879211, + "grad_norm": 1.874181480287997, + "learning_rate": 1.7298578186818925e-06, + "loss": 1.1922, + "step": 1936 + }, + { + "epoch": 0.2625906595268759, + "grad_norm": 4.105635568553595, + "learning_rate": 1.7295575540864875e-06, + "loss": 1.2053, + "step": 1937 + }, + { + "epoch": 0.26272622517454075, + "grad_norm": 1.9068856918774595, + "learning_rate": 1.729257148801667e-06, + "loss": 1.1767, + "step": 1938 + }, + { + "epoch": 0.2628617908222057, + "grad_norm": 1.9114881116656066, + "learning_rate": 1.7289566028853616e-06, + "loss": 1.1799, + "step": 1939 + }, + { + "epoch": 0.26299735646987055, + "grad_norm": 1.805191538681426, + "learning_rate": 1.7286559163955297e-06, + "loss": 1.2496, + "step": 1940 + }, + { + "epoch": 0.2631329221175354, + "grad_norm": 8.819029264655315, + "learning_rate": 1.7283550893901557e-06, + "loss": 1.2166, + "step": 1941 + }, + { + "epoch": 0.2632684877652003, + "grad_norm": 3.184993040524436, + "learning_rate": 1.728054121927252e-06, + "loss": 1.1691, + "step": 1942 + }, + { + "epoch": 0.26340405341286516, + "grad_norm": 1.898234052539829, + "learning_rate": 1.727753014064858e-06, + "loss": 1.2385, + "step": 1943 + }, + { + "epoch": 0.2635396190605301, + "grad_norm": 2.2564688988699535, + "learning_rate": 1.7274517658610397e-06, + "loss": 1.1924, + "step": 1944 + }, + { + "epoch": 0.26367518470819495, + "grad_norm": 1.6176499212941755, + "learning_rate": 1.7271503773738906e-06, + "loss": 1.1958, + "step": 1945 + }, + { + "epoch": 0.2638107503558598, + "grad_norm": 2.4088390798760084, + "learning_rate": 1.7268488486615307e-06, + "loss": 1.1751, + "step": 1946 + }, + { + "epoch": 0.2639463160035247, + "grad_norm": 1.9195905932943327, + "learning_rate": 1.726547179782108e-06, + "loss": 1.1982, + "step": 1947 + }, + { + "epoch": 0.26408188165118957, + "grad_norm": 1.8334738114668687, + "learning_rate": 1.7262453707937964e-06, + "loss": 1.1902, + "step": 1948 + }, + { + "epoch": 0.2642174472988545, + "grad_norm": 1.7760671490290973, + "learning_rate": 1.725943421754798e-06, + "loss": 1.223, + "step": 1949 + }, + { + "epoch": 0.26435301294651936, + "grad_norm": 2.090778117631798, + "learning_rate": 1.7256413327233408e-06, + "loss": 1.199, + "step": 1950 + }, + { + "epoch": 0.26448857859418423, + "grad_norm": 2.394474905549524, + "learning_rate": 1.7253391037576806e-06, + "loss": 1.1789, + "step": 1951 + }, + { + "epoch": 0.2646241442418491, + "grad_norm": 1.929110007088266, + "learning_rate": 1.7250367349160994e-06, + "loss": 1.1917, + "step": 1952 + }, + { + "epoch": 0.26475970988951397, + "grad_norm": 2.3009218254927437, + "learning_rate": 1.724734226256907e-06, + "loss": 1.2499, + "step": 1953 + }, + { + "epoch": 0.2648952755371789, + "grad_norm": 12.697382192889492, + "learning_rate": 1.7244315778384403e-06, + "loss": 1.2132, + "step": 1954 + }, + { + "epoch": 0.26503084118484377, + "grad_norm": 1.830156049841281, + "learning_rate": 
1.7241287897190616e-06, + "loss": 1.2025, + "step": 1955 + }, + { + "epoch": 0.26516640683250864, + "grad_norm": 2.04360150302558, + "learning_rate": 1.7238258619571616e-06, + "loss": 1.2285, + "step": 1956 + }, + { + "epoch": 0.2653019724801735, + "grad_norm": 1.5408717632937872, + "learning_rate": 1.7235227946111582e-06, + "loss": 1.2007, + "step": 1957 + }, + { + "epoch": 0.2654375381278384, + "grad_norm": 2.266636570419872, + "learning_rate": 1.7232195877394948e-06, + "loss": 1.2367, + "step": 1958 + }, + { + "epoch": 0.2655731037755033, + "grad_norm": 1.7635541285751999, + "learning_rate": 1.7229162414006426e-06, + "loss": 1.2244, + "step": 1959 + }, + { + "epoch": 0.2657086694231682, + "grad_norm": 1.6980200871892785, + "learning_rate": 1.7226127556530997e-06, + "loss": 1.2188, + "step": 1960 + }, + { + "epoch": 0.26584423507083305, + "grad_norm": 2.721187409972732, + "learning_rate": 1.7223091305553905e-06, + "loss": 1.2386, + "step": 1961 + }, + { + "epoch": 0.2659798007184979, + "grad_norm": 1.737398058281931, + "learning_rate": 1.7220053661660673e-06, + "loss": 1.2088, + "step": 1962 + }, + { + "epoch": 0.2661153663661628, + "grad_norm": 2.6303714373436735, + "learning_rate": 1.7217014625437085e-06, + "loss": 1.186, + "step": 1963 + }, + { + "epoch": 0.2662509320138277, + "grad_norm": 2.213262856159067, + "learning_rate": 1.721397419746919e-06, + "loss": 1.2169, + "step": 1964 + }, + { + "epoch": 0.2663864976614926, + "grad_norm": 1.6859549922229211, + "learning_rate": 1.721093237834332e-06, + "loss": 1.1934, + "step": 1965 + }, + { + "epoch": 0.26652206330915745, + "grad_norm": 5.933114186915044, + "learning_rate": 1.7207889168646056e-06, + "loss": 1.2103, + "step": 1966 + }, + { + "epoch": 0.2666576289568223, + "grad_norm": 2.161571102963607, + "learning_rate": 1.7204844568964262e-06, + "loss": 1.2009, + "step": 1967 + }, + { + "epoch": 0.26679319460448725, + "grad_norm": 4.1141427155273, + "learning_rate": 1.7201798579885067e-06, + "loss": 1.1943, + "step": 1968 + }, + { + "epoch": 0.2669287602521521, + "grad_norm": 2.0680769377635206, + "learning_rate": 1.7198751201995862e-06, + "loss": 1.2461, + "step": 1969 + }, + { + "epoch": 0.267064325899817, + "grad_norm": 2.172366456910459, + "learning_rate": 1.7195702435884312e-06, + "loss": 1.1702, + "step": 1970 + }, + { + "epoch": 0.26719989154748186, + "grad_norm": 2.7117438683571837, + "learning_rate": 1.7192652282138346e-06, + "loss": 1.199, + "step": 1971 + }, + { + "epoch": 0.26733545719514673, + "grad_norm": 1.849417518368055, + "learning_rate": 1.7189600741346164e-06, + "loss": 1.2013, + "step": 1972 + }, + { + "epoch": 0.26747102284281166, + "grad_norm": 1.820562531380051, + "learning_rate": 1.7186547814096232e-06, + "loss": 1.2068, + "step": 1973 + }, + { + "epoch": 0.2676065884904765, + "grad_norm": 1.8034874815354125, + "learning_rate": 1.7183493500977275e-06, + "loss": 1.2368, + "step": 1974 + }, + { + "epoch": 0.2677421541381414, + "grad_norm": 2.781506169816165, + "learning_rate": 1.7180437802578302e-06, + "loss": 1.2191, + "step": 1975 + }, + { + "epoch": 0.26787771978580627, + "grad_norm": 2.9965969825907877, + "learning_rate": 1.717738071948858e-06, + "loss": 1.1989, + "step": 1976 + }, + { + "epoch": 0.26801328543347114, + "grad_norm": 1.987493417005933, + "learning_rate": 1.7174322252297638e-06, + "loss": 1.25, + "step": 1977 + }, + { + "epoch": 0.26814885108113606, + "grad_norm": 5.0269906168413065, + "learning_rate": 1.7171262401595282e-06, + "loss": 1.1536, + "step": 1978 + }, + { + "epoch": 
0.26828441672880093, + "grad_norm": 1.938630854410778, + "learning_rate": 1.7168201167971579e-06, + "loss": 1.1847, + "step": 1979 + }, + { + "epoch": 0.2684199823764658, + "grad_norm": 2.239198051242695, + "learning_rate": 1.7165138552016861e-06, + "loss": 1.21, + "step": 1980 + }, + { + "epoch": 0.2685555480241307, + "grad_norm": 2.777247961091616, + "learning_rate": 1.7162074554321736e-06, + "loss": 1.2007, + "step": 1981 + }, + { + "epoch": 0.26869111367179555, + "grad_norm": 1.9833349485735339, + "learning_rate": 1.7159009175477061e-06, + "loss": 1.2077, + "step": 1982 + }, + { + "epoch": 0.26882667931946047, + "grad_norm": 2.0842063080832998, + "learning_rate": 1.715594241607398e-06, + "loss": 1.189, + "step": 1983 + }, + { + "epoch": 0.26896224496712534, + "grad_norm": 1.8464599920451572, + "learning_rate": 1.7152874276703888e-06, + "loss": 1.2344, + "step": 1984 + }, + { + "epoch": 0.2690978106147902, + "grad_norm": 1.6310958467832215, + "learning_rate": 1.7149804757958456e-06, + "loss": 1.1791, + "step": 1985 + }, + { + "epoch": 0.2692333762624551, + "grad_norm": 2.311596566628899, + "learning_rate": 1.714673386042961e-06, + "loss": 1.2013, + "step": 1986 + }, + { + "epoch": 0.26936894191011995, + "grad_norm": 2.0112765269446635, + "learning_rate": 1.7143661584709553e-06, + "loss": 1.2411, + "step": 1987 + }, + { + "epoch": 0.2695045075577849, + "grad_norm": 3.447880749773373, + "learning_rate": 1.714058793139075e-06, + "loss": 1.183, + "step": 1988 + }, + { + "epoch": 0.26964007320544975, + "grad_norm": 1.8170163409567497, + "learning_rate": 1.7137512901065924e-06, + "loss": 1.2197, + "step": 1989 + }, + { + "epoch": 0.2697756388531146, + "grad_norm": 1.619765454022951, + "learning_rate": 1.713443649432808e-06, + "loss": 1.2082, + "step": 1990 + }, + { + "epoch": 0.2699112045007795, + "grad_norm": 1.813835495155757, + "learning_rate": 1.7131358711770472e-06, + "loss": 1.2048, + "step": 1991 + }, + { + "epoch": 0.27004677014844436, + "grad_norm": 1.7872561159161602, + "learning_rate": 1.7128279553986626e-06, + "loss": 1.2278, + "step": 1992 + }, + { + "epoch": 0.2701823357961093, + "grad_norm": 2.044025775486627, + "learning_rate": 1.7125199021570339e-06, + "loss": 1.1831, + "step": 1993 + }, + { + "epoch": 0.27031790144377416, + "grad_norm": 1.986006651259262, + "learning_rate": 1.712211711511566e-06, + "loss": 1.2192, + "step": 1994 + }, + { + "epoch": 0.270453467091439, + "grad_norm": 2.035996026701872, + "learning_rate": 1.7119033835216916e-06, + "loss": 1.1826, + "step": 1995 + }, + { + "epoch": 0.2705890327391039, + "grad_norm": 2.475981017964558, + "learning_rate": 1.7115949182468693e-06, + "loss": 1.1957, + "step": 1996 + }, + { + "epoch": 0.27072459838676877, + "grad_norm": 2.1244724575297926, + "learning_rate": 1.7112863157465838e-06, + "loss": 1.1894, + "step": 1997 + }, + { + "epoch": 0.2708601640344337, + "grad_norm": 3.7030011963317215, + "learning_rate": 1.7109775760803466e-06, + "loss": 1.1904, + "step": 1998 + }, + { + "epoch": 0.27099572968209856, + "grad_norm": 5.442444569899328, + "learning_rate": 1.7106686993076962e-06, + "loss": 1.1993, + "step": 1999 + }, + { + "epoch": 0.27113129532976343, + "grad_norm": 2.893913962929676, + "learning_rate": 1.710359685488197e-06, + "loss": 1.2039, + "step": 2000 + }, + { + "epoch": 0.2712668609774283, + "grad_norm": 1.853087187276584, + "learning_rate": 1.7100505346814396e-06, + "loss": 1.2077, + "step": 2001 + }, + { + "epoch": 0.2714024266250932, + "grad_norm": 1.9027505019058564, + "learning_rate": 
1.709741246947041e-06, + "loss": 1.1689, + "step": 2002 + }, + { + "epoch": 0.2715379922727581, + "grad_norm": 2.2870309214584186, + "learning_rate": 1.709431822344646e-06, + "loss": 1.17, + "step": 2003 + }, + { + "epoch": 0.27167355792042297, + "grad_norm": 2.8206122390425397, + "learning_rate": 1.7091222609339234e-06, + "loss": 1.19, + "step": 2004 + }, + { + "epoch": 0.27180912356808784, + "grad_norm": 2.734799880226546, + "learning_rate": 1.7088125627745704e-06, + "loss": 1.1812, + "step": 2005 + }, + { + "epoch": 0.2719446892157527, + "grad_norm": 2.5672649699694934, + "learning_rate": 1.7085027279263098e-06, + "loss": 1.2073, + "step": 2006 + }, + { + "epoch": 0.27208025486341764, + "grad_norm": 3.0412584279786063, + "learning_rate": 1.7081927564488908e-06, + "loss": 1.2251, + "step": 2007 + }, + { + "epoch": 0.2722158205110825, + "grad_norm": 2.1176522876080375, + "learning_rate": 1.7078826484020886e-06, + "loss": 1.2484, + "step": 2008 + }, + { + "epoch": 0.2723513861587474, + "grad_norm": 13.44259664519994, + "learning_rate": 1.7075724038457053e-06, + "loss": 1.1431, + "step": 2009 + }, + { + "epoch": 0.27248695180641225, + "grad_norm": 2.913794394655179, + "learning_rate": 1.7072620228395693e-06, + "loss": 1.1831, + "step": 2010 + }, + { + "epoch": 0.2726225174540771, + "grad_norm": 1.8752035532845786, + "learning_rate": 1.7069515054435351e-06, + "loss": 1.2296, + "step": 2011 + }, + { + "epoch": 0.27275808310174204, + "grad_norm": 1.75228006717395, + "learning_rate": 1.7066408517174832e-06, + "loss": 1.2293, + "step": 2012 + }, + { + "epoch": 0.2728936487494069, + "grad_norm": 2.0344178806945905, + "learning_rate": 1.706330061721321e-06, + "loss": 1.1864, + "step": 2013 + }, + { + "epoch": 0.2730292143970718, + "grad_norm": 2.6929870915010654, + "learning_rate": 1.7060191355149817e-06, + "loss": 1.1899, + "step": 2014 + }, + { + "epoch": 0.27316478004473665, + "grad_norm": 2.1212311255485528, + "learning_rate": 1.7057080731584252e-06, + "loss": 1.2593, + "step": 2015 + }, + { + "epoch": 0.2733003456924015, + "grad_norm": 3.2730851900877798, + "learning_rate": 1.7053968747116374e-06, + "loss": 1.1859, + "step": 2016 + }, + { + "epoch": 0.27343591134006645, + "grad_norm": 1.6506030753641825, + "learning_rate": 1.7050855402346303e-06, + "loss": 1.1718, + "step": 2017 + }, + { + "epoch": 0.2735714769877313, + "grad_norm": 1.6159195347123356, + "learning_rate": 1.7047740697874425e-06, + "loss": 1.2093, + "step": 2018 + }, + { + "epoch": 0.2737070426353962, + "grad_norm": 1.82238357646692, + "learning_rate": 1.7044624634301382e-06, + "loss": 1.2167, + "step": 2019 + }, + { + "epoch": 0.27384260828306106, + "grad_norm": 1.6019122206728154, + "learning_rate": 1.7041507212228088e-06, + "loss": 1.1943, + "step": 2020 + }, + { + "epoch": 0.27397817393072593, + "grad_norm": 6.848587024666462, + "learning_rate": 1.7038388432255709e-06, + "loss": 1.2007, + "step": 2021 + }, + { + "epoch": 0.27411373957839086, + "grad_norm": 1.9078961163175128, + "learning_rate": 1.7035268294985677e-06, + "loss": 1.2183, + "step": 2022 + }, + { + "epoch": 0.27424930522605573, + "grad_norm": 2.127575361504497, + "learning_rate": 1.703214680101969e-06, + "loss": 1.2075, + "step": 2023 + }, + { + "epoch": 0.2743848708737206, + "grad_norm": 2.4549771968253715, + "learning_rate": 1.70290239509597e-06, + "loss": 1.215, + "step": 2024 + }, + { + "epoch": 0.27452043652138547, + "grad_norm": 6.907893999737602, + "learning_rate": 1.7025899745407925e-06, + "loss": 1.1914, + "step": 2025 + }, + { + "epoch": 
0.27465600216905034, + "grad_norm": 2.0197401144852525, + "learning_rate": 1.7022774184966845e-06, + "loss": 1.2081, + "step": 2026 + }, + { + "epoch": 0.27479156781671527, + "grad_norm": 1.8806971015953928, + "learning_rate": 1.7019647270239194e-06, + "loss": 1.2142, + "step": 2027 + }, + { + "epoch": 0.27492713346438014, + "grad_norm": 1.7262894491410545, + "learning_rate": 1.7016519001827977e-06, + "loss": 1.2153, + "step": 2028 + }, + { + "epoch": 0.275062699112045, + "grad_norm": 1.9014911617842152, + "learning_rate": 1.7013389380336458e-06, + "loss": 1.2158, + "step": 2029 + }, + { + "epoch": 0.2751982647597099, + "grad_norm": 1.9142166750435208, + "learning_rate": 1.7010258406368157e-06, + "loss": 1.2174, + "step": 2030 + }, + { + "epoch": 0.27533383040737475, + "grad_norm": 2.4066841130476004, + "learning_rate": 1.7007126080526857e-06, + "loss": 1.1788, + "step": 2031 + }, + { + "epoch": 0.2754693960550397, + "grad_norm": 2.5617423894319953, + "learning_rate": 1.7003992403416603e-06, + "loss": 1.2553, + "step": 2032 + }, + { + "epoch": 0.27560496170270454, + "grad_norm": 2.159535960044901, + "learning_rate": 1.70008573756417e-06, + "loss": 1.1883, + "step": 2033 + }, + { + "epoch": 0.2757405273503694, + "grad_norm": 1.7273477774459494, + "learning_rate": 1.6997720997806714e-06, + "loss": 1.2536, + "step": 2034 + }, + { + "epoch": 0.2758760929980343, + "grad_norm": 2.2132798507965887, + "learning_rate": 1.699458327051647e-06, + "loss": 1.2336, + "step": 2035 + }, + { + "epoch": 0.27601165864569915, + "grad_norm": 1.723358615639129, + "learning_rate": 1.6991444194376054e-06, + "loss": 1.2527, + "step": 2036 + }, + { + "epoch": 0.2761472242933641, + "grad_norm": 2.229516785880859, + "learning_rate": 1.6988303769990813e-06, + "loss": 1.1656, + "step": 2037 + }, + { + "epoch": 0.27628278994102895, + "grad_norm": 2.610316621611733, + "learning_rate": 1.6985161997966352e-06, + "loss": 1.1975, + "step": 2038 + }, + { + "epoch": 0.2764183555886938, + "grad_norm": 1.9745607677742203, + "learning_rate": 1.6982018878908536e-06, + "loss": 1.2186, + "step": 2039 + }, + { + "epoch": 0.2765539212363587, + "grad_norm": 1.6286994894563178, + "learning_rate": 1.6978874413423495e-06, + "loss": 1.2046, + "step": 2040 + }, + { + "epoch": 0.27668948688402356, + "grad_norm": 2.786221909586749, + "learning_rate": 1.6975728602117609e-06, + "loss": 1.1974, + "step": 2041 + }, + { + "epoch": 0.2768250525316885, + "grad_norm": 4.1090051984403395, + "learning_rate": 1.6972581445597527e-06, + "loss": 1.2155, + "step": 2042 + }, + { + "epoch": 0.27696061817935336, + "grad_norm": 3.2684982092588846, + "learning_rate": 1.6969432944470148e-06, + "loss": 1.2178, + "step": 2043 + }, + { + "epoch": 0.2770961838270182, + "grad_norm": 1.6896511607556945, + "learning_rate": 1.6966283099342643e-06, + "loss": 1.2053, + "step": 2044 + }, + { + "epoch": 0.2772317494746831, + "grad_norm": 1.9351658918455241, + "learning_rate": 1.6963131910822427e-06, + "loss": 1.1694, + "step": 2045 + }, + { + "epoch": 0.277367315122348, + "grad_norm": 3.6734163282099086, + "learning_rate": 1.6959979379517186e-06, + "loss": 1.1727, + "step": 2046 + }, + { + "epoch": 0.2775028807700129, + "grad_norm": 2.097495122019846, + "learning_rate": 1.6956825506034863e-06, + "loss": 1.192, + "step": 2047 + }, + { + "epoch": 0.27763844641767776, + "grad_norm": 1.5087048942897219, + "learning_rate": 1.6953670290983656e-06, + "loss": 1.1626, + "step": 2048 + }, + { + "epoch": 0.27777401206534263, + "grad_norm": 1.924072371807599, + "learning_rate": 
1.6950513734972018e-06, + "loss": 1.1784, + "step": 2049 + }, + { + "epoch": 0.2779095777130075, + "grad_norm": 16.498881527489424, + "learning_rate": 1.6947355838608672e-06, + "loss": 1.1677, + "step": 2050 + }, + { + "epoch": 0.27804514336067243, + "grad_norm": 2.0335238059396614, + "learning_rate": 1.6944196602502593e-06, + "loss": 1.2225, + "step": 2051 + }, + { + "epoch": 0.2781807090083373, + "grad_norm": 2.3135864234474792, + "learning_rate": 1.694103602726301e-06, + "loss": 1.2247, + "step": 2052 + }, + { + "epoch": 0.27831627465600217, + "grad_norm": 1.9409159682641464, + "learning_rate": 1.6937874113499425e-06, + "loss": 1.1839, + "step": 2053 + }, + { + "epoch": 0.27845184030366704, + "grad_norm": 4.632869028500078, + "learning_rate": 1.6934710861821575e-06, + "loss": 1.2344, + "step": 2054 + }, + { + "epoch": 0.2785874059513319, + "grad_norm": 2.669667308075107, + "learning_rate": 1.6931546272839477e-06, + "loss": 1.2145, + "step": 2055 + }, + { + "epoch": 0.27872297159899684, + "grad_norm": 2.1851413787753398, + "learning_rate": 1.6928380347163396e-06, + "loss": 1.2156, + "step": 2056 + }, + { + "epoch": 0.2788585372466617, + "grad_norm": 1.910924180074857, + "learning_rate": 1.6925213085403849e-06, + "loss": 1.2034, + "step": 2057 + }, + { + "epoch": 0.2789941028943266, + "grad_norm": 1.8512089381647876, + "learning_rate": 1.6922044488171627e-06, + "loss": 1.2105, + "step": 2058 + }, + { + "epoch": 0.27912966854199145, + "grad_norm": 1.760560267404531, + "learning_rate": 1.6918874556077764e-06, + "loss": 1.2258, + "step": 2059 + }, + { + "epoch": 0.2792652341896563, + "grad_norm": 2.0087444128575584, + "learning_rate": 1.6915703289733558e-06, + "loss": 1.2311, + "step": 2060 + }, + { + "epoch": 0.27940079983732125, + "grad_norm": 2.0375819975502774, + "learning_rate": 1.6912530689750559e-06, + "loss": 1.2239, + "step": 2061 + }, + { + "epoch": 0.2795363654849861, + "grad_norm": 1.865290542492894, + "learning_rate": 1.6909356756740586e-06, + "loss": 1.1365, + "step": 2062 + }, + { + "epoch": 0.279671931132651, + "grad_norm": 2.8906595295639383, + "learning_rate": 1.6906181491315697e-06, + "loss": 1.1936, + "step": 2063 + }, + { + "epoch": 0.27980749678031586, + "grad_norm": 1.9108371199196077, + "learning_rate": 1.6903004894088223e-06, + "loss": 1.2062, + "step": 2064 + }, + { + "epoch": 0.2799430624279807, + "grad_norm": 2.328101651436945, + "learning_rate": 1.6899826965670742e-06, + "loss": 1.239, + "step": 2065 + }, + { + "epoch": 0.28007862807564565, + "grad_norm": 3.507202361243393, + "learning_rate": 1.6896647706676098e-06, + "loss": 1.2235, + "step": 2066 + }, + { + "epoch": 0.2802141937233105, + "grad_norm": 56.97069217645331, + "learning_rate": 1.6893467117717383e-06, + "loss": 1.2056, + "step": 2067 + }, + { + "epoch": 0.2803497593709754, + "grad_norm": 1.9765624357765756, + "learning_rate": 1.6890285199407945e-06, + "loss": 1.2378, + "step": 2068 + }, + { + "epoch": 0.28048532501864026, + "grad_norm": 2.1482719774028256, + "learning_rate": 1.6887101952361395e-06, + "loss": 1.2371, + "step": 2069 + }, + { + "epoch": 0.28062089066630513, + "grad_norm": 1.7808421675072619, + "learning_rate": 1.6883917377191602e-06, + "loss": 1.2108, + "step": 2070 + }, + { + "epoch": 0.28075645631397006, + "grad_norm": 2.652490027776899, + "learning_rate": 1.6880731474512677e-06, + "loss": 1.2074, + "step": 2071 + }, + { + "epoch": 0.28089202196163493, + "grad_norm": 2.3625530752493527, + "learning_rate": 1.6877544244938998e-06, + "loss": 1.2062, + "step": 2072 + }, + { + "epoch": 
0.2810275876092998, + "grad_norm": 1.8549956962635183, + "learning_rate": 1.6874355689085205e-06, + "loss": 1.2132, + "step": 2073 + }, + { + "epoch": 0.28116315325696467, + "grad_norm": 2.177830602711867, + "learning_rate": 1.6871165807566174e-06, + "loss": 1.175, + "step": 2074 + }, + { + "epoch": 0.28129871890462954, + "grad_norm": 3.3412499071246526, + "learning_rate": 1.686797460099706e-06, + "loss": 1.1801, + "step": 2075 + }, + { + "epoch": 0.28143428455229447, + "grad_norm": 10.153821860890586, + "learning_rate": 1.6864782069993252e-06, + "loss": 1.2084, + "step": 2076 + }, + { + "epoch": 0.28156985019995934, + "grad_norm": 1.7583427381681915, + "learning_rate": 1.6861588215170413e-06, + "loss": 1.2102, + "step": 2077 + }, + { + "epoch": 0.2817054158476242, + "grad_norm": 2.452910197573052, + "learning_rate": 1.6858393037144447e-06, + "loss": 1.1665, + "step": 2078 + }, + { + "epoch": 0.2818409814952891, + "grad_norm": 1.9060837452708423, + "learning_rate": 1.6855196536531522e-06, + "loss": 1.2196, + "step": 2079 + }, + { + "epoch": 0.28197654714295395, + "grad_norm": 2.18852253588501, + "learning_rate": 1.6851998713948055e-06, + "loss": 1.1884, + "step": 2080 + }, + { + "epoch": 0.2821121127906189, + "grad_norm": 3.4924473627834045, + "learning_rate": 1.6848799570010725e-06, + "loss": 1.2185, + "step": 2081 + }, + { + "epoch": 0.28224767843828374, + "grad_norm": 2.154466847341866, + "learning_rate": 1.6845599105336456e-06, + "loss": 1.2116, + "step": 2082 + }, + { + "epoch": 0.2823832440859486, + "grad_norm": 2.047256524655035, + "learning_rate": 1.6842397320542436e-06, + "loss": 1.2026, + "step": 2083 + }, + { + "epoch": 0.2825188097336135, + "grad_norm": 3.9430639590202574, + "learning_rate": 1.6839194216246107e-06, + "loss": 1.2256, + "step": 2084 + }, + { + "epoch": 0.2826543753812784, + "grad_norm": 2.107353298859395, + "learning_rate": 1.6835989793065152e-06, + "loss": 1.1837, + "step": 2085 + }, + { + "epoch": 0.2827899410289433, + "grad_norm": 1.9527113001228744, + "learning_rate": 1.683278405161753e-06, + "loss": 1.1814, + "step": 2086 + }, + { + "epoch": 0.28292550667660815, + "grad_norm": 2.52965118974388, + "learning_rate": 1.682957699252144e-06, + "loss": 1.1768, + "step": 2087 + }, + { + "epoch": 0.283061072324273, + "grad_norm": 2.25064668700667, + "learning_rate": 1.6826368616395331e-06, + "loss": 1.2012, + "step": 2088 + }, + { + "epoch": 0.2831966379719379, + "grad_norm": 2.0814585671497263, + "learning_rate": 1.6823158923857924e-06, + "loss": 1.1597, + "step": 2089 + }, + { + "epoch": 0.2833322036196028, + "grad_norm": 2.308304798478916, + "learning_rate": 1.6819947915528173e-06, + "loss": 1.1729, + "step": 2090 + }, + { + "epoch": 0.2834677692672677, + "grad_norm": 1.8074230372238163, + "learning_rate": 1.6816735592025303e-06, + "loss": 1.2489, + "step": 2091 + }, + { + "epoch": 0.28360333491493256, + "grad_norm": 1.8286641196356181, + "learning_rate": 1.681352195396878e-06, + "loss": 1.1897, + "step": 2092 + }, + { + "epoch": 0.28373890056259743, + "grad_norm": 2.364245153560911, + "learning_rate": 1.681030700197833e-06, + "loss": 1.2077, + "step": 2093 + }, + { + "epoch": 0.2838744662102623, + "grad_norm": 1.8840048649999237, + "learning_rate": 1.6807090736673932e-06, + "loss": 1.246, + "step": 2094 + }, + { + "epoch": 0.2840100318579272, + "grad_norm": 2.4551274786823503, + "learning_rate": 1.6803873158675823e-06, + "loss": 1.1767, + "step": 2095 + }, + { + "epoch": 0.2841455975055921, + "grad_norm": 2.4063347224707794, + "learning_rate": 
1.6800654268604478e-06, + "loss": 1.1655, + "step": 2096 + }, + { + "epoch": 0.28428116315325697, + "grad_norm": 2.429237495664338, + "learning_rate": 1.6797434067080635e-06, + "loss": 1.1965, + "step": 2097 + }, + { + "epoch": 0.28441672880092184, + "grad_norm": 1.9124615476668165, + "learning_rate": 1.679421255472529e-06, + "loss": 1.2048, + "step": 2098 + }, + { + "epoch": 0.2845522944485867, + "grad_norm": 2.013492093860587, + "learning_rate": 1.6790989732159685e-06, + "loss": 1.1734, + "step": 2099 + }, + { + "epoch": 0.28468786009625163, + "grad_norm": 1.9423752844514302, + "learning_rate": 1.6787765600005317e-06, + "loss": 1.2436, + "step": 2100 + }, + { + "epoch": 0.2848234257439165, + "grad_norm": 2.1281654233628515, + "learning_rate": 1.6784540158883928e-06, + "loss": 1.1794, + "step": 2101 + }, + { + "epoch": 0.2849589913915814, + "grad_norm": 6.6175631538288275, + "learning_rate": 1.6781313409417527e-06, + "loss": 1.1877, + "step": 2102 + }, + { + "epoch": 0.28509455703924624, + "grad_norm": 2.1594702461303297, + "learning_rate": 1.6778085352228362e-06, + "loss": 1.2189, + "step": 2103 + }, + { + "epoch": 0.2852301226869111, + "grad_norm": 2.2977896870204795, + "learning_rate": 1.6774855987938938e-06, + "loss": 1.1996, + "step": 2104 + }, + { + "epoch": 0.28536568833457604, + "grad_norm": 1.737071424695625, + "learning_rate": 1.6771625317172018e-06, + "loss": 1.173, + "step": 2105 + }, + { + "epoch": 0.2855012539822409, + "grad_norm": 1.6228221945248706, + "learning_rate": 1.6768393340550607e-06, + "loss": 1.1816, + "step": 2106 + }, + { + "epoch": 0.2856368196299058, + "grad_norm": 1.877985483961013, + "learning_rate": 1.6765160058697962e-06, + "loss": 1.2054, + "step": 2107 + }, + { + "epoch": 0.28577238527757065, + "grad_norm": 2.02298673787898, + "learning_rate": 1.6761925472237604e-06, + "loss": 1.186, + "step": 2108 + }, + { + "epoch": 0.2859079509252355, + "grad_norm": 1.9915813897261605, + "learning_rate": 1.6758689581793295e-06, + "loss": 1.1579, + "step": 2109 + }, + { + "epoch": 0.28604351657290045, + "grad_norm": 2.2050094693564697, + "learning_rate": 1.675545238798905e-06, + "loss": 1.1824, + "step": 2110 + }, + { + "epoch": 0.2861790822205653, + "grad_norm": 1.7963257827355217, + "learning_rate": 1.6752213891449134e-06, + "loss": 1.2048, + "step": 2111 + }, + { + "epoch": 0.2863146478682302, + "grad_norm": 1.8608938120341803, + "learning_rate": 1.674897409279807e-06, + "loss": 1.223, + "step": 2112 + }, + { + "epoch": 0.28645021351589506, + "grad_norm": 1.69204637310914, + "learning_rate": 1.6745732992660622e-06, + "loss": 1.2177, + "step": 2113 + }, + { + "epoch": 0.28658577916355993, + "grad_norm": 1.7942948392422169, + "learning_rate": 1.6742490591661817e-06, + "loss": 1.2199, + "step": 2114 + }, + { + "epoch": 0.28672134481122485, + "grad_norm": 1.8030913362052752, + "learning_rate": 1.6739246890426922e-06, + "loss": 1.2161, + "step": 2115 + }, + { + "epoch": 0.2868569104588897, + "grad_norm": 1.7356390972419997, + "learning_rate": 1.673600188958146e-06, + "loss": 1.1803, + "step": 2116 + }, + { + "epoch": 0.2869924761065546, + "grad_norm": 1.8390306891239623, + "learning_rate": 1.6732755589751208e-06, + "loss": 1.2212, + "step": 2117 + }, + { + "epoch": 0.28712804175421947, + "grad_norm": 1.949639662895768, + "learning_rate": 1.6729507991562181e-06, + "loss": 1.1933, + "step": 2118 + }, + { + "epoch": 0.28726360740188434, + "grad_norm": 1.8723253580292258, + "learning_rate": 1.6726259095640663e-06, + "loss": 1.1655, + "step": 2119 + }, + { + "epoch": 
0.28739917304954926, + "grad_norm": 2.0352482889566863, + "learning_rate": 1.6723008902613168e-06, + "loss": 1.1932, + "step": 2120 + }, + { + "epoch": 0.28753473869721413, + "grad_norm": 2.6810707490720516, + "learning_rate": 1.6719757413106475e-06, + "loss": 1.204, + "step": 2121 + }, + { + "epoch": 0.287670304344879, + "grad_norm": 1.7351700748497194, + "learning_rate": 1.6716504627747608e-06, + "loss": 1.1967, + "step": 2122 + }, + { + "epoch": 0.2878058699925439, + "grad_norm": 1.7987613125655457, + "learning_rate": 1.6713250547163839e-06, + "loss": 1.1776, + "step": 2123 + }, + { + "epoch": 0.2879414356402088, + "grad_norm": 1.8618118856143266, + "learning_rate": 1.6709995171982697e-06, + "loss": 1.1966, + "step": 2124 + }, + { + "epoch": 0.28807700128787367, + "grad_norm": 1.7637158290349664, + "learning_rate": 1.6706738502831948e-06, + "loss": 1.1658, + "step": 2125 + }, + { + "epoch": 0.28821256693553854, + "grad_norm": 1.6610712958590308, + "learning_rate": 1.6703480540339617e-06, + "loss": 1.1892, + "step": 2126 + }, + { + "epoch": 0.2883481325832034, + "grad_norm": 1.9637050916264724, + "learning_rate": 1.670022128513398e-06, + "loss": 1.1946, + "step": 2127 + }, + { + "epoch": 0.2884836982308683, + "grad_norm": 1.7628886931691294, + "learning_rate": 1.6696960737843556e-06, + "loss": 1.155, + "step": 2128 + }, + { + "epoch": 0.2886192638785332, + "grad_norm": 2.6754834259373714, + "learning_rate": 1.6693698899097117e-06, + "loss": 1.2, + "step": 2129 + }, + { + "epoch": 0.2887548295261981, + "grad_norm": 2.056701598480212, + "learning_rate": 1.6690435769523684e-06, + "loss": 1.2088, + "step": 2130 + }, + { + "epoch": 0.28889039517386295, + "grad_norm": 1.7386307932775047, + "learning_rate": 1.668717134975252e-06, + "loss": 1.2265, + "step": 2131 + }, + { + "epoch": 0.2890259608215278, + "grad_norm": 2.0722159060355594, + "learning_rate": 1.668390564041315e-06, + "loss": 1.2254, + "step": 2132 + }, + { + "epoch": 0.2891615264691927, + "grad_norm": 1.9990927067284145, + "learning_rate": 1.6680638642135334e-06, + "loss": 1.1943, + "step": 2133 + }, + { + "epoch": 0.2892970921168576, + "grad_norm": 1.7577573177390644, + "learning_rate": 1.667737035554909e-06, + "loss": 1.2147, + "step": 2134 + }, + { + "epoch": 0.2894326577645225, + "grad_norm": 1.7075968179760312, + "learning_rate": 1.6674100781284683e-06, + "loss": 1.2488, + "step": 2135 + }, + { + "epoch": 0.28956822341218735, + "grad_norm": 1.7857226378623905, + "learning_rate": 1.6670829919972622e-06, + "loss": 1.2024, + "step": 2136 + }, + { + "epoch": 0.2897037890598522, + "grad_norm": 2.3039393931753476, + "learning_rate": 1.6667557772243668e-06, + "loss": 1.208, + "step": 2137 + }, + { + "epoch": 0.2898393547075171, + "grad_norm": 2.059938924484952, + "learning_rate": 1.6664284338728824e-06, + "loss": 1.2026, + "step": 2138 + }, + { + "epoch": 0.289974920355182, + "grad_norm": 2.5566206662644095, + "learning_rate": 1.6661009620059355e-06, + "loss": 1.1876, + "step": 2139 + }, + { + "epoch": 0.2901104860028469, + "grad_norm": 6.500140900713756, + "learning_rate": 1.6657733616866755e-06, + "loss": 1.2115, + "step": 2140 + }, + { + "epoch": 0.29024605165051176, + "grad_norm": 2.124725213606999, + "learning_rate": 1.6654456329782783e-06, + "loss": 1.1755, + "step": 2141 + }, + { + "epoch": 0.29038161729817663, + "grad_norm": 6.52796639102791, + "learning_rate": 1.6651177759439432e-06, + "loss": 1.1903, + "step": 2142 + }, + { + "epoch": 0.2905171829458415, + "grad_norm": 1.619182486598753, + "learning_rate": 
1.6647897906468953e-06, + "loss": 1.1809, + "step": 2143 + }, + { + "epoch": 0.2906527485935064, + "grad_norm": 2.0591541774369317, + "learning_rate": 1.6644616771503838e-06, + "loss": 1.1749, + "step": 2144 + }, + { + "epoch": 0.2907883142411713, + "grad_norm": 2.153655973268168, + "learning_rate": 1.6641334355176827e-06, + "loss": 1.2291, + "step": 2145 + }, + { + "epoch": 0.29092387988883617, + "grad_norm": 1.9348362353899757, + "learning_rate": 1.6638050658120913e-06, + "loss": 1.2465, + "step": 2146 + }, + { + "epoch": 0.29105944553650104, + "grad_norm": 3.940685663916513, + "learning_rate": 1.6634765680969323e-06, + "loss": 1.2024, + "step": 2147 + }, + { + "epoch": 0.2911950111841659, + "grad_norm": 1.8746546643927466, + "learning_rate": 1.6631479424355548e-06, + "loss": 1.1814, + "step": 2148 + }, + { + "epoch": 0.29133057683183083, + "grad_norm": 1.6960318409941453, + "learning_rate": 1.6628191888913308e-06, + "loss": 1.2169, + "step": 2149 + }, + { + "epoch": 0.2914661424794957, + "grad_norm": 2.0668794137273014, + "learning_rate": 1.662490307527658e-06, + "loss": 1.2053, + "step": 2150 + }, + { + "epoch": 0.2916017081271606, + "grad_norm": 2.5143632229940573, + "learning_rate": 1.6621612984079592e-06, + "loss": 1.2146, + "step": 2151 + }, + { + "epoch": 0.29173727377482545, + "grad_norm": 2.2738913880223475, + "learning_rate": 1.6618321615956808e-06, + "loss": 1.1964, + "step": 2152 + }, + { + "epoch": 0.2918728394224903, + "grad_norm": 1.711875828190821, + "learning_rate": 1.661502897154294e-06, + "loss": 1.2001, + "step": 2153 + }, + { + "epoch": 0.29200840507015524, + "grad_norm": 2.9769039651688964, + "learning_rate": 1.6611735051472948e-06, + "loss": 1.205, + "step": 2154 + }, + { + "epoch": 0.2921439707178201, + "grad_norm": 1.968986955777571, + "learning_rate": 1.6608439856382046e-06, + "loss": 1.1676, + "step": 2155 + }, + { + "epoch": 0.292279536365485, + "grad_norm": 3.978417661666662, + "learning_rate": 1.660514338690568e-06, + "loss": 1.2086, + "step": 2156 + }, + { + "epoch": 0.29241510201314985, + "grad_norm": 1.7840758754590715, + "learning_rate": 1.6601845643679548e-06, + "loss": 1.1615, + "step": 2157 + }, + { + "epoch": 0.2925506676608147, + "grad_norm": 2.1494556598263994, + "learning_rate": 1.6598546627339598e-06, + "loss": 1.1988, + "step": 2158 + }, + { + "epoch": 0.29268623330847965, + "grad_norm": 1.7037427501680114, + "learning_rate": 1.6595246338522016e-06, + "loss": 1.2268, + "step": 2159 + }, + { + "epoch": 0.2928217989561445, + "grad_norm": 1.8935425756746458, + "learning_rate": 1.6591944777863237e-06, + "loss": 1.2231, + "step": 2160 + }, + { + "epoch": 0.2929573646038094, + "grad_norm": 3.5526543037607876, + "learning_rate": 1.6588641945999937e-06, + "loss": 1.1914, + "step": 2161 + }, + { + "epoch": 0.29309293025147426, + "grad_norm": 1.8497055113549847, + "learning_rate": 1.658533784356905e-06, + "loss": 1.2131, + "step": 2162 + }, + { + "epoch": 0.2932284958991392, + "grad_norm": 1.9968428804883902, + "learning_rate": 1.658203247120774e-06, + "loss": 1.2819, + "step": 2163 + }, + { + "epoch": 0.29336406154680406, + "grad_norm": 1.7477891890114565, + "learning_rate": 1.6578725829553425e-06, + "loss": 1.1955, + "step": 2164 + }, + { + "epoch": 0.2934996271944689, + "grad_norm": 1.7160609623144252, + "learning_rate": 1.6575417919243765e-06, + "loss": 1.1727, + "step": 2165 + }, + { + "epoch": 0.2936351928421338, + "grad_norm": 2.3711727327282315, + "learning_rate": 1.6572108740916657e-06, + "loss": 1.1809, + "step": 2166 + }, + { + "epoch": 
0.29377075848979867, + "grad_norm": 2.9781278601783767, + "learning_rate": 1.656879829521026e-06, + "loss": 1.2125, + "step": 2167 + }, + { + "epoch": 0.2939063241374636, + "grad_norm": 2.5679913234270426, + "learning_rate": 1.656548658276296e-06, + "loss": 1.2106, + "step": 2168 + }, + { + "epoch": 0.29404188978512846, + "grad_norm": 2.488334671154973, + "learning_rate": 1.6562173604213396e-06, + "loss": 1.1964, + "step": 2169 + }, + { + "epoch": 0.29417745543279333, + "grad_norm": 1.8982391606852884, + "learning_rate": 1.6558859360200454e-06, + "loss": 1.202, + "step": 2170 + }, + { + "epoch": 0.2943130210804582, + "grad_norm": 2.0246519262358906, + "learning_rate": 1.6555543851363256e-06, + "loss": 1.1947, + "step": 2171 + }, + { + "epoch": 0.2944485867281231, + "grad_norm": 3.1470925232922684, + "learning_rate": 1.6552227078341171e-06, + "loss": 1.1912, + "step": 2172 + }, + { + "epoch": 0.294584152375788, + "grad_norm": 2.9137663573356796, + "learning_rate": 1.6548909041773817e-06, + "loss": 1.1528, + "step": 2173 + }, + { + "epoch": 0.29471971802345287, + "grad_norm": 1.9275072338041788, + "learning_rate": 1.6545589742301048e-06, + "loss": 1.1927, + "step": 2174 + }, + { + "epoch": 0.29485528367111774, + "grad_norm": 2.3524392890369357, + "learning_rate": 1.6542269180562961e-06, + "loss": 1.2302, + "step": 2175 + }, + { + "epoch": 0.2949908493187826, + "grad_norm": 2.2372503740773406, + "learning_rate": 1.6538947357199907e-06, + "loss": 1.1587, + "step": 2176 + }, + { + "epoch": 0.2951264149664475, + "grad_norm": 2.0478500426964694, + "learning_rate": 1.6535624272852471e-06, + "loss": 1.1947, + "step": 2177 + }, + { + "epoch": 0.2952619806141124, + "grad_norm": 1.5868514750267033, + "learning_rate": 1.653229992816148e-06, + "loss": 1.1764, + "step": 2178 + }, + { + "epoch": 0.2953975462617773, + "grad_norm": 3.722753993315867, + "learning_rate": 1.6528974323768016e-06, + "loss": 1.1778, + "step": 2179 + }, + { + "epoch": 0.29553311190944215, + "grad_norm": 2.149453345144711, + "learning_rate": 1.6525647460313388e-06, + "loss": 1.186, + "step": 2180 + }, + { + "epoch": 0.295668677557107, + "grad_norm": 2.51233273845251, + "learning_rate": 1.6522319338439156e-06, + "loss": 1.2037, + "step": 2181 + }, + { + "epoch": 0.2958042432047719, + "grad_norm": 2.029492172350344, + "learning_rate": 1.6518989958787125e-06, + "loss": 1.1926, + "step": 2182 + }, + { + "epoch": 0.2959398088524368, + "grad_norm": 2.009128686508153, + "learning_rate": 1.6515659321999337e-06, + "loss": 1.2107, + "step": 2183 + }, + { + "epoch": 0.2960753745001017, + "grad_norm": 4.109508103108488, + "learning_rate": 1.6512327428718082e-06, + "loss": 1.1818, + "step": 2184 + }, + { + "epoch": 0.29621094014776655, + "grad_norm": 1.8476902180965527, + "learning_rate": 1.6508994279585885e-06, + "loss": 1.1479, + "step": 2185 + }, + { + "epoch": 0.2963465057954314, + "grad_norm": 2.633082600747477, + "learning_rate": 1.6505659875245524e-06, + "loss": 1.2074, + "step": 2186 + }, + { + "epoch": 0.2964820714430963, + "grad_norm": 1.8963662309779719, + "learning_rate": 1.6502324216340004e-06, + "loss": 1.1976, + "step": 2187 + }, + { + "epoch": 0.2966176370907612, + "grad_norm": 1.8199905823588225, + "learning_rate": 1.6498987303512588e-06, + "loss": 1.1896, + "step": 2188 + }, + { + "epoch": 0.2967532027384261, + "grad_norm": 1.6895648197273923, + "learning_rate": 1.649564913740677e-06, + "loss": 1.1743, + "step": 2189 + }, + { + "epoch": 0.29688876838609096, + "grad_norm": 1.8991415197199166, + "learning_rate": 
1.6492309718666289e-06, + "loss": 1.2091, + "step": 2190 + }, + { + "epoch": 0.29702433403375583, + "grad_norm": 1.8787156440037565, + "learning_rate": 1.6488969047935125e-06, + "loss": 1.172, + "step": 2191 + }, + { + "epoch": 0.2971598996814207, + "grad_norm": 4.77055156803168, + "learning_rate": 1.6485627125857504e-06, + "loss": 1.1889, + "step": 2192 + }, + { + "epoch": 0.29729546532908563, + "grad_norm": 2.2372314092027823, + "learning_rate": 1.6482283953077884e-06, + "loss": 1.1885, + "step": 2193 + }, + { + "epoch": 0.2974310309767505, + "grad_norm": 1.8719525836842081, + "learning_rate": 1.6478939530240971e-06, + "loss": 1.1978, + "step": 2194 + }, + { + "epoch": 0.29756659662441537, + "grad_norm": 2.6489725005675995, + "learning_rate": 1.6475593857991714e-06, + "loss": 1.1993, + "step": 2195 + }, + { + "epoch": 0.29770216227208024, + "grad_norm": 1.76984152663889, + "learning_rate": 1.6472246936975293e-06, + "loss": 1.1939, + "step": 2196 + }, + { + "epoch": 0.2978377279197451, + "grad_norm": 2.3304760256473513, + "learning_rate": 1.6468898767837142e-06, + "loss": 1.194, + "step": 2197 + }, + { + "epoch": 0.29797329356741004, + "grad_norm": 2.136735992016526, + "learning_rate": 1.6465549351222924e-06, + "loss": 1.2558, + "step": 2198 + }, + { + "epoch": 0.2981088592150749, + "grad_norm": 2.2057855167015914, + "learning_rate": 1.646219868777855e-06, + "loss": 1.1629, + "step": 2199 + }, + { + "epoch": 0.2982444248627398, + "grad_norm": 1.900439776551073, + "learning_rate": 1.645884677815017e-06, + "loss": 1.1749, + "step": 2200 + }, + { + "epoch": 0.29837999051040465, + "grad_norm": 2.444006192900403, + "learning_rate": 1.645549362298417e-06, + "loss": 1.2053, + "step": 2201 + }, + { + "epoch": 0.2985155561580696, + "grad_norm": 2.841671479839427, + "learning_rate": 1.6452139222927181e-06, + "loss": 1.2112, + "step": 2202 + }, + { + "epoch": 0.29865112180573444, + "grad_norm": 1.9302940872389718, + "learning_rate": 1.6448783578626076e-06, + "loss": 1.1689, + "step": 2203 + }, + { + "epoch": 0.2987866874533993, + "grad_norm": 3.1120028881541115, + "learning_rate": 1.6445426690727959e-06, + "loss": 1.1618, + "step": 2204 + }, + { + "epoch": 0.2989222531010642, + "grad_norm": 1.5911500064725004, + "learning_rate": 1.6442068559880182e-06, + "loss": 1.1836, + "step": 2205 + }, + { + "epoch": 0.29905781874872905, + "grad_norm": 2.1983990351354326, + "learning_rate": 1.6438709186730333e-06, + "loss": 1.1595, + "step": 2206 + }, + { + "epoch": 0.299193384396394, + "grad_norm": 1.8937351107634326, + "learning_rate": 1.6435348571926245e-06, + "loss": 1.1861, + "step": 2207 + }, + { + "epoch": 0.29932895004405885, + "grad_norm": 1.760121197931658, + "learning_rate": 1.6431986716115982e-06, + "loss": 1.2038, + "step": 2208 + }, + { + "epoch": 0.2994645156917237, + "grad_norm": 1.7721512554153036, + "learning_rate": 1.6428623619947848e-06, + "loss": 1.2381, + "step": 2209 + }, + { + "epoch": 0.2996000813393886, + "grad_norm": 1.8807510715939877, + "learning_rate": 1.6425259284070395e-06, + "loss": 1.204, + "step": 2210 + }, + { + "epoch": 0.29973564698705346, + "grad_norm": 4.5936638779876455, + "learning_rate": 1.6421893709132405e-06, + "loss": 1.1534, + "step": 2211 + }, + { + "epoch": 0.2998712126347184, + "grad_norm": 2.277683579204806, + "learning_rate": 1.641852689578291e-06, + "loss": 1.2031, + "step": 2212 + }, + { + "epoch": 0.30000677828238326, + "grad_norm": 2.3510995148466014, + "learning_rate": 1.6415158844671163e-06, + "loss": 1.2052, + "step": 2213 + }, + { + "epoch": 
0.3001423439300481, + "grad_norm": 1.9574212047697188, + "learning_rate": 1.6411789556446673e-06, + "loss": 1.1824, + "step": 2214 + }, + { + "epoch": 0.300277909577713, + "grad_norm": 2.041470263105566, + "learning_rate": 1.640841903175918e-06, + "loss": 1.2194, + "step": 2215 + }, + { + "epoch": 0.30041347522537787, + "grad_norm": 2.2593870426677354, + "learning_rate": 1.640504727125866e-06, + "loss": 1.1886, + "step": 2216 + }, + { + "epoch": 0.3005490408730428, + "grad_norm": 2.462546184826882, + "learning_rate": 1.640167427559533e-06, + "loss": 1.2493, + "step": 2217 + }, + { + "epoch": 0.30068460652070766, + "grad_norm": 1.9490493680263228, + "learning_rate": 1.639830004541965e-06, + "loss": 1.181, + "step": 2218 + }, + { + "epoch": 0.30082017216837253, + "grad_norm": 5.537081345413015, + "learning_rate": 1.6394924581382312e-06, + "loss": 1.2041, + "step": 2219 + }, + { + "epoch": 0.3009557378160374, + "grad_norm": 1.8029954181161583, + "learning_rate": 1.6391547884134247e-06, + "loss": 1.2141, + "step": 2220 + }, + { + "epoch": 0.3010913034637023, + "grad_norm": 2.179740945697593, + "learning_rate": 1.6388169954326623e-06, + "loss": 1.2172, + "step": 2221 + }, + { + "epoch": 0.3012268691113672, + "grad_norm": 1.8702388018446559, + "learning_rate": 1.6384790792610849e-06, + "loss": 1.2079, + "step": 2222 + }, + { + "epoch": 0.30136243475903207, + "grad_norm": 1.8606002911532011, + "learning_rate": 1.6381410399638571e-06, + "loss": 1.1681, + "step": 2223 + }, + { + "epoch": 0.30149800040669694, + "grad_norm": 1.556777641919625, + "learning_rate": 1.6378028776061666e-06, + "loss": 1.2194, + "step": 2224 + }, + { + "epoch": 0.3016335660543618, + "grad_norm": 2.2673951277805324, + "learning_rate": 1.6374645922532257e-06, + "loss": 1.1691, + "step": 2225 + }, + { + "epoch": 0.3017691317020267, + "grad_norm": 2.0799869363410495, + "learning_rate": 1.63712618397027e-06, + "loss": 1.2009, + "step": 2226 + }, + { + "epoch": 0.3019046973496916, + "grad_norm": 1.874641904651504, + "learning_rate": 1.636787652822559e-06, + "loss": 1.1878, + "step": 2227 + }, + { + "epoch": 0.3020402629973565, + "grad_norm": 1.9953059468881513, + "learning_rate": 1.6364489988753757e-06, + "loss": 1.1833, + "step": 2228 + }, + { + "epoch": 0.30217582864502135, + "grad_norm": 2.823169445725037, + "learning_rate": 1.6361102221940268e-06, + "loss": 1.2081, + "step": 2229 + }, + { + "epoch": 0.3023113942926862, + "grad_norm": 8.758735952148495, + "learning_rate": 1.6357713228438428e-06, + "loss": 1.145, + "step": 2230 + }, + { + "epoch": 0.3024469599403511, + "grad_norm": 1.7779429554105215, + "learning_rate": 1.6354323008901773e-06, + "loss": 1.1752, + "step": 2231 + }, + { + "epoch": 0.302582525588016, + "grad_norm": 1.6166089162929698, + "learning_rate": 1.6350931563984087e-06, + "loss": 1.2017, + "step": 2232 + }, + { + "epoch": 0.3027180912356809, + "grad_norm": 1.812252134905986, + "learning_rate": 1.6347538894339379e-06, + "loss": 1.1776, + "step": 2233 + }, + { + "epoch": 0.30285365688334576, + "grad_norm": 2.1781396293243356, + "learning_rate": 1.6344145000621898e-06, + "loss": 1.1628, + "step": 2234 + }, + { + "epoch": 0.3029892225310106, + "grad_norm": 2.0853236199482486, + "learning_rate": 1.6340749883486136e-06, + "loss": 1.2295, + "step": 2235 + }, + { + "epoch": 0.3031247881786755, + "grad_norm": 1.9862782621813326, + "learning_rate": 1.6337353543586808e-06, + "loss": 1.2077, + "step": 2236 + }, + { + "epoch": 0.3032603538263404, + "grad_norm": 1.860079501987115, + "learning_rate": 
1.6333955981578868e-06, + "loss": 1.2107, + "step": 2237 + }, + { + "epoch": 0.3033959194740053, + "grad_norm": 2.4178446917036496, + "learning_rate": 1.633055719811752e-06, + "loss": 1.2042, + "step": 2238 + }, + { + "epoch": 0.30353148512167016, + "grad_norm": 1.8122083498329797, + "learning_rate": 1.6327157193858182e-06, + "loss": 1.2309, + "step": 2239 + }, + { + "epoch": 0.30366705076933503, + "grad_norm": 2.394643352637598, + "learning_rate": 1.6323755969456526e-06, + "loss": 1.1812, + "step": 2240 + }, + { + "epoch": 0.30380261641699996, + "grad_norm": 2.5047030453991512, + "learning_rate": 1.6320353525568447e-06, + "loss": 1.1636, + "step": 2241 + }, + { + "epoch": 0.30393818206466483, + "grad_norm": 1.8293104400760678, + "learning_rate": 1.6316949862850082e-06, + "loss": 1.1786, + "step": 2242 + }, + { + "epoch": 0.3040737477123297, + "grad_norm": 1.8194683584249256, + "learning_rate": 1.6313544981957797e-06, + "loss": 1.2264, + "step": 2243 + }, + { + "epoch": 0.30420931335999457, + "grad_norm": 2.0014475782780314, + "learning_rate": 1.6310138883548199e-06, + "loss": 1.1783, + "step": 2244 + }, + { + "epoch": 0.30434487900765944, + "grad_norm": 2.37906702956568, + "learning_rate": 1.6306731568278126e-06, + "loss": 1.1879, + "step": 2245 + }, + { + "epoch": 0.30448044465532437, + "grad_norm": 1.8613444631925746, + "learning_rate": 1.6303323036804652e-06, + "loss": 1.227, + "step": 2246 + }, + { + "epoch": 0.30461601030298924, + "grad_norm": 2.84421382170422, + "learning_rate": 1.6299913289785087e-06, + "loss": 1.1753, + "step": 2247 + }, + { + "epoch": 0.3047515759506541, + "grad_norm": 1.679757632611884, + "learning_rate": 1.6296502327876974e-06, + "loss": 1.2047, + "step": 2248 + }, + { + "epoch": 0.304887141598319, + "grad_norm": 2.174459102161277, + "learning_rate": 1.6293090151738086e-06, + "loss": 1.1901, + "step": 2249 + }, + { + "epoch": 0.30502270724598385, + "grad_norm": 1.9346090102503977, + "learning_rate": 1.6289676762026438e-06, + "loss": 1.1628, + "step": 2250 + }, + { + "epoch": 0.3051582728936488, + "grad_norm": 1.8684807240413484, + "learning_rate": 1.6286262159400275e-06, + "loss": 1.1966, + "step": 2251 + }, + { + "epoch": 0.30529383854131364, + "grad_norm": 2.0204744891177784, + "learning_rate": 1.6282846344518073e-06, + "loss": 1.2048, + "step": 2252 + }, + { + "epoch": 0.3054294041889785, + "grad_norm": 2.25859839036469, + "learning_rate": 1.627942931803855e-06, + "loss": 1.185, + "step": 2253 + }, + { + "epoch": 0.3055649698366434, + "grad_norm": 2.7041430511338875, + "learning_rate": 1.627601108062065e-06, + "loss": 1.1865, + "step": 2254 + }, + { + "epoch": 0.30570053548430826, + "grad_norm": 1.6997017219036967, + "learning_rate": 1.6272591632923548e-06, + "loss": 1.193, + "step": 2255 + }, + { + "epoch": 0.3058361011319732, + "grad_norm": 1.7747684369253136, + "learning_rate": 1.6269170975606665e-06, + "loss": 1.172, + "step": 2256 + }, + { + "epoch": 0.30597166677963805, + "grad_norm": 1.6626900940809917, + "learning_rate": 1.6265749109329647e-06, + "loss": 1.1996, + "step": 2257 + }, + { + "epoch": 0.3061072324273029, + "grad_norm": 1.5977408096012706, + "learning_rate": 1.6262326034752371e-06, + "loss": 1.1514, + "step": 2258 + }, + { + "epoch": 0.3062427980749678, + "grad_norm": 1.840044322202103, + "learning_rate": 1.6258901752534947e-06, + "loss": 1.2147, + "step": 2259 + }, + { + "epoch": 0.30637836372263266, + "grad_norm": 2.750078663673782, + "learning_rate": 1.625547626333773e-06, + "loss": 1.2085, + "step": 2260 + }, + { + "epoch": 
0.3065139293702976, + "grad_norm": 1.9229539764760806, + "learning_rate": 1.6252049567821294e-06, + "loss": 1.2155, + "step": 2261 + }, + { + "epoch": 0.30664949501796246, + "grad_norm": 1.921901673997532, + "learning_rate": 1.6248621666646448e-06, + "loss": 1.1917, + "step": 2262 + }, + { + "epoch": 0.30678506066562733, + "grad_norm": 2.8008325849776017, + "learning_rate": 1.6245192560474237e-06, + "loss": 1.1575, + "step": 2263 + }, + { + "epoch": 0.3069206263132922, + "grad_norm": 3.5435230156892636, + "learning_rate": 1.6241762249965935e-06, + "loss": 1.179, + "step": 2264 + }, + { + "epoch": 0.30705619196095707, + "grad_norm": 1.7010013968011115, + "learning_rate": 1.6238330735783054e-06, + "loss": 1.1451, + "step": 2265 + }, + { + "epoch": 0.307191757608622, + "grad_norm": 1.7080219338806688, + "learning_rate": 1.6234898018587336e-06, + "loss": 1.2066, + "step": 2266 + }, + { + "epoch": 0.30732732325628687, + "grad_norm": 2.137586446284985, + "learning_rate": 1.6231464099040748e-06, + "loss": 1.1698, + "step": 2267 + }, + { + "epoch": 0.30746288890395174, + "grad_norm": 1.8574392221177456, + "learning_rate": 1.6228028977805495e-06, + "loss": 1.1995, + "step": 2268 + }, + { + "epoch": 0.3075984545516166, + "grad_norm": 1.7704553088168997, + "learning_rate": 1.6224592655544016e-06, + "loss": 1.193, + "step": 2269 + }, + { + "epoch": 0.3077340201992815, + "grad_norm": 2.077463672143214, + "learning_rate": 1.6221155132918979e-06, + "loss": 1.229, + "step": 2270 + }, + { + "epoch": 0.3078695858469464, + "grad_norm": 1.9168540965163834, + "learning_rate": 1.6217716410593281e-06, + "loss": 1.2016, + "step": 2271 + }, + { + "epoch": 0.3080051514946113, + "grad_norm": 1.7413607710615093, + "learning_rate": 1.621427648923005e-06, + "loss": 1.1935, + "step": 2272 + }, + { + "epoch": 0.30814071714227614, + "grad_norm": 1.6498152124669352, + "learning_rate": 1.6210835369492652e-06, + "loss": 1.1794, + "step": 2273 + }, + { + "epoch": 0.308276282789941, + "grad_norm": 1.9931546449579627, + "learning_rate": 1.6207393052044678e-06, + "loss": 1.2, + "step": 2274 + }, + { + "epoch": 0.3084118484376059, + "grad_norm": 1.8001590212355463, + "learning_rate": 1.6203949537549954e-06, + "loss": 1.2023, + "step": 2275 + }, + { + "epoch": 0.3085474140852708, + "grad_norm": 2.664314543957132, + "learning_rate": 1.6200504826672533e-06, + "loss": 1.1884, + "step": 2276 + }, + { + "epoch": 0.3086829797329357, + "grad_norm": 1.7569941226145525, + "learning_rate": 1.6197058920076696e-06, + "loss": 1.2095, + "step": 2277 + }, + { + "epoch": 0.30881854538060055, + "grad_norm": 2.611652207460391, + "learning_rate": 1.6193611818426968e-06, + "loss": 1.2095, + "step": 2278 + }, + { + "epoch": 0.3089541110282654, + "grad_norm": 1.9270641008151719, + "learning_rate": 1.6190163522388088e-06, + "loss": 1.184, + "step": 2279 + }, + { + "epoch": 0.3090896766759303, + "grad_norm": 1.8149665292967512, + "learning_rate": 1.6186714032625033e-06, + "loss": 1.1545, + "step": 2280 + }, + { + "epoch": 0.3092252423235952, + "grad_norm": 2.3357399814406175, + "learning_rate": 1.6183263349803014e-06, + "loss": 1.2161, + "step": 2281 + }, + { + "epoch": 0.3093608079712601, + "grad_norm": 2.056279574929264, + "learning_rate": 1.6179811474587464e-06, + "loss": 1.2046, + "step": 2282 + }, + { + "epoch": 0.30949637361892496, + "grad_norm": 1.905601737556078, + "learning_rate": 1.6176358407644055e-06, + "loss": 1.2033, + "step": 2283 + }, + { + "epoch": 0.30963193926658983, + "grad_norm": 1.928975439336167, + "learning_rate": 
1.6172904149638677e-06, + "loss": 1.2385, + "step": 2284 + }, + { + "epoch": 0.30976750491425475, + "grad_norm": 1.7152832091268675, + "learning_rate": 1.616944870123746e-06, + "loss": 1.2026, + "step": 2285 + }, + { + "epoch": 0.3099030705619196, + "grad_norm": 1.6684044537564178, + "learning_rate": 1.616599206310676e-06, + "loss": 1.1951, + "step": 2286 + }, + { + "epoch": 0.3100386362095845, + "grad_norm": 1.8957375240856906, + "learning_rate": 1.616253423591316e-06, + "loss": 1.1943, + "step": 2287 + }, + { + "epoch": 0.31017420185724937, + "grad_norm": 2.289621205956834, + "learning_rate": 1.6159075220323482e-06, + "loss": 1.2033, + "step": 2288 + }, + { + "epoch": 0.31030976750491424, + "grad_norm": 2.7861844382832386, + "learning_rate": 1.6155615017004762e-06, + "loss": 1.1766, + "step": 2289 + }, + { + "epoch": 0.31044533315257916, + "grad_norm": 1.6353191075966713, + "learning_rate": 1.6152153626624275e-06, + "loss": 1.2034, + "step": 2290 + }, + { + "epoch": 0.31058089880024403, + "grad_norm": 1.9758914410149042, + "learning_rate": 1.6148691049849523e-06, + "loss": 1.1628, + "step": 2291 + }, + { + "epoch": 0.3107164644479089, + "grad_norm": 4.513181351938792, + "learning_rate": 1.6145227287348238e-06, + "loss": 1.2108, + "step": 2292 + }, + { + "epoch": 0.3108520300955738, + "grad_norm": 3.215503102446753, + "learning_rate": 1.6141762339788376e-06, + "loss": 1.1996, + "step": 2293 + }, + { + "epoch": 0.31098759574323864, + "grad_norm": 1.7669993499293175, + "learning_rate": 1.6138296207838127e-06, + "loss": 1.2234, + "step": 2294 + }, + { + "epoch": 0.31112316139090357, + "grad_norm": 2.2832754222538565, + "learning_rate": 1.6134828892165907e-06, + "loss": 1.2077, + "step": 2295 + }, + { + "epoch": 0.31125872703856844, + "grad_norm": 2.7128900928749107, + "learning_rate": 1.6131360393440362e-06, + "loss": 1.1574, + "step": 2296 + }, + { + "epoch": 0.3113942926862333, + "grad_norm": 1.9741055850099936, + "learning_rate": 1.6127890712330364e-06, + "loss": 1.1761, + "step": 2297 + }, + { + "epoch": 0.3115298583338982, + "grad_norm": 3.598640784538275, + "learning_rate": 1.6124419849505013e-06, + "loss": 1.176, + "step": 2298 + }, + { + "epoch": 0.31166542398156305, + "grad_norm": 2.2446223481019927, + "learning_rate": 1.6120947805633636e-06, + "loss": 1.1998, + "step": 2299 + }, + { + "epoch": 0.311800989629228, + "grad_norm": 3.943054804838593, + "learning_rate": 1.6117474581385788e-06, + "loss": 1.1729, + "step": 2300 + }, + { + "epoch": 0.31193655527689285, + "grad_norm": 3.147485264007524, + "learning_rate": 1.611400017743126e-06, + "loss": 1.1726, + "step": 2301 + }, + { + "epoch": 0.3120721209245577, + "grad_norm": 2.2319718914182713, + "learning_rate": 1.6110524594440055e-06, + "loss": 1.1699, + "step": 2302 + }, + { + "epoch": 0.3122076865722226, + "grad_norm": 3.209676769249188, + "learning_rate": 1.6107047833082418e-06, + "loss": 1.166, + "step": 2303 + }, + { + "epoch": 0.31234325221988746, + "grad_norm": 2.455614154260562, + "learning_rate": 1.6103569894028813e-06, + "loss": 1.2108, + "step": 2304 + }, + { + "epoch": 0.3124788178675524, + "grad_norm": 1.7580359432542223, + "learning_rate": 1.6100090777949928e-06, + "loss": 1.1606, + "step": 2305 + }, + { + "epoch": 0.31261438351521725, + "grad_norm": 1.8071094379130266, + "learning_rate": 1.6096610485516693e-06, + "loss": 1.1957, + "step": 2306 + }, + { + "epoch": 0.3127499491628821, + "grad_norm": 1.7608356453173768, + "learning_rate": 1.6093129017400248e-06, + "loss": 1.2085, + "step": 2307 + }, + { + "epoch": 
0.312885514810547, + "grad_norm": 1.920758731712591, + "learning_rate": 1.6089646374271965e-06, + "loss": 1.2255, + "step": 2308 + }, + { + "epoch": 0.31302108045821186, + "grad_norm": 1.9871219729630167, + "learning_rate": 1.6086162556803453e-06, + "loss": 1.1803, + "step": 2309 + }, + { + "epoch": 0.3131566461058768, + "grad_norm": 2.218893975500075, + "learning_rate": 1.608267756566653e-06, + "loss": 1.1969, + "step": 2310 + }, + { + "epoch": 0.31329221175354166, + "grad_norm": 1.618996916061544, + "learning_rate": 1.607919140153325e-06, + "loss": 1.1793, + "step": 2311 + }, + { + "epoch": 0.31342777740120653, + "grad_norm": 2.1919513991854185, + "learning_rate": 1.6075704065075897e-06, + "loss": 1.2088, + "step": 2312 + }, + { + "epoch": 0.3135633430488714, + "grad_norm": 3.19122736607462, + "learning_rate": 1.6072215556966975e-06, + "loss": 1.2304, + "step": 2313 + }, + { + "epoch": 0.31369890869653627, + "grad_norm": 1.8319292386700048, + "learning_rate": 1.6068725877879213e-06, + "loss": 1.1663, + "step": 2314 + }, + { + "epoch": 0.3138344743442012, + "grad_norm": 1.8129914515359036, + "learning_rate": 1.6065235028485567e-06, + "loss": 1.1541, + "step": 2315 + }, + { + "epoch": 0.31397003999186607, + "grad_norm": 2.007151728112373, + "learning_rate": 1.6061743009459225e-06, + "loss": 1.1936, + "step": 2316 + }, + { + "epoch": 0.31410560563953094, + "grad_norm": 1.8774895354094774, + "learning_rate": 1.605824982147359e-06, + "loss": 1.1784, + "step": 2317 + }, + { + "epoch": 0.3142411712871958, + "grad_norm": 2.1926346429131662, + "learning_rate": 1.6054755465202296e-06, + "loss": 1.1677, + "step": 2318 + }, + { + "epoch": 0.3143767369348607, + "grad_norm": 2.6148146571483997, + "learning_rate": 1.6051259941319209e-06, + "loss": 1.1757, + "step": 2319 + }, + { + "epoch": 0.3145123025825256, + "grad_norm": 2.082490046477907, + "learning_rate": 1.6047763250498405e-06, + "loss": 1.2003, + "step": 2320 + }, + { + "epoch": 0.3146478682301905, + "grad_norm": 2.069112013702786, + "learning_rate": 1.6044265393414196e-06, + "loss": 1.2362, + "step": 2321 + }, + { + "epoch": 0.31478343387785535, + "grad_norm": 1.8144216938781306, + "learning_rate": 1.6040766370741117e-06, + "loss": 1.1967, + "step": 2322 + }, + { + "epoch": 0.3149189995255202, + "grad_norm": 5.277595569915457, + "learning_rate": 1.6037266183153925e-06, + "loss": 1.2247, + "step": 2323 + }, + { + "epoch": 0.31505456517318514, + "grad_norm": 1.7435505017329305, + "learning_rate": 1.6033764831327607e-06, + "loss": 1.2214, + "step": 2324 + }, + { + "epoch": 0.31519013082085, + "grad_norm": 2.717961575568524, + "learning_rate": 1.6030262315937368e-06, + "loss": 1.2566, + "step": 2325 + }, + { + "epoch": 0.3153256964685149, + "grad_norm": 1.7903361916975618, + "learning_rate": 1.6026758637658642e-06, + "loss": 1.2169, + "step": 2326 + }, + { + "epoch": 0.31546126211617975, + "grad_norm": 7.574131051975267, + "learning_rate": 1.6023253797167084e-06, + "loss": 1.2058, + "step": 2327 + }, + { + "epoch": 0.3155968277638446, + "grad_norm": 2.2334702592299696, + "learning_rate": 1.6019747795138576e-06, + "loss": 1.1812, + "step": 2328 + }, + { + "epoch": 0.31573239341150955, + "grad_norm": 1.6988040780382723, + "learning_rate": 1.6016240632249222e-06, + "loss": 1.2177, + "step": 2329 + }, + { + "epoch": 0.3158679590591744, + "grad_norm": 1.7321644624924282, + "learning_rate": 1.6012732309175356e-06, + "loss": 1.2063, + "step": 2330 + }, + { + "epoch": 0.3160035247068393, + "grad_norm": 1.7914401119201215, + "learning_rate": 
1.600922282659352e-06, + "loss": 1.1761, + "step": 2331 + }, + { + "epoch": 0.31613909035450416, + "grad_norm": 1.8510557185855412, + "learning_rate": 1.60057121851805e-06, + "loss": 1.1654, + "step": 2332 + }, + { + "epoch": 0.31627465600216903, + "grad_norm": 1.5981946987558169, + "learning_rate": 1.600220038561329e-06, + "loss": 1.2032, + "step": 2333 + }, + { + "epoch": 0.31641022164983396, + "grad_norm": 3.042604552339144, + "learning_rate": 1.5998687428569113e-06, + "loss": 1.2083, + "step": 2334 + }, + { + "epoch": 0.3165457872974988, + "grad_norm": 2.503819060490965, + "learning_rate": 1.5995173314725419e-06, + "loss": 1.1743, + "step": 2335 + }, + { + "epoch": 0.3166813529451637, + "grad_norm": 2.4090653007049068, + "learning_rate": 1.5991658044759871e-06, + "loss": 1.1897, + "step": 2336 + }, + { + "epoch": 0.31681691859282857, + "grad_norm": 1.7548179275754718, + "learning_rate": 1.5988141619350363e-06, + "loss": 1.1803, + "step": 2337 + }, + { + "epoch": 0.31695248424049344, + "grad_norm": 2.1080566211435143, + "learning_rate": 1.5984624039175016e-06, + "loss": 1.2493, + "step": 2338 + }, + { + "epoch": 0.31708804988815836, + "grad_norm": 2.6831006825904833, + "learning_rate": 1.5981105304912159e-06, + "loss": 1.2081, + "step": 2339 + }, + { + "epoch": 0.31722361553582323, + "grad_norm": 1.9275669628202698, + "learning_rate": 1.5977585417240358e-06, + "loss": 1.1622, + "step": 2340 + }, + { + "epoch": 0.3173591811834881, + "grad_norm": 2.139435800076176, + "learning_rate": 1.5974064376838392e-06, + "loss": 1.1817, + "step": 2341 + }, + { + "epoch": 0.317494746831153, + "grad_norm": 2.964691359327291, + "learning_rate": 1.5970542184385268e-06, + "loss": 1.1907, + "step": 2342 + }, + { + "epoch": 0.31763031247881784, + "grad_norm": 2.964809947456507, + "learning_rate": 1.5967018840560212e-06, + "loss": 1.1739, + "step": 2343 + }, + { + "epoch": 0.31776587812648277, + "grad_norm": 1.676308284858603, + "learning_rate": 1.5963494346042674e-06, + "loss": 1.2059, + "step": 2344 + }, + { + "epoch": 0.31790144377414764, + "grad_norm": 1.8001221439106787, + "learning_rate": 1.5959968701512326e-06, + "loss": 1.189, + "step": 2345 + }, + { + "epoch": 0.3180370094218125, + "grad_norm": 1.7281500445588287, + "learning_rate": 1.5956441907649057e-06, + "loss": 1.1731, + "step": 2346 + }, + { + "epoch": 0.3181725750694774, + "grad_norm": 2.116026698032713, + "learning_rate": 1.595291396513298e-06, + "loss": 1.2144, + "step": 2347 + }, + { + "epoch": 0.31830814071714225, + "grad_norm": 1.862698839572492, + "learning_rate": 1.594938487464444e-06, + "loss": 1.1912, + "step": 2348 + }, + { + "epoch": 0.3184437063648072, + "grad_norm": 1.8514774158097367, + "learning_rate": 1.5945854636863987e-06, + "loss": 1.2098, + "step": 2349 + }, + { + "epoch": 0.31857927201247205, + "grad_norm": 2.2814898910676757, + "learning_rate": 1.59423232524724e-06, + "loss": 1.2208, + "step": 2350 + }, + { + "epoch": 0.3187148376601369, + "grad_norm": 2.5822642406052116, + "learning_rate": 1.593879072215068e-06, + "loss": 1.2003, + "step": 2351 + }, + { + "epoch": 0.3188504033078018, + "grad_norm": 1.994603971022834, + "learning_rate": 1.5935257046580048e-06, + "loss": 1.188, + "step": 2352 + }, + { + "epoch": 0.31898596895546666, + "grad_norm": 2.103261296337051, + "learning_rate": 1.5931722226441945e-06, + "loss": 1.1918, + "step": 2353 + }, + { + "epoch": 0.3191215346031316, + "grad_norm": 1.8366833964943077, + "learning_rate": 1.5928186262418032e-06, + "loss": 1.2001, + "step": 2354 + }, + { + "epoch": 
0.31925710025079646, + "grad_norm": 2.5413246215767087, + "learning_rate": 1.5924649155190191e-06, + "loss": 1.193, + "step": 2355 + }, + { + "epoch": 0.3193926658984613, + "grad_norm": 4.678983143516526, + "learning_rate": 1.5921110905440526e-06, + "loss": 1.2163, + "step": 2356 + }, + { + "epoch": 0.3195282315461262, + "grad_norm": 1.7200384189979778, + "learning_rate": 1.5917571513851364e-06, + "loss": 1.1879, + "step": 2357 + }, + { + "epoch": 0.31966379719379107, + "grad_norm": 2.0008884131610976, + "learning_rate": 1.5914030981105246e-06, + "loss": 1.1869, + "step": 2358 + }, + { + "epoch": 0.319799362841456, + "grad_norm": 1.6899528032668096, + "learning_rate": 1.5910489307884936e-06, + "loss": 1.1487, + "step": 2359 + }, + { + "epoch": 0.31993492848912086, + "grad_norm": 3.6802582240871033, + "learning_rate": 1.5906946494873415e-06, + "loss": 1.197, + "step": 2360 + }, + { + "epoch": 0.32007049413678573, + "grad_norm": 1.7939328719899728, + "learning_rate": 1.590340254275389e-06, + "loss": 1.2272, + "step": 2361 + }, + { + "epoch": 0.3202060597844506, + "grad_norm": 2.556559175250382, + "learning_rate": 1.5899857452209787e-06, + "loss": 1.1685, + "step": 2362 + }, + { + "epoch": 0.32034162543211553, + "grad_norm": 1.76473706201516, + "learning_rate": 1.589631122392474e-06, + "loss": 1.1829, + "step": 2363 + }, + { + "epoch": 0.3204771910797804, + "grad_norm": 2.3292665352015467, + "learning_rate": 1.5892763858582618e-06, + "loss": 1.1825, + "step": 2364 + }, + { + "epoch": 0.32061275672744527, + "grad_norm": 1.8235382444892767, + "learning_rate": 1.58892153568675e-06, + "loss": 1.1739, + "step": 2365 + }, + { + "epoch": 0.32074832237511014, + "grad_norm": 2.4579698969687556, + "learning_rate": 1.588566571946369e-06, + "loss": 1.1819, + "step": 2366 + }, + { + "epoch": 0.320883888022775, + "grad_norm": 2.08183446073054, + "learning_rate": 1.58821149470557e-06, + "loss": 1.1759, + "step": 2367 + }, + { + "epoch": 0.32101945367043994, + "grad_norm": 2.5801151462976097, + "learning_rate": 1.5878563040328276e-06, + "loss": 1.1973, + "step": 2368 + }, + { + "epoch": 0.3211550193181048, + "grad_norm": 1.8617498326647135, + "learning_rate": 1.5875009999966371e-06, + "loss": 1.1981, + "step": 2369 + }, + { + "epoch": 0.3212905849657697, + "grad_norm": 2.9025238917823737, + "learning_rate": 1.5871455826655163e-06, + "loss": 1.1739, + "step": 2370 + }, + { + "epoch": 0.32142615061343455, + "grad_norm": 3.669205355144678, + "learning_rate": 1.5867900521080044e-06, + "loss": 1.1761, + "step": 2371 + }, + { + "epoch": 0.3215617162610994, + "grad_norm": 2.189267572820922, + "learning_rate": 1.586434408392663e-06, + "loss": 1.2017, + "step": 2372 + }, + { + "epoch": 0.32169728190876434, + "grad_norm": 1.5415578040636744, + "learning_rate": 1.5860786515880745e-06, + "loss": 1.1748, + "step": 2373 + }, + { + "epoch": 0.3218328475564292, + "grad_norm": 7.525716946631925, + "learning_rate": 1.5857227817628447e-06, + "loss": 1.202, + "step": 2374 + }, + { + "epoch": 0.3219684132040941, + "grad_norm": 2.773901841913629, + "learning_rate": 1.5853667989855999e-06, + "loss": 1.1656, + "step": 2375 + }, + { + "epoch": 0.32210397885175895, + "grad_norm": 1.753300976087697, + "learning_rate": 1.5850107033249884e-06, + "loss": 1.2171, + "step": 2376 + }, + { + "epoch": 0.3222395444994238, + "grad_norm": 2.371841255753591, + "learning_rate": 1.5846544948496807e-06, + "loss": 1.1735, + "step": 2377 + }, + { + "epoch": 0.32237511014708875, + "grad_norm": 68.59727760703478, + "learning_rate": 
1.5842981736283685e-06, + "loss": 1.1914, + "step": 2378 + }, + { + "epoch": 0.3225106757947536, + "grad_norm": 2.157807125935284, + "learning_rate": 1.5839417397297656e-06, + "loss": 1.1738, + "step": 2379 + }, + { + "epoch": 0.3226462414424185, + "grad_norm": 1.8038084031732553, + "learning_rate": 1.5835851932226074e-06, + "loss": 1.2626, + "step": 2380 + }, + { + "epoch": 0.32278180709008336, + "grad_norm": 2.1883940185703565, + "learning_rate": 1.5832285341756517e-06, + "loss": 1.1856, + "step": 2381 + }, + { + "epoch": 0.32291737273774823, + "grad_norm": 2.2485764003305024, + "learning_rate": 1.5828717626576766e-06, + "loss": 1.1751, + "step": 2382 + }, + { + "epoch": 0.32305293838541316, + "grad_norm": 1.5955389509494415, + "learning_rate": 1.582514878737483e-06, + "loss": 1.1686, + "step": 2383 + }, + { + "epoch": 0.32318850403307803, + "grad_norm": 2.900276839467996, + "learning_rate": 1.5821578824838932e-06, + "loss": 1.1772, + "step": 2384 + }, + { + "epoch": 0.3233240696807429, + "grad_norm": 2.597001555680131, + "learning_rate": 1.5818007739657512e-06, + "loss": 1.2106, + "step": 2385 + }, + { + "epoch": 0.32345963532840777, + "grad_norm": 1.695318730560217, + "learning_rate": 1.5814435532519221e-06, + "loss": 1.1718, + "step": 2386 + }, + { + "epoch": 0.32359520097607264, + "grad_norm": 1.7200079062541487, + "learning_rate": 1.5810862204112933e-06, + "loss": 1.1837, + "step": 2387 + }, + { + "epoch": 0.32373076662373756, + "grad_norm": 3.4940088934942315, + "learning_rate": 1.580728775512774e-06, + "loss": 1.1895, + "step": 2388 + }, + { + "epoch": 0.32386633227140244, + "grad_norm": 1.733033949232732, + "learning_rate": 1.5803712186252943e-06, + "loss": 1.1857, + "step": 2389 + }, + { + "epoch": 0.3240018979190673, + "grad_norm": 2.340735669732916, + "learning_rate": 1.5800135498178065e-06, + "loss": 1.214, + "step": 2390 + }, + { + "epoch": 0.3241374635667322, + "grad_norm": 1.5928710078927775, + "learning_rate": 1.5796557691592835e-06, + "loss": 1.2011, + "step": 2391 + }, + { + "epoch": 0.32427302921439705, + "grad_norm": 1.5885395499850958, + "learning_rate": 1.579297876718721e-06, + "loss": 1.1927, + "step": 2392 + }, + { + "epoch": 0.32440859486206197, + "grad_norm": 1.6597622807497499, + "learning_rate": 1.5789398725651358e-06, + "loss": 1.1663, + "step": 2393 + }, + { + "epoch": 0.32454416050972684, + "grad_norm": 2.347610027673513, + "learning_rate": 1.5785817567675661e-06, + "loss": 1.1904, + "step": 2394 + }, + { + "epoch": 0.3246797261573917, + "grad_norm": 1.7761256973166875, + "learning_rate": 1.5782235293950717e-06, + "loss": 1.1641, + "step": 2395 + }, + { + "epoch": 0.3248152918050566, + "grad_norm": 2.3095403626085433, + "learning_rate": 1.5778651905167334e-06, + "loss": 1.1624, + "step": 2396 + }, + { + "epoch": 0.32495085745272145, + "grad_norm": 1.6810471431692982, + "learning_rate": 1.577506740201655e-06, + "loss": 1.1677, + "step": 2397 + }, + { + "epoch": 0.3250864231003864, + "grad_norm": 1.8982594401604969, + "learning_rate": 1.5771481785189601e-06, + "loss": 1.1258, + "step": 2398 + }, + { + "epoch": 0.32522198874805125, + "grad_norm": 1.8326515519164222, + "learning_rate": 1.5767895055377948e-06, + "loss": 1.2094, + "step": 2399 + }, + { + "epoch": 0.3253575543957161, + "grad_norm": 2.674116301324435, + "learning_rate": 1.5764307213273264e-06, + "loss": 1.1983, + "step": 2400 + }, + { + "epoch": 0.325493120043381, + "grad_norm": 1.7692601857342702, + "learning_rate": 1.5760718259567432e-06, + "loss": 1.2092, + "step": 2401 + }, + { + "epoch": 
0.3256286856910459, + "grad_norm": 1.9916061970422123, + "learning_rate": 1.5757128194952557e-06, + "loss": 1.1628, + "step": 2402 + }, + { + "epoch": 0.3257642513387108, + "grad_norm": 1.9526500446810025, + "learning_rate": 1.5753537020120952e-06, + "loss": 1.2166, + "step": 2403 + }, + { + "epoch": 0.32589981698637566, + "grad_norm": 2.308959561440273, + "learning_rate": 1.5749944735765153e-06, + "loss": 1.2111, + "step": 2404 + }, + { + "epoch": 0.3260353826340405, + "grad_norm": 1.8231238226282869, + "learning_rate": 1.5746351342577895e-06, + "loss": 1.2032, + "step": 2405 + }, + { + "epoch": 0.3261709482817054, + "grad_norm": 7.126366900676986, + "learning_rate": 1.5742756841252143e-06, + "loss": 1.1895, + "step": 2406 + }, + { + "epoch": 0.3263065139293703, + "grad_norm": 3.188759457478089, + "learning_rate": 1.573916123248106e-06, + "loss": 1.1881, + "step": 2407 + }, + { + "epoch": 0.3264420795770352, + "grad_norm": 1.9792736358895073, + "learning_rate": 1.5735564516958039e-06, + "loss": 1.1943, + "step": 2408 + }, + { + "epoch": 0.32657764522470006, + "grad_norm": 2.0891723114214567, + "learning_rate": 1.5731966695376672e-06, + "loss": 1.1984, + "step": 2409 + }, + { + "epoch": 0.32671321087236493, + "grad_norm": 2.0033001002067263, + "learning_rate": 1.5728367768430775e-06, + "loss": 1.1537, + "step": 2410 + }, + { + "epoch": 0.3268487765200298, + "grad_norm": 3.423316734189896, + "learning_rate": 1.572476773681437e-06, + "loss": 1.1947, + "step": 2411 + }, + { + "epoch": 0.32698434216769473, + "grad_norm": 1.6968606671406574, + "learning_rate": 1.5721166601221695e-06, + "loss": 1.2036, + "step": 2412 + }, + { + "epoch": 0.3271199078153596, + "grad_norm": 1.743284227691782, + "learning_rate": 1.5717564362347203e-06, + "loss": 1.2049, + "step": 2413 + }, + { + "epoch": 0.32725547346302447, + "grad_norm": 1.7229229315698729, + "learning_rate": 1.5713961020885553e-06, + "loss": 1.1847, + "step": 2414 + }, + { + "epoch": 0.32739103911068934, + "grad_norm": 1.6957414828072255, + "learning_rate": 1.5710356577531628e-06, + "loss": 1.1683, + "step": 2415 + }, + { + "epoch": 0.3275266047583542, + "grad_norm": 2.380128091049466, + "learning_rate": 1.5706751032980506e-06, + "loss": 1.1984, + "step": 2416 + }, + { + "epoch": 0.32766217040601914, + "grad_norm": 2.1240982649122326, + "learning_rate": 1.5703144387927499e-06, + "loss": 1.1909, + "step": 2417 + }, + { + "epoch": 0.327797736053684, + "grad_norm": 2.737087291458293, + "learning_rate": 1.5699536643068113e-06, + "loss": 1.2277, + "step": 2418 + }, + { + "epoch": 0.3279333017013489, + "grad_norm": 2.028119015882871, + "learning_rate": 1.5695927799098071e-06, + "loss": 1.1995, + "step": 2419 + }, + { + "epoch": 0.32806886734901375, + "grad_norm": 2.0191425977838473, + "learning_rate": 1.5692317856713318e-06, + "loss": 1.2135, + "step": 2420 + }, + { + "epoch": 0.3282044329966786, + "grad_norm": 1.649619755499313, + "learning_rate": 1.5688706816609995e-06, + "loss": 1.1989, + "step": 2421 + }, + { + "epoch": 0.32833999864434354, + "grad_norm": 1.7087725559114972, + "learning_rate": 1.5685094679484472e-06, + "loss": 1.1932, + "step": 2422 + }, + { + "epoch": 0.3284755642920084, + "grad_norm": 1.6687399580615676, + "learning_rate": 1.5681481446033312e-06, + "loss": 1.2106, + "step": 2423 + }, + { + "epoch": 0.3286111299396733, + "grad_norm": 1.8020656440109233, + "learning_rate": 1.56778671169533e-06, + "loss": 1.2429, + "step": 2424 + }, + { + "epoch": 0.32874669558733816, + "grad_norm": 1.8633653595784057, + "learning_rate": 
1.5674251692941436e-06, + "loss": 1.2038, + "step": 2425 + }, + { + "epoch": 0.328882261235003, + "grad_norm": 1.5038314507707338, + "learning_rate": 1.5670635174694923e-06, + "loss": 1.2095, + "step": 2426 + }, + { + "epoch": 0.32901782688266795, + "grad_norm": 1.806884550090567, + "learning_rate": 1.5667017562911176e-06, + "loss": 1.1994, + "step": 2427 + }, + { + "epoch": 0.3291533925303328, + "grad_norm": 1.6157984887414354, + "learning_rate": 1.5663398858287824e-06, + "loss": 1.2011, + "step": 2428 + }, + { + "epoch": 0.3292889581779977, + "grad_norm": 1.8144471966918216, + "learning_rate": 1.565977906152271e-06, + "loss": 1.1449, + "step": 2429 + }, + { + "epoch": 0.32942452382566256, + "grad_norm": 4.628465274694856, + "learning_rate": 1.5656158173313876e-06, + "loss": 1.1752, + "step": 2430 + }, + { + "epoch": 0.32956008947332743, + "grad_norm": 2.0048080109977056, + "learning_rate": 1.5652536194359586e-06, + "loss": 1.2045, + "step": 2431 + }, + { + "epoch": 0.32969565512099236, + "grad_norm": 2.0481918915197426, + "learning_rate": 1.5648913125358312e-06, + "loss": 1.1825, + "step": 2432 + }, + { + "epoch": 0.32983122076865723, + "grad_norm": 1.8420079402662166, + "learning_rate": 1.564528896700873e-06, + "loss": 1.205, + "step": 2433 + }, + { + "epoch": 0.3299667864163221, + "grad_norm": 1.713342693722228, + "learning_rate": 1.5641663720009732e-06, + "loss": 1.1822, + "step": 2434 + }, + { + "epoch": 0.33010235206398697, + "grad_norm": 2.0328326174993214, + "learning_rate": 1.5638037385060416e-06, + "loss": 1.21, + "step": 2435 + }, + { + "epoch": 0.33023791771165184, + "grad_norm": 1.7087365548981412, + "learning_rate": 1.5634409962860096e-06, + "loss": 1.2004, + "step": 2436 + }, + { + "epoch": 0.33037348335931677, + "grad_norm": 2.1076738784397504, + "learning_rate": 1.5630781454108291e-06, + "loss": 1.165, + "step": 2437 + }, + { + "epoch": 0.33050904900698164, + "grad_norm": 2.181901926302262, + "learning_rate": 1.5627151859504726e-06, + "loss": 1.1948, + "step": 2438 + }, + { + "epoch": 0.3306446146546465, + "grad_norm": 1.7849478232003582, + "learning_rate": 1.5623521179749346e-06, + "loss": 1.2024, + "step": 2439 + }, + { + "epoch": 0.3307801803023114, + "grad_norm": 2.111358783592567, + "learning_rate": 1.5619889415542296e-06, + "loss": 1.2077, + "step": 2440 + }, + { + "epoch": 0.3309157459499763, + "grad_norm": 1.6236866559910408, + "learning_rate": 1.5616256567583932e-06, + "loss": 1.2116, + "step": 2441 + }, + { + "epoch": 0.3310513115976412, + "grad_norm": 2.3399528760533226, + "learning_rate": 1.561262263657482e-06, + "loss": 1.2342, + "step": 2442 + }, + { + "epoch": 0.33118687724530604, + "grad_norm": 1.9996397664949865, + "learning_rate": 1.5608987623215736e-06, + "loss": 1.1946, + "step": 2443 + }, + { + "epoch": 0.3313224428929709, + "grad_norm": 2.8841753904769303, + "learning_rate": 1.5605351528207664e-06, + "loss": 1.2056, + "step": 2444 + }, + { + "epoch": 0.3314580085406358, + "grad_norm": 1.9924245742078566, + "learning_rate": 1.5601714352251798e-06, + "loss": 1.2381, + "step": 2445 + }, + { + "epoch": 0.3315935741883007, + "grad_norm": 1.9704929392894277, + "learning_rate": 1.5598076096049533e-06, + "loss": 1.191, + "step": 2446 + }, + { + "epoch": 0.3317291398359656, + "grad_norm": 1.6539866426885148, + "learning_rate": 1.5594436760302483e-06, + "loss": 1.1838, + "step": 2447 + }, + { + "epoch": 0.33186470548363045, + "grad_norm": 3.5853981546106866, + "learning_rate": 1.5590796345712465e-06, + "loss": 1.1992, + "step": 2448 + }, + { + "epoch": 
0.3320002711312953, + "grad_norm": 1.6957382704214101, + "learning_rate": 1.55871548529815e-06, + "loss": 1.17, + "step": 2449 + }, + { + "epoch": 0.3321358367789602, + "grad_norm": 1.7376941408619442, + "learning_rate": 1.5583512282811826e-06, + "loss": 1.1942, + "step": 2450 + }, + { + "epoch": 0.3322714024266251, + "grad_norm": 1.6233582525613806, + "learning_rate": 1.557986863590588e-06, + "loss": 1.1418, + "step": 2451 + }, + { + "epoch": 0.33240696807429, + "grad_norm": 1.706880698972644, + "learning_rate": 1.5576223912966313e-06, + "loss": 1.1682, + "step": 2452 + }, + { + "epoch": 0.33254253372195486, + "grad_norm": 2.061122680131451, + "learning_rate": 1.557257811469598e-06, + "loss": 1.1722, + "step": 2453 + }, + { + "epoch": 0.33267809936961973, + "grad_norm": 1.7826682747891966, + "learning_rate": 1.5568931241797947e-06, + "loss": 1.2137, + "step": 2454 + }, + { + "epoch": 0.3328136650172846, + "grad_norm": 1.692529206459833, + "learning_rate": 1.556528329497548e-06, + "loss": 1.1799, + "step": 2455 + }, + { + "epoch": 0.3329492306649495, + "grad_norm": 2.0611946573502387, + "learning_rate": 1.5561634274932061e-06, + "loss": 1.1767, + "step": 2456 + }, + { + "epoch": 0.3330847963126144, + "grad_norm": 2.2935644157597954, + "learning_rate": 1.555798418237137e-06, + "loss": 1.1881, + "step": 2457 + }, + { + "epoch": 0.33322036196027927, + "grad_norm": 2.110002859106921, + "learning_rate": 1.5554333017997306e-06, + "loss": 1.1995, + "step": 2458 + }, + { + "epoch": 0.33335592760794414, + "grad_norm": 2.0585519392752714, + "learning_rate": 1.5550680782513962e-06, + "loss": 1.1752, + "step": 2459 + }, + { + "epoch": 0.333491493255609, + "grad_norm": 1.7034845899574267, + "learning_rate": 1.554702747662564e-06, + "loss": 1.2045, + "step": 2460 + }, + { + "epoch": 0.33362705890327393, + "grad_norm": 2.0998535459379277, + "learning_rate": 1.5543373101036856e-06, + "loss": 1.1966, + "step": 2461 + }, + { + "epoch": 0.3337626245509388, + "grad_norm": 1.8854787547830003, + "learning_rate": 1.5539717656452327e-06, + "loss": 1.1859, + "step": 2462 + }, + { + "epoch": 0.3338981901986037, + "grad_norm": 1.94580735103245, + "learning_rate": 1.5536061143576978e-06, + "loss": 1.1838, + "step": 2463 + }, + { + "epoch": 0.33403375584626854, + "grad_norm": 1.872070571897341, + "learning_rate": 1.5532403563115932e-06, + "loss": 1.2036, + "step": 2464 + }, + { + "epoch": 0.3341693214939334, + "grad_norm": 4.277135083729698, + "learning_rate": 1.5528744915774532e-06, + "loss": 1.1878, + "step": 2465 + }, + { + "epoch": 0.33430488714159834, + "grad_norm": 3.100323632921928, + "learning_rate": 1.5525085202258316e-06, + "loss": 1.1905, + "step": 2466 + }, + { + "epoch": 0.3344404527892632, + "grad_norm": 2.349778460567662, + "learning_rate": 1.552142442327303e-06, + "loss": 1.1727, + "step": 2467 + }, + { + "epoch": 0.3345760184369281, + "grad_norm": 1.7962203259923106, + "learning_rate": 1.5517762579524628e-06, + "loss": 1.1523, + "step": 2468 + }, + { + "epoch": 0.33471158408459295, + "grad_norm": 2.571551705531622, + "learning_rate": 1.5514099671719267e-06, + "loss": 1.2073, + "step": 2469 + }, + { + "epoch": 0.3348471497322578, + "grad_norm": 2.2424041495968154, + "learning_rate": 1.551043570056331e-06, + "loss": 1.2084, + "step": 2470 + }, + { + "epoch": 0.33498271537992275, + "grad_norm": 1.4741386424421896, + "learning_rate": 1.5506770666763324e-06, + "loss": 1.1573, + "step": 2471 + }, + { + "epoch": 0.3351182810275876, + "grad_norm": 1.8742786508107505, + "learning_rate": 1.5503104571026084e-06, 
+ "loss": 1.2056, + "step": 2472 + }, + { + "epoch": 0.3352538466752525, + "grad_norm": 4.022739347259859, + "learning_rate": 1.5499437414058564e-06, + "loss": 1.1965, + "step": 2473 + }, + { + "epoch": 0.33538941232291736, + "grad_norm": 2.7204556947000484, + "learning_rate": 1.5495769196567955e-06, + "loss": 1.1762, + "step": 2474 + }, + { + "epoch": 0.3355249779705822, + "grad_norm": 1.978220299294861, + "learning_rate": 1.5492099919261632e-06, + "loss": 1.2345, + "step": 2475 + }, + { + "epoch": 0.33566054361824715, + "grad_norm": 1.8277276952723245, + "learning_rate": 1.5488429582847192e-06, + "loss": 1.1996, + "step": 2476 + }, + { + "epoch": 0.335796109265912, + "grad_norm": 1.7871699855076735, + "learning_rate": 1.5484758188032433e-06, + "loss": 1.1908, + "step": 2477 + }, + { + "epoch": 0.3359316749135769, + "grad_norm": 1.8106661833822746, + "learning_rate": 1.5481085735525348e-06, + "loss": 1.2189, + "step": 2478 + }, + { + "epoch": 0.33606724056124176, + "grad_norm": 1.8528082394508463, + "learning_rate": 1.5477412226034145e-06, + "loss": 1.1902, + "step": 2479 + }, + { + "epoch": 0.3362028062089067, + "grad_norm": 2.0532654030982243, + "learning_rate": 1.547373766026723e-06, + "loss": 1.2358, + "step": 2480 + }, + { + "epoch": 0.33633837185657156, + "grad_norm": 2.9672313740656806, + "learning_rate": 1.5470062038933213e-06, + "loss": 1.1957, + "step": 2481 + }, + { + "epoch": 0.33647393750423643, + "grad_norm": 1.7560769572703927, + "learning_rate": 1.5466385362740911e-06, + "loss": 1.2068, + "step": 2482 + }, + { + "epoch": 0.3366095031519013, + "grad_norm": 2.85184269209754, + "learning_rate": 1.5462707632399342e-06, + "loss": 1.1557, + "step": 2483 + }, + { + "epoch": 0.33674506879956617, + "grad_norm": 2.7513464405978074, + "learning_rate": 1.5459028848617726e-06, + "loss": 1.1897, + "step": 2484 + }, + { + "epoch": 0.3368806344472311, + "grad_norm": 1.8945303870766226, + "learning_rate": 1.5455349012105486e-06, + "loss": 1.2076, + "step": 2485 + }, + { + "epoch": 0.33701620009489597, + "grad_norm": 2.108499100751231, + "learning_rate": 1.545166812357225e-06, + "loss": 1.1618, + "step": 2486 + }, + { + "epoch": 0.33715176574256084, + "grad_norm": 1.9123794432831405, + "learning_rate": 1.5447986183727852e-06, + "loss": 1.2047, + "step": 2487 + }, + { + "epoch": 0.3372873313902257, + "grad_norm": 1.6967414384993158, + "learning_rate": 1.5444303193282324e-06, + "loss": 1.1982, + "step": 2488 + }, + { + "epoch": 0.3374228970378906, + "grad_norm": 1.8469229297580347, + "learning_rate": 1.5440619152945896e-06, + "loss": 1.1407, + "step": 2489 + }, + { + "epoch": 0.3375584626855555, + "grad_norm": 1.8892616314803083, + "learning_rate": 1.5436934063429013e-06, + "loss": 1.2332, + "step": 2490 + }, + { + "epoch": 0.3376940283332204, + "grad_norm": 1.7964514552682571, + "learning_rate": 1.5433247925442308e-06, + "loss": 1.1679, + "step": 2491 + }, + { + "epoch": 0.33782959398088525, + "grad_norm": 1.7500023703056589, + "learning_rate": 1.542956073969663e-06, + "loss": 1.168, + "step": 2492 + }, + { + "epoch": 0.3379651596285501, + "grad_norm": 1.5998856232554122, + "learning_rate": 1.5425872506903024e-06, + "loss": 1.1532, + "step": 2493 + }, + { + "epoch": 0.338100725276215, + "grad_norm": 2.026303378268343, + "learning_rate": 1.542218322777273e-06, + "loss": 1.1772, + "step": 2494 + }, + { + "epoch": 0.3382362909238799, + "grad_norm": 1.7152422359238224, + "learning_rate": 1.5418492903017204e-06, + "loss": 1.1963, + "step": 2495 + }, + { + "epoch": 0.3383718565715448, + 
"grad_norm": 2.3387397854098593, + "learning_rate": 1.5414801533348091e-06, + "loss": 1.2313, + "step": 2496 + }, + { + "epoch": 0.33850742221920965, + "grad_norm": 3.3137328260473993, + "learning_rate": 1.5411109119477247e-06, + "loss": 1.2125, + "step": 2497 + }, + { + "epoch": 0.3386429878668745, + "grad_norm": 2.133459407144816, + "learning_rate": 1.5407415662116718e-06, + "loss": 1.2246, + "step": 2498 + }, + { + "epoch": 0.3387785535145394, + "grad_norm": 1.9049048163731714, + "learning_rate": 1.5403721161978764e-06, + "loss": 1.1679, + "step": 2499 + }, + { + "epoch": 0.3389141191622043, + "grad_norm": 2.3545071172752956, + "learning_rate": 1.5400025619775838e-06, + "loss": 1.1526, + "step": 2500 + }, + { + "epoch": 0.3390496848098692, + "grad_norm": 1.8474219556292033, + "learning_rate": 1.5396329036220598e-06, + "loss": 1.1833, + "step": 2501 + }, + { + "epoch": 0.33918525045753406, + "grad_norm": 1.6855260340149882, + "learning_rate": 1.5392631412025898e-06, + "loss": 1.1897, + "step": 2502 + }, + { + "epoch": 0.33932081610519893, + "grad_norm": 1.660733420945779, + "learning_rate": 1.5388932747904797e-06, + "loss": 1.1833, + "step": 2503 + }, + { + "epoch": 0.3394563817528638, + "grad_norm": 2.5452126052285293, + "learning_rate": 1.5385233044570554e-06, + "loss": 1.1886, + "step": 2504 + }, + { + "epoch": 0.3395919474005287, + "grad_norm": 1.9996061410021657, + "learning_rate": 1.5381532302736627e-06, + "loss": 1.1985, + "step": 2505 + }, + { + "epoch": 0.3397275130481936, + "grad_norm": 1.903813248534084, + "learning_rate": 1.5377830523116675e-06, + "loss": 1.1797, + "step": 2506 + }, + { + "epoch": 0.33986307869585847, + "grad_norm": 2.0069823231634887, + "learning_rate": 1.5374127706424553e-06, + "loss": 1.1793, + "step": 2507 + }, + { + "epoch": 0.33999864434352334, + "grad_norm": 1.8159731387560234, + "learning_rate": 1.5370423853374325e-06, + "loss": 1.1872, + "step": 2508 + }, + { + "epoch": 0.3401342099911882, + "grad_norm": 2.2734673859044614, + "learning_rate": 1.5366718964680253e-06, + "loss": 1.1689, + "step": 2509 + }, + { + "epoch": 0.34026977563885313, + "grad_norm": 2.3776396418641235, + "learning_rate": 1.5363013041056787e-06, + "loss": 1.2157, + "step": 2510 + }, + { + "epoch": 0.340405341286518, + "grad_norm": 1.6908369070122415, + "learning_rate": 1.5359306083218588e-06, + "loss": 1.1753, + "step": 2511 + }, + { + "epoch": 0.3405409069341829, + "grad_norm": 2.33435421459921, + "learning_rate": 1.5355598091880517e-06, + "loss": 1.1618, + "step": 2512 + }, + { + "epoch": 0.34067647258184774, + "grad_norm": 1.8407138940533572, + "learning_rate": 1.5351889067757627e-06, + "loss": 1.2252, + "step": 2513 + }, + { + "epoch": 0.3408120382295126, + "grad_norm": 1.7460002729461421, + "learning_rate": 1.5348179011565176e-06, + "loss": 1.2369, + "step": 2514 + }, + { + "epoch": 0.34094760387717754, + "grad_norm": 4.213853807853341, + "learning_rate": 1.5344467924018619e-06, + "loss": 1.1422, + "step": 2515 + }, + { + "epoch": 0.3410831695248424, + "grad_norm": 2.265239469008891, + "learning_rate": 1.534075580583361e-06, + "loss": 1.1416, + "step": 2516 + }, + { + "epoch": 0.3412187351725073, + "grad_norm": 2.1645056665653883, + "learning_rate": 1.5337042657726e-06, + "loss": 1.2073, + "step": 2517 + }, + { + "epoch": 0.34135430082017215, + "grad_norm": 2.09240071318638, + "learning_rate": 1.5333328480411842e-06, + "loss": 1.1497, + "step": 2518 + }, + { + "epoch": 0.3414898664678371, + "grad_norm": 1.8859661602050894, + "learning_rate": 1.5329613274607387e-06, + "loss": 
1.2041, + "step": 2519 + }, + { + "epoch": 0.34162543211550195, + "grad_norm": 2.618221951652812, + "learning_rate": 1.5325897041029078e-06, + "loss": 1.2106, + "step": 2520 + }, + { + "epoch": 0.3417609977631668, + "grad_norm": 1.9977709690182806, + "learning_rate": 1.5322179780393567e-06, + "loss": 1.1734, + "step": 2521 + }, + { + "epoch": 0.3418965634108317, + "grad_norm": 1.8390340861853098, + "learning_rate": 1.5318461493417694e-06, + "loss": 1.2025, + "step": 2522 + }, + { + "epoch": 0.34203212905849656, + "grad_norm": 2.191976082208833, + "learning_rate": 1.5314742180818504e-06, + "loss": 1.2129, + "step": 2523 + }, + { + "epoch": 0.3421676947061615, + "grad_norm": 1.9345647209858814, + "learning_rate": 1.5311021843313238e-06, + "loss": 1.1812, + "step": 2524 + }, + { + "epoch": 0.34230326035382636, + "grad_norm": 1.650243297345723, + "learning_rate": 1.5307300481619332e-06, + "loss": 1.225, + "step": 2525 + }, + { + "epoch": 0.3424388260014912, + "grad_norm": 1.83368387644837, + "learning_rate": 1.5303578096454422e-06, + "loss": 1.1777, + "step": 2526 + }, + { + "epoch": 0.3425743916491561, + "grad_norm": 1.916274456935737, + "learning_rate": 1.5299854688536339e-06, + "loss": 1.1744, + "step": 2527 + }, + { + "epoch": 0.34270995729682097, + "grad_norm": 1.907302583075146, + "learning_rate": 1.5296130258583113e-06, + "loss": 1.1902, + "step": 2528 + }, + { + "epoch": 0.3428455229444859, + "grad_norm": 1.7556885478068733, + "learning_rate": 1.5292404807312971e-06, + "loss": 1.1881, + "step": 2529 + }, + { + "epoch": 0.34298108859215076, + "grad_norm": 2.3591889858798805, + "learning_rate": 1.5288678335444342e-06, + "loss": 1.1896, + "step": 2530 + }, + { + "epoch": 0.34311665423981563, + "grad_norm": 2.0957816776102884, + "learning_rate": 1.5284950843695838e-06, + "loss": 1.1473, + "step": 2531 + }, + { + "epoch": 0.3432522198874805, + "grad_norm": 2.4984873240009953, + "learning_rate": 1.5281222332786282e-06, + "loss": 1.1965, + "step": 2532 + }, + { + "epoch": 0.3433877855351454, + "grad_norm": 2.856843577128895, + "learning_rate": 1.527749280343469e-06, + "loss": 1.1764, + "step": 2533 + }, + { + "epoch": 0.3435233511828103, + "grad_norm": 1.849355896240357, + "learning_rate": 1.527376225636026e-06, + "loss": 1.1963, + "step": 2534 + }, + { + "epoch": 0.34365891683047517, + "grad_norm": 5.64080760378772, + "learning_rate": 1.5270030692282415e-06, + "loss": 1.1804, + "step": 2535 + }, + { + "epoch": 0.34379448247814004, + "grad_norm": 1.9262014386373993, + "learning_rate": 1.526629811192075e-06, + "loss": 1.1443, + "step": 2536 + }, + { + "epoch": 0.3439300481258049, + "grad_norm": 1.8338665254730877, + "learning_rate": 1.5262564515995062e-06, + "loss": 1.2053, + "step": 2537 + }, + { + "epoch": 0.3440656137734698, + "grad_norm": 1.9290666677940678, + "learning_rate": 1.5258829905225348e-06, + "loss": 1.1643, + "step": 2538 + }, + { + "epoch": 0.3442011794211347, + "grad_norm": 1.9949356826869031, + "learning_rate": 1.5255094280331795e-06, + "loss": 1.1948, + "step": 2539 + }, + { + "epoch": 0.3443367450687996, + "grad_norm": 2.0473437280254108, + "learning_rate": 1.5251357642034793e-06, + "loss": 1.166, + "step": 2540 + }, + { + "epoch": 0.34447231071646445, + "grad_norm": 1.7243355900499235, + "learning_rate": 1.524761999105492e-06, + "loss": 1.1558, + "step": 2541 + }, + { + "epoch": 0.3446078763641293, + "grad_norm": 2.08060272580144, + "learning_rate": 1.5243881328112953e-06, + "loss": 1.1933, + "step": 2542 + }, + { + "epoch": 0.3447434420117942, + "grad_norm": 
3.908576232021684, + "learning_rate": 1.5240141653929868e-06, + "loss": 1.2073, + "step": 2543 + }, + { + "epoch": 0.3448790076594591, + "grad_norm": 2.4358888434997548, + "learning_rate": 1.5236400969226828e-06, + "loss": 1.1761, + "step": 2544 + }, + { + "epoch": 0.345014573307124, + "grad_norm": 1.780739534097406, + "learning_rate": 1.5232659274725195e-06, + "loss": 1.1762, + "step": 2545 + }, + { + "epoch": 0.34515013895478885, + "grad_norm": 1.7840183359870534, + "learning_rate": 1.5228916571146522e-06, + "loss": 1.1774, + "step": 2546 + }, + { + "epoch": 0.3452857046024537, + "grad_norm": 2.002154760071911, + "learning_rate": 1.5225172859212565e-06, + "loss": 1.2081, + "step": 2547 + }, + { + "epoch": 0.3454212702501186, + "grad_norm": 5.684929779015611, + "learning_rate": 1.5221428139645266e-06, + "loss": 1.1734, + "step": 2548 + }, + { + "epoch": 0.3455568358977835, + "grad_norm": 1.6564922228281822, + "learning_rate": 1.5217682413166767e-06, + "loss": 1.1848, + "step": 2549 + }, + { + "epoch": 0.3456924015454484, + "grad_norm": 2.298333251986971, + "learning_rate": 1.5213935680499397e-06, + "loss": 1.1358, + "step": 2550 + }, + { + "epoch": 0.34582796719311326, + "grad_norm": 2.119098153023781, + "learning_rate": 1.521018794236569e-06, + "loss": 1.1572, + "step": 2551 + }, + { + "epoch": 0.34596353284077813, + "grad_norm": 2.7079956836803176, + "learning_rate": 1.5206439199488366e-06, + "loss": 1.2046, + "step": 2552 + }, + { + "epoch": 0.346099098488443, + "grad_norm": 3.235038481473252, + "learning_rate": 1.5202689452590339e-06, + "loss": 1.1901, + "step": 2553 + }, + { + "epoch": 0.34623466413610793, + "grad_norm": 2.5558117337805877, + "learning_rate": 1.5198938702394717e-06, + "loss": 1.1548, + "step": 2554 + }, + { + "epoch": 0.3463702297837728, + "grad_norm": 1.8825860942704093, + "learning_rate": 1.5195186949624804e-06, + "loss": 1.1734, + "step": 2555 + }, + { + "epoch": 0.34650579543143767, + "grad_norm": 1.714715736103818, + "learning_rate": 1.5191434195004098e-06, + "loss": 1.1564, + "step": 2556 + }, + { + "epoch": 0.34664136107910254, + "grad_norm": 2.1821841415583916, + "learning_rate": 1.5187680439256285e-06, + "loss": 1.1588, + "step": 2557 + }, + { + "epoch": 0.34677692672676746, + "grad_norm": 3.261541107289295, + "learning_rate": 1.5183925683105251e-06, + "loss": 1.1837, + "step": 2558 + }, + { + "epoch": 0.34691249237443234, + "grad_norm": 1.7643191199377948, + "learning_rate": 1.5180169927275066e-06, + "loss": 1.1733, + "step": 2559 + }, + { + "epoch": 0.3470480580220972, + "grad_norm": 1.6601984167774337, + "learning_rate": 1.517641317249e-06, + "loss": 1.2322, + "step": 2560 + }, + { + "epoch": 0.3471836236697621, + "grad_norm": 2.457778768307528, + "learning_rate": 1.5172655419474514e-06, + "loss": 1.1879, + "step": 2561 + }, + { + "epoch": 0.34731918931742695, + "grad_norm": 1.7181828461991082, + "learning_rate": 1.5168896668953261e-06, + "loss": 1.1607, + "step": 2562 + }, + { + "epoch": 0.34745475496509187, + "grad_norm": 1.7548002949586503, + "learning_rate": 1.5165136921651084e-06, + "loss": 1.1648, + "step": 2563 + }, + { + "epoch": 0.34759032061275674, + "grad_norm": 1.9610240967289998, + "learning_rate": 1.5161376178293028e-06, + "loss": 1.1876, + "step": 2564 + }, + { + "epoch": 0.3477258862604216, + "grad_norm": 2.221997595866566, + "learning_rate": 1.5157614439604313e-06, + "loss": 1.1644, + "step": 2565 + }, + { + "epoch": 0.3478614519080865, + "grad_norm": 2.1060593209079945, + "learning_rate": 1.5153851706310367e-06, + "loss": 1.23, + "step": 
2566 + }, + { + "epoch": 0.34799701755575135, + "grad_norm": 1.6985057225225564, + "learning_rate": 1.51500879791368e-06, + "loss": 1.171, + "step": 2567 + }, + { + "epoch": 0.3481325832034163, + "grad_norm": 1.9756494978188177, + "learning_rate": 1.5146323258809423e-06, + "loss": 1.1775, + "step": 2568 + }, + { + "epoch": 0.34826814885108115, + "grad_norm": 3.130117904790816, + "learning_rate": 1.5142557546054224e-06, + "loss": 1.2171, + "step": 2569 + }, + { + "epoch": 0.348403714498746, + "grad_norm": 1.6681444819224507, + "learning_rate": 1.5138790841597398e-06, + "loss": 1.2004, + "step": 2570 + }, + { + "epoch": 0.3485392801464109, + "grad_norm": 2.1585895188333617, + "learning_rate": 1.5135023146165317e-06, + "loss": 1.2403, + "step": 2571 + }, + { + "epoch": 0.34867484579407576, + "grad_norm": 1.6131571812876653, + "learning_rate": 1.513125446048456e-06, + "loss": 1.2118, + "step": 2572 + }, + { + "epoch": 0.3488104114417407, + "grad_norm": 5.903015910847125, + "learning_rate": 1.5127484785281884e-06, + "loss": 1.183, + "step": 2573 + }, + { + "epoch": 0.34894597708940556, + "grad_norm": 2.055607528718225, + "learning_rate": 1.5123714121284237e-06, + "loss": 1.2011, + "step": 2574 + }, + { + "epoch": 0.3490815427370704, + "grad_norm": 3.8095328511460274, + "learning_rate": 1.5119942469218768e-06, + "loss": 1.2127, + "step": 2575 + }, + { + "epoch": 0.3492171083847353, + "grad_norm": 1.6532609199943062, + "learning_rate": 1.5116169829812807e-06, + "loss": 1.2138, + "step": 2576 + }, + { + "epoch": 0.34935267403240017, + "grad_norm": 2.696581111418815, + "learning_rate": 1.511239620379388e-06, + "loss": 1.1985, + "step": 2577 + }, + { + "epoch": 0.3494882396800651, + "grad_norm": 2.586518119690418, + "learning_rate": 1.51086215918897e-06, + "loss": 1.2034, + "step": 2578 + }, + { + "epoch": 0.34962380532772996, + "grad_norm": 1.6574398326246986, + "learning_rate": 1.510484599482817e-06, + "loss": 1.1528, + "step": 2579 + }, + { + "epoch": 0.34975937097539483, + "grad_norm": 1.9406267985301273, + "learning_rate": 1.5101069413337386e-06, + "loss": 1.159, + "step": 2580 + }, + { + "epoch": 0.3498949366230597, + "grad_norm": 5.399835617029856, + "learning_rate": 1.5097291848145631e-06, + "loss": 1.1933, + "step": 2581 + }, + { + "epoch": 0.3500305022707246, + "grad_norm": 2.6784001323455557, + "learning_rate": 1.5093513299981378e-06, + "loss": 1.1816, + "step": 2582 + }, + { + "epoch": 0.3501660679183895, + "grad_norm": 1.782875351582939, + "learning_rate": 1.5089733769573292e-06, + "loss": 1.2136, + "step": 2583 + }, + { + "epoch": 0.35030163356605437, + "grad_norm": 2.061509905250401, + "learning_rate": 1.5085953257650223e-06, + "loss": 1.1724, + "step": 2584 + }, + { + "epoch": 0.35043719921371924, + "grad_norm": 2.3801348797465245, + "learning_rate": 1.5082171764941216e-06, + "loss": 1.199, + "step": 2585 + }, + { + "epoch": 0.3505727648613841, + "grad_norm": 1.9874967914956687, + "learning_rate": 1.5078389292175499e-06, + "loss": 1.1988, + "step": 2586 + }, + { + "epoch": 0.350708330509049, + "grad_norm": 1.6980916643854573, + "learning_rate": 1.5074605840082494e-06, + "loss": 1.1705, + "step": 2587 + }, + { + "epoch": 0.3508438961567139, + "grad_norm": 1.9049861410515063, + "learning_rate": 1.5070821409391812e-06, + "loss": 1.21, + "step": 2588 + }, + { + "epoch": 0.3509794618043788, + "grad_norm": 2.00153356975662, + "learning_rate": 1.5067036000833242e-06, + "loss": 1.1863, + "step": 2589 + }, + { + "epoch": 0.35111502745204365, + "grad_norm": 2.0815832496456497, + 
"learning_rate": 1.5063249615136782e-06, + "loss": 1.1629, + "step": 2590 + }, + { + "epoch": 0.3512505930997085, + "grad_norm": 2.195411200429557, + "learning_rate": 1.5059462253032595e-06, + "loss": 1.2058, + "step": 2591 + }, + { + "epoch": 0.3513861587473734, + "grad_norm": 4.1859673390491805, + "learning_rate": 1.5055673915251052e-06, + "loss": 1.1437, + "step": 2592 + }, + { + "epoch": 0.3515217243950383, + "grad_norm": 2.109320100486466, + "learning_rate": 1.5051884602522702e-06, + "loss": 1.1563, + "step": 2593 + }, + { + "epoch": 0.3516572900427032, + "grad_norm": 1.9299770482022107, + "learning_rate": 1.5048094315578284e-06, + "loss": 1.1895, + "step": 2594 + }, + { + "epoch": 0.35179285569036806, + "grad_norm": 4.499580817005904, + "learning_rate": 1.5044303055148722e-06, + "loss": 1.1521, + "step": 2595 + }, + { + "epoch": 0.3519284213380329, + "grad_norm": 1.6680740660484368, + "learning_rate": 1.5040510821965135e-06, + "loss": 1.2079, + "step": 2596 + }, + { + "epoch": 0.3520639869856978, + "grad_norm": 1.7380528001822029, + "learning_rate": 1.5036717616758824e-06, + "loss": 1.15, + "step": 2597 + }, + { + "epoch": 0.3521995526333627, + "grad_norm": 1.737132915062673, + "learning_rate": 1.5032923440261276e-06, + "loss": 1.1862, + "step": 2598 + }, + { + "epoch": 0.3523351182810276, + "grad_norm": 1.9748653544895725, + "learning_rate": 1.5029128293204174e-06, + "loss": 1.1861, + "step": 2599 + }, + { + "epoch": 0.35247068392869246, + "grad_norm": 1.7187237349802675, + "learning_rate": 1.5025332176319373e-06, + "loss": 1.1713, + "step": 2600 + }, + { + "epoch": 0.35260624957635733, + "grad_norm": 1.8891401271793618, + "learning_rate": 1.5021535090338932e-06, + "loss": 1.1634, + "step": 2601 + }, + { + "epoch": 0.35274181522402226, + "grad_norm": 1.494241147902518, + "learning_rate": 1.5017737035995087e-06, + "loss": 1.1817, + "step": 2602 + }, + { + "epoch": 0.35287738087168713, + "grad_norm": 1.844865844029006, + "learning_rate": 1.5013938014020262e-06, + "loss": 1.1713, + "step": 2603 + }, + { + "epoch": 0.353012946519352, + "grad_norm": 5.560405146419798, + "learning_rate": 1.501013802514707e-06, + "loss": 1.1743, + "step": 2604 + }, + { + "epoch": 0.35314851216701687, + "grad_norm": 1.6274081881569118, + "learning_rate": 1.5006337070108304e-06, + "loss": 1.235, + "step": 2605 + }, + { + "epoch": 0.35328407781468174, + "grad_norm": 1.5998457505026444, + "learning_rate": 1.5002535149636952e-06, + "loss": 1.1845, + "step": 2606 + }, + { + "epoch": 0.35341964346234667, + "grad_norm": 2.161107432759761, + "learning_rate": 1.4998732264466186e-06, + "loss": 1.1335, + "step": 2607 + }, + { + "epoch": 0.35355520911001154, + "grad_norm": 2.1324922327497497, + "learning_rate": 1.499492841532936e-06, + "loss": 1.184, + "step": 2608 + }, + { + "epoch": 0.3536907747576764, + "grad_norm": 1.874010420868067, + "learning_rate": 1.4991123602960017e-06, + "loss": 1.2014, + "step": 2609 + }, + { + "epoch": 0.3538263404053413, + "grad_norm": 2.1904562913346677, + "learning_rate": 1.4987317828091882e-06, + "loss": 1.205, + "step": 2610 + }, + { + "epoch": 0.35396190605300615, + "grad_norm": 5.759381872848652, + "learning_rate": 1.4983511091458874e-06, + "loss": 1.1791, + "step": 2611 + }, + { + "epoch": 0.3540974717006711, + "grad_norm": 2.1519456462586013, + "learning_rate": 1.4979703393795086e-06, + "loss": 1.1954, + "step": 2612 + }, + { + "epoch": 0.35423303734833594, + "grad_norm": 2.2799259244324217, + "learning_rate": 1.4975894735834809e-06, + "loss": 1.1176, + "step": 2613 + }, + { + 
"epoch": 0.3543686029960008, + "grad_norm": 2.1191198976416654, + "learning_rate": 1.4972085118312511e-06, + "loss": 1.1569, + "step": 2614 + }, + { + "epoch": 0.3545041686436657, + "grad_norm": 1.7902275252259392, + "learning_rate": 1.4968274541962845e-06, + "loss": 1.1894, + "step": 2615 + }, + { + "epoch": 0.35463973429133056, + "grad_norm": 2.3190660880034732, + "learning_rate": 1.4964463007520647e-06, + "loss": 1.1878, + "step": 2616 + }, + { + "epoch": 0.3547752999389955, + "grad_norm": 1.8083625619561072, + "learning_rate": 1.4960650515720947e-06, + "loss": 1.1731, + "step": 2617 + }, + { + "epoch": 0.35491086558666035, + "grad_norm": 2.323430463384462, + "learning_rate": 1.4956837067298954e-06, + "loss": 1.2048, + "step": 2618 + }, + { + "epoch": 0.3550464312343252, + "grad_norm": 1.9752347997567985, + "learning_rate": 1.4953022662990057e-06, + "loss": 1.1602, + "step": 2619 + }, + { + "epoch": 0.3551819968819901, + "grad_norm": 2.0855473459806046, + "learning_rate": 1.4949207303529835e-06, + "loss": 1.2079, + "step": 2620 + }, + { + "epoch": 0.35531756252965496, + "grad_norm": 1.8999747608213333, + "learning_rate": 1.4945390989654054e-06, + "loss": 1.1957, + "step": 2621 + }, + { + "epoch": 0.3554531281773199, + "grad_norm": 1.9237671515062775, + "learning_rate": 1.4941573722098655e-06, + "loss": 1.1977, + "step": 2622 + }, + { + "epoch": 0.35558869382498476, + "grad_norm": 1.7192685856844037, + "learning_rate": 1.4937755501599772e-06, + "loss": 1.1559, + "step": 2623 + }, + { + "epoch": 0.35572425947264963, + "grad_norm": 1.632529065129846, + "learning_rate": 1.4933936328893714e-06, + "loss": 1.1797, + "step": 2624 + }, + { + "epoch": 0.3558598251203145, + "grad_norm": 1.8706277216014042, + "learning_rate": 1.4930116204716984e-06, + "loss": 1.2032, + "step": 2625 + }, + { + "epoch": 0.35599539076797937, + "grad_norm": 1.6573587249275425, + "learning_rate": 1.492629512980626e-06, + "loss": 1.1649, + "step": 2626 + }, + { + "epoch": 0.3561309564156443, + "grad_norm": 1.6432752398093586, + "learning_rate": 1.4922473104898404e-06, + "loss": 1.1765, + "step": 2627 + }, + { + "epoch": 0.35626652206330917, + "grad_norm": 1.9322798288842045, + "learning_rate": 1.4918650130730467e-06, + "loss": 1.1826, + "step": 2628 + }, + { + "epoch": 0.35640208771097404, + "grad_norm": 1.8439613646681094, + "learning_rate": 1.491482620803968e-06, + "loss": 1.1729, + "step": 2629 + }, + { + "epoch": 0.3565376533586389, + "grad_norm": 1.6138662849112064, + "learning_rate": 1.491100133756345e-06, + "loss": 1.173, + "step": 2630 + }, + { + "epoch": 0.3566732190063038, + "grad_norm": 1.8320158676368554, + "learning_rate": 1.490717552003938e-06, + "loss": 1.2026, + "step": 2631 + }, + { + "epoch": 0.3568087846539687, + "grad_norm": 1.8687195586633003, + "learning_rate": 1.4903348756205242e-06, + "loss": 1.1888, + "step": 2632 + }, + { + "epoch": 0.3569443503016336, + "grad_norm": 2.153693773529145, + "learning_rate": 1.4899521046799005e-06, + "loss": 1.2041, + "step": 2633 + }, + { + "epoch": 0.35707991594929844, + "grad_norm": 2.148170209122433, + "learning_rate": 1.4895692392558806e-06, + "loss": 1.1814, + "step": 2634 + }, + { + "epoch": 0.3572154815969633, + "grad_norm": 1.935074635145785, + "learning_rate": 1.4891862794222976e-06, + "loss": 1.1809, + "step": 2635 + }, + { + "epoch": 0.3573510472446282, + "grad_norm": 1.836457233507465, + "learning_rate": 1.4888032252530017e-06, + "loss": 1.2038, + "step": 2636 + }, + { + "epoch": 0.3574866128922931, + "grad_norm": 2.0065867843470944, + "learning_rate": 
1.4884200768218625e-06, + "loss": 1.2074, + "step": 2637 + }, + { + "epoch": 0.357622178539958, + "grad_norm": 1.923189073739377, + "learning_rate": 1.4880368342027665e-06, + "loss": 1.1835, + "step": 2638 + }, + { + "epoch": 0.35775774418762285, + "grad_norm": 3.983298673925265, + "learning_rate": 1.4876534974696196e-06, + "loss": 1.2017, + "step": 2639 + }, + { + "epoch": 0.3578933098352877, + "grad_norm": 1.8939408431625284, + "learning_rate": 1.487270066696345e-06, + "loss": 1.1451, + "step": 2640 + }, + { + "epoch": 0.35802887548295265, + "grad_norm": 2.2212112688549754, + "learning_rate": 1.4868865419568841e-06, + "loss": 1.1339, + "step": 2641 + }, + { + "epoch": 0.3581644411306175, + "grad_norm": 1.7773751896712096, + "learning_rate": 1.4865029233251971e-06, + "loss": 1.1843, + "step": 2642 + }, + { + "epoch": 0.3583000067782824, + "grad_norm": 2.3310547398367403, + "learning_rate": 1.4861192108752617e-06, + "loss": 1.2111, + "step": 2643 + }, + { + "epoch": 0.35843557242594726, + "grad_norm": 2.807149096082532, + "learning_rate": 1.485735404681073e-06, + "loss": 1.1713, + "step": 2644 + }, + { + "epoch": 0.35857113807361213, + "grad_norm": 1.6745297224505085, + "learning_rate": 1.4853515048166463e-06, + "loss": 1.2105, + "step": 2645 + }, + { + "epoch": 0.35870670372127705, + "grad_norm": 2.457836753086844, + "learning_rate": 1.4849675113560128e-06, + "loss": 1.1357, + "step": 2646 + }, + { + "epoch": 0.3588422693689419, + "grad_norm": 1.9294026871934424, + "learning_rate": 1.4845834243732228e-06, + "loss": 1.1907, + "step": 2647 + }, + { + "epoch": 0.3589778350166068, + "grad_norm": 1.9954747429519502, + "learning_rate": 1.4841992439423445e-06, + "loss": 1.1591, + "step": 2648 + }, + { + "epoch": 0.35911340066427166, + "grad_norm": 1.9029953823621013, + "learning_rate": 1.483814970137464e-06, + "loss": 1.144, + "step": 2649 + }, + { + "epoch": 0.35924896631193654, + "grad_norm": 1.7178032039931233, + "learning_rate": 1.4834306030326855e-06, + "loss": 1.169, + "step": 2650 + }, + { + "epoch": 0.35938453195960146, + "grad_norm": 1.7332987313144037, + "learning_rate": 1.4830461427021311e-06, + "loss": 1.2086, + "step": 2651 + }, + { + "epoch": 0.35952009760726633, + "grad_norm": 1.6328792270227153, + "learning_rate": 1.4826615892199415e-06, + "loss": 1.1711, + "step": 2652 + }, + { + "epoch": 0.3596556632549312, + "grad_norm": 3.2370757064219338, + "learning_rate": 1.482276942660274e-06, + "loss": 1.1311, + "step": 2653 + }, + { + "epoch": 0.35979122890259607, + "grad_norm": 1.6988509858185588, + "learning_rate": 1.481892203097305e-06, + "loss": 1.153, + "step": 2654 + }, + { + "epoch": 0.35992679455026094, + "grad_norm": 1.8995481630097286, + "learning_rate": 1.481507370605228e-06, + "loss": 1.1951, + "step": 2655 + }, + { + "epoch": 0.36006236019792587, + "grad_norm": 1.6248525753881924, + "learning_rate": 1.481122445258256e-06, + "loss": 1.1959, + "step": 2656 + }, + { + "epoch": 0.36019792584559074, + "grad_norm": 1.63382327907909, + "learning_rate": 1.4807374271306182e-06, + "loss": 1.1883, + "step": 2657 + }, + { + "epoch": 0.3603334914932556, + "grad_norm": 2.280237959191525, + "learning_rate": 1.4803523162965618e-06, + "loss": 1.1597, + "step": 2658 + }, + { + "epoch": 0.3604690571409205, + "grad_norm": 2.248421016513146, + "learning_rate": 1.4799671128303533e-06, + "loss": 1.1483, + "step": 2659 + }, + { + "epoch": 0.36060462278858535, + "grad_norm": 2.146763758353986, + "learning_rate": 1.4795818168062755e-06, + "loss": 1.1785, + "step": 2660 + }, + { + "epoch": 
0.3607401884362503, + "grad_norm": 2.2086223800096967, + "learning_rate": 1.47919642829863e-06, + "loss": 1.1997, + "step": 2661 + }, + { + "epoch": 0.36087575408391515, + "grad_norm": 2.0867156513485536, + "learning_rate": 1.4788109473817359e-06, + "loss": 1.1945, + "step": 2662 + }, + { + "epoch": 0.36101131973158, + "grad_norm": 1.8402868077116847, + "learning_rate": 1.4784253741299298e-06, + "loss": 1.1957, + "step": 2663 + }, + { + "epoch": 0.3611468853792449, + "grad_norm": 1.7857357804066045, + "learning_rate": 1.4780397086175672e-06, + "loss": 1.1461, + "step": 2664 + }, + { + "epoch": 0.36128245102690976, + "grad_norm": 1.8464009618671726, + "learning_rate": 1.4776539509190198e-06, + "loss": 1.184, + "step": 2665 + }, + { + "epoch": 0.3614180166745747, + "grad_norm": 1.6245582879926248, + "learning_rate": 1.4772681011086788e-06, + "loss": 1.1566, + "step": 2666 + }, + { + "epoch": 0.36155358232223955, + "grad_norm": 1.852531616149586, + "learning_rate": 1.4768821592609513e-06, + "loss": 1.1732, + "step": 2667 + }, + { + "epoch": 0.3616891479699044, + "grad_norm": 2.021286374924253, + "learning_rate": 1.4764961254502639e-06, + "loss": 1.2117, + "step": 2668 + }, + { + "epoch": 0.3618247136175693, + "grad_norm": 1.6389156014835906, + "learning_rate": 1.47610999975106e-06, + "loss": 1.1729, + "step": 2669 + }, + { + "epoch": 0.36196027926523416, + "grad_norm": 2.411107448907856, + "learning_rate": 1.4757237822378009e-06, + "loss": 1.1765, + "step": 2670 + }, + { + "epoch": 0.3620958449128991, + "grad_norm": 2.2359057217674954, + "learning_rate": 1.4753374729849656e-06, + "loss": 1.2028, + "step": 2671 + }, + { + "epoch": 0.36223141056056396, + "grad_norm": 2.0284888382672133, + "learning_rate": 1.4749510720670503e-06, + "loss": 1.1504, + "step": 2672 + }, + { + "epoch": 0.36236697620822883, + "grad_norm": 1.6522448947000423, + "learning_rate": 1.47456457955857e-06, + "loss": 1.1619, + "step": 2673 + }, + { + "epoch": 0.3625025418558937, + "grad_norm": 7.6454169473420945, + "learning_rate": 1.4741779955340565e-06, + "loss": 1.1446, + "step": 2674 + }, + { + "epoch": 0.36263810750355857, + "grad_norm": 1.6038170892821373, + "learning_rate": 1.4737913200680596e-06, + "loss": 1.153, + "step": 2675 + }, + { + "epoch": 0.3627736731512235, + "grad_norm": 3.3016713321678974, + "learning_rate": 1.4734045532351463e-06, + "loss": 1.2275, + "step": 2676 + }, + { + "epoch": 0.36290923879888837, + "grad_norm": 3.441704806347007, + "learning_rate": 1.473017695109902e-06, + "loss": 1.1559, + "step": 2677 + }, + { + "epoch": 0.36304480444655324, + "grad_norm": 1.7111474656932357, + "learning_rate": 1.472630745766929e-06, + "loss": 1.2075, + "step": 2678 + }, + { + "epoch": 0.3631803700942181, + "grad_norm": 4.5418388707869335, + "learning_rate": 1.4722437052808472e-06, + "loss": 1.1912, + "step": 2679 + }, + { + "epoch": 0.36331593574188303, + "grad_norm": 2.5802328354119783, + "learning_rate": 1.4718565737262945e-06, + "loss": 1.1693, + "step": 2680 + }, + { + "epoch": 0.3634515013895479, + "grad_norm": 1.9110856143055908, + "learning_rate": 1.4714693511779262e-06, + "loss": 1.1753, + "step": 2681 + }, + { + "epoch": 0.3635870670372128, + "grad_norm": 1.6381249579266304, + "learning_rate": 1.471082037710415e-06, + "loss": 1.1962, + "step": 2682 + }, + { + "epoch": 0.36372263268487764, + "grad_norm": 6.046869041089487, + "learning_rate": 1.4706946333984514e-06, + "loss": 1.1699, + "step": 2683 + }, + { + "epoch": 0.3638581983325425, + "grad_norm": 2.9145255051928776, + "learning_rate": 
1.4703071383167433e-06, + "loss": 1.172, + "step": 2684 + }, + { + "epoch": 0.36399376398020744, + "grad_norm": 3.0087481028654364, + "learning_rate": 1.4699195525400158e-06, + "loss": 1.1926, + "step": 2685 + }, + { + "epoch": 0.3641293296278723, + "grad_norm": 1.9193653847428336, + "learning_rate": 1.469531876143012e-06, + "loss": 1.2087, + "step": 2686 + }, + { + "epoch": 0.3642648952755372, + "grad_norm": 1.951061272472, + "learning_rate": 1.4691441092004921e-06, + "loss": 1.1501, + "step": 2687 + }, + { + "epoch": 0.36440046092320205, + "grad_norm": 2.7800905920249863, + "learning_rate": 1.4687562517872342e-06, + "loss": 1.1909, + "step": 2688 + }, + { + "epoch": 0.3645360265708669, + "grad_norm": 1.8751735423611176, + "learning_rate": 1.4683683039780328e-06, + "loss": 1.2013, + "step": 2689 + }, + { + "epoch": 0.36467159221853185, + "grad_norm": 2.810597737518139, + "learning_rate": 1.4679802658477013e-06, + "loss": 1.1816, + "step": 2690 + }, + { + "epoch": 0.3648071578661967, + "grad_norm": 3.2471377428599015, + "learning_rate": 1.4675921374710696e-06, + "loss": 1.1801, + "step": 2691 + }, + { + "epoch": 0.3649427235138616, + "grad_norm": 2.3316954972984365, + "learning_rate": 1.467203918922985e-06, + "loss": 1.1765, + "step": 2692 + }, + { + "epoch": 0.36507828916152646, + "grad_norm": 2.4670080973732875, + "learning_rate": 1.4668156102783125e-06, + "loss": 1.1471, + "step": 2693 + }, + { + "epoch": 0.36521385480919133, + "grad_norm": 3.1360933312349424, + "learning_rate": 1.4664272116119345e-06, + "loss": 1.1621, + "step": 2694 + }, + { + "epoch": 0.36534942045685626, + "grad_norm": 2.097812244010261, + "learning_rate": 1.4660387229987504e-06, + "loss": 1.1497, + "step": 2695 + }, + { + "epoch": 0.3654849861045211, + "grad_norm": 2.304653025804307, + "learning_rate": 1.4656501445136774e-06, + "loss": 1.2105, + "step": 2696 + }, + { + "epoch": 0.365620551752186, + "grad_norm": 1.8846768184839264, + "learning_rate": 1.4652614762316495e-06, + "loss": 1.15, + "step": 2697 + }, + { + "epoch": 0.36575611739985087, + "grad_norm": 1.6075161656712422, + "learning_rate": 1.4648727182276186e-06, + "loss": 1.2045, + "step": 2698 + }, + { + "epoch": 0.36589168304751574, + "grad_norm": 3.2383136618389403, + "learning_rate": 1.4644838705765534e-06, + "loss": 1.1732, + "step": 2699 + }, + { + "epoch": 0.36602724869518066, + "grad_norm": 2.258222536065019, + "learning_rate": 1.46409493335344e-06, + "loss": 1.1727, + "step": 2700 + }, + { + "epoch": 0.36616281434284553, + "grad_norm": 2.6385566192753416, + "learning_rate": 1.4637059066332824e-06, + "loss": 1.1956, + "step": 2701 + }, + { + "epoch": 0.3662983799905104, + "grad_norm": 1.8857657276500126, + "learning_rate": 1.4633167904911008e-06, + "loss": 1.1956, + "step": 2702 + }, + { + "epoch": 0.3664339456381753, + "grad_norm": 2.3439733437775017, + "learning_rate": 1.4629275850019336e-06, + "loss": 1.1774, + "step": 2703 + }, + { + "epoch": 0.36656951128584014, + "grad_norm": 2.299943335533618, + "learning_rate": 1.4625382902408354e-06, + "loss": 1.1937, + "step": 2704 + }, + { + "epoch": 0.36670507693350507, + "grad_norm": 2.570839844193462, + "learning_rate": 1.4621489062828788e-06, + "loss": 1.1653, + "step": 2705 + }, + { + "epoch": 0.36684064258116994, + "grad_norm": 1.6440226815231977, + "learning_rate": 1.461759433203154e-06, + "loss": 1.1618, + "step": 2706 + }, + { + "epoch": 0.3669762082288348, + "grad_norm": 2.1579737284390244, + "learning_rate": 1.4613698710767674e-06, + "loss": 1.1805, + "step": 2707 + }, + { + "epoch": 
0.3671117738764997, + "grad_norm": 1.6255142396108544, + "learning_rate": 1.4609802199788427e-06, + "loss": 1.1704, + "step": 2708 + }, + { + "epoch": 0.36724733952416455, + "grad_norm": 1.6491041772503872, + "learning_rate": 1.4605904799845218e-06, + "loss": 1.1705, + "step": 2709 + }, + { + "epoch": 0.3673829051718295, + "grad_norm": 1.9721278813441896, + "learning_rate": 1.4602006511689623e-06, + "loss": 1.1945, + "step": 2710 + }, + { + "epoch": 0.36751847081949435, + "grad_norm": 2.0221010203452168, + "learning_rate": 1.4598107336073396e-06, + "loss": 1.1523, + "step": 2711 + }, + { + "epoch": 0.3676540364671592, + "grad_norm": 1.7813177204125752, + "learning_rate": 1.4594207273748467e-06, + "loss": 1.156, + "step": 2712 + }, + { + "epoch": 0.3677896021148241, + "grad_norm": 4.112664784498156, + "learning_rate": 1.459030632546693e-06, + "loss": 1.2086, + "step": 2713 + }, + { + "epoch": 0.36792516776248896, + "grad_norm": 1.9573545744096326, + "learning_rate": 1.458640449198105e-06, + "loss": 1.2076, + "step": 2714 + }, + { + "epoch": 0.3680607334101539, + "grad_norm": 1.7840181080786912, + "learning_rate": 1.4582501774043268e-06, + "loss": 1.2246, + "step": 2715 + }, + { + "epoch": 0.36819629905781875, + "grad_norm": 2.1948490307182644, + "learning_rate": 1.4578598172406189e-06, + "loss": 1.1804, + "step": 2716 + }, + { + "epoch": 0.3683318647054836, + "grad_norm": 1.929250304738359, + "learning_rate": 1.4574693687822594e-06, + "loss": 1.1799, + "step": 2717 + }, + { + "epoch": 0.3684674303531485, + "grad_norm": 2.1141249082092872, + "learning_rate": 1.4570788321045432e-06, + "loss": 1.1474, + "step": 2718 + }, + { + "epoch": 0.3686029960008134, + "grad_norm": 2.2631555316156162, + "learning_rate": 1.4566882072827824e-06, + "loss": 1.1517, + "step": 2719 + }, + { + "epoch": 0.3687385616484783, + "grad_norm": 2.9682796217021017, + "learning_rate": 1.4562974943923054e-06, + "loss": 1.2162, + "step": 2720 + }, + { + "epoch": 0.36887412729614316, + "grad_norm": 1.9170827092862452, + "learning_rate": 1.4559066935084588e-06, + "loss": 1.1639, + "step": 2721 + }, + { + "epoch": 0.36900969294380803, + "grad_norm": 2.2187666406748274, + "learning_rate": 1.4555158047066047e-06, + "loss": 1.1846, + "step": 2722 + }, + { + "epoch": 0.3691452585914729, + "grad_norm": 1.9012583089014403, + "learning_rate": 1.4551248280621234e-06, + "loss": 1.1667, + "step": 2723 + }, + { + "epoch": 0.36928082423913783, + "grad_norm": 4.668999506676799, + "learning_rate": 1.4547337636504116e-06, + "loss": 1.211, + "step": 2724 + }, + { + "epoch": 0.3694163898868027, + "grad_norm": 2.3666484028210686, + "learning_rate": 1.4543426115468829e-06, + "loss": 1.1571, + "step": 2725 + }, + { + "epoch": 0.36955195553446757, + "grad_norm": 1.7578404949022828, + "learning_rate": 1.453951371826968e-06, + "loss": 1.154, + "step": 2726 + }, + { + "epoch": 0.36968752118213244, + "grad_norm": 2.152622476040555, + "learning_rate": 1.4535600445661143e-06, + "loss": 1.2226, + "step": 2727 + }, + { + "epoch": 0.3698230868297973, + "grad_norm": 2.3806496700226525, + "learning_rate": 1.453168629839786e-06, + "loss": 1.1995, + "step": 2728 + }, + { + "epoch": 0.36995865247746224, + "grad_norm": 3.3399625429926307, + "learning_rate": 1.4527771277234648e-06, + "loss": 1.2146, + "step": 2729 + }, + { + "epoch": 0.3700942181251271, + "grad_norm": 1.8979735090984053, + "learning_rate": 1.4523855382926483e-06, + "loss": 1.2239, + "step": 2730 + }, + { + "epoch": 0.370229783772792, + "grad_norm": 2.1637362601279837, + "learning_rate": 
1.4519938616228518e-06, + "loss": 1.171, + "step": 2731 + }, + { + "epoch": 0.37036534942045685, + "grad_norm": 1.8432543718588692, + "learning_rate": 1.4516020977896067e-06, + "loss": 1.1758, + "step": 2732 + }, + { + "epoch": 0.3705009150681217, + "grad_norm": 1.7259282411339252, + "learning_rate": 1.4512102468684621e-06, + "loss": 1.1852, + "step": 2733 + }, + { + "epoch": 0.37063648071578664, + "grad_norm": 1.7000626610460714, + "learning_rate": 1.4508183089349828e-06, + "loss": 1.1626, + "step": 2734 + }, + { + "epoch": 0.3707720463634515, + "grad_norm": 1.8935115823573097, + "learning_rate": 1.4504262840647512e-06, + "loss": 1.1445, + "step": 2735 + }, + { + "epoch": 0.3709076120111164, + "grad_norm": 1.6510483205581563, + "learning_rate": 1.4500341723333663e-06, + "loss": 1.2022, + "step": 2736 + }, + { + "epoch": 0.37104317765878125, + "grad_norm": 2.2678377421138856, + "learning_rate": 1.4496419738164434e-06, + "loss": 1.1739, + "step": 2737 + }, + { + "epoch": 0.3711787433064461, + "grad_norm": 1.8237882564659307, + "learning_rate": 1.449249688589615e-06, + "loss": 1.1421, + "step": 2738 + }, + { + "epoch": 0.37131430895411105, + "grad_norm": 1.8387548985226523, + "learning_rate": 1.4488573167285307e-06, + "loss": 1.175, + "step": 2739 + }, + { + "epoch": 0.3714498746017759, + "grad_norm": 2.631913596404088, + "learning_rate": 1.448464858308856e-06, + "loss": 1.1964, + "step": 2740 + }, + { + "epoch": 0.3715854402494408, + "grad_norm": 1.5947225916413026, + "learning_rate": 1.4480723134062732e-06, + "loss": 1.1425, + "step": 2741 + }, + { + "epoch": 0.37172100589710566, + "grad_norm": 2.0332070475354698, + "learning_rate": 1.4476796820964814e-06, + "loss": 1.166, + "step": 2742 + }, + { + "epoch": 0.37185657154477053, + "grad_norm": 1.7714673388441884, + "learning_rate": 1.4472869644551966e-06, + "loss": 1.1841, + "step": 2743 + }, + { + "epoch": 0.37199213719243546, + "grad_norm": 1.829345272548568, + "learning_rate": 1.4468941605581518e-06, + "loss": 1.1525, + "step": 2744 + }, + { + "epoch": 0.3721277028401003, + "grad_norm": 2.6595461171069137, + "learning_rate": 1.4465012704810952e-06, + "loss": 1.1921, + "step": 2745 + }, + { + "epoch": 0.3722632684877652, + "grad_norm": 1.705634243457367, + "learning_rate": 1.4461082942997936e-06, + "loss": 1.1804, + "step": 2746 + }, + { + "epoch": 0.37239883413543007, + "grad_norm": 2.01972461812955, + "learning_rate": 1.4457152320900283e-06, + "loss": 1.184, + "step": 2747 + }, + { + "epoch": 0.37253439978309494, + "grad_norm": 2.568796991120599, + "learning_rate": 1.445322083927599e-06, + "loss": 1.1754, + "step": 2748 + }, + { + "epoch": 0.37266996543075986, + "grad_norm": 1.7146379790927613, + "learning_rate": 1.444928849888321e-06, + "loss": 1.1573, + "step": 2749 + }, + { + "epoch": 0.37280553107842473, + "grad_norm": 1.640655273363469, + "learning_rate": 1.4445355300480262e-06, + "loss": 1.1159, + "step": 2750 + }, + { + "epoch": 0.3729410967260896, + "grad_norm": 2.550017554063904, + "learning_rate": 1.4441421244825636e-06, + "loss": 1.1736, + "step": 2751 + }, + { + "epoch": 0.3730766623737545, + "grad_norm": 1.797392342183988, + "learning_rate": 1.443748633267798e-06, + "loss": 1.1936, + "step": 2752 + }, + { + "epoch": 0.37321222802141935, + "grad_norm": 2.0716536206518517, + "learning_rate": 1.443355056479611e-06, + "loss": 1.2068, + "step": 2753 + }, + { + "epoch": 0.37334779366908427, + "grad_norm": 5.80565601322652, + "learning_rate": 1.4429613941939016e-06, + "loss": 1.1685, + "step": 2754 + }, + { + "epoch": 
0.37348335931674914, + "grad_norm": 2.0103585118262908, + "learning_rate": 1.4425676464865835e-06, + "loss": 1.2382, + "step": 2755 + }, + { + "epoch": 0.373618924964414, + "grad_norm": 1.861663036519799, + "learning_rate": 1.442173813433588e-06, + "loss": 1.185, + "step": 2756 + }, + { + "epoch": 0.3737544906120789, + "grad_norm": 2.2008514508298593, + "learning_rate": 1.4417798951108632e-06, + "loss": 1.2162, + "step": 2757 + }, + { + "epoch": 0.3738900562597438, + "grad_norm": 1.8714106655316147, + "learning_rate": 1.4413858915943728e-06, + "loss": 1.1907, + "step": 2758 + }, + { + "epoch": 0.3740256219074087, + "grad_norm": 1.6329350531807878, + "learning_rate": 1.4409918029600972e-06, + "loss": 1.2137, + "step": 2759 + }, + { + "epoch": 0.37416118755507355, + "grad_norm": 1.826337807042834, + "learning_rate": 1.4405976292840332e-06, + "loss": 1.1776, + "step": 2760 + }, + { + "epoch": 0.3742967532027384, + "grad_norm": 1.8269102752730262, + "learning_rate": 1.4402033706421945e-06, + "loss": 1.1448, + "step": 2761 + }, + { + "epoch": 0.3744323188504033, + "grad_norm": 1.6822920784798978, + "learning_rate": 1.4398090271106104e-06, + "loss": 1.1952, + "step": 2762 + }, + { + "epoch": 0.3745678844980682, + "grad_norm": 2.4721819516903176, + "learning_rate": 1.4394145987653272e-06, + "loss": 1.1879, + "step": 2763 + }, + { + "epoch": 0.3747034501457331, + "grad_norm": 2.1370021501845615, + "learning_rate": 1.4390200856824072e-06, + "loss": 1.1675, + "step": 2764 + }, + { + "epoch": 0.37483901579339796, + "grad_norm": 2.500931889282977, + "learning_rate": 1.438625487937929e-06, + "loss": 1.2383, + "step": 2765 + }, + { + "epoch": 0.3749745814410628, + "grad_norm": 1.4246739377399287, + "learning_rate": 1.4382308056079876e-06, + "loss": 1.1632, + "step": 2766 + }, + { + "epoch": 0.3751101470887277, + "grad_norm": 1.7303448965603228, + "learning_rate": 1.4378360387686948e-06, + "loss": 1.202, + "step": 2767 + }, + { + "epoch": 0.3752457127363926, + "grad_norm": 1.9625458253292307, + "learning_rate": 1.4374411874961777e-06, + "loss": 1.1825, + "step": 2768 + }, + { + "epoch": 0.3753812783840575, + "grad_norm": 1.6471134623308152, + "learning_rate": 1.437046251866581e-06, + "loss": 1.1663, + "step": 2769 + }, + { + "epoch": 0.37551684403172236, + "grad_norm": 1.7535416647941384, + "learning_rate": 1.436651231956064e-06, + "loss": 1.1694, + "step": 2770 + }, + { + "epoch": 0.37565240967938723, + "grad_norm": 2.5891290597043337, + "learning_rate": 1.4362561278408038e-06, + "loss": 1.199, + "step": 2771 + }, + { + "epoch": 0.3757879753270521, + "grad_norm": 2.092041064436753, + "learning_rate": 1.435860939596993e-06, + "loss": 1.2037, + "step": 2772 + }, + { + "epoch": 0.37592354097471703, + "grad_norm": 1.760414793025723, + "learning_rate": 1.43546566730084e-06, + "loss": 1.1361, + "step": 2773 + }, + { + "epoch": 0.3760591066223819, + "grad_norm": 1.6492624187080291, + "learning_rate": 1.4350703110285709e-06, + "loss": 1.1943, + "step": 2774 + }, + { + "epoch": 0.37619467227004677, + "grad_norm": 2.058819802618877, + "learning_rate": 1.4346748708564264e-06, + "loss": 1.2151, + "step": 2775 + }, + { + "epoch": 0.37633023791771164, + "grad_norm": 3.5689141121133527, + "learning_rate": 1.4342793468606643e-06, + "loss": 1.1717, + "step": 2776 + }, + { + "epoch": 0.3764658035653765, + "grad_norm": 4.2649977838167565, + "learning_rate": 1.433883739117558e-06, + "loss": 1.1448, + "step": 2777 + }, + { + "epoch": 0.37660136921304144, + "grad_norm": 2.7298716907996305, + "learning_rate": 
1.4334880477033976e-06, + "loss": 1.1897, + "step": 2778 + }, + { + "epoch": 0.3767369348607063, + "grad_norm": 1.8670945499959664, + "learning_rate": 1.4330922726944889e-06, + "loss": 1.1403, + "step": 2779 + }, + { + "epoch": 0.3768725005083712, + "grad_norm": 1.6720635949822686, + "learning_rate": 1.432696414167154e-06, + "loss": 1.2227, + "step": 2780 + }, + { + "epoch": 0.37700806615603605, + "grad_norm": 1.7018162030227213, + "learning_rate": 1.4323004721977312e-06, + "loss": 1.1416, + "step": 2781 + }, + { + "epoch": 0.3771436318037009, + "grad_norm": 1.8381382731015636, + "learning_rate": 1.4319044468625748e-06, + "loss": 1.1248, + "step": 2782 + }, + { + "epoch": 0.37727919745136584, + "grad_norm": 2.3860361479578747, + "learning_rate": 1.4315083382380552e-06, + "loss": 1.1842, + "step": 2783 + }, + { + "epoch": 0.3774147630990307, + "grad_norm": 2.208942741749002, + "learning_rate": 1.4311121464005582e-06, + "loss": 1.1912, + "step": 2784 + }, + { + "epoch": 0.3775503287466956, + "grad_norm": 1.536820154626763, + "learning_rate": 1.430715871426487e-06, + "loss": 1.2052, + "step": 2785 + }, + { + "epoch": 0.37768589439436046, + "grad_norm": 2.454610700880374, + "learning_rate": 1.43031951339226e-06, + "loss": 1.1931, + "step": 2786 + }, + { + "epoch": 0.3778214600420253, + "grad_norm": 2.1299877845490998, + "learning_rate": 1.4299230723743112e-06, + "loss": 1.2215, + "step": 2787 + }, + { + "epoch": 0.37795702568969025, + "grad_norm": 2.085311699381953, + "learning_rate": 1.4295265484490918e-06, + "loss": 1.1787, + "step": 2788 + }, + { + "epoch": 0.3780925913373551, + "grad_norm": 1.7557224649313516, + "learning_rate": 1.429129941693068e-06, + "loss": 1.199, + "step": 2789 + }, + { + "epoch": 0.37822815698502, + "grad_norm": 1.9519858926239588, + "learning_rate": 1.428733252182722e-06, + "loss": 1.1526, + "step": 2790 + }, + { + "epoch": 0.37836372263268486, + "grad_norm": 1.7336269958590798, + "learning_rate": 1.4283364799945527e-06, + "loss": 1.1379, + "step": 2791 + }, + { + "epoch": 0.37849928828034973, + "grad_norm": 1.8140212104536728, + "learning_rate": 1.4279396252050747e-06, + "loss": 1.177, + "step": 2792 + }, + { + "epoch": 0.37863485392801466, + "grad_norm": 1.6503190172137328, + "learning_rate": 1.4275426878908174e-06, + "loss": 1.1604, + "step": 2793 + }, + { + "epoch": 0.37877041957567953, + "grad_norm": 1.7108850930885688, + "learning_rate": 1.4271456681283275e-06, + "loss": 1.1793, + "step": 2794 + }, + { + "epoch": 0.3789059852233444, + "grad_norm": 1.6020190881720466, + "learning_rate": 1.4267485659941676e-06, + "loss": 1.1734, + "step": 2795 + }, + { + "epoch": 0.37904155087100927, + "grad_norm": 2.2161099490356677, + "learning_rate": 1.4263513815649152e-06, + "loss": 1.1695, + "step": 2796 + }, + { + "epoch": 0.3791771165186742, + "grad_norm": 1.6318229384345841, + "learning_rate": 1.4259541149171643e-06, + "loss": 1.1778, + "step": 2797 + }, + { + "epoch": 0.37931268216633907, + "grad_norm": 1.6816564966670535, + "learning_rate": 1.4255567661275247e-06, + "loss": 1.167, + "step": 2798 + }, + { + "epoch": 0.37944824781400394, + "grad_norm": 2.080928773539993, + "learning_rate": 1.4251593352726217e-06, + "loss": 1.189, + "step": 2799 + }, + { + "epoch": 0.3795838134616688, + "grad_norm": 2.277091325952829, + "learning_rate": 1.4247618224290968e-06, + "loss": 1.1921, + "step": 2800 + }, + { + "epoch": 0.3797193791093337, + "grad_norm": 1.6366099235430251, + "learning_rate": 1.4243642276736076e-06, + "loss": 1.1866, + "step": 2801 + }, + { + "epoch": 
0.3798549447569986, + "grad_norm": 1.9920650702378837, + "learning_rate": 1.4239665510828266e-06, + "loss": 1.2155, + "step": 2802 + }, + { + "epoch": 0.3799905104046635, + "grad_norm": 2.295550672944647, + "learning_rate": 1.423568792733443e-06, + "loss": 1.1616, + "step": 2803 + }, + { + "epoch": 0.38012607605232834, + "grad_norm": 1.7887177100752438, + "learning_rate": 1.423170952702161e-06, + "loss": 1.1676, + "step": 2804 + }, + { + "epoch": 0.3802616416999932, + "grad_norm": 1.8864871493678719, + "learning_rate": 1.422773031065701e-06, + "loss": 1.2037, + "step": 2805 + }, + { + "epoch": 0.3803972073476581, + "grad_norm": 7.5589494605151435, + "learning_rate": 1.4223750279007993e-06, + "loss": 1.1675, + "step": 2806 + }, + { + "epoch": 0.380532772995323, + "grad_norm": 2.050515693802934, + "learning_rate": 1.4219769432842075e-06, + "loss": 1.1907, + "step": 2807 + }, + { + "epoch": 0.3806683386429879, + "grad_norm": 1.8867182798789306, + "learning_rate": 1.4215787772926931e-06, + "loss": 1.1791, + "step": 2808 + }, + { + "epoch": 0.38080390429065275, + "grad_norm": 1.6566242390894266, + "learning_rate": 1.4211805300030389e-06, + "loss": 1.1509, + "step": 2809 + }, + { + "epoch": 0.3809394699383176, + "grad_norm": 1.5166066190306866, + "learning_rate": 1.4207822014920443e-06, + "loss": 1.1687, + "step": 2810 + }, + { + "epoch": 0.3810750355859825, + "grad_norm": 1.91398983526122, + "learning_rate": 1.420383791836524e-06, + "loss": 1.1883, + "step": 2811 + }, + { + "epoch": 0.3812106012336474, + "grad_norm": 2.0346425240753194, + "learning_rate": 1.419985301113307e-06, + "loss": 1.1789, + "step": 2812 + }, + { + "epoch": 0.3813461668813123, + "grad_norm": 1.6144493341198562, + "learning_rate": 1.4195867293992405e-06, + "loss": 1.1993, + "step": 2813 + }, + { + "epoch": 0.38148173252897716, + "grad_norm": 2.412995286999884, + "learning_rate": 1.419188076771185e-06, + "loss": 1.1956, + "step": 2814 + }, + { + "epoch": 0.38161729817664203, + "grad_norm": 1.8024161124681302, + "learning_rate": 1.4187893433060176e-06, + "loss": 1.1618, + "step": 2815 + }, + { + "epoch": 0.3817528638243069, + "grad_norm": 2.5722584470235983, + "learning_rate": 1.4183905290806313e-06, + "loss": 1.1894, + "step": 2816 + }, + { + "epoch": 0.3818884294719718, + "grad_norm": 1.8139027866674544, + "learning_rate": 1.4179916341719339e-06, + "loss": 1.1829, + "step": 2817 + }, + { + "epoch": 0.3820239951196367, + "grad_norm": 1.931714774940511, + "learning_rate": 1.4175926586568493e-06, + "loss": 1.1711, + "step": 2818 + }, + { + "epoch": 0.38215956076730156, + "grad_norm": 1.998225411759208, + "learning_rate": 1.4171936026123168e-06, + "loss": 1.1859, + "step": 2819 + }, + { + "epoch": 0.38229512641496644, + "grad_norm": 2.0049110755869046, + "learning_rate": 1.4167944661152911e-06, + "loss": 1.2011, + "step": 2820 + }, + { + "epoch": 0.3824306920626313, + "grad_norm": 1.7995720969875804, + "learning_rate": 1.4163952492427424e-06, + "loss": 1.1877, + "step": 2821 + }, + { + "epoch": 0.38256625771029623, + "grad_norm": 2.3112449219939135, + "learning_rate": 1.415995952071657e-06, + "loss": 1.2013, + "step": 2822 + }, + { + "epoch": 0.3827018233579611, + "grad_norm": 1.8752765488852614, + "learning_rate": 1.415596574679036e-06, + "loss": 1.1678, + "step": 2823 + }, + { + "epoch": 0.38283738900562597, + "grad_norm": 1.5387121465138285, + "learning_rate": 1.4151971171418959e-06, + "loss": 1.1586, + "step": 2824 + }, + { + "epoch": 0.38297295465329084, + "grad_norm": 1.7708864281292973, + "learning_rate": 
1.4147975795372694e-06, + "loss": 1.1443, + "step": 2825 + }, + { + "epoch": 0.3831085203009557, + "grad_norm": 2.0327764176264886, + "learning_rate": 1.4143979619422035e-06, + "loss": 1.1997, + "step": 2826 + }, + { + "epoch": 0.38324408594862064, + "grad_norm": 1.8516164065425242, + "learning_rate": 1.4139982644337617e-06, + "loss": 1.1696, + "step": 2827 + }, + { + "epoch": 0.3833796515962855, + "grad_norm": 1.607495358629344, + "learning_rate": 1.4135984870890228e-06, + "loss": 1.1205, + "step": 2828 + }, + { + "epoch": 0.3835152172439504, + "grad_norm": 2.070636309979588, + "learning_rate": 1.4131986299850803e-06, + "loss": 1.2242, + "step": 2829 + }, + { + "epoch": 0.38365078289161525, + "grad_norm": 1.7898069335692406, + "learning_rate": 1.4127986931990437e-06, + "loss": 1.1261, + "step": 2830 + }, + { + "epoch": 0.3837863485392801, + "grad_norm": 1.678532609440648, + "learning_rate": 1.4123986768080375e-06, + "loss": 1.1533, + "step": 2831 + }, + { + "epoch": 0.38392191418694505, + "grad_norm": 1.9707221087036375, + "learning_rate": 1.4119985808892016e-06, + "loss": 1.2064, + "step": 2832 + }, + { + "epoch": 0.3840574798346099, + "grad_norm": 1.8283910025303025, + "learning_rate": 1.4115984055196918e-06, + "loss": 1.154, + "step": 2833 + }, + { + "epoch": 0.3841930454822748, + "grad_norm": 1.8432748359909101, + "learning_rate": 1.4111981507766782e-06, + "loss": 1.1723, + "step": 2834 + }, + { + "epoch": 0.38432861112993966, + "grad_norm": 1.630430272155036, + "learning_rate": 1.4107978167373469e-06, + "loss": 1.2023, + "step": 2835 + }, + { + "epoch": 0.3844641767776046, + "grad_norm": 2.814668723324181, + "learning_rate": 1.4103974034788994e-06, + "loss": 1.1578, + "step": 2836 + }, + { + "epoch": 0.38459974242526945, + "grad_norm": 1.8249052475139533, + "learning_rate": 1.4099969110785521e-06, + "loss": 1.1862, + "step": 2837 + }, + { + "epoch": 0.3847353080729343, + "grad_norm": 2.0146029994754575, + "learning_rate": 1.409596339613537e-06, + "loss": 1.1795, + "step": 2838 + }, + { + "epoch": 0.3848708737205992, + "grad_norm": 2.210581267436707, + "learning_rate": 1.409195689161101e-06, + "loss": 1.189, + "step": 2839 + }, + { + "epoch": 0.38500643936826406, + "grad_norm": 3.0511403576612723, + "learning_rate": 1.4087949597985062e-06, + "loss": 1.2041, + "step": 2840 + }, + { + "epoch": 0.385142005015929, + "grad_norm": 2.35217050502945, + "learning_rate": 1.4083941516030303e-06, + "loss": 1.2111, + "step": 2841 + }, + { + "epoch": 0.38527757066359386, + "grad_norm": 3.9833726666601064, + "learning_rate": 1.407993264651966e-06, + "loss": 1.1589, + "step": 2842 + }, + { + "epoch": 0.38541313631125873, + "grad_norm": 2.0150505131517114, + "learning_rate": 1.4075922990226209e-06, + "loss": 1.1765, + "step": 2843 + }, + { + "epoch": 0.3855487019589236, + "grad_norm": 2.7320169406252126, + "learning_rate": 1.407191254792318e-06, + "loss": 1.1403, + "step": 2844 + }, + { + "epoch": 0.38568426760658847, + "grad_norm": 1.9851419654778522, + "learning_rate": 1.4067901320383962e-06, + "loss": 1.1611, + "step": 2845 + }, + { + "epoch": 0.3858198332542534, + "grad_norm": 1.6431103830773388, + "learning_rate": 1.4063889308382084e-06, + "loss": 1.1526, + "step": 2846 + }, + { + "epoch": 0.38595539890191827, + "grad_norm": 1.9667022182031222, + "learning_rate": 1.405987651269123e-06, + "loss": 1.18, + "step": 2847 + }, + { + "epoch": 0.38609096454958314, + "grad_norm": 1.5344779278517302, + "learning_rate": 1.4055862934085239e-06, + "loss": 1.2061, + "step": 2848 + }, + { + "epoch": 
0.386226530197248, + "grad_norm": 2.061963340811552, + "learning_rate": 1.4051848573338095e-06, + "loss": 1.1667, + "step": 2849 + }, + { + "epoch": 0.3863620958449129, + "grad_norm": 2.1179462169733974, + "learning_rate": 1.4047833431223936e-06, + "loss": 1.154, + "step": 2850 + }, + { + "epoch": 0.3864976614925778, + "grad_norm": 1.724692193587771, + "learning_rate": 1.4043817508517053e-06, + "loss": 1.2248, + "step": 2851 + }, + { + "epoch": 0.3866332271402427, + "grad_norm": 2.037476461799464, + "learning_rate": 1.4039800805991883e-06, + "loss": 1.157, + "step": 2852 + }, + { + "epoch": 0.38676879278790754, + "grad_norm": 1.9775217996529957, + "learning_rate": 1.403578332442302e-06, + "loss": 1.2019, + "step": 2853 + }, + { + "epoch": 0.3869043584355724, + "grad_norm": 2.120130563022504, + "learning_rate": 1.4031765064585196e-06, + "loss": 1.2192, + "step": 2854 + }, + { + "epoch": 0.3870399240832373, + "grad_norm": 2.255105548745387, + "learning_rate": 1.4027746027253301e-06, + "loss": 1.1632, + "step": 2855 + }, + { + "epoch": 0.3871754897309022, + "grad_norm": 2.488840039209609, + "learning_rate": 1.402372621320238e-06, + "loss": 1.239, + "step": 2856 + }, + { + "epoch": 0.3873110553785671, + "grad_norm": 2.7184210617626734, + "learning_rate": 1.401970562320762e-06, + "loss": 1.193, + "step": 2857 + }, + { + "epoch": 0.38744662102623195, + "grad_norm": 1.568679149832594, + "learning_rate": 1.4015684258044363e-06, + "loss": 1.1405, + "step": 2858 + }, + { + "epoch": 0.3875821866738968, + "grad_norm": 1.863573033776483, + "learning_rate": 1.401166211848809e-06, + "loss": 1.1351, + "step": 2859 + }, + { + "epoch": 0.3877177523215617, + "grad_norm": 4.414019223795054, + "learning_rate": 1.4007639205314448e-06, + "loss": 1.1634, + "step": 2860 + }, + { + "epoch": 0.3878533179692266, + "grad_norm": 2.1860644442841752, + "learning_rate": 1.4003615519299216e-06, + "loss": 1.157, + "step": 2861 + }, + { + "epoch": 0.3879888836168915, + "grad_norm": 2.0490634111932513, + "learning_rate": 1.3999591061218334e-06, + "loss": 1.1694, + "step": 2862 + }, + { + "epoch": 0.38812444926455636, + "grad_norm": 2.083742259072921, + "learning_rate": 1.399556583184789e-06, + "loss": 1.189, + "step": 2863 + }, + { + "epoch": 0.38826001491222123, + "grad_norm": 2.1149780635214315, + "learning_rate": 1.3991539831964114e-06, + "loss": 1.1377, + "step": 2864 + }, + { + "epoch": 0.3883955805598861, + "grad_norm": 2.4278379793323306, + "learning_rate": 1.3987513062343385e-06, + "loss": 1.2047, + "step": 2865 + }, + { + "epoch": 0.388531146207551, + "grad_norm": 2.0252918008016225, + "learning_rate": 1.3983485523762243e-06, + "loss": 1.1834, + "step": 2866 + }, + { + "epoch": 0.3886667118552159, + "grad_norm": 2.126863759781773, + "learning_rate": 1.3979457216997358e-06, + "loss": 1.1854, + "step": 2867 + }, + { + "epoch": 0.38880227750288077, + "grad_norm": 2.28150626842345, + "learning_rate": 1.397542814282556e-06, + "loss": 1.1659, + "step": 2868 + }, + { + "epoch": 0.38893784315054564, + "grad_norm": 11.078569662364973, + "learning_rate": 1.3971398302023824e-06, + "loss": 1.1707, + "step": 2869 + }, + { + "epoch": 0.3890734087982105, + "grad_norm": 2.3626013661588967, + "learning_rate": 1.3967367695369276e-06, + "loss": 1.2112, + "step": 2870 + }, + { + "epoch": 0.38920897444587543, + "grad_norm": 1.6132934808130275, + "learning_rate": 1.3963336323639183e-06, + "loss": 1.2003, + "step": 2871 + }, + { + "epoch": 0.3893445400935403, + "grad_norm": 5.338710341190096, + "learning_rate": 1.3959304187610967e-06, + 
"loss": 1.145, + "step": 2872 + }, + { + "epoch": 0.3894801057412052, + "grad_norm": 2.8751136339811625, + "learning_rate": 1.3955271288062188e-06, + "loss": 1.1878, + "step": 2873 + }, + { + "epoch": 0.38961567138887004, + "grad_norm": 1.7746574903168384, + "learning_rate": 1.3951237625770564e-06, + "loss": 1.1566, + "step": 2874 + }, + { + "epoch": 0.3897512370365349, + "grad_norm": 2.3283800997838, + "learning_rate": 1.3947203201513953e-06, + "loss": 1.177, + "step": 2875 + }, + { + "epoch": 0.38988680268419984, + "grad_norm": 1.7829245192447436, + "learning_rate": 1.3943168016070361e-06, + "loss": 1.1742, + "step": 2876 + }, + { + "epoch": 0.3900223683318647, + "grad_norm": 1.687837856961204, + "learning_rate": 1.3939132070217942e-06, + "loss": 1.1645, + "step": 2877 + }, + { + "epoch": 0.3901579339795296, + "grad_norm": 2.365514605377223, + "learning_rate": 1.3935095364734998e-06, + "loss": 1.1761, + "step": 2878 + }, + { + "epoch": 0.39029349962719445, + "grad_norm": 1.8810611166983138, + "learning_rate": 1.3931057900399976e-06, + "loss": 1.1924, + "step": 2879 + }, + { + "epoch": 0.3904290652748594, + "grad_norm": 1.8658888481525275, + "learning_rate": 1.3927019677991466e-06, + "loss": 1.193, + "step": 2880 + }, + { + "epoch": 0.39056463092252425, + "grad_norm": 5.182553508020511, + "learning_rate": 1.3922980698288212e-06, + "loss": 1.1623, + "step": 2881 + }, + { + "epoch": 0.3907001965701891, + "grad_norm": 1.9836686290188432, + "learning_rate": 1.3918940962069093e-06, + "loss": 1.2188, + "step": 2882 + }, + { + "epoch": 0.390835762217854, + "grad_norm": 5.03865974464268, + "learning_rate": 1.3914900470113144e-06, + "loss": 1.1752, + "step": 2883 + }, + { + "epoch": 0.39097132786551886, + "grad_norm": 2.325198350433925, + "learning_rate": 1.3910859223199545e-06, + "loss": 1.197, + "step": 2884 + }, + { + "epoch": 0.3911068935131838, + "grad_norm": 1.7848540364419434, + "learning_rate": 1.3906817222107611e-06, + "loss": 1.2033, + "step": 2885 + }, + { + "epoch": 0.39124245916084865, + "grad_norm": 2.2158731374485754, + "learning_rate": 1.3902774467616817e-06, + "loss": 1.17, + "step": 2886 + }, + { + "epoch": 0.3913780248085135, + "grad_norm": 1.763422848130486, + "learning_rate": 1.3898730960506772e-06, + "loss": 1.2062, + "step": 2887 + }, + { + "epoch": 0.3915135904561784, + "grad_norm": 3.524670272096588, + "learning_rate": 1.3894686701557237e-06, + "loss": 1.1704, + "step": 2888 + }, + { + "epoch": 0.39164915610384327, + "grad_norm": 2.017654951153878, + "learning_rate": 1.3890641691548113e-06, + "loss": 1.203, + "step": 2889 + }, + { + "epoch": 0.3917847217515082, + "grad_norm": 1.6147566604884738, + "learning_rate": 1.3886595931259451e-06, + "loss": 1.1729, + "step": 2890 + }, + { + "epoch": 0.39192028739917306, + "grad_norm": 2.8372786997440955, + "learning_rate": 1.3882549421471442e-06, + "loss": 1.2066, + "step": 2891 + }, + { + "epoch": 0.39205585304683793, + "grad_norm": 1.7219086571865887, + "learning_rate": 1.3878502162964422e-06, + "loss": 1.1571, + "step": 2892 + }, + { + "epoch": 0.3921914186945028, + "grad_norm": 2.5008607359521657, + "learning_rate": 1.3874454156518877e-06, + "loss": 1.158, + "step": 2893 + }, + { + "epoch": 0.3923269843421677, + "grad_norm": 2.3546731581754345, + "learning_rate": 1.3870405402915436e-06, + "loss": 1.2015, + "step": 2894 + }, + { + "epoch": 0.3924625499898326, + "grad_norm": 1.7653650110918155, + "learning_rate": 1.3866355902934856e-06, + "loss": 1.1682, + "step": 2895 + }, + { + "epoch": 0.39259811563749747, + "grad_norm": 
1.9663063537959946, + "learning_rate": 1.3862305657358065e-06, + "loss": 1.1796, + "step": 2896 + }, + { + "epoch": 0.39273368128516234, + "grad_norm": 1.9942787200232068, + "learning_rate": 1.385825466696611e-06, + "loss": 1.1602, + "step": 2897 + }, + { + "epoch": 0.3928692469328272, + "grad_norm": 2.6357193451009806, + "learning_rate": 1.3854202932540202e-06, + "loss": 1.1917, + "step": 2898 + }, + { + "epoch": 0.3930048125804921, + "grad_norm": 1.7634643686277827, + "learning_rate": 1.3850150454861682e-06, + "loss": 1.1839, + "step": 2899 + }, + { + "epoch": 0.393140378228157, + "grad_norm": 1.7563977063084624, + "learning_rate": 1.3846097234712034e-06, + "loss": 1.1467, + "step": 2900 + }, + { + "epoch": 0.3932759438758219, + "grad_norm": 2.1918961617419557, + "learning_rate": 1.3842043272872896e-06, + "loss": 1.181, + "step": 2901 + }, + { + "epoch": 0.39341150952348675, + "grad_norm": 1.974744149913827, + "learning_rate": 1.383798857012604e-06, + "loss": 1.2191, + "step": 2902 + }, + { + "epoch": 0.3935470751711516, + "grad_norm": 2.1393945119097597, + "learning_rate": 1.3833933127253383e-06, + "loss": 1.1722, + "step": 2903 + }, + { + "epoch": 0.3936826408188165, + "grad_norm": 1.5789950101046193, + "learning_rate": 1.3829876945036987e-06, + "loss": 1.1647, + "step": 2904 + }, + { + "epoch": 0.3938182064664814, + "grad_norm": 1.8417256515884026, + "learning_rate": 1.3825820024259052e-06, + "loss": 1.1632, + "step": 2905 + }, + { + "epoch": 0.3939537721141463, + "grad_norm": 1.8049942423216745, + "learning_rate": 1.3821762365701926e-06, + "loss": 1.1758, + "step": 2906 + }, + { + "epoch": 0.39408933776181115, + "grad_norm": 2.1332599960409215, + "learning_rate": 1.3817703970148092e-06, + "loss": 1.1864, + "step": 2907 + }, + { + "epoch": 0.394224903409476, + "grad_norm": 1.7138012632361566, + "learning_rate": 1.3813644838380184e-06, + "loss": 1.1954, + "step": 2908 + }, + { + "epoch": 0.3943604690571409, + "grad_norm": 6.759192914889974, + "learning_rate": 1.3809584971180975e-06, + "loss": 1.177, + "step": 2909 + }, + { + "epoch": 0.3944960347048058, + "grad_norm": 1.6807279024730886, + "learning_rate": 1.3805524369333371e-06, + "loss": 1.1968, + "step": 2910 + }, + { + "epoch": 0.3946316003524707, + "grad_norm": 2.358619281329494, + "learning_rate": 1.3801463033620433e-06, + "loss": 1.179, + "step": 2911 + }, + { + "epoch": 0.39476716600013556, + "grad_norm": 1.6247569941856423, + "learning_rate": 1.3797400964825357e-06, + "loss": 1.1799, + "step": 2912 + }, + { + "epoch": 0.39490273164780043, + "grad_norm": 2.0410333944482932, + "learning_rate": 1.3793338163731476e-06, + "loss": 1.185, + "step": 2913 + }, + { + "epoch": 0.3950382972954653, + "grad_norm": 1.6619521284606147, + "learning_rate": 1.3789274631122277e-06, + "loss": 1.1495, + "step": 2914 + }, + { + "epoch": 0.3951738629431302, + "grad_norm": 1.5969052474432655, + "learning_rate": 1.3785210367781375e-06, + "loss": 1.1786, + "step": 2915 + }, + { + "epoch": 0.3953094285907951, + "grad_norm": 1.8687140558127109, + "learning_rate": 1.378114537449253e-06, + "loss": 1.1752, + "step": 2916 + }, + { + "epoch": 0.39544499423845997, + "grad_norm": 1.8706730958675237, + "learning_rate": 1.3777079652039646e-06, + "loss": 1.213, + "step": 2917 + }, + { + "epoch": 0.39558055988612484, + "grad_norm": 1.7731412119915928, + "learning_rate": 1.3773013201206768e-06, + "loss": 1.2304, + "step": 2918 + }, + { + "epoch": 0.39571612553378976, + "grad_norm": 2.6835459585721986, + "learning_rate": 1.3768946022778075e-06, + "loss": 1.2002, + 
"step": 2919 + }, + { + "epoch": 0.39585169118145463, + "grad_norm": 1.7654456765693185, + "learning_rate": 1.3764878117537895e-06, + "loss": 1.1594, + "step": 2920 + }, + { + "epoch": 0.3959872568291195, + "grad_norm": 2.4297528218485014, + "learning_rate": 1.3760809486270684e-06, + "loss": 1.1827, + "step": 2921 + }, + { + "epoch": 0.3961228224767844, + "grad_norm": 2.315744149871914, + "learning_rate": 1.3756740129761053e-06, + "loss": 1.2274, + "step": 2922 + }, + { + "epoch": 0.39625838812444925, + "grad_norm": 2.07957108411169, + "learning_rate": 1.3752670048793743e-06, + "loss": 1.2317, + "step": 2923 + }, + { + "epoch": 0.39639395377211417, + "grad_norm": 1.8239331195850352, + "learning_rate": 1.3748599244153632e-06, + "loss": 1.1398, + "step": 2924 + }, + { + "epoch": 0.39652951941977904, + "grad_norm": 1.782345907596219, + "learning_rate": 1.3744527716625746e-06, + "loss": 1.1729, + "step": 2925 + }, + { + "epoch": 0.3966650850674439, + "grad_norm": 2.3495882669863866, + "learning_rate": 1.3740455466995248e-06, + "loss": 1.1578, + "step": 2926 + }, + { + "epoch": 0.3968006507151088, + "grad_norm": 1.7928856560111912, + "learning_rate": 1.373638249604744e-06, + "loss": 1.207, + "step": 2927 + }, + { + "epoch": 0.39693621636277365, + "grad_norm": 2.2393336218064714, + "learning_rate": 1.3732308804567761e-06, + "loss": 1.1699, + "step": 2928 + }, + { + "epoch": 0.3970717820104386, + "grad_norm": 1.741531684986531, + "learning_rate": 1.3728234393341789e-06, + "loss": 1.1258, + "step": 2929 + }, + { + "epoch": 0.39720734765810345, + "grad_norm": 3.458347318185045, + "learning_rate": 1.3724159263155246e-06, + "loss": 1.1311, + "step": 2930 + }, + { + "epoch": 0.3973429133057683, + "grad_norm": 1.7458566528069293, + "learning_rate": 1.3720083414793984e-06, + "loss": 1.1896, + "step": 2931 + }, + { + "epoch": 0.3974784789534332, + "grad_norm": 1.8837117303001154, + "learning_rate": 1.3716006849043998e-06, + "loss": 1.1845, + "step": 2932 + }, + { + "epoch": 0.39761404460109806, + "grad_norm": 2.9869328432809663, + "learning_rate": 1.3711929566691424e-06, + "loss": 1.1604, + "step": 2933 + }, + { + "epoch": 0.397749610248763, + "grad_norm": 1.914493840353464, + "learning_rate": 1.3707851568522534e-06, + "loss": 1.1851, + "step": 2934 + }, + { + "epoch": 0.39788517589642786, + "grad_norm": 2.3619860769915433, + "learning_rate": 1.3703772855323739e-06, + "loss": 1.1389, + "step": 2935 + }, + { + "epoch": 0.3980207415440927, + "grad_norm": 2.6275238164281283, + "learning_rate": 1.3699693427881582e-06, + "loss": 1.1629, + "step": 2936 + }, + { + "epoch": 0.3981563071917576, + "grad_norm": 1.9352215499823828, + "learning_rate": 1.3695613286982754e-06, + "loss": 1.1447, + "step": 2937 + }, + { + "epoch": 0.39829187283942247, + "grad_norm": 2.2437903364125815, + "learning_rate": 1.3691532433414073e-06, + "loss": 1.1906, + "step": 2938 + }, + { + "epoch": 0.3984274384870874, + "grad_norm": 1.5105897478357122, + "learning_rate": 1.36874508679625e-06, + "loss": 1.1342, + "step": 2939 + }, + { + "epoch": 0.39856300413475226, + "grad_norm": 1.6411463257808636, + "learning_rate": 1.3683368591415137e-06, + "loss": 1.1335, + "step": 2940 + }, + { + "epoch": 0.39869856978241713, + "grad_norm": 1.6952416606621306, + "learning_rate": 1.3679285604559211e-06, + "loss": 1.1707, + "step": 2941 + }, + { + "epoch": 0.398834135430082, + "grad_norm": 2.579553818063693, + "learning_rate": 1.3675201908182103e-06, + "loss": 1.1787, + "step": 2942 + }, + { + "epoch": 0.3989697010777469, + "grad_norm": 
2.1182833611179466, + "learning_rate": 1.3671117503071317e-06, + "loss": 1.1745, + "step": 2943 + }, + { + "epoch": 0.3991052667254118, + "grad_norm": 1.519780527571384, + "learning_rate": 1.3667032390014497e-06, + "loss": 1.1766, + "step": 2944 + }, + { + "epoch": 0.39924083237307667, + "grad_norm": 1.9511864071317686, + "learning_rate": 1.3662946569799426e-06, + "loss": 1.1935, + "step": 2945 + }, + { + "epoch": 0.39937639802074154, + "grad_norm": 2.1520124921283306, + "learning_rate": 1.3658860043214024e-06, + "loss": 1.2232, + "step": 2946 + }, + { + "epoch": 0.3995119636684064, + "grad_norm": 1.6726719536638357, + "learning_rate": 1.3654772811046344e-06, + "loss": 1.1673, + "step": 2947 + }, + { + "epoch": 0.3996475293160713, + "grad_norm": 2.1304132446036084, + "learning_rate": 1.3650684874084577e-06, + "loss": 1.1792, + "step": 2948 + }, + { + "epoch": 0.3997830949637362, + "grad_norm": 1.9900796248555306, + "learning_rate": 1.3646596233117047e-06, + "loss": 1.1234, + "step": 2949 + }, + { + "epoch": 0.3999186606114011, + "grad_norm": 2.2630431036941814, + "learning_rate": 1.364250688893222e-06, + "loss": 1.147, + "step": 2950 + }, + { + "epoch": 0.40005422625906595, + "grad_norm": 3.122236959972393, + "learning_rate": 1.3638416842318691e-06, + "loss": 1.1595, + "step": 2951 + }, + { + "epoch": 0.4001897919067308, + "grad_norm": 2.060901193850816, + "learning_rate": 1.3634326094065194e-06, + "loss": 1.1709, + "step": 2952 + }, + { + "epoch": 0.4003253575543957, + "grad_norm": 2.5056634350643434, + "learning_rate": 1.3630234644960597e-06, + "loss": 1.1819, + "step": 2953 + }, + { + "epoch": 0.4004609232020606, + "grad_norm": 1.875719158412489, + "learning_rate": 1.3626142495793902e-06, + "loss": 1.1905, + "step": 2954 + }, + { + "epoch": 0.4005964888497255, + "grad_norm": 1.673758489643848, + "learning_rate": 1.3622049647354252e-06, + "loss": 1.1822, + "step": 2955 + }, + { + "epoch": 0.40073205449739036, + "grad_norm": 2.4545761786357083, + "learning_rate": 1.361795610043092e-06, + "loss": 1.161, + "step": 2956 + }, + { + "epoch": 0.4008676201450552, + "grad_norm": 6.421823384014772, + "learning_rate": 1.3613861855813308e-06, + "loss": 1.1935, + "step": 2957 + }, + { + "epoch": 0.40100318579272015, + "grad_norm": 1.5870365069951584, + "learning_rate": 1.3609766914290965e-06, + "loss": 1.1692, + "step": 2958 + }, + { + "epoch": 0.401138751440385, + "grad_norm": 3.03052106705732, + "learning_rate": 1.3605671276653565e-06, + "loss": 1.1978, + "step": 2959 + }, + { + "epoch": 0.4012743170880499, + "grad_norm": 1.9847866565159953, + "learning_rate": 1.3601574943690924e-06, + "loss": 1.2452, + "step": 2960 + }, + { + "epoch": 0.40140988273571476, + "grad_norm": 2.065972423562635, + "learning_rate": 1.3597477916192985e-06, + "loss": 1.2066, + "step": 2961 + }, + { + "epoch": 0.40154544838337963, + "grad_norm": 1.949827478818413, + "learning_rate": 1.3593380194949823e-06, + "loss": 1.1794, + "step": 2962 + }, + { + "epoch": 0.40168101403104456, + "grad_norm": 3.5511592650022203, + "learning_rate": 1.3589281780751659e-06, + "loss": 1.1557, + "step": 2963 + }, + { + "epoch": 0.40181657967870943, + "grad_norm": 2.1840302794795843, + "learning_rate": 1.358518267438883e-06, + "loss": 1.179, + "step": 2964 + }, + { + "epoch": 0.4019521453263743, + "grad_norm": 1.6298109532151817, + "learning_rate": 1.3581082876651824e-06, + "loss": 1.2206, + "step": 2965 + }, + { + "epoch": 0.40208771097403917, + "grad_norm": 2.168265625743344, + "learning_rate": 1.3576982388331258e-06, + "loss": 1.1885, + 
"step": 2966 + }, + { + "epoch": 0.40222327662170404, + "grad_norm": 1.6943356668809466, + "learning_rate": 1.3572881210217869e-06, + "loss": 1.154, + "step": 2967 + }, + { + "epoch": 0.40235884226936897, + "grad_norm": 4.00229614256267, + "learning_rate": 1.3568779343102539e-06, + "loss": 1.2013, + "step": 2968 + }, + { + "epoch": 0.40249440791703384, + "grad_norm": 1.60803474608819, + "learning_rate": 1.3564676787776282e-06, + "loss": 1.129, + "step": 2969 + }, + { + "epoch": 0.4026299735646987, + "grad_norm": 1.6852603463898308, + "learning_rate": 1.356057354503025e-06, + "loss": 1.1979, + "step": 2970 + }, + { + "epoch": 0.4027655392123636, + "grad_norm": 1.6845983189853617, + "learning_rate": 1.3556469615655713e-06, + "loss": 1.2262, + "step": 2971 + }, + { + "epoch": 0.40290110486002845, + "grad_norm": 6.845392780008588, + "learning_rate": 1.355236500044408e-06, + "loss": 1.1681, + "step": 2972 + }, + { + "epoch": 0.4030366705076934, + "grad_norm": 3.0867251043389934, + "learning_rate": 1.3548259700186901e-06, + "loss": 1.1788, + "step": 2973 + }, + { + "epoch": 0.40317223615535824, + "grad_norm": 2.357680405302027, + "learning_rate": 1.3544153715675848e-06, + "loss": 1.2167, + "step": 2974 + }, + { + "epoch": 0.4033078018030231, + "grad_norm": 1.8382018115437522, + "learning_rate": 1.3540047047702725e-06, + "loss": 1.1627, + "step": 2975 + }, + { + "epoch": 0.403443367450688, + "grad_norm": 1.99633366713676, + "learning_rate": 1.353593969705947e-06, + "loss": 1.1672, + "step": 2976 + }, + { + "epoch": 0.40357893309835285, + "grad_norm": 1.9417139009650124, + "learning_rate": 1.353183166453816e-06, + "loss": 1.1976, + "step": 2977 + }, + { + "epoch": 0.4037144987460178, + "grad_norm": 2.585179628590509, + "learning_rate": 1.352772295093099e-06, + "loss": 1.1947, + "step": 2978 + }, + { + "epoch": 0.40385006439368265, + "grad_norm": 1.6166597263341234, + "learning_rate": 1.3523613557030298e-06, + "loss": 1.1819, + "step": 2979 + }, + { + "epoch": 0.4039856300413475, + "grad_norm": 1.7192268776923079, + "learning_rate": 1.3519503483628541e-06, + "loss": 1.1766, + "step": 2980 + }, + { + "epoch": 0.4041211956890124, + "grad_norm": 1.584607031785214, + "learning_rate": 1.351539273151832e-06, + "loss": 1.1474, + "step": 2981 + }, + { + "epoch": 0.40425676133667726, + "grad_norm": 1.7584682381723955, + "learning_rate": 1.3511281301492358e-06, + "loss": 1.1904, + "step": 2982 + }, + { + "epoch": 0.4043923269843422, + "grad_norm": 1.7851073575689953, + "learning_rate": 1.3507169194343514e-06, + "loss": 1.1759, + "step": 2983 + }, + { + "epoch": 0.40452789263200706, + "grad_norm": 3.4759360853309147, + "learning_rate": 1.3503056410864777e-06, + "loss": 1.184, + "step": 2984 + }, + { + "epoch": 0.40466345827967193, + "grad_norm": 1.6637150667773282, + "learning_rate": 1.349894295184926e-06, + "loss": 1.195, + "step": 2985 + }, + { + "epoch": 0.4047990239273368, + "grad_norm": 3.4901314618524424, + "learning_rate": 1.3494828818090215e-06, + "loss": 1.1705, + "step": 2986 + }, + { + "epoch": 0.40493458957500167, + "grad_norm": 1.6756209925265482, + "learning_rate": 1.349071401038102e-06, + "loss": 1.1776, + "step": 2987 + }, + { + "epoch": 0.4050701552226666, + "grad_norm": 1.7199207531055172, + "learning_rate": 1.348659852951518e-06, + "loss": 1.1509, + "step": 2988 + }, + { + "epoch": 0.40520572087033147, + "grad_norm": 1.9390599532937778, + "learning_rate": 1.3482482376286338e-06, + "loss": 1.1512, + "step": 2989 + }, + { + "epoch": 0.40534128651799634, + "grad_norm": 6.125957540448372, + 
"learning_rate": 1.3478365551488256e-06, + "loss": 1.1862, + "step": 2990 + }, + { + "epoch": 0.4054768521656612, + "grad_norm": 2.7133309470462628, + "learning_rate": 1.3474248055914834e-06, + "loss": 1.1661, + "step": 2991 + }, + { + "epoch": 0.4056124178133261, + "grad_norm": 1.7667973138387674, + "learning_rate": 1.3470129890360103e-06, + "loss": 1.1535, + "step": 2992 + }, + { + "epoch": 0.405747983460991, + "grad_norm": 1.6198632589418034, + "learning_rate": 1.3466011055618207e-06, + "loss": 1.1422, + "step": 2993 + }, + { + "epoch": 0.40588354910865587, + "grad_norm": 1.7782390831402193, + "learning_rate": 1.3461891552483442e-06, + "loss": 1.1777, + "step": 2994 + }, + { + "epoch": 0.40601911475632074, + "grad_norm": 1.8409270455939508, + "learning_rate": 1.3457771381750217e-06, + "loss": 1.1626, + "step": 2995 + }, + { + "epoch": 0.4061546804039856, + "grad_norm": 1.737697692713855, + "learning_rate": 1.3453650544213076e-06, + "loss": 1.1671, + "step": 2996 + }, + { + "epoch": 0.40629024605165054, + "grad_norm": 1.9217225864340892, + "learning_rate": 1.344952904066669e-06, + "loss": 1.1696, + "step": 2997 + }, + { + "epoch": 0.4064258116993154, + "grad_norm": 2.2483706256587563, + "learning_rate": 1.3445406871905855e-06, + "loss": 1.1712, + "step": 2998 + }, + { + "epoch": 0.4065613773469803, + "grad_norm": 1.8519817770370306, + "learning_rate": 1.34412840387255e-06, + "loss": 1.1639, + "step": 2999 + }, + { + "epoch": 0.40669694299464515, + "grad_norm": 1.8090682410579189, + "learning_rate": 1.3437160541920685e-06, + "loss": 1.1735, + "step": 3000 + }, + { + "epoch": 0.40683250864231, + "grad_norm": 2.9358758563215823, + "learning_rate": 1.3433036382286589e-06, + "loss": 1.1845, + "step": 3001 + }, + { + "epoch": 0.40696807428997495, + "grad_norm": 2.382482886839756, + "learning_rate": 1.3428911560618525e-06, + "loss": 1.1943, + "step": 3002 + }, + { + "epoch": 0.4071036399376398, + "grad_norm": 1.7761831990119288, + "learning_rate": 1.3424786077711933e-06, + "loss": 1.183, + "step": 3003 + }, + { + "epoch": 0.4072392055853047, + "grad_norm": 1.7242575646578275, + "learning_rate": 1.342065993436238e-06, + "loss": 1.1323, + "step": 3004 + }, + { + "epoch": 0.40737477123296956, + "grad_norm": 1.6334644914060732, + "learning_rate": 1.3416533131365563e-06, + "loss": 1.1603, + "step": 3005 + }, + { + "epoch": 0.4075103368806344, + "grad_norm": 4.775562545760106, + "learning_rate": 1.3412405669517296e-06, + "loss": 1.1752, + "step": 3006 + }, + { + "epoch": 0.40764590252829935, + "grad_norm": 1.9490332941480772, + "learning_rate": 1.3408277549613534e-06, + "loss": 1.1624, + "step": 3007 + }, + { + "epoch": 0.4077814681759642, + "grad_norm": 2.473759572841829, + "learning_rate": 1.3404148772450348e-06, + "loss": 1.184, + "step": 3008 + }, + { + "epoch": 0.4079170338236291, + "grad_norm": 1.8104804810550947, + "learning_rate": 1.340001933882394e-06, + "loss": 1.1512, + "step": 3009 + }, + { + "epoch": 0.40805259947129396, + "grad_norm": 2.513713600945744, + "learning_rate": 1.3395889249530642e-06, + "loss": 1.1946, + "step": 3010 + }, + { + "epoch": 0.40818816511895883, + "grad_norm": 2.2798066852207377, + "learning_rate": 1.339175850536691e-06, + "loss": 1.1707, + "step": 3011 + }, + { + "epoch": 0.40832373076662376, + "grad_norm": 2.6588519485216713, + "learning_rate": 1.338762710712932e-06, + "loss": 1.1787, + "step": 3012 + }, + { + "epoch": 0.40845929641428863, + "grad_norm": 1.6853348261380836, + "learning_rate": 1.3383495055614586e-06, + "loss": 1.1683, + "step": 3013 + }, + { + 
"epoch": 0.4085948620619535, + "grad_norm": 2.1778000829623996, + "learning_rate": 1.3379362351619537e-06, + "loss": 1.1827, + "step": 3014 + }, + { + "epoch": 0.40873042770961837, + "grad_norm": 1.6819374144592276, + "learning_rate": 1.3375228995941132e-06, + "loss": 1.2166, + "step": 3015 + }, + { + "epoch": 0.40886599335728324, + "grad_norm": 2.0399652343070223, + "learning_rate": 1.337109498937646e-06, + "loss": 1.2042, + "step": 3016 + }, + { + "epoch": 0.40900155900494817, + "grad_norm": 2.017657693184588, + "learning_rate": 1.3366960332722728e-06, + "loss": 1.1543, + "step": 3017 + }, + { + "epoch": 0.40913712465261304, + "grad_norm": 1.6773712733173962, + "learning_rate": 1.3362825026777272e-06, + "loss": 1.2116, + "step": 3018 + }, + { + "epoch": 0.4092726903002779, + "grad_norm": 1.637417109637194, + "learning_rate": 1.3358689072337554e-06, + "loss": 1.185, + "step": 3019 + }, + { + "epoch": 0.4094082559479428, + "grad_norm": 1.75233645891359, + "learning_rate": 1.3354552470201161e-06, + "loss": 1.1385, + "step": 3020 + }, + { + "epoch": 0.40954382159560765, + "grad_norm": 1.9060044056429097, + "learning_rate": 1.3350415221165805e-06, + "loss": 1.1687, + "step": 3021 + }, + { + "epoch": 0.4096793872432726, + "grad_norm": 1.5582300030599854, + "learning_rate": 1.3346277326029317e-06, + "loss": 1.159, + "step": 3022 + }, + { + "epoch": 0.40981495289093745, + "grad_norm": 1.7741752750774504, + "learning_rate": 1.3342138785589666e-06, + "loss": 1.1543, + "step": 3023 + }, + { + "epoch": 0.4099505185386023, + "grad_norm": 1.836928432110353, + "learning_rate": 1.3337999600644928e-06, + "loss": 1.2109, + "step": 3024 + }, + { + "epoch": 0.4100860841862672, + "grad_norm": 3.005852523992281, + "learning_rate": 1.3333859771993315e-06, + "loss": 1.1625, + "step": 3025 + }, + { + "epoch": 0.41022164983393206, + "grad_norm": 1.8401604378626297, + "learning_rate": 1.332971930043316e-06, + "loss": 1.1569, + "step": 3026 + }, + { + "epoch": 0.410357215481597, + "grad_norm": 2.4973676421872035, + "learning_rate": 1.3325578186762923e-06, + "loss": 1.1852, + "step": 3027 + }, + { + "epoch": 0.41049278112926185, + "grad_norm": 1.7263333485159604, + "learning_rate": 1.3321436431781183e-06, + "loss": 1.1757, + "step": 3028 + }, + { + "epoch": 0.4106283467769267, + "grad_norm": 1.7634693553916265, + "learning_rate": 1.3317294036286644e-06, + "loss": 1.1261, + "step": 3029 + }, + { + "epoch": 0.4107639124245916, + "grad_norm": 1.6966662834603952, + "learning_rate": 1.3313151001078135e-06, + "loss": 1.1446, + "step": 3030 + }, + { + "epoch": 0.41089947807225646, + "grad_norm": 2.697733192641991, + "learning_rate": 1.3309007326954608e-06, + "loss": 1.1524, + "step": 3031 + }, + { + "epoch": 0.4110350437199214, + "grad_norm": 2.161612237651701, + "learning_rate": 1.330486301471514e-06, + "loss": 1.1899, + "step": 3032 + }, + { + "epoch": 0.41117060936758626, + "grad_norm": 2.0460244369767535, + "learning_rate": 1.3300718065158924e-06, + "loss": 1.1343, + "step": 3033 + }, + { + "epoch": 0.41130617501525113, + "grad_norm": 1.8126315163707074, + "learning_rate": 1.3296572479085284e-06, + "loss": 1.2306, + "step": 3034 + }, + { + "epoch": 0.411441740662916, + "grad_norm": 1.7204587099913282, + "learning_rate": 1.3292426257293668e-06, + "loss": 1.1507, + "step": 3035 + }, + { + "epoch": 0.4115773063105809, + "grad_norm": 1.6276079415240052, + "learning_rate": 1.3288279400583631e-06, + "loss": 1.1871, + "step": 3036 + }, + { + "epoch": 0.4117128719582458, + "grad_norm": 1.5662208729745983, + "learning_rate": 
1.3284131909754868e-06, + "loss": 1.1675, + "step": 3037 + }, + { + "epoch": 0.41184843760591067, + "grad_norm": 10.05894303377669, + "learning_rate": 1.3279983785607192e-06, + "loss": 1.1614, + "step": 3038 + }, + { + "epoch": 0.41198400325357554, + "grad_norm": 1.7414511404180846, + "learning_rate": 1.327583502894053e-06, + "loss": 1.1415, + "step": 3039 + }, + { + "epoch": 0.4121195689012404, + "grad_norm": 1.8164821261548358, + "learning_rate": 1.3271685640554943e-06, + "loss": 1.141, + "step": 3040 + }, + { + "epoch": 0.41225513454890533, + "grad_norm": 1.9780830004822414, + "learning_rate": 1.3267535621250604e-06, + "loss": 1.1686, + "step": 3041 + }, + { + "epoch": 0.4123907001965702, + "grad_norm": 1.6962441192070048, + "learning_rate": 1.3263384971827816e-06, + "loss": 1.1673, + "step": 3042 + }, + { + "epoch": 0.4125262658442351, + "grad_norm": 1.9299530767032598, + "learning_rate": 1.3259233693086993e-06, + "loss": 1.1637, + "step": 3043 + }, + { + "epoch": 0.41266183149189994, + "grad_norm": 2.216832427998063, + "learning_rate": 1.3255081785828678e-06, + "loss": 1.1796, + "step": 3044 + }, + { + "epoch": 0.4127973971395648, + "grad_norm": 1.622965377738106, + "learning_rate": 1.3250929250853537e-06, + "loss": 1.196, + "step": 3045 + }, + { + "epoch": 0.41293296278722974, + "grad_norm": 1.80550250016018, + "learning_rate": 1.324677608896235e-06, + "loss": 1.1731, + "step": 3046 + }, + { + "epoch": 0.4130685284348946, + "grad_norm": 2.282817061504149, + "learning_rate": 1.3242622300956027e-06, + "loss": 1.1539, + "step": 3047 + }, + { + "epoch": 0.4132040940825595, + "grad_norm": 2.7038594977979065, + "learning_rate": 1.3238467887635583e-06, + "loss": 1.175, + "step": 3048 + }, + { + "epoch": 0.41333965973022435, + "grad_norm": 2.821674453141396, + "learning_rate": 1.3234312849802173e-06, + "loss": 1.181, + "step": 3049 + }, + { + "epoch": 0.4134752253778892, + "grad_norm": 2.2630377519229783, + "learning_rate": 1.323015718825706e-06, + "loss": 1.148, + "step": 3050 + }, + { + "epoch": 0.41361079102555415, + "grad_norm": 1.8286209987912367, + "learning_rate": 1.3226000903801632e-06, + "loss": 1.1506, + "step": 3051 + }, + { + "epoch": 0.413746356673219, + "grad_norm": 2.1541845092860634, + "learning_rate": 1.322184399723739e-06, + "loss": 1.2008, + "step": 3052 + }, + { + "epoch": 0.4138819223208839, + "grad_norm": 2.170134316068743, + "learning_rate": 1.3217686469365967e-06, + "loss": 1.196, + "step": 3053 + }, + { + "epoch": 0.41401748796854876, + "grad_norm": 2.051181549905855, + "learning_rate": 1.3213528320989107e-06, + "loss": 1.1843, + "step": 3054 + }, + { + "epoch": 0.41415305361621363, + "grad_norm": 1.8209641971581707, + "learning_rate": 1.3209369552908676e-06, + "loss": 1.1385, + "step": 3055 + }, + { + "epoch": 0.41428861926387855, + "grad_norm": 2.116393469872813, + "learning_rate": 1.320521016592666e-06, + "loss": 1.1567, + "step": 3056 + }, + { + "epoch": 0.4144241849115434, + "grad_norm": 1.9026675277141971, + "learning_rate": 1.3201050160845164e-06, + "loss": 1.1736, + "step": 3057 + }, + { + "epoch": 0.4145597505592083, + "grad_norm": 1.8034832614054075, + "learning_rate": 1.3196889538466413e-06, + "loss": 1.1472, + "step": 3058 + }, + { + "epoch": 0.41469531620687317, + "grad_norm": 2.043091603133934, + "learning_rate": 1.319272829959275e-06, + "loss": 1.1572, + "step": 3059 + }, + { + "epoch": 0.41483088185453804, + "grad_norm": 2.0475401083896863, + "learning_rate": 1.3188566445026635e-06, + "loss": 1.1835, + "step": 3060 + }, + { + "epoch": 
0.41496644750220296, + "grad_norm": 2.2154799763211876, + "learning_rate": 1.3184403975570648e-06, + "loss": 1.1544, + "step": 3061 + }, + { + "epoch": 0.41510201314986783, + "grad_norm": 2.216007273633919, + "learning_rate": 1.3180240892027494e-06, + "loss": 1.1647, + "step": 3062 + }, + { + "epoch": 0.4152375787975327, + "grad_norm": 1.708904398131868, + "learning_rate": 1.3176077195199984e-06, + "loss": 1.1594, + "step": 3063 + }, + { + "epoch": 0.4153731444451976, + "grad_norm": 1.977781181896816, + "learning_rate": 1.3171912885891061e-06, + "loss": 1.1926, + "step": 3064 + }, + { + "epoch": 0.41550871009286244, + "grad_norm": 2.818861563398981, + "learning_rate": 1.3167747964903775e-06, + "loss": 1.1955, + "step": 3065 + }, + { + "epoch": 0.41564427574052737, + "grad_norm": 3.170858308781756, + "learning_rate": 1.3163582433041296e-06, + "loss": 1.1686, + "step": 3066 + }, + { + "epoch": 0.41577984138819224, + "grad_norm": 1.6783883006038822, + "learning_rate": 1.3159416291106916e-06, + "loss": 1.2005, + "step": 3067 + }, + { + "epoch": 0.4159154070358571, + "grad_norm": 5.839674953139588, + "learning_rate": 1.3155249539904049e-06, + "loss": 1.1856, + "step": 3068 + }, + { + "epoch": 0.416050972683522, + "grad_norm": 1.7703617806785277, + "learning_rate": 1.3151082180236209e-06, + "loss": 1.1966, + "step": 3069 + }, + { + "epoch": 0.41618653833118685, + "grad_norm": 1.645623138714808, + "learning_rate": 1.3146914212907042e-06, + "loss": 1.1471, + "step": 3070 + }, + { + "epoch": 0.4163221039788518, + "grad_norm": 6.189783884685763, + "learning_rate": 1.3142745638720314e-06, + "loss": 1.1681, + "step": 3071 + }, + { + "epoch": 0.41645766962651665, + "grad_norm": 1.6886862944312517, + "learning_rate": 1.3138576458479893e-06, + "loss": 1.1976, + "step": 3072 + }, + { + "epoch": 0.4165932352741815, + "grad_norm": 2.3009427063868952, + "learning_rate": 1.3134406672989779e-06, + "loss": 1.163, + "step": 3073 + }, + { + "epoch": 0.4167288009218464, + "grad_norm": 2.00125177373353, + "learning_rate": 1.313023628305408e-06, + "loss": 1.1886, + "step": 3074 + }, + { + "epoch": 0.4168643665695113, + "grad_norm": 2.3845445818122695, + "learning_rate": 1.3126065289477019e-06, + "loss": 1.2175, + "step": 3075 + }, + { + "epoch": 0.4169999322171762, + "grad_norm": 1.6927186780985861, + "learning_rate": 1.3121893693062947e-06, + "loss": 1.1752, + "step": 3076 + }, + { + "epoch": 0.41713549786484105, + "grad_norm": 2.3110506602676715, + "learning_rate": 1.3117721494616319e-06, + "loss": 1.1589, + "step": 3077 + }, + { + "epoch": 0.4172710635125059, + "grad_norm": 1.589789423333547, + "learning_rate": 1.3113548694941708e-06, + "loss": 1.1075, + "step": 3078 + }, + { + "epoch": 0.4174066291601708, + "grad_norm": 1.6398254923106874, + "learning_rate": 1.3109375294843808e-06, + "loss": 1.1459, + "step": 3079 + }, + { + "epoch": 0.4175421948078357, + "grad_norm": 1.7936176752574484, + "learning_rate": 1.3105201295127426e-06, + "loss": 1.2269, + "step": 3080 + }, + { + "epoch": 0.4176777604555006, + "grad_norm": 2.4600481737684103, + "learning_rate": 1.3101026696597487e-06, + "loss": 1.1822, + "step": 3081 + }, + { + "epoch": 0.41781332610316546, + "grad_norm": 2.6745944199244307, + "learning_rate": 1.3096851500059028e-06, + "loss": 1.1455, + "step": 3082 + }, + { + "epoch": 0.41794889175083033, + "grad_norm": 2.2861606446176994, + "learning_rate": 1.3092675706317197e-06, + "loss": 1.1953, + "step": 3083 + }, + { + "epoch": 0.4180844573984952, + "grad_norm": 1.996338396462075, + "learning_rate": 
1.3088499316177272e-06, + "loss": 1.2122, + "step": 3084 + }, + { + "epoch": 0.4182200230461601, + "grad_norm": 1.8296023180239218, + "learning_rate": 1.3084322330444635e-06, + "loss": 1.1523, + "step": 3085 + }, + { + "epoch": 0.418355588693825, + "grad_norm": 1.9945128172429119, + "learning_rate": 1.3080144749924782e-06, + "loss": 1.1445, + "step": 3086 + }, + { + "epoch": 0.41849115434148987, + "grad_norm": 1.8094527941192589, + "learning_rate": 1.3075966575423326e-06, + "loss": 1.2112, + "step": 3087 + }, + { + "epoch": 0.41862671998915474, + "grad_norm": 2.2363064935574894, + "learning_rate": 1.3071787807745996e-06, + "loss": 1.1651, + "step": 3088 + }, + { + "epoch": 0.4187622856368196, + "grad_norm": 1.804366649894951, + "learning_rate": 1.3067608447698633e-06, + "loss": 1.1459, + "step": 3089 + }, + { + "epoch": 0.41889785128448453, + "grad_norm": 1.6542551431640558, + "learning_rate": 1.3063428496087196e-06, + "loss": 1.1853, + "step": 3090 + }, + { + "epoch": 0.4190334169321494, + "grad_norm": 2.1474035666106497, + "learning_rate": 1.3059247953717758e-06, + "loss": 1.1875, + "step": 3091 + }, + { + "epoch": 0.4191689825798143, + "grad_norm": 2.1927138048393044, + "learning_rate": 1.3055066821396498e-06, + "loss": 1.1951, + "step": 3092 + }, + { + "epoch": 0.41930454822747915, + "grad_norm": 1.6705944041543275, + "learning_rate": 1.3050885099929716e-06, + "loss": 1.1506, + "step": 3093 + }, + { + "epoch": 0.419440113875144, + "grad_norm": 3.098663820111435, + "learning_rate": 1.3046702790123824e-06, + "loss": 1.1379, + "step": 3094 + }, + { + "epoch": 0.41957567952280894, + "grad_norm": 1.8351157052778162, + "learning_rate": 1.3042519892785353e-06, + "loss": 1.1736, + "step": 3095 + }, + { + "epoch": 0.4197112451704738, + "grad_norm": 3.3083328500210403, + "learning_rate": 1.3038336408720932e-06, + "loss": 1.1613, + "step": 3096 + }, + { + "epoch": 0.4198468108181387, + "grad_norm": 2.22814845783556, + "learning_rate": 1.303415233873732e-06, + "loss": 1.1911, + "step": 3097 + }, + { + "epoch": 0.41998237646580355, + "grad_norm": 1.6727366115741436, + "learning_rate": 1.3029967683641378e-06, + "loss": 1.183, + "step": 3098 + }, + { + "epoch": 0.4201179421134684, + "grad_norm": 2.0690810537330138, + "learning_rate": 1.3025782444240085e-06, + "loss": 1.178, + "step": 3099 + }, + { + "epoch": 0.42025350776113335, + "grad_norm": 2.5995496500517876, + "learning_rate": 1.3021596621340533e-06, + "loss": 1.1638, + "step": 3100 + }, + { + "epoch": 0.4203890734087982, + "grad_norm": 1.8480149003321473, + "learning_rate": 1.3017410215749924e-06, + "loss": 1.14, + "step": 3101 + }, + { + "epoch": 0.4205246390564631, + "grad_norm": 2.4402145287698795, + "learning_rate": 1.3013223228275571e-06, + "loss": 1.2252, + "step": 3102 + }, + { + "epoch": 0.42066020470412796, + "grad_norm": 2.0164868856072946, + "learning_rate": 1.3009035659724904e-06, + "loss": 1.1932, + "step": 3103 + }, + { + "epoch": 0.42079577035179283, + "grad_norm": 2.079826664650729, + "learning_rate": 1.3004847510905463e-06, + "loss": 1.1844, + "step": 3104 + }, + { + "epoch": 0.42093133599945776, + "grad_norm": 16.380274679911583, + "learning_rate": 1.30006587826249e-06, + "loss": 1.1813, + "step": 3105 + }, + { + "epoch": 0.4210669016471226, + "grad_norm": 3.0830888660665074, + "learning_rate": 1.2996469475690975e-06, + "loss": 1.1463, + "step": 3106 + }, + { + "epoch": 0.4212024672947875, + "grad_norm": 1.720293448317055, + "learning_rate": 1.2992279590911563e-06, + "loss": 1.2004, + "step": 3107 + }, + { + "epoch": 
0.42133803294245237, + "grad_norm": 1.8687511557508691, + "learning_rate": 1.298808912909465e-06, + "loss": 1.1498, + "step": 3108 + }, + { + "epoch": 0.42147359859011724, + "grad_norm": 1.8005369706864232, + "learning_rate": 1.298389809104834e-06, + "loss": 1.185, + "step": 3109 + }, + { + "epoch": 0.42160916423778216, + "grad_norm": 1.9202436747845568, + "learning_rate": 1.297970647758083e-06, + "loss": 1.164, + "step": 3110 + }, + { + "epoch": 0.42174472988544703, + "grad_norm": 1.6360354830846116, + "learning_rate": 1.2975514289500451e-06, + "loss": 1.1821, + "step": 3111 + }, + { + "epoch": 0.4218802955331119, + "grad_norm": 1.8566320287732803, + "learning_rate": 1.2971321527615629e-06, + "loss": 1.2182, + "step": 3112 + }, + { + "epoch": 0.4220158611807768, + "grad_norm": 1.7626802572933564, + "learning_rate": 1.2967128192734902e-06, + "loss": 1.1861, + "step": 3113 + }, + { + "epoch": 0.4221514268284417, + "grad_norm": 2.416986372658114, + "learning_rate": 1.2962934285666924e-06, + "loss": 1.1849, + "step": 3114 + }, + { + "epoch": 0.42228699247610657, + "grad_norm": 1.6661893013560887, + "learning_rate": 1.295873980722046e-06, + "loss": 1.2078, + "step": 3115 + }, + { + "epoch": 0.42242255812377144, + "grad_norm": 1.9438854690642646, + "learning_rate": 1.2954544758204374e-06, + "loss": 1.1861, + "step": 3116 + }, + { + "epoch": 0.4225581237714363, + "grad_norm": 2.2001747571315455, + "learning_rate": 1.2950349139427659e-06, + "loss": 1.1628, + "step": 3117 + }, + { + "epoch": 0.4226936894191012, + "grad_norm": 1.8267785237341367, + "learning_rate": 1.2946152951699398e-06, + "loss": 1.172, + "step": 3118 + }, + { + "epoch": 0.4228292550667661, + "grad_norm": 1.6967067683249457, + "learning_rate": 1.2941956195828797e-06, + "loss": 1.1871, + "step": 3119 + }, + { + "epoch": 0.422964820714431, + "grad_norm": 1.8342079898008503, + "learning_rate": 1.2937758872625166e-06, + "loss": 1.166, + "step": 3120 + }, + { + "epoch": 0.42310038636209585, + "grad_norm": 2.4463057369773553, + "learning_rate": 1.2933560982897924e-06, + "loss": 1.1508, + "step": 3121 + }, + { + "epoch": 0.4232359520097607, + "grad_norm": 2.1060008114429385, + "learning_rate": 1.2929362527456604e-06, + "loss": 1.1884, + "step": 3122 + }, + { + "epoch": 0.4233715176574256, + "grad_norm": 3.5403168806091996, + "learning_rate": 1.2925163507110843e-06, + "loss": 1.171, + "step": 3123 + }, + { + "epoch": 0.4235070833050905, + "grad_norm": 1.935357183534972, + "learning_rate": 1.292096392267039e-06, + "loss": 1.1675, + "step": 3124 + }, + { + "epoch": 0.4236426489527554, + "grad_norm": 2.2392340694838575, + "learning_rate": 1.2916763774945101e-06, + "loss": 1.1677, + "step": 3125 + }, + { + "epoch": 0.42377821460042026, + "grad_norm": 1.9641705643897425, + "learning_rate": 1.2912563064744938e-06, + "loss": 1.1642, + "step": 3126 + }, + { + "epoch": 0.4239137802480851, + "grad_norm": 2.0579254273925, + "learning_rate": 1.2908361792879984e-06, + "loss": 1.1718, + "step": 3127 + }, + { + "epoch": 0.42404934589575, + "grad_norm": 3.5410437238539694, + "learning_rate": 1.2904159960160415e-06, + "loss": 1.1472, + "step": 3128 + }, + { + "epoch": 0.4241849115434149, + "grad_norm": 2.191846397939321, + "learning_rate": 1.289995756739652e-06, + "loss": 1.1758, + "step": 3129 + }, + { + "epoch": 0.4243204771910798, + "grad_norm": 1.780356789969084, + "learning_rate": 1.2895754615398697e-06, + "loss": 1.139, + "step": 3130 + }, + { + "epoch": 0.42445604283874466, + "grad_norm": 1.6171438479704194, + "learning_rate": 
1.2891551104977457e-06, + "loss": 1.1329, + "step": 3131 + }, + { + "epoch": 0.42459160848640953, + "grad_norm": 1.5699622309289638, + "learning_rate": 1.2887347036943407e-06, + "loss": 1.1647, + "step": 3132 + }, + { + "epoch": 0.4247271741340744, + "grad_norm": 1.6932370611938556, + "learning_rate": 1.288314241210728e-06, + "loss": 1.1948, + "step": 3133 + }, + { + "epoch": 0.42486273978173933, + "grad_norm": 1.6349154112885063, + "learning_rate": 1.2878937231279892e-06, + "loss": 1.1789, + "step": 3134 + }, + { + "epoch": 0.4249983054294042, + "grad_norm": 2.3612965032634787, + "learning_rate": 1.2874731495272181e-06, + "loss": 1.1502, + "step": 3135 + }, + { + "epoch": 0.42513387107706907, + "grad_norm": 2.27228208086149, + "learning_rate": 1.2870525204895197e-06, + "loss": 1.2129, + "step": 3136 + }, + { + "epoch": 0.42526943672473394, + "grad_norm": 1.8993186022681066, + "learning_rate": 1.2866318360960084e-06, + "loss": 1.1531, + "step": 3137 + }, + { + "epoch": 0.4254050023723988, + "grad_norm": 1.8394449596888116, + "learning_rate": 1.2862110964278102e-06, + "loss": 1.1784, + "step": 3138 + }, + { + "epoch": 0.42554056802006374, + "grad_norm": 1.9509955326513393, + "learning_rate": 1.2857903015660612e-06, + "loss": 1.199, + "step": 3139 + }, + { + "epoch": 0.4256761336677286, + "grad_norm": 1.788408240169189, + "learning_rate": 1.2853694515919082e-06, + "loss": 1.1804, + "step": 3140 + }, + { + "epoch": 0.4258116993153935, + "grad_norm": 5.732284374124145, + "learning_rate": 1.2849485465865092e-06, + "loss": 1.1638, + "step": 3141 + }, + { + "epoch": 0.42594726496305835, + "grad_norm": 1.9404786574113106, + "learning_rate": 1.2845275866310324e-06, + "loss": 1.1827, + "step": 3142 + }, + { + "epoch": 0.4260828306107232, + "grad_norm": 2.215714688710891, + "learning_rate": 1.2841065718066563e-06, + "loss": 1.1518, + "step": 3143 + }, + { + "epoch": 0.42621839625838814, + "grad_norm": 2.0989976881526973, + "learning_rate": 1.2836855021945705e-06, + "loss": 1.1759, + "step": 3144 + }, + { + "epoch": 0.426353961906053, + "grad_norm": 2.272196069853469, + "learning_rate": 1.283264377875975e-06, + "loss": 1.1797, + "step": 3145 + }, + { + "epoch": 0.4264895275537179, + "grad_norm": 1.8674890932748403, + "learning_rate": 1.2828431989320797e-06, + "loss": 1.1463, + "step": 3146 + }, + { + "epoch": 0.42662509320138275, + "grad_norm": 3.254485941383661, + "learning_rate": 1.2824219654441067e-06, + "loss": 1.1568, + "step": 3147 + }, + { + "epoch": 0.4267606588490476, + "grad_norm": 4.239097189722022, + "learning_rate": 1.2820006774932866e-06, + "loss": 1.1932, + "step": 3148 + }, + { + "epoch": 0.42689622449671255, + "grad_norm": 2.2490870383550123, + "learning_rate": 1.281579335160862e-06, + "loss": 1.1634, + "step": 3149 + }, + { + "epoch": 0.4270317901443774, + "grad_norm": 1.7139308045793127, + "learning_rate": 1.281157938528085e-06, + "loss": 1.1381, + "step": 3150 + }, + { + "epoch": 0.4271673557920423, + "grad_norm": 1.6265497915456584, + "learning_rate": 1.280736487676219e-06, + "loss": 1.1726, + "step": 3151 + }, + { + "epoch": 0.42730292143970716, + "grad_norm": 1.8229258214480981, + "learning_rate": 1.2803149826865375e-06, + "loss": 1.167, + "step": 3152 + }, + { + "epoch": 0.4274384870873721, + "grad_norm": 1.9839831362396014, + "learning_rate": 1.279893423640324e-06, + "loss": 1.1617, + "step": 3153 + }, + { + "epoch": 0.42757405273503696, + "grad_norm": 2.2491702373734683, + "learning_rate": 1.2794718106188734e-06, + "loss": 1.2026, + "step": 3154 + }, + { + "epoch": 
0.42770961838270183, + "grad_norm": 1.552758088551857, + "learning_rate": 1.27905014370349e-06, + "loss": 1.185, + "step": 3155 + }, + { + "epoch": 0.4278451840303667, + "grad_norm": 1.9328259621105293, + "learning_rate": 1.2786284229754892e-06, + "loss": 1.153, + "step": 3156 + }, + { + "epoch": 0.42798074967803157, + "grad_norm": 1.6663427157850337, + "learning_rate": 1.2782066485161961e-06, + "loss": 1.175, + "step": 3157 + }, + { + "epoch": 0.4281163153256965, + "grad_norm": 2.0001952730201835, + "learning_rate": 1.2777848204069473e-06, + "loss": 1.1955, + "step": 3158 + }, + { + "epoch": 0.42825188097336137, + "grad_norm": 1.7264991279565438, + "learning_rate": 1.2773629387290883e-06, + "loss": 1.1667, + "step": 3159 + }, + { + "epoch": 0.42838744662102624, + "grad_norm": 1.8294662370194177, + "learning_rate": 1.276941003563976e-06, + "loss": 1.139, + "step": 3160 + }, + { + "epoch": 0.4285230122686911, + "grad_norm": 1.5512026075899685, + "learning_rate": 1.276519014992977e-06, + "loss": 1.1877, + "step": 3161 + }, + { + "epoch": 0.428658577916356, + "grad_norm": 1.6069960329921702, + "learning_rate": 1.276096973097469e-06, + "loss": 1.1377, + "step": 3162 + }, + { + "epoch": 0.4287941435640209, + "grad_norm": 1.6880255615202786, + "learning_rate": 1.275674877958839e-06, + "loss": 1.1932, + "step": 3163 + }, + { + "epoch": 0.4289297092116858, + "grad_norm": 1.7133680864024008, + "learning_rate": 1.2752527296584847e-06, + "loss": 1.1647, + "step": 3164 + }, + { + "epoch": 0.42906527485935064, + "grad_norm": 1.492242888638747, + "learning_rate": 1.2748305282778142e-06, + "loss": 1.1485, + "step": 3165 + }, + { + "epoch": 0.4292008405070155, + "grad_norm": 1.7106267750951016, + "learning_rate": 1.2744082738982457e-06, + "loss": 1.168, + "step": 3166 + }, + { + "epoch": 0.4293364061546804, + "grad_norm": 1.8092385795409103, + "learning_rate": 1.2739859666012076e-06, + "loss": 1.1768, + "step": 3167 + }, + { + "epoch": 0.4294719718023453, + "grad_norm": 1.8264571080559888, + "learning_rate": 1.2735636064681387e-06, + "loss": 1.156, + "step": 3168 + }, + { + "epoch": 0.4296075374500102, + "grad_norm": 1.8887830438171176, + "learning_rate": 1.2731411935804877e-06, + "loss": 1.1631, + "step": 3169 + }, + { + "epoch": 0.42974310309767505, + "grad_norm": 1.899383430513542, + "learning_rate": 1.2727187280197133e-06, + "loss": 1.152, + "step": 3170 + }, + { + "epoch": 0.4298786687453399, + "grad_norm": 1.994330120947215, + "learning_rate": 1.272296209867285e-06, + "loss": 1.1793, + "step": 3171 + }, + { + "epoch": 0.4300142343930048, + "grad_norm": 1.9031598840303248, + "learning_rate": 1.2718736392046824e-06, + "loss": 1.1639, + "step": 3172 + }, + { + "epoch": 0.4301498000406697, + "grad_norm": 1.8048842445535338, + "learning_rate": 1.271451016113394e-06, + "loss": 1.1387, + "step": 3173 + }, + { + "epoch": 0.4302853656883346, + "grad_norm": 1.9449195371628156, + "learning_rate": 1.27102834067492e-06, + "loss": 1.178, + "step": 3174 + }, + { + "epoch": 0.43042093133599946, + "grad_norm": 2.389726585823647, + "learning_rate": 1.2706056129707703e-06, + "loss": 1.2056, + "step": 3175 + }, + { + "epoch": 0.4305564969836643, + "grad_norm": 1.7210076180325349, + "learning_rate": 1.2701828330824638e-06, + "loss": 1.1897, + "step": 3176 + }, + { + "epoch": 0.4306920626313292, + "grad_norm": 1.5288455067266968, + "learning_rate": 1.2697600010915306e-06, + "loss": 1.2044, + "step": 3177 + }, + { + "epoch": 0.4308276282789941, + "grad_norm": 2.1975345157080075, + "learning_rate": 1.2693371170795107e-06, 
+ "loss": 1.139, + "step": 3178 + }, + { + "epoch": 0.430963193926659, + "grad_norm": 4.1953902117768225, + "learning_rate": 1.2689141811279536e-06, + "loss": 1.1571, + "step": 3179 + }, + { + "epoch": 0.43109875957432386, + "grad_norm": 1.6335383359067932, + "learning_rate": 1.2684911933184193e-06, + "loss": 1.1538, + "step": 3180 + }, + { + "epoch": 0.43123432522198873, + "grad_norm": 2.0546307375181194, + "learning_rate": 1.2680681537324779e-06, + "loss": 1.1824, + "step": 3181 + }, + { + "epoch": 0.4313698908696536, + "grad_norm": 1.6951148700561924, + "learning_rate": 1.267645062451709e-06, + "loss": 1.1411, + "step": 3182 + }, + { + "epoch": 0.43150545651731853, + "grad_norm": 1.8934368591137292, + "learning_rate": 1.2672219195577023e-06, + "loss": 1.2574, + "step": 3183 + }, + { + "epoch": 0.4316410221649834, + "grad_norm": 1.8453169929401623, + "learning_rate": 1.266798725132058e-06, + "loss": 1.1698, + "step": 3184 + }, + { + "epoch": 0.43177658781264827, + "grad_norm": 3.624897124210937, + "learning_rate": 1.2663754792563852e-06, + "loss": 1.1385, + "step": 3185 + }, + { + "epoch": 0.43191215346031314, + "grad_norm": 1.703626291279991, + "learning_rate": 1.2659521820123042e-06, + "loss": 1.1924, + "step": 3186 + }, + { + "epoch": 0.432047719107978, + "grad_norm": 2.0178938112739697, + "learning_rate": 1.265528833481444e-06, + "loss": 1.1829, + "step": 3187 + }, + { + "epoch": 0.43218328475564294, + "grad_norm": 2.0004355137593754, + "learning_rate": 1.2651054337454443e-06, + "loss": 1.1724, + "step": 3188 + }, + { + "epoch": 0.4323188504033078, + "grad_norm": 7.088209173339371, + "learning_rate": 1.2646819828859545e-06, + "loss": 1.1749, + "step": 3189 + }, + { + "epoch": 0.4324544160509727, + "grad_norm": 1.5905194028282774, + "learning_rate": 1.2642584809846333e-06, + "loss": 1.1727, + "step": 3190 + }, + { + "epoch": 0.43258998169863755, + "grad_norm": 2.2658248084449277, + "learning_rate": 1.2638349281231503e-06, + "loss": 1.1978, + "step": 3191 + }, + { + "epoch": 0.4327255473463024, + "grad_norm": 2.0482742107026546, + "learning_rate": 1.2634113243831836e-06, + "loss": 1.1899, + "step": 3192 + }, + { + "epoch": 0.43286111299396735, + "grad_norm": 1.8958474672516554, + "learning_rate": 1.2629876698464223e-06, + "loss": 1.1575, + "step": 3193 + }, + { + "epoch": 0.4329966786416322, + "grad_norm": 2.598165621319888, + "learning_rate": 1.2625639645945652e-06, + "loss": 1.1789, + "step": 3194 + }, + { + "epoch": 0.4331322442892971, + "grad_norm": 1.7452662514622603, + "learning_rate": 1.2621402087093195e-06, + "loss": 1.1809, + "step": 3195 + }, + { + "epoch": 0.43326780993696196, + "grad_norm": 1.8769439386595614, + "learning_rate": 1.261716402272404e-06, + "loss": 1.1661, + "step": 3196 + }, + { + "epoch": 0.4334033755846269, + "grad_norm": 2.279977773323194, + "learning_rate": 1.2612925453655462e-06, + "loss": 1.1553, + "step": 3197 + }, + { + "epoch": 0.43353894123229175, + "grad_norm": 2.207340785467249, + "learning_rate": 1.2608686380704838e-06, + "loss": 1.1238, + "step": 3198 + }, + { + "epoch": 0.4336745068799566, + "grad_norm": 2.2180092741117474, + "learning_rate": 1.2604446804689635e-06, + "loss": 1.1829, + "step": 3199 + }, + { + "epoch": 0.4338100725276215, + "grad_norm": 1.9413104500726144, + "learning_rate": 1.2600206726427422e-06, + "loss": 1.1694, + "step": 3200 + }, + { + "epoch": 0.43394563817528636, + "grad_norm": 1.7920951551214521, + "learning_rate": 1.2595966146735868e-06, + "loss": 1.1692, + "step": 3201 + }, + { + "epoch": 0.4340812038229513, + 
"grad_norm": 1.70875864205714, + "learning_rate": 1.2591725066432734e-06, + "loss": 1.1553, + "step": 3202 + }, + { + "epoch": 0.43421676947061616, + "grad_norm": 1.7282517975470448, + "learning_rate": 1.258748348633588e-06, + "loss": 1.1481, + "step": 3203 + }, + { + "epoch": 0.43435233511828103, + "grad_norm": 1.6181284005135514, + "learning_rate": 1.2583241407263259e-06, + "loss": 1.1612, + "step": 3204 + }, + { + "epoch": 0.4344879007659459, + "grad_norm": 2.0034725824863187, + "learning_rate": 1.2578998830032924e-06, + "loss": 1.181, + "step": 3205 + }, + { + "epoch": 0.43462346641361077, + "grad_norm": 2.336076417714654, + "learning_rate": 1.257475575546302e-06, + "loss": 1.1577, + "step": 3206 + }, + { + "epoch": 0.4347590320612757, + "grad_norm": 1.7080347848972504, + "learning_rate": 1.2570512184371796e-06, + "loss": 1.1552, + "step": 3207 + }, + { + "epoch": 0.43489459770894057, + "grad_norm": 2.069368593452264, + "learning_rate": 1.2566268117577583e-06, + "loss": 1.1701, + "step": 3208 + }, + { + "epoch": 0.43503016335660544, + "grad_norm": 1.7077977705940826, + "learning_rate": 1.2562023555898823e-06, + "loss": 1.168, + "step": 3209 + }, + { + "epoch": 0.4351657290042703, + "grad_norm": 2.3560561794326977, + "learning_rate": 1.2557778500154044e-06, + "loss": 1.157, + "step": 3210 + }, + { + "epoch": 0.4353012946519352, + "grad_norm": 2.318062475722456, + "learning_rate": 1.2553532951161868e-06, + "loss": 1.2071, + "step": 3211 + }, + { + "epoch": 0.4354368602996001, + "grad_norm": 1.71872309281461, + "learning_rate": 1.2549286909741024e-06, + "loss": 1.1219, + "step": 3212 + }, + { + "epoch": 0.435572425947265, + "grad_norm": 1.6199290295698576, + "learning_rate": 1.254504037671032e-06, + "loss": 1.2024, + "step": 3213 + }, + { + "epoch": 0.43570799159492984, + "grad_norm": 1.9127114443729478, + "learning_rate": 1.2540793352888667e-06, + "loss": 1.1453, + "step": 3214 + }, + { + "epoch": 0.4358435572425947, + "grad_norm": 1.6130756837436537, + "learning_rate": 1.2536545839095072e-06, + "loss": 1.1993, + "step": 3215 + }, + { + "epoch": 0.4359791228902596, + "grad_norm": 1.5080491809775511, + "learning_rate": 1.2532297836148636e-06, + "loss": 1.1494, + "step": 3216 + }, + { + "epoch": 0.4361146885379245, + "grad_norm": 1.9338843392075502, + "learning_rate": 1.2528049344868553e-06, + "loss": 1.1689, + "step": 3217 + }, + { + "epoch": 0.4362502541855894, + "grad_norm": 2.1291701837040518, + "learning_rate": 1.2523800366074104e-06, + "loss": 1.1698, + "step": 3218 + }, + { + "epoch": 0.43638581983325425, + "grad_norm": 1.97093054918003, + "learning_rate": 1.251955090058468e-06, + "loss": 1.183, + "step": 3219 + }, + { + "epoch": 0.4365213854809191, + "grad_norm": 1.725676459879176, + "learning_rate": 1.251530094921975e-06, + "loss": 1.1998, + "step": 3220 + }, + { + "epoch": 0.436656951128584, + "grad_norm": 2.8923100745576327, + "learning_rate": 1.2511050512798889e-06, + "loss": 1.1394, + "step": 3221 + }, + { + "epoch": 0.4367925167762489, + "grad_norm": 5.194918678115781, + "learning_rate": 1.2506799592141754e-06, + "loss": 1.1385, + "step": 3222 + }, + { + "epoch": 0.4369280824239138, + "grad_norm": 1.8184206137750731, + "learning_rate": 1.2502548188068109e-06, + "loss": 1.1836, + "step": 3223 + }, + { + "epoch": 0.43706364807157866, + "grad_norm": 1.7891084481719197, + "learning_rate": 1.24982963013978e-06, + "loss": 1.1337, + "step": 3224 + }, + { + "epoch": 0.43719921371924353, + "grad_norm": 1.8213095229311367, + "learning_rate": 1.2494043932950768e-06, + "loss": 1.1798, + 
"step": 3225 + }, + { + "epoch": 0.4373347793669084, + "grad_norm": 2.7380125979466907, + "learning_rate": 1.248979108354705e-06, + "loss": 1.1664, + "step": 3226 + }, + { + "epoch": 0.4374703450145733, + "grad_norm": 2.7236237649609647, + "learning_rate": 1.2485537754006776e-06, + "loss": 1.1408, + "step": 3227 + }, + { + "epoch": 0.4376059106622382, + "grad_norm": 2.4908185965344476, + "learning_rate": 1.2481283945150164e-06, + "loss": 1.1898, + "step": 3228 + }, + { + "epoch": 0.43774147630990307, + "grad_norm": 2.485582869373698, + "learning_rate": 1.2477029657797531e-06, + "loss": 1.194, + "step": 3229 + }, + { + "epoch": 0.43787704195756794, + "grad_norm": 1.6795401429284265, + "learning_rate": 1.247277489276928e-06, + "loss": 1.1945, + "step": 3230 + }, + { + "epoch": 0.4380126076052328, + "grad_norm": 1.833063283790289, + "learning_rate": 1.2468519650885912e-06, + "loss": 1.1863, + "step": 3231 + }, + { + "epoch": 0.43814817325289773, + "grad_norm": 2.331518138709688, + "learning_rate": 1.2464263932968012e-06, + "loss": 1.1574, + "step": 3232 + }, + { + "epoch": 0.4382837389005626, + "grad_norm": 1.6665805501591617, + "learning_rate": 1.2460007739836265e-06, + "loss": 1.1777, + "step": 3233 + }, + { + "epoch": 0.4384193045482275, + "grad_norm": 2.058217825381139, + "learning_rate": 1.2455751072311443e-06, + "loss": 1.1452, + "step": 3234 + }, + { + "epoch": 0.43855487019589234, + "grad_norm": 2.0390342674165267, + "learning_rate": 1.245149393121441e-06, + "loss": 1.1963, + "step": 3235 + }, + { + "epoch": 0.43869043584355727, + "grad_norm": 5.931277423459934, + "learning_rate": 1.2447236317366124e-06, + "loss": 1.1767, + "step": 3236 + }, + { + "epoch": 0.43882600149122214, + "grad_norm": 1.874356273307693, + "learning_rate": 1.2442978231587633e-06, + "loss": 1.1615, + "step": 3237 + }, + { + "epoch": 0.438961567138887, + "grad_norm": 1.9055752477010293, + "learning_rate": 1.2438719674700073e-06, + "loss": 1.1775, + "step": 3238 + }, + { + "epoch": 0.4390971327865519, + "grad_norm": 1.543729427826975, + "learning_rate": 1.2434460647524675e-06, + "loss": 1.1327, + "step": 3239 + }, + { + "epoch": 0.43923269843421675, + "grad_norm": 1.6687545322140112, + "learning_rate": 1.2430201150882755e-06, + "loss": 1.1472, + "step": 3240 + }, + { + "epoch": 0.4393682640818817, + "grad_norm": 2.058457935349614, + "learning_rate": 1.2425941185595726e-06, + "loss": 1.2026, + "step": 3241 + }, + { + "epoch": 0.43950382972954655, + "grad_norm": 2.409913900313761, + "learning_rate": 1.2421680752485092e-06, + "loss": 1.1636, + "step": 3242 + }, + { + "epoch": 0.4396393953772114, + "grad_norm": 1.8061581724585933, + "learning_rate": 1.241741985237244e-06, + "loss": 1.1089, + "step": 3243 + }, + { + "epoch": 0.4397749610248763, + "grad_norm": 2.5452534983267623, + "learning_rate": 1.241315848607945e-06, + "loss": 1.1872, + "step": 3244 + }, + { + "epoch": 0.43991052667254116, + "grad_norm": 1.8546520328880451, + "learning_rate": 1.2408896654427894e-06, + "loss": 1.1647, + "step": 3245 + }, + { + "epoch": 0.4400460923202061, + "grad_norm": 1.6291975460679378, + "learning_rate": 1.2404634358239632e-06, + "loss": 1.2272, + "step": 3246 + }, + { + "epoch": 0.44018165796787095, + "grad_norm": 1.8062802342987534, + "learning_rate": 1.2400371598336617e-06, + "loss": 1.2125, + "step": 3247 + }, + { + "epoch": 0.4403172236155358, + "grad_norm": 2.50170155441982, + "learning_rate": 1.2396108375540885e-06, + "loss": 1.1853, + "step": 3248 + }, + { + "epoch": 0.4404527892632007, + "grad_norm": 1.5596622169517325, + 
"learning_rate": 1.2391844690674567e-06, + "loss": 1.1591, + "step": 3249 + }, + { + "epoch": 0.44058835491086557, + "grad_norm": 2.1485604083303005, + "learning_rate": 1.2387580544559881e-06, + "loss": 1.1602, + "step": 3250 + }, + { + "epoch": 0.4407239205585305, + "grad_norm": 2.580889250728992, + "learning_rate": 1.2383315938019132e-06, + "loss": 1.174, + "step": 3251 + }, + { + "epoch": 0.44085948620619536, + "grad_norm": 1.6787990026437534, + "learning_rate": 1.2379050871874719e-06, + "loss": 1.1927, + "step": 3252 + }, + { + "epoch": 0.44099505185386023, + "grad_norm": 4.494307124050675, + "learning_rate": 1.2374785346949125e-06, + "loss": 1.1418, + "step": 3253 + }, + { + "epoch": 0.4411306175015251, + "grad_norm": 1.8570849349909508, + "learning_rate": 1.2370519364064919e-06, + "loss": 1.1705, + "step": 3254 + }, + { + "epoch": 0.44126618314918997, + "grad_norm": 2.3785305682376086, + "learning_rate": 1.2366252924044767e-06, + "loss": 1.1665, + "step": 3255 + }, + { + "epoch": 0.4414017487968549, + "grad_norm": 2.86585279220783, + "learning_rate": 1.236198602771142e-06, + "loss": 1.1673, + "step": 3256 + }, + { + "epoch": 0.44153731444451977, + "grad_norm": 1.7895948192423465, + "learning_rate": 1.2357718675887707e-06, + "loss": 1.1576, + "step": 3257 + }, + { + "epoch": 0.44167288009218464, + "grad_norm": 2.067834778529152, + "learning_rate": 1.235345086939656e-06, + "loss": 1.131, + "step": 3258 + }, + { + "epoch": 0.4418084457398495, + "grad_norm": 1.7461041223340945, + "learning_rate": 1.234918260906099e-06, + "loss": 1.1652, + "step": 3259 + }, + { + "epoch": 0.4419440113875144, + "grad_norm": 2.282922647066137, + "learning_rate": 1.2344913895704096e-06, + "loss": 1.1695, + "step": 3260 + }, + { + "epoch": 0.4420795770351793, + "grad_norm": 1.8255654570207593, + "learning_rate": 1.234064473014907e-06, + "loss": 1.1833, + "step": 3261 + }, + { + "epoch": 0.4422151426828442, + "grad_norm": 1.5366749768890589, + "learning_rate": 1.2336375113219182e-06, + "loss": 1.1835, + "step": 3262 + }, + { + "epoch": 0.44235070833050905, + "grad_norm": 3.638073260233927, + "learning_rate": 1.2332105045737796e-06, + "loss": 1.1707, + "step": 3263 + }, + { + "epoch": 0.4424862739781739, + "grad_norm": 2.2037929230603788, + "learning_rate": 1.2327834528528357e-06, + "loss": 1.1608, + "step": 3264 + }, + { + "epoch": 0.4426218396258388, + "grad_norm": 8.356682487026905, + "learning_rate": 1.2323563562414407e-06, + "loss": 1.1629, + "step": 3265 + }, + { + "epoch": 0.4427574052735037, + "grad_norm": 1.888118251414611, + "learning_rate": 1.2319292148219566e-06, + "loss": 1.1844, + "step": 3266 + }, + { + "epoch": 0.4428929709211686, + "grad_norm": 1.7358104161888488, + "learning_rate": 1.2315020286767538e-06, + "loss": 1.1665, + "step": 3267 + }, + { + "epoch": 0.44302853656883345, + "grad_norm": 1.745912785559445, + "learning_rate": 1.2310747978882126e-06, + "loss": 1.1705, + "step": 3268 + }, + { + "epoch": 0.4431641022164983, + "grad_norm": 2.2139396976006216, + "learning_rate": 1.2306475225387203e-06, + "loss": 1.1953, + "step": 3269 + }, + { + "epoch": 0.4432996678641632, + "grad_norm": 1.8560929133711186, + "learning_rate": 1.2302202027106739e-06, + "loss": 1.1599, + "step": 3270 + }, + { + "epoch": 0.4434352335118281, + "grad_norm": 2.1083411255244187, + "learning_rate": 1.2297928384864787e-06, + "loss": 1.2014, + "step": 3271 + }, + { + "epoch": 0.443570799159493, + "grad_norm": 1.8219175007890764, + "learning_rate": 1.2293654299485485e-06, + "loss": 1.1479, + "step": 3272 + }, + { + 
"epoch": 0.44370636480715786, + "grad_norm": 2.045926717961881, + "learning_rate": 1.2289379771793059e-06, + "loss": 1.1724, + "step": 3273 + }, + { + "epoch": 0.44384193045482273, + "grad_norm": 1.8267715862014289, + "learning_rate": 1.2285104802611812e-06, + "loss": 1.1828, + "step": 3274 + }, + { + "epoch": 0.44397749610248766, + "grad_norm": 1.8116747068803274, + "learning_rate": 1.2280829392766143e-06, + "loss": 1.2163, + "step": 3275 + }, + { + "epoch": 0.4441130617501525, + "grad_norm": 2.022170697966124, + "learning_rate": 1.2276553543080527e-06, + "loss": 1.1692, + "step": 3276 + }, + { + "epoch": 0.4442486273978174, + "grad_norm": 1.7829585910055603, + "learning_rate": 1.2272277254379533e-06, + "loss": 1.1716, + "step": 3277 + }, + { + "epoch": 0.44438419304548227, + "grad_norm": 2.1189055018198872, + "learning_rate": 1.2268000527487803e-06, + "loss": 1.1873, + "step": 3278 + }, + { + "epoch": 0.44451975869314714, + "grad_norm": 1.6533089740117464, + "learning_rate": 1.2263723363230076e-06, + "loss": 1.1673, + "step": 3279 + }, + { + "epoch": 0.44465532434081206, + "grad_norm": 1.6891394831306243, + "learning_rate": 1.2259445762431168e-06, + "loss": 1.1469, + "step": 3280 + }, + { + "epoch": 0.44479088998847693, + "grad_norm": 1.8741864062003184, + "learning_rate": 1.2255167725915981e-06, + "loss": 1.1743, + "step": 3281 + }, + { + "epoch": 0.4449264556361418, + "grad_norm": 2.3745036336110577, + "learning_rate": 1.2250889254509496e-06, + "loss": 1.1844, + "step": 3282 + }, + { + "epoch": 0.4450620212838067, + "grad_norm": 1.651140152754236, + "learning_rate": 1.2246610349036785e-06, + "loss": 1.1607, + "step": 3283 + }, + { + "epoch": 0.44519758693147155, + "grad_norm": 1.5654963910100748, + "learning_rate": 1.2242331010323005e-06, + "loss": 1.1689, + "step": 3284 + }, + { + "epoch": 0.44533315257913647, + "grad_norm": 1.5898209917004587, + "learning_rate": 1.2238051239193387e-06, + "loss": 1.167, + "step": 3285 + }, + { + "epoch": 0.44546871822680134, + "grad_norm": 1.7894422580895766, + "learning_rate": 1.2233771036473255e-06, + "loss": 1.1676, + "step": 3286 + }, + { + "epoch": 0.4456042838744662, + "grad_norm": 1.8849247799319366, + "learning_rate": 1.2229490402988014e-06, + "loss": 1.1721, + "step": 3287 + }, + { + "epoch": 0.4457398495221311, + "grad_norm": 1.7124910674025464, + "learning_rate": 1.2225209339563143e-06, + "loss": 1.1739, + "step": 3288 + }, + { + "epoch": 0.44587541516979595, + "grad_norm": 2.057718160795557, + "learning_rate": 1.2220927847024218e-06, + "loss": 1.1941, + "step": 3289 + }, + { + "epoch": 0.4460109808174609, + "grad_norm": 1.560476202575875, + "learning_rate": 1.2216645926196886e-06, + "loss": 1.1799, + "step": 3290 + }, + { + "epoch": 0.44614654646512575, + "grad_norm": 1.9500787959068193, + "learning_rate": 1.2212363577906889e-06, + "loss": 1.1393, + "step": 3291 + }, + { + "epoch": 0.4462821121127906, + "grad_norm": 2.1407116099946357, + "learning_rate": 1.2208080802980037e-06, + "loss": 1.147, + "step": 3292 + }, + { + "epoch": 0.4464176777604555, + "grad_norm": 1.7552260464816594, + "learning_rate": 1.220379760224223e-06, + "loss": 1.1791, + "step": 3293 + }, + { + "epoch": 0.44655324340812036, + "grad_norm": 1.5629455719623937, + "learning_rate": 1.2199513976519451e-06, + "loss": 1.1582, + "step": 3294 + }, + { + "epoch": 0.4466888090557853, + "grad_norm": 2.05049242666633, + "learning_rate": 1.2195229926637764e-06, + "loss": 1.1883, + "step": 3295 + }, + { + "epoch": 0.44682437470345016, + "grad_norm": 1.6538407728988653, + 
"learning_rate": 1.2190945453423315e-06, + "loss": 1.1152, + "step": 3296 + }, + { + "epoch": 0.446959940351115, + "grad_norm": 2.7592276837241414, + "learning_rate": 1.2186660557702328e-06, + "loss": 1.1983, + "step": 3297 + }, + { + "epoch": 0.4470955059987799, + "grad_norm": 1.840485533107654, + "learning_rate": 1.2182375240301114e-06, + "loss": 1.1425, + "step": 3298 + }, + { + "epoch": 0.44723107164644477, + "grad_norm": 1.6993434927471605, + "learning_rate": 1.217808950204606e-06, + "loss": 1.1823, + "step": 3299 + }, + { + "epoch": 0.4473666372941097, + "grad_norm": 1.7405351940928826, + "learning_rate": 1.217380334376364e-06, + "loss": 1.1627, + "step": 3300 + }, + { + "epoch": 0.44750220294177456, + "grad_norm": 1.7195135566569844, + "learning_rate": 1.2169516766280404e-06, + "loss": 1.2079, + "step": 3301 + }, + { + "epoch": 0.44763776858943943, + "grad_norm": 1.8303369881111615, + "learning_rate": 1.2165229770422986e-06, + "loss": 1.2116, + "step": 3302 + }, + { + "epoch": 0.4477733342371043, + "grad_norm": 1.7123006361609618, + "learning_rate": 1.2160942357018096e-06, + "loss": 1.1597, + "step": 3303 + }, + { + "epoch": 0.4479088998847692, + "grad_norm": 1.772363393534008, + "learning_rate": 1.215665452689253e-06, + "loss": 1.149, + "step": 3304 + }, + { + "epoch": 0.4480444655324341, + "grad_norm": 2.2113617978453415, + "learning_rate": 1.2152366280873163e-06, + "loss": 1.1907, + "step": 3305 + }, + { + "epoch": 0.44818003118009897, + "grad_norm": 1.7568679405775767, + "learning_rate": 1.2148077619786948e-06, + "loss": 1.1799, + "step": 3306 + }, + { + "epoch": 0.44831559682776384, + "grad_norm": 3.638264417635281, + "learning_rate": 1.214378854446092e-06, + "loss": 1.1525, + "step": 3307 + }, + { + "epoch": 0.4484511624754287, + "grad_norm": 1.9039098062365225, + "learning_rate": 1.2139499055722193e-06, + "loss": 1.187, + "step": 3308 + }, + { + "epoch": 0.4485867281230936, + "grad_norm": 2.6909123516709936, + "learning_rate": 1.213520915439796e-06, + "loss": 1.1039, + "step": 3309 + }, + { + "epoch": 0.4487222937707585, + "grad_norm": 2.3329887523625485, + "learning_rate": 1.2130918841315496e-06, + "loss": 1.1734, + "step": 3310 + }, + { + "epoch": 0.4488578594184234, + "grad_norm": 2.396432332401915, + "learning_rate": 1.2126628117302156e-06, + "loss": 1.1486, + "step": 3311 + }, + { + "epoch": 0.44899342506608825, + "grad_norm": 1.9056117946842923, + "learning_rate": 1.212233698318537e-06, + "loss": 1.1829, + "step": 3312 + }, + { + "epoch": 0.4491289907137531, + "grad_norm": 1.8738925772959805, + "learning_rate": 1.2118045439792648e-06, + "loss": 1.167, + "step": 3313 + }, + { + "epoch": 0.44926455636141804, + "grad_norm": 2.479504392706813, + "learning_rate": 1.2113753487951584e-06, + "loss": 1.1605, + "step": 3314 + }, + { + "epoch": 0.4494001220090829, + "grad_norm": 1.9766601665890717, + "learning_rate": 1.2109461128489842e-06, + "loss": 1.1685, + "step": 3315 + }, + { + "epoch": 0.4495356876567478, + "grad_norm": 4.460145662664352, + "learning_rate": 1.2105168362235176e-06, + "loss": 1.163, + "step": 3316 + }, + { + "epoch": 0.44967125330441265, + "grad_norm": 1.8455885725223837, + "learning_rate": 1.2100875190015405e-06, + "loss": 1.1502, + "step": 3317 + }, + { + "epoch": 0.4498068189520775, + "grad_norm": 3.047726972136579, + "learning_rate": 1.2096581612658438e-06, + "loss": 1.1876, + "step": 3318 + }, + { + "epoch": 0.44994238459974245, + "grad_norm": 3.1576837201776042, + "learning_rate": 1.2092287630992257e-06, + "loss": 1.1893, + "step": 3319 + }, + { + 
"epoch": 0.4500779502474073, + "grad_norm": 1.597341390322676, + "learning_rate": 1.208799324584492e-06, + "loss": 1.1397, + "step": 3320 + }, + { + "epoch": 0.4502135158950722, + "grad_norm": 1.9671253977026324, + "learning_rate": 1.2083698458044572e-06, + "loss": 1.2004, + "step": 3321 + }, + { + "epoch": 0.45034908154273706, + "grad_norm": 1.6283557323878992, + "learning_rate": 1.207940326841942e-06, + "loss": 1.1564, + "step": 3322 + }, + { + "epoch": 0.45048464719040193, + "grad_norm": 1.6694746416055808, + "learning_rate": 1.2075107677797763e-06, + "loss": 1.1471, + "step": 3323 + }, + { + "epoch": 0.45062021283806686, + "grad_norm": 1.8984633051279325, + "learning_rate": 1.2070811687007969e-06, + "loss": 1.1604, + "step": 3324 + }, + { + "epoch": 0.45075577848573173, + "grad_norm": 1.7658825718788258, + "learning_rate": 1.2066515296878488e-06, + "loss": 1.1897, + "step": 3325 + }, + { + "epoch": 0.4508913441333966, + "grad_norm": 2.223242988105278, + "learning_rate": 1.2062218508237845e-06, + "loss": 1.1993, + "step": 3326 + }, + { + "epoch": 0.45102690978106147, + "grad_norm": 1.6163385016526284, + "learning_rate": 1.2057921321914638e-06, + "loss": 1.1235, + "step": 3327 + }, + { + "epoch": 0.45116247542872634, + "grad_norm": 2.3252152078746295, + "learning_rate": 1.205362373873755e-06, + "loss": 1.1438, + "step": 3328 + }, + { + "epoch": 0.45129804107639127, + "grad_norm": 1.9376196195506021, + "learning_rate": 1.2049325759535334e-06, + "loss": 1.1653, + "step": 3329 + }, + { + "epoch": 0.45143360672405614, + "grad_norm": 1.8974035450007518, + "learning_rate": 1.2045027385136823e-06, + "loss": 1.1557, + "step": 3330 + }, + { + "epoch": 0.451569172371721, + "grad_norm": 1.8459726043402471, + "learning_rate": 1.2040728616370924e-06, + "loss": 1.1661, + "step": 3331 + }, + { + "epoch": 0.4517047380193859, + "grad_norm": 1.7408848652547508, + "learning_rate": 1.2036429454066616e-06, + "loss": 1.1784, + "step": 3332 + }, + { + "epoch": 0.45184030366705075, + "grad_norm": 1.9321502564035158, + "learning_rate": 1.2032129899052965e-06, + "loss": 1.2103, + "step": 3333 + }, + { + "epoch": 0.4519758693147157, + "grad_norm": 1.7606589720213328, + "learning_rate": 1.2027829952159104e-06, + "loss": 1.1927, + "step": 3334 + }, + { + "epoch": 0.45211143496238054, + "grad_norm": 1.757770899843897, + "learning_rate": 1.2023529614214242e-06, + "loss": 1.1614, + "step": 3335 + }, + { + "epoch": 0.4522470006100454, + "grad_norm": 2.020579215143475, + "learning_rate": 1.2019228886047666e-06, + "loss": 1.1802, + "step": 3336 + }, + { + "epoch": 0.4523825662577103, + "grad_norm": 2.667442687159964, + "learning_rate": 1.2014927768488739e-06, + "loss": 1.1412, + "step": 3337 + }, + { + "epoch": 0.45251813190537515, + "grad_norm": 2.083971660971251, + "learning_rate": 1.2010626262366896e-06, + "loss": 1.16, + "step": 3338 + }, + { + "epoch": 0.4526536975530401, + "grad_norm": 1.9625289805469819, + "learning_rate": 1.2006324368511651e-06, + "loss": 1.1803, + "step": 3339 + }, + { + "epoch": 0.45278926320070495, + "grad_norm": 2.255650447702502, + "learning_rate": 1.200202208775259e-06, + "loss": 1.1773, + "step": 3340 + }, + { + "epoch": 0.4529248288483698, + "grad_norm": 2.2550538775453286, + "learning_rate": 1.1997719420919368e-06, + "loss": 1.1695, + "step": 3341 + }, + { + "epoch": 0.4530603944960347, + "grad_norm": 1.8062149466487185, + "learning_rate": 1.1993416368841727e-06, + "loss": 1.1936, + "step": 3342 + }, + { + "epoch": 0.45319596014369956, + "grad_norm": 1.6950600157788067, + "learning_rate": 
1.1989112932349473e-06, + "loss": 1.1542, + "step": 3343 + }, + { + "epoch": 0.4533315257913645, + "grad_norm": 1.9629148605375837, + "learning_rate": 1.1984809112272493e-06, + "loss": 1.1726, + "step": 3344 + }, + { + "epoch": 0.45346709143902936, + "grad_norm": 2.2246153558093003, + "learning_rate": 1.1980504909440743e-06, + "loss": 1.1667, + "step": 3345 + }, + { + "epoch": 0.4536026570866942, + "grad_norm": 2.4142093784649545, + "learning_rate": 1.1976200324684253e-06, + "loss": 1.2024, + "step": 3346 + }, + { + "epoch": 0.4537382227343591, + "grad_norm": 1.8139824143611265, + "learning_rate": 1.197189535883313e-06, + "loss": 1.1773, + "step": 3347 + }, + { + "epoch": 0.45387378838202397, + "grad_norm": 2.4128743387699245, + "learning_rate": 1.1967590012717552e-06, + "loss": 1.1218, + "step": 3348 + }, + { + "epoch": 0.4540093540296889, + "grad_norm": 2.0899181724769784, + "learning_rate": 1.1963284287167772e-06, + "loss": 1.2121, + "step": 3349 + }, + { + "epoch": 0.45414491967735376, + "grad_norm": 1.8469110835671358, + "learning_rate": 1.1958978183014111e-06, + "loss": 1.1715, + "step": 3350 + }, + { + "epoch": 0.45428048532501863, + "grad_norm": 1.9612626234486477, + "learning_rate": 1.1954671701086976e-06, + "loss": 1.1726, + "step": 3351 + }, + { + "epoch": 0.4544160509726835, + "grad_norm": 1.8128544830292153, + "learning_rate": 1.195036484221683e-06, + "loss": 1.1907, + "step": 3352 + }, + { + "epoch": 0.45455161662034843, + "grad_norm": 1.9983260546412638, + "learning_rate": 1.194605760723422e-06, + "loss": 1.1541, + "step": 3353 + }, + { + "epoch": 0.4546871822680133, + "grad_norm": 1.7054205922306427, + "learning_rate": 1.1941749996969762e-06, + "loss": 1.1859, + "step": 3354 + }, + { + "epoch": 0.45482274791567817, + "grad_norm": 3.023220700965942, + "learning_rate": 1.1937442012254144e-06, + "loss": 1.1651, + "step": 3355 + }, + { + "epoch": 0.45495831356334304, + "grad_norm": 1.99746960691175, + "learning_rate": 1.1933133653918126e-06, + "loss": 1.1871, + "step": 3356 + }, + { + "epoch": 0.4550938792110079, + "grad_norm": 1.9954454688858256, + "learning_rate": 1.1928824922792543e-06, + "loss": 1.1783, + "step": 3357 + }, + { + "epoch": 0.45522944485867284, + "grad_norm": 1.8268466746925052, + "learning_rate": 1.1924515819708298e-06, + "loss": 1.1816, + "step": 3358 + }, + { + "epoch": 0.4553650105063377, + "grad_norm": 1.7361399014291845, + "learning_rate": 1.1920206345496372e-06, + "loss": 1.188, + "step": 3359 + }, + { + "epoch": 0.4555005761540026, + "grad_norm": 1.889511327476453, + "learning_rate": 1.1915896500987809e-06, + "loss": 1.1188, + "step": 3360 + }, + { + "epoch": 0.45563614180166745, + "grad_norm": 1.9247597419795688, + "learning_rate": 1.1911586287013725e-06, + "loss": 1.1727, + "step": 3361 + }, + { + "epoch": 0.4557717074493323, + "grad_norm": 1.754987568680782, + "learning_rate": 1.1907275704405316e-06, + "loss": 1.1972, + "step": 3362 + }, + { + "epoch": 0.45590727309699725, + "grad_norm": 1.612289400487008, + "learning_rate": 1.1902964753993842e-06, + "loss": 1.1713, + "step": 3363 + }, + { + "epoch": 0.4560428387446621, + "grad_norm": 33.665174168279236, + "learning_rate": 1.1898653436610637e-06, + "loss": 1.1642, + "step": 3364 + }, + { + "epoch": 0.456178404392327, + "grad_norm": 2.0349172718300053, + "learning_rate": 1.1894341753087105e-06, + "loss": 1.1501, + "step": 3365 + }, + { + "epoch": 0.45631397003999186, + "grad_norm": 1.6835041015854877, + "learning_rate": 1.1890029704254716e-06, + "loss": 1.1608, + "step": 3366 + }, + { + "epoch": 
0.4564495356876567, + "grad_norm": 1.563006544255749, + "learning_rate": 1.188571729094502e-06, + "loss": 1.1312, + "step": 3367 + }, + { + "epoch": 0.45658510133532165, + "grad_norm": 2.2188803217891966, + "learning_rate": 1.1881404513989629e-06, + "loss": 1.1682, + "step": 3368 + }, + { + "epoch": 0.4567206669829865, + "grad_norm": 1.9184914338587946, + "learning_rate": 1.1877091374220228e-06, + "loss": 1.1935, + "step": 3369 + }, + { + "epoch": 0.4568562326306514, + "grad_norm": 1.9888604907436176, + "learning_rate": 1.1872777872468572e-06, + "loss": 1.162, + "step": 3370 + }, + { + "epoch": 0.45699179827831626, + "grad_norm": 1.741358167292029, + "learning_rate": 1.1868464009566485e-06, + "loss": 1.1604, + "step": 3371 + }, + { + "epoch": 0.45712736392598113, + "grad_norm": 1.6780953288800908, + "learning_rate": 1.1864149786345868e-06, + "loss": 1.1542, + "step": 3372 + }, + { + "epoch": 0.45726292957364606, + "grad_norm": 1.7898911976105412, + "learning_rate": 1.1859835203638675e-06, + "loss": 1.1883, + "step": 3373 + }, + { + "epoch": 0.45739849522131093, + "grad_norm": 2.1798592897243503, + "learning_rate": 1.1855520262276943e-06, + "loss": 1.1, + "step": 3374 + }, + { + "epoch": 0.4575340608689758, + "grad_norm": 2.725238124524749, + "learning_rate": 1.1851204963092775e-06, + "loss": 1.1481, + "step": 3375 + }, + { + "epoch": 0.45766962651664067, + "grad_norm": 4.1649658228293305, + "learning_rate": 1.1846889306918344e-06, + "loss": 1.1889, + "step": 3376 + }, + { + "epoch": 0.45780519216430554, + "grad_norm": 1.8278485374382891, + "learning_rate": 1.1842573294585889e-06, + "loss": 1.1831, + "step": 3377 + }, + { + "epoch": 0.45794075781197047, + "grad_norm": 2.507871139672177, + "learning_rate": 1.1838256926927718e-06, + "loss": 1.1285, + "step": 3378 + }, + { + "epoch": 0.45807632345963534, + "grad_norm": 1.9414862873435403, + "learning_rate": 1.1833940204776208e-06, + "loss": 1.192, + "step": 3379 + }, + { + "epoch": 0.4582118891073002, + "grad_norm": 2.013402674878006, + "learning_rate": 1.1829623128963807e-06, + "loss": 1.1835, + "step": 3380 + }, + { + "epoch": 0.4583474547549651, + "grad_norm": 1.9208065437453445, + "learning_rate": 1.1825305700323025e-06, + "loss": 1.1586, + "step": 3381 + }, + { + "epoch": 0.45848302040262995, + "grad_norm": 1.6532018573118323, + "learning_rate": 1.182098791968645e-06, + "loss": 1.1303, + "step": 3382 + }, + { + "epoch": 0.4586185860502949, + "grad_norm": 1.9269568887353161, + "learning_rate": 1.1816669787886727e-06, + "loss": 1.1249, + "step": 3383 + }, + { + "epoch": 0.45875415169795974, + "grad_norm": 1.5751785253366921, + "learning_rate": 1.1812351305756575e-06, + "loss": 1.1822, + "step": 3384 + }, + { + "epoch": 0.4588897173456246, + "grad_norm": 2.201390015330437, + "learning_rate": 1.1808032474128782e-06, + "loss": 1.1905, + "step": 3385 + }, + { + "epoch": 0.4590252829932895, + "grad_norm": 1.9266883742841974, + "learning_rate": 1.1803713293836198e-06, + "loss": 1.1481, + "step": 3386 + }, + { + "epoch": 0.45916084864095436, + "grad_norm": 1.9576773386803235, + "learning_rate": 1.179939376571174e-06, + "loss": 1.2083, + "step": 3387 + }, + { + "epoch": 0.4592964142886193, + "grad_norm": 1.7039525924528949, + "learning_rate": 1.1795073890588401e-06, + "loss": 1.1708, + "step": 3388 + }, + { + "epoch": 0.45943197993628415, + "grad_norm": 3.420426878857649, + "learning_rate": 1.179075366929923e-06, + "loss": 1.1557, + "step": 3389 + }, + { + "epoch": 0.459567545583949, + "grad_norm": 2.1125806953743753, + "learning_rate": 
1.1786433102677348e-06, + "loss": 1.1589, + "step": 3390 + }, + { + "epoch": 0.4597031112316139, + "grad_norm": 1.7003663076137632, + "learning_rate": 1.1782112191555946e-06, + "loss": 1.173, + "step": 3391 + }, + { + "epoch": 0.4598386768792788, + "grad_norm": 1.6690644764211569, + "learning_rate": 1.1777790936768272e-06, + "loss": 1.1917, + "step": 3392 + }, + { + "epoch": 0.4599742425269437, + "grad_norm": 2.346652802064257, + "learning_rate": 1.1773469339147653e-06, + "loss": 1.1452, + "step": 3393 + }, + { + "epoch": 0.46010980817460856, + "grad_norm": 2.0542727618408096, + "learning_rate": 1.1769147399527466e-06, + "loss": 1.1554, + "step": 3394 + }, + { + "epoch": 0.46024537382227343, + "grad_norm": 2.1932218547930655, + "learning_rate": 1.176482511874117e-06, + "loss": 1.202, + "step": 3395 + }, + { + "epoch": 0.4603809394699383, + "grad_norm": 1.566130394752823, + "learning_rate": 1.1760502497622281e-06, + "loss": 1.1684, + "step": 3396 + }, + { + "epoch": 0.4605165051176032, + "grad_norm": 5.402103352651644, + "learning_rate": 1.1756179537004383e-06, + "loss": 1.1755, + "step": 3397 + }, + { + "epoch": 0.4606520707652681, + "grad_norm": 1.8945195327421378, + "learning_rate": 1.175185623772112e-06, + "loss": 1.1415, + "step": 3398 + }, + { + "epoch": 0.46078763641293297, + "grad_norm": 1.742891257119672, + "learning_rate": 1.1747532600606213e-06, + "loss": 1.1535, + "step": 3399 + }, + { + "epoch": 0.46092320206059784, + "grad_norm": 1.8167137222597929, + "learning_rate": 1.174320862649344e-06, + "loss": 1.1667, + "step": 3400 + }, + { + "epoch": 0.4610587677082627, + "grad_norm": 1.9582026658110046, + "learning_rate": 1.173888431621664e-06, + "loss": 1.1688, + "step": 3401 + }, + { + "epoch": 0.46119433335592763, + "grad_norm": 2.702313410121902, + "learning_rate": 1.1734559670609727e-06, + "loss": 1.1392, + "step": 3402 + }, + { + "epoch": 0.4613298990035925, + "grad_norm": 1.6223332817323597, + "learning_rate": 1.1730234690506671e-06, + "loss": 1.1381, + "step": 3403 + }, + { + "epoch": 0.4614654646512574, + "grad_norm": 3.322638514740076, + "learning_rate": 1.1725909376741515e-06, + "loss": 1.1549, + "step": 3404 + }, + { + "epoch": 0.46160103029892224, + "grad_norm": 1.7249318237886038, + "learning_rate": 1.1721583730148356e-06, + "loss": 1.1612, + "step": 3405 + }, + { + "epoch": 0.4617365959465871, + "grad_norm": 2.5303448571913925, + "learning_rate": 1.1717257751561367e-06, + "loss": 1.1553, + "step": 3406 + }, + { + "epoch": 0.46187216159425204, + "grad_norm": 2.8275051899049033, + "learning_rate": 1.1712931441814775e-06, + "loss": 1.1404, + "step": 3407 + }, + { + "epoch": 0.4620077272419169, + "grad_norm": 1.8860847029349395, + "learning_rate": 1.1708604801742877e-06, + "loss": 1.1744, + "step": 3408 + }, + { + "epoch": 0.4621432928895818, + "grad_norm": 1.8239905718529041, + "learning_rate": 1.1704277832180027e-06, + "loss": 1.1599, + "step": 3409 + }, + { + "epoch": 0.46227885853724665, + "grad_norm": 2.034353240768107, + "learning_rate": 1.1699950533960652e-06, + "loss": 1.1652, + "step": 3410 + }, + { + "epoch": 0.4624144241849115, + "grad_norm": 2.5240672064995984, + "learning_rate": 1.1695622907919233e-06, + "loss": 1.1677, + "step": 3411 + }, + { + "epoch": 0.46254998983257645, + "grad_norm": 2.0788622135236743, + "learning_rate": 1.1691294954890323e-06, + "loss": 1.1717, + "step": 3412 + }, + { + "epoch": 0.4626855554802413, + "grad_norm": 1.6625052039268489, + "learning_rate": 1.168696667570853e-06, + "loss": 1.164, + "step": 3413 + }, + { + "epoch": 
0.4628211211279062, + "grad_norm": 1.5713764931443854, + "learning_rate": 1.1682638071208532e-06, + "loss": 1.1503, + "step": 3414 + }, + { + "epoch": 0.46295668677557106, + "grad_norm": 1.622459804311142, + "learning_rate": 1.1678309142225062e-06, + "loss": 1.1727, + "step": 3415 + }, + { + "epoch": 0.46309225242323593, + "grad_norm": 1.581880252893236, + "learning_rate": 1.1673979889592923e-06, + "loss": 1.1634, + "step": 3416 + }, + { + "epoch": 0.46322781807090085, + "grad_norm": 2.015568968963271, + "learning_rate": 1.1669650314146973e-06, + "loss": 1.1969, + "step": 3417 + }, + { + "epoch": 0.4633633837185657, + "grad_norm": 1.8840528476192413, + "learning_rate": 1.166532041672214e-06, + "loss": 1.1967, + "step": 3418 + }, + { + "epoch": 0.4634989493662306, + "grad_norm": 5.323992972757216, + "learning_rate": 1.166099019815341e-06, + "loss": 1.1312, + "step": 3419 + }, + { + "epoch": 0.46363451501389547, + "grad_norm": 2.426643946225349, + "learning_rate": 1.1656659659275835e-06, + "loss": 1.1686, + "step": 3420 + }, + { + "epoch": 0.46377008066156034, + "grad_norm": 2.0472280084959866, + "learning_rate": 1.1652328800924517e-06, + "loss": 1.1457, + "step": 3421 + }, + { + "epoch": 0.46390564630922526, + "grad_norm": 2.053541447931404, + "learning_rate": 1.1647997623934636e-06, + "loss": 1.1577, + "step": 3422 + }, + { + "epoch": 0.46404121195689013, + "grad_norm": 2.1271854585010916, + "learning_rate": 1.164366612914142e-06, + "loss": 1.1468, + "step": 3423 + }, + { + "epoch": 0.464176777604555, + "grad_norm": 1.989941708071377, + "learning_rate": 1.1639334317380164e-06, + "loss": 1.1346, + "step": 3424 + }, + { + "epoch": 0.4643123432522199, + "grad_norm": 2.7837587297358932, + "learning_rate": 1.1635002189486228e-06, + "loss": 1.1822, + "step": 3425 + }, + { + "epoch": 0.46444790889988474, + "grad_norm": 3.6588495188915955, + "learning_rate": 1.1630669746295022e-06, + "loss": 1.1991, + "step": 3426 + }, + { + "epoch": 0.46458347454754967, + "grad_norm": 2.3755456512643303, + "learning_rate": 1.1626336988642029e-06, + "loss": 1.1686, + "step": 3427 + }, + { + "epoch": 0.46471904019521454, + "grad_norm": 10.76067331325682, + "learning_rate": 1.1622003917362788e-06, + "loss": 1.1491, + "step": 3428 + }, + { + "epoch": 0.4648546058428794, + "grad_norm": 1.6515566269101853, + "learning_rate": 1.1617670533292892e-06, + "loss": 1.1536, + "step": 3429 + }, + { + "epoch": 0.4649901714905443, + "grad_norm": 1.657389235674561, + "learning_rate": 1.1613336837268001e-06, + "loss": 1.1521, + "step": 3430 + }, + { + "epoch": 0.4651257371382092, + "grad_norm": 2.0653332115842686, + "learning_rate": 1.1609002830123837e-06, + "loss": 1.18, + "step": 3431 + }, + { + "epoch": 0.4652613027858741, + "grad_norm": 2.0189711361272105, + "learning_rate": 1.1604668512696179e-06, + "loss": 1.2072, + "step": 3432 + }, + { + "epoch": 0.46539686843353895, + "grad_norm": 1.8620866044585203, + "learning_rate": 1.1600333885820867e-06, + "loss": 1.1555, + "step": 3433 + }, + { + "epoch": 0.4655324340812038, + "grad_norm": 1.9756266790694788, + "learning_rate": 1.1595998950333793e-06, + "loss": 1.1634, + "step": 3434 + }, + { + "epoch": 0.4656679997288687, + "grad_norm": 2.5225996578073127, + "learning_rate": 1.159166370707092e-06, + "loss": 1.186, + "step": 3435 + }, + { + "epoch": 0.4658035653765336, + "grad_norm": 1.7028910542537403, + "learning_rate": 1.1587328156868266e-06, + "loss": 1.2283, + "step": 3436 + }, + { + "epoch": 0.4659391310241985, + "grad_norm": 2.1183128592827907, + "learning_rate": 
1.1582992300561906e-06, + "loss": 1.2133, + "step": 3437 + }, + { + "epoch": 0.46607469667186335, + "grad_norm": 1.9849378910337858, + "learning_rate": 1.157865613898798e-06, + "loss": 1.1539, + "step": 3438 + }, + { + "epoch": 0.4662102623195282, + "grad_norm": 1.783854615065259, + "learning_rate": 1.1574319672982673e-06, + "loss": 1.143, + "step": 3439 + }, + { + "epoch": 0.4663458279671931, + "grad_norm": 2.9863082753590704, + "learning_rate": 1.1569982903382247e-06, + "loss": 1.1534, + "step": 3440 + }, + { + "epoch": 0.466481393614858, + "grad_norm": 1.9804708519609278, + "learning_rate": 1.156564583102301e-06, + "loss": 1.1147, + "step": 3441 + }, + { + "epoch": 0.4666169592625229, + "grad_norm": 1.6579833449856238, + "learning_rate": 1.1561308456741336e-06, + "loss": 1.1507, + "step": 3442 + }, + { + "epoch": 0.46675252491018776, + "grad_norm": 2.7852705861766895, + "learning_rate": 1.1556970781373648e-06, + "loss": 1.1872, + "step": 3443 + }, + { + "epoch": 0.46688809055785263, + "grad_norm": 1.5817786560410105, + "learning_rate": 1.1552632805756436e-06, + "loss": 1.171, + "step": 3444 + }, + { + "epoch": 0.4670236562055175, + "grad_norm": 1.8265466674813777, + "learning_rate": 1.154829453072624e-06, + "loss": 1.1952, + "step": 3445 + }, + { + "epoch": 0.4671592218531824, + "grad_norm": 4.297886414641935, + "learning_rate": 1.1543955957119667e-06, + "loss": 1.1662, + "step": 3446 + }, + { + "epoch": 0.4672947875008473, + "grad_norm": 2.3346694566922404, + "learning_rate": 1.1539617085773373e-06, + "loss": 1.1395, + "step": 3447 + }, + { + "epoch": 0.46743035314851217, + "grad_norm": 2.0179826792851263, + "learning_rate": 1.1535277917524079e-06, + "loss": 1.1755, + "step": 3448 + }, + { + "epoch": 0.46756591879617704, + "grad_norm": 1.7589073477717763, + "learning_rate": 1.153093845320856e-06, + "loss": 1.1721, + "step": 3449 + }, + { + "epoch": 0.4677014844438419, + "grad_norm": 2.5549683977988966, + "learning_rate": 1.152659869366364e-06, + "loss": 1.1618, + "step": 3450 + }, + { + "epoch": 0.46783705009150683, + "grad_norm": 1.6073961212744243, + "learning_rate": 1.1522258639726215e-06, + "loss": 1.1772, + "step": 3451 + }, + { + "epoch": 0.4679726157391717, + "grad_norm": 2.240908645446891, + "learning_rate": 1.1517918292233226e-06, + "loss": 1.1559, + "step": 3452 + }, + { + "epoch": 0.4681081813868366, + "grad_norm": 5.07891572127519, + "learning_rate": 1.1513577652021678e-06, + "loss": 1.1978, + "step": 3453 + }, + { + "epoch": 0.46824374703450145, + "grad_norm": 1.7832622748832156, + "learning_rate": 1.1509236719928627e-06, + "loss": 1.1963, + "step": 3454 + }, + { + "epoch": 0.4683793126821663, + "grad_norm": 2.042544930880501, + "learning_rate": 1.1504895496791185e-06, + "loss": 1.1795, + "step": 3455 + }, + { + "epoch": 0.46851487832983124, + "grad_norm": 2.8112138179248483, + "learning_rate": 1.1500553983446526e-06, + "loss": 1.1765, + "step": 3456 + }, + { + "epoch": 0.4686504439774961, + "grad_norm": 2.2603336771326306, + "learning_rate": 1.1496212180731877e-06, + "loss": 1.1948, + "step": 3457 + }, + { + "epoch": 0.468786009625161, + "grad_norm": 1.9725611030110086, + "learning_rate": 1.149187008948452e-06, + "loss": 1.2092, + "step": 3458 + }, + { + "epoch": 0.46892157527282585, + "grad_norm": 1.8962540125641931, + "learning_rate": 1.1487527710541794e-06, + "loss": 1.1512, + "step": 3459 + }, + { + "epoch": 0.4690571409204907, + "grad_norm": 2.3331086416935514, + "learning_rate": 1.1483185044741088e-06, + "loss": 1.1351, + "step": 3460 + }, + { + "epoch": 
0.46919270656815565, + "grad_norm": 3.5465392375401916, + "learning_rate": 1.1478842092919854e-06, + "loss": 1.165, + "step": 3461 + }, + { + "epoch": 0.4693282722158205, + "grad_norm": 2.4985360556800913, + "learning_rate": 1.1474498855915596e-06, + "loss": 1.1373, + "step": 3462 + }, + { + "epoch": 0.4694638378634854, + "grad_norm": 2.5341908059096863, + "learning_rate": 1.1470155334565869e-06, + "loss": 1.199, + "step": 3463 + }, + { + "epoch": 0.46959940351115026, + "grad_norm": 2.557569205765838, + "learning_rate": 1.1465811529708295e-06, + "loss": 1.1599, + "step": 3464 + }, + { + "epoch": 0.46973496915881513, + "grad_norm": 2.471295601109584, + "learning_rate": 1.1461467442180537e-06, + "loss": 1.1721, + "step": 3465 + }, + { + "epoch": 0.46987053480648006, + "grad_norm": 1.6815431367769542, + "learning_rate": 1.1457123072820319e-06, + "loss": 1.1857, + "step": 3466 + }, + { + "epoch": 0.4700061004541449, + "grad_norm": 1.7321211479681724, + "learning_rate": 1.1452778422465416e-06, + "loss": 1.1688, + "step": 3467 + }, + { + "epoch": 0.4701416661018098, + "grad_norm": 2.5212281431743855, + "learning_rate": 1.1448433491953665e-06, + "loss": 1.1892, + "step": 3468 + }, + { + "epoch": 0.47027723174947467, + "grad_norm": 1.6557924569021327, + "learning_rate": 1.1444088282122945e-06, + "loss": 1.1868, + "step": 3469 + }, + { + "epoch": 0.47041279739713954, + "grad_norm": 2.22653458403482, + "learning_rate": 1.1439742793811205e-06, + "loss": 1.1759, + "step": 3470 + }, + { + "epoch": 0.47054836304480446, + "grad_norm": 1.667002941392208, + "learning_rate": 1.1435397027856425e-06, + "loss": 1.1615, + "step": 3471 + }, + { + "epoch": 0.47068392869246933, + "grad_norm": 2.3963880257287937, + "learning_rate": 1.1431050985096663e-06, + "loss": 1.1449, + "step": 3472 + }, + { + "epoch": 0.4708194943401342, + "grad_norm": 1.8633448306127343, + "learning_rate": 1.142670466637001e-06, + "loss": 1.1979, + "step": 3473 + }, + { + "epoch": 0.4709550599877991, + "grad_norm": 1.775338655685083, + "learning_rate": 1.142235807251463e-06, + "loss": 1.1463, + "step": 3474 + }, + { + "epoch": 0.471090625635464, + "grad_norm": 1.8878874984014997, + "learning_rate": 1.1418011204368717e-06, + "loss": 1.1629, + "step": 3475 + }, + { + "epoch": 0.47122619128312887, + "grad_norm": 7.693963530338666, + "learning_rate": 1.1413664062770538e-06, + "loss": 1.1368, + "step": 3476 + }, + { + "epoch": 0.47136175693079374, + "grad_norm": 1.6973963613332268, + "learning_rate": 1.1409316648558404e-06, + "loss": 1.1452, + "step": 3477 + }, + { + "epoch": 0.4714973225784586, + "grad_norm": 2.391563376661314, + "learning_rate": 1.140496896257068e-06, + "loss": 1.1836, + "step": 3478 + }, + { + "epoch": 0.4716328882261235, + "grad_norm": 1.757520359414841, + "learning_rate": 1.140062100564578e-06, + "loss": 1.1427, + "step": 3479 + }, + { + "epoch": 0.4717684538737884, + "grad_norm": 2.1070756921431353, + "learning_rate": 1.1396272778622175e-06, + "loss": 1.1724, + "step": 3480 + }, + { + "epoch": 0.4719040195214533, + "grad_norm": 3.731284221622922, + "learning_rate": 1.1391924282338388e-06, + "loss": 1.1459, + "step": 3481 + }, + { + "epoch": 0.47203958516911815, + "grad_norm": 1.6651789079107289, + "learning_rate": 1.1387575517632987e-06, + "loss": 1.1256, + "step": 3482 + }, + { + "epoch": 0.472175150816783, + "grad_norm": 1.9817450996514632, + "learning_rate": 1.1383226485344604e-06, + "loss": 1.1226, + "step": 3483 + }, + { + "epoch": 0.4723107164644479, + "grad_norm": 1.726875249403551, + "learning_rate": 
1.137887718631191e-06, + "loss": 1.1876, + "step": 3484 + }, + { + "epoch": 0.4724462821121128, + "grad_norm": 2.481913473399808, + "learning_rate": 1.1374527621373636e-06, + "loss": 1.1635, + "step": 3485 + }, + { + "epoch": 0.4725818477597777, + "grad_norm": 2.5125000168423814, + "learning_rate": 1.1370177791368558e-06, + "loss": 1.1401, + "step": 3486 + }, + { + "epoch": 0.47271741340744255, + "grad_norm": 5.975921542312296, + "learning_rate": 1.136582769713551e-06, + "loss": 1.1583, + "step": 3487 + }, + { + "epoch": 0.4728529790551074, + "grad_norm": 2.0943566759526524, + "learning_rate": 1.136147733951337e-06, + "loss": 1.1418, + "step": 3488 + }, + { + "epoch": 0.4729885447027723, + "grad_norm": 2.9386424143646117, + "learning_rate": 1.1357126719341076e-06, + "loss": 1.1532, + "step": 3489 + }, + { + "epoch": 0.4731241103504372, + "grad_norm": 4.701715576348773, + "learning_rate": 1.1352775837457605e-06, + "loss": 1.1528, + "step": 3490 + }, + { + "epoch": 0.4732596759981021, + "grad_norm": 1.921859306413621, + "learning_rate": 1.134842469470199e-06, + "loss": 1.1722, + "step": 3491 + }, + { + "epoch": 0.47339524164576696, + "grad_norm": 2.179985015177083, + "learning_rate": 1.1344073291913317e-06, + "loss": 1.1974, + "step": 3492 + }, + { + "epoch": 0.47353080729343183, + "grad_norm": 1.7007154300951137, + "learning_rate": 1.133972162993072e-06, + "loss": 1.1674, + "step": 3493 + }, + { + "epoch": 0.4736663729410967, + "grad_norm": 1.8857153007814749, + "learning_rate": 1.1335369709593382e-06, + "loss": 1.1379, + "step": 3494 + }, + { + "epoch": 0.47380193858876163, + "grad_norm": 2.0778755250160215, + "learning_rate": 1.1331017531740533e-06, + "loss": 1.1544, + "step": 3495 + }, + { + "epoch": 0.4739375042364265, + "grad_norm": 4.032894183972698, + "learning_rate": 1.132666509721146e-06, + "loss": 1.1839, + "step": 3496 + }, + { + "epoch": 0.47407306988409137, + "grad_norm": 2.000968760315705, + "learning_rate": 1.1322312406845498e-06, + "loss": 1.1661, + "step": 3497 + }, + { + "epoch": 0.47420863553175624, + "grad_norm": 2.3733755389735287, + "learning_rate": 1.1317959461482028e-06, + "loss": 1.1897, + "step": 3498 + }, + { + "epoch": 0.4743442011794211, + "grad_norm": 1.7365313029925704, + "learning_rate": 1.1313606261960475e-06, + "loss": 1.1779, + "step": 3499 + }, + { + "epoch": 0.47447976682708604, + "grad_norm": 1.6257642395624357, + "learning_rate": 1.1309252809120324e-06, + "loss": 1.1411, + "step": 3500 + }, + { + "epoch": 0.4746153324747509, + "grad_norm": 2.293282431862866, + "learning_rate": 1.1304899103801105e-06, + "loss": 1.1898, + "step": 3501 + }, + { + "epoch": 0.4747508981224158, + "grad_norm": 1.643803672651027, + "learning_rate": 1.1300545146842393e-06, + "loss": 1.1439, + "step": 3502 + }, + { + "epoch": 0.47488646377008065, + "grad_norm": 1.6182381103015953, + "learning_rate": 1.1296190939083815e-06, + "loss": 1.1461, + "step": 3503 + }, + { + "epoch": 0.4750220294177455, + "grad_norm": 1.8669789524124245, + "learning_rate": 1.1291836481365045e-06, + "loss": 1.1648, + "step": 3504 + }, + { + "epoch": 0.47515759506541044, + "grad_norm": 2.5278465602034625, + "learning_rate": 1.128748177452581e-06, + "loss": 1.1347, + "step": 3505 + }, + { + "epoch": 0.4752931607130753, + "grad_norm": 2.815847755541399, + "learning_rate": 1.1283126819405873e-06, + "loss": 1.1923, + "step": 3506 + }, + { + "epoch": 0.4754287263607402, + "grad_norm": 1.738034706774129, + "learning_rate": 1.127877161684506e-06, + "loss": 1.164, + "step": 3507 + }, + { + "epoch": 
0.47556429200840505, + "grad_norm": 1.908959772327263, + "learning_rate": 1.1274416167683234e-06, + "loss": 1.1574, + "step": 3508 + }, + { + "epoch": 0.4756998576560699, + "grad_norm": 1.6266336957460326, + "learning_rate": 1.127006047276031e-06, + "loss": 1.1309, + "step": 3509 + }, + { + "epoch": 0.47583542330373485, + "grad_norm": 1.9184657269250154, + "learning_rate": 1.126570453291625e-06, + "loss": 1.17, + "step": 3510 + }, + { + "epoch": 0.4759709889513997, + "grad_norm": 3.6577838204664737, + "learning_rate": 1.126134834899106e-06, + "loss": 1.1932, + "step": 3511 + }, + { + "epoch": 0.4761065545990646, + "grad_norm": 1.6353119253758517, + "learning_rate": 1.1256991921824798e-06, + "loss": 1.1822, + "step": 3512 + }, + { + "epoch": 0.47624212024672946, + "grad_norm": 2.518104565089321, + "learning_rate": 1.1252635252257567e-06, + "loss": 1.152, + "step": 3513 + }, + { + "epoch": 0.4763776858943944, + "grad_norm": 1.9459779434095064, + "learning_rate": 1.1248278341129516e-06, + "loss": 1.1122, + "step": 3514 + }, + { + "epoch": 0.47651325154205926, + "grad_norm": 1.7620557338307925, + "learning_rate": 1.1243921189280838e-06, + "loss": 1.1555, + "step": 3515 + }, + { + "epoch": 0.4766488171897241, + "grad_norm": 1.8151221108672064, + "learning_rate": 1.1239563797551777e-06, + "loss": 1.1619, + "step": 3516 + }, + { + "epoch": 0.476784382837389, + "grad_norm": 1.9309283934984982, + "learning_rate": 1.1235206166782622e-06, + "loss": 1.1583, + "step": 3517 + }, + { + "epoch": 0.47691994848505387, + "grad_norm": 1.8729450175383633, + "learning_rate": 1.1230848297813712e-06, + "loss": 1.1429, + "step": 3518 + }, + { + "epoch": 0.4770555141327188, + "grad_norm": 2.451883297066441, + "learning_rate": 1.122649019148542e-06, + "loss": 1.1373, + "step": 3519 + }, + { + "epoch": 0.47719107978038366, + "grad_norm": 2.6885196878406874, + "learning_rate": 1.122213184863818e-06, + "loss": 1.1702, + "step": 3520 + }, + { + "epoch": 0.47732664542804853, + "grad_norm": 2.4898502569068777, + "learning_rate": 1.1217773270112454e-06, + "loss": 1.2265, + "step": 3521 + }, + { + "epoch": 0.4774622110757134, + "grad_norm": 2.331132764531323, + "learning_rate": 1.121341445674877e-06, + "loss": 1.1422, + "step": 3522 + }, + { + "epoch": 0.4775977767233783, + "grad_norm": 2.8985871589247214, + "learning_rate": 1.1209055409387682e-06, + "loss": 1.1556, + "step": 3523 + }, + { + "epoch": 0.4777333423710432, + "grad_norm": 1.7840562061238543, + "learning_rate": 1.1204696128869803e-06, + "loss": 1.1249, + "step": 3524 + }, + { + "epoch": 0.47786890801870807, + "grad_norm": 1.5722472006667603, + "learning_rate": 1.1200336616035788e-06, + "loss": 1.1267, + "step": 3525 + }, + { + "epoch": 0.47800447366637294, + "grad_norm": 3.0631211879445877, + "learning_rate": 1.1195976871726332e-06, + "loss": 1.1784, + "step": 3526 + }, + { + "epoch": 0.4781400393140378, + "grad_norm": 1.9722840150269139, + "learning_rate": 1.1191616896782172e-06, + "loss": 1.1482, + "step": 3527 + }, + { + "epoch": 0.4782756049617027, + "grad_norm": 2.0123771708936675, + "learning_rate": 1.1187256692044103e-06, + "loss": 1.1822, + "step": 3528 + }, + { + "epoch": 0.4784111706093676, + "grad_norm": 2.6190223111110855, + "learning_rate": 1.1182896258352949e-06, + "loss": 1.1752, + "step": 3529 + }, + { + "epoch": 0.4785467362570325, + "grad_norm": 7.349086518766355, + "learning_rate": 1.1178535596549592e-06, + "loss": 1.1383, + "step": 3530 + }, + { + "epoch": 0.47868230190469735, + "grad_norm": 2.7215372513105978, + "learning_rate": 
1.1174174707474947e-06, + "loss": 1.1093, + "step": 3531 + }, + { + "epoch": 0.4788178675523622, + "grad_norm": 2.1047923717426693, + "learning_rate": 1.116981359196998e-06, + "loss": 1.1929, + "step": 3532 + }, + { + "epoch": 0.4789534332000271, + "grad_norm": 1.742177605611333, + "learning_rate": 1.116545225087569e-06, + "loss": 1.1708, + "step": 3533 + }, + { + "epoch": 0.479088998847692, + "grad_norm": 2.967759257224494, + "learning_rate": 1.1161090685033138e-06, + "loss": 1.1884, + "step": 3534 + }, + { + "epoch": 0.4792245644953569, + "grad_norm": 1.7797652908108468, + "learning_rate": 1.1156728895283412e-06, + "loss": 1.165, + "step": 3535 + }, + { + "epoch": 0.47936013014302176, + "grad_norm": 2.222945477417753, + "learning_rate": 1.1152366882467647e-06, + "loss": 1.1648, + "step": 3536 + }, + { + "epoch": 0.4794956957906866, + "grad_norm": 6.712608653757115, + "learning_rate": 1.1148004647427027e-06, + "loss": 1.1693, + "step": 3537 + }, + { + "epoch": 0.4796312614383515, + "grad_norm": 2.039715239100601, + "learning_rate": 1.114364219100277e-06, + "loss": 1.1449, + "step": 3538 + }, + { + "epoch": 0.4797668270860164, + "grad_norm": 1.9365445427624146, + "learning_rate": 1.1139279514036147e-06, + "loss": 1.1497, + "step": 3539 + }, + { + "epoch": 0.4799023927336813, + "grad_norm": 1.9243978470218064, + "learning_rate": 1.1134916617368464e-06, + "loss": 1.1511, + "step": 3540 + }, + { + "epoch": 0.48003795838134616, + "grad_norm": 2.554060878001233, + "learning_rate": 1.1130553501841066e-06, + "loss": 1.1446, + "step": 3541 + }, + { + "epoch": 0.48017352402901103, + "grad_norm": 4.355854621237637, + "learning_rate": 1.112619016829535e-06, + "loss": 1.1751, + "step": 3542 + }, + { + "epoch": 0.4803090896766759, + "grad_norm": 3.7744790888661894, + "learning_rate": 1.1121826617572752e-06, + "loss": 1.1465, + "step": 3543 + }, + { + "epoch": 0.48044465532434083, + "grad_norm": 1.836165484913906, + "learning_rate": 1.1117462850514744e-06, + "loss": 1.1839, + "step": 3544 + }, + { + "epoch": 0.4805802209720057, + "grad_norm": 2.323480896812422, + "learning_rate": 1.1113098867962844e-06, + "loss": 1.2077, + "step": 3545 + }, + { + "epoch": 0.48071578661967057, + "grad_norm": 4.2827884889454175, + "learning_rate": 1.1108734670758616e-06, + "loss": 1.1159, + "step": 3546 + }, + { + "epoch": 0.48085135226733544, + "grad_norm": 2.0088607580806617, + "learning_rate": 1.1104370259743659e-06, + "loss": 1.2001, + "step": 3547 + }, + { + "epoch": 0.4809869179150003, + "grad_norm": 1.7903758015218056, + "learning_rate": 1.1100005635759612e-06, + "loss": 1.1414, + "step": 3548 + }, + { + "epoch": 0.48112248356266524, + "grad_norm": 1.8027603386967481, + "learning_rate": 1.1095640799648162e-06, + "loss": 1.1373, + "step": 3549 + }, + { + "epoch": 0.4812580492103301, + "grad_norm": 2.2532903639181088, + "learning_rate": 1.1091275752251035e-06, + "loss": 1.1491, + "step": 3550 + }, + { + "epoch": 0.481393614857995, + "grad_norm": 1.7920321683052787, + "learning_rate": 1.1086910494409993e-06, + "loss": 1.1397, + "step": 3551 + }, + { + "epoch": 0.48152918050565985, + "grad_norm": 2.082034545915359, + "learning_rate": 1.1082545026966841e-06, + "loss": 1.1372, + "step": 3552 + }, + { + "epoch": 0.4816647461533248, + "grad_norm": 2.6537129360504665, + "learning_rate": 1.1078179350763424e-06, + "loss": 1.1439, + "step": 3553 + }, + { + "epoch": 0.48180031180098964, + "grad_norm": 1.823553623699909, + "learning_rate": 1.107381346664163e-06, + "loss": 1.1408, + "step": 3554 + }, + { + "epoch": 
0.4819358774486545, + "grad_norm": 2.3446365264367803, + "learning_rate": 1.1069447375443386e-06, + "loss": 1.1686, + "step": 3555 + }, + { + "epoch": 0.4820714430963194, + "grad_norm": 2.382316976835756, + "learning_rate": 1.106508107801066e-06, + "loss": 1.1333, + "step": 3556 + }, + { + "epoch": 0.48220700874398426, + "grad_norm": 1.8853172264081761, + "learning_rate": 1.1060714575185453e-06, + "loss": 1.1653, + "step": 3557 + }, + { + "epoch": 0.4823425743916492, + "grad_norm": 2.844741569806154, + "learning_rate": 1.105634786780981e-06, + "loss": 1.1506, + "step": 3558 + }, + { + "epoch": 0.48247814003931405, + "grad_norm": 1.734298863668088, + "learning_rate": 1.105198095672582e-06, + "loss": 1.1276, + "step": 3559 + }, + { + "epoch": 0.4826137056869789, + "grad_norm": 9.252769749005214, + "learning_rate": 1.104761384277561e-06, + "loss": 1.1304, + "step": 3560 + }, + { + "epoch": 0.4827492713346438, + "grad_norm": 2.0656236982924736, + "learning_rate": 1.1043246526801338e-06, + "loss": 1.1674, + "step": 3561 + }, + { + "epoch": 0.48288483698230866, + "grad_norm": 1.9958985670584783, + "learning_rate": 1.1038879009645205e-06, + "loss": 1.1161, + "step": 3562 + }, + { + "epoch": 0.4830204026299736, + "grad_norm": 1.780786369656333, + "learning_rate": 1.103451129214946e-06, + "loss": 1.1754, + "step": 3563 + }, + { + "epoch": 0.48315596827763846, + "grad_norm": 2.047598966119439, + "learning_rate": 1.1030143375156375e-06, + "loss": 1.1709, + "step": 3564 + }, + { + "epoch": 0.48329153392530333, + "grad_norm": 2.1946594821727055, + "learning_rate": 1.1025775259508275e-06, + "loss": 1.1929, + "step": 3565 + }, + { + "epoch": 0.4834270995729682, + "grad_norm": 2.4558855690374894, + "learning_rate": 1.1021406946047508e-06, + "loss": 1.1563, + "step": 3566 + }, + { + "epoch": 0.48356266522063307, + "grad_norm": 1.6517502559207469, + "learning_rate": 1.101703843561648e-06, + "loss": 1.1249, + "step": 3567 + }, + { + "epoch": 0.483698230868298, + "grad_norm": 1.859830010573973, + "learning_rate": 1.1012669729057615e-06, + "loss": 1.2162, + "step": 3568 + }, + { + "epoch": 0.48383379651596287, + "grad_norm": 1.594031936086895, + "learning_rate": 1.1008300827213385e-06, + "loss": 1.1915, + "step": 3569 + }, + { + "epoch": 0.48396936216362774, + "grad_norm": 2.557149691658303, + "learning_rate": 1.10039317309263e-06, + "loss": 1.2046, + "step": 3570 + }, + { + "epoch": 0.4841049278112926, + "grad_norm": 3.1427291744979478, + "learning_rate": 1.0999562441038909e-06, + "loss": 1.1557, + "step": 3571 + }, + { + "epoch": 0.4842404934589575, + "grad_norm": 2.5536346571850803, + "learning_rate": 1.0995192958393785e-06, + "loss": 1.177, + "step": 3572 + }, + { + "epoch": 0.4843760591066224, + "grad_norm": 1.7133737950548067, + "learning_rate": 1.099082328383356e-06, + "loss": 1.1691, + "step": 3573 + }, + { + "epoch": 0.4845116247542873, + "grad_norm": 1.6176283415102504, + "learning_rate": 1.098645341820088e-06, + "loss": 1.1515, + "step": 3574 + }, + { + "epoch": 0.48464719040195214, + "grad_norm": 1.8919673643819044, + "learning_rate": 1.098208336233845e-06, + "loss": 1.1643, + "step": 3575 + }, + { + "epoch": 0.484782756049617, + "grad_norm": 1.732609371760582, + "learning_rate": 1.0977713117088994e-06, + "loss": 1.1616, + "step": 3576 + }, + { + "epoch": 0.4849183216972819, + "grad_norm": 2.464887198195928, + "learning_rate": 1.097334268329528e-06, + "loss": 1.2254, + "step": 3577 + }, + { + "epoch": 0.4850538873449468, + "grad_norm": 1.8145104574894357, + "learning_rate": 1.0968972061800115e-06, 
+ "loss": 1.1529, + "step": 3578 + }, + { + "epoch": 0.4851894529926117, + "grad_norm": 1.6665612842627462, + "learning_rate": 1.0964601253446332e-06, + "loss": 1.1328, + "step": 3579 + }, + { + "epoch": 0.48532501864027655, + "grad_norm": 1.8223514318684513, + "learning_rate": 1.0960230259076817e-06, + "loss": 1.1514, + "step": 3580 + }, + { + "epoch": 0.4854605842879414, + "grad_norm": 1.746613116589832, + "learning_rate": 1.0955859079534473e-06, + "loss": 1.1819, + "step": 3581 + }, + { + "epoch": 0.4855961499356063, + "grad_norm": 2.0651357819271827, + "learning_rate": 1.0951487715662253e-06, + "loss": 1.1462, + "step": 3582 + }, + { + "epoch": 0.4857317155832712, + "grad_norm": 1.8161064161489788, + "learning_rate": 1.0947116168303137e-06, + "loss": 1.1447, + "step": 3583 + }, + { + "epoch": 0.4858672812309361, + "grad_norm": 2.34923335780982, + "learning_rate": 1.0942744438300141e-06, + "loss": 1.1514, + "step": 3584 + }, + { + "epoch": 0.48600284687860096, + "grad_norm": 2.0786279023080647, + "learning_rate": 1.0938372526496324e-06, + "loss": 1.1502, + "step": 3585 + }, + { + "epoch": 0.48613841252626583, + "grad_norm": 1.6484966458508705, + "learning_rate": 1.0934000433734772e-06, + "loss": 1.1441, + "step": 3586 + }, + { + "epoch": 0.4862739781739307, + "grad_norm": 1.7457717577023777, + "learning_rate": 1.0929628160858611e-06, + "loss": 1.1704, + "step": 3587 + }, + { + "epoch": 0.4864095438215956, + "grad_norm": 2.2020988356546143, + "learning_rate": 1.0925255708710994e-06, + "loss": 1.161, + "step": 3588 + }, + { + "epoch": 0.4865451094692605, + "grad_norm": 2.512156821206157, + "learning_rate": 1.0920883078135118e-06, + "loss": 1.2102, + "step": 3589 + }, + { + "epoch": 0.48668067511692537, + "grad_norm": 2.5816291394957798, + "learning_rate": 1.0916510269974208e-06, + "loss": 1.1918, + "step": 3590 + }, + { + "epoch": 0.48681624076459024, + "grad_norm": 2.577311774906009, + "learning_rate": 1.091213728507153e-06, + "loss": 1.1469, + "step": 3591 + }, + { + "epoch": 0.48695180641225516, + "grad_norm": 1.752445794669743, + "learning_rate": 1.0907764124270374e-06, + "loss": 1.1749, + "step": 3592 + }, + { + "epoch": 0.48708737205992003, + "grad_norm": 1.8012773127676882, + "learning_rate": 1.0903390788414072e-06, + "loss": 1.1519, + "step": 3593 + }, + { + "epoch": 0.4872229377075849, + "grad_norm": 2.108801268854876, + "learning_rate": 1.089901727834599e-06, + "loss": 1.0984, + "step": 3594 + }, + { + "epoch": 0.4873585033552498, + "grad_norm": 1.889009976923399, + "learning_rate": 1.0894643594909518e-06, + "loss": 1.1693, + "step": 3595 + }, + { + "epoch": 0.48749406900291464, + "grad_norm": 2.0208041342976113, + "learning_rate": 1.0890269738948096e-06, + "loss": 1.1495, + "step": 3596 + }, + { + "epoch": 0.48762963465057957, + "grad_norm": 2.699527655004328, + "learning_rate": 1.088589571130518e-06, + "loss": 1.1595, + "step": 3597 + }, + { + "epoch": 0.48776520029824444, + "grad_norm": 1.820770981496219, + "learning_rate": 1.0881521512824268e-06, + "loss": 1.1734, + "step": 3598 + }, + { + "epoch": 0.4879007659459093, + "grad_norm": 4.000067094680213, + "learning_rate": 1.0877147144348892e-06, + "loss": 1.1411, + "step": 3599 + }, + { + "epoch": 0.4880363315935742, + "grad_norm": 2.0469843699751973, + "learning_rate": 1.087277260672261e-06, + "loss": 1.1919, + "step": 3600 + }, + { + "epoch": 0.48817189724123905, + "grad_norm": 2.34154822129999, + "learning_rate": 1.0868397900789024e-06, + "loss": 1.1599, + "step": 3601 + }, + { + "epoch": 0.488307462888904, + "grad_norm": 
1.6414482402728343, + "learning_rate": 1.0864023027391753e-06, + "loss": 1.1613, + "step": 3602 + }, + { + "epoch": 0.48844302853656885, + "grad_norm": 2.745793442308951, + "learning_rate": 1.0859647987374464e-06, + "loss": 1.1515, + "step": 3603 + }, + { + "epoch": 0.4885785941842337, + "grad_norm": 3.6635600277670677, + "learning_rate": 1.0855272781580846e-06, + "loss": 1.1788, + "step": 3604 + }, + { + "epoch": 0.4887141598318986, + "grad_norm": 1.966922206995547, + "learning_rate": 1.0850897410854624e-06, + "loss": 1.1281, + "step": 3605 + }, + { + "epoch": 0.48884972547956346, + "grad_norm": 2.172447815527278, + "learning_rate": 1.084652187603955e-06, + "loss": 1.1836, + "step": 3606 + }, + { + "epoch": 0.4889852911272284, + "grad_norm": 2.5699847433544187, + "learning_rate": 1.0842146177979418e-06, + "loss": 1.1943, + "step": 3607 + }, + { + "epoch": 0.48912085677489325, + "grad_norm": 1.61996066607193, + "learning_rate": 1.0837770317518043e-06, + "loss": 1.1829, + "step": 3608 + }, + { + "epoch": 0.4892564224225581, + "grad_norm": 1.7349324774441492, + "learning_rate": 1.083339429549927e-06, + "loss": 1.1628, + "step": 3609 + }, + { + "epoch": 0.489391988070223, + "grad_norm": 3.8086894725353084, + "learning_rate": 1.0829018112766993e-06, + "loss": 1.1403, + "step": 3610 + }, + { + "epoch": 0.48952755371788786, + "grad_norm": 1.8914189751032047, + "learning_rate": 1.0824641770165112e-06, + "loss": 1.1763, + "step": 3611 + }, + { + "epoch": 0.4896631193655528, + "grad_norm": 1.6183612817843056, + "learning_rate": 1.0820265268537578e-06, + "loss": 1.213, + "step": 3612 + }, + { + "epoch": 0.48979868501321766, + "grad_norm": 1.8333861212242513, + "learning_rate": 1.0815888608728359e-06, + "loss": 1.1912, + "step": 3613 + }, + { + "epoch": 0.48993425066088253, + "grad_norm": 1.9153419412822419, + "learning_rate": 1.0811511791581463e-06, + "loss": 1.1652, + "step": 3614 + }, + { + "epoch": 0.4900698163085474, + "grad_norm": 2.3674750117140944, + "learning_rate": 1.0807134817940923e-06, + "loss": 1.1783, + "step": 3615 + }, + { + "epoch": 0.49020538195621227, + "grad_norm": 2.0218620027180885, + "learning_rate": 1.0802757688650805e-06, + "loss": 1.1963, + "step": 3616 + }, + { + "epoch": 0.4903409476038772, + "grad_norm": 4.0754103499209124, + "learning_rate": 1.0798380404555203e-06, + "loss": 1.1502, + "step": 3617 + }, + { + "epoch": 0.49047651325154207, + "grad_norm": 1.850727949329974, + "learning_rate": 1.0794002966498246e-06, + "loss": 1.1257, + "step": 3618 + }, + { + "epoch": 0.49061207889920694, + "grad_norm": 2.103331523388582, + "learning_rate": 1.0789625375324078e-06, + "loss": 1.1221, + "step": 3619 + }, + { + "epoch": 0.4907476445468718, + "grad_norm": 1.903749069937183, + "learning_rate": 1.0785247631876892e-06, + "loss": 1.1691, + "step": 3620 + }, + { + "epoch": 0.4908832101945367, + "grad_norm": 1.8803627539144747, + "learning_rate": 1.0780869737000898e-06, + "loss": 1.1258, + "step": 3621 + }, + { + "epoch": 0.4910187758422016, + "grad_norm": 2.1676558159427657, + "learning_rate": 1.0776491691540342e-06, + "loss": 1.1616, + "step": 3622 + }, + { + "epoch": 0.4911543414898665, + "grad_norm": 1.5832715080009987, + "learning_rate": 1.077211349633949e-06, + "loss": 1.1896, + "step": 3623 + }, + { + "epoch": 0.49128990713753135, + "grad_norm": 3.5632056155827905, + "learning_rate": 1.0767735152242646e-06, + "loss": 1.1778, + "step": 3624 + }, + { + "epoch": 0.4914254727851962, + "grad_norm": 1.7513502596121024, + "learning_rate": 1.0763356660094139e-06, + "loss": 1.1588, + 
"step": 3625 + }, + { + "epoch": 0.4915610384328611, + "grad_norm": 1.645842368037343, + "learning_rate": 1.0758978020738323e-06, + "loss": 1.1695, + "step": 3626 + }, + { + "epoch": 0.491696604080526, + "grad_norm": 1.6930474973578968, + "learning_rate": 1.0754599235019586e-06, + "loss": 1.1917, + "step": 3627 + }, + { + "epoch": 0.4918321697281909, + "grad_norm": 1.8997458168241605, + "learning_rate": 1.0750220303782345e-06, + "loss": 1.1829, + "step": 3628 + }, + { + "epoch": 0.49196773537585575, + "grad_norm": 1.6867260787234888, + "learning_rate": 1.074584122787104e-06, + "loss": 1.1909, + "step": 3629 + }, + { + "epoch": 0.4921033010235206, + "grad_norm": 1.7710524155012417, + "learning_rate": 1.074146200813014e-06, + "loss": 1.1616, + "step": 3630 + }, + { + "epoch": 0.49223886667118555, + "grad_norm": 1.5546726033702425, + "learning_rate": 1.0737082645404147e-06, + "loss": 1.1184, + "step": 3631 + }, + { + "epoch": 0.4923744323188504, + "grad_norm": 4.257179907491818, + "learning_rate": 1.0732703140537583e-06, + "loss": 1.1514, + "step": 3632 + }, + { + "epoch": 0.4925099979665153, + "grad_norm": 1.7308376235327032, + "learning_rate": 1.0728323494375e-06, + "loss": 1.1572, + "step": 3633 + }, + { + "epoch": 0.49264556361418016, + "grad_norm": 2.273445292513544, + "learning_rate": 1.0723943707760984e-06, + "loss": 1.1874, + "step": 3634 + }, + { + "epoch": 0.49278112926184503, + "grad_norm": 1.8025907692607013, + "learning_rate": 1.0719563781540135e-06, + "loss": 1.1558, + "step": 3635 + }, + { + "epoch": 0.49291669490950996, + "grad_norm": 1.8405672198189735, + "learning_rate": 1.071518371655709e-06, + "loss": 1.1771, + "step": 3636 + }, + { + "epoch": 0.4930522605571748, + "grad_norm": 1.7919376661435706, + "learning_rate": 1.0710803513656514e-06, + "loss": 1.1634, + "step": 3637 + }, + { + "epoch": 0.4931878262048397, + "grad_norm": 2.30531812361696, + "learning_rate": 1.0706423173683092e-06, + "loss": 1.155, + "step": 3638 + }, + { + "epoch": 0.49332339185250457, + "grad_norm": 1.9826249711175672, + "learning_rate": 1.0702042697481536e-06, + "loss": 1.1567, + "step": 3639 + }, + { + "epoch": 0.49345895750016944, + "grad_norm": 1.9968285829083612, + "learning_rate": 1.0697662085896583e-06, + "loss": 1.1661, + "step": 3640 + }, + { + "epoch": 0.49359452314783436, + "grad_norm": 3.4227076191569616, + "learning_rate": 1.0693281339773009e-06, + "loss": 1.1771, + "step": 3641 + }, + { + "epoch": 0.49373008879549923, + "grad_norm": 1.672322099233065, + "learning_rate": 1.0688900459955596e-06, + "loss": 1.1462, + "step": 3642 + }, + { + "epoch": 0.4938656544431641, + "grad_norm": 1.5974202231209076, + "learning_rate": 1.0684519447289171e-06, + "loss": 1.1322, + "step": 3643 + }, + { + "epoch": 0.494001220090829, + "grad_norm": 1.9842239180457142, + "learning_rate": 1.0680138302618572e-06, + "loss": 1.1768, + "step": 3644 + }, + { + "epoch": 0.49413678573849384, + "grad_norm": 1.9650928468901552, + "learning_rate": 1.0675757026788672e-06, + "loss": 1.2095, + "step": 3645 + }, + { + "epoch": 0.49427235138615877, + "grad_norm": 3.7062292332416176, + "learning_rate": 1.0671375620644363e-06, + "loss": 1.1561, + "step": 3646 + }, + { + "epoch": 0.49440791703382364, + "grad_norm": 1.9858463736968937, + "learning_rate": 1.0666994085030563e-06, + "loss": 1.1378, + "step": 3647 + }, + { + "epoch": 0.4945434826814885, + "grad_norm": 2.263981294185094, + "learning_rate": 1.066261242079222e-06, + "loss": 1.1348, + "step": 3648 + }, + { + "epoch": 0.4946790483291534, + "grad_norm": 
1.7304212370766467, + "learning_rate": 1.0658230628774302e-06, + "loss": 1.22, + "step": 3649 + }, + { + "epoch": 0.49481461397681825, + "grad_norm": 2.203427553798413, + "learning_rate": 1.0653848709821806e-06, + "loss": 1.1665, + "step": 3650 + }, + { + "epoch": 0.4949501796244832, + "grad_norm": 6.573459474029164, + "learning_rate": 1.0649466664779744e-06, + "loss": 1.1578, + "step": 3651 + }, + { + "epoch": 0.49508574527214805, + "grad_norm": 1.7787404497997972, + "learning_rate": 1.0645084494493164e-06, + "loss": 1.1749, + "step": 3652 + }, + { + "epoch": 0.4952213109198129, + "grad_norm": 2.0227233767847292, + "learning_rate": 1.064070219980713e-06, + "loss": 1.1185, + "step": 3653 + }, + { + "epoch": 0.4953568765674778, + "grad_norm": 2.4924546502354064, + "learning_rate": 1.0636319781566736e-06, + "loss": 1.1504, + "step": 3654 + }, + { + "epoch": 0.49549244221514266, + "grad_norm": 1.8395049359576718, + "learning_rate": 1.0631937240617093e-06, + "loss": 1.1553, + "step": 3655 + }, + { + "epoch": 0.4956280078628076, + "grad_norm": 3.173937220363552, + "learning_rate": 1.062755457780334e-06, + "loss": 1.1717, + "step": 3656 + }, + { + "epoch": 0.49576357351047246, + "grad_norm": 1.8331877660648854, + "learning_rate": 1.0623171793970642e-06, + "loss": 1.1089, + "step": 3657 + }, + { + "epoch": 0.4958991391581373, + "grad_norm": 1.7749660636419504, + "learning_rate": 1.0618788889964182e-06, + "loss": 1.1804, + "step": 3658 + }, + { + "epoch": 0.4960347048058022, + "grad_norm": 2.5864171013334474, + "learning_rate": 1.061440586662917e-06, + "loss": 1.1877, + "step": 3659 + }, + { + "epoch": 0.49617027045346707, + "grad_norm": 1.6339624471722551, + "learning_rate": 1.0610022724810837e-06, + "loss": 1.1986, + "step": 3660 + }, + { + "epoch": 0.496305836101132, + "grad_norm": 1.5407209708856815, + "learning_rate": 1.0605639465354435e-06, + "loss": 1.1715, + "step": 3661 + }, + { + "epoch": 0.49644140174879686, + "grad_norm": 1.6577299238437297, + "learning_rate": 1.0601256089105242e-06, + "loss": 1.1855, + "step": 3662 + }, + { + "epoch": 0.49657696739646173, + "grad_norm": 1.65766824499725, + "learning_rate": 1.059687259690856e-06, + "loss": 1.1653, + "step": 3663 + }, + { + "epoch": 0.4967125330441266, + "grad_norm": 2.3948771244917353, + "learning_rate": 1.0592488989609708e-06, + "loss": 1.1464, + "step": 3664 + }, + { + "epoch": 0.4968480986917915, + "grad_norm": 1.9217647831957572, + "learning_rate": 1.0588105268054032e-06, + "loss": 1.1601, + "step": 3665 + }, + { + "epoch": 0.4969836643394564, + "grad_norm": 1.6595573095086678, + "learning_rate": 1.0583721433086899e-06, + "loss": 1.1816, + "step": 3666 + }, + { + "epoch": 0.49711922998712127, + "grad_norm": 1.8594653141114676, + "learning_rate": 1.0579337485553695e-06, + "loss": 1.1634, + "step": 3667 + }, + { + "epoch": 0.49725479563478614, + "grad_norm": 2.54484211823698, + "learning_rate": 1.0574953426299825e-06, + "loss": 1.1883, + "step": 3668 + }, + { + "epoch": 0.497390361282451, + "grad_norm": 2.1523581740202835, + "learning_rate": 1.057056925617073e-06, + "loss": 1.1562, + "step": 3669 + }, + { + "epoch": 0.49752592693011594, + "grad_norm": 1.6479433123773668, + "learning_rate": 1.0566184976011855e-06, + "loss": 1.1469, + "step": 3670 + }, + { + "epoch": 0.4976614925777808, + "grad_norm": 1.9533353860828289, + "learning_rate": 1.0561800586668678e-06, + "loss": 1.1754, + "step": 3671 + }, + { + "epoch": 0.4977970582254457, + "grad_norm": 2.081590155952598, + "learning_rate": 1.0557416088986692e-06, + "loss": 1.1316, + 
"step": 3672 + }, + { + "epoch": 0.49793262387311055, + "grad_norm": 1.7977976276215586, + "learning_rate": 1.0553031483811414e-06, + "loss": 1.1552, + "step": 3673 + }, + { + "epoch": 0.4980681895207754, + "grad_norm": 1.8068267283927653, + "learning_rate": 1.054864677198838e-06, + "loss": 1.1398, + "step": 3674 + }, + { + "epoch": 0.49820375516844034, + "grad_norm": 2.0238747258149674, + "learning_rate": 1.0544261954363146e-06, + "loss": 1.1708, + "step": 3675 + }, + { + "epoch": 0.4983393208161052, + "grad_norm": 1.781243312852086, + "learning_rate": 1.0539877031781289e-06, + "loss": 1.1604, + "step": 3676 + }, + { + "epoch": 0.4984748864637701, + "grad_norm": 1.8154967833868083, + "learning_rate": 1.053549200508841e-06, + "loss": 1.1406, + "step": 3677 + }, + { + "epoch": 0.49861045211143495, + "grad_norm": 1.9731203193983973, + "learning_rate": 1.0531106875130123e-06, + "loss": 1.1559, + "step": 3678 + }, + { + "epoch": 0.4987460177590998, + "grad_norm": 2.466413805495515, + "learning_rate": 1.0526721642752069e-06, + "loss": 1.1459, + "step": 3679 + }, + { + "epoch": 0.49888158340676475, + "grad_norm": 1.6630692911338953, + "learning_rate": 1.0522336308799904e-06, + "loss": 1.1338, + "step": 3680 + }, + { + "epoch": 0.4990171490544296, + "grad_norm": 1.9628668894226717, + "learning_rate": 1.0517950874119304e-06, + "loss": 1.1433, + "step": 3681 + }, + { + "epoch": 0.4991527147020945, + "grad_norm": 2.523159409940459, + "learning_rate": 1.0513565339555965e-06, + "loss": 1.1982, + "step": 3682 + }, + { + "epoch": 0.49928828034975936, + "grad_norm": 1.7462485684079783, + "learning_rate": 1.0509179705955607e-06, + "loss": 1.1772, + "step": 3683 + }, + { + "epoch": 0.49942384599742423, + "grad_norm": 2.280788331219759, + "learning_rate": 1.050479397416396e-06, + "loss": 1.18, + "step": 3684 + }, + { + "epoch": 0.49955941164508916, + "grad_norm": 1.872916670483736, + "learning_rate": 1.050040814502678e-06, + "loss": 1.1571, + "step": 3685 + }, + { + "epoch": 0.49969497729275403, + "grad_norm": 2.0539639653293644, + "learning_rate": 1.049602221938984e-06, + "loss": 1.15, + "step": 3686 + }, + { + "epoch": 0.4998305429404189, + "grad_norm": 1.7467890381274391, + "learning_rate": 1.0491636198098932e-06, + "loss": 1.1704, + "step": 3687 + }, + { + "epoch": 0.49996610858808377, + "grad_norm": 1.8834127308231146, + "learning_rate": 1.048725008199986e-06, + "loss": 1.1641, + "step": 3688 + }, + { + "epoch": 0.5001016742357487, + "grad_norm": 2.0530996494544826, + "learning_rate": 1.0482863871938459e-06, + "loss": 1.1598, + "step": 3689 + }, + { + "epoch": 0.5002372398834135, + "grad_norm": 1.9019200171643993, + "learning_rate": 1.047847756876057e-06, + "loss": 1.1843, + "step": 3690 + }, + { + "epoch": 0.5003728055310784, + "grad_norm": 2.1148197377073727, + "learning_rate": 1.0474091173312058e-06, + "loss": 1.1758, + "step": 3691 + }, + { + "epoch": 0.5005083711787434, + "grad_norm": 1.7010845019096017, + "learning_rate": 1.0469704686438807e-06, + "loss": 1.1587, + "step": 3692 + }, + { + "epoch": 0.5006439368264082, + "grad_norm": 3.8275347645885165, + "learning_rate": 1.0465318108986713e-06, + "loss": 1.1644, + "step": 3693 + }, + { + "epoch": 0.5007795024740731, + "grad_norm": 1.6586947726323282, + "learning_rate": 1.04609314418017e-06, + "loss": 1.194, + "step": 3694 + }, + { + "epoch": 0.5009150681217379, + "grad_norm": 1.95527856440022, + "learning_rate": 1.045654468572969e-06, + "loss": 1.1653, + "step": 3695 + }, + { + "epoch": 0.5010506337694028, + "grad_norm": 1.5508761264763185, + 
"learning_rate": 1.0452157841616645e-06, + "loss": 1.1201, + "step": 3696 + }, + { + "epoch": 0.5011861994170678, + "grad_norm": 1.6957683212989012, + "learning_rate": 1.044777091030853e-06, + "loss": 1.161, + "step": 3697 + }, + { + "epoch": 0.5013217650647326, + "grad_norm": 1.7738455461988132, + "learning_rate": 1.0443383892651325e-06, + "loss": 1.1372, + "step": 3698 + }, + { + "epoch": 0.5014573307123975, + "grad_norm": 2.0809340634739844, + "learning_rate": 1.043899678949104e-06, + "loss": 1.1594, + "step": 3699 + }, + { + "epoch": 0.5015928963600623, + "grad_norm": 4.558322966514518, + "learning_rate": 1.0434609601673687e-06, + "loss": 1.1449, + "step": 3700 + }, + { + "epoch": 0.5017284620077272, + "grad_norm": 1.6906118675525326, + "learning_rate": 1.0430222330045304e-06, + "loss": 1.1511, + "step": 3701 + }, + { + "epoch": 0.5018640276553922, + "grad_norm": 3.6695788989390614, + "learning_rate": 1.0425834975451942e-06, + "loss": 1.2474, + "step": 3702 + }, + { + "epoch": 0.501999593303057, + "grad_norm": 2.5006059669467184, + "learning_rate": 1.0421447538739664e-06, + "loss": 1.1785, + "step": 3703 + }, + { + "epoch": 0.5021351589507219, + "grad_norm": 5.996758780809494, + "learning_rate": 1.0417060020754555e-06, + "loss": 1.1179, + "step": 3704 + }, + { + "epoch": 0.5022707245983867, + "grad_norm": 1.74726980210793, + "learning_rate": 1.0412672422342714e-06, + "loss": 1.1551, + "step": 3705 + }, + { + "epoch": 0.5024062902460517, + "grad_norm": 1.9061944421546182, + "learning_rate": 1.0408284744350255e-06, + "loss": 1.1268, + "step": 3706 + }, + { + "epoch": 0.5025418558937166, + "grad_norm": 7.580408902643581, + "learning_rate": 1.0403896987623304e-06, + "loss": 1.1666, + "step": 3707 + }, + { + "epoch": 0.5026774215413814, + "grad_norm": 1.7015485034727813, + "learning_rate": 1.039950915300801e-06, + "loss": 1.1694, + "step": 3708 + }, + { + "epoch": 0.5028129871890463, + "grad_norm": 1.6944947305301814, + "learning_rate": 1.039512124135053e-06, + "loss": 1.1154, + "step": 3709 + }, + { + "epoch": 0.5029485528367111, + "grad_norm": 2.1441039973588905, + "learning_rate": 1.0390733253497033e-06, + "loss": 1.1651, + "step": 3710 + }, + { + "epoch": 0.5030841184843761, + "grad_norm": 2.1242329099039985, + "learning_rate": 1.0386345190293714e-06, + "loss": 1.1375, + "step": 3711 + }, + { + "epoch": 0.503219684132041, + "grad_norm": 1.7393120130370767, + "learning_rate": 1.0381957052586774e-06, + "loss": 1.1385, + "step": 3712 + }, + { + "epoch": 0.5033552497797058, + "grad_norm": 1.6361673782180113, + "learning_rate": 1.037756884122243e-06, + "loss": 1.156, + "step": 3713 + }, + { + "epoch": 0.5034908154273707, + "grad_norm": 1.5861450378332416, + "learning_rate": 1.037318055704692e-06, + "loss": 1.1515, + "step": 3714 + }, + { + "epoch": 0.5036263810750355, + "grad_norm": 4.137327531734821, + "learning_rate": 1.0368792200906482e-06, + "loss": 1.1348, + "step": 3715 + }, + { + "epoch": 0.5037619467227005, + "grad_norm": 1.856920148687339, + "learning_rate": 1.0364403773647379e-06, + "loss": 1.1214, + "step": 3716 + }, + { + "epoch": 0.5038975123703654, + "grad_norm": 2.012347116311769, + "learning_rate": 1.0360015276115888e-06, + "loss": 1.18, + "step": 3717 + }, + { + "epoch": 0.5040330780180302, + "grad_norm": 1.6415709021504332, + "learning_rate": 1.035562670915829e-06, + "loss": 1.1536, + "step": 3718 + }, + { + "epoch": 0.5041686436656951, + "grad_norm": 1.9996242821416275, + "learning_rate": 1.0351238073620887e-06, + "loss": 1.1709, + "step": 3719 + }, + { + "epoch": 
0.50430420931336, + "grad_norm": 1.9630149886144825, + "learning_rate": 1.0346849370349997e-06, + "loss": 1.1476, + "step": 3720 + }, + { + "epoch": 0.5044397749610249, + "grad_norm": 2.2510491142012508, + "learning_rate": 1.0342460600191942e-06, + "loss": 1.1724, + "step": 3721 + }, + { + "epoch": 0.5045753406086898, + "grad_norm": 15.883628475583006, + "learning_rate": 1.0338071763993065e-06, + "loss": 1.1457, + "step": 3722 + }, + { + "epoch": 0.5047109062563546, + "grad_norm": 1.729493195923143, + "learning_rate": 1.0333682862599714e-06, + "loss": 1.1765, + "step": 3723 + }, + { + "epoch": 0.5048464719040195, + "grad_norm": 2.113297331645459, + "learning_rate": 1.032929389685826e-06, + "loss": 1.1219, + "step": 3724 + }, + { + "epoch": 0.5049820375516844, + "grad_norm": 1.8152905475934744, + "learning_rate": 1.0324904867615077e-06, + "loss": 1.2007, + "step": 3725 + }, + { + "epoch": 0.5051176031993493, + "grad_norm": 1.7088245207208046, + "learning_rate": 1.0320515775716554e-06, + "loss": 1.1607, + "step": 3726 + }, + { + "epoch": 0.5052531688470142, + "grad_norm": 1.6353607865454927, + "learning_rate": 1.0316126622009092e-06, + "loss": 1.1387, + "step": 3727 + }, + { + "epoch": 0.505388734494679, + "grad_norm": 2.447530030718977, + "learning_rate": 1.0311737407339106e-06, + "loss": 1.1538, + "step": 3728 + }, + { + "epoch": 0.505524300142344, + "grad_norm": 1.7552286737961122, + "learning_rate": 1.0307348132553024e-06, + "loss": 1.1346, + "step": 3729 + }, + { + "epoch": 0.5056598657900088, + "grad_norm": 1.8380733361958028, + "learning_rate": 1.030295879849728e-06, + "loss": 1.1763, + "step": 3730 + }, + { + "epoch": 0.5057954314376737, + "grad_norm": 1.7312063731905714, + "learning_rate": 1.0298569406018325e-06, + "loss": 1.1677, + "step": 3731 + }, + { + "epoch": 0.5059309970853386, + "grad_norm": 1.6088008484015248, + "learning_rate": 1.0294179955962614e-06, + "loss": 1.13, + "step": 3732 + }, + { + "epoch": 0.5060665627330034, + "grad_norm": 1.839982079244118, + "learning_rate": 1.0289790449176622e-06, + "loss": 1.1836, + "step": 3733 + }, + { + "epoch": 0.5062021283806684, + "grad_norm": 1.8399937304890943, + "learning_rate": 1.0285400886506828e-06, + "loss": 1.1462, + "step": 3734 + }, + { + "epoch": 0.5063376940283332, + "grad_norm": 2.0631354421927144, + "learning_rate": 1.0281011268799726e-06, + "loss": 1.1402, + "step": 3735 + }, + { + "epoch": 0.5064732596759981, + "grad_norm": 3.850910044714745, + "learning_rate": 1.0276621596901821e-06, + "loss": 1.1506, + "step": 3736 + }, + { + "epoch": 0.506608825323663, + "grad_norm": 1.5350789095759332, + "learning_rate": 1.0272231871659624e-06, + "loss": 1.1429, + "step": 3737 + }, + { + "epoch": 0.5067443909713278, + "grad_norm": 1.9559741695558461, + "learning_rate": 1.026784209391966e-06, + "loss": 1.1654, + "step": 3738 + }, + { + "epoch": 0.5068799566189928, + "grad_norm": 2.0178211102863526, + "learning_rate": 1.026345226452846e-06, + "loss": 1.1666, + "step": 3739 + }, + { + "epoch": 0.5070155222666576, + "grad_norm": 2.1210614260626586, + "learning_rate": 1.0259062384332573e-06, + "loss": 1.1525, + "step": 3740 + }, + { + "epoch": 0.5071510879143225, + "grad_norm": 2.46160758897193, + "learning_rate": 1.0254672454178547e-06, + "loss": 1.1671, + "step": 3741 + }, + { + "epoch": 0.5072866535619874, + "grad_norm": 1.9078001958573303, + "learning_rate": 1.0250282474912952e-06, + "loss": 1.1579, + "step": 3742 + }, + { + "epoch": 0.5074222192096522, + "grad_norm": 1.9290148798523683, + "learning_rate": 1.0245892447382354e-06, + 
"loss": 1.1508, + "step": 3743 + }, + { + "epoch": 0.5075577848573172, + "grad_norm": 1.887680503133244, + "learning_rate": 1.0241502372433342e-06, + "loss": 1.1563, + "step": 3744 + }, + { + "epoch": 0.507693350504982, + "grad_norm": 1.923030153973475, + "learning_rate": 1.02371122509125e-06, + "loss": 1.1489, + "step": 3745 + }, + { + "epoch": 0.5078289161526469, + "grad_norm": 2.1804248707145697, + "learning_rate": 1.0232722083666435e-06, + "loss": 1.1587, + "step": 3746 + }, + { + "epoch": 0.5079644818003118, + "grad_norm": 1.9121885572454276, + "learning_rate": 1.022833187154175e-06, + "loss": 1.1518, + "step": 3747 + }, + { + "epoch": 0.5081000474479767, + "grad_norm": 1.9028083068191124, + "learning_rate": 1.022394161538507e-06, + "loss": 1.1435, + "step": 3748 + }, + { + "epoch": 0.5082356130956416, + "grad_norm": 1.8184887775461849, + "learning_rate": 1.0219551316043016e-06, + "loss": 1.1654, + "step": 3749 + }, + { + "epoch": 0.5083711787433064, + "grad_norm": 2.10716378633194, + "learning_rate": 1.0215160974362223e-06, + "loss": 1.1103, + "step": 3750 + }, + { + "epoch": 0.5085067443909713, + "grad_norm": 1.9800088592616338, + "learning_rate": 1.0210770591189333e-06, + "loss": 1.1192, + "step": 3751 + }, + { + "epoch": 0.5086423100386362, + "grad_norm": 2.034372087958295, + "learning_rate": 1.0206380167371e-06, + "loss": 1.1627, + "step": 3752 + }, + { + "epoch": 0.5087778756863011, + "grad_norm": 4.562120955002243, + "learning_rate": 1.0201989703753881e-06, + "loss": 1.1669, + "step": 3753 + }, + { + "epoch": 0.508913441333966, + "grad_norm": 2.1381016289452925, + "learning_rate": 1.0197599201184642e-06, + "loss": 1.1863, + "step": 3754 + }, + { + "epoch": 0.5090490069816308, + "grad_norm": 2.0776317246692173, + "learning_rate": 1.0193208660509956e-06, + "loss": 1.1596, + "step": 3755 + }, + { + "epoch": 0.5091845726292957, + "grad_norm": 1.7044154323789542, + "learning_rate": 1.0188818082576505e-06, + "loss": 1.1547, + "step": 3756 + }, + { + "epoch": 0.5093201382769607, + "grad_norm": 1.5423535086082705, + "learning_rate": 1.0184427468230976e-06, + "loss": 1.1633, + "step": 3757 + }, + { + "epoch": 0.5094557039246255, + "grad_norm": 2.033913254679087, + "learning_rate": 1.0180036818320067e-06, + "loss": 1.1659, + "step": 3758 + }, + { + "epoch": 0.5095912695722904, + "grad_norm": 1.9183170080485885, + "learning_rate": 1.0175646133690479e-06, + "loss": 1.1579, + "step": 3759 + }, + { + "epoch": 0.5097268352199552, + "grad_norm": 1.7164998872224246, + "learning_rate": 1.017125541518892e-06, + "loss": 1.1582, + "step": 3760 + }, + { + "epoch": 0.5098624008676201, + "grad_norm": 1.720428766002654, + "learning_rate": 1.0166864663662104e-06, + "loss": 1.1779, + "step": 3761 + }, + { + "epoch": 0.5099979665152851, + "grad_norm": 1.8106074988551535, + "learning_rate": 1.016247387995676e-06, + "loss": 1.156, + "step": 3762 + }, + { + "epoch": 0.5101335321629499, + "grad_norm": 1.5930671995457926, + "learning_rate": 1.0158083064919605e-06, + "loss": 1.1221, + "step": 3763 + }, + { + "epoch": 0.5102690978106148, + "grad_norm": 2.4069253735741505, + "learning_rate": 1.0153692219397385e-06, + "loss": 1.1596, + "step": 3764 + }, + { + "epoch": 0.5104046634582796, + "grad_norm": 2.1500116719256392, + "learning_rate": 1.014930134423683e-06, + "loss": 1.1552, + "step": 3765 + }, + { + "epoch": 0.5105402291059445, + "grad_norm": 1.8262086769116037, + "learning_rate": 1.0144910440284689e-06, + "loss": 1.1531, + "step": 3766 + }, + { + "epoch": 0.5106757947536095, + "grad_norm": 
1.8162644022365897, + "learning_rate": 1.0140519508387713e-06, + "loss": 1.1509, + "step": 3767 + }, + { + "epoch": 0.5108113604012743, + "grad_norm": 2.196310022580976, + "learning_rate": 1.013612854939266e-06, + "loss": 1.1886, + "step": 3768 + }, + { + "epoch": 0.5109469260489392, + "grad_norm": 1.8154274019837624, + "learning_rate": 1.013173756414629e-06, + "loss": 1.1387, + "step": 3769 + }, + { + "epoch": 0.5110824916966041, + "grad_norm": 1.5300617866205857, + "learning_rate": 1.0127346553495371e-06, + "loss": 1.1802, + "step": 3770 + }, + { + "epoch": 0.511218057344269, + "grad_norm": 1.8978904000672097, + "learning_rate": 1.0122955518286672e-06, + "loss": 1.1197, + "step": 3771 + }, + { + "epoch": 0.5113536229919339, + "grad_norm": 14.465152336306465, + "learning_rate": 1.0118564459366976e-06, + "loss": 1.1296, + "step": 3772 + }, + { + "epoch": 0.5114891886395987, + "grad_norm": 4.712262775332603, + "learning_rate": 1.0114173377583057e-06, + "loss": 1.1564, + "step": 3773 + }, + { + "epoch": 0.5116247542872636, + "grad_norm": 2.52293548139927, + "learning_rate": 1.0109782273781706e-06, + "loss": 1.1278, + "step": 3774 + }, + { + "epoch": 0.5117603199349285, + "grad_norm": 1.889384824942495, + "learning_rate": 1.0105391148809707e-06, + "loss": 1.1706, + "step": 3775 + }, + { + "epoch": 0.5118958855825934, + "grad_norm": 2.344621189826117, + "learning_rate": 1.010100000351386e-06, + "loss": 1.1668, + "step": 3776 + }, + { + "epoch": 0.5120314512302583, + "grad_norm": 4.140924930206694, + "learning_rate": 1.0096608838740956e-06, + "loss": 1.1046, + "step": 3777 + }, + { + "epoch": 0.5121670168779231, + "grad_norm": 10.0706901495025, + "learning_rate": 1.0092217655337806e-06, + "loss": 1.1957, + "step": 3778 + }, + { + "epoch": 0.512302582525588, + "grad_norm": 1.9217170353723316, + "learning_rate": 1.0087826454151205e-06, + "loss": 1.2078, + "step": 3779 + }, + { + "epoch": 0.512438148173253, + "grad_norm": 3.0722228474807394, + "learning_rate": 1.0083435236027967e-06, + "loss": 1.161, + "step": 3780 + }, + { + "epoch": 0.5125737138209178, + "grad_norm": 2.3569469419854454, + "learning_rate": 1.00790440018149e-06, + "loss": 1.1466, + "step": 3781 + }, + { + "epoch": 0.5127092794685827, + "grad_norm": 1.9760063820330427, + "learning_rate": 1.0074652752358822e-06, + "loss": 1.1268, + "step": 3782 + }, + { + "epoch": 0.5128448451162475, + "grad_norm": 2.044389307321208, + "learning_rate": 1.0070261488506551e-06, + "loss": 1.1453, + "step": 3783 + }, + { + "epoch": 0.5129804107639124, + "grad_norm": 9.109456602812825, + "learning_rate": 1.0065870211104906e-06, + "loss": 1.1968, + "step": 3784 + }, + { + "epoch": 0.5131159764115774, + "grad_norm": 3.4460060556672643, + "learning_rate": 1.006147892100071e-06, + "loss": 1.1635, + "step": 3785 + }, + { + "epoch": 0.5132515420592422, + "grad_norm": 1.7306298898469286, + "learning_rate": 1.0057087619040792e-06, + "loss": 1.1365, + "step": 3786 + }, + { + "epoch": 0.5133871077069071, + "grad_norm": 2.1463045717846656, + "learning_rate": 1.0052696306071974e-06, + "loss": 1.1633, + "step": 3787 + }, + { + "epoch": 0.5135226733545719, + "grad_norm": 1.6453942333744296, + "learning_rate": 1.0048304982941089e-06, + "loss": 1.1733, + "step": 3788 + }, + { + "epoch": 0.5136582390022368, + "grad_norm": 1.9009649085294906, + "learning_rate": 1.0043913650494972e-06, + "loss": 1.1563, + "step": 3789 + }, + { + "epoch": 0.5137938046499018, + "grad_norm": 2.426567446409491, + "learning_rate": 1.0039522309580453e-06, + "loss": 1.1503, + "step": 3790 + }, + 
{ + "epoch": 0.5139293702975666, + "grad_norm": 1.6893782542307505, + "learning_rate": 1.003513096104437e-06, + "loss": 1.1409, + "step": 3791 + }, + { + "epoch": 0.5140649359452315, + "grad_norm": 6.061427203846186, + "learning_rate": 1.0030739605733557e-06, + "loss": 1.1739, + "step": 3792 + }, + { + "epoch": 0.5142005015928963, + "grad_norm": 1.847313619272036, + "learning_rate": 1.0026348244494853e-06, + "loss": 1.1722, + "step": 3793 + }, + { + "epoch": 0.5143360672405612, + "grad_norm": 1.7970039269977716, + "learning_rate": 1.0021956878175099e-06, + "loss": 1.1442, + "step": 3794 + }, + { + "epoch": 0.5144716328882262, + "grad_norm": 1.9975723415676387, + "learning_rate": 1.0017565507621135e-06, + "loss": 1.1495, + "step": 3795 + }, + { + "epoch": 0.514607198535891, + "grad_norm": 2.544708619190699, + "learning_rate": 1.0013174133679801e-06, + "loss": 1.1827, + "step": 3796 + }, + { + "epoch": 0.5147427641835559, + "grad_norm": 1.883110910845688, + "learning_rate": 1.0008782757197939e-06, + "loss": 1.1955, + "step": 3797 + }, + { + "epoch": 0.5148783298312207, + "grad_norm": 1.7968825547240888, + "learning_rate": 1.000439137902239e-06, + "loss": 1.1648, + "step": 3798 + }, + { + "epoch": 0.5150138954788857, + "grad_norm": 1.779621386981684, + "learning_rate": 1e-06, + "loss": 1.1501, + "step": 3799 + }, + { + "epoch": 0.5151494611265506, + "grad_norm": 1.6691987734514993, + "learning_rate": 9.995608620977612e-07, + "loss": 1.1546, + "step": 3800 + }, + { + "epoch": 0.5152850267742154, + "grad_norm": 1.673948513976532, + "learning_rate": 9.991217242802063e-07, + "loss": 1.1766, + "step": 3801 + }, + { + "epoch": 0.5154205924218803, + "grad_norm": 3.156234035639482, + "learning_rate": 9.986825866320202e-07, + "loss": 1.174, + "step": 3802 + }, + { + "epoch": 0.5155561580695451, + "grad_norm": 2.2537840143522185, + "learning_rate": 9.982434492378864e-07, + "loss": 1.202, + "step": 3803 + }, + { + "epoch": 0.5156917237172101, + "grad_norm": 2.214924199828218, + "learning_rate": 9.978043121824903e-07, + "loss": 1.1389, + "step": 3804 + }, + { + "epoch": 0.515827289364875, + "grad_norm": 3.1500245849603963, + "learning_rate": 9.973651755505146e-07, + "loss": 1.1292, + "step": 3805 + }, + { + "epoch": 0.5159628550125398, + "grad_norm": 1.883674189393768, + "learning_rate": 9.969260394266446e-07, + "loss": 1.1842, + "step": 3806 + }, + { + "epoch": 0.5160984206602047, + "grad_norm": 4.581658793960616, + "learning_rate": 9.96486903895563e-07, + "loss": 1.18, + "step": 3807 + }, + { + "epoch": 0.5162339863078695, + "grad_norm": 2.5909813163582758, + "learning_rate": 9.960477690419548e-07, + "loss": 1.1449, + "step": 3808 + }, + { + "epoch": 0.5163695519555345, + "grad_norm": 1.8204961489435787, + "learning_rate": 9.956086349505027e-07, + "loss": 1.1525, + "step": 3809 + }, + { + "epoch": 0.5165051176031994, + "grad_norm": 2.0956651125021915, + "learning_rate": 9.95169501705891e-07, + "loss": 1.1354, + "step": 3810 + }, + { + "epoch": 0.5166406832508642, + "grad_norm": 2.2137529792086146, + "learning_rate": 9.947303693928026e-07, + "loss": 1.1772, + "step": 3811 + }, + { + "epoch": 0.5167762488985291, + "grad_norm": 2.164159389640738, + "learning_rate": 9.94291238095921e-07, + "loss": 1.1654, + "step": 3812 + }, + { + "epoch": 0.516911814546194, + "grad_norm": 2.0248171410555944, + "learning_rate": 9.938521078999288e-07, + "loss": 1.1625, + "step": 3813 + }, + { + "epoch": 0.5170473801938589, + "grad_norm": 4.353789198964426, + "learning_rate": 9.934129788895093e-07, + "loss": 1.1482, + 
"step": 3814 + }, + { + "epoch": 0.5171829458415238, + "grad_norm": 1.8008312160834907, + "learning_rate": 9.92973851149345e-07, + "loss": 1.1921, + "step": 3815 + }, + { + "epoch": 0.5173185114891886, + "grad_norm": 1.6063183901075817, + "learning_rate": 9.92534724764118e-07, + "loss": 1.137, + "step": 3816 + }, + { + "epoch": 0.5174540771368535, + "grad_norm": 1.936998710506088, + "learning_rate": 9.920955998185102e-07, + "loss": 1.1245, + "step": 3817 + }, + { + "epoch": 0.5175896427845184, + "grad_norm": 1.8791384632256873, + "learning_rate": 9.916564763972035e-07, + "loss": 1.1682, + "step": 3818 + }, + { + "epoch": 0.5177252084321833, + "grad_norm": 1.7995439667242161, + "learning_rate": 9.912173545848796e-07, + "loss": 1.1187, + "step": 3819 + }, + { + "epoch": 0.5178607740798482, + "grad_norm": 1.9111755161096966, + "learning_rate": 9.907782344662194e-07, + "loss": 1.1534, + "step": 3820 + }, + { + "epoch": 0.517996339727513, + "grad_norm": 1.6646943822735754, + "learning_rate": 9.903391161259043e-07, + "loss": 1.157, + "step": 3821 + }, + { + "epoch": 0.518131905375178, + "grad_norm": 1.6244115572382738, + "learning_rate": 9.898999996486137e-07, + "loss": 1.1313, + "step": 3822 + }, + { + "epoch": 0.5182674710228428, + "grad_norm": 1.7303565356971649, + "learning_rate": 9.894608851190292e-07, + "loss": 1.1326, + "step": 3823 + }, + { + "epoch": 0.5184030366705077, + "grad_norm": 2.09528902853197, + "learning_rate": 9.890217726218293e-07, + "loss": 1.15, + "step": 3824 + }, + { + "epoch": 0.5185386023181726, + "grad_norm": 2.376232529491255, + "learning_rate": 9.885826622416942e-07, + "loss": 1.1877, + "step": 3825 + }, + { + "epoch": 0.5186741679658374, + "grad_norm": 2.9130933084084627, + "learning_rate": 9.88143554063302e-07, + "loss": 1.171, + "step": 3826 + }, + { + "epoch": 0.5188097336135024, + "grad_norm": 1.9046944442675133, + "learning_rate": 9.877044481713327e-07, + "loss": 1.152, + "step": 3827 + }, + { + "epoch": 0.5189452992611672, + "grad_norm": 1.9564405000151555, + "learning_rate": 9.872653446504632e-07, + "loss": 1.1259, + "step": 3828 + }, + { + "epoch": 0.5190808649088321, + "grad_norm": 1.6899054305653238, + "learning_rate": 9.86826243585371e-07, + "loss": 1.1948, + "step": 3829 + }, + { + "epoch": 0.519216430556497, + "grad_norm": 2.1226413936155986, + "learning_rate": 9.863871450607342e-07, + "loss": 1.1639, + "step": 3830 + }, + { + "epoch": 0.5193519962041618, + "grad_norm": 1.567198023020106, + "learning_rate": 9.859480491612288e-07, + "loss": 1.1621, + "step": 3831 + }, + { + "epoch": 0.5194875618518268, + "grad_norm": 2.3058352159242506, + "learning_rate": 9.855089559715314e-07, + "loss": 1.1502, + "step": 3832 + }, + { + "epoch": 0.5196231274994916, + "grad_norm": 1.7401570811479996, + "learning_rate": 9.850698655763171e-07, + "loss": 1.1719, + "step": 3833 + }, + { + "epoch": 0.5197586931471565, + "grad_norm": 1.8391822493211771, + "learning_rate": 9.846307780602619e-07, + "loss": 1.129, + "step": 3834 + }, + { + "epoch": 0.5198942587948214, + "grad_norm": 1.732592069134935, + "learning_rate": 9.841916935080392e-07, + "loss": 1.1592, + "step": 3835 + }, + { + "epoch": 0.5200298244424862, + "grad_norm": 1.8692412042253388, + "learning_rate": 9.837526120043242e-07, + "loss": 1.152, + "step": 3836 + }, + { + "epoch": 0.5201653900901512, + "grad_norm": 2.22472643203302, + "learning_rate": 9.833135336337893e-07, + "loss": 1.1202, + "step": 3837 + }, + { + "epoch": 0.520300955737816, + "grad_norm": 2.0244643874037354, + "learning_rate": 9.82874458481108e-07, 
+ "loss": 1.1642, + "step": 3838 + }, + { + "epoch": 0.5204365213854809, + "grad_norm": 2.2143825835545, + "learning_rate": 9.82435386630952e-07, + "loss": 1.2, + "step": 3839 + }, + { + "epoch": 0.5205720870331458, + "grad_norm": 4.500836065457545, + "learning_rate": 9.819963181679934e-07, + "loss": 1.1254, + "step": 3840 + }, + { + "epoch": 0.5207076526808107, + "grad_norm": 1.7598244228624467, + "learning_rate": 9.81557253176902e-07, + "loss": 1.1129, + "step": 3841 + }, + { + "epoch": 0.5208432183284756, + "grad_norm": 5.375514112018413, + "learning_rate": 9.811181917423495e-07, + "loss": 1.1491, + "step": 3842 + }, + { + "epoch": 0.5209787839761404, + "grad_norm": 1.668476398920582, + "learning_rate": 9.806791339490047e-07, + "loss": 1.1149, + "step": 3843 + }, + { + "epoch": 0.5211143496238053, + "grad_norm": 1.9169390456257873, + "learning_rate": 9.802400798815357e-07, + "loss": 1.1718, + "step": 3844 + }, + { + "epoch": 0.5212499152714702, + "grad_norm": 1.9057648550430084, + "learning_rate": 9.79801029624612e-07, + "loss": 1.1283, + "step": 3845 + }, + { + "epoch": 0.5213854809191351, + "grad_norm": 1.589974484824779, + "learning_rate": 9.793619832629001e-07, + "loss": 1.1774, + "step": 3846 + }, + { + "epoch": 0.5215210465668, + "grad_norm": 1.8645602138414954, + "learning_rate": 9.789229408810668e-07, + "loss": 1.1473, + "step": 3847 + }, + { + "epoch": 0.5216566122144649, + "grad_norm": 1.9870337734061299, + "learning_rate": 9.784839025637778e-07, + "loss": 1.1524, + "step": 3848 + }, + { + "epoch": 0.5217921778621297, + "grad_norm": 1.6548082274601559, + "learning_rate": 9.780448683956983e-07, + "loss": 1.1623, + "step": 3849 + }, + { + "epoch": 0.5219277435097947, + "grad_norm": 1.845633811180532, + "learning_rate": 9.77605838461493e-07, + "loss": 1.1836, + "step": 3850 + }, + { + "epoch": 0.5220633091574595, + "grad_norm": 1.717021824588795, + "learning_rate": 9.771668128458251e-07, + "loss": 1.101, + "step": 3851 + }, + { + "epoch": 0.5221988748051244, + "grad_norm": 2.034291220696929, + "learning_rate": 9.767277916333564e-07, + "loss": 1.1213, + "step": 3852 + }, + { + "epoch": 0.5223344404527893, + "grad_norm": 2.1350460581687263, + "learning_rate": 9.762887749087501e-07, + "loss": 1.1712, + "step": 3853 + }, + { + "epoch": 0.5224700061004541, + "grad_norm": 1.746393118520977, + "learning_rate": 9.758497627566657e-07, + "loss": 1.1657, + "step": 3854 + }, + { + "epoch": 0.5226055717481191, + "grad_norm": 1.82013934435738, + "learning_rate": 9.754107552617645e-07, + "loss": 1.1902, + "step": 3855 + }, + { + "epoch": 0.5227411373957839, + "grad_norm": 1.7356704005078922, + "learning_rate": 9.749717525087051e-07, + "loss": 1.1583, + "step": 3856 + }, + { + "epoch": 0.5228767030434488, + "grad_norm": 1.7045301275410003, + "learning_rate": 9.745327545821452e-07, + "loss": 1.153, + "step": 3857 + }, + { + "epoch": 0.5230122686911137, + "grad_norm": 1.8239973260798157, + "learning_rate": 9.74093761566743e-07, + "loss": 1.109, + "step": 3858 + }, + { + "epoch": 0.5231478343387785, + "grad_norm": 2.003617297972475, + "learning_rate": 9.736547735471539e-07, + "loss": 1.1558, + "step": 3859 + }, + { + "epoch": 0.5232833999864435, + "grad_norm": 1.86969729891009, + "learning_rate": 9.732157906080343e-07, + "loss": 1.1909, + "step": 3860 + }, + { + "epoch": 0.5234189656341083, + "grad_norm": 2.7445287136028056, + "learning_rate": 9.727768128340375e-07, + "loss": 1.1419, + "step": 3861 + }, + { + "epoch": 0.5235545312817732, + "grad_norm": 2.295078782439458, + "learning_rate": 
9.72337840309818e-07, + "loss": 1.17, + "step": 3862 + }, + { + "epoch": 0.5236900969294381, + "grad_norm": 1.9362682388765455, + "learning_rate": 9.718988731200271e-07, + "loss": 1.182, + "step": 3863 + }, + { + "epoch": 0.523825662577103, + "grad_norm": 1.7570191954746537, + "learning_rate": 9.714599113493171e-07, + "loss": 1.1511, + "step": 3864 + }, + { + "epoch": 0.5239612282247679, + "grad_norm": 2.321946192971077, + "learning_rate": 9.710209550823375e-07, + "loss": 1.1578, + "step": 3865 + }, + { + "epoch": 0.5240967938724327, + "grad_norm": 1.7378177941243627, + "learning_rate": 9.705820044037387e-07, + "loss": 1.1582, + "step": 3866 + }, + { + "epoch": 0.5242323595200976, + "grad_norm": 1.8870448972754446, + "learning_rate": 9.701430593981674e-07, + "loss": 1.1841, + "step": 3867 + }, + { + "epoch": 0.5243679251677625, + "grad_norm": 2.078855077335207, + "learning_rate": 9.697041201502718e-07, + "loss": 1.2247, + "step": 3868 + }, + { + "epoch": 0.5245034908154274, + "grad_norm": 2.4855573862474225, + "learning_rate": 9.692651867446973e-07, + "loss": 1.1824, + "step": 3869 + }, + { + "epoch": 0.5246390564630923, + "grad_norm": 1.6636312729045626, + "learning_rate": 9.688262592660893e-07, + "loss": 1.1362, + "step": 3870 + }, + { + "epoch": 0.5247746221107571, + "grad_norm": 1.743663659886543, + "learning_rate": 9.68387337799091e-07, + "loss": 1.1459, + "step": 3871 + }, + { + "epoch": 0.524910187758422, + "grad_norm": 1.8222087675101324, + "learning_rate": 9.679484224283447e-07, + "loss": 1.1969, + "step": 3872 + }, + { + "epoch": 0.525045753406087, + "grad_norm": 2.2386276650971184, + "learning_rate": 9.675095132384927e-07, + "loss": 1.1335, + "step": 3873 + }, + { + "epoch": 0.5251813190537518, + "grad_norm": 1.7670967319696007, + "learning_rate": 9.67070610314174e-07, + "loss": 1.1551, + "step": 3874 + }, + { + "epoch": 0.5253168847014167, + "grad_norm": 2.0409590473570423, + "learning_rate": 9.666317137400287e-07, + "loss": 1.1457, + "step": 3875 + }, + { + "epoch": 0.5254524503490815, + "grad_norm": 1.967307974473764, + "learning_rate": 9.661928236006936e-07, + "loss": 1.1059, + "step": 3876 + }, + { + "epoch": 0.5255880159967464, + "grad_norm": 2.007795238208127, + "learning_rate": 9.65753939980806e-07, + "loss": 1.2131, + "step": 3877 + }, + { + "epoch": 0.5257235816444114, + "grad_norm": 1.6898480032203056, + "learning_rate": 9.653150629650004e-07, + "loss": 1.137, + "step": 3878 + }, + { + "epoch": 0.5258591472920762, + "grad_norm": 1.953478395713701, + "learning_rate": 9.648761926379112e-07, + "loss": 1.1764, + "step": 3879 + }, + { + "epoch": 0.5259947129397411, + "grad_norm": 1.6529631153096045, + "learning_rate": 9.644373290841712e-07, + "loss": 1.1695, + "step": 3880 + }, + { + "epoch": 0.5261302785874059, + "grad_norm": 1.6903495305875647, + "learning_rate": 9.639984723884112e-07, + "loss": 1.1571, + "step": 3881 + }, + { + "epoch": 0.5262658442350708, + "grad_norm": 1.772707123394453, + "learning_rate": 9.635596226352618e-07, + "loss": 1.1674, + "step": 3882 + }, + { + "epoch": 0.5264014098827358, + "grad_norm": 2.565357976860703, + "learning_rate": 9.63120779909352e-07, + "loss": 1.1729, + "step": 3883 + }, + { + "epoch": 0.5265369755304006, + "grad_norm": 1.6289301274390364, + "learning_rate": 9.626819442953081e-07, + "loss": 1.1772, + "step": 3884 + }, + { + "epoch": 0.5266725411780655, + "grad_norm": 1.7114666389270514, + "learning_rate": 9.622431158777568e-07, + "loss": 1.146, + "step": 3885 + }, + { + "epoch": 0.5268081068257303, + "grad_norm": 
2.36395062446654, + "learning_rate": 9.618042947413228e-07, + "loss": 1.1672, + "step": 3886 + }, + { + "epoch": 0.5269436724733952, + "grad_norm": 2.050378102474221, + "learning_rate": 9.613654809706288e-07, + "loss": 1.1393, + "step": 3887 + }, + { + "epoch": 0.5270792381210602, + "grad_norm": 3.3276476425220243, + "learning_rate": 9.60926674650297e-07, + "loss": 1.1541, + "step": 3888 + }, + { + "epoch": 0.527214803768725, + "grad_norm": 1.8439043814963074, + "learning_rate": 9.604878758649472e-07, + "loss": 1.1565, + "step": 3889 + }, + { + "epoch": 0.5273503694163899, + "grad_norm": 2.489770595085262, + "learning_rate": 9.60049084699199e-07, + "loss": 1.1258, + "step": 3890 + }, + { + "epoch": 0.5274859350640547, + "grad_norm": 2.1365473961059833, + "learning_rate": 9.596103012376695e-07, + "loss": 1.1167, + "step": 3891 + }, + { + "epoch": 0.5276215007117196, + "grad_norm": 1.629392199055481, + "learning_rate": 9.591715255649746e-07, + "loss": 1.1578, + "step": 3892 + }, + { + "epoch": 0.5277570663593846, + "grad_norm": 2.1893949470075733, + "learning_rate": 9.587327577657283e-07, + "loss": 1.1688, + "step": 3893 + }, + { + "epoch": 0.5278926320070494, + "grad_norm": 2.0251919420040587, + "learning_rate": 9.582939979245444e-07, + "loss": 1.1895, + "step": 3894 + }, + { + "epoch": 0.5280281976547143, + "grad_norm": 3.008281551303671, + "learning_rate": 9.578552461260335e-07, + "loss": 1.155, + "step": 3895 + }, + { + "epoch": 0.5281637633023791, + "grad_norm": 1.8011090289437504, + "learning_rate": 9.57416502454806e-07, + "loss": 1.1759, + "step": 3896 + }, + { + "epoch": 0.5282993289500441, + "grad_norm": 1.561590959135582, + "learning_rate": 9.569777669954693e-07, + "loss": 1.1733, + "step": 3897 + }, + { + "epoch": 0.528434894597709, + "grad_norm": 2.0560294114843454, + "learning_rate": 9.565390398326312e-07, + "loss": 1.1598, + "step": 3898 + }, + { + "epoch": 0.5285704602453738, + "grad_norm": 1.662665317210651, + "learning_rate": 9.561003210508963e-07, + "loss": 1.1646, + "step": 3899 + }, + { + "epoch": 0.5287060258930387, + "grad_norm": 1.6813021712566156, + "learning_rate": 9.556616107348675e-07, + "loss": 1.1342, + "step": 3900 + }, + { + "epoch": 0.5288415915407035, + "grad_norm": 2.0597035102487213, + "learning_rate": 9.552229089691474e-07, + "loss": 1.1545, + "step": 3901 + }, + { + "epoch": 0.5289771571883685, + "grad_norm": 2.069910080574056, + "learning_rate": 9.547842158383354e-07, + "loss": 1.1477, + "step": 3902 + }, + { + "epoch": 0.5291127228360334, + "grad_norm": 1.7824219815606532, + "learning_rate": 9.54345531427031e-07, + "loss": 1.1452, + "step": 3903 + }, + { + "epoch": 0.5292482884836982, + "grad_norm": 1.5868520997164586, + "learning_rate": 9.539068558198301e-07, + "loss": 1.1821, + "step": 3904 + }, + { + "epoch": 0.5293838541313631, + "grad_norm": 2.0249458729432632, + "learning_rate": 9.534681891013286e-07, + "loss": 1.1754, + "step": 3905 + }, + { + "epoch": 0.5295194197790279, + "grad_norm": 2.1429233188482835, + "learning_rate": 9.530295313561192e-07, + "loss": 1.1609, + "step": 3906 + }, + { + "epoch": 0.5296549854266929, + "grad_norm": 1.7570390123517756, + "learning_rate": 9.525908826687943e-07, + "loss": 1.1857, + "step": 3907 + }, + { + "epoch": 0.5297905510743578, + "grad_norm": 1.835383774823942, + "learning_rate": 9.521522431239429e-07, + "loss": 1.1358, + "step": 3908 + }, + { + "epoch": 0.5299261167220226, + "grad_norm": 11.266175033298948, + "learning_rate": 9.517136128061543e-07, + "loss": 1.1311, + "step": 3909 + }, + { + "epoch": 
0.5300616823696875, + "grad_norm": 1.644158173037082, + "learning_rate": 9.51274991800014e-07, + "loss": 1.1139, + "step": 3910 + }, + { + "epoch": 0.5301972480173524, + "grad_norm": 1.6393225076643134, + "learning_rate": 9.508363801901069e-07, + "loss": 1.1598, + "step": 3911 + }, + { + "epoch": 0.5303328136650173, + "grad_norm": 1.6586039582283496, + "learning_rate": 9.50397778061016e-07, + "loss": 1.1664, + "step": 3912 + }, + { + "epoch": 0.5304683793126822, + "grad_norm": 1.8291991193483526, + "learning_rate": 9.49959185497322e-07, + "loss": 1.0772, + "step": 3913 + }, + { + "epoch": 0.530603944960347, + "grad_norm": 1.9594479862238152, + "learning_rate": 9.49520602583604e-07, + "loss": 1.1509, + "step": 3914 + }, + { + "epoch": 0.5307395106080119, + "grad_norm": 1.7860179501560682, + "learning_rate": 9.490820294044394e-07, + "loss": 1.144, + "step": 3915 + }, + { + "epoch": 0.5308750762556768, + "grad_norm": 2.759733084665647, + "learning_rate": 9.486434660444034e-07, + "loss": 1.1997, + "step": 3916 + }, + { + "epoch": 0.5310106419033417, + "grad_norm": 2.4515755019315395, + "learning_rate": 9.482049125880697e-07, + "loss": 1.1937, + "step": 3917 + }, + { + "epoch": 0.5311462075510066, + "grad_norm": 1.9145866215496212, + "learning_rate": 9.477663691200099e-07, + "loss": 1.1508, + "step": 3918 + }, + { + "epoch": 0.5312817731986714, + "grad_norm": 2.1001718508086977, + "learning_rate": 9.47327835724793e-07, + "loss": 1.1276, + "step": 3919 + }, + { + "epoch": 0.5314173388463364, + "grad_norm": 1.9411261222281873, + "learning_rate": 9.468893124869878e-07, + "loss": 1.1618, + "step": 3920 + }, + { + "epoch": 0.5315529044940012, + "grad_norm": 2.083071525567621, + "learning_rate": 9.464507994911589e-07, + "loss": 1.1491, + "step": 3921 + }, + { + "epoch": 0.5316884701416661, + "grad_norm": 2.427199060083555, + "learning_rate": 9.460122968218711e-07, + "loss": 1.1303, + "step": 3922 + }, + { + "epoch": 0.531824035789331, + "grad_norm": 2.1886677849175618, + "learning_rate": 9.455738045636853e-07, + "loss": 1.1261, + "step": 3923 + }, + { + "epoch": 0.5319596014369958, + "grad_norm": 1.828614395019273, + "learning_rate": 9.451353228011622e-07, + "loss": 1.1326, + "step": 3924 + }, + { + "epoch": 0.5320951670846608, + "grad_norm": 1.6378103782659563, + "learning_rate": 9.446968516188584e-07, + "loss": 1.1504, + "step": 3925 + }, + { + "epoch": 0.5322307327323256, + "grad_norm": 1.8892841837324164, + "learning_rate": 9.442583911013308e-07, + "loss": 1.1279, + "step": 3926 + }, + { + "epoch": 0.5323662983799905, + "grad_norm": 3.162497031140194, + "learning_rate": 9.438199413331323e-07, + "loss": 1.1522, + "step": 3927 + }, + { + "epoch": 0.5325018640276554, + "grad_norm": 1.7101548995267017, + "learning_rate": 9.433815023988144e-07, + "loss": 1.1698, + "step": 3928 + }, + { + "epoch": 0.5326374296753202, + "grad_norm": 1.725041612841162, + "learning_rate": 9.429430743829272e-07, + "loss": 1.1104, + "step": 3929 + }, + { + "epoch": 0.5327729953229852, + "grad_norm": 1.7502096545763368, + "learning_rate": 9.425046573700174e-07, + "loss": 1.1006, + "step": 3930 + }, + { + "epoch": 0.5329085609706501, + "grad_norm": 2.0239104497379916, + "learning_rate": 9.420662514446309e-07, + "loss": 1.1545, + "step": 3931 + }, + { + "epoch": 0.5330441266183149, + "grad_norm": 2.6365039530592123, + "learning_rate": 9.4162785669131e-07, + "loss": 1.1376, + "step": 3932 + }, + { + "epoch": 0.5331796922659798, + "grad_norm": 2.002531945208696, + "learning_rate": 9.411894731945968e-07, + "loss": 1.1838, + 
"step": 3933 + }, + { + "epoch": 0.5333152579136446, + "grad_norm": 1.7294116794971193, + "learning_rate": 9.40751101039029e-07, + "loss": 1.1612, + "step": 3934 + }, + { + "epoch": 0.5334508235613096, + "grad_norm": 1.6628381460344257, + "learning_rate": 9.403127403091441e-07, + "loss": 1.1502, + "step": 3935 + }, + { + "epoch": 0.5335863892089745, + "grad_norm": 1.730533042395271, + "learning_rate": 9.398743910894755e-07, + "loss": 1.1917, + "step": 3936 + }, + { + "epoch": 0.5337219548566393, + "grad_norm": 7.725797766439608, + "learning_rate": 9.394360534645566e-07, + "loss": 1.149, + "step": 3937 + }, + { + "epoch": 0.5338575205043042, + "grad_norm": 5.250292056293935, + "learning_rate": 9.389977275189163e-07, + "loss": 1.1487, + "step": 3938 + }, + { + "epoch": 0.533993086151969, + "grad_norm": 1.626747147821128, + "learning_rate": 9.38559413337083e-07, + "loss": 1.1656, + "step": 3939 + }, + { + "epoch": 0.534128651799634, + "grad_norm": 1.9807970888330417, + "learning_rate": 9.381211110035819e-07, + "loss": 1.1657, + "step": 3940 + }, + { + "epoch": 0.5342642174472989, + "grad_norm": 1.7495472315461091, + "learning_rate": 9.376828206029358e-07, + "loss": 1.1446, + "step": 3941 + }, + { + "epoch": 0.5343997830949637, + "grad_norm": 1.8405251592261824, + "learning_rate": 9.372445422196662e-07, + "loss": 1.1377, + "step": 3942 + }, + { + "epoch": 0.5345353487426286, + "grad_norm": 2.3183189782086737, + "learning_rate": 9.368062759382908e-07, + "loss": 1.1459, + "step": 3943 + }, + { + "epoch": 0.5346709143902935, + "grad_norm": 2.4427226612918735, + "learning_rate": 9.363680218433267e-07, + "loss": 1.1728, + "step": 3944 + }, + { + "epoch": 0.5348064800379584, + "grad_norm": 2.736891612544138, + "learning_rate": 9.359297800192871e-07, + "loss": 1.1645, + "step": 3945 + }, + { + "epoch": 0.5349420456856233, + "grad_norm": 1.9712178019335544, + "learning_rate": 9.354915505506838e-07, + "loss": 1.173, + "step": 3946 + }, + { + "epoch": 0.5350776113332881, + "grad_norm": 2.8215592792774427, + "learning_rate": 9.350533335220256e-07, + "loss": 1.1692, + "step": 3947 + }, + { + "epoch": 0.535213176980953, + "grad_norm": 2.252994839763575, + "learning_rate": 9.346151290178195e-07, + "loss": 1.1529, + "step": 3948 + }, + { + "epoch": 0.5353487426286179, + "grad_norm": 2.442331938523169, + "learning_rate": 9.341769371225696e-07, + "loss": 1.1491, + "step": 3949 + }, + { + "epoch": 0.5354843082762828, + "grad_norm": 1.9663848592315654, + "learning_rate": 9.337387579207779e-07, + "loss": 1.1407, + "step": 3950 + }, + { + "epoch": 0.5356198739239477, + "grad_norm": 1.9723204310187934, + "learning_rate": 9.333005914969434e-07, + "loss": 1.1537, + "step": 3951 + }, + { + "epoch": 0.5357554395716125, + "grad_norm": 1.7988012777540519, + "learning_rate": 9.328624379355639e-07, + "loss": 1.1479, + "step": 3952 + }, + { + "epoch": 0.5358910052192775, + "grad_norm": 4.492791420798566, + "learning_rate": 9.324242973211326e-07, + "loss": 1.1243, + "step": 3953 + }, + { + "epoch": 0.5360265708669423, + "grad_norm": 1.7655613929709657, + "learning_rate": 9.319861697381427e-07, + "loss": 1.153, + "step": 3954 + }, + { + "epoch": 0.5361621365146072, + "grad_norm": 1.9062869485789593, + "learning_rate": 9.315480552710832e-07, + "loss": 1.0979, + "step": 3955 + }, + { + "epoch": 0.5362977021622721, + "grad_norm": 2.7258596458919655, + "learning_rate": 9.311099540044402e-07, + "loss": 1.1522, + "step": 3956 + }, + { + "epoch": 0.5364332678099369, + "grad_norm": 1.9366625384331388, + "learning_rate": 
9.306718660226996e-07, + "loss": 1.1859, + "step": 3957 + }, + { + "epoch": 0.5365688334576019, + "grad_norm": 1.9151397647424866, + "learning_rate": 9.302337914103416e-07, + "loss": 1.1793, + "step": 3958 + }, + { + "epoch": 0.5367043991052667, + "grad_norm": 1.689885347562821, + "learning_rate": 9.297957302518469e-07, + "loss": 1.2227, + "step": 3959 + }, + { + "epoch": 0.5368399647529316, + "grad_norm": 1.6621848558963304, + "learning_rate": 9.293576826316909e-07, + "loss": 1.1333, + "step": 3960 + }, + { + "epoch": 0.5369755304005965, + "grad_norm": 2.0372425072080302, + "learning_rate": 9.289196486343487e-07, + "loss": 1.156, + "step": 3961 + }, + { + "epoch": 0.5371110960482613, + "grad_norm": 2.0047604577092155, + "learning_rate": 9.284816283442907e-07, + "loss": 1.1585, + "step": 3962 + }, + { + "epoch": 0.5372466616959263, + "grad_norm": 1.855096840478235, + "learning_rate": 9.280436218459866e-07, + "loss": 1.1459, + "step": 3963 + }, + { + "epoch": 0.5373822273435911, + "grad_norm": 2.161478541796894, + "learning_rate": 9.276056292239016e-07, + "loss": 1.179, + "step": 3964 + }, + { + "epoch": 0.537517792991256, + "grad_norm": 2.098432715642182, + "learning_rate": 9.271676505625e-07, + "loss": 1.145, + "step": 3965 + }, + { + "epoch": 0.5376533586389209, + "grad_norm": 2.539661067136378, + "learning_rate": 9.267296859462416e-07, + "loss": 1.1477, + "step": 3966 + }, + { + "epoch": 0.5377889242865858, + "grad_norm": 1.614720189109032, + "learning_rate": 9.262917354595854e-07, + "loss": 1.1893, + "step": 3967 + }, + { + "epoch": 0.5379244899342507, + "grad_norm": 1.7896301326244275, + "learning_rate": 9.258537991869861e-07, + "loss": 1.1481, + "step": 3968 + }, + { + "epoch": 0.5380600555819155, + "grad_norm": 3.0053973549568602, + "learning_rate": 9.254158772128961e-07, + "loss": 1.1438, + "step": 3969 + }, + { + "epoch": 0.5381956212295804, + "grad_norm": 1.8659802329932114, + "learning_rate": 9.249779696217658e-07, + "loss": 1.1862, + "step": 3970 + }, + { + "epoch": 0.5383311868772453, + "grad_norm": 2.0946639221030834, + "learning_rate": 9.245400764980413e-07, + "loss": 1.1087, + "step": 3971 + }, + { + "epoch": 0.5384667525249102, + "grad_norm": 2.5842340534017882, + "learning_rate": 9.241021979261681e-07, + "loss": 1.1661, + "step": 3972 + }, + { + "epoch": 0.5386023181725751, + "grad_norm": 2.2535046755290775, + "learning_rate": 9.236643339905863e-07, + "loss": 1.1477, + "step": 3973 + }, + { + "epoch": 0.5387378838202399, + "grad_norm": 2.120575385699149, + "learning_rate": 9.232264847757356e-07, + "loss": 1.1788, + "step": 3974 + }, + { + "epoch": 0.5388734494679048, + "grad_norm": 2.764844495560589, + "learning_rate": 9.227886503660509e-07, + "loss": 1.1217, + "step": 3975 + }, + { + "epoch": 0.5390090151155698, + "grad_norm": 2.227074039211462, + "learning_rate": 9.223508308459659e-07, + "loss": 1.1514, + "step": 3976 + }, + { + "epoch": 0.5391445807632346, + "grad_norm": 2.218991580203107, + "learning_rate": 9.219130262999101e-07, + "loss": 1.1482, + "step": 3977 + }, + { + "epoch": 0.5392801464108995, + "grad_norm": 1.6729539193679195, + "learning_rate": 9.214752368123107e-07, + "loss": 1.1667, + "step": 3978 + }, + { + "epoch": 0.5394157120585643, + "grad_norm": 2.214045393306334, + "learning_rate": 9.21037462467592e-07, + "loss": 1.1491, + "step": 3979 + }, + { + "epoch": 0.5395512777062292, + "grad_norm": 1.802246725014951, + "learning_rate": 9.205997033501756e-07, + "loss": 1.1424, + "step": 3980 + }, + { + "epoch": 0.5396868433538942, + "grad_norm": 
2.435531072654333, + "learning_rate": 9.201619595444795e-07, + "loss": 1.2005, + "step": 3981 + }, + { + "epoch": 0.539822409001559, + "grad_norm": 2.126317631782132, + "learning_rate": 9.197242311349195e-07, + "loss": 1.1638, + "step": 3982 + }, + { + "epoch": 0.5399579746492239, + "grad_norm": 1.784345681374923, + "learning_rate": 9.192865182059077e-07, + "loss": 1.1379, + "step": 3983 + }, + { + "epoch": 0.5400935402968887, + "grad_norm": 1.8049882125848895, + "learning_rate": 9.188488208418538e-07, + "loss": 1.1172, + "step": 3984 + }, + { + "epoch": 0.5402291059445536, + "grad_norm": 1.561547284286687, + "learning_rate": 9.184111391271642e-07, + "loss": 1.1244, + "step": 3985 + }, + { + "epoch": 0.5403646715922186, + "grad_norm": 2.2009225908239536, + "learning_rate": 9.179734731462423e-07, + "loss": 1.1392, + "step": 3986 + }, + { + "epoch": 0.5405002372398834, + "grad_norm": 1.888604877240904, + "learning_rate": 9.175358229834888e-07, + "loss": 1.1031, + "step": 3987 + }, + { + "epoch": 0.5406358028875483, + "grad_norm": 1.9563007960672518, + "learning_rate": 9.170981887233007e-07, + "loss": 1.17, + "step": 3988 + }, + { + "epoch": 0.5407713685352131, + "grad_norm": 2.291736121649058, + "learning_rate": 9.166605704500728e-07, + "loss": 1.1218, + "step": 3989 + }, + { + "epoch": 0.540906934182878, + "grad_norm": 2.31826710033524, + "learning_rate": 9.162229682481957e-07, + "loss": 1.1446, + "step": 3990 + }, + { + "epoch": 0.541042499830543, + "grad_norm": 2.7261013095553506, + "learning_rate": 9.157853822020582e-07, + "loss": 1.1672, + "step": 3991 + }, + { + "epoch": 0.5411780654782078, + "grad_norm": 2.0172504268100266, + "learning_rate": 9.153478123960446e-07, + "loss": 1.1475, + "step": 3992 + }, + { + "epoch": 0.5413136311258727, + "grad_norm": 1.835190786243105, + "learning_rate": 9.149102589145376e-07, + "loss": 1.1655, + "step": 3993 + }, + { + "epoch": 0.5414491967735375, + "grad_norm": 8.146395426603187, + "learning_rate": 9.144727218419151e-07, + "loss": 1.1923, + "step": 3994 + }, + { + "epoch": 0.5415847624212025, + "grad_norm": 1.7047740094953332, + "learning_rate": 9.140352012625536e-07, + "loss": 1.1389, + "step": 3995 + }, + { + "epoch": 0.5417203280688674, + "grad_norm": 1.7289372665710414, + "learning_rate": 9.135976972608248e-07, + "loss": 1.1622, + "step": 3996 + }, + { + "epoch": 0.5418558937165322, + "grad_norm": 1.989194340227825, + "learning_rate": 9.131602099210978e-07, + "loss": 1.1911, + "step": 3997 + }, + { + "epoch": 0.5419914593641971, + "grad_norm": 1.966957733628747, + "learning_rate": 9.127227393277391e-07, + "loss": 1.1713, + "step": 3998 + }, + { + "epoch": 0.5421270250118619, + "grad_norm": 1.684588769283311, + "learning_rate": 9.12285285565111e-07, + "loss": 1.1448, + "step": 3999 + }, + { + "epoch": 0.5422625906595269, + "grad_norm": 2.265219037481919, + "learning_rate": 9.118478487175735e-07, + "loss": 1.1601, + "step": 4000 + }, + { + "epoch": 0.5423981563071918, + "grad_norm": 1.6358640611060304, + "learning_rate": 9.114104288694821e-07, + "loss": 1.1631, + "step": 4001 + }, + { + "epoch": 0.5425337219548566, + "grad_norm": 1.7558898024471052, + "learning_rate": 9.109730261051905e-07, + "loss": 1.168, + "step": 4002 + }, + { + "epoch": 0.5426692876025215, + "grad_norm": 3.18835816088336, + "learning_rate": 9.105356405090479e-07, + "loss": 1.1444, + "step": 4003 + }, + { + "epoch": 0.5428048532501863, + "grad_norm": 1.8789862554476866, + "learning_rate": 9.100982721654011e-07, + "loss": 1.1549, + "step": 4004 + }, + { + "epoch": 
0.5429404188978513, + "grad_norm": 1.8507390206859957, + "learning_rate": 9.096609211585926e-07, + "loss": 1.1583, + "step": 4005 + }, + { + "epoch": 0.5430759845455162, + "grad_norm": 1.5913993596163802, + "learning_rate": 9.092235875729627e-07, + "loss": 1.1503, + "step": 4006 + }, + { + "epoch": 0.543211550193181, + "grad_norm": 1.9111326440223608, + "learning_rate": 9.087862714928471e-07, + "loss": 1.1248, + "step": 4007 + }, + { + "epoch": 0.5433471158408459, + "grad_norm": 1.939347221641338, + "learning_rate": 9.083489730025791e-07, + "loss": 1.1547, + "step": 4008 + }, + { + "epoch": 0.5434826814885109, + "grad_norm": 1.8831843288587302, + "learning_rate": 9.079116921864883e-07, + "loss": 1.1557, + "step": 4009 + }, + { + "epoch": 0.5436182471361757, + "grad_norm": 1.7732099035764706, + "learning_rate": 9.074744291289007e-07, + "loss": 1.1334, + "step": 4010 + }, + { + "epoch": 0.5437538127838406, + "grad_norm": 1.7272197960958413, + "learning_rate": 9.070371839141393e-07, + "loss": 1.1172, + "step": 4011 + }, + { + "epoch": 0.5438893784315054, + "grad_norm": 2.0167683385103885, + "learning_rate": 9.065999566265229e-07, + "loss": 1.1515, + "step": 4012 + }, + { + "epoch": 0.5440249440791703, + "grad_norm": 5.687485359374524, + "learning_rate": 9.061627473503677e-07, + "loss": 1.1899, + "step": 4013 + }, + { + "epoch": 0.5441605097268353, + "grad_norm": 1.8477553275800422, + "learning_rate": 9.057255561699859e-07, + "loss": 1.1601, + "step": 4014 + }, + { + "epoch": 0.5442960753745001, + "grad_norm": 1.7910149689769725, + "learning_rate": 9.052883831696865e-07, + "loss": 1.1247, + "step": 4015 + }, + { + "epoch": 0.544431641022165, + "grad_norm": 2.138424565580996, + "learning_rate": 9.048512284337747e-07, + "loss": 1.1556, + "step": 4016 + }, + { + "epoch": 0.5445672066698298, + "grad_norm": 3.7590250796959728, + "learning_rate": 9.044140920465529e-07, + "loss": 1.1773, + "step": 4017 + }, + { + "epoch": 0.5447027723174948, + "grad_norm": 2.340946833302093, + "learning_rate": 9.039769740923182e-07, + "loss": 1.147, + "step": 4018 + }, + { + "epoch": 0.5448383379651597, + "grad_norm": 2.5271094897885846, + "learning_rate": 9.035398746553667e-07, + "loss": 1.1358, + "step": 4019 + }, + { + "epoch": 0.5449739036128245, + "grad_norm": 1.7089076028502639, + "learning_rate": 9.031027938199884e-07, + "loss": 1.1637, + "step": 4020 + }, + { + "epoch": 0.5451094692604894, + "grad_norm": 2.148308689003966, + "learning_rate": 9.02665731670472e-07, + "loss": 1.1366, + "step": 4021 + }, + { + "epoch": 0.5452450349081542, + "grad_norm": 1.8147857478342744, + "learning_rate": 9.022286882911005e-07, + "loss": 1.1683, + "step": 4022 + }, + { + "epoch": 0.5453806005558192, + "grad_norm": 1.9727189284701496, + "learning_rate": 9.01791663766155e-07, + "loss": 1.1482, + "step": 4023 + }, + { + "epoch": 0.5455161662034841, + "grad_norm": 2.079742600079112, + "learning_rate": 9.01354658179912e-07, + "loss": 1.1277, + "step": 4024 + }, + { + "epoch": 0.5456517318511489, + "grad_norm": 2.2103746759816527, + "learning_rate": 9.009176716166442e-07, + "loss": 1.1448, + "step": 4025 + }, + { + "epoch": 0.5457872974988138, + "grad_norm": 1.600524006660917, + "learning_rate": 9.004807041606217e-07, + "loss": 1.1477, + "step": 4026 + }, + { + "epoch": 0.5459228631464786, + "grad_norm": 2.0955662395417867, + "learning_rate": 9.000437558961094e-07, + "loss": 1.1064, + "step": 4027 + }, + { + "epoch": 0.5460584287941436, + "grad_norm": 1.7347865109350258, + "learning_rate": 8.996068269073701e-07, + "loss": 1.1478, + 
"step": 4028 + }, + { + "epoch": 0.5461939944418085, + "grad_norm": 2.1352083831512694, + "learning_rate": 8.991699172786614e-07, + "loss": 1.1162, + "step": 4029 + }, + { + "epoch": 0.5463295600894733, + "grad_norm": 1.8501892687019712, + "learning_rate": 8.987330270942388e-07, + "loss": 1.1982, + "step": 4030 + }, + { + "epoch": 0.5464651257371382, + "grad_norm": 1.8901804841288476, + "learning_rate": 8.98296156438352e-07, + "loss": 1.1545, + "step": 4031 + }, + { + "epoch": 0.546600691384803, + "grad_norm": 2.7160132799542005, + "learning_rate": 8.978593053952492e-07, + "loss": 1.1358, + "step": 4032 + }, + { + "epoch": 0.546736257032468, + "grad_norm": 1.8148062523472317, + "learning_rate": 8.974224740491725e-07, + "loss": 1.16, + "step": 4033 + }, + { + "epoch": 0.5468718226801329, + "grad_norm": 2.6886519737162256, + "learning_rate": 8.969856624843625e-07, + "loss": 1.1539, + "step": 4034 + }, + { + "epoch": 0.5470073883277977, + "grad_norm": 2.2522310237042547, + "learning_rate": 8.965488707850539e-07, + "loss": 1.1529, + "step": 4035 + }, + { + "epoch": 0.5471429539754626, + "grad_norm": 1.6128628553917401, + "learning_rate": 8.961120990354794e-07, + "loss": 1.167, + "step": 4036 + }, + { + "epoch": 0.5472785196231275, + "grad_norm": 1.7111860897896043, + "learning_rate": 8.956753473198662e-07, + "loss": 1.1633, + "step": 4037 + }, + { + "epoch": 0.5474140852707924, + "grad_norm": 1.9762713309980982, + "learning_rate": 8.952386157224391e-07, + "loss": 1.1485, + "step": 4038 + }, + { + "epoch": 0.5475496509184573, + "grad_norm": 1.6025263683661857, + "learning_rate": 8.948019043274181e-07, + "loss": 1.1451, + "step": 4039 + }, + { + "epoch": 0.5476852165661221, + "grad_norm": 1.9267104203293506, + "learning_rate": 8.943652132190189e-07, + "loss": 1.1677, + "step": 4040 + }, + { + "epoch": 0.547820782213787, + "grad_norm": 4.97237112980809, + "learning_rate": 8.939285424814551e-07, + "loss": 1.1612, + "step": 4041 + }, + { + "epoch": 0.5479563478614519, + "grad_norm": 1.7149689675583335, + "learning_rate": 8.934918921989341e-07, + "loss": 1.1488, + "step": 4042 + }, + { + "epoch": 0.5480919135091168, + "grad_norm": 1.8455077869712422, + "learning_rate": 8.930552624556615e-07, + "loss": 1.1559, + "step": 4043 + }, + { + "epoch": 0.5482274791567817, + "grad_norm": 1.886256888594135, + "learning_rate": 8.92618653335837e-07, + "loss": 1.1619, + "step": 4044 + }, + { + "epoch": 0.5483630448044465, + "grad_norm": 2.121107700943377, + "learning_rate": 8.921820649236576e-07, + "loss": 1.1425, + "step": 4045 + }, + { + "epoch": 0.5484986104521115, + "grad_norm": 2.74824815023826, + "learning_rate": 8.917454973033161e-07, + "loss": 1.1347, + "step": 4046 + }, + { + "epoch": 0.5486341760997763, + "grad_norm": 2.285362356067566, + "learning_rate": 8.913089505590007e-07, + "loss": 1.1678, + "step": 4047 + }, + { + "epoch": 0.5487697417474412, + "grad_norm": 2.0474739662767543, + "learning_rate": 8.908724247748963e-07, + "loss": 1.1511, + "step": 4048 + }, + { + "epoch": 0.5489053073951061, + "grad_norm": 1.9499177296850623, + "learning_rate": 8.904359200351837e-07, + "loss": 1.1882, + "step": 4049 + }, + { + "epoch": 0.5490408730427709, + "grad_norm": 2.0145667716415594, + "learning_rate": 8.899994364240385e-07, + "loss": 1.1512, + "step": 4050 + }, + { + "epoch": 0.5491764386904359, + "grad_norm": 1.9511566937721592, + "learning_rate": 8.895629740256343e-07, + "loss": 1.1368, + "step": 4051 + }, + { + "epoch": 0.5493120043381007, + "grad_norm": 1.6905636404994728, + "learning_rate": 
8.891265329241387e-07, + "loss": 1.2179, + "step": 4052 + }, + { + "epoch": 0.5494475699857656, + "grad_norm": 3.6129867011251764, + "learning_rate": 8.886901132037155e-07, + "loss": 1.1931, + "step": 4053 + }, + { + "epoch": 0.5495831356334305, + "grad_norm": 1.9121818227399439, + "learning_rate": 8.88253714948526e-07, + "loss": 1.1722, + "step": 4054 + }, + { + "epoch": 0.5497187012810953, + "grad_norm": 2.295819256502725, + "learning_rate": 8.87817338242725e-07, + "loss": 1.1874, + "step": 4055 + }, + { + "epoch": 0.5498542669287603, + "grad_norm": 3.4241961303674233, + "learning_rate": 8.873809831704652e-07, + "loss": 1.1616, + "step": 4056 + }, + { + "epoch": 0.5499898325764251, + "grad_norm": 8.19795869393772, + "learning_rate": 8.869446498158935e-07, + "loss": 1.1375, + "step": 4057 + }, + { + "epoch": 0.55012539822409, + "grad_norm": 2.3466681337682593, + "learning_rate": 8.865083382631539e-07, + "loss": 1.1568, + "step": 4058 + }, + { + "epoch": 0.5502609638717549, + "grad_norm": 1.7343499527998456, + "learning_rate": 8.860720485963851e-07, + "loss": 1.1579, + "step": 4059 + }, + { + "epoch": 0.5503965295194198, + "grad_norm": 1.533458947023769, + "learning_rate": 8.856357808997229e-07, + "loss": 1.1678, + "step": 4060 + }, + { + "epoch": 0.5505320951670847, + "grad_norm": 1.8684881729971219, + "learning_rate": 8.851995352572972e-07, + "loss": 1.1725, + "step": 4061 + }, + { + "epoch": 0.5506676608147495, + "grad_norm": 2.3535850036054478, + "learning_rate": 8.847633117532353e-07, + "loss": 1.1525, + "step": 4062 + }, + { + "epoch": 0.5508032264624144, + "grad_norm": 1.871336395615346, + "learning_rate": 8.843271104716588e-07, + "loss": 1.202, + "step": 4063 + }, + { + "epoch": 0.5509387921100793, + "grad_norm": 1.7739829406558507, + "learning_rate": 8.838909314966863e-07, + "loss": 1.1608, + "step": 4064 + }, + { + "epoch": 0.5510743577577442, + "grad_norm": 2.0094008989550263, + "learning_rate": 8.834547749124307e-07, + "loss": 1.1654, + "step": 4065 + }, + { + "epoch": 0.5512099234054091, + "grad_norm": 2.4504490837382438, + "learning_rate": 8.830186408030023e-07, + "loss": 1.129, + "step": 4066 + }, + { + "epoch": 0.5513454890530739, + "grad_norm": 2.598064420997788, + "learning_rate": 8.825825292525056e-07, + "loss": 1.1914, + "step": 4067 + }, + { + "epoch": 0.5514810547007388, + "grad_norm": 3.029632199653493, + "learning_rate": 8.821464403450408e-07, + "loss": 1.1636, + "step": 4068 + }, + { + "epoch": 0.5516166203484038, + "grad_norm": 2.9797122751265315, + "learning_rate": 8.817103741647052e-07, + "loss": 1.1322, + "step": 4069 + }, + { + "epoch": 0.5517521859960686, + "grad_norm": 1.9296674038295942, + "learning_rate": 8.812743307955899e-07, + "loss": 1.1781, + "step": 4070 + }, + { + "epoch": 0.5518877516437335, + "grad_norm": 1.7570309752179132, + "learning_rate": 8.80838310321783e-07, + "loss": 1.133, + "step": 4071 + }, + { + "epoch": 0.5520233172913983, + "grad_norm": 2.723245508176723, + "learning_rate": 8.80402312827367e-07, + "loss": 1.1394, + "step": 4072 + }, + { + "epoch": 0.5521588829390632, + "grad_norm": 2.4331491431040453, + "learning_rate": 8.799663383964213e-07, + "loss": 1.1253, + "step": 4073 + }, + { + "epoch": 0.5522944485867282, + "grad_norm": 1.7711950577988798, + "learning_rate": 8.795303871130196e-07, + "loss": 1.1583, + "step": 4074 + }, + { + "epoch": 0.552430014234393, + "grad_norm": 2.6551360563992734, + "learning_rate": 8.790944590612318e-07, + "loss": 1.1425, + "step": 4075 + }, + { + "epoch": 0.5525655798820579, + "grad_norm": 
2.028207658494865, + "learning_rate": 8.786585543251232e-07, + "loss": 1.1598, + "step": 4076 + }, + { + "epoch": 0.5527011455297227, + "grad_norm": 4.035594071006749, + "learning_rate": 8.782226729887546e-07, + "loss": 1.1526, + "step": 4077 + }, + { + "epoch": 0.5528367111773876, + "grad_norm": 2.346057236584483, + "learning_rate": 8.777868151361823e-07, + "loss": 1.157, + "step": 4078 + }, + { + "epoch": 0.5529722768250526, + "grad_norm": 1.8880827390116177, + "learning_rate": 8.773509808514581e-07, + "loss": 1.114, + "step": 4079 + }, + { + "epoch": 0.5531078424727174, + "grad_norm": 2.00910221832397, + "learning_rate": 8.769151702186289e-07, + "loss": 1.1658, + "step": 4080 + }, + { + "epoch": 0.5532434081203823, + "grad_norm": 3.180826115856291, + "learning_rate": 8.764793833217377e-07, + "loss": 1.1691, + "step": 4081 + }, + { + "epoch": 0.5533789737680471, + "grad_norm": 1.5052312406271395, + "learning_rate": 8.760436202448223e-07, + "loss": 1.1052, + "step": 4082 + }, + { + "epoch": 0.553514539415712, + "grad_norm": 1.8559431904987356, + "learning_rate": 8.756078810719163e-07, + "loss": 1.1527, + "step": 4083 + }, + { + "epoch": 0.553650105063377, + "grad_norm": 2.0335585835166388, + "learning_rate": 8.751721658870488e-07, + "loss": 1.1515, + "step": 4084 + }, + { + "epoch": 0.5537856707110418, + "grad_norm": 1.6391147052092727, + "learning_rate": 8.747364747742433e-07, + "loss": 1.1256, + "step": 4085 + }, + { + "epoch": 0.5539212363587067, + "grad_norm": 1.9875867772289735, + "learning_rate": 8.743008078175202e-07, + "loss": 1.1722, + "step": 4086 + }, + { + "epoch": 0.5540568020063716, + "grad_norm": 1.7212974137278927, + "learning_rate": 8.73865165100894e-07, + "loss": 1.1511, + "step": 4087 + }, + { + "epoch": 0.5541923676540365, + "grad_norm": 2.106453626665776, + "learning_rate": 8.734295467083752e-07, + "loss": 1.1556, + "step": 4088 + }, + { + "epoch": 0.5543279333017014, + "grad_norm": 1.676378328974611, + "learning_rate": 8.729939527239688e-07, + "loss": 1.1533, + "step": 4089 + }, + { + "epoch": 0.5544634989493662, + "grad_norm": 2.1754067475113077, + "learning_rate": 8.725583832316767e-07, + "loss": 1.1681, + "step": 4090 + }, + { + "epoch": 0.5545990645970311, + "grad_norm": 1.78706312202473, + "learning_rate": 8.721228383154939e-07, + "loss": 1.1705, + "step": 4091 + }, + { + "epoch": 0.554734630244696, + "grad_norm": 1.7170448125240758, + "learning_rate": 8.716873180594128e-07, + "loss": 1.1227, + "step": 4092 + }, + { + "epoch": 0.5548701958923609, + "grad_norm": 1.6836219901506568, + "learning_rate": 8.71251822547419e-07, + "loss": 1.1728, + "step": 4093 + }, + { + "epoch": 0.5550057615400258, + "grad_norm": 1.8391543799386696, + "learning_rate": 8.708163518634956e-07, + "loss": 1.167, + "step": 4094 + }, + { + "epoch": 0.5551413271876906, + "grad_norm": 1.5698969179097118, + "learning_rate": 8.703809060916188e-07, + "loss": 1.143, + "step": 4095 + }, + { + "epoch": 0.5552768928353555, + "grad_norm": 2.6614022338542482, + "learning_rate": 8.699454853157608e-07, + "loss": 1.1594, + "step": 4096 + }, + { + "epoch": 0.5554124584830205, + "grad_norm": 2.0541599813269844, + "learning_rate": 8.695100896198898e-07, + "loss": 1.1553, + "step": 4097 + }, + { + "epoch": 0.5555480241306853, + "grad_norm": 3.0504826605272464, + "learning_rate": 8.690747190879676e-07, + "loss": 1.1669, + "step": 4098 + }, + { + "epoch": 0.5556835897783502, + "grad_norm": 1.9700467730990177, + "learning_rate": 8.686393738039527e-07, + "loss": 1.1348, + "step": 4099 + }, + { + "epoch": 
0.555819155426015, + "grad_norm": 1.875093588733607, + "learning_rate": 8.682040538517973e-07, + "loss": 1.1657, + "step": 4100 + }, + { + "epoch": 0.5559547210736799, + "grad_norm": 1.558047955528762, + "learning_rate": 8.677687593154503e-07, + "loss": 1.1208, + "step": 4101 + }, + { + "epoch": 0.5560902867213449, + "grad_norm": 1.7067895246568312, + "learning_rate": 8.673334902788536e-07, + "loss": 1.1326, + "step": 4102 + }, + { + "epoch": 0.5562258523690097, + "grad_norm": 1.6695585092265002, + "learning_rate": 8.668982468259467e-07, + "loss": 1.1982, + "step": 4103 + }, + { + "epoch": 0.5563614180166746, + "grad_norm": 2.2823752721838497, + "learning_rate": 8.664630290406618e-07, + "loss": 1.1721, + "step": 4104 + }, + { + "epoch": 0.5564969836643394, + "grad_norm": 1.5440629705228632, + "learning_rate": 8.660278370069281e-07, + "loss": 1.1619, + "step": 4105 + }, + { + "epoch": 0.5566325493120043, + "grad_norm": 2.0400390882214587, + "learning_rate": 8.655926708086684e-07, + "loss": 1.1478, + "step": 4106 + }, + { + "epoch": 0.5567681149596693, + "grad_norm": 1.5543539080033355, + "learning_rate": 8.651575305298011e-07, + "loss": 1.1516, + "step": 4107 + }, + { + "epoch": 0.5569036806073341, + "grad_norm": 2.2110606262344183, + "learning_rate": 8.6472241625424e-07, + "loss": 1.1823, + "step": 4108 + }, + { + "epoch": 0.557039246254999, + "grad_norm": 10.02399910840154, + "learning_rate": 8.642873280658924e-07, + "loss": 1.166, + "step": 4109 + }, + { + "epoch": 0.5571748119026638, + "grad_norm": 1.6667100787678981, + "learning_rate": 8.63852266048663e-07, + "loss": 1.1394, + "step": 4110 + }, + { + "epoch": 0.5573103775503288, + "grad_norm": 1.7745141137253742, + "learning_rate": 8.634172302864491e-07, + "loss": 1.135, + "step": 4111 + }, + { + "epoch": 0.5574459431979937, + "grad_norm": 1.604208583173529, + "learning_rate": 8.629822208631442e-07, + "loss": 1.1488, + "step": 4112 + }, + { + "epoch": 0.5575815088456585, + "grad_norm": 1.5511814942835283, + "learning_rate": 8.625472378626365e-07, + "loss": 1.1483, + "step": 4113 + }, + { + "epoch": 0.5577170744933234, + "grad_norm": 1.6631224453640594, + "learning_rate": 8.62112281368809e-07, + "loss": 1.1386, + "step": 4114 + }, + { + "epoch": 0.5578526401409882, + "grad_norm": 1.6400299009763393, + "learning_rate": 8.616773514655395e-07, + "loss": 1.1165, + "step": 4115 + }, + { + "epoch": 0.5579882057886532, + "grad_norm": 1.7257472128486355, + "learning_rate": 8.612424482367014e-07, + "loss": 1.1327, + "step": 4116 + }, + { + "epoch": 0.5581237714363181, + "grad_norm": 1.537554833122454, + "learning_rate": 8.608075717661611e-07, + "loss": 1.1384, + "step": 4117 + }, + { + "epoch": 0.5582593370839829, + "grad_norm": 2.8171233188950127, + "learning_rate": 8.603727221377826e-07, + "loss": 1.1758, + "step": 4118 + }, + { + "epoch": 0.5583949027316478, + "grad_norm": 1.986499412521854, + "learning_rate": 8.599378994354218e-07, + "loss": 1.1873, + "step": 4119 + }, + { + "epoch": 0.5585304683793126, + "grad_norm": 2.1192005439829087, + "learning_rate": 8.595031037429321e-07, + "loss": 1.1525, + "step": 4120 + }, + { + "epoch": 0.5586660340269776, + "grad_norm": 2.0414810932073104, + "learning_rate": 8.590683351441594e-07, + "loss": 1.1899, + "step": 4121 + }, + { + "epoch": 0.5588015996746425, + "grad_norm": 1.9400204394191778, + "learning_rate": 8.586335937229462e-07, + "loss": 1.1794, + "step": 4122 + }, + { + "epoch": 0.5589371653223073, + "grad_norm": 2.23775638452982, + "learning_rate": 8.581988795631285e-07, + "loss": 1.165, + 
"step": 4123 + }, + { + "epoch": 0.5590727309699722, + "grad_norm": 1.998341775055061, + "learning_rate": 8.577641927485373e-07, + "loss": 1.1375, + "step": 4124 + }, + { + "epoch": 0.559208296617637, + "grad_norm": 2.4518295444503297, + "learning_rate": 8.573295333629991e-07, + "loss": 1.1673, + "step": 4125 + }, + { + "epoch": 0.559343862265302, + "grad_norm": 2.1285067131647812, + "learning_rate": 8.568949014903339e-07, + "loss": 1.1552, + "step": 4126 + }, + { + "epoch": 0.5594794279129669, + "grad_norm": 2.582744641881162, + "learning_rate": 8.564602972143576e-07, + "loss": 1.1774, + "step": 4127 + }, + { + "epoch": 0.5596149935606317, + "grad_norm": 1.901955277580258, + "learning_rate": 8.560257206188797e-07, + "loss": 1.1501, + "step": 4128 + }, + { + "epoch": 0.5597505592082966, + "grad_norm": 2.2896927421006135, + "learning_rate": 8.555911717877053e-07, + "loss": 1.1443, + "step": 4129 + }, + { + "epoch": 0.5598861248559615, + "grad_norm": 1.7322527194822355, + "learning_rate": 8.551566508046334e-07, + "loss": 1.1507, + "step": 4130 + }, + { + "epoch": 0.5600216905036264, + "grad_norm": 6.716703448027312, + "learning_rate": 8.547221577534583e-07, + "loss": 1.1505, + "step": 4131 + }, + { + "epoch": 0.5601572561512913, + "grad_norm": 2.215015317135392, + "learning_rate": 8.542876927179679e-07, + "loss": 1.1652, + "step": 4132 + }, + { + "epoch": 0.5602928217989561, + "grad_norm": 1.6188524188734446, + "learning_rate": 8.538532557819463e-07, + "loss": 1.1254, + "step": 4133 + }, + { + "epoch": 0.560428387446621, + "grad_norm": 1.9347010189912348, + "learning_rate": 8.534188470291704e-07, + "loss": 1.1425, + "step": 4134 + }, + { + "epoch": 0.5605639530942859, + "grad_norm": 1.8490378328573291, + "learning_rate": 8.529844665434129e-07, + "loss": 1.1743, + "step": 4135 + }, + { + "epoch": 0.5606995187419508, + "grad_norm": 2.0478880623428988, + "learning_rate": 8.525501144084409e-07, + "loss": 1.1495, + "step": 4136 + }, + { + "epoch": 0.5608350843896157, + "grad_norm": 2.366159336413695, + "learning_rate": 8.521157907080148e-07, + "loss": 1.1384, + "step": 4137 + }, + { + "epoch": 0.5609706500372805, + "grad_norm": 1.8277149241444088, + "learning_rate": 8.516814955258916e-07, + "loss": 1.1259, + "step": 4138 + }, + { + "epoch": 0.5611062156849455, + "grad_norm": 17.778861304835953, + "learning_rate": 8.512472289458208e-07, + "loss": 1.1939, + "step": 4139 + }, + { + "epoch": 0.5612417813326103, + "grad_norm": 1.7580519692551633, + "learning_rate": 8.508129910515482e-07, + "loss": 1.1533, + "step": 4140 + }, + { + "epoch": 0.5613773469802752, + "grad_norm": 5.069817804941126, + "learning_rate": 8.503787819268124e-07, + "loss": 1.1657, + "step": 4141 + }, + { + "epoch": 0.5615129126279401, + "grad_norm": 1.6492040153031977, + "learning_rate": 8.499446016553473e-07, + "loss": 1.1534, + "step": 4142 + }, + { + "epoch": 0.5616484782756049, + "grad_norm": 1.7549279041407004, + "learning_rate": 8.495104503208816e-07, + "loss": 1.2, + "step": 4143 + }, + { + "epoch": 0.5617840439232699, + "grad_norm": 1.5400751828651844, + "learning_rate": 8.490763280071375e-07, + "loss": 1.1513, + "step": 4144 + }, + { + "epoch": 0.5619196095709347, + "grad_norm": 1.6794258787596148, + "learning_rate": 8.486422347978323e-07, + "loss": 1.1239, + "step": 4145 + }, + { + "epoch": 0.5620551752185996, + "grad_norm": 1.5338088513217776, + "learning_rate": 8.482081707766775e-07, + "loss": 1.1094, + "step": 4146 + }, + { + "epoch": 0.5621907408662645, + "grad_norm": 3.9010806779206066, + "learning_rate": 
8.477741360273785e-07, + "loss": 1.172, + "step": 4147 + }, + { + "epoch": 0.5623263065139293, + "grad_norm": 2.1994447243921615, + "learning_rate": 8.47340130633636e-07, + "loss": 1.1293, + "step": 4148 + }, + { + "epoch": 0.5624618721615943, + "grad_norm": 1.9844717572329682, + "learning_rate": 8.46906154679144e-07, + "loss": 1.1636, + "step": 4149 + }, + { + "epoch": 0.5625974378092591, + "grad_norm": 2.031552605197567, + "learning_rate": 8.46472208247592e-07, + "loss": 1.1548, + "step": 4150 + }, + { + "epoch": 0.562733003456924, + "grad_norm": 1.8284108229247038, + "learning_rate": 8.460382914226628e-07, + "loss": 1.1276, + "step": 4151 + }, + { + "epoch": 0.5628685691045889, + "grad_norm": 1.6919649210312402, + "learning_rate": 8.456044042880333e-07, + "loss": 1.1433, + "step": 4152 + }, + { + "epoch": 0.5630041347522537, + "grad_norm": 2.3537709854401716, + "learning_rate": 8.451705469273763e-07, + "loss": 1.1685, + "step": 4153 + }, + { + "epoch": 0.5631397003999187, + "grad_norm": 1.7418437271947724, + "learning_rate": 8.447367194243567e-07, + "loss": 1.1728, + "step": 4154 + }, + { + "epoch": 0.5632752660475835, + "grad_norm": 2.1642653281912074, + "learning_rate": 8.443029218626355e-07, + "loss": 1.1618, + "step": 4155 + }, + { + "epoch": 0.5634108316952484, + "grad_norm": 1.8552495771232007, + "learning_rate": 8.438691543258665e-07, + "loss": 1.1545, + "step": 4156 + }, + { + "epoch": 0.5635463973429133, + "grad_norm": 1.8536971894376644, + "learning_rate": 8.434354168976989e-07, + "loss": 1.1606, + "step": 4157 + }, + { + "epoch": 0.5636819629905782, + "grad_norm": 2.209160518812464, + "learning_rate": 8.430017096617751e-07, + "loss": 1.2128, + "step": 4158 + }, + { + "epoch": 0.5638175286382431, + "grad_norm": 2.310848697704944, + "learning_rate": 8.425680327017326e-07, + "loss": 1.1009, + "step": 4159 + }, + { + "epoch": 0.5639530942859079, + "grad_norm": 2.1139009836551725, + "learning_rate": 8.42134386101202e-07, + "loss": 1.1415, + "step": 4160 + }, + { + "epoch": 0.5640886599335728, + "grad_norm": 3.4012454959113203, + "learning_rate": 8.417007699438093e-07, + "loss": 1.1543, + "step": 4161 + }, + { + "epoch": 0.5642242255812377, + "grad_norm": 2.997721781574039, + "learning_rate": 8.412671843131731e-07, + "loss": 1.1267, + "step": 4162 + }, + { + "epoch": 0.5643597912289026, + "grad_norm": 2.0710761093873455, + "learning_rate": 8.408336292929079e-07, + "loss": 1.125, + "step": 4163 + }, + { + "epoch": 0.5644953568765675, + "grad_norm": 2.221681713762746, + "learning_rate": 8.40400104966621e-07, + "loss": 1.1667, + "step": 4164 + }, + { + "epoch": 0.5646309225242324, + "grad_norm": 1.8384716223480175, + "learning_rate": 8.399666114179136e-07, + "loss": 1.178, + "step": 4165 + }, + { + "epoch": 0.5647664881718972, + "grad_norm": 1.9589825129702656, + "learning_rate": 8.395331487303823e-07, + "loss": 1.145, + "step": 4166 + }, + { + "epoch": 0.5649020538195622, + "grad_norm": 1.7248054637636934, + "learning_rate": 8.390997169876161e-07, + "loss": 1.2045, + "step": 4167 + }, + { + "epoch": 0.565037619467227, + "grad_norm": 1.989760881422966, + "learning_rate": 8.386663162732001e-07, + "loss": 1.1794, + "step": 4168 + }, + { + "epoch": 0.5651731851148919, + "grad_norm": 1.9741452080525654, + "learning_rate": 8.38232946670711e-07, + "loss": 1.1254, + "step": 4169 + }, + { + "epoch": 0.5653087507625568, + "grad_norm": 3.171531886343852, + "learning_rate": 8.377996082637215e-07, + "loss": 1.1237, + "step": 4170 + }, + { + "epoch": 0.5654443164102216, + "grad_norm": 
1.6473139691009222, + "learning_rate": 8.37366301135797e-07, + "loss": 1.1721, + "step": 4171 + }, + { + "epoch": 0.5655798820578866, + "grad_norm": 1.773804655993186, + "learning_rate": 8.369330253704979e-07, + "loss": 1.1869, + "step": 4172 + }, + { + "epoch": 0.5657154477055514, + "grad_norm": 1.9133364193217959, + "learning_rate": 8.364997810513774e-07, + "loss": 1.1309, + "step": 4173 + }, + { + "epoch": 0.5658510133532163, + "grad_norm": 1.6822776392707883, + "learning_rate": 8.360665682619837e-07, + "loss": 1.1165, + "step": 4174 + }, + { + "epoch": 0.5659865790008812, + "grad_norm": 2.081836622891455, + "learning_rate": 8.356333870858581e-07, + "loss": 1.1917, + "step": 4175 + }, + { + "epoch": 0.566122144648546, + "grad_norm": 1.7689821276843931, + "learning_rate": 8.352002376065364e-07, + "loss": 1.1593, + "step": 4176 + }, + { + "epoch": 0.566257710296211, + "grad_norm": 1.611821281641696, + "learning_rate": 8.347671199075481e-07, + "loss": 1.1138, + "step": 4177 + }, + { + "epoch": 0.5663932759438758, + "grad_norm": 1.8912934600723663, + "learning_rate": 8.343340340724168e-07, + "loss": 1.1525, + "step": 4178 + }, + { + "epoch": 0.5665288415915407, + "grad_norm": 2.2748518755219425, + "learning_rate": 8.339009801846589e-07, + "loss": 1.1294, + "step": 4179 + }, + { + "epoch": 0.5666644072392056, + "grad_norm": 2.0056610137735866, + "learning_rate": 8.334679583277859e-07, + "loss": 1.1176, + "step": 4180 + }, + { + "epoch": 0.5667999728868705, + "grad_norm": 1.8615843752185905, + "learning_rate": 8.330349685853027e-07, + "loss": 1.1707, + "step": 4181 + }, + { + "epoch": 0.5669355385345354, + "grad_norm": 1.7414031459183155, + "learning_rate": 8.326020110407079e-07, + "loss": 1.1417, + "step": 4182 + }, + { + "epoch": 0.5670711041822002, + "grad_norm": 2.510789190108499, + "learning_rate": 8.32169085777494e-07, + "loss": 1.1501, + "step": 4183 + }, + { + "epoch": 0.5672066698298651, + "grad_norm": 2.103160024419977, + "learning_rate": 8.317361928791467e-07, + "loss": 1.1848, + "step": 4184 + }, + { + "epoch": 0.56734223547753, + "grad_norm": 2.1870579939004835, + "learning_rate": 8.313033324291469e-07, + "loss": 1.2016, + "step": 4185 + }, + { + "epoch": 0.5674778011251949, + "grad_norm": 1.9727356550124064, + "learning_rate": 8.308705045109675e-07, + "loss": 1.1772, + "step": 4186 + }, + { + "epoch": 0.5676133667728598, + "grad_norm": 1.5894496007077357, + "learning_rate": 8.304377092080766e-07, + "loss": 1.1599, + "step": 4187 + }, + { + "epoch": 0.5677489324205246, + "grad_norm": 1.8166210519219141, + "learning_rate": 8.300049466039346e-07, + "loss": 1.1835, + "step": 4188 + }, + { + "epoch": 0.5678844980681895, + "grad_norm": 2.6497518964560056, + "learning_rate": 8.295722167819973e-07, + "loss": 1.1152, + "step": 4189 + }, + { + "epoch": 0.5680200637158545, + "grad_norm": 2.642035039294208, + "learning_rate": 8.291395198257122e-07, + "loss": 1.1745, + "step": 4190 + }, + { + "epoch": 0.5681556293635193, + "grad_norm": 1.96097949657457, + "learning_rate": 8.287068558185224e-07, + "loss": 1.1646, + "step": 4191 + }, + { + "epoch": 0.5682911950111842, + "grad_norm": 1.7728443022869669, + "learning_rate": 8.282742248438634e-07, + "loss": 1.1148, + "step": 4192 + }, + { + "epoch": 0.568426760658849, + "grad_norm": 1.6249585380612628, + "learning_rate": 8.278416269851643e-07, + "loss": 1.1651, + "step": 4193 + }, + { + "epoch": 0.5685623263065139, + "grad_norm": 2.2023748033604518, + "learning_rate": 8.274090623258489e-07, + "loss": 1.1563, + "step": 4194 + }, + { + "epoch": 
0.5686978919541789, + "grad_norm": 1.6238531652530634, + "learning_rate": 8.269765309493328e-07, + "loss": 1.114, + "step": 4195 + }, + { + "epoch": 0.5688334576018437, + "grad_norm": 2.2078474402513413, + "learning_rate": 8.265440329390276e-07, + "loss": 1.1615, + "step": 4196 + }, + { + "epoch": 0.5689690232495086, + "grad_norm": 1.6259762315505533, + "learning_rate": 8.261115683783361e-07, + "loss": 1.1392, + "step": 4197 + }, + { + "epoch": 0.5691045888971734, + "grad_norm": 10.05889391125694, + "learning_rate": 8.256791373506563e-07, + "loss": 1.1144, + "step": 4198 + }, + { + "epoch": 0.5692401545448383, + "grad_norm": 1.9537982625685049, + "learning_rate": 8.252467399393786e-07, + "loss": 1.1399, + "step": 4199 + }, + { + "epoch": 0.5693757201925033, + "grad_norm": 1.7189532601431146, + "learning_rate": 8.248143762278879e-07, + "loss": 1.148, + "step": 4200 + }, + { + "epoch": 0.5695112858401681, + "grad_norm": 2.1182773056427644, + "learning_rate": 8.243820462995617e-07, + "loss": 1.1117, + "step": 4201 + }, + { + "epoch": 0.569646851487833, + "grad_norm": 2.447374049688282, + "learning_rate": 8.239497502377719e-07, + "loss": 1.1596, + "step": 4202 + }, + { + "epoch": 0.5697824171354978, + "grad_norm": 2.115898409154712, + "learning_rate": 8.235174881258827e-07, + "loss": 1.1738, + "step": 4203 + }, + { + "epoch": 0.5699179827831627, + "grad_norm": 1.6663772284685034, + "learning_rate": 8.230852600472533e-07, + "loss": 1.1714, + "step": 4204 + }, + { + "epoch": 0.5700535484308277, + "grad_norm": 3.994983867298492, + "learning_rate": 8.226530660852349e-07, + "loss": 1.1435, + "step": 4205 + }, + { + "epoch": 0.5701891140784925, + "grad_norm": 1.8652593643608482, + "learning_rate": 8.222209063231727e-07, + "loss": 1.1357, + "step": 4206 + }, + { + "epoch": 0.5703246797261574, + "grad_norm": 1.9034647172443526, + "learning_rate": 8.217887808444056e-07, + "loss": 1.145, + "step": 4207 + }, + { + "epoch": 0.5704602453738222, + "grad_norm": 1.6885549763898424, + "learning_rate": 8.213566897322651e-07, + "loss": 1.1396, + "step": 4208 + }, + { + "epoch": 0.5705958110214872, + "grad_norm": 1.9653487428447485, + "learning_rate": 8.209246330700772e-07, + "loss": 1.144, + "step": 4209 + }, + { + "epoch": 0.5707313766691521, + "grad_norm": 1.9427444379191783, + "learning_rate": 8.204926109411601e-07, + "loss": 1.1446, + "step": 4210 + }, + { + "epoch": 0.5708669423168169, + "grad_norm": 1.8406801974985876, + "learning_rate": 8.20060623428826e-07, + "loss": 1.1624, + "step": 4211 + }, + { + "epoch": 0.5710025079644818, + "grad_norm": 2.365486595146696, + "learning_rate": 8.196286706163804e-07, + "loss": 1.1268, + "step": 4212 + }, + { + "epoch": 0.5711380736121466, + "grad_norm": 1.9415387072463413, + "learning_rate": 8.191967525871219e-07, + "loss": 1.1423, + "step": 4213 + }, + { + "epoch": 0.5712736392598116, + "grad_norm": 2.05778070220801, + "learning_rate": 8.187648694243423e-07, + "loss": 1.1189, + "step": 4214 + }, + { + "epoch": 0.5714092049074765, + "grad_norm": 1.9270865008595037, + "learning_rate": 8.183330212113273e-07, + "loss": 1.1507, + "step": 4215 + }, + { + "epoch": 0.5715447705551413, + "grad_norm": 2.1644142514929103, + "learning_rate": 8.179012080313549e-07, + "loss": 1.1117, + "step": 4216 + }, + { + "epoch": 0.5716803362028062, + "grad_norm": 1.6350519691858827, + "learning_rate": 8.174694299676974e-07, + "loss": 1.1613, + "step": 4217 + }, + { + "epoch": 0.571815901850471, + "grad_norm": 1.6276798315835583, + "learning_rate": 8.170376871036193e-07, + "loss": 1.1684, + 
"step": 4218 + }, + { + "epoch": 0.571951467498136, + "grad_norm": 2.345157314767901, + "learning_rate": 8.166059795223793e-07, + "loss": 1.1429, + "step": 4219 + }, + { + "epoch": 0.5720870331458009, + "grad_norm": 1.510345524272408, + "learning_rate": 8.161743073072286e-07, + "loss": 1.1491, + "step": 4220 + }, + { + "epoch": 0.5722225987934657, + "grad_norm": 1.5575989635624832, + "learning_rate": 8.157426705414113e-07, + "loss": 1.1759, + "step": 4221 + }, + { + "epoch": 0.5723581644411306, + "grad_norm": 1.6836149192505878, + "learning_rate": 8.153110693081657e-07, + "loss": 1.137, + "step": 4222 + }, + { + "epoch": 0.5724937300887954, + "grad_norm": 2.584025902825, + "learning_rate": 8.148795036907224e-07, + "loss": 1.1867, + "step": 4223 + }, + { + "epoch": 0.5726292957364604, + "grad_norm": 2.0059470205885446, + "learning_rate": 8.144479737723058e-07, + "loss": 1.1379, + "step": 4224 + }, + { + "epoch": 0.5727648613841253, + "grad_norm": 1.9986682012715682, + "learning_rate": 8.140164796361327e-07, + "loss": 1.1935, + "step": 4225 + }, + { + "epoch": 0.5729004270317901, + "grad_norm": 2.097769021202476, + "learning_rate": 8.135850213654135e-07, + "loss": 1.1304, + "step": 4226 + }, + { + "epoch": 0.573035992679455, + "grad_norm": 1.7340077095950495, + "learning_rate": 8.131535990433513e-07, + "loss": 1.142, + "step": 4227 + }, + { + "epoch": 0.5731715583271199, + "grad_norm": 1.822403362459403, + "learning_rate": 8.127222127531429e-07, + "loss": 1.1817, + "step": 4228 + }, + { + "epoch": 0.5733071239747848, + "grad_norm": 1.6110227164725588, + "learning_rate": 8.122908625779771e-07, + "loss": 1.1367, + "step": 4229 + }, + { + "epoch": 0.5734426896224497, + "grad_norm": 2.485628468555756, + "learning_rate": 8.118595486010372e-07, + "loss": 1.1677, + "step": 4230 + }, + { + "epoch": 0.5735782552701145, + "grad_norm": 3.1748289467779185, + "learning_rate": 8.114282709054978e-07, + "loss": 1.1513, + "step": 4231 + }, + { + "epoch": 0.5737138209177794, + "grad_norm": 2.0079243718664164, + "learning_rate": 8.109970295745284e-07, + "loss": 1.1664, + "step": 4232 + }, + { + "epoch": 0.5738493865654443, + "grad_norm": 1.8739024252114955, + "learning_rate": 8.105658246912895e-07, + "loss": 1.1223, + "step": 4233 + }, + { + "epoch": 0.5739849522131092, + "grad_norm": 1.6078108935654833, + "learning_rate": 8.101346563389363e-07, + "loss": 1.1106, + "step": 4234 + }, + { + "epoch": 0.5741205178607741, + "grad_norm": 2.6308024624360917, + "learning_rate": 8.097035246006161e-07, + "loss": 1.181, + "step": 4235 + }, + { + "epoch": 0.5742560835084389, + "grad_norm": 2.1667853935892696, + "learning_rate": 8.092724295594685e-07, + "loss": 1.1589, + "step": 4236 + }, + { + "epoch": 0.5743916491561039, + "grad_norm": 2.0018764437164176, + "learning_rate": 8.088413712986279e-07, + "loss": 1.1707, + "step": 4237 + }, + { + "epoch": 0.5745272148037687, + "grad_norm": 3.2133851795699337, + "learning_rate": 8.084103499012194e-07, + "loss": 1.1556, + "step": 4238 + }, + { + "epoch": 0.5746627804514336, + "grad_norm": 1.6737345884482984, + "learning_rate": 8.07979365450363e-07, + "loss": 1.1661, + "step": 4239 + }, + { + "epoch": 0.5747983460990985, + "grad_norm": 1.858819495603842, + "learning_rate": 8.075484180291701e-07, + "loss": 1.1507, + "step": 4240 + }, + { + "epoch": 0.5749339117467633, + "grad_norm": 3.0676033006710295, + "learning_rate": 8.071175077207457e-07, + "loss": 1.1771, + "step": 4241 + }, + { + "epoch": 0.5750694773944283, + "grad_norm": 1.7540449661168591, + "learning_rate": 
8.066866346081873e-07, + "loss": 1.1486, + "step": 4242 + }, + { + "epoch": 0.5752050430420931, + "grad_norm": 1.6302207336867272, + "learning_rate": 8.062557987745856e-07, + "loss": 1.1536, + "step": 4243 + }, + { + "epoch": 0.575340608689758, + "grad_norm": 2.0579557508978494, + "learning_rate": 8.058250003030238e-07, + "loss": 1.2058, + "step": 4244 + }, + { + "epoch": 0.5754761743374229, + "grad_norm": 2.3311310975885093, + "learning_rate": 8.053942392765781e-07, + "loss": 1.1659, + "step": 4245 + }, + { + "epoch": 0.5756117399850877, + "grad_norm": 2.121736290248236, + "learning_rate": 8.049635157783169e-07, + "loss": 1.1566, + "step": 4246 + }, + { + "epoch": 0.5757473056327527, + "grad_norm": 2.3610713721951293, + "learning_rate": 8.045328298913024e-07, + "loss": 1.1603, + "step": 4247 + }, + { + "epoch": 0.5758828712804176, + "grad_norm": 1.6415301512366165, + "learning_rate": 8.041021816985887e-07, + "loss": 1.1289, + "step": 4248 + }, + { + "epoch": 0.5760184369280824, + "grad_norm": 2.0938185414557404, + "learning_rate": 8.03671571283223e-07, + "loss": 1.1658, + "step": 4249 + }, + { + "epoch": 0.5761540025757473, + "grad_norm": 4.3554564968922405, + "learning_rate": 8.03240998728245e-07, + "loss": 1.1634, + "step": 4250 + }, + { + "epoch": 0.5762895682234122, + "grad_norm": 1.8567545159750962, + "learning_rate": 8.028104641166871e-07, + "loss": 1.1173, + "step": 4251 + }, + { + "epoch": 0.5764251338710771, + "grad_norm": 1.6536893816472682, + "learning_rate": 8.02379967531575e-07, + "loss": 1.1556, + "step": 4252 + }, + { + "epoch": 0.576560699518742, + "grad_norm": 1.7788100577077597, + "learning_rate": 8.019495090559257e-07, + "loss": 1.1506, + "step": 4253 + }, + { + "epoch": 0.5766962651664068, + "grad_norm": 2.0941500862110747, + "learning_rate": 8.015190887727509e-07, + "loss": 1.1544, + "step": 4254 + }, + { + "epoch": 0.5768318308140717, + "grad_norm": 1.7844136300029148, + "learning_rate": 8.010887067650526e-07, + "loss": 1.1312, + "step": 4255 + }, + { + "epoch": 0.5769673964617366, + "grad_norm": 2.077500761163505, + "learning_rate": 8.006583631158275e-07, + "loss": 1.1688, + "step": 4256 + }, + { + "epoch": 0.5771029621094015, + "grad_norm": 1.6965796922682512, + "learning_rate": 8.002280579080632e-07, + "loss": 1.1411, + "step": 4257 + }, + { + "epoch": 0.5772385277570664, + "grad_norm": 1.6591478497645757, + "learning_rate": 7.997977912247413e-07, + "loss": 1.1522, + "step": 4258 + }, + { + "epoch": 0.5773740934047312, + "grad_norm": 2.1126827319557955, + "learning_rate": 7.993675631488348e-07, + "loss": 1.1248, + "step": 4259 + }, + { + "epoch": 0.5775096590523962, + "grad_norm": 3.62863930822699, + "learning_rate": 7.989373737633103e-07, + "loss": 1.1444, + "step": 4260 + }, + { + "epoch": 0.577645224700061, + "grad_norm": 2.210939009956114, + "learning_rate": 7.985072231511259e-07, + "loss": 1.1085, + "step": 4261 + }, + { + "epoch": 0.5777807903477259, + "grad_norm": 2.22043318526031, + "learning_rate": 7.980771113952335e-07, + "loss": 1.1516, + "step": 4262 + }, + { + "epoch": 0.5779163559953908, + "grad_norm": 1.9409665035221921, + "learning_rate": 7.976470385785762e-07, + "loss": 1.1271, + "step": 4263 + }, + { + "epoch": 0.5780519216430556, + "grad_norm": 1.7729753290103971, + "learning_rate": 7.972170047840898e-07, + "loss": 1.1631, + "step": 4264 + }, + { + "epoch": 0.5781874872907206, + "grad_norm": 1.9646523477309497, + "learning_rate": 7.967870100947038e-07, + "loss": 1.1841, + "step": 4265 + }, + { + "epoch": 0.5783230529383854, + "grad_norm": 
7.39656388923314, + "learning_rate": 7.963570545933384e-07, + "loss": 1.12, + "step": 4266 + }, + { + "epoch": 0.5784586185860503, + "grad_norm": 1.8191600893143038, + "learning_rate": 7.95927138362908e-07, + "loss": 1.1594, + "step": 4267 + }, + { + "epoch": 0.5785941842337152, + "grad_norm": 3.504127001050931, + "learning_rate": 7.954972614863177e-07, + "loss": 1.1305, + "step": 4268 + }, + { + "epoch": 0.57872974988138, + "grad_norm": 2.313127164226708, + "learning_rate": 7.950674240464667e-07, + "loss": 1.1743, + "step": 4269 + }, + { + "epoch": 0.578865315529045, + "grad_norm": 1.7533760996111274, + "learning_rate": 7.946376261262449e-07, + "loss": 1.1842, + "step": 4270 + }, + { + "epoch": 0.5790008811767098, + "grad_norm": 1.7852655499269865, + "learning_rate": 7.942078678085363e-07, + "loss": 1.1834, + "step": 4271 + }, + { + "epoch": 0.5791364468243747, + "grad_norm": 1.7813786989493576, + "learning_rate": 7.937781491762156e-07, + "loss": 1.1745, + "step": 4272 + }, + { + "epoch": 0.5792720124720396, + "grad_norm": 2.3138921477197405, + "learning_rate": 7.933484703121513e-07, + "loss": 1.1393, + "step": 4273 + }, + { + "epoch": 0.5794075781197044, + "grad_norm": 1.9069778799540098, + "learning_rate": 7.929188312992031e-07, + "loss": 1.1578, + "step": 4274 + }, + { + "epoch": 0.5795431437673694, + "grad_norm": 1.5638783969239405, + "learning_rate": 7.924892322202236e-07, + "loss": 1.1509, + "step": 4275 + }, + { + "epoch": 0.5796787094150342, + "grad_norm": 5.07662916569585, + "learning_rate": 7.920596731580582e-07, + "loss": 1.1444, + "step": 4276 + }, + { + "epoch": 0.5798142750626991, + "grad_norm": 1.9858661533878679, + "learning_rate": 7.91630154195543e-07, + "loss": 1.2093, + "step": 4277 + }, + { + "epoch": 0.579949840710364, + "grad_norm": 2.060715174818223, + "learning_rate": 7.912006754155078e-07, + "loss": 1.1881, + "step": 4278 + }, + { + "epoch": 0.5800854063580289, + "grad_norm": 1.82066810701504, + "learning_rate": 7.907712369007743e-07, + "loss": 1.166, + "step": 4279 + }, + { + "epoch": 0.5802209720056938, + "grad_norm": 1.8270077066103623, + "learning_rate": 7.903418387341564e-07, + "loss": 1.1657, + "step": 4280 + }, + { + "epoch": 0.5803565376533586, + "grad_norm": 4.036193243613946, + "learning_rate": 7.899124809984595e-07, + "loss": 1.1558, + "step": 4281 + }, + { + "epoch": 0.5804921033010235, + "grad_norm": 2.6934978438968895, + "learning_rate": 7.894831637764828e-07, + "loss": 1.1641, + "step": 4282 + }, + { + "epoch": 0.5806276689486884, + "grad_norm": 1.9107351905319057, + "learning_rate": 7.890538871510156e-07, + "loss": 1.156, + "step": 4283 + }, + { + "epoch": 0.5807632345963533, + "grad_norm": 1.7162407096668433, + "learning_rate": 7.886246512048418e-07, + "loss": 1.137, + "step": 4284 + }, + { + "epoch": 0.5808988002440182, + "grad_norm": 2.298131334124045, + "learning_rate": 7.88195456020735e-07, + "loss": 1.1101, + "step": 4285 + }, + { + "epoch": 0.581034365891683, + "grad_norm": 2.311118518469499, + "learning_rate": 7.87766301681463e-07, + "loss": 1.1748, + "step": 4286 + }, + { + "epoch": 0.5811699315393479, + "grad_norm": 4.168928221475837, + "learning_rate": 7.873371882697841e-07, + "loss": 1.1553, + "step": 4287 + }, + { + "epoch": 0.5813054971870129, + "grad_norm": 1.6636857071132543, + "learning_rate": 7.869081158684503e-07, + "loss": 1.1321, + "step": 4288 + }, + { + "epoch": 0.5814410628346777, + "grad_norm": 1.7360651292219766, + "learning_rate": 7.864790845602038e-07, + "loss": 1.1072, + "step": 4289 + }, + { + "epoch": 
0.5815766284823426, + "grad_norm": 1.6073362152965833, + "learning_rate": 7.860500944277809e-07, + "loss": 1.1643, + "step": 4290 + }, + { + "epoch": 0.5817121941300074, + "grad_norm": 2.540771647783247, + "learning_rate": 7.856211455539084e-07, + "loss": 1.1577, + "step": 4291 + }, + { + "epoch": 0.5818477597776723, + "grad_norm": 1.9169434802121765, + "learning_rate": 7.851922380213053e-07, + "loss": 1.1732, + "step": 4292 + }, + { + "epoch": 0.5819833254253373, + "grad_norm": 1.7011847910829212, + "learning_rate": 7.847633719126839e-07, + "loss": 1.1179, + "step": 4293 + }, + { + "epoch": 0.5821188910730021, + "grad_norm": 9.035623357742631, + "learning_rate": 7.84334547310747e-07, + "loss": 1.1605, + "step": 4294 + }, + { + "epoch": 0.582254456720667, + "grad_norm": 2.325282730556441, + "learning_rate": 7.839057642981905e-07, + "loss": 1.1377, + "step": 4295 + }, + { + "epoch": 0.5823900223683318, + "grad_norm": 2.032392261105263, + "learning_rate": 7.834770229577015e-07, + "loss": 1.1884, + "step": 4296 + }, + { + "epoch": 0.5825255880159967, + "grad_norm": 1.8644030593154997, + "learning_rate": 7.830483233719597e-07, + "loss": 1.1332, + "step": 4297 + }, + { + "epoch": 0.5826611536636617, + "grad_norm": 2.1403650661701668, + "learning_rate": 7.826196656236357e-07, + "loss": 1.1367, + "step": 4298 + }, + { + "epoch": 0.5827967193113265, + "grad_norm": 2.610168026815205, + "learning_rate": 7.821910497953939e-07, + "loss": 1.1143, + "step": 4299 + }, + { + "epoch": 0.5829322849589914, + "grad_norm": 1.6002314908612059, + "learning_rate": 7.817624759698884e-07, + "loss": 1.0952, + "step": 4300 + }, + { + "epoch": 0.5830678506066562, + "grad_norm": 1.776845148520408, + "learning_rate": 7.813339442297671e-07, + "loss": 1.1544, + "step": 4301 + }, + { + "epoch": 0.5832034162543211, + "grad_norm": 1.8752995474328151, + "learning_rate": 7.809054546576686e-07, + "loss": 1.1915, + "step": 4302 + }, + { + "epoch": 0.5833389819019861, + "grad_norm": 2.414241201066857, + "learning_rate": 7.804770073362236e-07, + "loss": 1.1484, + "step": 4303 + }, + { + "epoch": 0.5834745475496509, + "grad_norm": 3.308937954420301, + "learning_rate": 7.800486023480551e-07, + "loss": 1.1803, + "step": 4304 + }, + { + "epoch": 0.5836101131973158, + "grad_norm": 1.9834409037630925, + "learning_rate": 7.796202397757771e-07, + "loss": 1.172, + "step": 4305 + }, + { + "epoch": 0.5837456788449806, + "grad_norm": 1.6705046832160344, + "learning_rate": 7.791919197019967e-07, + "loss": 1.1524, + "step": 4306 + }, + { + "epoch": 0.5838812444926456, + "grad_norm": 2.493454174288493, + "learning_rate": 7.787636422093114e-07, + "loss": 1.173, + "step": 4307 + }, + { + "epoch": 0.5840168101403105, + "grad_norm": 1.8809136460823275, + "learning_rate": 7.783354073803114e-07, + "loss": 1.1891, + "step": 4308 + }, + { + "epoch": 0.5841523757879753, + "grad_norm": 2.5103096127296327, + "learning_rate": 7.779072152975783e-07, + "loss": 1.1538, + "step": 4309 + }, + { + "epoch": 0.5842879414356402, + "grad_norm": 2.7316198770412554, + "learning_rate": 7.774790660436857e-07, + "loss": 1.1255, + "step": 4310 + }, + { + "epoch": 0.584423507083305, + "grad_norm": 1.6405299424759519, + "learning_rate": 7.770509597011986e-07, + "loss": 1.1152, + "step": 4311 + }, + { + "epoch": 0.58455907273097, + "grad_norm": 2.9409179665815035, + "learning_rate": 7.766228963526744e-07, + "loss": 1.155, + "step": 4312 + }, + { + "epoch": 0.5846946383786349, + "grad_norm": 10.070378410140165, + "learning_rate": 7.761948760806611e-07, + "loss": 1.1724, + 
"step": 4313 + }, + { + "epoch": 0.5848302040262997, + "grad_norm": 1.6770903715317305, + "learning_rate": 7.757668989676995e-07, + "loss": 1.1241, + "step": 4314 + }, + { + "epoch": 0.5849657696739646, + "grad_norm": 2.0634365554494645, + "learning_rate": 7.753389650963212e-07, + "loss": 1.1286, + "step": 4315 + }, + { + "epoch": 0.5851013353216294, + "grad_norm": 1.8140677643549907, + "learning_rate": 7.749110745490505e-07, + "loss": 1.1393, + "step": 4316 + }, + { + "epoch": 0.5852369009692944, + "grad_norm": 1.9581101872102695, + "learning_rate": 7.744832274084019e-07, + "loss": 1.1485, + "step": 4317 + }, + { + "epoch": 0.5853724666169593, + "grad_norm": 3.255755380020454, + "learning_rate": 7.740554237568832e-07, + "loss": 1.1281, + "step": 4318 + }, + { + "epoch": 0.5855080322646241, + "grad_norm": 2.1676770901746574, + "learning_rate": 7.736276636769925e-07, + "loss": 1.1911, + "step": 4319 + }, + { + "epoch": 0.585643597912289, + "grad_norm": 1.5466005173595232, + "learning_rate": 7.731999472512196e-07, + "loss": 1.1105, + "step": 4320 + }, + { + "epoch": 0.5857791635599539, + "grad_norm": 1.7384788215547589, + "learning_rate": 7.727722745620471e-07, + "loss": 1.1629, + "step": 4321 + }, + { + "epoch": 0.5859147292076188, + "grad_norm": 2.5437669351285, + "learning_rate": 7.723446456919473e-07, + "loss": 1.1683, + "step": 4322 + }, + { + "epoch": 0.5860502948552837, + "grad_norm": 2.252052078175162, + "learning_rate": 7.719170607233861e-07, + "loss": 1.1397, + "step": 4323 + }, + { + "epoch": 0.5861858605029485, + "grad_norm": 1.6905584269962934, + "learning_rate": 7.714895197388188e-07, + "loss": 1.1471, + "step": 4324 + }, + { + "epoch": 0.5863214261506134, + "grad_norm": 3.59730773394509, + "learning_rate": 7.710620228206944e-07, + "loss": 1.171, + "step": 4325 + }, + { + "epoch": 0.5864569917982784, + "grad_norm": 3.7980180327416138, + "learning_rate": 7.706345700514512e-07, + "loss": 1.1716, + "step": 4326 + }, + { + "epoch": 0.5865925574459432, + "grad_norm": 2.089873313007694, + "learning_rate": 7.702071615135212e-07, + "loss": 1.1045, + "step": 4327 + }, + { + "epoch": 0.5867281230936081, + "grad_norm": 1.8451978519186165, + "learning_rate": 7.697797972893258e-07, + "loss": 1.1295, + "step": 4328 + }, + { + "epoch": 0.5868636887412729, + "grad_norm": 1.6972405719560941, + "learning_rate": 7.693524774612797e-07, + "loss": 1.1459, + "step": 4329 + }, + { + "epoch": 0.5869992543889379, + "grad_norm": 1.7933603459681784, + "learning_rate": 7.689252021117874e-07, + "loss": 1.1538, + "step": 4330 + }, + { + "epoch": 0.5871348200366028, + "grad_norm": 1.905719177353885, + "learning_rate": 7.684979713232461e-07, + "loss": 1.1173, + "step": 4331 + }, + { + "epoch": 0.5872703856842676, + "grad_norm": 2.034204408015787, + "learning_rate": 7.680707851780433e-07, + "loss": 1.1451, + "step": 4332 + }, + { + "epoch": 0.5874059513319325, + "grad_norm": 1.778546872773122, + "learning_rate": 7.676436437585593e-07, + "loss": 1.1429, + "step": 4333 + }, + { + "epoch": 0.5875415169795973, + "grad_norm": 1.8926637717992567, + "learning_rate": 7.672165471471643e-07, + "loss": 1.166, + "step": 4334 + }, + { + "epoch": 0.5876770826272623, + "grad_norm": 2.027306257340349, + "learning_rate": 7.667894954262205e-07, + "loss": 1.1662, + "step": 4335 + }, + { + "epoch": 0.5878126482749272, + "grad_norm": 2.181889201847765, + "learning_rate": 7.66362488678082e-07, + "loss": 1.1715, + "step": 4336 + }, + { + "epoch": 0.587948213922592, + "grad_norm": 4.465824728687874, + "learning_rate": 
7.659355269850929e-07, + "loss": 1.1613, + "step": 4337 + }, + { + "epoch": 0.5880837795702569, + "grad_norm": 1.6106777033268103, + "learning_rate": 7.655086104295904e-07, + "loss": 1.1607, + "step": 4338 + }, + { + "epoch": 0.5882193452179217, + "grad_norm": 1.6113968535131318, + "learning_rate": 7.65081739093901e-07, + "loss": 1.1488, + "step": 4339 + }, + { + "epoch": 0.5883549108655867, + "grad_norm": 3.2126766268020663, + "learning_rate": 7.646549130603439e-07, + "loss": 1.1784, + "step": 4340 + }, + { + "epoch": 0.5884904765132516, + "grad_norm": 1.8943664449732491, + "learning_rate": 7.642281324112292e-07, + "loss": 1.1487, + "step": 4341 + }, + { + "epoch": 0.5886260421609164, + "grad_norm": 2.6009982981754947, + "learning_rate": 7.638013972288581e-07, + "loss": 1.1192, + "step": 4342 + }, + { + "epoch": 0.5887616078085813, + "grad_norm": 1.679978805003671, + "learning_rate": 7.63374707595523e-07, + "loss": 1.1245, + "step": 4343 + }, + { + "epoch": 0.5888971734562461, + "grad_norm": 1.935699712321735, + "learning_rate": 7.629480635935082e-07, + "loss": 1.1538, + "step": 4344 + }, + { + "epoch": 0.5890327391039111, + "grad_norm": 1.9469470864519831, + "learning_rate": 7.625214653050874e-07, + "loss": 1.1381, + "step": 4345 + }, + { + "epoch": 0.589168304751576, + "grad_norm": 2.1889183957705933, + "learning_rate": 7.620949128125282e-07, + "loss": 1.142, + "step": 4346 + }, + { + "epoch": 0.5893038703992408, + "grad_norm": 1.9762238875168006, + "learning_rate": 7.616684061980867e-07, + "loss": 1.1569, + "step": 4347 + }, + { + "epoch": 0.5894394360469057, + "grad_norm": 8.188023102179788, + "learning_rate": 7.612419455440119e-07, + "loss": 1.1661, + "step": 4348 + }, + { + "epoch": 0.5895750016945706, + "grad_norm": 2.815946037507315, + "learning_rate": 7.608155309325435e-07, + "loss": 1.1566, + "step": 4349 + }, + { + "epoch": 0.5897105673422355, + "grad_norm": 1.7410168226179525, + "learning_rate": 7.603891624459114e-07, + "loss": 1.1293, + "step": 4350 + }, + { + "epoch": 0.5898461329899004, + "grad_norm": 1.9787764819017009, + "learning_rate": 7.599628401663384e-07, + "loss": 1.1842, + "step": 4351 + }, + { + "epoch": 0.5899816986375652, + "grad_norm": 2.815853580558087, + "learning_rate": 7.595365641760367e-07, + "loss": 1.1699, + "step": 4352 + }, + { + "epoch": 0.5901172642852301, + "grad_norm": 1.8673268953318323, + "learning_rate": 7.591103345572109e-07, + "loss": 1.1162, + "step": 4353 + }, + { + "epoch": 0.590252829932895, + "grad_norm": 1.7769024061924155, + "learning_rate": 7.58684151392055e-07, + "loss": 1.1717, + "step": 4354 + }, + { + "epoch": 0.5903883955805599, + "grad_norm": 1.922781139214135, + "learning_rate": 7.582580147627562e-07, + "loss": 1.1731, + "step": 4355 + }, + { + "epoch": 0.5905239612282248, + "grad_norm": 1.6892845945102615, + "learning_rate": 7.578319247514906e-07, + "loss": 1.1051, + "step": 4356 + }, + { + "epoch": 0.5906595268758896, + "grad_norm": 1.6780956755422851, + "learning_rate": 7.574058814404272e-07, + "loss": 1.1636, + "step": 4357 + }, + { + "epoch": 0.5907950925235546, + "grad_norm": 2.001641610090902, + "learning_rate": 7.569798849117241e-07, + "loss": 1.1671, + "step": 4358 + }, + { + "epoch": 0.5909306581712194, + "grad_norm": 1.858714963785523, + "learning_rate": 7.565539352475325e-07, + "loss": 1.1639, + "step": 4359 + }, + { + "epoch": 0.5910662238188843, + "grad_norm": 1.869455587492637, + "learning_rate": 7.561280325299924e-07, + "loss": 1.1253, + "step": 4360 + }, + { + "epoch": 0.5912017894665492, + "grad_norm": 
2.0882965271134903, + "learning_rate": 7.557021768412366e-07, + "loss": 1.1657, + "step": 4361 + }, + { + "epoch": 0.591337355114214, + "grad_norm": 2.0598979794120424, + "learning_rate": 7.552763682633877e-07, + "loss": 1.1227, + "step": 4362 + }, + { + "epoch": 0.591472920761879, + "grad_norm": 4.217182543423187, + "learning_rate": 7.548506068785589e-07, + "loss": 1.1034, + "step": 4363 + }, + { + "epoch": 0.5916084864095438, + "grad_norm": 2.0259221426338443, + "learning_rate": 7.544248927688561e-07, + "loss": 1.1689, + "step": 4364 + }, + { + "epoch": 0.5917440520572087, + "grad_norm": 1.6325441802581113, + "learning_rate": 7.539992260163735e-07, + "loss": 1.1381, + "step": 4365 + }, + { + "epoch": 0.5918796177048736, + "grad_norm": 2.1446131773159354, + "learning_rate": 7.535736067031991e-07, + "loss": 1.1943, + "step": 4366 + }, + { + "epoch": 0.5920151833525384, + "grad_norm": 1.8885846567659001, + "learning_rate": 7.531480349114088e-07, + "loss": 1.1362, + "step": 4367 + }, + { + "epoch": 0.5921507490002034, + "grad_norm": 1.6993720382978674, + "learning_rate": 7.527225107230721e-07, + "loss": 1.1441, + "step": 4368 + }, + { + "epoch": 0.5922863146478682, + "grad_norm": 1.7870494507644292, + "learning_rate": 7.52297034220247e-07, + "loss": 1.1398, + "step": 4369 + }, + { + "epoch": 0.5924218802955331, + "grad_norm": 1.8007080176845982, + "learning_rate": 7.518716054849836e-07, + "loss": 1.1641, + "step": 4370 + }, + { + "epoch": 0.592557445943198, + "grad_norm": 2.05048141212677, + "learning_rate": 7.514462245993225e-07, + "loss": 1.1613, + "step": 4371 + }, + { + "epoch": 0.5926930115908629, + "grad_norm": 2.7004431619011635, + "learning_rate": 7.51020891645295e-07, + "loss": 1.1236, + "step": 4372 + }, + { + "epoch": 0.5928285772385278, + "grad_norm": 4.808863348169402, + "learning_rate": 7.505956067049232e-07, + "loss": 1.1793, + "step": 4373 + }, + { + "epoch": 0.5929641428861926, + "grad_norm": 1.844401838165601, + "learning_rate": 7.501703698602202e-07, + "loss": 1.1521, + "step": 4374 + }, + { + "epoch": 0.5930997085338575, + "grad_norm": 1.5708058815787287, + "learning_rate": 7.497451811931891e-07, + "loss": 1.137, + "step": 4375 + }, + { + "epoch": 0.5932352741815224, + "grad_norm": 1.8823360732384185, + "learning_rate": 7.493200407858245e-07, + "loss": 1.1437, + "step": 4376 + }, + { + "epoch": 0.5933708398291873, + "grad_norm": 1.7746927058839952, + "learning_rate": 7.488949487201112e-07, + "loss": 1.1711, + "step": 4377 + }, + { + "epoch": 0.5935064054768522, + "grad_norm": 1.5139273721322655, + "learning_rate": 7.48469905078025e-07, + "loss": 1.1608, + "step": 4378 + }, + { + "epoch": 0.593641971124517, + "grad_norm": 1.9634904989366797, + "learning_rate": 7.480449099415322e-07, + "loss": 1.1576, + "step": 4379 + }, + { + "epoch": 0.5937775367721819, + "grad_norm": 1.69611948730812, + "learning_rate": 7.476199633925894e-07, + "loss": 1.149, + "step": 4380 + }, + { + "epoch": 0.5939131024198469, + "grad_norm": 2.2440482490706177, + "learning_rate": 7.471950655131451e-07, + "loss": 1.1304, + "step": 4381 + }, + { + "epoch": 0.5940486680675117, + "grad_norm": 1.7383898362073622, + "learning_rate": 7.467702163851363e-07, + "loss": 1.1357, + "step": 4382 + }, + { + "epoch": 0.5941842337151766, + "grad_norm": 2.818989283873769, + "learning_rate": 7.463454160904927e-07, + "loss": 1.1465, + "step": 4383 + }, + { + "epoch": 0.5943197993628414, + "grad_norm": 1.668269551590689, + "learning_rate": 7.459206647111331e-07, + "loss": 1.1719, + "step": 4384 + }, + { + "epoch": 
0.5944553650105063, + "grad_norm": 1.8375715344502617, + "learning_rate": 7.454959623289682e-07, + "loss": 1.1249, + "step": 4385 + }, + { + "epoch": 0.5945909306581713, + "grad_norm": 3.2203161858385916, + "learning_rate": 7.450713090258976e-07, + "loss": 1.1084, + "step": 4386 + }, + { + "epoch": 0.5947264963058361, + "grad_norm": 2.4143018260793045, + "learning_rate": 7.44646704883813e-07, + "loss": 1.1667, + "step": 4387 + }, + { + "epoch": 0.594862061953501, + "grad_norm": 1.8496333149310282, + "learning_rate": 7.442221499845955e-07, + "loss": 1.136, + "step": 4388 + }, + { + "epoch": 0.5949976276011658, + "grad_norm": 1.9624447085186876, + "learning_rate": 7.437976444101177e-07, + "loss": 1.1541, + "step": 4389 + }, + { + "epoch": 0.5951331932488307, + "grad_norm": 2.0430243780631177, + "learning_rate": 7.433731882422418e-07, + "loss": 1.1264, + "step": 4390 + }, + { + "epoch": 0.5952687588964957, + "grad_norm": 2.2035649326995976, + "learning_rate": 7.429487815628206e-07, + "loss": 1.127, + "step": 4391 + }, + { + "epoch": 0.5954043245441605, + "grad_norm": 2.1459182292779615, + "learning_rate": 7.425244244536981e-07, + "loss": 1.184, + "step": 4392 + }, + { + "epoch": 0.5955398901918254, + "grad_norm": 1.6773175747961466, + "learning_rate": 7.421001169967076e-07, + "loss": 1.1776, + "step": 4393 + }, + { + "epoch": 0.5956754558394902, + "grad_norm": 4.950720359547826, + "learning_rate": 7.416758592736742e-07, + "loss": 1.1234, + "step": 4394 + }, + { + "epoch": 0.5958110214871551, + "grad_norm": 1.7425445554451924, + "learning_rate": 7.41251651366412e-07, + "loss": 1.1464, + "step": 4395 + }, + { + "epoch": 0.5959465871348201, + "grad_norm": 1.6560114694679073, + "learning_rate": 7.408274933567267e-07, + "loss": 1.1411, + "step": 4396 + }, + { + "epoch": 0.5960821527824849, + "grad_norm": 1.8401388507692944, + "learning_rate": 7.404033853264131e-07, + "loss": 1.1303, + "step": 4397 + }, + { + "epoch": 0.5962177184301498, + "grad_norm": 1.6959427999384378, + "learning_rate": 7.399793273572578e-07, + "loss": 1.1924, + "step": 4398 + }, + { + "epoch": 0.5963532840778146, + "grad_norm": 1.9412446257225984, + "learning_rate": 7.395553195310364e-07, + "loss": 1.16, + "step": 4399 + }, + { + "epoch": 0.5964888497254796, + "grad_norm": 1.7841526205679217, + "learning_rate": 7.391313619295163e-07, + "loss": 1.1229, + "step": 4400 + }, + { + "epoch": 0.5966244153731445, + "grad_norm": 1.9280872403759732, + "learning_rate": 7.387074546344536e-07, + "loss": 1.1557, + "step": 4401 + }, + { + "epoch": 0.5967599810208093, + "grad_norm": 2.1313789724770684, + "learning_rate": 7.382835977275959e-07, + "loss": 1.1566, + "step": 4402 + }, + { + "epoch": 0.5968955466684742, + "grad_norm": 1.6641861546176075, + "learning_rate": 7.378597912906805e-07, + "loss": 1.1454, + "step": 4403 + }, + { + "epoch": 0.5970311123161391, + "grad_norm": 1.7573438622637871, + "learning_rate": 7.374360354054348e-07, + "loss": 1.1211, + "step": 4404 + }, + { + "epoch": 0.597166677963804, + "grad_norm": 1.8055705056548725, + "learning_rate": 7.370123301535777e-07, + "loss": 1.1607, + "step": 4405 + }, + { + "epoch": 0.5973022436114689, + "grad_norm": 1.699676438933352, + "learning_rate": 7.365886756168165e-07, + "loss": 1.1879, + "step": 4406 + }, + { + "epoch": 0.5974378092591337, + "grad_norm": 1.9255933329107109, + "learning_rate": 7.3616507187685e-07, + "loss": 1.17, + "step": 4407 + }, + { + "epoch": 0.5975733749067986, + "grad_norm": 2.3207933071982647, + "learning_rate": 7.357415190153666e-07, + "loss": 1.109, + 
"step": 4408 + }, + { + "epoch": 0.5977089405544636, + "grad_norm": 2.44157284058181, + "learning_rate": 7.353180171140455e-07, + "loss": 1.1619, + "step": 4409 + }, + { + "epoch": 0.5978445062021284, + "grad_norm": 2.812698522228516, + "learning_rate": 7.348945662545556e-07, + "loss": 1.1357, + "step": 4410 + }, + { + "epoch": 0.5979800718497933, + "grad_norm": 35.4977153933604, + "learning_rate": 7.34471166518556e-07, + "loss": 1.1701, + "step": 4411 + }, + { + "epoch": 0.5981156374974581, + "grad_norm": 2.3170133979558636, + "learning_rate": 7.340478179876957e-07, + "loss": 1.1772, + "step": 4412 + }, + { + "epoch": 0.598251203145123, + "grad_norm": 1.9490239900839814, + "learning_rate": 7.336245207436147e-07, + "loss": 1.1559, + "step": 4413 + }, + { + "epoch": 0.598386768792788, + "grad_norm": 1.7583624024687887, + "learning_rate": 7.332012748679419e-07, + "loss": 1.1296, + "step": 4414 + }, + { + "epoch": 0.5985223344404528, + "grad_norm": 1.6717093580338178, + "learning_rate": 7.327780804422977e-07, + "loss": 1.1679, + "step": 4415 + }, + { + "epoch": 0.5986579000881177, + "grad_norm": 2.544048334054225, + "learning_rate": 7.32354937548291e-07, + "loss": 1.1776, + "step": 4416 + }, + { + "epoch": 0.5987934657357825, + "grad_norm": 2.9269234076049777, + "learning_rate": 7.319318462675223e-07, + "loss": 1.1228, + "step": 4417 + }, + { + "epoch": 0.5989290313834474, + "grad_norm": 1.9494737054885858, + "learning_rate": 7.315088066815809e-07, + "loss": 1.1363, + "step": 4418 + }, + { + "epoch": 0.5990645970311124, + "grad_norm": 1.7952432721619318, + "learning_rate": 7.310858188720466e-07, + "loss": 1.1356, + "step": 4419 + }, + { + "epoch": 0.5992001626787772, + "grad_norm": 2.723023529990852, + "learning_rate": 7.306628829204897e-07, + "loss": 1.1434, + "step": 4420 + }, + { + "epoch": 0.5993357283264421, + "grad_norm": 1.649060922141291, + "learning_rate": 7.302399989084695e-07, + "loss": 1.1803, + "step": 4421 + }, + { + "epoch": 0.5994712939741069, + "grad_norm": 1.5688367197559214, + "learning_rate": 7.298171669175365e-07, + "loss": 1.1507, + "step": 4422 + }, + { + "epoch": 0.5996068596217718, + "grad_norm": 1.6968167995223364, + "learning_rate": 7.293943870292299e-07, + "loss": 1.158, + "step": 4423 + }, + { + "epoch": 0.5997424252694368, + "grad_norm": 2.781733086025854, + "learning_rate": 7.289716593250798e-07, + "loss": 1.1645, + "step": 4424 + }, + { + "epoch": 0.5998779909171016, + "grad_norm": 2.6656167283971968, + "learning_rate": 7.285489838866057e-07, + "loss": 1.1684, + "step": 4425 + }, + { + "epoch": 0.6000135565647665, + "grad_norm": 1.8886514202984759, + "learning_rate": 7.281263607953177e-07, + "loss": 1.1187, + "step": 4426 + }, + { + "epoch": 0.6001491222124313, + "grad_norm": 2.194927213556653, + "learning_rate": 7.277037901327145e-07, + "loss": 1.1293, + "step": 4427 + }, + { + "epoch": 0.6002846878600963, + "grad_norm": 2.2027314305512222, + "learning_rate": 7.272812719802865e-07, + "loss": 1.1581, + "step": 4428 + }, + { + "epoch": 0.6004202535077612, + "grad_norm": 1.8341820565221576, + "learning_rate": 7.268588064195122e-07, + "loss": 1.1461, + "step": 4429 + }, + { + "epoch": 0.600555819155426, + "grad_norm": 2.373588409353179, + "learning_rate": 7.264363935318612e-07, + "loss": 1.1575, + "step": 4430 + }, + { + "epoch": 0.6006913848030909, + "grad_norm": 1.6837403534767004, + "learning_rate": 7.260140333987925e-07, + "loss": 1.1528, + "step": 4431 + }, + { + "epoch": 0.6008269504507557, + "grad_norm": 1.786757304059328, + "learning_rate": 
7.255917261017543e-07, + "loss": 1.1493, + "step": 4432 + }, + { + "epoch": 0.6009625160984207, + "grad_norm": 1.8273818213234996, + "learning_rate": 7.25169471722186e-07, + "loss": 1.151, + "step": 4433 + }, + { + "epoch": 0.6010980817460856, + "grad_norm": 1.9702858825794614, + "learning_rate": 7.247472703415154e-07, + "loss": 1.1637, + "step": 4434 + }, + { + "epoch": 0.6012336473937504, + "grad_norm": 2.0055850345112907, + "learning_rate": 7.243251220411612e-07, + "loss": 1.2148, + "step": 4435 + }, + { + "epoch": 0.6013692130414153, + "grad_norm": 1.5382806979253116, + "learning_rate": 7.23903026902531e-07, + "loss": 1.1573, + "step": 4436 + }, + { + "epoch": 0.6015047786890801, + "grad_norm": 1.696462253304018, + "learning_rate": 7.234809850070231e-07, + "loss": 1.1089, + "step": 4437 + }, + { + "epoch": 0.6016403443367451, + "grad_norm": 2.4647879478564008, + "learning_rate": 7.230589964360242e-07, + "loss": 1.1339, + "step": 4438 + }, + { + "epoch": 0.60177590998441, + "grad_norm": 2.664773388688054, + "learning_rate": 7.226370612709119e-07, + "loss": 1.1544, + "step": 4439 + }, + { + "epoch": 0.6019114756320748, + "grad_norm": 2.553525729229021, + "learning_rate": 7.222151795930528e-07, + "loss": 1.1372, + "step": 4440 + }, + { + "epoch": 0.6020470412797397, + "grad_norm": 2.032025064702722, + "learning_rate": 7.21793351483804e-07, + "loss": 1.1838, + "step": 4441 + }, + { + "epoch": 0.6021826069274046, + "grad_norm": 1.771853687553065, + "learning_rate": 7.213715770245108e-07, + "loss": 1.1707, + "step": 4442 + }, + { + "epoch": 0.6023181725750695, + "grad_norm": 3.4375237164595203, + "learning_rate": 7.209498562965101e-07, + "loss": 1.1828, + "step": 4443 + }, + { + "epoch": 0.6024537382227344, + "grad_norm": 1.8820957919684904, + "learning_rate": 7.205281893811264e-07, + "loss": 1.1509, + "step": 4444 + }, + { + "epoch": 0.6025893038703992, + "grad_norm": 2.6648937720000867, + "learning_rate": 7.201065763596758e-07, + "loss": 1.1578, + "step": 4445 + }, + { + "epoch": 0.6027248695180641, + "grad_norm": 2.044804954188377, + "learning_rate": 7.196850173134628e-07, + "loss": 1.1314, + "step": 4446 + }, + { + "epoch": 0.602860435165729, + "grad_norm": 1.9797032981924738, + "learning_rate": 7.192635123237809e-07, + "loss": 1.1377, + "step": 4447 + }, + { + "epoch": 0.6029960008133939, + "grad_norm": 2.0183260146167017, + "learning_rate": 7.188420614719152e-07, + "loss": 1.1727, + "step": 4448 + }, + { + "epoch": 0.6031315664610588, + "grad_norm": 1.7610607083027399, + "learning_rate": 7.184206648391381e-07, + "loss": 1.1606, + "step": 4449 + }, + { + "epoch": 0.6032671321087236, + "grad_norm": 1.9120140821482667, + "learning_rate": 7.179993225067136e-07, + "loss": 1.1809, + "step": 4450 + }, + { + "epoch": 0.6034026977563886, + "grad_norm": 5.9849601594383, + "learning_rate": 7.175780345558934e-07, + "loss": 1.1362, + "step": 4451 + }, + { + "epoch": 0.6035382634040534, + "grad_norm": 1.8565160004912011, + "learning_rate": 7.171568010679203e-07, + "loss": 1.1455, + "step": 4452 + }, + { + "epoch": 0.6036738290517183, + "grad_norm": 2.236492822314479, + "learning_rate": 7.167356221240251e-07, + "loss": 1.1334, + "step": 4453 + }, + { + "epoch": 0.6038093946993832, + "grad_norm": 5.932292706950365, + "learning_rate": 7.163144978054296e-07, + "loss": 1.1602, + "step": 4454 + }, + { + "epoch": 0.603944960347048, + "grad_norm": 1.9299585064270592, + "learning_rate": 7.158934281933435e-07, + "loss": 1.1282, + "step": 4455 + }, + { + "epoch": 0.604080525994713, + "grad_norm": 
2.5459757950208544, + "learning_rate": 7.154724133689676e-07, + "loss": 1.1703, + "step": 4456 + }, + { + "epoch": 0.6042160916423778, + "grad_norm": 2.1920465266108287, + "learning_rate": 7.150514534134905e-07, + "loss": 1.1826, + "step": 4457 + }, + { + "epoch": 0.6043516572900427, + "grad_norm": 1.7561755752436101, + "learning_rate": 7.146305484080916e-07, + "loss": 1.1292, + "step": 4458 + }, + { + "epoch": 0.6044872229377076, + "grad_norm": 3.7525359927038333, + "learning_rate": 7.142096984339392e-07, + "loss": 1.1563, + "step": 4459 + }, + { + "epoch": 0.6046227885853724, + "grad_norm": 2.365886084151196, + "learning_rate": 7.137889035721898e-07, + "loss": 1.1271, + "step": 4460 + }, + { + "epoch": 0.6047583542330374, + "grad_norm": 2.2407563899815712, + "learning_rate": 7.133681639039917e-07, + "loss": 1.1428, + "step": 4461 + }, + { + "epoch": 0.6048939198807022, + "grad_norm": 4.349679079273553, + "learning_rate": 7.129474795104802e-07, + "loss": 1.1551, + "step": 4462 + }, + { + "epoch": 0.6050294855283671, + "grad_norm": 1.6293216338000829, + "learning_rate": 7.12526850472782e-07, + "loss": 1.1671, + "step": 4463 + }, + { + "epoch": 0.605165051176032, + "grad_norm": 19.711965952824613, + "learning_rate": 7.121062768720109e-07, + "loss": 1.1129, + "step": 4464 + }, + { + "epoch": 0.6053006168236968, + "grad_norm": 3.919546631010359, + "learning_rate": 7.116857587892724e-07, + "loss": 1.1115, + "step": 4465 + }, + { + "epoch": 0.6054361824713618, + "grad_norm": 3.2299985397048507, + "learning_rate": 7.112652963056589e-07, + "loss": 1.1087, + "step": 4466 + }, + { + "epoch": 0.6055717481190266, + "grad_norm": 2.6090308311241843, + "learning_rate": 7.108448895022544e-07, + "loss": 1.1127, + "step": 4467 + }, + { + "epoch": 0.6057073137666915, + "grad_norm": 1.7000499169383991, + "learning_rate": 7.104245384601303e-07, + "loss": 1.1221, + "step": 4468 + }, + { + "epoch": 0.6058428794143564, + "grad_norm": 1.6783780482573676, + "learning_rate": 7.100042432603481e-07, + "loss": 1.1356, + "step": 4469 + }, + { + "epoch": 0.6059784450620213, + "grad_norm": 2.0347764661978296, + "learning_rate": 7.095840039839587e-07, + "loss": 1.1554, + "step": 4470 + }, + { + "epoch": 0.6061140107096862, + "grad_norm": 1.9919765187534377, + "learning_rate": 7.091638207120015e-07, + "loss": 1.1772, + "step": 4471 + }, + { + "epoch": 0.606249576357351, + "grad_norm": 1.7752403873564933, + "learning_rate": 7.087436935255058e-07, + "loss": 1.1556, + "step": 4472 + }, + { + "epoch": 0.6063851420050159, + "grad_norm": 2.071774589947884, + "learning_rate": 7.083236225054901e-07, + "loss": 1.174, + "step": 4473 + }, + { + "epoch": 0.6065207076526808, + "grad_norm": 2.490234011291386, + "learning_rate": 7.079036077329612e-07, + "loss": 1.152, + "step": 4474 + }, + { + "epoch": 0.6066562733003457, + "grad_norm": 1.8013425594583883, + "learning_rate": 7.074836492889158e-07, + "loss": 1.1678, + "step": 4475 + }, + { + "epoch": 0.6067918389480106, + "grad_norm": 1.9874176191281492, + "learning_rate": 7.070637472543397e-07, + "loss": 1.1564, + "step": 4476 + }, + { + "epoch": 0.6069274045956754, + "grad_norm": 2.384196182913291, + "learning_rate": 7.066439017102076e-07, + "loss": 1.1579, + "step": 4477 + }, + { + "epoch": 0.6070629702433403, + "grad_norm": 1.6727467392581248, + "learning_rate": 7.062241127374838e-07, + "loss": 1.0831, + "step": 4478 + }, + { + "epoch": 0.6071985358910053, + "grad_norm": 2.177316964577621, + "learning_rate": 7.058043804171203e-07, + "loss": 1.166, + "step": 4479 + }, + { + "epoch": 
0.6073341015386701, + "grad_norm": 4.339211080473473, + "learning_rate": 7.053847048300603e-07, + "loss": 1.1171, + "step": 4480 + }, + { + "epoch": 0.607469667186335, + "grad_norm": 1.6945698666896285, + "learning_rate": 7.04965086057234e-07, + "loss": 1.152, + "step": 4481 + }, + { + "epoch": 0.6076052328339999, + "grad_norm": 1.5950361899848255, + "learning_rate": 7.045455241795624e-07, + "loss": 1.1372, + "step": 4482 + }, + { + "epoch": 0.6077407984816647, + "grad_norm": 1.7226166529426632, + "learning_rate": 7.041260192779539e-07, + "loss": 1.1623, + "step": 4483 + }, + { + "epoch": 0.6078763641293297, + "grad_norm": 2.022488615384596, + "learning_rate": 7.037065714333075e-07, + "loss": 1.1532, + "step": 4484 + }, + { + "epoch": 0.6080119297769945, + "grad_norm": 1.58490412806936, + "learning_rate": 7.032871807265096e-07, + "loss": 1.164, + "step": 4485 + }, + { + "epoch": 0.6081474954246594, + "grad_norm": 1.7948514153862134, + "learning_rate": 7.028678472384373e-07, + "loss": 1.151, + "step": 4486 + }, + { + "epoch": 0.6082830610723243, + "grad_norm": 1.959544205627284, + "learning_rate": 7.02448571049955e-07, + "loss": 1.146, + "step": 4487 + }, + { + "epoch": 0.6084186267199891, + "grad_norm": 1.870283885042002, + "learning_rate": 7.020293522419168e-07, + "loss": 1.1574, + "step": 4488 + }, + { + "epoch": 0.6085541923676541, + "grad_norm": 1.7395486985756918, + "learning_rate": 7.016101908951663e-07, + "loss": 1.1705, + "step": 4489 + }, + { + "epoch": 0.6086897580153189, + "grad_norm": 1.5799372564585918, + "learning_rate": 7.011910870905349e-07, + "loss": 1.0903, + "step": 4490 + }, + { + "epoch": 0.6088253236629838, + "grad_norm": 1.9217811992438067, + "learning_rate": 7.00772040908844e-07, + "loss": 1.1316, + "step": 4491 + }, + { + "epoch": 0.6089608893106487, + "grad_norm": 2.2756535867092875, + "learning_rate": 7.003530524309025e-07, + "loss": 1.1221, + "step": 4492 + }, + { + "epoch": 0.6090964549583135, + "grad_norm": 2.59098936920798, + "learning_rate": 6.999341217375103e-07, + "loss": 1.133, + "step": 4493 + }, + { + "epoch": 0.6092320206059785, + "grad_norm": 3.7976754236872368, + "learning_rate": 6.995152489094535e-07, + "loss": 1.1606, + "step": 4494 + }, + { + "epoch": 0.6093675862536433, + "grad_norm": 1.5994994356830625, + "learning_rate": 6.990964340275095e-07, + "loss": 1.177, + "step": 4495 + }, + { + "epoch": 0.6095031519013082, + "grad_norm": 2.010276401979071, + "learning_rate": 6.986776771724427e-07, + "loss": 1.1796, + "step": 4496 + }, + { + "epoch": 0.6096387175489731, + "grad_norm": 1.8882564985614354, + "learning_rate": 6.982589784250077e-07, + "loss": 1.1771, + "step": 4497 + }, + { + "epoch": 0.609774283196638, + "grad_norm": 2.1020395087191823, + "learning_rate": 6.978403378659466e-07, + "loss": 1.1426, + "step": 4498 + }, + { + "epoch": 0.6099098488443029, + "grad_norm": 1.6601507401124977, + "learning_rate": 6.974217555759913e-07, + "loss": 1.1467, + "step": 4499 + }, + { + "epoch": 0.6100454144919677, + "grad_norm": 2.1533177567510196, + "learning_rate": 6.970032316358623e-07, + "loss": 1.1648, + "step": 4500 + }, + { + "epoch": 0.6101809801396326, + "grad_norm": 2.757696114772496, + "learning_rate": 6.965847661262681e-07, + "loss": 1.1665, + "step": 4501 + }, + { + "epoch": 0.6103165457872975, + "grad_norm": 1.9205626195922012, + "learning_rate": 6.96166359127907e-07, + "loss": 1.1135, + "step": 4502 + }, + { + "epoch": 0.6104521114349624, + "grad_norm": 1.6822371596887427, + "learning_rate": 6.957480107214648e-07, + "loss": 1.1373, + "step": 
4503 + }, + { + "epoch": 0.6105876770826273, + "grad_norm": 5.462120940014232, + "learning_rate": 6.953297209876174e-07, + "loss": 1.1359, + "step": 4504 + }, + { + "epoch": 0.6107232427302921, + "grad_norm": 1.7637277727981902, + "learning_rate": 6.949114900070284e-07, + "loss": 1.0948, + "step": 4505 + }, + { + "epoch": 0.610858808377957, + "grad_norm": 1.8850864188829937, + "learning_rate": 6.944933178603503e-07, + "loss": 1.1516, + "step": 4506 + }, + { + "epoch": 0.610994374025622, + "grad_norm": 2.0393739295770366, + "learning_rate": 6.940752046282242e-07, + "loss": 1.1613, + "step": 4507 + }, + { + "epoch": 0.6111299396732868, + "grad_norm": 3.0327684527232, + "learning_rate": 6.936571503912803e-07, + "loss": 1.1656, + "step": 4508 + }, + { + "epoch": 0.6112655053209517, + "grad_norm": 2.0038722635005035, + "learning_rate": 6.932391552301366e-07, + "loss": 1.1835, + "step": 4509 + }, + { + "epoch": 0.6114010709686165, + "grad_norm": 2.0995268930031554, + "learning_rate": 6.928212192254006e-07, + "loss": 1.1691, + "step": 4510 + }, + { + "epoch": 0.6115366366162814, + "grad_norm": 2.060447174457294, + "learning_rate": 6.924033424576674e-07, + "loss": 1.1794, + "step": 4511 + }, + { + "epoch": 0.6116722022639464, + "grad_norm": 2.655159127060646, + "learning_rate": 6.91985525007522e-07, + "loss": 1.1145, + "step": 4512 + }, + { + "epoch": 0.6118077679116112, + "grad_norm": 1.994999145453265, + "learning_rate": 6.915677669555363e-07, + "loss": 1.1846, + "step": 4513 + }, + { + "epoch": 0.6119433335592761, + "grad_norm": 3.0282375360986125, + "learning_rate": 6.911500683822726e-07, + "loss": 1.1632, + "step": 4514 + }, + { + "epoch": 0.6120788992069409, + "grad_norm": 2.5880903463832055, + "learning_rate": 6.907324293682803e-07, + "loss": 1.1899, + "step": 4515 + }, + { + "epoch": 0.6122144648546058, + "grad_norm": 2.0430380665910355, + "learning_rate": 6.903148499940974e-07, + "loss": 1.1184, + "step": 4516 + }, + { + "epoch": 0.6123500305022708, + "grad_norm": 1.8156792895027891, + "learning_rate": 6.898973303402516e-07, + "loss": 1.1438, + "step": 4517 + }, + { + "epoch": 0.6124855961499356, + "grad_norm": 2.05018063437125, + "learning_rate": 6.894798704872574e-07, + "loss": 1.1393, + "step": 4518 + }, + { + "epoch": 0.6126211617976005, + "grad_norm": 2.17789980564529, + "learning_rate": 6.890624705156194e-07, + "loss": 1.1462, + "step": 4519 + }, + { + "epoch": 0.6127567274452653, + "grad_norm": 2.0080904151795087, + "learning_rate": 6.886451305058293e-07, + "loss": 1.1417, + "step": 4520 + }, + { + "epoch": 0.6128922930929303, + "grad_norm": 1.891485464832641, + "learning_rate": 6.882278505383685e-07, + "loss": 1.1634, + "step": 4521 + }, + { + "epoch": 0.6130278587405952, + "grad_norm": 2.455648915543942, + "learning_rate": 6.878106306937053e-07, + "loss": 1.1166, + "step": 4522 + }, + { + "epoch": 0.61316342438826, + "grad_norm": 1.713482915833575, + "learning_rate": 6.873934710522979e-07, + "loss": 1.1323, + "step": 4523 + }, + { + "epoch": 0.6132989900359249, + "grad_norm": 2.279172320896996, + "learning_rate": 6.86976371694592e-07, + "loss": 1.1741, + "step": 4524 + }, + { + "epoch": 0.6134345556835897, + "grad_norm": 2.156277462863712, + "learning_rate": 6.865593327010221e-07, + "loss": 1.1116, + "step": 4525 + }, + { + "epoch": 0.6135701213312547, + "grad_norm": 1.6944927392602454, + "learning_rate": 6.861423541520104e-07, + "loss": 1.1926, + "step": 4526 + }, + { + "epoch": 0.6137056869789196, + "grad_norm": 1.8360970063870652, + "learning_rate": 6.857254361279688e-07, + 
"loss": 1.1245, + "step": 4527 + }, + { + "epoch": 0.6138412526265844, + "grad_norm": 2.2279778356309987, + "learning_rate": 6.853085787092956e-07, + "loss": 1.1386, + "step": 4528 + }, + { + "epoch": 0.6139768182742493, + "grad_norm": 2.318788100395138, + "learning_rate": 6.848917819763793e-07, + "loss": 1.1711, + "step": 4529 + }, + { + "epoch": 0.6141123839219141, + "grad_norm": 1.8934468615318707, + "learning_rate": 6.844750460095956e-07, + "loss": 1.1392, + "step": 4530 + }, + { + "epoch": 0.6142479495695791, + "grad_norm": 2.5352851502137197, + "learning_rate": 6.840583708893083e-07, + "loss": 1.1654, + "step": 4531 + }, + { + "epoch": 0.614383515217244, + "grad_norm": 2.1555311032200364, + "learning_rate": 6.836417566958707e-07, + "loss": 1.2166, + "step": 4532 + }, + { + "epoch": 0.6145190808649088, + "grad_norm": 1.7068806295120194, + "learning_rate": 6.832252035096227e-07, + "loss": 1.144, + "step": 4533 + }, + { + "epoch": 0.6146546465125737, + "grad_norm": 1.716303026424118, + "learning_rate": 6.82808711410894e-07, + "loss": 1.1997, + "step": 4534 + }, + { + "epoch": 0.6147902121602385, + "grad_norm": 1.714464321049465, + "learning_rate": 6.823922804800016e-07, + "loss": 1.1429, + "step": 4535 + }, + { + "epoch": 0.6149257778079035, + "grad_norm": 2.257354039475894, + "learning_rate": 6.819759107972507e-07, + "loss": 1.2039, + "step": 4536 + }, + { + "epoch": 0.6150613434555684, + "grad_norm": 1.643758394917728, + "learning_rate": 6.815596024429351e-07, + "loss": 1.149, + "step": 4537 + }, + { + "epoch": 0.6151969091032332, + "grad_norm": 2.2953677280013904, + "learning_rate": 6.811433554973366e-07, + "loss": 1.2074, + "step": 4538 + }, + { + "epoch": 0.6153324747508981, + "grad_norm": 3.21499891416038, + "learning_rate": 6.807271700407251e-07, + "loss": 1.127, + "step": 4539 + }, + { + "epoch": 0.615468040398563, + "grad_norm": 1.9483849086507672, + "learning_rate": 6.803110461533587e-07, + "loss": 1.1039, + "step": 4540 + }, + { + "epoch": 0.6156036060462279, + "grad_norm": 1.875958057725106, + "learning_rate": 6.798949839154834e-07, + "loss": 1.1607, + "step": 4541 + }, + { + "epoch": 0.6157391716938928, + "grad_norm": 1.7525032276203893, + "learning_rate": 6.79478983407334e-07, + "loss": 1.1079, + "step": 4542 + }, + { + "epoch": 0.6158747373415576, + "grad_norm": 1.6119352969426006, + "learning_rate": 6.790630447091325e-07, + "loss": 1.1353, + "step": 4543 + }, + { + "epoch": 0.6160103029892225, + "grad_norm": 2.627310357905016, + "learning_rate": 6.786471679010895e-07, + "loss": 1.1367, + "step": 4544 + }, + { + "epoch": 0.6161458686368874, + "grad_norm": 3.5877887512869533, + "learning_rate": 6.782313530634036e-07, + "loss": 1.169, + "step": 4545 + }, + { + "epoch": 0.6162814342845523, + "grad_norm": 1.8463849515185953, + "learning_rate": 6.77815600276261e-07, + "loss": 1.1407, + "step": 4546 + }, + { + "epoch": 0.6164169999322172, + "grad_norm": 1.9017236171139176, + "learning_rate": 6.773999096198373e-07, + "loss": 1.1373, + "step": 4547 + }, + { + "epoch": 0.616552565579882, + "grad_norm": 2.3186897503625037, + "learning_rate": 6.769842811742941e-07, + "loss": 1.1242, + "step": 4548 + }, + { + "epoch": 0.616688131227547, + "grad_norm": 2.2044138005632443, + "learning_rate": 6.765687150197827e-07, + "loss": 1.1716, + "step": 4549 + }, + { + "epoch": 0.6168236968752118, + "grad_norm": 4.122880851631573, + "learning_rate": 6.761532112364414e-07, + "loss": 1.1473, + "step": 4550 + }, + { + "epoch": 0.6169592625228767, + "grad_norm": 3.8261217766325424, + "learning_rate": 
6.757377699043976e-07, + "loss": 1.1588, + "step": 4551 + }, + { + "epoch": 0.6170948281705416, + "grad_norm": 2.8183630868602902, + "learning_rate": 6.753223911037646e-07, + "loss": 1.174, + "step": 4552 + }, + { + "epoch": 0.6172303938182064, + "grad_norm": 1.897106112271481, + "learning_rate": 6.749070749146461e-07, + "loss": 1.1749, + "step": 4553 + }, + { + "epoch": 0.6173659594658714, + "grad_norm": 2.5730563332414977, + "learning_rate": 6.744918214171318e-07, + "loss": 1.1515, + "step": 4554 + }, + { + "epoch": 0.6175015251135362, + "grad_norm": 6.445622024673837, + "learning_rate": 6.740766306913007e-07, + "loss": 1.1448, + "step": 4555 + }, + { + "epoch": 0.6176370907612011, + "grad_norm": 1.716140377688948, + "learning_rate": 6.736615028172183e-07, + "loss": 1.1952, + "step": 4556 + }, + { + "epoch": 0.617772656408866, + "grad_norm": 2.0304974378734384, + "learning_rate": 6.732464378749394e-07, + "loss": 1.1434, + "step": 4557 + }, + { + "epoch": 0.6179082220565308, + "grad_norm": 2.464390142087097, + "learning_rate": 6.728314359445058e-07, + "loss": 1.2206, + "step": 4558 + }, + { + "epoch": 0.6180437877041958, + "grad_norm": 1.832988762768351, + "learning_rate": 6.724164971059469e-07, + "loss": 1.1451, + "step": 4559 + }, + { + "epoch": 0.6181793533518606, + "grad_norm": 2.040120374836048, + "learning_rate": 6.720016214392812e-07, + "loss": 1.1336, + "step": 4560 + }, + { + "epoch": 0.6183149189995255, + "grad_norm": 1.8352684501302625, + "learning_rate": 6.715868090245131e-07, + "loss": 1.1564, + "step": 4561 + }, + { + "epoch": 0.6184504846471904, + "grad_norm": 2.0005420942698633, + "learning_rate": 6.711720599416373e-07, + "loss": 1.0973, + "step": 4562 + }, + { + "epoch": 0.6185860502948552, + "grad_norm": 1.793816836421142, + "learning_rate": 6.707573742706334e-07, + "loss": 1.155, + "step": 4563 + }, + { + "epoch": 0.6187216159425202, + "grad_norm": 5.975746422541012, + "learning_rate": 6.703427520914715e-07, + "loss": 1.1063, + "step": 4564 + }, + { + "epoch": 0.6188571815901851, + "grad_norm": 2.6846687413968877, + "learning_rate": 6.699281934841073e-07, + "loss": 1.171, + "step": 4565 + }, + { + "epoch": 0.6189927472378499, + "grad_norm": 3.2252633242847835, + "learning_rate": 6.69513698528486e-07, + "loss": 1.1499, + "step": 4566 + }, + { + "epoch": 0.6191283128855148, + "grad_norm": 2.112729338518887, + "learning_rate": 6.69099267304539e-07, + "loss": 1.1443, + "step": 4567 + }, + { + "epoch": 0.6192638785331797, + "grad_norm": 4.577903080857328, + "learning_rate": 6.686848998921864e-07, + "loss": 1.1582, + "step": 4568 + }, + { + "epoch": 0.6193994441808446, + "grad_norm": 2.3632876535818834, + "learning_rate": 6.682705963713355e-07, + "loss": 1.1649, + "step": 4569 + }, + { + "epoch": 0.6195350098285095, + "grad_norm": 2.9269755123934953, + "learning_rate": 6.678563568218816e-07, + "loss": 1.1394, + "step": 4570 + }, + { + "epoch": 0.6196705754761743, + "grad_norm": 1.8032143265848448, + "learning_rate": 6.674421813237079e-07, + "loss": 1.2033, + "step": 4571 + }, + { + "epoch": 0.6198061411238392, + "grad_norm": 2.3611466589985226, + "learning_rate": 6.670280699566841e-07, + "loss": 1.1331, + "step": 4572 + }, + { + "epoch": 0.6199417067715041, + "grad_norm": 2.0439623872045605, + "learning_rate": 6.666140228006687e-07, + "loss": 1.1054, + "step": 4573 + }, + { + "epoch": 0.620077272419169, + "grad_norm": 3.3601811548423406, + "learning_rate": 6.662000399355075e-07, + "loss": 1.1952, + "step": 4574 + }, + { + "epoch": 0.6202128380668339, + "grad_norm": 
5.394678259345336, + "learning_rate": 6.657861214410338e-07, + "loss": 1.1476, + "step": 4575 + }, + { + "epoch": 0.6203484037144987, + "grad_norm": 2.9545357291719023, + "learning_rate": 6.653722673970681e-07, + "loss": 1.158, + "step": 4576 + }, + { + "epoch": 0.6204839693621637, + "grad_norm": 1.860975016047937, + "learning_rate": 6.649584778834196e-07, + "loss": 1.1217, + "step": 4577 + }, + { + "epoch": 0.6206195350098285, + "grad_norm": 1.799523448738814, + "learning_rate": 6.645447529798838e-07, + "loss": 1.1408, + "step": 4578 + }, + { + "epoch": 0.6207551006574934, + "grad_norm": 1.8760426185659989, + "learning_rate": 6.641310927662447e-07, + "loss": 1.1363, + "step": 4579 + }, + { + "epoch": 0.6208906663051583, + "grad_norm": 2.961171354100362, + "learning_rate": 6.637174973222727e-07, + "loss": 1.1398, + "step": 4580 + }, + { + "epoch": 0.6210262319528231, + "grad_norm": 1.9469428964921829, + "learning_rate": 6.633039667277274e-07, + "loss": 1.1643, + "step": 4581 + }, + { + "epoch": 0.6211617976004881, + "grad_norm": 5.380615414003732, + "learning_rate": 6.62890501062354e-07, + "loss": 1.1494, + "step": 4582 + }, + { + "epoch": 0.6212973632481529, + "grad_norm": 2.042787329312657, + "learning_rate": 6.624771004058868e-07, + "loss": 1.1592, + "step": 4583 + }, + { + "epoch": 0.6214329288958178, + "grad_norm": 2.618315235787418, + "learning_rate": 6.620637648380463e-07, + "loss": 1.1671, + "step": 4584 + }, + { + "epoch": 0.6215684945434827, + "grad_norm": 1.9859185140592288, + "learning_rate": 6.616504944385415e-07, + "loss": 1.1256, + "step": 4585 + }, + { + "epoch": 0.6217040601911475, + "grad_norm": 2.6291763005482505, + "learning_rate": 6.612372892870681e-07, + "loss": 1.1748, + "step": 4586 + }, + { + "epoch": 0.6218396258388125, + "grad_norm": 2.0488987987044607, + "learning_rate": 6.608241494633092e-07, + "loss": 1.1557, + "step": 4587 + }, + { + "epoch": 0.6219751914864773, + "grad_norm": 2.2692245452547892, + "learning_rate": 6.604110750469358e-07, + "loss": 1.1538, + "step": 4588 + }, + { + "epoch": 0.6221107571341422, + "grad_norm": 1.7629527170172337, + "learning_rate": 6.599980661176059e-07, + "loss": 1.1567, + "step": 4589 + }, + { + "epoch": 0.6222463227818071, + "grad_norm": 1.810825686120638, + "learning_rate": 6.595851227549656e-07, + "loss": 1.1213, + "step": 4590 + }, + { + "epoch": 0.622381888429472, + "grad_norm": 1.62866428420446, + "learning_rate": 6.591722450386468e-07, + "loss": 1.1447, + "step": 4591 + }, + { + "epoch": 0.6225174540771369, + "grad_norm": 2.2211487300618313, + "learning_rate": 6.587594330482707e-07, + "loss": 1.2017, + "step": 4592 + }, + { + "epoch": 0.6226530197248017, + "grad_norm": 1.6703101269783895, + "learning_rate": 6.583466868634437e-07, + "loss": 1.1612, + "step": 4593 + }, + { + "epoch": 0.6227885853724666, + "grad_norm": 1.885581918158365, + "learning_rate": 6.579340065637619e-07, + "loss": 1.1458, + "step": 4594 + }, + { + "epoch": 0.6229241510201315, + "grad_norm": 2.0438542587956636, + "learning_rate": 6.575213922288064e-07, + "loss": 1.1411, + "step": 4595 + }, + { + "epoch": 0.6230597166677964, + "grad_norm": 1.6523884890754001, + "learning_rate": 6.571088439381475e-07, + "loss": 1.1591, + "step": 4596 + }, + { + "epoch": 0.6231952823154613, + "grad_norm": 2.8991394619496207, + "learning_rate": 6.566963617713412e-07, + "loss": 1.1781, + "step": 4597 + }, + { + "epoch": 0.6233308479631261, + "grad_norm": 2.224948142918635, + "learning_rate": 6.562839458079315e-07, + "loss": 1.1398, + "step": 4598 + }, + { + "epoch": 
0.623466413610791, + "grad_norm": 2.220957715951851, + "learning_rate": 6.558715961274501e-07, + "loss": 1.1388, + "step": 4599 + }, + { + "epoch": 0.623601979258456, + "grad_norm": 1.7033733157026878, + "learning_rate": 6.554593128094145e-07, + "loss": 1.1459, + "step": 4600 + }, + { + "epoch": 0.6237375449061208, + "grad_norm": 2.4674935744538393, + "learning_rate": 6.550470959333313e-07, + "loss": 1.1324, + "step": 4601 + }, + { + "epoch": 0.6238731105537857, + "grad_norm": 2.148744808968567, + "learning_rate": 6.546349455786925e-07, + "loss": 1.1464, + "step": 4602 + }, + { + "epoch": 0.6240086762014505, + "grad_norm": 2.1804129044836014, + "learning_rate": 6.542228618249784e-07, + "loss": 1.1907, + "step": 4603 + }, + { + "epoch": 0.6241442418491154, + "grad_norm": 1.862038154867224, + "learning_rate": 6.538108447516557e-07, + "loss": 1.162, + "step": 4604 + }, + { + "epoch": 0.6242798074967804, + "grad_norm": 2.0227679843107436, + "learning_rate": 6.533988944381792e-07, + "loss": 1.1601, + "step": 4605 + }, + { + "epoch": 0.6244153731444452, + "grad_norm": 1.928960319372985, + "learning_rate": 6.529870109639899e-07, + "loss": 1.1377, + "step": 4606 + }, + { + "epoch": 0.6245509387921101, + "grad_norm": 1.908166005542769, + "learning_rate": 6.525751944085166e-07, + "loss": 1.1705, + "step": 4607 + }, + { + "epoch": 0.6246865044397749, + "grad_norm": 1.795259196665218, + "learning_rate": 6.521634448511743e-07, + "loss": 1.1547, + "step": 4608 + }, + { + "epoch": 0.6248220700874398, + "grad_norm": 12.388149621668687, + "learning_rate": 6.517517623713664e-07, + "loss": 1.1772, + "step": 4609 + }, + { + "epoch": 0.6249576357351048, + "grad_norm": 1.9936829196148986, + "learning_rate": 6.513401470484817e-07, + "loss": 1.1204, + "step": 4610 + }, + { + "epoch": 0.6250932013827696, + "grad_norm": 2.711483504980579, + "learning_rate": 6.50928598961898e-07, + "loss": 1.1648, + "step": 4611 + }, + { + "epoch": 0.6252287670304345, + "grad_norm": 1.5900580255449752, + "learning_rate": 6.505171181909782e-07, + "loss": 1.1514, + "step": 4612 + }, + { + "epoch": 0.6253643326780993, + "grad_norm": 2.2793447110837954, + "learning_rate": 6.501057048150738e-07, + "loss": 1.1574, + "step": 4613 + }, + { + "epoch": 0.6254998983257642, + "grad_norm": 2.1472263148411996, + "learning_rate": 6.496943589135225e-07, + "loss": 1.1509, + "step": 4614 + }, + { + "epoch": 0.6256354639734292, + "grad_norm": 2.0116769143348057, + "learning_rate": 6.492830805656484e-07, + "loss": 1.1699, + "step": 4615 + }, + { + "epoch": 0.625771029621094, + "grad_norm": 1.6970288506378373, + "learning_rate": 6.488718698507643e-07, + "loss": 1.1292, + "step": 4616 + }, + { + "epoch": 0.6259065952687589, + "grad_norm": 2.8066878694821376, + "learning_rate": 6.484607268481681e-07, + "loss": 1.1872, + "step": 4617 + }, + { + "epoch": 0.6260421609164237, + "grad_norm": 1.995711583849673, + "learning_rate": 6.480496516371461e-07, + "loss": 1.172, + "step": 4618 + }, + { + "epoch": 0.6261777265640887, + "grad_norm": 1.7162075141677813, + "learning_rate": 6.476386442969703e-07, + "loss": 1.1393, + "step": 4619 + }, + { + "epoch": 0.6263132922117536, + "grad_norm": 1.6189420641000578, + "learning_rate": 6.472277049069011e-07, + "loss": 1.1251, + "step": 4620 + }, + { + "epoch": 0.6264488578594184, + "grad_norm": 3.2207414825986103, + "learning_rate": 6.468168335461839e-07, + "loss": 1.1587, + "step": 4621 + }, + { + "epoch": 0.6265844235070833, + "grad_norm": 6.816262381987979, + "learning_rate": 6.464060302940528e-07, + "loss": 1.1277, + 
"step": 4622 + }, + { + "epoch": 0.6267199891547481, + "grad_norm": 1.8344430760244175, + "learning_rate": 6.459952952297274e-07, + "loss": 1.156, + "step": 4623 + }, + { + "epoch": 0.6268555548024131, + "grad_norm": 2.1949686712668024, + "learning_rate": 6.455846284324153e-07, + "loss": 1.1655, + "step": 4624 + }, + { + "epoch": 0.626991120450078, + "grad_norm": 2.3097963213671124, + "learning_rate": 6.451740299813097e-07, + "loss": 1.1467, + "step": 4625 + }, + { + "epoch": 0.6271266860977428, + "grad_norm": 1.8644405436467038, + "learning_rate": 6.447634999555919e-07, + "loss": 1.179, + "step": 4626 + }, + { + "epoch": 0.6272622517454077, + "grad_norm": 1.9171067667028903, + "learning_rate": 6.443530384344291e-07, + "loss": 1.1894, + "step": 4627 + }, + { + "epoch": 0.6273978173930725, + "grad_norm": 3.3934888638275167, + "learning_rate": 6.439426454969752e-07, + "loss": 1.138, + "step": 4628 + }, + { + "epoch": 0.6275333830407375, + "grad_norm": 1.9278089006750638, + "learning_rate": 6.435323212223718e-07, + "loss": 1.1369, + "step": 4629 + }, + { + "epoch": 0.6276689486884024, + "grad_norm": 1.989465232836569, + "learning_rate": 6.431220656897463e-07, + "loss": 1.1075, + "step": 4630 + }, + { + "epoch": 0.6278045143360672, + "grad_norm": 1.7086171175524372, + "learning_rate": 6.427118789782136e-07, + "loss": 1.1699, + "step": 4631 + }, + { + "epoch": 0.6279400799837321, + "grad_norm": 1.9938944638891725, + "learning_rate": 6.423017611668744e-07, + "loss": 1.1382, + "step": 4632 + }, + { + "epoch": 0.628075645631397, + "grad_norm": 1.8621176129101669, + "learning_rate": 6.418917123348176e-07, + "loss": 1.182, + "step": 4633 + }, + { + "epoch": 0.6282112112790619, + "grad_norm": 1.705305823715603, + "learning_rate": 6.41481732561117e-07, + "loss": 1.142, + "step": 4634 + }, + { + "epoch": 0.6283467769267268, + "grad_norm": 2.4660823359419948, + "learning_rate": 6.410718219248344e-07, + "loss": 1.174, + "step": 4635 + }, + { + "epoch": 0.6284823425743916, + "grad_norm": 2.9763438774261597, + "learning_rate": 6.406619805050177e-07, + "loss": 1.1367, + "step": 4636 + }, + { + "epoch": 0.6286179082220565, + "grad_norm": 2.037797040136, + "learning_rate": 6.402522083807016e-07, + "loss": 1.1719, + "step": 4637 + }, + { + "epoch": 0.6287534738697214, + "grad_norm": 3.113929954902727, + "learning_rate": 6.398425056309073e-07, + "loss": 1.1502, + "step": 4638 + }, + { + "epoch": 0.6288890395173863, + "grad_norm": 1.7157930493135714, + "learning_rate": 6.394328723346433e-07, + "loss": 1.1591, + "step": 4639 + }, + { + "epoch": 0.6290246051650512, + "grad_norm": 2.0464934981125804, + "learning_rate": 6.390233085709034e-07, + "loss": 1.166, + "step": 4640 + }, + { + "epoch": 0.629160170812716, + "grad_norm": 2.1166053409908914, + "learning_rate": 6.386138144186693e-07, + "loss": 1.1524, + "step": 4641 + }, + { + "epoch": 0.629295736460381, + "grad_norm": 1.9131114256685207, + "learning_rate": 6.382043899569083e-07, + "loss": 1.1299, + "step": 4642 + }, + { + "epoch": 0.6294313021080459, + "grad_norm": 2.0354670098689533, + "learning_rate": 6.377950352645748e-07, + "loss": 1.1261, + "step": 4643 + }, + { + "epoch": 0.6295668677557107, + "grad_norm": 4.115368316481516, + "learning_rate": 6.373857504206099e-07, + "loss": 1.1522, + "step": 4644 + }, + { + "epoch": 0.6297024334033756, + "grad_norm": 1.6681745994529034, + "learning_rate": 6.369765355039405e-07, + "loss": 1.1451, + "step": 4645 + }, + { + "epoch": 0.6298379990510404, + "grad_norm": 1.9194934440028328, + "learning_rate": 
6.365673905934809e-07, + "loss": 1.1038, + "step": 4646 + }, + { + "epoch": 0.6299735646987054, + "grad_norm": 6.551943240198343, + "learning_rate": 6.361583157681309e-07, + "loss": 1.1268, + "step": 4647 + }, + { + "epoch": 0.6301091303463703, + "grad_norm": 19.743650026881824, + "learning_rate": 6.357493111067781e-07, + "loss": 1.1784, + "step": 4648 + }, + { + "epoch": 0.6302446959940351, + "grad_norm": 1.7832988993479093, + "learning_rate": 6.353403766882951e-07, + "loss": 1.1727, + "step": 4649 + }, + { + "epoch": 0.6303802616417, + "grad_norm": 2.7072697012798126, + "learning_rate": 6.349315125915424e-07, + "loss": 1.1909, + "step": 4650 + }, + { + "epoch": 0.6305158272893648, + "grad_norm": 1.8095601574889015, + "learning_rate": 6.345227188953653e-07, + "loss": 1.1279, + "step": 4651 + }, + { + "epoch": 0.6306513929370298, + "grad_norm": 1.753282932912547, + "learning_rate": 6.341139956785974e-07, + "loss": 1.1463, + "step": 4652 + }, + { + "epoch": 0.6307869585846947, + "grad_norm": 2.2379396114173837, + "learning_rate": 6.337053430200571e-07, + "loss": 1.1824, + "step": 4653 + }, + { + "epoch": 0.6309225242323595, + "grad_norm": 4.049120359485157, + "learning_rate": 6.332967609985502e-07, + "loss": 1.1363, + "step": 4654 + }, + { + "epoch": 0.6310580898800244, + "grad_norm": 3.973027396898687, + "learning_rate": 6.328882496928685e-07, + "loss": 1.1019, + "step": 4655 + }, + { + "epoch": 0.6311936555276892, + "grad_norm": 1.6089874041715138, + "learning_rate": 6.324798091817897e-07, + "loss": 1.1576, + "step": 4656 + }, + { + "epoch": 0.6313292211753542, + "grad_norm": 9.667370337734324, + "learning_rate": 6.320714395440789e-07, + "loss": 1.1243, + "step": 4657 + }, + { + "epoch": 0.6314647868230191, + "grad_norm": 2.7929254438082642, + "learning_rate": 6.316631408584865e-07, + "loss": 1.1431, + "step": 4658 + }, + { + "epoch": 0.6316003524706839, + "grad_norm": 1.8561304841225483, + "learning_rate": 6.312549132037501e-07, + "loss": 1.1151, + "step": 4659 + }, + { + "epoch": 0.6317359181183488, + "grad_norm": 2.991053726398921, + "learning_rate": 6.308467566585927e-07, + "loss": 1.1409, + "step": 4660 + }, + { + "epoch": 0.6318714837660137, + "grad_norm": 2.2603840909014696, + "learning_rate": 6.304386713017249e-07, + "loss": 1.1628, + "step": 4661 + }, + { + "epoch": 0.6320070494136786, + "grad_norm": 1.6859127750991694, + "learning_rate": 6.300306572118417e-07, + "loss": 1.143, + "step": 4662 + }, + { + "epoch": 0.6321426150613435, + "grad_norm": 2.2692763071500557, + "learning_rate": 6.296227144676262e-07, + "loss": 1.1579, + "step": 4663 + }, + { + "epoch": 0.6322781807090083, + "grad_norm": 1.6963619051916476, + "learning_rate": 6.292148431477465e-07, + "loss": 1.1446, + "step": 4664 + }, + { + "epoch": 0.6324137463566732, + "grad_norm": 1.882799244899725, + "learning_rate": 6.288070433308575e-07, + "loss": 1.1512, + "step": 4665 + }, + { + "epoch": 0.6325493120043381, + "grad_norm": 1.721935550195165, + "learning_rate": 6.283993150956002e-07, + "loss": 1.1593, + "step": 4666 + }, + { + "epoch": 0.632684877652003, + "grad_norm": 1.7114594295242624, + "learning_rate": 6.279916585206018e-07, + "loss": 1.1249, + "step": 4667 + }, + { + "epoch": 0.6328204432996679, + "grad_norm": 1.9632449635822853, + "learning_rate": 6.275840736844754e-07, + "loss": 1.1304, + "step": 4668 + }, + { + "epoch": 0.6329560089473327, + "grad_norm": 2.217070807402325, + "learning_rate": 6.27176560665821e-07, + "loss": 1.126, + "step": 4669 + }, + { + "epoch": 0.6330915745949977, + "grad_norm": 
2.043452089576641, + "learning_rate": 6.267691195432239e-07, + "loss": 1.1463, + "step": 4670 + }, + { + "epoch": 0.6332271402426625, + "grad_norm": 3.1410992694523703, + "learning_rate": 6.263617503952559e-07, + "loss": 1.164, + "step": 4671 + }, + { + "epoch": 0.6333627058903274, + "grad_norm": 1.764142229540525, + "learning_rate": 6.259544533004751e-07, + "loss": 1.2033, + "step": 4672 + }, + { + "epoch": 0.6334982715379923, + "grad_norm": 1.7945772423069888, + "learning_rate": 6.255472283374253e-07, + "loss": 1.1259, + "step": 4673 + }, + { + "epoch": 0.6336338371856571, + "grad_norm": 1.7613075086020802, + "learning_rate": 6.251400755846371e-07, + "loss": 1.1647, + "step": 4674 + }, + { + "epoch": 0.6337694028333221, + "grad_norm": 1.6100605165007758, + "learning_rate": 6.247329951206259e-07, + "loss": 1.1471, + "step": 4675 + }, + { + "epoch": 0.6339049684809869, + "grad_norm": 1.79640832668439, + "learning_rate": 6.243259870238948e-07, + "loss": 1.1376, + "step": 4676 + }, + { + "epoch": 0.6340405341286518, + "grad_norm": 2.1433853591627354, + "learning_rate": 6.239190513729313e-07, + "loss": 1.1306, + "step": 4677 + }, + { + "epoch": 0.6341760997763167, + "grad_norm": 2.094152554638725, + "learning_rate": 6.235121882462107e-07, + "loss": 1.1645, + "step": 4678 + }, + { + "epoch": 0.6343116654239815, + "grad_norm": 2.1294156000809696, + "learning_rate": 6.23105397722192e-07, + "loss": 1.1578, + "step": 4679 + }, + { + "epoch": 0.6344472310716465, + "grad_norm": 2.005221823117285, + "learning_rate": 6.226986798793231e-07, + "loss": 1.1256, + "step": 4680 + }, + { + "epoch": 0.6345827967193113, + "grad_norm": 3.0420926229379113, + "learning_rate": 6.22292034796035e-07, + "loss": 1.139, + "step": 4681 + }, + { + "epoch": 0.6347183623669762, + "grad_norm": 2.3389866632650187, + "learning_rate": 6.21885462550747e-07, + "loss": 1.1555, + "step": 4682 + }, + { + "epoch": 0.6348539280146411, + "grad_norm": 1.6294181870902278, + "learning_rate": 6.214789632218628e-07, + "loss": 1.188, + "step": 4683 + }, + { + "epoch": 0.634989493662306, + "grad_norm": 1.814875885393905, + "learning_rate": 6.210725368877723e-07, + "loss": 1.0864, + "step": 4684 + }, + { + "epoch": 0.6351250593099709, + "grad_norm": 3.946382535951212, + "learning_rate": 6.206661836268525e-07, + "loss": 1.1354, + "step": 4685 + }, + { + "epoch": 0.6352606249576357, + "grad_norm": 1.7397664152752526, + "learning_rate": 6.202599035174645e-07, + "loss": 1.1145, + "step": 4686 + }, + { + "epoch": 0.6353961906053006, + "grad_norm": 4.234664143635901, + "learning_rate": 6.19853696637957e-07, + "loss": 1.0973, + "step": 4687 + }, + { + "epoch": 0.6355317562529655, + "grad_norm": 1.6789192611886365, + "learning_rate": 6.194475630666629e-07, + "loss": 1.1981, + "step": 4688 + }, + { + "epoch": 0.6356673219006304, + "grad_norm": 2.494525863302106, + "learning_rate": 6.190415028819029e-07, + "loss": 1.1153, + "step": 4689 + }, + { + "epoch": 0.6358028875482953, + "grad_norm": 1.724998968282449, + "learning_rate": 6.186355161619814e-07, + "loss": 1.1437, + "step": 4690 + }, + { + "epoch": 0.6359384531959601, + "grad_norm": 1.8390548994871831, + "learning_rate": 6.182296029851908e-07, + "loss": 1.1549, + "step": 4691 + }, + { + "epoch": 0.636074018843625, + "grad_norm": 2.85915446698862, + "learning_rate": 6.178237634298073e-07, + "loss": 1.1714, + "step": 4692 + }, + { + "epoch": 0.63620958449129, + "grad_norm": 1.7187163505520635, + "learning_rate": 6.174179975740949e-07, + "loss": 1.1603, + "step": 4693 + }, + { + "epoch": 
0.6363451501389548, + "grad_norm": 2.6715020866718495, + "learning_rate": 6.170123054963012e-07, + "loss": 1.1176, + "step": 4694 + }, + { + "epoch": 0.6364807157866197, + "grad_norm": 2.2645109629647817, + "learning_rate": 6.166066872746616e-07, + "loss": 1.1727, + "step": 4695 + }, + { + "epoch": 0.6366162814342845, + "grad_norm": 1.8048492533195044, + "learning_rate": 6.162011429873959e-07, + "loss": 1.1421, + "step": 4696 + }, + { + "epoch": 0.6367518470819494, + "grad_norm": 2.021201735073335, + "learning_rate": 6.157956727127102e-07, + "loss": 1.1242, + "step": 4697 + }, + { + "epoch": 0.6368874127296144, + "grad_norm": 2.440825082809372, + "learning_rate": 6.153902765287966e-07, + "loss": 1.137, + "step": 4698 + }, + { + "epoch": 0.6370229783772792, + "grad_norm": 1.693987434190931, + "learning_rate": 6.149849545138319e-07, + "loss": 1.1049, + "step": 4699 + }, + { + "epoch": 0.6371585440249441, + "grad_norm": 1.8780629762345717, + "learning_rate": 6.145797067459799e-07, + "loss": 1.1488, + "step": 4700 + }, + { + "epoch": 0.6372941096726089, + "grad_norm": 1.7564053187234163, + "learning_rate": 6.141745333033889e-07, + "loss": 1.1247, + "step": 4701 + }, + { + "epoch": 0.6374296753202738, + "grad_norm": 2.4489644807614814, + "learning_rate": 6.137694342641937e-07, + "loss": 1.138, + "step": 4702 + }, + { + "epoch": 0.6375652409679388, + "grad_norm": 1.7240943370906139, + "learning_rate": 6.133644097065143e-07, + "loss": 1.1547, + "step": 4703 + }, + { + "epoch": 0.6377008066156036, + "grad_norm": 1.86498135161543, + "learning_rate": 6.129594597084567e-07, + "loss": 1.1351, + "step": 4704 + }, + { + "epoch": 0.6378363722632685, + "grad_norm": 2.0024024814250145, + "learning_rate": 6.125545843481119e-07, + "loss": 1.0967, + "step": 4705 + }, + { + "epoch": 0.6379719379109333, + "grad_norm": 5.1717947223543606, + "learning_rate": 6.121497837035576e-07, + "loss": 1.0995, + "step": 4706 + }, + { + "epoch": 0.6381075035585982, + "grad_norm": 2.333482189251766, + "learning_rate": 6.117450578528556e-07, + "loss": 1.1382, + "step": 4707 + }, + { + "epoch": 0.6382430692062632, + "grad_norm": 1.9356923524662644, + "learning_rate": 6.11340406874055e-07, + "loss": 1.1764, + "step": 4708 + }, + { + "epoch": 0.638378634853928, + "grad_norm": 3.006705424434063, + "learning_rate": 6.109358308451885e-07, + "loss": 1.1208, + "step": 4709 + }, + { + "epoch": 0.6385142005015929, + "grad_norm": 1.8417836337364724, + "learning_rate": 6.105313298442764e-07, + "loss": 1.1502, + "step": 4710 + }, + { + "epoch": 0.6386497661492577, + "grad_norm": 1.8202013841974545, + "learning_rate": 6.10126903949323e-07, + "loss": 1.18, + "step": 4711 + }, + { + "epoch": 0.6387853317969227, + "grad_norm": 1.5721564004503576, + "learning_rate": 6.097225532383184e-07, + "loss": 1.1146, + "step": 4712 + }, + { + "epoch": 0.6389208974445876, + "grad_norm": 2.63379313817079, + "learning_rate": 6.093182777892392e-07, + "loss": 1.1575, + "step": 4713 + }, + { + "epoch": 0.6390564630922524, + "grad_norm": 2.233995289289699, + "learning_rate": 6.089140776800456e-07, + "loss": 1.1538, + "step": 4714 + }, + { + "epoch": 0.6391920287399173, + "grad_norm": 1.8897595886183796, + "learning_rate": 6.085099529886857e-07, + "loss": 1.1347, + "step": 4715 + }, + { + "epoch": 0.6393275943875821, + "grad_norm": 1.9502458585917652, + "learning_rate": 6.081059037930907e-07, + "loss": 1.1584, + "step": 4716 + }, + { + "epoch": 0.6394631600352471, + "grad_norm": 4.816663108050989, + "learning_rate": 6.07701930171179e-07, + "loss": 1.1532, + 
"step": 4717 + }, + { + "epoch": 0.639598725682912, + "grad_norm": 1.8940640668277264, + "learning_rate": 6.072980322008532e-07, + "loss": 1.1317, + "step": 4718 + }, + { + "epoch": 0.6397342913305768, + "grad_norm": 3.8679522338175647, + "learning_rate": 6.068942099600025e-07, + "loss": 1.1347, + "step": 4719 + }, + { + "epoch": 0.6398698569782417, + "grad_norm": 1.8902840879523488, + "learning_rate": 6.064904635264999e-07, + "loss": 1.1447, + "step": 4720 + }, + { + "epoch": 0.6400054226259067, + "grad_norm": 1.7981316083693482, + "learning_rate": 6.060867929782057e-07, + "loss": 1.1247, + "step": 4721 + }, + { + "epoch": 0.6401409882735715, + "grad_norm": 2.106086169413983, + "learning_rate": 6.056831983929638e-07, + "loss": 1.1101, + "step": 4722 + }, + { + "epoch": 0.6402765539212364, + "grad_norm": 1.5046870622311588, + "learning_rate": 6.052796798486049e-07, + "loss": 1.1519, + "step": 4723 + }, + { + "epoch": 0.6404121195689012, + "grad_norm": 1.6975307596808276, + "learning_rate": 6.048762374229435e-07, + "loss": 1.1421, + "step": 4724 + }, + { + "epoch": 0.6405476852165661, + "grad_norm": 1.822172519122503, + "learning_rate": 6.044728711937812e-07, + "loss": 1.1418, + "step": 4725 + }, + { + "epoch": 0.6406832508642311, + "grad_norm": 1.7324561560904517, + "learning_rate": 6.040695812389036e-07, + "loss": 1.1658, + "step": 4726 + }, + { + "epoch": 0.6408188165118959, + "grad_norm": 1.659696113079681, + "learning_rate": 6.036663676360816e-07, + "loss": 1.1307, + "step": 4727 + }, + { + "epoch": 0.6409543821595608, + "grad_norm": 2.833355222082676, + "learning_rate": 6.032632304630726e-07, + "loss": 1.1655, + "step": 4728 + }, + { + "epoch": 0.6410899478072256, + "grad_norm": 2.2136378478775227, + "learning_rate": 6.028601697976175e-07, + "loss": 1.1449, + "step": 4729 + }, + { + "epoch": 0.6412255134548905, + "grad_norm": 2.1805069000910677, + "learning_rate": 6.024571857174442e-07, + "loss": 1.1384, + "step": 4730 + }, + { + "epoch": 0.6413610791025555, + "grad_norm": 1.9397518059916454, + "learning_rate": 6.020542783002643e-07, + "loss": 1.1825, + "step": 4731 + }, + { + "epoch": 0.6414966447502203, + "grad_norm": 2.088220828781457, + "learning_rate": 6.01651447623776e-07, + "loss": 1.1073, + "step": 4732 + }, + { + "epoch": 0.6416322103978852, + "grad_norm": 1.5865212943488514, + "learning_rate": 6.012486937656613e-07, + "loss": 1.1454, + "step": 4733 + }, + { + "epoch": 0.64176777604555, + "grad_norm": 1.7878308534427145, + "learning_rate": 6.008460168035887e-07, + "loss": 1.1789, + "step": 4734 + }, + { + "epoch": 0.641903341693215, + "grad_norm": 2.1482341612516342, + "learning_rate": 6.004434168152109e-07, + "loss": 1.1334, + "step": 4735 + }, + { + "epoch": 0.6420389073408799, + "grad_norm": 1.977684415986616, + "learning_rate": 6.000408938781665e-07, + "loss": 1.1462, + "step": 4736 + }, + { + "epoch": 0.6421744729885447, + "grad_norm": 1.696037684509932, + "learning_rate": 5.996384480700783e-07, + "loss": 1.1434, + "step": 4737 + }, + { + "epoch": 0.6423100386362096, + "grad_norm": 1.7416594332133823, + "learning_rate": 5.992360794685554e-07, + "loss": 1.1316, + "step": 4738 + }, + { + "epoch": 0.6424456042838744, + "grad_norm": 3.337064455253838, + "learning_rate": 5.988337881511909e-07, + "loss": 1.1491, + "step": 4739 + }, + { + "epoch": 0.6425811699315394, + "grad_norm": 2.3459019562621704, + "learning_rate": 5.984315741955639e-07, + "loss": 1.1673, + "step": 4740 + }, + { + "epoch": 0.6427167355792043, + "grad_norm": 2.082254959054436, + "learning_rate": 
5.98029437679238e-07, + "loss": 1.2079, + "step": 4741 + }, + { + "epoch": 0.6428523012268691, + "grad_norm": 1.9727213970625261, + "learning_rate": 5.976273786797619e-07, + "loss": 1.1849, + "step": 4742 + }, + { + "epoch": 0.642987866874534, + "grad_norm": 1.7470588269128888, + "learning_rate": 5.972253972746701e-07, + "loss": 1.1625, + "step": 4743 + }, + { + "epoch": 0.6431234325221988, + "grad_norm": 1.7453417590146547, + "learning_rate": 5.968234935414807e-07, + "loss": 1.1132, + "step": 4744 + }, + { + "epoch": 0.6432589981698638, + "grad_norm": 1.5650198334049656, + "learning_rate": 5.964216675576983e-07, + "loss": 1.1064, + "step": 4745 + }, + { + "epoch": 0.6433945638175287, + "grad_norm": 1.8105742427316949, + "learning_rate": 5.960199194008115e-07, + "loss": 1.1497, + "step": 4746 + }, + { + "epoch": 0.6435301294651935, + "grad_norm": 2.1415669944469387, + "learning_rate": 5.956182491482946e-07, + "loss": 1.1505, + "step": 4747 + }, + { + "epoch": 0.6436656951128584, + "grad_norm": 2.0373430603459384, + "learning_rate": 5.952166568776062e-07, + "loss": 1.0996, + "step": 4748 + }, + { + "epoch": 0.6438012607605232, + "grad_norm": 1.6131723308647323, + "learning_rate": 5.948151426661904e-07, + "loss": 1.171, + "step": 4749 + }, + { + "epoch": 0.6439368264081882, + "grad_norm": 1.864592817155579, + "learning_rate": 5.944137065914759e-07, + "loss": 1.1241, + "step": 4750 + }, + { + "epoch": 0.6440723920558531, + "grad_norm": 2.1341549510678637, + "learning_rate": 5.94012348730877e-07, + "loss": 1.1475, + "step": 4751 + }, + { + "epoch": 0.6442079577035179, + "grad_norm": 2.974852259656679, + "learning_rate": 5.936110691617915e-07, + "loss": 1.1579, + "step": 4752 + }, + { + "epoch": 0.6443435233511828, + "grad_norm": 8.137379641006824, + "learning_rate": 5.932098679616038e-07, + "loss": 1.1273, + "step": 4753 + }, + { + "epoch": 0.6444790889988476, + "grad_norm": 1.673008010857656, + "learning_rate": 5.928087452076821e-07, + "loss": 1.1106, + "step": 4754 + }, + { + "epoch": 0.6446146546465126, + "grad_norm": 1.6608211865916527, + "learning_rate": 5.924077009773794e-07, + "loss": 1.1587, + "step": 4755 + }, + { + "epoch": 0.6447502202941775, + "grad_norm": 10.462746656986004, + "learning_rate": 5.920067353480345e-07, + "loss": 1.1233, + "step": 4756 + }, + { + "epoch": 0.6448857859418423, + "grad_norm": 1.7562114395619226, + "learning_rate": 5.916058483969698e-07, + "loss": 1.1691, + "step": 4757 + }, + { + "epoch": 0.6450213515895072, + "grad_norm": 5.319710341850418, + "learning_rate": 5.912050402014941e-07, + "loss": 1.1306, + "step": 4758 + }, + { + "epoch": 0.6451569172371721, + "grad_norm": 1.6333951687507013, + "learning_rate": 5.908043108388989e-07, + "loss": 1.1352, + "step": 4759 + }, + { + "epoch": 0.645292482884837, + "grad_norm": 3.512362851101197, + "learning_rate": 5.90403660386463e-07, + "loss": 1.1548, + "step": 4760 + }, + { + "epoch": 0.6454280485325019, + "grad_norm": 2.7837204567173526, + "learning_rate": 5.900030889214476e-07, + "loss": 1.1627, + "step": 4761 + }, + { + "epoch": 0.6455636141801667, + "grad_norm": 2.4257179721295903, + "learning_rate": 5.896025965211005e-07, + "loss": 1.1135, + "step": 4762 + }, + { + "epoch": 0.6456991798278316, + "grad_norm": 2.126989225514602, + "learning_rate": 5.89202183262653e-07, + "loss": 1.1308, + "step": 4763 + }, + { + "epoch": 0.6458347454754965, + "grad_norm": 2.8648383975503653, + "learning_rate": 5.888018492233219e-07, + "loss": 1.1531, + "step": 4764 + }, + { + "epoch": 0.6459703111231614, + "grad_norm": 
1.8060161854864716, + "learning_rate": 5.884015944803084e-07, + "loss": 1.1387, + "step": 4765 + }, + { + "epoch": 0.6461058767708263, + "grad_norm": 1.8111606536047469, + "learning_rate": 5.880014191107982e-07, + "loss": 1.1284, + "step": 4766 + }, + { + "epoch": 0.6462414424184911, + "grad_norm": 2.3291606318088585, + "learning_rate": 5.876013231919628e-07, + "loss": 1.1381, + "step": 4767 + }, + { + "epoch": 0.6463770080661561, + "grad_norm": 2.0041792643247582, + "learning_rate": 5.872013068009565e-07, + "loss": 1.1367, + "step": 4768 + }, + { + "epoch": 0.6465125737138209, + "grad_norm": 1.9904059862770151, + "learning_rate": 5.868013700149197e-07, + "loss": 1.1142, + "step": 4769 + }, + { + "epoch": 0.6466481393614858, + "grad_norm": 1.8203012085486316, + "learning_rate": 5.864015129109771e-07, + "loss": 1.1536, + "step": 4770 + }, + { + "epoch": 0.6467837050091507, + "grad_norm": 1.8637885960055771, + "learning_rate": 5.860017355662381e-07, + "loss": 1.1921, + "step": 4771 + }, + { + "epoch": 0.6469192706568155, + "grad_norm": 2.522693785075004, + "learning_rate": 5.856020380577964e-07, + "loss": 1.1185, + "step": 4772 + }, + { + "epoch": 0.6470548363044805, + "grad_norm": 1.8760073019329606, + "learning_rate": 5.852024204627308e-07, + "loss": 1.1283, + "step": 4773 + }, + { + "epoch": 0.6471904019521453, + "grad_norm": 1.8685639544808952, + "learning_rate": 5.84802882858104e-07, + "loss": 1.1501, + "step": 4774 + }, + { + "epoch": 0.6473259675998102, + "grad_norm": 1.8956128020904335, + "learning_rate": 5.844034253209641e-07, + "loss": 1.1155, + "step": 4775 + }, + { + "epoch": 0.6474615332474751, + "grad_norm": 1.8811156421859776, + "learning_rate": 5.840040479283428e-07, + "loss": 1.1368, + "step": 4776 + }, + { + "epoch": 0.6475970988951399, + "grad_norm": 1.705767488719623, + "learning_rate": 5.836047507572575e-07, + "loss": 1.136, + "step": 4777 + }, + { + "epoch": 0.6477326645428049, + "grad_norm": 1.6465850419801864, + "learning_rate": 5.832055338847089e-07, + "loss": 1.1311, + "step": 4778 + }, + { + "epoch": 0.6478682301904697, + "grad_norm": 1.8298084943116568, + "learning_rate": 5.828063973876833e-07, + "loss": 1.1456, + "step": 4779 + }, + { + "epoch": 0.6480037958381346, + "grad_norm": 2.056861951523894, + "learning_rate": 5.824073413431507e-07, + "loss": 1.1113, + "step": 4780 + }, + { + "epoch": 0.6481393614857995, + "grad_norm": 9.041686000084582, + "learning_rate": 5.820083658280661e-07, + "loss": 1.1398, + "step": 4781 + }, + { + "epoch": 0.6482749271334644, + "grad_norm": 2.1404321305409884, + "learning_rate": 5.816094709193688e-07, + "loss": 1.136, + "step": 4782 + }, + { + "epoch": 0.6484104927811293, + "grad_norm": 1.8149862678723494, + "learning_rate": 5.812106566939824e-07, + "loss": 1.1361, + "step": 4783 + }, + { + "epoch": 0.6485460584287941, + "grad_norm": 1.9738069817726047, + "learning_rate": 5.808119232288151e-07, + "loss": 1.1429, + "step": 4784 + }, + { + "epoch": 0.648681624076459, + "grad_norm": 2.0173626884365032, + "learning_rate": 5.804132706007597e-07, + "loss": 1.1658, + "step": 4785 + }, + { + "epoch": 0.6488171897241239, + "grad_norm": 1.9430182765353265, + "learning_rate": 5.800146988866927e-07, + "loss": 1.1455, + "step": 4786 + }, + { + "epoch": 0.6489527553717888, + "grad_norm": 1.7782861460800614, + "learning_rate": 5.796162081634761e-07, + "loss": 1.1346, + "step": 4787 + }, + { + "epoch": 0.6490883210194537, + "grad_norm": 3.2660402837298017, + "learning_rate": 5.792177985079558e-07, + "loss": 1.1075, + "step": 4788 + }, + { + 
"epoch": 0.6492238866671185, + "grad_norm": 2.1609695395978163, + "learning_rate": 5.788194699969608e-07, + "loss": 1.1646, + "step": 4789 + }, + { + "epoch": 0.6493594523147834, + "grad_norm": 1.723062445028006, + "learning_rate": 5.784212227073073e-07, + "loss": 1.1413, + "step": 4790 + }, + { + "epoch": 0.6494950179624484, + "grad_norm": 1.6294735914578506, + "learning_rate": 5.780230567157924e-07, + "loss": 1.1843, + "step": 4791 + }, + { + "epoch": 0.6496305836101132, + "grad_norm": 1.9620310302105264, + "learning_rate": 5.776249720992009e-07, + "loss": 1.1292, + "step": 4792 + }, + { + "epoch": 0.6497661492577781, + "grad_norm": 1.8758899095325599, + "learning_rate": 5.772269689342988e-07, + "loss": 1.1304, + "step": 4793 + }, + { + "epoch": 0.6499017149054429, + "grad_norm": 1.557577484882107, + "learning_rate": 5.768290472978392e-07, + "loss": 1.094, + "step": 4794 + }, + { + "epoch": 0.6500372805531078, + "grad_norm": 1.883163109436426, + "learning_rate": 5.764312072665574e-07, + "loss": 1.1704, + "step": 4795 + }, + { + "epoch": 0.6501728462007728, + "grad_norm": 1.9260394735966824, + "learning_rate": 5.760334489171735e-07, + "loss": 1.1359, + "step": 4796 + }, + { + "epoch": 0.6503084118484376, + "grad_norm": 2.3037401338498285, + "learning_rate": 5.756357723263926e-07, + "loss": 1.1634, + "step": 4797 + }, + { + "epoch": 0.6504439774961025, + "grad_norm": 2.0220679196248317, + "learning_rate": 5.752381775709032e-07, + "loss": 1.1587, + "step": 4798 + }, + { + "epoch": 0.6505795431437674, + "grad_norm": 1.6713262697351101, + "learning_rate": 5.748406647273784e-07, + "loss": 1.1459, + "step": 4799 + }, + { + "epoch": 0.6507151087914322, + "grad_norm": 2.208264054549015, + "learning_rate": 5.744432338724754e-07, + "loss": 1.1485, + "step": 4800 + }, + { + "epoch": 0.6508506744390972, + "grad_norm": 1.688582036906875, + "learning_rate": 5.740458850828356e-07, + "loss": 1.132, + "step": 4801 + }, + { + "epoch": 0.650986240086762, + "grad_norm": 2.1915788049959115, + "learning_rate": 5.736486184350846e-07, + "loss": 1.1778, + "step": 4802 + }, + { + "epoch": 0.6511218057344269, + "grad_norm": 2.672315884822392, + "learning_rate": 5.732514340058321e-07, + "loss": 1.1163, + "step": 4803 + }, + { + "epoch": 0.6512573713820918, + "grad_norm": 1.7532273367979467, + "learning_rate": 5.728543318716721e-07, + "loss": 1.2041, + "step": 4804 + }, + { + "epoch": 0.6513929370297566, + "grad_norm": 2.1937129027024906, + "learning_rate": 5.724573121091825e-07, + "loss": 1.1991, + "step": 4805 + }, + { + "epoch": 0.6515285026774216, + "grad_norm": 1.8445947169719255, + "learning_rate": 5.720603747949253e-07, + "loss": 1.204, + "step": 4806 + }, + { + "epoch": 0.6516640683250864, + "grad_norm": 3.751273874337128, + "learning_rate": 5.716635200054469e-07, + "loss": 1.1387, + "step": 4807 + }, + { + "epoch": 0.6517996339727513, + "grad_norm": 1.816441537814419, + "learning_rate": 5.712667478172776e-07, + "loss": 1.1905, + "step": 4808 + }, + { + "epoch": 0.6519351996204162, + "grad_norm": 2.442025654986171, + "learning_rate": 5.708700583069319e-07, + "loss": 1.1164, + "step": 4809 + }, + { + "epoch": 0.652070765268081, + "grad_norm": 3.5662651470010513, + "learning_rate": 5.704734515509085e-07, + "loss": 1.1602, + "step": 4810 + }, + { + "epoch": 0.652206330915746, + "grad_norm": 1.8991404741803604, + "learning_rate": 5.700769276256886e-07, + "loss": 1.1702, + "step": 4811 + }, + { + "epoch": 0.6523418965634108, + "grad_norm": 3.0623859120958365, + "learning_rate": 5.696804866077404e-07, + "loss": 
1.1697, + "step": 4812 + }, + { + "epoch": 0.6524774622110757, + "grad_norm": 3.6575939003283886, + "learning_rate": 5.692841285735128e-07, + "loss": 1.1536, + "step": 4813 + }, + { + "epoch": 0.6526130278587406, + "grad_norm": 1.788621762587626, + "learning_rate": 5.68887853599442e-07, + "loss": 1.1526, + "step": 4814 + }, + { + "epoch": 0.6527485935064055, + "grad_norm": 2.139109175959902, + "learning_rate": 5.684916617619453e-07, + "loss": 1.1584, + "step": 4815 + }, + { + "epoch": 0.6528841591540704, + "grad_norm": 2.5929167705322413, + "learning_rate": 5.680955531374255e-07, + "loss": 1.1658, + "step": 4816 + }, + { + "epoch": 0.6530197248017352, + "grad_norm": 1.895858504026069, + "learning_rate": 5.676995278022688e-07, + "loss": 1.1687, + "step": 4817 + }, + { + "epoch": 0.6531552904494001, + "grad_norm": 2.3020454686152902, + "learning_rate": 5.67303585832846e-07, + "loss": 1.1423, + "step": 4818 + }, + { + "epoch": 0.653290856097065, + "grad_norm": 1.8501847838163794, + "learning_rate": 5.669077273055111e-07, + "loss": 1.1533, + "step": 4819 + }, + { + "epoch": 0.6534264217447299, + "grad_norm": 2.0321654734522587, + "learning_rate": 5.665119522966024e-07, + "loss": 1.1551, + "step": 4820 + }, + { + "epoch": 0.6535619873923948, + "grad_norm": 2.050145069153044, + "learning_rate": 5.661162608824419e-07, + "loss": 1.1235, + "step": 4821 + }, + { + "epoch": 0.6536975530400596, + "grad_norm": 1.8346090904516708, + "learning_rate": 5.657206531393358e-07, + "loss": 1.1301, + "step": 4822 + }, + { + "epoch": 0.6538331186877245, + "grad_norm": 8.855212373544592, + "learning_rate": 5.653251291435735e-07, + "loss": 1.1164, + "step": 4823 + }, + { + "epoch": 0.6539686843353895, + "grad_norm": 1.767406524388507, + "learning_rate": 5.64929688971429e-07, + "loss": 1.1671, + "step": 4824 + }, + { + "epoch": 0.6541042499830543, + "grad_norm": 2.245191413927645, + "learning_rate": 5.645343326991602e-07, + "loss": 1.1173, + "step": 4825 + }, + { + "epoch": 0.6542398156307192, + "grad_norm": 2.5256544006644965, + "learning_rate": 5.641390604030072e-07, + "loss": 1.1362, + "step": 4826 + }, + { + "epoch": 0.654375381278384, + "grad_norm": 1.7760803891097825, + "learning_rate": 5.637438721591967e-07, + "loss": 1.1728, + "step": 4827 + }, + { + "epoch": 0.6545109469260489, + "grad_norm": 1.9836311631860428, + "learning_rate": 5.633487680439361e-07, + "loss": 1.1684, + "step": 4828 + }, + { + "epoch": 0.6546465125737139, + "grad_norm": 1.937976467574596, + "learning_rate": 5.629537481334195e-07, + "loss": 1.1302, + "step": 4829 + }, + { + "epoch": 0.6547820782213787, + "grad_norm": 2.6514909275134753, + "learning_rate": 5.625588125038221e-07, + "loss": 1.159, + "step": 4830 + }, + { + "epoch": 0.6549176438690436, + "grad_norm": 2.190150054546346, + "learning_rate": 5.621639612313056e-07, + "loss": 1.1641, + "step": 4831 + }, + { + "epoch": 0.6550532095167084, + "grad_norm": 3.8379712382452067, + "learning_rate": 5.617691943920122e-07, + "loss": 1.1058, + "step": 4832 + }, + { + "epoch": 0.6551887751643733, + "grad_norm": 1.803171337837875, + "learning_rate": 5.613745120620712e-07, + "loss": 1.1555, + "step": 4833 + }, + { + "epoch": 0.6553243408120383, + "grad_norm": 2.3651138892589003, + "learning_rate": 5.609799143175927e-07, + "loss": 1.0792, + "step": 4834 + }, + { + "epoch": 0.6554599064597031, + "grad_norm": 1.8634302208668354, + "learning_rate": 5.605854012346729e-07, + "loss": 1.1183, + "step": 4835 + }, + { + "epoch": 0.655595472107368, + "grad_norm": 1.9584830210132327, + "learning_rate": 
5.601909728893892e-07, + "loss": 1.1063, + "step": 4836 + }, + { + "epoch": 0.6557310377550328, + "grad_norm": 2.2101792386000407, + "learning_rate": 5.597966293578055e-07, + "loss": 1.1702, + "step": 4837 + }, + { + "epoch": 0.6558666034026978, + "grad_norm": 1.7243200198561555, + "learning_rate": 5.594023707159668e-07, + "loss": 1.1627, + "step": 4838 + }, + { + "epoch": 0.6560021690503627, + "grad_norm": 2.206241170714317, + "learning_rate": 5.590081970399028e-07, + "loss": 1.141, + "step": 4839 + }, + { + "epoch": 0.6561377346980275, + "grad_norm": 1.5848235829349613, + "learning_rate": 5.586141084056273e-07, + "loss": 1.1692, + "step": 4840 + }, + { + "epoch": 0.6562733003456924, + "grad_norm": 2.7729467482693226, + "learning_rate": 5.582201048891367e-07, + "loss": 1.1445, + "step": 4841 + }, + { + "epoch": 0.6564088659933572, + "grad_norm": 1.6884173773550486, + "learning_rate": 5.578261865664118e-07, + "loss": 1.1332, + "step": 4842 + }, + { + "epoch": 0.6565444316410222, + "grad_norm": 2.4695439116631044, + "learning_rate": 5.574323535134164e-07, + "loss": 1.1559, + "step": 4843 + }, + { + "epoch": 0.6566799972886871, + "grad_norm": 1.7381944484180671, + "learning_rate": 5.570386058060983e-07, + "loss": 1.1106, + "step": 4844 + }, + { + "epoch": 0.6568155629363519, + "grad_norm": 2.070767655380669, + "learning_rate": 5.566449435203886e-07, + "loss": 1.1132, + "step": 4845 + }, + { + "epoch": 0.6569511285840168, + "grad_norm": 1.8738359898189323, + "learning_rate": 5.562513667322018e-07, + "loss": 1.1354, + "step": 4846 + }, + { + "epoch": 0.6570866942316816, + "grad_norm": 4.3851803631916, + "learning_rate": 5.558578755174363e-07, + "loss": 1.1523, + "step": 4847 + }, + { + "epoch": 0.6572222598793466, + "grad_norm": 1.9672133875397868, + "learning_rate": 5.554644699519735e-07, + "loss": 1.1636, + "step": 4848 + }, + { + "epoch": 0.6573578255270115, + "grad_norm": 1.6315807072124129, + "learning_rate": 5.550711501116788e-07, + "loss": 1.1867, + "step": 4849 + }, + { + "epoch": 0.6574933911746763, + "grad_norm": 1.7891330652085478, + "learning_rate": 5.546779160724012e-07, + "loss": 1.1167, + "step": 4850 + }, + { + "epoch": 0.6576289568223412, + "grad_norm": 2.115656815697166, + "learning_rate": 5.542847679099715e-07, + "loss": 1.1313, + "step": 4851 + }, + { + "epoch": 0.657764522470006, + "grad_norm": 1.7040124695662504, + "learning_rate": 5.538917057002069e-07, + "loss": 1.1361, + "step": 4852 + }, + { + "epoch": 0.657900088117671, + "grad_norm": 1.7255529187019638, + "learning_rate": 5.534987295189049e-07, + "loss": 1.1426, + "step": 4853 + }, + { + "epoch": 0.6580356537653359, + "grad_norm": 2.2770034215753316, + "learning_rate": 5.531058394418487e-07, + "loss": 1.1291, + "step": 4854 + }, + { + "epoch": 0.6581712194130007, + "grad_norm": 1.9084739859879536, + "learning_rate": 5.527130355448035e-07, + "loss": 1.1585, + "step": 4855 + }, + { + "epoch": 0.6583067850606656, + "grad_norm": 1.7239218329670236, + "learning_rate": 5.523203179035189e-07, + "loss": 1.1041, + "step": 4856 + }, + { + "epoch": 0.6584423507083305, + "grad_norm": 1.715038381914756, + "learning_rate": 5.519276865937272e-07, + "loss": 1.1222, + "step": 4857 + }, + { + "epoch": 0.6585779163559954, + "grad_norm": 3.2164045163437938, + "learning_rate": 5.515351416911442e-07, + "loss": 1.1504, + "step": 4858 + }, + { + "epoch": 0.6587134820036603, + "grad_norm": 5.195217428732781, + "learning_rate": 5.511426832714694e-07, + "loss": 1.1932, + "step": 4859 + }, + { + "epoch": 0.6588490476513251, + "grad_norm": 
1.7099829050566844, + "learning_rate": 5.507503114103849e-07, + "loss": 1.2141, + "step": 4860 + }, + { + "epoch": 0.65898461329899, + "grad_norm": 2.5304053727391382, + "learning_rate": 5.503580261835566e-07, + "loss": 1.1416, + "step": 4861 + }, + { + "epoch": 0.6591201789466549, + "grad_norm": 1.837221947684456, + "learning_rate": 5.499658276666338e-07, + "loss": 1.1398, + "step": 4862 + }, + { + "epoch": 0.6592557445943198, + "grad_norm": 1.8118300542013306, + "learning_rate": 5.495737159352487e-07, + "loss": 1.1105, + "step": 4863 + }, + { + "epoch": 0.6593913102419847, + "grad_norm": 1.8897255174290954, + "learning_rate": 5.491816910650171e-07, + "loss": 1.1464, + "step": 4864 + }, + { + "epoch": 0.6595268758896495, + "grad_norm": 1.9947878501055636, + "learning_rate": 5.48789753131538e-07, + "loss": 1.1374, + "step": 4865 + }, + { + "epoch": 0.6596624415373145, + "grad_norm": 1.6490969695705584, + "learning_rate": 5.483979022103935e-07, + "loss": 1.1394, + "step": 4866 + }, + { + "epoch": 0.6597980071849793, + "grad_norm": 1.744761445507176, + "learning_rate": 5.480061383771481e-07, + "loss": 1.1902, + "step": 4867 + }, + { + "epoch": 0.6599335728326442, + "grad_norm": 1.7446258665262908, + "learning_rate": 5.476144617073519e-07, + "loss": 1.1418, + "step": 4868 + }, + { + "epoch": 0.6600691384803091, + "grad_norm": 3.9189712383008226, + "learning_rate": 5.472228722765351e-07, + "loss": 1.1859, + "step": 4869 + }, + { + "epoch": 0.6602047041279739, + "grad_norm": 1.600314177303711, + "learning_rate": 5.46831370160214e-07, + "loss": 1.1397, + "step": 4870 + }, + { + "epoch": 0.6603402697756389, + "grad_norm": 1.8030930392073148, + "learning_rate": 5.464399554338856e-07, + "loss": 1.1196, + "step": 4871 + }, + { + "epoch": 0.6604758354233037, + "grad_norm": 1.9398979191040868, + "learning_rate": 5.460486281730322e-07, + "loss": 1.157, + "step": 4872 + }, + { + "epoch": 0.6606114010709686, + "grad_norm": 1.845775048194894, + "learning_rate": 5.456573884531168e-07, + "loss": 1.1639, + "step": 4873 + }, + { + "epoch": 0.6607469667186335, + "grad_norm": 1.6073791117438114, + "learning_rate": 5.452662363495884e-07, + "loss": 1.1697, + "step": 4874 + }, + { + "epoch": 0.6608825323662983, + "grad_norm": 1.666189815966232, + "learning_rate": 5.448751719378762e-07, + "loss": 1.1236, + "step": 4875 + }, + { + "epoch": 0.6610180980139633, + "grad_norm": 1.7874592644666012, + "learning_rate": 5.444841952933953e-07, + "loss": 1.1742, + "step": 4876 + }, + { + "epoch": 0.6611536636616281, + "grad_norm": 1.9942110171797238, + "learning_rate": 5.440933064915413e-07, + "loss": 1.1297, + "step": 4877 + }, + { + "epoch": 0.661289229309293, + "grad_norm": 2.3670099918130827, + "learning_rate": 5.437025056076945e-07, + "loss": 1.1685, + "step": 4878 + }, + { + "epoch": 0.6614247949569579, + "grad_norm": 2.7704845199054144, + "learning_rate": 5.433117927172176e-07, + "loss": 1.1425, + "step": 4879 + }, + { + "epoch": 0.6615603606046228, + "grad_norm": 2.0424686354749175, + "learning_rate": 5.429211678954566e-07, + "loss": 1.1404, + "step": 4880 + }, + { + "epoch": 0.6616959262522877, + "grad_norm": 1.9166386251747634, + "learning_rate": 5.425306312177404e-07, + "loss": 1.1415, + "step": 4881 + }, + { + "epoch": 0.6618314918999526, + "grad_norm": 2.0577996879097626, + "learning_rate": 5.421401827593812e-07, + "loss": 1.1796, + "step": 4882 + }, + { + "epoch": 0.6619670575476174, + "grad_norm": 1.644162830661933, + "learning_rate": 5.417498225956734e-07, + "loss": 1.1768, + "step": 4883 + }, + { + "epoch": 
0.6621026231952823, + "grad_norm": 2.8331278168887772, + "learning_rate": 5.413595508018951e-07, + "loss": 1.1459, + "step": 4884 + }, + { + "epoch": 0.6622381888429472, + "grad_norm": 1.7369985982747376, + "learning_rate": 5.409693674533071e-07, + "loss": 1.1781, + "step": 4885 + }, + { + "epoch": 0.6623737544906121, + "grad_norm": 3.0915788380358036, + "learning_rate": 5.405792726251532e-07, + "loss": 1.1442, + "step": 4886 + }, + { + "epoch": 0.662509320138277, + "grad_norm": 1.8221595645913187, + "learning_rate": 5.401892663926606e-07, + "loss": 1.1403, + "step": 4887 + }, + { + "epoch": 0.6626448857859418, + "grad_norm": 2.421050844778035, + "learning_rate": 5.397993488310378e-07, + "loss": 1.1801, + "step": 4888 + }, + { + "epoch": 0.6627804514336068, + "grad_norm": 1.6576334427389534, + "learning_rate": 5.394095200154786e-07, + "loss": 1.0992, + "step": 4889 + }, + { + "epoch": 0.6629160170812716, + "grad_norm": 1.8360563897232098, + "learning_rate": 5.39019780021157e-07, + "loss": 1.1619, + "step": 4890 + }, + { + "epoch": 0.6630515827289365, + "grad_norm": 3.2807108760705965, + "learning_rate": 5.386301289232329e-07, + "loss": 1.1325, + "step": 4891 + }, + { + "epoch": 0.6631871483766014, + "grad_norm": 2.2653654316797076, + "learning_rate": 5.382405667968457e-07, + "loss": 1.0963, + "step": 4892 + }, + { + "epoch": 0.6633227140242662, + "grad_norm": 2.4491827555405905, + "learning_rate": 5.378510937171212e-07, + "loss": 1.152, + "step": 4893 + }, + { + "epoch": 0.6634582796719312, + "grad_norm": 2.2506869438549075, + "learning_rate": 5.37461709759165e-07, + "loss": 1.1583, + "step": 4894 + }, + { + "epoch": 0.663593845319596, + "grad_norm": 2.370101469471269, + "learning_rate": 5.370724149980668e-07, + "loss": 1.1172, + "step": 4895 + }, + { + "epoch": 0.6637294109672609, + "grad_norm": 1.7939571879067078, + "learning_rate": 5.366832095088994e-07, + "loss": 1.1409, + "step": 4896 + }, + { + "epoch": 0.6638649766149258, + "grad_norm": 1.6803647014030776, + "learning_rate": 5.362940933667177e-07, + "loss": 1.1352, + "step": 4897 + }, + { + "epoch": 0.6640005422625906, + "grad_norm": 2.468576631536228, + "learning_rate": 5.359050666465599e-07, + "loss": 1.146, + "step": 4898 + }, + { + "epoch": 0.6641361079102556, + "grad_norm": 3.322868505930707, + "learning_rate": 5.355161294234465e-07, + "loss": 1.151, + "step": 4899 + }, + { + "epoch": 0.6642716735579204, + "grad_norm": 2.2431578499899003, + "learning_rate": 5.351272817723813e-07, + "loss": 1.162, + "step": 4900 + }, + { + "epoch": 0.6644072392055853, + "grad_norm": 6.928870472232903, + "learning_rate": 5.347385237683504e-07, + "loss": 1.1474, + "step": 4901 + }, + { + "epoch": 0.6645428048532502, + "grad_norm": 1.868636793483785, + "learning_rate": 5.343498554863225e-07, + "loss": 1.1504, + "step": 4902 + }, + { + "epoch": 0.664678370500915, + "grad_norm": 1.7792261192689378, + "learning_rate": 5.339612770012494e-07, + "loss": 1.175, + "step": 4903 + }, + { + "epoch": 0.66481393614858, + "grad_norm": 1.6868089676383473, + "learning_rate": 5.335727883880654e-07, + "loss": 1.1365, + "step": 4904 + }, + { + "epoch": 0.6649495017962448, + "grad_norm": 1.9256831239914998, + "learning_rate": 5.331843897216873e-07, + "loss": 1.1396, + "step": 4905 + }, + { + "epoch": 0.6650850674439097, + "grad_norm": 1.9579034060423899, + "learning_rate": 5.327960810770149e-07, + "loss": 1.1062, + "step": 4906 + }, + { + "epoch": 0.6652206330915746, + "grad_norm": 1.717392774436381, + "learning_rate": 5.324078625289304e-07, + "loss": 1.1748, + 
"step": 4907 + }, + { + "epoch": 0.6653561987392395, + "grad_norm": 1.9917025367055718, + "learning_rate": 5.320197341522985e-07, + "loss": 1.1921, + "step": 4908 + }, + { + "epoch": 0.6654917643869044, + "grad_norm": 2.0848228312699777, + "learning_rate": 5.316316960219673e-07, + "loss": 1.1381, + "step": 4909 + }, + { + "epoch": 0.6656273300345692, + "grad_norm": 1.7208813621232353, + "learning_rate": 5.312437482127659e-07, + "loss": 1.1808, + "step": 4910 + }, + { + "epoch": 0.6657628956822341, + "grad_norm": 2.4483619702044894, + "learning_rate": 5.30855890799508e-07, + "loss": 1.1132, + "step": 4911 + }, + { + "epoch": 0.665898461329899, + "grad_norm": 1.9412023968192726, + "learning_rate": 5.304681238569877e-07, + "loss": 1.1465, + "step": 4912 + }, + { + "epoch": 0.6660340269775639, + "grad_norm": 3.306438163248223, + "learning_rate": 5.300804474599842e-07, + "loss": 1.1588, + "step": 4913 + }, + { + "epoch": 0.6661695926252288, + "grad_norm": 1.870943748169584, + "learning_rate": 5.296928616832568e-07, + "loss": 1.129, + "step": 4914 + }, + { + "epoch": 0.6663051582728936, + "grad_norm": 1.9177675479862217, + "learning_rate": 5.293053666015485e-07, + "loss": 1.176, + "step": 4915 + }, + { + "epoch": 0.6664407239205585, + "grad_norm": 2.0957936037352893, + "learning_rate": 5.28917962289585e-07, + "loss": 1.1312, + "step": 4916 + }, + { + "epoch": 0.6665762895682235, + "grad_norm": 2.085412578121973, + "learning_rate": 5.28530648822074e-07, + "loss": 1.0967, + "step": 4917 + }, + { + "epoch": 0.6667118552158883, + "grad_norm": 1.8004290218898809, + "learning_rate": 5.281434262737056e-07, + "loss": 1.1721, + "step": 4918 + }, + { + "epoch": 0.6668474208635532, + "grad_norm": 2.0608401097604974, + "learning_rate": 5.277562947191529e-07, + "loss": 1.1399, + "step": 4919 + }, + { + "epoch": 0.666982986511218, + "grad_norm": 7.515091923769975, + "learning_rate": 5.273692542330713e-07, + "loss": 1.1684, + "step": 4920 + }, + { + "epoch": 0.6671185521588829, + "grad_norm": 1.9062843617031093, + "learning_rate": 5.269823048900981e-07, + "loss": 1.1865, + "step": 4921 + }, + { + "epoch": 0.6672541178065479, + "grad_norm": 2.5177129028635528, + "learning_rate": 5.265954467648539e-07, + "loss": 1.1145, + "step": 4922 + }, + { + "epoch": 0.6673896834542127, + "grad_norm": 2.3214279081831957, + "learning_rate": 5.262086799319405e-07, + "loss": 1.1634, + "step": 4923 + }, + { + "epoch": 0.6675252491018776, + "grad_norm": 1.8544440096703432, + "learning_rate": 5.258220044659438e-07, + "loss": 1.1419, + "step": 4924 + }, + { + "epoch": 0.6676608147495424, + "grad_norm": 1.896007369977626, + "learning_rate": 5.2543542044143e-07, + "loss": 1.1311, + "step": 4925 + }, + { + "epoch": 0.6677963803972073, + "grad_norm": 1.94479957763714, + "learning_rate": 5.2504892793295e-07, + "loss": 1.1433, + "step": 4926 + }, + { + "epoch": 0.6679319460448723, + "grad_norm": 2.104265927693193, + "learning_rate": 5.246625270150346e-07, + "loss": 1.1376, + "step": 4927 + }, + { + "epoch": 0.6680675116925371, + "grad_norm": 1.8987577353340368, + "learning_rate": 5.242762177621994e-07, + "loss": 1.2027, + "step": 4928 + }, + { + "epoch": 0.668203077340202, + "grad_norm": 1.761099778461033, + "learning_rate": 5.238900002489398e-07, + "loss": 1.1198, + "step": 4929 + }, + { + "epoch": 0.6683386429878668, + "grad_norm": 1.9867557064100108, + "learning_rate": 5.235038745497363e-07, + "loss": 1.1043, + "step": 4930 + }, + { + "epoch": 0.6684742086355318, + "grad_norm": 1.8193418318777506, + "learning_rate": 
5.231178407390484e-07, + "loss": 1.1497, + "step": 4931 + }, + { + "epoch": 0.6686097742831967, + "grad_norm": 1.7847038397879245, + "learning_rate": 5.227318988913216e-07, + "loss": 1.1445, + "step": 4932 + }, + { + "epoch": 0.6687453399308615, + "grad_norm": 2.197043988897054, + "learning_rate": 5.223460490809799e-07, + "loss": 1.1224, + "step": 4933 + }, + { + "epoch": 0.6688809055785264, + "grad_norm": 1.7305797999520216, + "learning_rate": 5.21960291382433e-07, + "loss": 1.1974, + "step": 4934 + }, + { + "epoch": 0.6690164712261912, + "grad_norm": 1.871978278498084, + "learning_rate": 5.215746258700698e-07, + "loss": 1.1248, + "step": 4935 + }, + { + "epoch": 0.6691520368738562, + "grad_norm": 1.6291297853161881, + "learning_rate": 5.211890526182642e-07, + "loss": 1.1386, + "step": 4936 + }, + { + "epoch": 0.6692876025215211, + "grad_norm": 1.772141086888954, + "learning_rate": 5.208035717013702e-07, + "loss": 1.1449, + "step": 4937 + }, + { + "epoch": 0.6694231681691859, + "grad_norm": 1.9658803290048514, + "learning_rate": 5.204181831937245e-07, + "loss": 1.1883, + "step": 4938 + }, + { + "epoch": 0.6695587338168508, + "grad_norm": 1.6775873837048927, + "learning_rate": 5.200328871696468e-07, + "loss": 1.1877, + "step": 4939 + }, + { + "epoch": 0.6696942994645156, + "grad_norm": 1.8446997608033207, + "learning_rate": 5.19647683703438e-07, + "loss": 1.154, + "step": 4940 + }, + { + "epoch": 0.6698298651121806, + "grad_norm": 1.7282686671736218, + "learning_rate": 5.192625728693819e-07, + "loss": 1.1645, + "step": 4941 + }, + { + "epoch": 0.6699654307598455, + "grad_norm": 1.9292170202328573, + "learning_rate": 5.188775547417439e-07, + "loss": 1.1552, + "step": 4942 + }, + { + "epoch": 0.6701009964075103, + "grad_norm": 2.012831860550488, + "learning_rate": 5.184926293947716e-07, + "loss": 1.1751, + "step": 4943 + }, + { + "epoch": 0.6702365620551752, + "grad_norm": 1.779250741412479, + "learning_rate": 5.181077969026951e-07, + "loss": 1.1404, + "step": 4944 + }, + { + "epoch": 0.67037212770284, + "grad_norm": 1.79004566329686, + "learning_rate": 5.17723057339726e-07, + "loss": 1.138, + "step": 4945 + }, + { + "epoch": 0.670507693350505, + "grad_norm": 2.8573312210640283, + "learning_rate": 5.173384107800585e-07, + "loss": 1.1484, + "step": 4946 + }, + { + "epoch": 0.6706432589981699, + "grad_norm": 2.236078882776069, + "learning_rate": 5.169538572978684e-07, + "loss": 1.1448, + "step": 4947 + }, + { + "epoch": 0.6707788246458347, + "grad_norm": 1.7050572576570668, + "learning_rate": 5.165693969673142e-07, + "loss": 1.1307, + "step": 4948 + }, + { + "epoch": 0.6709143902934996, + "grad_norm": 1.8192251948979914, + "learning_rate": 5.161850298625362e-07, + "loss": 1.0979, + "step": 4949 + }, + { + "epoch": 0.6710499559411645, + "grad_norm": 2.2505733831619534, + "learning_rate": 5.158007560576557e-07, + "loss": 1.1527, + "step": 4950 + }, + { + "epoch": 0.6711855215888294, + "grad_norm": 1.913149403587009, + "learning_rate": 5.154165756267774e-07, + "loss": 1.185, + "step": 4951 + }, + { + "epoch": 0.6713210872364943, + "grad_norm": 1.9712741784590244, + "learning_rate": 5.150324886439874e-07, + "loss": 1.1519, + "step": 4952 + }, + { + "epoch": 0.6714566528841591, + "grad_norm": 2.3117962286932587, + "learning_rate": 5.14648495183354e-07, + "loss": 1.1506, + "step": 4953 + }, + { + "epoch": 0.671592218531824, + "grad_norm": 1.796775324110737, + "learning_rate": 5.142645953189271e-07, + "loss": 1.1355, + "step": 4954 + }, + { + "epoch": 0.6717277841794889, + "grad_norm": 
2.0325772727467295, + "learning_rate": 5.138807891247388e-07, + "loss": 1.1554, + "step": 4955 + }, + { + "epoch": 0.6718633498271538, + "grad_norm": 1.8679476672055544, + "learning_rate": 5.13497076674803e-07, + "loss": 1.157, + "step": 4956 + }, + { + "epoch": 0.6719989154748187, + "grad_norm": 1.8963230927561556, + "learning_rate": 5.13113458043116e-07, + "loss": 1.145, + "step": 4957 + }, + { + "epoch": 0.6721344811224835, + "grad_norm": 2.3339683308689603, + "learning_rate": 5.127299333036552e-07, + "loss": 1.1382, + "step": 4958 + }, + { + "epoch": 0.6722700467701485, + "grad_norm": 2.250945666902969, + "learning_rate": 5.123465025303804e-07, + "loss": 1.1463, + "step": 4959 + }, + { + "epoch": 0.6724056124178134, + "grad_norm": 2.450241907401851, + "learning_rate": 5.119631657972334e-07, + "loss": 1.1077, + "step": 4960 + }, + { + "epoch": 0.6725411780654782, + "grad_norm": 3.2217247648364107, + "learning_rate": 5.115799231781377e-07, + "loss": 1.1267, + "step": 4961 + }, + { + "epoch": 0.6726767437131431, + "grad_norm": 1.911896005240658, + "learning_rate": 5.111967747469983e-07, + "loss": 1.1563, + "step": 4962 + }, + { + "epoch": 0.6728123093608079, + "grad_norm": 7.17364199391422, + "learning_rate": 5.108137205777026e-07, + "loss": 1.154, + "step": 4963 + }, + { + "epoch": 0.6729478750084729, + "grad_norm": 3.046627104362123, + "learning_rate": 5.104307607441193e-07, + "loss": 1.128, + "step": 4964 + }, + { + "epoch": 0.6730834406561378, + "grad_norm": 1.7683272178308254, + "learning_rate": 5.100478953200999e-07, + "loss": 1.0855, + "step": 4965 + }, + { + "epoch": 0.6732190063038026, + "grad_norm": 1.9737776490937589, + "learning_rate": 5.096651243794756e-07, + "loss": 1.1195, + "step": 4966 + }, + { + "epoch": 0.6733545719514675, + "grad_norm": 1.6664891420524943, + "learning_rate": 5.092824479960625e-07, + "loss": 1.1064, + "step": 4967 + }, + { + "epoch": 0.6734901375991323, + "grad_norm": 2.1233131192796986, + "learning_rate": 5.088998662436548e-07, + "loss": 1.1093, + "step": 4968 + }, + { + "epoch": 0.6736257032467973, + "grad_norm": 2.9516290631175393, + "learning_rate": 5.085173791960324e-07, + "loss": 1.0921, + "step": 4969 + }, + { + "epoch": 0.6737612688944622, + "grad_norm": 1.6005555593690795, + "learning_rate": 5.081349869269529e-07, + "loss": 1.1241, + "step": 4970 + }, + { + "epoch": 0.673896834542127, + "grad_norm": 2.008604539177456, + "learning_rate": 5.077526895101596e-07, + "loss": 1.1057, + "step": 4971 + }, + { + "epoch": 0.6740324001897919, + "grad_norm": 1.9228647640597991, + "learning_rate": 5.073704870193736e-07, + "loss": 1.1071, + "step": 4972 + }, + { + "epoch": 0.6741679658374568, + "grad_norm": 2.1815591948501436, + "learning_rate": 5.069883795283015e-07, + "loss": 1.1501, + "step": 4973 + }, + { + "epoch": 0.6743035314851217, + "grad_norm": 1.651652959052335, + "learning_rate": 5.066063671106281e-07, + "loss": 1.1567, + "step": 4974 + }, + { + "epoch": 0.6744390971327866, + "grad_norm": 5.487591867817006, + "learning_rate": 5.062244498400228e-07, + "loss": 1.1265, + "step": 4975 + }, + { + "epoch": 0.6745746627804514, + "grad_norm": 3.823504764499656, + "learning_rate": 5.058426277901344e-07, + "loss": 1.1945, + "step": 4976 + }, + { + "epoch": 0.6747102284281163, + "grad_norm": 1.9282032028256977, + "learning_rate": 5.054609010345947e-07, + "loss": 1.147, + "step": 4977 + }, + { + "epoch": 0.6748457940757812, + "grad_norm": 1.8391289201745504, + "learning_rate": 5.050792696470165e-07, + "loss": 1.1822, + "step": 4978 + }, + { + "epoch": 
0.6749813597234461, + "grad_norm": 1.7512812184089577, + "learning_rate": 5.046977337009945e-07, + "loss": 1.1111, + "step": 4979 + }, + { + "epoch": 0.675116925371111, + "grad_norm": 6.55735079247119, + "learning_rate": 5.043162932701048e-07, + "loss": 1.1452, + "step": 4980 + }, + { + "epoch": 0.6752524910187758, + "grad_norm": 2.369920074135184, + "learning_rate": 5.039349484279053e-07, + "loss": 1.1427, + "step": 4981 + }, + { + "epoch": 0.6753880566664408, + "grad_norm": 1.7954933840650138, + "learning_rate": 5.035536992479352e-07, + "loss": 1.1542, + "step": 4982 + }, + { + "epoch": 0.6755236223141056, + "grad_norm": 1.78305796790474, + "learning_rate": 5.031725458037157e-07, + "loss": 1.143, + "step": 4983 + }, + { + "epoch": 0.6756591879617705, + "grad_norm": 1.6959948702542433, + "learning_rate": 5.027914881687489e-07, + "loss": 1.1212, + "step": 4984 + }, + { + "epoch": 0.6757947536094354, + "grad_norm": 2.0419715936840914, + "learning_rate": 5.024105264165188e-07, + "loss": 1.1379, + "step": 4985 + }, + { + "epoch": 0.6759303192571002, + "grad_norm": 2.9937067754558315, + "learning_rate": 5.020296606204915e-07, + "loss": 1.1549, + "step": 4986 + }, + { + "epoch": 0.6760658849047652, + "grad_norm": 2.8561723938842585, + "learning_rate": 5.016488908541125e-07, + "loss": 1.1199, + "step": 4987 + }, + { + "epoch": 0.67620145055243, + "grad_norm": 1.7922504651378723, + "learning_rate": 5.01268217190812e-07, + "loss": 1.1348, + "step": 4988 + }, + { + "epoch": 0.6763370162000949, + "grad_norm": 1.8171250469202496, + "learning_rate": 5.008876397039983e-07, + "loss": 1.1635, + "step": 4989 + }, + { + "epoch": 0.6764725818477598, + "grad_norm": 1.8097662881577066, + "learning_rate": 5.005071584670644e-07, + "loss": 1.1428, + "step": 4990 + }, + { + "epoch": 0.6766081474954246, + "grad_norm": 3.1388817502306736, + "learning_rate": 5.001267735533811e-07, + "loss": 1.1522, + "step": 4991 + }, + { + "epoch": 0.6767437131430896, + "grad_norm": 1.6637374105955727, + "learning_rate": 4.997464850363049e-07, + "loss": 1.1742, + "step": 4992 + }, + { + "epoch": 0.6768792787907544, + "grad_norm": 2.0142025781612967, + "learning_rate": 4.993662929891698e-07, + "loss": 1.1937, + "step": 4993 + }, + { + "epoch": 0.6770148444384193, + "grad_norm": 2.000659608863876, + "learning_rate": 4.989861974852934e-07, + "loss": 1.1616, + "step": 4994 + }, + { + "epoch": 0.6771504100860842, + "grad_norm": 8.597543301018243, + "learning_rate": 4.986061985979739e-07, + "loss": 1.1634, + "step": 4995 + }, + { + "epoch": 0.677285975733749, + "grad_norm": 1.947594491510231, + "learning_rate": 4.982262964004913e-07, + "loss": 1.1099, + "step": 4996 + }, + { + "epoch": 0.677421541381414, + "grad_norm": 1.4942688733876819, + "learning_rate": 4.978464909661067e-07, + "loss": 1.1162, + "step": 4997 + }, + { + "epoch": 0.6775571070290788, + "grad_norm": 2.2308495104847554, + "learning_rate": 4.974667823680626e-07, + "loss": 1.1802, + "step": 4998 + }, + { + "epoch": 0.6776926726767437, + "grad_norm": 7.653374211243017, + "learning_rate": 4.970871706795827e-07, + "loss": 1.1337, + "step": 4999 + }, + { + "epoch": 0.6778282383244086, + "grad_norm": 1.6039771902939703, + "learning_rate": 4.967076559738722e-07, + "loss": 1.1625, + "step": 5000 + }, + { + "epoch": 0.6779638039720735, + "grad_norm": 1.9633293593513037, + "learning_rate": 4.963282383241175e-07, + "loss": 1.1425, + "step": 5001 + }, + { + "epoch": 0.6780993696197384, + "grad_norm": 2.958040414883653, + "learning_rate": 4.959489178034863e-07, + "loss": 1.1391, + 
"step": 5002 + }, + { + "epoch": 0.6782349352674032, + "grad_norm": 1.6181780665570968, + "learning_rate": 4.955696944851276e-07, + "loss": 1.1299, + "step": 5003 + }, + { + "epoch": 0.6783705009150681, + "grad_norm": 2.3637785444174075, + "learning_rate": 4.951905684421716e-07, + "loss": 1.1795, + "step": 5004 + }, + { + "epoch": 0.678506066562733, + "grad_norm": 2.289875918293436, + "learning_rate": 4.948115397477296e-07, + "loss": 1.1508, + "step": 5005 + }, + { + "epoch": 0.6786416322103979, + "grad_norm": 3.1397714580669964, + "learning_rate": 4.94432608474895e-07, + "loss": 1.1476, + "step": 5006 + }, + { + "epoch": 0.6787771978580628, + "grad_norm": 1.7697171626636006, + "learning_rate": 4.940537746967403e-07, + "loss": 1.1358, + "step": 5007 + }, + { + "epoch": 0.6789127635057276, + "grad_norm": 1.7344441557961077, + "learning_rate": 4.936750384863222e-07, + "loss": 1.1219, + "step": 5008 + }, + { + "epoch": 0.6790483291533925, + "grad_norm": 1.789187416193793, + "learning_rate": 4.932963999166755e-07, + "loss": 1.1622, + "step": 5009 + }, + { + "epoch": 0.6791838948010575, + "grad_norm": 14.578428226991623, + "learning_rate": 4.929178590608191e-07, + "loss": 1.1665, + "step": 5010 + }, + { + "epoch": 0.6793194604487223, + "grad_norm": 2.159577271808813, + "learning_rate": 4.925394159917506e-07, + "loss": 1.1215, + "step": 5011 + }, + { + "epoch": 0.6794550260963872, + "grad_norm": 2.1236527293189233, + "learning_rate": 4.921610707824501e-07, + "loss": 1.1675, + "step": 5012 + }, + { + "epoch": 0.679590591744052, + "grad_norm": 1.4978472327400143, + "learning_rate": 4.917828235058785e-07, + "loss": 1.1646, + "step": 5013 + }, + { + "epoch": 0.6797261573917169, + "grad_norm": 1.919431282039284, + "learning_rate": 4.914046742349777e-07, + "loss": 1.129, + "step": 5014 + }, + { + "epoch": 0.6798617230393819, + "grad_norm": 1.8881033480537677, + "learning_rate": 4.910266230426708e-07, + "loss": 1.1395, + "step": 5015 + }, + { + "epoch": 0.6799972886870467, + "grad_norm": 1.8041316289629497, + "learning_rate": 4.906486700018622e-07, + "loss": 1.1606, + "step": 5016 + }, + { + "epoch": 0.6801328543347116, + "grad_norm": 2.225538179194552, + "learning_rate": 4.90270815185437e-07, + "loss": 1.1622, + "step": 5017 + }, + { + "epoch": 0.6802684199823764, + "grad_norm": 2.5434962278475393, + "learning_rate": 4.898930586662614e-07, + "loss": 1.1395, + "step": 5018 + }, + { + "epoch": 0.6804039856300413, + "grad_norm": 2.307132213536817, + "learning_rate": 4.89515400517183e-07, + "loss": 1.1486, + "step": 5019 + }, + { + "epoch": 0.6805395512777063, + "grad_norm": 1.9553213762782022, + "learning_rate": 4.891378408110301e-07, + "loss": 1.1316, + "step": 5020 + }, + { + "epoch": 0.6806751169253711, + "grad_norm": 2.002629348033198, + "learning_rate": 4.887603796206124e-07, + "loss": 1.1471, + "step": 5021 + }, + { + "epoch": 0.680810682573036, + "grad_norm": 2.734604576666178, + "learning_rate": 4.883830170187193e-07, + "loss": 1.132, + "step": 5022 + }, + { + "epoch": 0.6809462482207008, + "grad_norm": 1.7164194770560526, + "learning_rate": 4.880057530781237e-07, + "loss": 1.1578, + "step": 5023 + }, + { + "epoch": 0.6810818138683657, + "grad_norm": 1.774012122026986, + "learning_rate": 4.876285878715763e-07, + "loss": 1.117, + "step": 5024 + }, + { + "epoch": 0.6812173795160307, + "grad_norm": 2.776167790774858, + "learning_rate": 4.872515214718123e-07, + "loss": 1.1902, + "step": 5025 + }, + { + "epoch": 0.6813529451636955, + "grad_norm": 1.6486112908981003, + "learning_rate": 
4.86874553951544e-07, + "loss": 1.1243, + "step": 5026 + }, + { + "epoch": 0.6814885108113604, + "grad_norm": 2.0563512613909647, + "learning_rate": 4.864976853834684e-07, + "loss": 1.1432, + "step": 5027 + }, + { + "epoch": 0.6816240764590252, + "grad_norm": 2.036065356455968, + "learning_rate": 4.861209158402601e-07, + "loss": 1.134, + "step": 5028 + }, + { + "epoch": 0.6817596421066902, + "grad_norm": 1.7881088633139293, + "learning_rate": 4.857442453945779e-07, + "loss": 1.1173, + "step": 5029 + }, + { + "epoch": 0.6818952077543551, + "grad_norm": 1.7386061574264517, + "learning_rate": 4.853676741190576e-07, + "loss": 1.1291, + "step": 5030 + }, + { + "epoch": 0.6820307734020199, + "grad_norm": 2.8752923377305324, + "learning_rate": 4.849912020863198e-07, + "loss": 1.1662, + "step": 5031 + }, + { + "epoch": 0.6821663390496848, + "grad_norm": 1.8254249828738147, + "learning_rate": 4.846148293689629e-07, + "loss": 1.1344, + "step": 5032 + }, + { + "epoch": 0.6823019046973496, + "grad_norm": 2.1503984835346115, + "learning_rate": 4.842385560395687e-07, + "loss": 1.1424, + "step": 5033 + }, + { + "epoch": 0.6824374703450146, + "grad_norm": 3.181445352576919, + "learning_rate": 4.838623821706973e-07, + "loss": 1.1289, + "step": 5034 + }, + { + "epoch": 0.6825730359926795, + "grad_norm": 2.669079965507158, + "learning_rate": 4.834863078348915e-07, + "loss": 1.1845, + "step": 5035 + }, + { + "epoch": 0.6827086016403443, + "grad_norm": 1.8241266438378543, + "learning_rate": 4.831103331046739e-07, + "loss": 1.1443, + "step": 5036 + }, + { + "epoch": 0.6828441672880092, + "grad_norm": 1.70657594297962, + "learning_rate": 4.827344580525487e-07, + "loss": 1.1219, + "step": 5037 + }, + { + "epoch": 0.6829797329356742, + "grad_norm": 1.7653118699908665, + "learning_rate": 4.82358682751e-07, + "loss": 1.1211, + "step": 5038 + }, + { + "epoch": 0.683115298583339, + "grad_norm": 1.8333178375493715, + "learning_rate": 4.819830072724934e-07, + "loss": 1.1619, + "step": 5039 + }, + { + "epoch": 0.6832508642310039, + "grad_norm": 1.7950245302822658, + "learning_rate": 4.816074316894749e-07, + "loss": 1.1531, + "step": 5040 + }, + { + "epoch": 0.6833864298786687, + "grad_norm": 1.7259832755374223, + "learning_rate": 4.812319560743713e-07, + "loss": 1.1931, + "step": 5041 + }, + { + "epoch": 0.6835219955263336, + "grad_norm": 2.0694569528682965, + "learning_rate": 4.8085658049959e-07, + "loss": 1.1251, + "step": 5042 + }, + { + "epoch": 0.6836575611739986, + "grad_norm": 1.8480045357356578, + "learning_rate": 4.804813050375194e-07, + "loss": 1.1116, + "step": 5043 + }, + { + "epoch": 0.6837931268216634, + "grad_norm": 1.9832750589084536, + "learning_rate": 4.801061297605282e-07, + "loss": 1.1467, + "step": 5044 + }, + { + "epoch": 0.6839286924693283, + "grad_norm": 2.0691602337905506, + "learning_rate": 4.797310547409661e-07, + "loss": 1.1163, + "step": 5045 + }, + { + "epoch": 0.6840642581169931, + "grad_norm": 1.648663173002453, + "learning_rate": 4.793560800511634e-07, + "loss": 1.1487, + "step": 5046 + }, + { + "epoch": 0.684199823764658, + "grad_norm": 1.95739641895773, + "learning_rate": 4.789812057634308e-07, + "loss": 1.1897, + "step": 5047 + }, + { + "epoch": 0.684335389412323, + "grad_norm": 5.209175473585513, + "learning_rate": 4.786064319500604e-07, + "loss": 1.1443, + "step": 5048 + }, + { + "epoch": 0.6844709550599878, + "grad_norm": 1.7244926234941944, + "learning_rate": 4.782317586833236e-07, + "loss": 1.1328, + "step": 5049 + }, + { + "epoch": 0.6846065207076527, + "grad_norm": 
2.3000627827580886, + "learning_rate": 4.778571860354737e-07, + "loss": 1.1414, + "step": 5050 + }, + { + "epoch": 0.6847420863553175, + "grad_norm": 2.0404313698274428, + "learning_rate": 4.774827140787437e-07, + "loss": 1.1579, + "step": 5051 + }, + { + "epoch": 0.6848776520029825, + "grad_norm": 2.3605524720374462, + "learning_rate": 4.77108342885348e-07, + "loss": 1.1476, + "step": 5052 + }, + { + "epoch": 0.6850132176506474, + "grad_norm": 2.421485481821315, + "learning_rate": 4.767340725274809e-07, + "loss": 1.1296, + "step": 5053 + }, + { + "epoch": 0.6851487832983122, + "grad_norm": 2.2465893939250425, + "learning_rate": 4.763599030773173e-07, + "loss": 1.1688, + "step": 5054 + }, + { + "epoch": 0.6852843489459771, + "grad_norm": 2.110827829338083, + "learning_rate": 4.7598583460701324e-07, + "loss": 1.1352, + "step": 5055 + }, + { + "epoch": 0.6854199145936419, + "grad_norm": 1.8523805540591856, + "learning_rate": 4.756118671887046e-07, + "loss": 1.1315, + "step": 5056 + }, + { + "epoch": 0.6855554802413069, + "grad_norm": 2.080701444291509, + "learning_rate": 4.7523800089450804e-07, + "loss": 1.1386, + "step": 5057 + }, + { + "epoch": 0.6856910458889718, + "grad_norm": 2.146619300343528, + "learning_rate": 4.748642357965208e-07, + "loss": 1.1563, + "step": 5058 + }, + { + "epoch": 0.6858266115366366, + "grad_norm": 2.1206596978794376, + "learning_rate": 4.7449057196682063e-07, + "loss": 1.1367, + "step": 5059 + }, + { + "epoch": 0.6859621771843015, + "grad_norm": 1.9505484208654258, + "learning_rate": 4.7411700947746534e-07, + "loss": 1.1459, + "step": 5060 + }, + { + "epoch": 0.6860977428319663, + "grad_norm": 3.34999439942788, + "learning_rate": 4.737435484004939e-07, + "loss": 1.1659, + "step": 5061 + }, + { + "epoch": 0.6862333084796313, + "grad_norm": 1.642873325830551, + "learning_rate": 4.7337018880792544e-07, + "loss": 1.1658, + "step": 5062 + }, + { + "epoch": 0.6863688741272962, + "grad_norm": 1.526413702126744, + "learning_rate": 4.729969307717583e-07, + "loss": 1.1105, + "step": 5063 + }, + { + "epoch": 0.686504439774961, + "grad_norm": 2.5388091901981547, + "learning_rate": 4.7262377436397396e-07, + "loss": 1.1826, + "step": 5064 + }, + { + "epoch": 0.6866400054226259, + "grad_norm": 3.334988618800827, + "learning_rate": 4.722507196565311e-07, + "loss": 1.1325, + "step": 5065 + }, + { + "epoch": 0.6867755710702907, + "grad_norm": 3.138323009766919, + "learning_rate": 4.718777667213719e-07, + "loss": 1.1788, + "step": 5066 + }, + { + "epoch": 0.6869111367179557, + "grad_norm": 1.6451296346463442, + "learning_rate": 4.7150491563041597e-07, + "loss": 1.1308, + "step": 5067 + }, + { + "epoch": 0.6870467023656206, + "grad_norm": 1.8955145989215798, + "learning_rate": 4.7113216645556606e-07, + "loss": 1.1011, + "step": 5068 + }, + { + "epoch": 0.6871822680132854, + "grad_norm": 2.0542527621609636, + "learning_rate": 4.707595192687025e-07, + "loss": 1.143, + "step": 5069 + }, + { + "epoch": 0.6873178336609503, + "grad_norm": 1.5832640570193075, + "learning_rate": 4.703869741416888e-07, + "loss": 1.1512, + "step": 5070 + }, + { + "epoch": 0.6874533993086152, + "grad_norm": 2.1480320268382336, + "learning_rate": 4.700145311463659e-07, + "loss": 1.0904, + "step": 5071 + }, + { + "epoch": 0.6875889649562801, + "grad_norm": 2.043654259079959, + "learning_rate": 4.696421903545579e-07, + "loss": 1.1466, + "step": 5072 + }, + { + "epoch": 0.687724530603945, + "grad_norm": 2.905632239383569, + "learning_rate": 4.692699518380664e-07, + "loss": 1.1479, + "step": 5073 + }, + { + 
"epoch": 0.6878600962516098, + "grad_norm": 2.00058601002724, + "learning_rate": 4.6889781566867617e-07, + "loss": 1.1628, + "step": 5074 + }, + { + "epoch": 0.6879956618992747, + "grad_norm": 2.0893475223824485, + "learning_rate": 4.685257819181494e-07, + "loss": 1.1378, + "step": 5075 + }, + { + "epoch": 0.6881312275469396, + "grad_norm": 1.8709188257835523, + "learning_rate": 4.6815385065823053e-07, + "loss": 1.1116, + "step": 5076 + }, + { + "epoch": 0.6882667931946045, + "grad_norm": 2.883898942025847, + "learning_rate": 4.677820219606433e-07, + "loss": 1.1437, + "step": 5077 + }, + { + "epoch": 0.6884023588422694, + "grad_norm": 2.089993673542408, + "learning_rate": 4.6741029589709216e-07, + "loss": 1.1506, + "step": 5078 + }, + { + "epoch": 0.6885379244899342, + "grad_norm": 2.455327165689213, + "learning_rate": 4.6703867253926144e-07, + "loss": 1.1186, + "step": 5079 + }, + { + "epoch": 0.6886734901375992, + "grad_norm": 1.5960851886756864, + "learning_rate": 4.666671519588158e-07, + "loss": 1.1126, + "step": 5080 + }, + { + "epoch": 0.688809055785264, + "grad_norm": 1.9624601746802792, + "learning_rate": 4.662957342274e-07, + "loss": 1.1615, + "step": 5081 + }, + { + "epoch": 0.6889446214329289, + "grad_norm": 1.7704281534312996, + "learning_rate": 4.6592441941663896e-07, + "loss": 1.1665, + "step": 5082 + }, + { + "epoch": 0.6890801870805938, + "grad_norm": 3.6139138795799397, + "learning_rate": 4.655532075981383e-07, + "loss": 1.1432, + "step": 5083 + }, + { + "epoch": 0.6892157527282586, + "grad_norm": 5.474973027898383, + "learning_rate": 4.6518209884348227e-07, + "loss": 1.1533, + "step": 5084 + }, + { + "epoch": 0.6893513183759236, + "grad_norm": 1.7580266174152825, + "learning_rate": 4.648110932242375e-07, + "loss": 1.1561, + "step": 5085 + }, + { + "epoch": 0.6894868840235884, + "grad_norm": 2.079680421019604, + "learning_rate": 4.644401908119482e-07, + "loss": 1.1436, + "step": 5086 + }, + { + "epoch": 0.6896224496712533, + "grad_norm": 2.119691666958498, + "learning_rate": 4.640693916781414e-07, + "loss": 1.1419, + "step": 5087 + }, + { + "epoch": 0.6897580153189182, + "grad_norm": 1.938290878288503, + "learning_rate": 4.636986958943212e-07, + "loss": 1.1083, + "step": 5088 + }, + { + "epoch": 0.689893580966583, + "grad_norm": 1.8063376380165785, + "learning_rate": 4.6332810353197503e-07, + "loss": 1.1485, + "step": 5089 + }, + { + "epoch": 0.690029146614248, + "grad_norm": 2.1879948580294863, + "learning_rate": 4.629576146625674e-07, + "loss": 1.1743, + "step": 5090 + }, + { + "epoch": 0.6901647122619128, + "grad_norm": 1.9827184973753236, + "learning_rate": 4.625872293575448e-07, + "loss": 1.0994, + "step": 5091 + }, + { + "epoch": 0.6903002779095777, + "grad_norm": 2.0802868574197815, + "learning_rate": 4.6221694768833276e-07, + "loss": 1.1322, + "step": 5092 + }, + { + "epoch": 0.6904358435572426, + "grad_norm": 1.8200226985318657, + "learning_rate": 4.6184676972633753e-07, + "loss": 1.1336, + "step": 5093 + }, + { + "epoch": 0.6905714092049074, + "grad_norm": 2.507313083135033, + "learning_rate": 4.614766955429447e-07, + "loss": 1.1586, + "step": 5094 + }, + { + "epoch": 0.6907069748525724, + "grad_norm": 1.7504669501238492, + "learning_rate": 4.6110672520952033e-07, + "loss": 1.1481, + "step": 5095 + }, + { + "epoch": 0.6908425405002372, + "grad_norm": 1.7957818197589939, + "learning_rate": 4.607368587974102e-07, + "loss": 1.1522, + "step": 5096 + }, + { + "epoch": 0.6909781061479021, + "grad_norm": 2.485200501114595, + "learning_rate": 4.6036709637794026e-07, + 
"loss": 1.1495, + "step": 5097 + }, + { + "epoch": 0.691113671795567, + "grad_norm": 1.7888616884229662, + "learning_rate": 4.599974380224161e-07, + "loss": 1.1746, + "step": 5098 + }, + { + "epoch": 0.6912492374432319, + "grad_norm": 2.3283447464805236, + "learning_rate": 4.5962788380212346e-07, + "loss": 1.1495, + "step": 5099 + }, + { + "epoch": 0.6913848030908968, + "grad_norm": 1.7377746708193749, + "learning_rate": 4.592584337883281e-07, + "loss": 1.1335, + "step": 5100 + }, + { + "epoch": 0.6915203687385616, + "grad_norm": 1.962031396372365, + "learning_rate": 4.5888908805227536e-07, + "loss": 1.1577, + "step": 5101 + }, + { + "epoch": 0.6916559343862265, + "grad_norm": 2.5145569217711667, + "learning_rate": 4.585198466651907e-07, + "loss": 1.1494, + "step": 5102 + }, + { + "epoch": 0.6917915000338914, + "grad_norm": 1.8550989433416205, + "learning_rate": 4.581507096982794e-07, + "loss": 1.1091, + "step": 5103 + }, + { + "epoch": 0.6919270656815563, + "grad_norm": 2.375064884466349, + "learning_rate": 4.5778167722272674e-07, + "loss": 1.1189, + "step": 5104 + }, + { + "epoch": 0.6920626313292212, + "grad_norm": 1.5742482582474417, + "learning_rate": 4.57412749309698e-07, + "loss": 1.119, + "step": 5105 + }, + { + "epoch": 0.692198196976886, + "grad_norm": 2.2068484951544916, + "learning_rate": 4.570439260303368e-07, + "loss": 1.1218, + "step": 5106 + }, + { + "epoch": 0.6923337626245509, + "grad_norm": 2.105558307564573, + "learning_rate": 4.566752074557694e-07, + "loss": 1.1638, + "step": 5107 + }, + { + "epoch": 0.6924693282722159, + "grad_norm": 13.473941836777712, + "learning_rate": 4.563065936570988e-07, + "loss": 1.1415, + "step": 5108 + }, + { + "epoch": 0.6926048939198807, + "grad_norm": 2.0901449344339094, + "learning_rate": 4.559380847054106e-07, + "loss": 1.1432, + "step": 5109 + }, + { + "epoch": 0.6927404595675456, + "grad_norm": 8.228733288777564, + "learning_rate": 4.555696806717679e-07, + "loss": 1.1633, + "step": 5110 + }, + { + "epoch": 0.6928760252152104, + "grad_norm": 1.9378429929808354, + "learning_rate": 4.552013816272148e-07, + "loss": 1.1225, + "step": 5111 + }, + { + "epoch": 0.6930115908628753, + "grad_norm": 1.8739247306357578, + "learning_rate": 4.548331876427749e-07, + "loss": 1.1233, + "step": 5112 + }, + { + "epoch": 0.6931471565105403, + "grad_norm": 1.7691806049745016, + "learning_rate": 4.544650987894514e-07, + "loss": 1.1321, + "step": 5113 + }, + { + "epoch": 0.6932827221582051, + "grad_norm": 1.8668016420737397, + "learning_rate": 4.5409711513822745e-07, + "loss": 1.1788, + "step": 5114 + }, + { + "epoch": 0.69341828780587, + "grad_norm": 1.6360232437791342, + "learning_rate": 4.537292367600658e-07, + "loss": 1.144, + "step": 5115 + }, + { + "epoch": 0.6935538534535349, + "grad_norm": 2.7074304032381553, + "learning_rate": 4.5336146372590876e-07, + "loss": 1.124, + "step": 5116 + }, + { + "epoch": 0.6936894191011997, + "grad_norm": 1.7046679266598936, + "learning_rate": 4.5299379610667865e-07, + "loss": 1.2063, + "step": 5117 + }, + { + "epoch": 0.6938249847488647, + "grad_norm": 1.9481372438053226, + "learning_rate": 4.5262623397327706e-07, + "loss": 1.1438, + "step": 5118 + }, + { + "epoch": 0.6939605503965295, + "grad_norm": 2.119857841202536, + "learning_rate": 4.522587773965856e-07, + "loss": 1.1798, + "step": 5119 + }, + { + "epoch": 0.6940961160441944, + "grad_norm": 1.9710735222056244, + "learning_rate": 4.518914264474657e-07, + "loss": 1.1628, + "step": 5120 + }, + { + "epoch": 0.6942316816918593, + "grad_norm": 1.8898391179371108, + 
"learning_rate": 4.5152418119675684e-07, + "loss": 1.1233, + "step": 5121 + }, + { + "epoch": 0.6943672473395242, + "grad_norm": 1.9193111737049855, + "learning_rate": 4.5115704171528103e-07, + "loss": 1.1092, + "step": 5122 + }, + { + "epoch": 0.6945028129871891, + "grad_norm": 1.6680787669421402, + "learning_rate": 4.507900080738367e-07, + "loss": 1.1837, + "step": 5123 + }, + { + "epoch": 0.6946383786348539, + "grad_norm": 1.5676780782937902, + "learning_rate": 4.5042308034320487e-07, + "loss": 1.1732, + "step": 5124 + }, + { + "epoch": 0.6947739442825188, + "grad_norm": 3.097570564381899, + "learning_rate": 4.500562585941432e-07, + "loss": 1.1075, + "step": 5125 + }, + { + "epoch": 0.6949095099301837, + "grad_norm": 1.8750146307032705, + "learning_rate": 4.496895428973917e-07, + "loss": 1.1653, + "step": 5126 + }, + { + "epoch": 0.6950450755778486, + "grad_norm": 3.722337248327742, + "learning_rate": 4.4932293332366733e-07, + "loss": 1.172, + "step": 5127 + }, + { + "epoch": 0.6951806412255135, + "grad_norm": 2.3559570061684947, + "learning_rate": 4.489564299436691e-07, + "loss": 1.1358, + "step": 5128 + }, + { + "epoch": 0.6953162068731783, + "grad_norm": 1.7257316318488374, + "learning_rate": 4.4859003282807305e-07, + "loss": 1.1172, + "step": 5129 + }, + { + "epoch": 0.6954517725208432, + "grad_norm": 2.21344394404433, + "learning_rate": 4.4822374204753734e-07, + "loss": 1.1668, + "step": 5130 + }, + { + "epoch": 0.6955873381685082, + "grad_norm": 1.504742440729728, + "learning_rate": 4.4785755767269675e-07, + "loss": 1.1351, + "step": 5131 + }, + { + "epoch": 0.695722903816173, + "grad_norm": 1.9675255251892572, + "learning_rate": 4.474914797741686e-07, + "loss": 1.1281, + "step": 5132 + }, + { + "epoch": 0.6958584694638379, + "grad_norm": 3.388950732538444, + "learning_rate": 4.471255084225468e-07, + "loss": 1.1241, + "step": 5133 + }, + { + "epoch": 0.6959940351115027, + "grad_norm": 3.6470634890929574, + "learning_rate": 4.467596436884068e-07, + "loss": 1.1462, + "step": 5134 + }, + { + "epoch": 0.6961296007591676, + "grad_norm": 4.133927173023628, + "learning_rate": 4.463938856423023e-07, + "loss": 1.1469, + "step": 5135 + }, + { + "epoch": 0.6962651664068326, + "grad_norm": 2.2584876239688803, + "learning_rate": 4.4602823435476723e-07, + "loss": 1.1555, + "step": 5136 + }, + { + "epoch": 0.6964007320544974, + "grad_norm": 1.7500981908740767, + "learning_rate": 4.4566268989631427e-07, + "loss": 1.152, + "step": 5137 + }, + { + "epoch": 0.6965362977021623, + "grad_norm": 1.6975228287640503, + "learning_rate": 4.452972523374359e-07, + "loss": 1.1456, + "step": 5138 + }, + { + "epoch": 0.6966718633498271, + "grad_norm": 4.724177208373811, + "learning_rate": 4.4493192174860394e-07, + "loss": 1.1737, + "step": 5139 + }, + { + "epoch": 0.696807428997492, + "grad_norm": 1.7592428408108254, + "learning_rate": 4.4456669820026935e-07, + "loss": 1.1521, + "step": 5140 + }, + { + "epoch": 0.696942994645157, + "grad_norm": 1.789669265544865, + "learning_rate": 4.442015817628627e-07, + "loss": 1.1587, + "step": 5141 + }, + { + "epoch": 0.6970785602928218, + "grad_norm": 1.883966413827092, + "learning_rate": 4.438365725067937e-07, + "loss": 1.1645, + "step": 5142 + }, + { + "epoch": 0.6972141259404867, + "grad_norm": 1.8437331249541473, + "learning_rate": 4.434716705024518e-07, + "loss": 1.137, + "step": 5143 + }, + { + "epoch": 0.6973496915881515, + "grad_norm": 2.7554475839828294, + "learning_rate": 4.4310687582020524e-07, + "loss": 1.1154, + "step": 5144 + }, + { + "epoch": 
0.6974852572358164, + "grad_norm": 3.802791128837698, + "learning_rate": 4.4274218853040213e-07, + "loss": 1.1356, + "step": 5145 + }, + { + "epoch": 0.6976208228834814, + "grad_norm": 1.9491324847496427, + "learning_rate": 4.4237760870336883e-07, + "loss": 1.1401, + "step": 5146 + }, + { + "epoch": 0.6977563885311462, + "grad_norm": 4.704320032646723, + "learning_rate": 4.420131364094122e-07, + "loss": 1.1033, + "step": 5147 + }, + { + "epoch": 0.6978919541788111, + "grad_norm": 2.1037442556449446, + "learning_rate": 4.4164877171881765e-07, + "loss": 1.2051, + "step": 5148 + }, + { + "epoch": 0.6980275198264759, + "grad_norm": 2.0909819673753987, + "learning_rate": 4.4128451470185013e-07, + "loss": 1.1786, + "step": 5149 + }, + { + "epoch": 0.6981630854741409, + "grad_norm": 2.5428390470794544, + "learning_rate": 4.409203654287538e-07, + "loss": 1.1441, + "step": 5150 + }, + { + "epoch": 0.6982986511218058, + "grad_norm": 2.5293036741759956, + "learning_rate": 4.4055632396975174e-07, + "loss": 1.1295, + "step": 5151 + }, + { + "epoch": 0.6984342167694706, + "grad_norm": 5.090788448408619, + "learning_rate": 4.4019239039504676e-07, + "loss": 1.1294, + "step": 5152 + }, + { + "epoch": 0.6985697824171355, + "grad_norm": 3.1395890606041337, + "learning_rate": 4.3982856477482034e-07, + "loss": 1.1409, + "step": 5153 + }, + { + "epoch": 0.6987053480648003, + "grad_norm": 1.7430876774909096, + "learning_rate": 4.394648471792335e-07, + "loss": 1.157, + "step": 5154 + }, + { + "epoch": 0.6988409137124653, + "grad_norm": 3.1841999252680857, + "learning_rate": 4.391012376784263e-07, + "loss": 1.1426, + "step": 5155 + }, + { + "epoch": 0.6989764793601302, + "grad_norm": 1.9203870377602885, + "learning_rate": 4.3873773634251796e-07, + "loss": 1.124, + "step": 5156 + }, + { + "epoch": 0.699112045007795, + "grad_norm": 1.6041178208389868, + "learning_rate": 4.3837434324160684e-07, + "loss": 1.1511, + "step": 5157 + }, + { + "epoch": 0.6992476106554599, + "grad_norm": 1.594843495066651, + "learning_rate": 4.380110584457705e-07, + "loss": 1.1211, + "step": 5158 + }, + { + "epoch": 0.6993831763031247, + "grad_norm": 2.165753533064386, + "learning_rate": 4.376478820250653e-07, + "loss": 1.1287, + "step": 5159 + }, + { + "epoch": 0.6995187419507897, + "grad_norm": 2.0453383424139306, + "learning_rate": 4.3728481404952724e-07, + "loss": 1.1388, + "step": 5160 + }, + { + "epoch": 0.6996543075984546, + "grad_norm": 1.579763381987814, + "learning_rate": 4.369218545891713e-07, + "loss": 1.1778, + "step": 5161 + }, + { + "epoch": 0.6997898732461194, + "grad_norm": 1.7875282310073164, + "learning_rate": 4.3655900371399025e-07, + "loss": 1.1412, + "step": 5162 + }, + { + "epoch": 0.6999254388937843, + "grad_norm": 1.8747474076993123, + "learning_rate": 4.361962614939586e-07, + "loss": 1.1597, + "step": 5163 + }, + { + "epoch": 0.7000610045414492, + "grad_norm": 2.1707693173405755, + "learning_rate": 4.358336279990268e-07, + "loss": 1.1415, + "step": 5164 + }, + { + "epoch": 0.7001965701891141, + "grad_norm": 3.037375571943312, + "learning_rate": 4.354711032991273e-07, + "loss": 1.127, + "step": 5165 + }, + { + "epoch": 0.700332135836779, + "grad_norm": 1.6118773585224304, + "learning_rate": 4.3510868746416875e-07, + "loss": 1.1538, + "step": 5166 + }, + { + "epoch": 0.7004677014844438, + "grad_norm": 2.0311764130123544, + "learning_rate": 4.3474638056404146e-07, + "loss": 1.1452, + "step": 5167 + }, + { + "epoch": 0.7006032671321087, + "grad_norm": 2.202582223436084, + "learning_rate": 4.343841826686121e-07, + 
"loss": 1.1407, + "step": 5168 + }, + { + "epoch": 0.7007388327797736, + "grad_norm": 2.04537685652524, + "learning_rate": 4.3402209384772925e-07, + "loss": 1.1607, + "step": 5169 + }, + { + "epoch": 0.7008743984274385, + "grad_norm": 2.312738200987221, + "learning_rate": 4.336601141712172e-07, + "loss": 1.1061, + "step": 5170 + }, + { + "epoch": 0.7010099640751034, + "grad_norm": 2.413624713602256, + "learning_rate": 4.332982437088825e-07, + "loss": 1.0967, + "step": 5171 + }, + { + "epoch": 0.7011455297227682, + "grad_norm": 1.6552907160851191, + "learning_rate": 4.3293648253050786e-07, + "loss": 1.1357, + "step": 5172 + }, + { + "epoch": 0.7012810953704331, + "grad_norm": 1.9037887538129499, + "learning_rate": 4.3257483070585644e-07, + "loss": 1.1682, + "step": 5173 + }, + { + "epoch": 0.701416661018098, + "grad_norm": 10.479633612175608, + "learning_rate": 4.3221328830466996e-07, + "loss": 1.1186, + "step": 5174 + }, + { + "epoch": 0.7015522266657629, + "grad_norm": 2.7655172838832254, + "learning_rate": 4.318518553966689e-07, + "loss": 1.1586, + "step": 5175 + }, + { + "epoch": 0.7016877923134278, + "grad_norm": 1.961434661627398, + "learning_rate": 4.3149053205155295e-07, + "loss": 1.1593, + "step": 5176 + }, + { + "epoch": 0.7018233579610926, + "grad_norm": 1.8448620520112884, + "learning_rate": 4.3112931833900036e-07, + "loss": 1.1158, + "step": 5177 + }, + { + "epoch": 0.7019589236087576, + "grad_norm": 1.9278117245657622, + "learning_rate": 4.307682143286683e-07, + "loss": 1.1492, + "step": 5178 + }, + { + "epoch": 0.7020944892564224, + "grad_norm": 4.405495612140319, + "learning_rate": 4.3040722009019284e-07, + "loss": 1.12, + "step": 5179 + }, + { + "epoch": 0.7022300549040873, + "grad_norm": 3.404582562530176, + "learning_rate": 4.300463356931888e-07, + "loss": 1.1374, + "step": 5180 + }, + { + "epoch": 0.7023656205517522, + "grad_norm": 1.7478584620276654, + "learning_rate": 4.296855612072501e-07, + "loss": 1.1124, + "step": 5181 + }, + { + "epoch": 0.702501186199417, + "grad_norm": 2.310450749028635, + "learning_rate": 4.293248967019495e-07, + "loss": 1.1948, + "step": 5182 + }, + { + "epoch": 0.702636751847082, + "grad_norm": 1.8274476288290955, + "learning_rate": 4.289643422468372e-07, + "loss": 1.1363, + "step": 5183 + }, + { + "epoch": 0.7027723174947468, + "grad_norm": 2.0381344067874085, + "learning_rate": 4.286038979114447e-07, + "loss": 1.1349, + "step": 5184 + }, + { + "epoch": 0.7029078831424117, + "grad_norm": 1.8374487765007133, + "learning_rate": 4.282435637652795e-07, + "loss": 1.1663, + "step": 5185 + }, + { + "epoch": 0.7030434487900766, + "grad_norm": 1.9771609312274687, + "learning_rate": 4.278833398778305e-07, + "loss": 1.1414, + "step": 5186 + }, + { + "epoch": 0.7031790144377414, + "grad_norm": 1.8047273671456923, + "learning_rate": 4.2752322631856275e-07, + "loss": 1.1446, + "step": 5187 + }, + { + "epoch": 0.7033145800854064, + "grad_norm": 2.8946698066287193, + "learning_rate": 4.2716322315692266e-07, + "loss": 1.1404, + "step": 5188 + }, + { + "epoch": 0.7034501457330712, + "grad_norm": 2.0109064939004964, + "learning_rate": 4.2680333046233286e-07, + "loss": 1.1313, + "step": 5189 + }, + { + "epoch": 0.7035857113807361, + "grad_norm": 1.791330995527185, + "learning_rate": 4.2644354830419627e-07, + "loss": 1.1296, + "step": 5190 + }, + { + "epoch": 0.703721277028401, + "grad_norm": 2.053615248158614, + "learning_rate": 4.2608387675189404e-07, + "loss": 1.1243, + "step": 5191 + }, + { + "epoch": 0.7038568426760659, + "grad_norm": 2.9571448424533764, + 
"learning_rate": 4.2572431587478594e-07, + "loss": 1.1278, + "step": 5192 + }, + { + "epoch": 0.7039924083237308, + "grad_norm": 3.4835773739386577, + "learning_rate": 4.253648657422105e-07, + "loss": 1.1621, + "step": 5193 + }, + { + "epoch": 0.7041279739713956, + "grad_norm": 2.1878401438266213, + "learning_rate": 4.2500552642348475e-07, + "loss": 1.1441, + "step": 5194 + }, + { + "epoch": 0.7042635396190605, + "grad_norm": 1.825649687642672, + "learning_rate": 4.2464629798790453e-07, + "loss": 1.139, + "step": 5195 + }, + { + "epoch": 0.7043991052667254, + "grad_norm": 1.682895811475502, + "learning_rate": 4.242871805047442e-07, + "loss": 1.1644, + "step": 5196 + }, + { + "epoch": 0.7045346709143903, + "grad_norm": 1.6127318596522016, + "learning_rate": 4.2392817404325665e-07, + "loss": 1.1522, + "step": 5197 + }, + { + "epoch": 0.7046702365620552, + "grad_norm": 1.6181560177063143, + "learning_rate": 4.2356927867267355e-07, + "loss": 1.1273, + "step": 5198 + }, + { + "epoch": 0.7048058022097201, + "grad_norm": 2.0453660825289495, + "learning_rate": 4.23210494462205e-07, + "loss": 1.1511, + "step": 5199 + }, + { + "epoch": 0.7049413678573849, + "grad_norm": 1.9063054545984568, + "learning_rate": 4.228518214810396e-07, + "loss": 1.1954, + "step": 5200 + }, + { + "epoch": 0.7050769335050499, + "grad_norm": 2.1542832852700493, + "learning_rate": 4.2249325979834484e-07, + "loss": 1.1797, + "step": 5201 + }, + { + "epoch": 0.7052124991527147, + "grad_norm": 1.8178845781456736, + "learning_rate": 4.221348094832666e-07, + "loss": 1.1515, + "step": 5202 + }, + { + "epoch": 0.7053480648003796, + "grad_norm": 3.0432982977695646, + "learning_rate": 4.217764706049283e-07, + "loss": 1.152, + "step": 5203 + }, + { + "epoch": 0.7054836304480445, + "grad_norm": 1.6864467701396513, + "learning_rate": 4.2141824323243416e-07, + "loss": 1.1617, + "step": 5204 + }, + { + "epoch": 0.7056191960957093, + "grad_norm": 2.628620530070862, + "learning_rate": 4.21060127434864e-07, + "loss": 1.1017, + "step": 5205 + }, + { + "epoch": 0.7057547617433743, + "grad_norm": 2.0785198408217243, + "learning_rate": 4.207021232812792e-07, + "loss": 1.1212, + "step": 5206 + }, + { + "epoch": 0.7058903273910391, + "grad_norm": 2.7488473792445665, + "learning_rate": 4.2034423084071637e-07, + "loss": 1.137, + "step": 5207 + }, + { + "epoch": 0.706025893038704, + "grad_norm": 1.7400948640433656, + "learning_rate": 4.199864501821939e-07, + "loss": 1.1122, + "step": 5208 + }, + { + "epoch": 0.7061614586863689, + "grad_norm": 3.8630721119447147, + "learning_rate": 4.196287813747058e-07, + "loss": 1.1361, + "step": 5209 + }, + { + "epoch": 0.7062970243340337, + "grad_norm": 1.8599085707858538, + "learning_rate": 4.1927122448722597e-07, + "loss": 1.1281, + "step": 5210 + }, + { + "epoch": 0.7064325899816987, + "grad_norm": 2.2287539554827207, + "learning_rate": 4.1891377958870657e-07, + "loss": 1.1339, + "step": 5211 + }, + { + "epoch": 0.7065681556293635, + "grad_norm": 1.7394864102807448, + "learning_rate": 4.18556446748078e-07, + "loss": 1.1739, + "step": 5212 + }, + { + "epoch": 0.7067037212770284, + "grad_norm": 2.0285450123165596, + "learning_rate": 4.1819922603424895e-07, + "loss": 1.1008, + "step": 5213 + }, + { + "epoch": 0.7068392869246933, + "grad_norm": 1.7244875321614377, + "learning_rate": 4.1784211751610675e-07, + "loss": 1.173, + "step": 5214 + }, + { + "epoch": 0.7069748525723581, + "grad_norm": 2.054499702214317, + "learning_rate": 4.174851212625169e-07, + "loss": 1.1418, + "step": 5215 + }, + { + "epoch": 
0.7071104182200231, + "grad_norm": 1.8662183537722574, + "learning_rate": 4.171282373423234e-07, + "loss": 1.1712, + "step": 5216 + }, + { + "epoch": 0.7072459838676879, + "grad_norm": 1.6504728528139647, + "learning_rate": 4.167714658243486e-07, + "loss": 1.1705, + "step": 5217 + }, + { + "epoch": 0.7073815495153528, + "grad_norm": 3.2632869546255536, + "learning_rate": 4.1641480677739236e-07, + "loss": 1.172, + "step": 5218 + }, + { + "epoch": 0.7075171151630177, + "grad_norm": 2.7821418164499625, + "learning_rate": 4.160582602702347e-07, + "loss": 1.12, + "step": 5219 + }, + { + "epoch": 0.7076526808106826, + "grad_norm": 1.6786046280464935, + "learning_rate": 4.1570182637163153e-07, + "loss": 1.118, + "step": 5220 + }, + { + "epoch": 0.7077882464583475, + "grad_norm": 2.2671930734789467, + "learning_rate": 4.153455051503196e-07, + "loss": 1.1403, + "step": 5221 + }, + { + "epoch": 0.7079238121060123, + "grad_norm": 2.5213148506631384, + "learning_rate": 4.149892966750114e-07, + "loss": 1.1561, + "step": 5222 + }, + { + "epoch": 0.7080593777536772, + "grad_norm": 2.1619555214455684, + "learning_rate": 4.1463320101440027e-07, + "loss": 1.1809, + "step": 5223 + }, + { + "epoch": 0.7081949434013421, + "grad_norm": 1.7853517756111288, + "learning_rate": 4.1427721823715487e-07, + "loss": 1.1432, + "step": 5224 + }, + { + "epoch": 0.708330509049007, + "grad_norm": 3.4471523555302857, + "learning_rate": 4.1392134841192537e-07, + "loss": 1.17, + "step": 5225 + }, + { + "epoch": 0.7084660746966719, + "grad_norm": 2.058263759061494, + "learning_rate": 4.135655916073368e-07, + "loss": 1.1575, + "step": 5226 + }, + { + "epoch": 0.7086016403443367, + "grad_norm": 1.6066237371349104, + "learning_rate": 4.132099478919957e-07, + "loss": 1.1472, + "step": 5227 + }, + { + "epoch": 0.7087372059920016, + "grad_norm": 3.1282155862720304, + "learning_rate": 4.1285441733448344e-07, + "loss": 1.1371, + "step": 5228 + }, + { + "epoch": 0.7088727716396666, + "grad_norm": 1.6950436278544732, + "learning_rate": 4.124990000033629e-07, + "loss": 1.1453, + "step": 5229 + }, + { + "epoch": 0.7090083372873314, + "grad_norm": 1.9989959149203316, + "learning_rate": 4.1214369596717244e-07, + "loss": 1.1622, + "step": 5230 + }, + { + "epoch": 0.7091439029349963, + "grad_norm": 1.9165712404345954, + "learning_rate": 4.1178850529442996e-07, + "loss": 1.1382, + "step": 5231 + }, + { + "epoch": 0.7092794685826611, + "grad_norm": 2.041022562128498, + "learning_rate": 4.1143342805363123e-07, + "loss": 1.117, + "step": 5232 + }, + { + "epoch": 0.709415034230326, + "grad_norm": 3.2989942383279867, + "learning_rate": 4.1107846431325e-07, + "loss": 1.1443, + "step": 5233 + }, + { + "epoch": 0.709550599877991, + "grad_norm": 2.1099021347994182, + "learning_rate": 4.1072361414173815e-07, + "loss": 1.1458, + "step": 5234 + }, + { + "epoch": 0.7096861655256558, + "grad_norm": 1.9924249137879988, + "learning_rate": 4.10368877607526e-07, + "loss": 1.1323, + "step": 5235 + }, + { + "epoch": 0.7098217311733207, + "grad_norm": 2.201560918918191, + "learning_rate": 4.100142547790214e-07, + "loss": 1.1319, + "step": 5236 + }, + { + "epoch": 0.7099572968209855, + "grad_norm": 1.9882893485172846, + "learning_rate": 4.096597457246108e-07, + "loss": 1.1304, + "step": 5237 + }, + { + "epoch": 0.7100928624686504, + "grad_norm": 2.1642742983357137, + "learning_rate": 4.0930535051265835e-07, + "loss": 1.1392, + "step": 5238 + }, + { + "epoch": 0.7102284281163154, + "grad_norm": 2.843739788240136, + "learning_rate": 4.0895106921150644e-07, + "loss": 
1.1736, + "step": 5239 + }, + { + "epoch": 0.7103639937639802, + "grad_norm": 2.8889561770209466, + "learning_rate": 4.0859690188947525e-07, + "loss": 1.1302, + "step": 5240 + }, + { + "epoch": 0.7104995594116451, + "grad_norm": 2.484285658705714, + "learning_rate": 4.0824284861486346e-07, + "loss": 1.174, + "step": 5241 + }, + { + "epoch": 0.7106351250593099, + "grad_norm": 6.1324264717889285, + "learning_rate": 4.0788890945594714e-07, + "loss": 1.1758, + "step": 5242 + }, + { + "epoch": 0.7107706907069749, + "grad_norm": 2.649147289492657, + "learning_rate": 4.0753508448098085e-07, + "loss": 1.1115, + "step": 5243 + }, + { + "epoch": 0.7109062563546398, + "grad_norm": 2.3020850238668094, + "learning_rate": 4.0718137375819717e-07, + "loss": 1.1373, + "step": 5244 + }, + { + "epoch": 0.7110418220023046, + "grad_norm": 2.19029451205056, + "learning_rate": 4.0682777735580586e-07, + "loss": 1.1445, + "step": 5245 + }, + { + "epoch": 0.7111773876499695, + "grad_norm": 1.8980125382140511, + "learning_rate": 4.064742953419954e-07, + "loss": 1.1575, + "step": 5246 + }, + { + "epoch": 0.7113129532976343, + "grad_norm": 2.4053710548362144, + "learning_rate": 4.061209277849321e-07, + "loss": 1.1081, + "step": 5247 + }, + { + "epoch": 0.7114485189452993, + "grad_norm": 6.029772790589842, + "learning_rate": 4.057676747527601e-07, + "loss": 1.1477, + "step": 5248 + }, + { + "epoch": 0.7115840845929642, + "grad_norm": 1.8821637314965236, + "learning_rate": 4.054145363136013e-07, + "loss": 1.1593, + "step": 5249 + }, + { + "epoch": 0.711719650240629, + "grad_norm": 3.6372424216840633, + "learning_rate": 4.05061512535556e-07, + "loss": 1.1236, + "step": 5250 + }, + { + "epoch": 0.7118552158882939, + "grad_norm": 1.9465706168357433, + "learning_rate": 4.047086034867018e-07, + "loss": 1.1217, + "step": 5251 + }, + { + "epoch": 0.7119907815359587, + "grad_norm": 2.7780603870133245, + "learning_rate": 4.0435580923509436e-07, + "loss": 1.133, + "step": 5252 + }, + { + "epoch": 0.7121263471836237, + "grad_norm": 1.8370125104042823, + "learning_rate": 4.040031298487675e-07, + "loss": 1.1613, + "step": 5253 + }, + { + "epoch": 0.7122619128312886, + "grad_norm": 1.7765981297226, + "learning_rate": 4.036505653957325e-07, + "loss": 1.1504, + "step": 5254 + }, + { + "epoch": 0.7123974784789534, + "grad_norm": 2.345514409903206, + "learning_rate": 4.032981159439787e-07, + "loss": 1.1599, + "step": 5255 + }, + { + "epoch": 0.7125330441266183, + "grad_norm": 1.8287313697483845, + "learning_rate": 4.029457815614731e-07, + "loss": 1.1397, + "step": 5256 + }, + { + "epoch": 0.7126686097742831, + "grad_norm": 2.2519070327608053, + "learning_rate": 4.025935623161607e-07, + "loss": 1.1497, + "step": 5257 + }, + { + "epoch": 0.7128041754219481, + "grad_norm": 3.023479542163979, + "learning_rate": 4.022414582759646e-07, + "loss": 1.1665, + "step": 5258 + }, + { + "epoch": 0.712939741069613, + "grad_norm": 2.1303849086202047, + "learning_rate": 4.01889469508784e-07, + "loss": 1.1462, + "step": 5259 + }, + { + "epoch": 0.7130753067172778, + "grad_norm": 1.6674020304895976, + "learning_rate": 4.0153759608249883e-07, + "loss": 1.1535, + "step": 5260 + }, + { + "epoch": 0.7132108723649427, + "grad_norm": 1.8612706043980523, + "learning_rate": 4.011858380649634e-07, + "loss": 1.0826, + "step": 5261 + }, + { + "epoch": 0.7133464380126076, + "grad_norm": 2.8201166680485144, + "learning_rate": 4.008341955240132e-07, + "loss": 1.1653, + "step": 5262 + }, + { + "epoch": 0.7134820036602725, + "grad_norm": 1.7663707058946942, + 
"learning_rate": 4.0048266852745815e-07, + "loss": 1.1285, + "step": 5263 + }, + { + "epoch": 0.7136175693079374, + "grad_norm": 1.9976763941337268, + "learning_rate": 4.0013125714308883e-07, + "loss": 1.1495, + "step": 5264 + }, + { + "epoch": 0.7137531349556022, + "grad_norm": 3.6688397377081996, + "learning_rate": 3.9977996143867086e-07, + "loss": 1.167, + "step": 5265 + }, + { + "epoch": 0.7138887006032671, + "grad_norm": 1.6810148891451107, + "learning_rate": 3.9942878148195015e-07, + "loss": 1.109, + "step": 5266 + }, + { + "epoch": 0.714024266250932, + "grad_norm": 2.238113219593441, + "learning_rate": 3.9907771734064756e-07, + "loss": 1.1638, + "step": 5267 + }, + { + "epoch": 0.7141598318985969, + "grad_norm": 2.938221137104574, + "learning_rate": 3.987267690824646e-07, + "loss": 1.1747, + "step": 5268 + }, + { + "epoch": 0.7142953975462618, + "grad_norm": 1.7108912601455648, + "learning_rate": 3.983759367750772e-07, + "loss": 1.1187, + "step": 5269 + }, + { + "epoch": 0.7144309631939266, + "grad_norm": 1.864011620461243, + "learning_rate": 3.980252204861423e-07, + "loss": 1.1424, + "step": 5270 + }, + { + "epoch": 0.7145665288415916, + "grad_norm": 2.0505933168538113, + "learning_rate": 3.9767462028329156e-07, + "loss": 1.1543, + "step": 5271 + }, + { + "epoch": 0.7147020944892564, + "grad_norm": 1.7359161185147376, + "learning_rate": 3.973241362341357e-07, + "loss": 1.1053, + "step": 5272 + }, + { + "epoch": 0.7148376601369213, + "grad_norm": 2.071003284516309, + "learning_rate": 3.9697376840626304e-07, + "loss": 1.1418, + "step": 5273 + }, + { + "epoch": 0.7149732257845862, + "grad_norm": 2.0693892090105486, + "learning_rate": 3.9662351686723914e-07, + "loss": 1.1673, + "step": 5274 + }, + { + "epoch": 0.715108791432251, + "grad_norm": 1.8911002918137365, + "learning_rate": 3.962733816846073e-07, + "loss": 1.1061, + "step": 5275 + }, + { + "epoch": 0.715244357079916, + "grad_norm": 1.7272626272199845, + "learning_rate": 3.9592336292588825e-07, + "loss": 1.1138, + "step": 5276 + }, + { + "epoch": 0.7153799227275809, + "grad_norm": 2.008322164984965, + "learning_rate": 3.9557346065858034e-07, + "loss": 1.1468, + "step": 5277 + }, + { + "epoch": 0.7155154883752457, + "grad_norm": 1.872346231654201, + "learning_rate": 3.952236749501594e-07, + "loss": 1.156, + "step": 5278 + }, + { + "epoch": 0.7156510540229106, + "grad_norm": 1.952452053123552, + "learning_rate": 3.948740058680791e-07, + "loss": 1.1583, + "step": 5279 + }, + { + "epoch": 0.7157866196705754, + "grad_norm": 1.6508579178388125, + "learning_rate": 3.9452445347977e-07, + "loss": 1.1503, + "step": 5280 + }, + { + "epoch": 0.7159221853182404, + "grad_norm": 1.6781961346787155, + "learning_rate": 3.941750178526413e-07, + "loss": 1.1507, + "step": 5281 + }, + { + "epoch": 0.7160577509659053, + "grad_norm": 1.779367362345257, + "learning_rate": 3.938256990540775e-07, + "loss": 1.1654, + "step": 5282 + }, + { + "epoch": 0.7161933166135701, + "grad_norm": 1.623527756579532, + "learning_rate": 3.934764971514434e-07, + "loss": 1.1228, + "step": 5283 + }, + { + "epoch": 0.716328882261235, + "grad_norm": 2.6117296475443452, + "learning_rate": 3.931274122120786e-07, + "loss": 1.1641, + "step": 5284 + }, + { + "epoch": 0.7164644479088998, + "grad_norm": 1.7080397578279096, + "learning_rate": 3.9277844430330277e-07, + "loss": 1.1793, + "step": 5285 + }, + { + "epoch": 0.7166000135565648, + "grad_norm": 1.4638335563521396, + "learning_rate": 3.9242959349241036e-07, + "loss": 1.1548, + "step": 5286 + }, + { + "epoch": 
0.7167355792042297, + "grad_norm": 1.6215180790343762, + "learning_rate": 3.9208085984667507e-07, + "loss": 1.1299, + "step": 5287 + }, + { + "epoch": 0.7168711448518945, + "grad_norm": 1.7453899053562913, + "learning_rate": 3.917322434333472e-07, + "loss": 1.1555, + "step": 5288 + }, + { + "epoch": 0.7170067104995594, + "grad_norm": 3.3187733613031676, + "learning_rate": 3.913837443196549e-07, + "loss": 1.1314, + "step": 5289 + }, + { + "epoch": 0.7171422761472243, + "grad_norm": 2.670361312110984, + "learning_rate": 3.9103536257280343e-07, + "loss": 1.1593, + "step": 5290 + }, + { + "epoch": 0.7172778417948892, + "grad_norm": 1.792752525511277, + "learning_rate": 3.9068709825997534e-07, + "loss": 1.1594, + "step": 5291 + }, + { + "epoch": 0.7174134074425541, + "grad_norm": 2.0732363163185505, + "learning_rate": 3.903389514483308e-07, + "loss": 1.1205, + "step": 5292 + }, + { + "epoch": 0.7175489730902189, + "grad_norm": 2.674210979908145, + "learning_rate": 3.899909222050071e-07, + "loss": 1.1111, + "step": 5293 + }, + { + "epoch": 0.7176845387378838, + "grad_norm": 2.145016204627095, + "learning_rate": 3.896430105971188e-07, + "loss": 1.1503, + "step": 5294 + }, + { + "epoch": 0.7178201043855487, + "grad_norm": 1.907550338951806, + "learning_rate": 3.8929521669175813e-07, + "loss": 1.171, + "step": 5295 + }, + { + "epoch": 0.7179556700332136, + "grad_norm": 2.8169353611766943, + "learning_rate": 3.889475405559943e-07, + "loss": 1.1464, + "step": 5296 + }, + { + "epoch": 0.7180912356808785, + "grad_norm": 3.3302693198890214, + "learning_rate": 3.88599982256874e-07, + "loss": 1.1904, + "step": 5297 + }, + { + "epoch": 0.7182268013285433, + "grad_norm": 2.2069928983969973, + "learning_rate": 3.8825254186142097e-07, + "loss": 1.1633, + "step": 5298 + }, + { + "epoch": 0.7183623669762083, + "grad_norm": 2.3126792940512315, + "learning_rate": 3.8790521943663633e-07, + "loss": 1.1333, + "step": 5299 + }, + { + "epoch": 0.7184979326238731, + "grad_norm": 1.8195557792469503, + "learning_rate": 3.875580150494986e-07, + "loss": 1.1417, + "step": 5300 + }, + { + "epoch": 0.718633498271538, + "grad_norm": 2.7976008134176675, + "learning_rate": 3.8721092876696373e-07, + "loss": 1.1439, + "step": 5301 + }, + { + "epoch": 0.7187690639192029, + "grad_norm": 2.590864465789253, + "learning_rate": 3.868639606559635e-07, + "loss": 1.1582, + "step": 5302 + }, + { + "epoch": 0.7189046295668677, + "grad_norm": 2.5736112038059327, + "learning_rate": 3.8651711078340923e-07, + "loss": 1.1487, + "step": 5303 + }, + { + "epoch": 0.7190401952145327, + "grad_norm": 2.2738373603528124, + "learning_rate": 3.86170379216187e-07, + "loss": 1.126, + "step": 5304 + }, + { + "epoch": 0.7191757608621975, + "grad_norm": 1.608228147689681, + "learning_rate": 3.8582376602116254e-07, + "loss": 1.1401, + "step": 5305 + }, + { + "epoch": 0.7193113265098624, + "grad_norm": 1.7304435205703075, + "learning_rate": 3.854772712651765e-07, + "loss": 1.1524, + "step": 5306 + }, + { + "epoch": 0.7194468921575273, + "grad_norm": 2.208588864839864, + "learning_rate": 3.8513089501504783e-07, + "loss": 1.129, + "step": 5307 + }, + { + "epoch": 0.7195824578051921, + "grad_norm": 1.6749926745161892, + "learning_rate": 3.847846373375726e-07, + "loss": 1.1205, + "step": 5308 + }, + { + "epoch": 0.7197180234528571, + "grad_norm": 1.9861456744191444, + "learning_rate": 3.844384982995239e-07, + "loss": 1.1847, + "step": 5309 + }, + { + "epoch": 0.7198535891005219, + "grad_norm": 4.0094940910316845, + "learning_rate": 3.8409247796765185e-07, + "loss": 
1.162, + "step": 5310 + }, + { + "epoch": 0.7199891547481868, + "grad_norm": 2.100694306543575, + "learning_rate": 3.837465764086837e-07, + "loss": 1.0941, + "step": 5311 + }, + { + "epoch": 0.7201247203958517, + "grad_norm": 1.6599039099708073, + "learning_rate": 3.83400793689324e-07, + "loss": 1.0943, + "step": 5312 + }, + { + "epoch": 0.7202602860435166, + "grad_norm": 2.7205302953059327, + "learning_rate": 3.83055129876254e-07, + "loss": 1.1796, + "step": 5313 + }, + { + "epoch": 0.7203958516911815, + "grad_norm": 5.350678635197967, + "learning_rate": 3.8270958503613225e-07, + "loss": 1.1347, + "step": 5314 + }, + { + "epoch": 0.7205314173388463, + "grad_norm": 2.039272964212453, + "learning_rate": 3.8236415923559463e-07, + "loss": 1.1584, + "step": 5315 + }, + { + "epoch": 0.7206669829865112, + "grad_norm": 1.5750135457657144, + "learning_rate": 3.820188525412538e-07, + "loss": 1.1314, + "step": 5316 + }, + { + "epoch": 0.7208025486341761, + "grad_norm": 2.454604644189658, + "learning_rate": 3.8167366501969855e-07, + "loss": 1.1277, + "step": 5317 + }, + { + "epoch": 0.720938114281841, + "grad_norm": 1.7206753420438825, + "learning_rate": 3.8132859673749685e-07, + "loss": 1.1452, + "step": 5318 + }, + { + "epoch": 0.7210736799295059, + "grad_norm": 4.449130928942647, + "learning_rate": 3.809836477611912e-07, + "loss": 1.103, + "step": 5319 + }, + { + "epoch": 0.7212092455771707, + "grad_norm": 1.8688947955804098, + "learning_rate": 3.806388181573035e-07, + "loss": 1.1218, + "step": 5320 + }, + { + "epoch": 0.7213448112248356, + "grad_norm": 1.9192413914895832, + "learning_rate": 3.8029410799233006e-07, + "loss": 1.159, + "step": 5321 + }, + { + "epoch": 0.7214803768725006, + "grad_norm": 1.9227736663679587, + "learning_rate": 3.7994951733274695e-07, + "loss": 1.1424, + "step": 5322 + }, + { + "epoch": 0.7216159425201654, + "grad_norm": 2.214623980860575, + "learning_rate": 3.7960504624500436e-07, + "loss": 1.1609, + "step": 5323 + }, + { + "epoch": 0.7217515081678303, + "grad_norm": 2.109433471357157, + "learning_rate": 3.792606947955321e-07, + "loss": 1.1594, + "step": 5324 + }, + { + "epoch": 0.7218870738154951, + "grad_norm": 2.2837425783943734, + "learning_rate": 3.7891646305073456e-07, + "loss": 1.1445, + "step": 5325 + }, + { + "epoch": 0.72202263946316, + "grad_norm": 1.7622967320431857, + "learning_rate": 3.78572351076995e-07, + "loss": 1.1138, + "step": 5326 + }, + { + "epoch": 0.722158205110825, + "grad_norm": 1.8564037301160436, + "learning_rate": 3.7822835894067185e-07, + "loss": 1.1112, + "step": 5327 + }, + { + "epoch": 0.7222937707584898, + "grad_norm": 2.5185909437987335, + "learning_rate": 3.7788448670810225e-07, + "loss": 1.1369, + "step": 5328 + }, + { + "epoch": 0.7224293364061547, + "grad_norm": 1.9695211669829336, + "learning_rate": 3.775407344455984e-07, + "loss": 1.1656, + "step": 5329 + }, + { + "epoch": 0.7225649020538195, + "grad_norm": 2.1473583221830337, + "learning_rate": 3.7719710221945055e-07, + "loss": 1.1628, + "step": 5330 + }, + { + "epoch": 0.7227004677014844, + "grad_norm": 1.8119787444319233, + "learning_rate": 3.768535900959253e-07, + "loss": 1.1123, + "step": 5331 + }, + { + "epoch": 0.7228360333491494, + "grad_norm": 2.086152585612141, + "learning_rate": 3.765101981412665e-07, + "loss": 1.1695, + "step": 5332 + }, + { + "epoch": 0.7229715989968142, + "grad_norm": 2.253660008579615, + "learning_rate": 3.7616692642169443e-07, + "loss": 1.1247, + "step": 5333 + }, + { + "epoch": 0.7231071646444791, + "grad_norm": 1.6612715257559538, + 
"learning_rate": 3.7582377500340636e-07, + "loss": 1.1486, + "step": 5334 + }, + { + "epoch": 0.7232427302921439, + "grad_norm": 1.7129433592726262, + "learning_rate": 3.7548074395257634e-07, + "loss": 1.1469, + "step": 5335 + }, + { + "epoch": 0.7233782959398088, + "grad_norm": 1.8430296128648023, + "learning_rate": 3.751378333353552e-07, + "loss": 1.1227, + "step": 5336 + }, + { + "epoch": 0.7235138615874738, + "grad_norm": 1.9985011049657158, + "learning_rate": 3.747950432178706e-07, + "loss": 1.1466, + "step": 5337 + }, + { + "epoch": 0.7236494272351386, + "grad_norm": 2.464876539825348, + "learning_rate": 3.744523736662267e-07, + "loss": 1.147, + "step": 5338 + }, + { + "epoch": 0.7237849928828035, + "grad_norm": 1.6742631800787124, + "learning_rate": 3.7410982474650486e-07, + "loss": 1.1681, + "step": 5339 + }, + { + "epoch": 0.7239205585304683, + "grad_norm": 2.5600495377985006, + "learning_rate": 3.7376739652476287e-07, + "loss": 1.1586, + "step": 5340 + }, + { + "epoch": 0.7240561241781333, + "grad_norm": 1.9012560252153161, + "learning_rate": 3.734250890670352e-07, + "loss": 1.1351, + "step": 5341 + }, + { + "epoch": 0.7241916898257982, + "grad_norm": 1.5859201122800877, + "learning_rate": 3.730829024393333e-07, + "loss": 1.1286, + "step": 5342 + }, + { + "epoch": 0.724327255473463, + "grad_norm": 1.8621149734790405, + "learning_rate": 3.727408367076453e-07, + "loss": 1.1477, + "step": 5343 + }, + { + "epoch": 0.7244628211211279, + "grad_norm": 1.652812193712211, + "learning_rate": 3.723988919379354e-07, + "loss": 1.1195, + "step": 5344 + }, + { + "epoch": 0.7245983867687927, + "grad_norm": 2.3782361527632108, + "learning_rate": 3.7205706819614527e-07, + "loss": 1.127, + "step": 5345 + }, + { + "epoch": 0.7247339524164577, + "grad_norm": 1.7027675501873154, + "learning_rate": 3.717153655481927e-07, + "loss": 1.1612, + "step": 5346 + }, + { + "epoch": 0.7248695180641226, + "grad_norm": 1.9091840960574018, + "learning_rate": 3.7137378405997267e-07, + "loss": 1.1376, + "step": 5347 + }, + { + "epoch": 0.7250050837117874, + "grad_norm": 2.4899190165944867, + "learning_rate": 3.710323237973563e-07, + "loss": 1.1207, + "step": 5348 + }, + { + "epoch": 0.7251406493594523, + "grad_norm": 2.1093721579680373, + "learning_rate": 3.7069098482619145e-07, + "loss": 1.144, + "step": 5349 + }, + { + "epoch": 0.7252762150071171, + "grad_norm": 1.750132422610806, + "learning_rate": 3.703497672123026e-07, + "loss": 1.1768, + "step": 5350 + }, + { + "epoch": 0.7254117806547821, + "grad_norm": 1.6506591773755297, + "learning_rate": 3.7000867102149114e-07, + "loss": 1.13, + "step": 5351 + }, + { + "epoch": 0.725547346302447, + "grad_norm": 1.8734980854171974, + "learning_rate": 3.6966769631953466e-07, + "loss": 1.0983, + "step": 5352 + }, + { + "epoch": 0.7256829119501118, + "grad_norm": 1.9950882011578774, + "learning_rate": 3.693268431721873e-07, + "loss": 1.1879, + "step": 5353 + }, + { + "epoch": 0.7258184775977767, + "grad_norm": 2.0293666218030273, + "learning_rate": 3.6898611164518e-07, + "loss": 1.1804, + "step": 5354 + }, + { + "epoch": 0.7259540432454417, + "grad_norm": 2.432819000960922, + "learning_rate": 3.6864550180422014e-07, + "loss": 1.1503, + "step": 5355 + }, + { + "epoch": 0.7260896088931065, + "grad_norm": 1.93792116069325, + "learning_rate": 3.683050137149918e-07, + "loss": 1.1586, + "step": 5356 + }, + { + "epoch": 0.7262251745407714, + "grad_norm": 1.8676496612937747, + "learning_rate": 3.6796464744315545e-07, + "loss": 1.1297, + "step": 5357 + }, + { + "epoch": 
0.7263607401884362, + "grad_norm": 2.037395055172971, + "learning_rate": 3.6762440305434726e-07, + "loss": 1.1689, + "step": 5358 + }, + { + "epoch": 0.7264963058361011, + "grad_norm": 1.778950487664888, + "learning_rate": 3.6728428061418195e-07, + "loss": 1.1283, + "step": 5359 + }, + { + "epoch": 0.7266318714837661, + "grad_norm": 1.6650283112180377, + "learning_rate": 3.66944280188248e-07, + "loss": 1.1591, + "step": 5360 + }, + { + "epoch": 0.7267674371314309, + "grad_norm": 1.7689110619097475, + "learning_rate": 3.6660440184211326e-07, + "loss": 1.1367, + "step": 5361 + }, + { + "epoch": 0.7269030027790958, + "grad_norm": 1.9292681295579766, + "learning_rate": 3.662646456413193e-07, + "loss": 1.1717, + "step": 5362 + }, + { + "epoch": 0.7270385684267606, + "grad_norm": 2.054736305746803, + "learning_rate": 3.6592501165138666e-07, + "loss": 1.1507, + "step": 5363 + }, + { + "epoch": 0.7271741340744255, + "grad_norm": 1.518355683918837, + "learning_rate": 3.6558549993780985e-07, + "loss": 1.1587, + "step": 5364 + }, + { + "epoch": 0.7273096997220905, + "grad_norm": 2.263270272462833, + "learning_rate": 3.6524611056606226e-07, + "loss": 1.1519, + "step": 5365 + }, + { + "epoch": 0.7274452653697553, + "grad_norm": 1.9345394608338935, + "learning_rate": 3.6490684360159106e-07, + "loss": 1.1188, + "step": 5366 + }, + { + "epoch": 0.7275808310174202, + "grad_norm": 1.7407804606248916, + "learning_rate": 3.6456769910982264e-07, + "loss": 1.1238, + "step": 5367 + }, + { + "epoch": 0.727716396665085, + "grad_norm": 3.033074649658338, + "learning_rate": 3.6422867715615703e-07, + "loss": 1.1732, + "step": 5368 + }, + { + "epoch": 0.72785196231275, + "grad_norm": 5.017426765344989, + "learning_rate": 3.638897778059732e-07, + "loss": 1.1772, + "step": 5369 + }, + { + "epoch": 0.7279875279604149, + "grad_norm": 1.7133324437703272, + "learning_rate": 3.6355100112462425e-07, + "loss": 1.1606, + "step": 5370 + }, + { + "epoch": 0.7281230936080797, + "grad_norm": 2.0999729982462143, + "learning_rate": 3.632123471774409e-07, + "loss": 1.1694, + "step": 5371 + }, + { + "epoch": 0.7282586592557446, + "grad_norm": 2.070538277557972, + "learning_rate": 3.628738160297299e-07, + "loss": 1.1025, + "step": 5372 + }, + { + "epoch": 0.7283942249034094, + "grad_norm": 1.7146010895711397, + "learning_rate": 3.625354077467743e-07, + "loss": 1.1567, + "step": 5373 + }, + { + "epoch": 0.7285297905510744, + "grad_norm": 2.3952086763780973, + "learning_rate": 3.6219712239383336e-07, + "loss": 1.1165, + "step": 5374 + }, + { + "epoch": 0.7286653561987393, + "grad_norm": 3.609750228506736, + "learning_rate": 3.6185896003614303e-07, + "loss": 1.1468, + "step": 5375 + }, + { + "epoch": 0.7288009218464041, + "grad_norm": 2.1235414648469395, + "learning_rate": 3.6152092073891504e-07, + "loss": 1.0879, + "step": 5376 + }, + { + "epoch": 0.728936487494069, + "grad_norm": 2.0580531874157986, + "learning_rate": 3.6118300456733764e-07, + "loss": 1.2118, + "step": 5377 + }, + { + "epoch": 0.7290720531417338, + "grad_norm": 1.9373306535318915, + "learning_rate": 3.6084521158657555e-07, + "loss": 1.1068, + "step": 5378 + }, + { + "epoch": 0.7292076187893988, + "grad_norm": 2.140000997653004, + "learning_rate": 3.605075418617687e-07, + "loss": 1.1643, + "step": 5379 + }, + { + "epoch": 0.7293431844370637, + "grad_norm": 2.080584867081869, + "learning_rate": 3.6016999545803504e-07, + "loss": 1.1606, + "step": 5380 + }, + { + "epoch": 0.7294787500847285, + "grad_norm": 1.6873727290629965, + "learning_rate": 3.5983257244046674e-07, + 
"loss": 1.1497, + "step": 5381 + }, + { + "epoch": 0.7296143157323934, + "grad_norm": 2.075751282830892, + "learning_rate": 3.594952728741343e-07, + "loss": 1.1626, + "step": 5382 + }, + { + "epoch": 0.7297498813800583, + "grad_norm": 1.892784121925564, + "learning_rate": 3.591580968240819e-07, + "loss": 1.1711, + "step": 5383 + }, + { + "epoch": 0.7298854470277232, + "grad_norm": 2.720019266320375, + "learning_rate": 3.5882104435533276e-07, + "loss": 1.1634, + "step": 5384 + }, + { + "epoch": 0.7300210126753881, + "grad_norm": 2.6891878029791707, + "learning_rate": 3.584841155328837e-07, + "loss": 1.1316, + "step": 5385 + }, + { + "epoch": 0.7301565783230529, + "grad_norm": 1.6259719159832462, + "learning_rate": 3.581473104217092e-07, + "loss": 1.1519, + "step": 5386 + }, + { + "epoch": 0.7302921439707178, + "grad_norm": 2.2298182110066485, + "learning_rate": 3.578106290867593e-07, + "loss": 1.1387, + "step": 5387 + }, + { + "epoch": 0.7304277096183827, + "grad_norm": 1.6106162104600041, + "learning_rate": 3.5747407159296063e-07, + "loss": 1.134, + "step": 5388 + }, + { + "epoch": 0.7305632752660476, + "grad_norm": 2.222786959325563, + "learning_rate": 3.571376380052152e-07, + "loss": 1.1288, + "step": 5389 + }, + { + "epoch": 0.7306988409137125, + "grad_norm": 1.7790325078242202, + "learning_rate": 3.5680132838840205e-07, + "loss": 1.1119, + "step": 5390 + }, + { + "epoch": 0.7308344065613773, + "grad_norm": 2.5830050997209852, + "learning_rate": 3.564651428073755e-07, + "loss": 1.1627, + "step": 5391 + }, + { + "epoch": 0.7309699722090423, + "grad_norm": 1.834883126111348, + "learning_rate": 3.561290813269665e-07, + "loss": 1.1187, + "step": 5392 + }, + { + "epoch": 0.7311055378567071, + "grad_norm": 1.6274554798908194, + "learning_rate": 3.5579314401198166e-07, + "loss": 1.1688, + "step": 5393 + }, + { + "epoch": 0.731241103504372, + "grad_norm": 1.7421142818297692, + "learning_rate": 3.5545733092720396e-07, + "loss": 1.141, + "step": 5394 + }, + { + "epoch": 0.7313766691520369, + "grad_norm": 2.0095436722076245, + "learning_rate": 3.551216421373924e-07, + "loss": 1.162, + "step": 5395 + }, + { + "epoch": 0.7315122347997017, + "grad_norm": 1.6213954161198272, + "learning_rate": 3.5478607770728164e-07, + "loss": 1.1585, + "step": 5396 + }, + { + "epoch": 0.7316478004473667, + "grad_norm": 3.3358799186536623, + "learning_rate": 3.544506377015829e-07, + "loss": 1.1573, + "step": 5397 + }, + { + "epoch": 0.7317833660950315, + "grad_norm": 2.235978896856941, + "learning_rate": 3.5411532218498296e-07, + "loss": 1.1049, + "step": 5398 + }, + { + "epoch": 0.7319189317426964, + "grad_norm": 2.084896237618205, + "learning_rate": 3.537801312221448e-07, + "loss": 1.12, + "step": 5399 + }, + { + "epoch": 0.7320544973903613, + "grad_norm": 1.9189597437966794, + "learning_rate": 3.5344506487770774e-07, + "loss": 1.1263, + "step": 5400 + }, + { + "epoch": 0.7321900630380261, + "grad_norm": 1.7814605749741128, + "learning_rate": 3.5311012321628577e-07, + "loss": 1.1482, + "step": 5401 + }, + { + "epoch": 0.7323256286856911, + "grad_norm": 2.1119182051289327, + "learning_rate": 3.527753063024708e-07, + "loss": 1.1446, + "step": 5402 + }, + { + "epoch": 0.7324611943333559, + "grad_norm": 1.7316877734549891, + "learning_rate": 3.524406142008285e-07, + "loss": 1.1396, + "step": 5403 + }, + { + "epoch": 0.7325967599810208, + "grad_norm": 2.6768051572546083, + "learning_rate": 3.5210604697590297e-07, + "loss": 1.1322, + "step": 5404 + }, + { + "epoch": 0.7327323256286857, + "grad_norm": 1.958711450384282, + 
"learning_rate": 3.5177160469221176e-07, + "loss": 1.1854, + "step": 5405 + }, + { + "epoch": 0.7328678912763505, + "grad_norm": 1.7755717091960297, + "learning_rate": 3.514372874142497e-07, + "loss": 1.1949, + "step": 5406 + }, + { + "epoch": 0.7330034569240155, + "grad_norm": 1.8876112415870487, + "learning_rate": 3.511030952064874e-07, + "loss": 1.1658, + "step": 5407 + }, + { + "epoch": 0.7331390225716803, + "grad_norm": 1.7904487510383003, + "learning_rate": 3.507690281333712e-07, + "loss": 1.142, + "step": 5408 + }, + { + "epoch": 0.7332745882193452, + "grad_norm": 1.8701018433502576, + "learning_rate": 3.504350862593231e-07, + "loss": 1.1502, + "step": 5409 + }, + { + "epoch": 0.7334101538670101, + "grad_norm": 2.069626307392217, + "learning_rate": 3.501012696487412e-07, + "loss": 1.1357, + "step": 5410 + }, + { + "epoch": 0.733545719514675, + "grad_norm": 2.6315426007570997, + "learning_rate": 3.497675783659995e-07, + "loss": 1.1476, + "step": 5411 + }, + { + "epoch": 0.7336812851623399, + "grad_norm": 2.3419923590871803, + "learning_rate": 3.4943401247544766e-07, + "loss": 1.1536, + "step": 5412 + }, + { + "epoch": 0.7338168508100047, + "grad_norm": 1.8850345309364855, + "learning_rate": 3.491005720414113e-07, + "loss": 1.125, + "step": 5413 + }, + { + "epoch": 0.7339524164576696, + "grad_norm": 1.8185694292427357, + "learning_rate": 3.487672571281918e-07, + "loss": 1.1306, + "step": 5414 + }, + { + "epoch": 0.7340879821053345, + "grad_norm": 2.2584183287965884, + "learning_rate": 3.4843406780006644e-07, + "loss": 1.1219, + "step": 5415 + }, + { + "epoch": 0.7342235477529994, + "grad_norm": 1.9536436659505731, + "learning_rate": 3.481010041212874e-07, + "loss": 1.1478, + "step": 5416 + }, + { + "epoch": 0.7343591134006643, + "grad_norm": 2.2651848342112313, + "learning_rate": 3.477680661560846e-07, + "loss": 1.1731, + "step": 5417 + }, + { + "epoch": 0.7344946790483291, + "grad_norm": 1.9517461844220028, + "learning_rate": 3.4743525396866114e-07, + "loss": 1.2074, + "step": 5418 + }, + { + "epoch": 0.734630244695994, + "grad_norm": 1.6385161263805357, + "learning_rate": 3.471025676231986e-07, + "loss": 1.1346, + "step": 5419 + }, + { + "epoch": 0.734765810343659, + "grad_norm": 3.076908699623478, + "learning_rate": 3.467700071838515e-07, + "loss": 1.2041, + "step": 5420 + }, + { + "epoch": 0.7349013759913238, + "grad_norm": 2.4987828571520274, + "learning_rate": 3.4643757271475293e-07, + "loss": 1.1336, + "step": 5421 + }, + { + "epoch": 0.7350369416389887, + "grad_norm": 1.9255655304617245, + "learning_rate": 3.4610526428000897e-07, + "loss": 1.15, + "step": 5422 + }, + { + "epoch": 0.7351725072866535, + "grad_norm": 1.8623236010553337, + "learning_rate": 3.457730819437038e-07, + "loss": 1.1598, + "step": 5423 + }, + { + "epoch": 0.7353080729343184, + "grad_norm": 1.8911433428203699, + "learning_rate": 3.454410257698951e-07, + "loss": 1.1188, + "step": 5424 + }, + { + "epoch": 0.7354436385819834, + "grad_norm": 1.7503902167383878, + "learning_rate": 3.451090958226184e-07, + "loss": 1.1464, + "step": 5425 + }, + { + "epoch": 0.7355792042296482, + "grad_norm": 3.271161135602159, + "learning_rate": 3.447772921658825e-07, + "loss": 1.1343, + "step": 5426 + }, + { + "epoch": 0.7357147698773131, + "grad_norm": 1.9256330219834268, + "learning_rate": 3.444456148636744e-07, + "loss": 1.1524, + "step": 5427 + }, + { + "epoch": 0.7358503355249779, + "grad_norm": 1.761180157586624, + "learning_rate": 3.441140639799546e-07, + "loss": 1.1429, + "step": 5428 + }, + { + "epoch": 
0.7359859011726428, + "grad_norm": 1.9277242101776624, + "learning_rate": 3.4378263957866026e-07, + "loss": 1.115, + "step": 5429 + }, + { + "epoch": 0.7361214668203078, + "grad_norm": 2.2976076048507523, + "learning_rate": 3.4345134172370407e-07, + "loss": 1.107, + "step": 5430 + }, + { + "epoch": 0.7362570324679726, + "grad_norm": 1.8039620907914906, + "learning_rate": 3.431201704789741e-07, + "loss": 1.1639, + "step": 5431 + }, + { + "epoch": 0.7363925981156375, + "grad_norm": 1.612221710950573, + "learning_rate": 3.427891259083342e-07, + "loss": 1.1276, + "step": 5432 + }, + { + "epoch": 0.7365281637633023, + "grad_norm": 2.0896803468878327, + "learning_rate": 3.4245820807562365e-07, + "loss": 1.1308, + "step": 5433 + }, + { + "epoch": 0.7366637294109672, + "grad_norm": 1.7896636851306404, + "learning_rate": 3.4212741704465733e-07, + "loss": 1.1449, + "step": 5434 + }, + { + "epoch": 0.7367992950586322, + "grad_norm": 2.736997277165478, + "learning_rate": 3.4179675287922573e-07, + "loss": 1.1417, + "step": 5435 + }, + { + "epoch": 0.736934860706297, + "grad_norm": 2.0861333955121815, + "learning_rate": 3.4146621564309476e-07, + "loss": 1.1384, + "step": 5436 + }, + { + "epoch": 0.7370704263539619, + "grad_norm": 2.0636849270435627, + "learning_rate": 3.41135805400006e-07, + "loss": 1.1667, + "step": 5437 + }, + { + "epoch": 0.7372059920016268, + "grad_norm": 1.7101999449967444, + "learning_rate": 3.408055222136763e-07, + "loss": 1.1447, + "step": 5438 + }, + { + "epoch": 0.7373415576492917, + "grad_norm": 4.461374311969462, + "learning_rate": 3.4047536614779837e-07, + "loss": 1.1605, + "step": 5439 + }, + { + "epoch": 0.7374771232969566, + "grad_norm": 3.3047205075271386, + "learning_rate": 3.4014533726604046e-07, + "loss": 1.1434, + "step": 5440 + }, + { + "epoch": 0.7376126889446214, + "grad_norm": 1.7579152894936032, + "learning_rate": 3.398154356320454e-07, + "loss": 1.1061, + "step": 5441 + }, + { + "epoch": 0.7377482545922863, + "grad_norm": 2.4504489733761163, + "learning_rate": 3.394856613094322e-07, + "loss": 1.0889, + "step": 5442 + }, + { + "epoch": 0.7378838202399512, + "grad_norm": 2.1498865581087965, + "learning_rate": 3.3915601436179564e-07, + "loss": 1.1097, + "step": 5443 + }, + { + "epoch": 0.7380193858876161, + "grad_norm": 2.054556784708828, + "learning_rate": 3.388264948527052e-07, + "loss": 1.1399, + "step": 5444 + }, + { + "epoch": 0.738154951535281, + "grad_norm": 2.975585089872114, + "learning_rate": 3.384971028457063e-07, + "loss": 1.1855, + "step": 5445 + }, + { + "epoch": 0.7382905171829458, + "grad_norm": 1.8999695895991486, + "learning_rate": 3.381678384043195e-07, + "loss": 1.1422, + "step": 5446 + }, + { + "epoch": 0.7384260828306107, + "grad_norm": 1.6815831956730867, + "learning_rate": 3.378387015920409e-07, + "loss": 1.1315, + "step": 5447 + }, + { + "epoch": 0.7385616484782757, + "grad_norm": 7.176067252174773, + "learning_rate": 3.3750969247234184e-07, + "loss": 1.1618, + "step": 5448 + }, + { + "epoch": 0.7386972141259405, + "grad_norm": 1.9002834540155513, + "learning_rate": 3.371808111086694e-07, + "loss": 1.1335, + "step": 5449 + }, + { + "epoch": 0.7388327797736054, + "grad_norm": 2.7858305143280875, + "learning_rate": 3.3685205756444534e-07, + "loss": 1.1031, + "step": 5450 + }, + { + "epoch": 0.7389683454212702, + "grad_norm": 2.1320012936185555, + "learning_rate": 3.365234319030675e-07, + "loss": 1.1492, + "step": 5451 + }, + { + "epoch": 0.7391039110689351, + "grad_norm": 2.044134290920972, + "learning_rate": 3.361949341879087e-07, + 
"loss": 1.134, + "step": 5452 + }, + { + "epoch": 0.7392394767166001, + "grad_norm": 1.8295017484916394, + "learning_rate": 3.35866564482317e-07, + "loss": 1.1537, + "step": 5453 + }, + { + "epoch": 0.7393750423642649, + "grad_norm": 1.8443563690356874, + "learning_rate": 3.3553832284961603e-07, + "loss": 1.1379, + "step": 5454 + }, + { + "epoch": 0.7395106080119298, + "grad_norm": 1.831867407599177, + "learning_rate": 3.352102093531045e-07, + "loss": 1.1376, + "step": 5455 + }, + { + "epoch": 0.7396461736595946, + "grad_norm": 1.7635729417181567, + "learning_rate": 3.348822240560569e-07, + "loss": 1.1525, + "step": 5456 + }, + { + "epoch": 0.7397817393072595, + "grad_norm": 2.4046620586695915, + "learning_rate": 3.345543670217217e-07, + "loss": 1.1383, + "step": 5457 + }, + { + "epoch": 0.7399173049549245, + "grad_norm": 3.7204228406185473, + "learning_rate": 3.3422663831332477e-07, + "loss": 1.1309, + "step": 5458 + }, + { + "epoch": 0.7400528706025893, + "grad_norm": 1.8288438667029114, + "learning_rate": 3.338990379940646e-07, + "loss": 1.1685, + "step": 5459 + }, + { + "epoch": 0.7401884362502542, + "grad_norm": 3.6401061789403824, + "learning_rate": 3.335715661271178e-07, + "loss": 1.1298, + "step": 5460 + }, + { + "epoch": 0.740324001897919, + "grad_norm": 1.582584288422982, + "learning_rate": 3.3324422277563326e-07, + "loss": 1.1484, + "step": 5461 + }, + { + "epoch": 0.740459567545584, + "grad_norm": 4.912822254817951, + "learning_rate": 3.32917008002738e-07, + "loss": 1.1354, + "step": 5462 + }, + { + "epoch": 0.7405951331932489, + "grad_norm": 1.9083353300477053, + "learning_rate": 3.3258992187153144e-07, + "loss": 1.1379, + "step": 5463 + }, + { + "epoch": 0.7407306988409137, + "grad_norm": 2.3173021713742283, + "learning_rate": 3.322629644450909e-07, + "loss": 1.149, + "step": 5464 + }, + { + "epoch": 0.7408662644885786, + "grad_norm": 2.2836782311367303, + "learning_rate": 3.319361357864663e-07, + "loss": 1.1368, + "step": 5465 + }, + { + "epoch": 0.7410018301362434, + "grad_norm": 2.0433998661823543, + "learning_rate": 3.316094359586852e-07, + "loss": 1.1208, + "step": 5466 + }, + { + "epoch": 0.7411373957839084, + "grad_norm": 1.8742122936203531, + "learning_rate": 3.3128286502474803e-07, + "loss": 1.1575, + "step": 5467 + }, + { + "epoch": 0.7412729614315733, + "grad_norm": 1.7300838212072398, + "learning_rate": 3.3095642304763183e-07, + "loss": 1.16, + "step": 5468 + }, + { + "epoch": 0.7414085270792381, + "grad_norm": 2.021535155531787, + "learning_rate": 3.306301100902883e-07, + "loss": 1.0881, + "step": 5469 + }, + { + "epoch": 0.741544092726903, + "grad_norm": 2.156506649715323, + "learning_rate": 3.303039262156443e-07, + "loss": 1.0987, + "step": 5470 + }, + { + "epoch": 0.7416796583745678, + "grad_norm": 1.8593040877165063, + "learning_rate": 3.2997787148660195e-07, + "loss": 1.1274, + "step": 5471 + }, + { + "epoch": 0.7418152240222328, + "grad_norm": 1.638781951664012, + "learning_rate": 3.296519459660383e-07, + "loss": 1.1951, + "step": 5472 + }, + { + "epoch": 0.7419507896698977, + "grad_norm": 1.8701234198296643, + "learning_rate": 3.293261497168054e-07, + "loss": 1.136, + "step": 5473 + }, + { + "epoch": 0.7420863553175625, + "grad_norm": 1.9502844182273475, + "learning_rate": 3.2900048280173055e-07, + "loss": 1.1226, + "step": 5474 + }, + { + "epoch": 0.7422219209652274, + "grad_norm": 2.142625515389453, + "learning_rate": 3.2867494528361605e-07, + "loss": 1.1666, + "step": 5475 + }, + { + "epoch": 0.7423574866128922, + "grad_norm": 1.7992055695693465, + 
"learning_rate": 3.2834953722523915e-07, + "loss": 1.1259, + "step": 5476 + }, + { + "epoch": 0.7424930522605572, + "grad_norm": 1.6134623308914238, + "learning_rate": 3.2802425868935277e-07, + "loss": 1.1152, + "step": 5477 + }, + { + "epoch": 0.7426286179082221, + "grad_norm": 2.3336183342614323, + "learning_rate": 3.276991097386831e-07, + "loss": 1.1483, + "step": 5478 + }, + { + "epoch": 0.7427641835558869, + "grad_norm": 1.9300634292142895, + "learning_rate": 3.27374090435934e-07, + "loss": 1.1678, + "step": 5479 + }, + { + "epoch": 0.7428997492035518, + "grad_norm": 2.008215967127246, + "learning_rate": 3.270492008437815e-07, + "loss": 1.1013, + "step": 5480 + }, + { + "epoch": 0.7430353148512167, + "grad_norm": 2.18006610071517, + "learning_rate": 3.267244410248794e-07, + "loss": 1.106, + "step": 5481 + }, + { + "epoch": 0.7431708804988816, + "grad_norm": 1.6705310843150072, + "learning_rate": 3.2639981104185355e-07, + "loss": 1.1378, + "step": 5482 + }, + { + "epoch": 0.7433064461465465, + "grad_norm": 1.7300242632007032, + "learning_rate": 3.260753109573078e-07, + "loss": 1.1489, + "step": 5483 + }, + { + "epoch": 0.7434420117942113, + "grad_norm": 2.044240835048312, + "learning_rate": 3.2575094083381837e-07, + "loss": 1.1847, + "step": 5484 + }, + { + "epoch": 0.7435775774418762, + "grad_norm": 1.9238699067514933, + "learning_rate": 3.2542670073393776e-07, + "loss": 1.1102, + "step": 5485 + }, + { + "epoch": 0.7437131430895411, + "grad_norm": 2.0415572856654585, + "learning_rate": 3.251025907201932e-07, + "loss": 1.1255, + "step": 5486 + }, + { + "epoch": 0.743848708737206, + "grad_norm": 3.762236146251629, + "learning_rate": 3.247786108550866e-07, + "loss": 1.1614, + "step": 5487 + }, + { + "epoch": 0.7439842743848709, + "grad_norm": 1.7508274173754725, + "learning_rate": 3.244547612010952e-07, + "loss": 1.1573, + "step": 5488 + }, + { + "epoch": 0.7441198400325357, + "grad_norm": 1.761139944088288, + "learning_rate": 3.241310418206705e-07, + "loss": 1.1699, + "step": 5489 + }, + { + "epoch": 0.7442554056802007, + "grad_norm": 1.7601002414746216, + "learning_rate": 3.238074527762394e-07, + "loss": 1.1276, + "step": 5490 + }, + { + "epoch": 0.7443909713278655, + "grad_norm": 1.7857960427451454, + "learning_rate": 3.2348399413020365e-07, + "loss": 1.1392, + "step": 5491 + }, + { + "epoch": 0.7445265369755304, + "grad_norm": 2.0328555202329293, + "learning_rate": 3.231606659449394e-07, + "loss": 1.1766, + "step": 5492 + }, + { + "epoch": 0.7446621026231953, + "grad_norm": 1.8194561251479262, + "learning_rate": 3.228374682827982e-07, + "loss": 1.1323, + "step": 5493 + }, + { + "epoch": 0.7447976682708601, + "grad_norm": 1.944718316889268, + "learning_rate": 3.2251440120610596e-07, + "loss": 1.1547, + "step": 5494 + }, + { + "epoch": 0.7449332339185251, + "grad_norm": 1.636031045263389, + "learning_rate": 3.2219146477716376e-07, + "loss": 1.1351, + "step": 5495 + }, + { + "epoch": 0.7450687995661899, + "grad_norm": 1.7664741515571294, + "learning_rate": 3.2186865905824724e-07, + "loss": 1.1244, + "step": 5496 + }, + { + "epoch": 0.7452043652138548, + "grad_norm": 1.9285606777818831, + "learning_rate": 3.215459841116073e-07, + "loss": 1.1213, + "step": 5497 + }, + { + "epoch": 0.7453399308615197, + "grad_norm": 2.9138214540010314, + "learning_rate": 3.212234399994682e-07, + "loss": 1.1095, + "step": 5498 + }, + { + "epoch": 0.7454754965091845, + "grad_norm": 1.7022911264776888, + "learning_rate": 3.209010267840315e-07, + "loss": 1.1727, + "step": 5499 + }, + { + "epoch": 
0.7456110621568495, + "grad_norm": 2.266954107785398, + "learning_rate": 3.205787445274707e-07, + "loss": 1.1844, + "step": 5500 + }, + { + "epoch": 0.7457466278045143, + "grad_norm": 1.8952436806369592, + "learning_rate": 3.2025659329193654e-07, + "loss": 1.1649, + "step": 5501 + }, + { + "epoch": 0.7458821934521792, + "grad_norm": 2.422740978547242, + "learning_rate": 3.1993457313955217e-07, + "loss": 1.1501, + "step": 5502 + }, + { + "epoch": 0.7460177590998441, + "grad_norm": 1.5490930529281228, + "learning_rate": 3.19612684132418e-07, + "loss": 1.1212, + "step": 5503 + }, + { + "epoch": 0.746153324747509, + "grad_norm": 2.7354651365857037, + "learning_rate": 3.1929092633260667e-07, + "loss": 1.0944, + "step": 5504 + }, + { + "epoch": 0.7462888903951739, + "grad_norm": 1.8986538716571524, + "learning_rate": 3.1896929980216704e-07, + "loss": 1.1334, + "step": 5505 + }, + { + "epoch": 0.7464244560428387, + "grad_norm": 1.673176040942352, + "learning_rate": 3.186478046031221e-07, + "loss": 1.1349, + "step": 5506 + }, + { + "epoch": 0.7465600216905036, + "grad_norm": 2.3100231291618307, + "learning_rate": 3.1832644079746984e-07, + "loss": 1.1707, + "step": 5507 + }, + { + "epoch": 0.7466955873381685, + "grad_norm": 1.6841539704781578, + "learning_rate": 3.180052084471827e-07, + "loss": 1.1583, + "step": 5508 + }, + { + "epoch": 0.7468311529858334, + "grad_norm": 2.307361905326161, + "learning_rate": 3.176841076142077e-07, + "loss": 1.1838, + "step": 5509 + }, + { + "epoch": 0.7469667186334983, + "grad_norm": 1.8281481133438073, + "learning_rate": 3.173631383604667e-07, + "loss": 1.1401, + "step": 5510 + }, + { + "epoch": 0.7471022842811631, + "grad_norm": 1.8775907871475679, + "learning_rate": 3.170423007478561e-07, + "loss": 1.1125, + "step": 5511 + }, + { + "epoch": 0.747237849928828, + "grad_norm": 1.6158038971541073, + "learning_rate": 3.167215948382471e-07, + "loss": 1.1245, + "step": 5512 + }, + { + "epoch": 0.747373415576493, + "grad_norm": 1.6834648653327666, + "learning_rate": 3.164010206934845e-07, + "loss": 1.1471, + "step": 5513 + }, + { + "epoch": 0.7475089812241578, + "grad_norm": 1.7768036220020051, + "learning_rate": 3.160805783753897e-07, + "loss": 1.1366, + "step": 5514 + }, + { + "epoch": 0.7476445468718227, + "grad_norm": 1.9510268223861083, + "learning_rate": 3.1576026794575615e-07, + "loss": 1.1565, + "step": 5515 + }, + { + "epoch": 0.7477801125194876, + "grad_norm": 1.6860846370884983, + "learning_rate": 3.154400894663546e-07, + "loss": 1.1833, + "step": 5516 + }, + { + "epoch": 0.7479156781671524, + "grad_norm": 1.9532906198408433, + "learning_rate": 3.1512004299892747e-07, + "loss": 1.1472, + "step": 5517 + }, + { + "epoch": 0.7480512438148174, + "grad_norm": 2.0499415645887296, + "learning_rate": 3.1480012860519453e-07, + "loss": 1.1418, + "step": 5518 + }, + { + "epoch": 0.7481868094624822, + "grad_norm": 6.8464167811306025, + "learning_rate": 3.1448034634684764e-07, + "loss": 1.1292, + "step": 5519 + }, + { + "epoch": 0.7483223751101471, + "grad_norm": 2.164867688055845, + "learning_rate": 3.141606962855553e-07, + "loss": 1.1608, + "step": 5520 + }, + { + "epoch": 0.748457940757812, + "grad_norm": 1.582153330974147, + "learning_rate": 3.1384117848295843e-07, + "loss": 1.127, + "step": 5521 + }, + { + "epoch": 0.7485935064054768, + "grad_norm": 2.204326317457279, + "learning_rate": 3.135217930006747e-07, + "loss": 1.169, + "step": 5522 + }, + { + "epoch": 0.7487290720531418, + "grad_norm": 1.9912834914836102, + "learning_rate": 3.1320253990029387e-07, + "loss": 
1.191, + "step": 5523 + }, + { + "epoch": 0.7488646377008066, + "grad_norm": 3.244523825044724, + "learning_rate": 3.128834192433826e-07, + "loss": 1.115, + "step": 5524 + }, + { + "epoch": 0.7490002033484715, + "grad_norm": 3.79603199554512, + "learning_rate": 3.125644310914798e-07, + "loss": 1.1338, + "step": 5525 + }, + { + "epoch": 0.7491357689961364, + "grad_norm": 1.7003643311718113, + "learning_rate": 3.122455755061002e-07, + "loss": 1.1499, + "step": 5526 + }, + { + "epoch": 0.7492713346438012, + "grad_norm": 1.817066533731218, + "learning_rate": 3.1192685254873254e-07, + "loss": 1.1174, + "step": 5527 + }, + { + "epoch": 0.7494069002914662, + "grad_norm": 1.9266375903827406, + "learning_rate": 3.1160826228084004e-07, + "loss": 1.1208, + "step": 5528 + }, + { + "epoch": 0.749542465939131, + "grad_norm": 1.9277715027237508, + "learning_rate": 3.1128980476386035e-07, + "loss": 1.1767, + "step": 5529 + }, + { + "epoch": 0.7496780315867959, + "grad_norm": 1.639241073890671, + "learning_rate": 3.109714800592055e-07, + "loss": 1.1216, + "step": 5530 + }, + { + "epoch": 0.7498135972344608, + "grad_norm": 1.9431508229208472, + "learning_rate": 3.106532882282618e-07, + "loss": 1.1346, + "step": 5531 + }, + { + "epoch": 0.7499491628821257, + "grad_norm": 2.1545622579346175, + "learning_rate": 3.103352293323901e-07, + "loss": 1.1135, + "step": 5532 + }, + { + "epoch": 0.7500847285297906, + "grad_norm": 1.6374587649950647, + "learning_rate": 3.1001730343292556e-07, + "loss": 1.146, + "step": 5533 + }, + { + "epoch": 0.7502202941774554, + "grad_norm": 2.0937904279207524, + "learning_rate": 3.096995105911776e-07, + "loss": 1.1728, + "step": 5534 + }, + { + "epoch": 0.7503558598251203, + "grad_norm": 1.6121973615952434, + "learning_rate": 3.093818508684302e-07, + "loss": 1.1341, + "step": 5535 + }, + { + "epoch": 0.7504914254727852, + "grad_norm": 1.8562994806649038, + "learning_rate": 3.090643243259414e-07, + "loss": 1.1634, + "step": 5536 + }, + { + "epoch": 0.7506269911204501, + "grad_norm": 2.2416981848735293, + "learning_rate": 3.0874693102494374e-07, + "loss": 1.1684, + "step": 5537 + }, + { + "epoch": 0.750762556768115, + "grad_norm": 1.8965770362883674, + "learning_rate": 3.084296710266441e-07, + "loss": 1.1138, + "step": 5538 + }, + { + "epoch": 0.7508981224157798, + "grad_norm": 5.480291025826685, + "learning_rate": 3.081125443922237e-07, + "loss": 1.1706, + "step": 5539 + }, + { + "epoch": 0.7510336880634447, + "grad_norm": 2.289531840894614, + "learning_rate": 3.077955511828374e-07, + "loss": 1.1376, + "step": 5540 + }, + { + "epoch": 0.7511692537111097, + "grad_norm": 2.4268636322917545, + "learning_rate": 3.074786914596151e-07, + "loss": 1.1639, + "step": 5541 + }, + { + "epoch": 0.7513048193587745, + "grad_norm": 1.9198333927750928, + "learning_rate": 3.071619652836608e-07, + "loss": 1.129, + "step": 5542 + }, + { + "epoch": 0.7514403850064394, + "grad_norm": 2.4458715793734265, + "learning_rate": 3.068453727160525e-07, + "loss": 1.161, + "step": 5543 + }, + { + "epoch": 0.7515759506541042, + "grad_norm": 1.6785969004783297, + "learning_rate": 3.065289138178426e-07, + "loss": 1.1135, + "step": 5544 + }, + { + "epoch": 0.7517115163017691, + "grad_norm": 1.7672169034035248, + "learning_rate": 3.062125886500578e-07, + "loss": 1.1384, + "step": 5545 + }, + { + "epoch": 0.7518470819494341, + "grad_norm": 1.7443702325700774, + "learning_rate": 3.0589639727369886e-07, + "loss": 1.1544, + "step": 5546 + }, + { + "epoch": 0.7519826475970989, + "grad_norm": 1.7759208766568688, + 
"learning_rate": 3.0558033974974076e-07, + "loss": 1.1581, + "step": 5547 + }, + { + "epoch": 0.7521182132447638, + "grad_norm": 1.5127408338989243, + "learning_rate": 3.052644161391328e-07, + "loss": 1.1197, + "step": 5548 + }, + { + "epoch": 0.7522537788924286, + "grad_norm": 1.7811662418314598, + "learning_rate": 3.0494862650279816e-07, + "loss": 1.1477, + "step": 5549 + }, + { + "epoch": 0.7523893445400935, + "grad_norm": 1.7094596085786002, + "learning_rate": 3.046329709016345e-07, + "loss": 1.1347, + "step": 5550 + }, + { + "epoch": 0.7525249101877585, + "grad_norm": 1.7512406016346531, + "learning_rate": 3.043174493965136e-07, + "loss": 1.1488, + "step": 5551 + }, + { + "epoch": 0.7526604758354233, + "grad_norm": 2.7485604642051404, + "learning_rate": 3.040020620482812e-07, + "loss": 1.1239, + "step": 5552 + }, + { + "epoch": 0.7527960414830882, + "grad_norm": 2.1086527313673478, + "learning_rate": 3.0368680891775755e-07, + "loss": 1.153, + "step": 5553 + }, + { + "epoch": 0.752931607130753, + "grad_norm": 1.6323522752569082, + "learning_rate": 3.033716900657357e-07, + "loss": 1.1531, + "step": 5554 + }, + { + "epoch": 0.753067172778418, + "grad_norm": 1.7967309552069828, + "learning_rate": 3.0305670555298533e-07, + "loss": 1.1502, + "step": 5555 + }, + { + "epoch": 0.7532027384260829, + "grad_norm": 2.2039013445509132, + "learning_rate": 3.027418554402473e-07, + "loss": 1.1188, + "step": 5556 + }, + { + "epoch": 0.7533383040737477, + "grad_norm": 2.7173705725291897, + "learning_rate": 3.024271397882393e-07, + "loss": 1.1418, + "step": 5557 + }, + { + "epoch": 0.7534738697214126, + "grad_norm": 1.8596261655594797, + "learning_rate": 3.021125586576504e-07, + "loss": 1.1889, + "step": 5558 + }, + { + "epoch": 0.7536094353690774, + "grad_norm": 1.9093508188425337, + "learning_rate": 3.017981121091464e-07, + "loss": 1.1353, + "step": 5559 + }, + { + "epoch": 0.7537450010167424, + "grad_norm": 2.009170350924535, + "learning_rate": 3.014838002033645e-07, + "loss": 1.1716, + "step": 5560 + }, + { + "epoch": 0.7538805666644073, + "grad_norm": 3.5739133375460077, + "learning_rate": 3.0116962300091876e-07, + "loss": 1.1021, + "step": 5561 + }, + { + "epoch": 0.7540161323120721, + "grad_norm": 2.04828705259333, + "learning_rate": 3.0085558056239426e-07, + "loss": 1.125, + "step": 5562 + }, + { + "epoch": 0.754151697959737, + "grad_norm": 2.4534743959950887, + "learning_rate": 3.0054167294835306e-07, + "loss": 1.0929, + "step": 5563 + }, + { + "epoch": 0.7542872636074018, + "grad_norm": 1.9608743815735499, + "learning_rate": 3.002279002193283e-07, + "loss": 1.1458, + "step": 5564 + }, + { + "epoch": 0.7544228292550668, + "grad_norm": 2.0183783480766198, + "learning_rate": 2.9991426243583005e-07, + "loss": 1.085, + "step": 5565 + }, + { + "epoch": 0.7545583949027317, + "grad_norm": 1.8912695080666613, + "learning_rate": 2.9960075965833974e-07, + "loss": 1.1828, + "step": 5566 + }, + { + "epoch": 0.7546939605503965, + "grad_norm": 2.565716038596604, + "learning_rate": 2.9928739194731444e-07, + "loss": 1.1305, + "step": 5567 + }, + { + "epoch": 0.7548295261980614, + "grad_norm": 1.8398019436282507, + "learning_rate": 2.9897415936318436e-07, + "loss": 1.1606, + "step": 5568 + }, + { + "epoch": 0.7549650918457262, + "grad_norm": 2.823228191411793, + "learning_rate": 2.986610619663542e-07, + "loss": 1.0955, + "step": 5569 + }, + { + "epoch": 0.7551006574933912, + "grad_norm": 1.914325704249519, + "learning_rate": 2.983480998172022e-07, + "loss": 1.1387, + "step": 5570 + }, + { + "epoch": 
0.7552362231410561, + "grad_norm": 1.960577765730128, + "learning_rate": 2.980352729760807e-07, + "loss": 1.0976, + "step": 5571 + }, + { + "epoch": 0.7553717887887209, + "grad_norm": 1.791177548685952, + "learning_rate": 2.9772258150331565e-07, + "loss": 1.1339, + "step": 5572 + }, + { + "epoch": 0.7555073544363858, + "grad_norm": 1.776098109786755, + "learning_rate": 2.974100254592075e-07, + "loss": 1.1694, + "step": 5573 + }, + { + "epoch": 0.7556429200840507, + "grad_norm": 1.9152560510205148, + "learning_rate": 2.970976049040299e-07, + "loss": 1.1458, + "step": 5574 + }, + { + "epoch": 0.7557784857317156, + "grad_norm": 2.3875564059091303, + "learning_rate": 2.967853198980309e-07, + "loss": 1.1307, + "step": 5575 + }, + { + "epoch": 0.7559140513793805, + "grad_norm": 1.9715493171921834, + "learning_rate": 2.964731705014324e-07, + "loss": 1.1769, + "step": 5576 + }, + { + "epoch": 0.7560496170270453, + "grad_norm": 1.7636521095593571, + "learning_rate": 2.9616115677442897e-07, + "loss": 1.1471, + "step": 5577 + }, + { + "epoch": 0.7561851826747102, + "grad_norm": 2.1517177694337346, + "learning_rate": 2.9584927877719145e-07, + "loss": 1.1018, + "step": 5578 + }, + { + "epoch": 0.7563207483223751, + "grad_norm": 1.9527476526579626, + "learning_rate": 2.9553753656986155e-07, + "loss": 1.1515, + "step": 5579 + }, + { + "epoch": 0.75645631397004, + "grad_norm": 2.8244145286264986, + "learning_rate": 2.952259302125578e-07, + "loss": 1.1452, + "step": 5580 + }, + { + "epoch": 0.7565918796177049, + "grad_norm": 1.9408153747058234, + "learning_rate": 2.9491445976536977e-07, + "loss": 1.1089, + "step": 5581 + }, + { + "epoch": 0.7567274452653697, + "grad_norm": 1.6674573625124867, + "learning_rate": 2.9460312528836274e-07, + "loss": 1.1019, + "step": 5582 + }, + { + "epoch": 0.7568630109130347, + "grad_norm": 2.7710325897552717, + "learning_rate": 2.942919268415748e-07, + "loss": 1.0908, + "step": 5583 + }, + { + "epoch": 0.7569985765606995, + "grad_norm": 2.1118221195732008, + "learning_rate": 2.9398086448501837e-07, + "loss": 1.1275, + "step": 5584 + }, + { + "epoch": 0.7571341422083644, + "grad_norm": 2.208169741739155, + "learning_rate": 2.9366993827867913e-07, + "loss": 1.1473, + "step": 5585 + }, + { + "epoch": 0.7572697078560293, + "grad_norm": 1.7693547580432514, + "learning_rate": 2.9335914828251694e-07, + "loss": 1.1312, + "step": 5586 + }, + { + "epoch": 0.7574052735036941, + "grad_norm": 5.288420402272099, + "learning_rate": 2.9304849455646505e-07, + "loss": 1.168, + "step": 5587 + }, + { + "epoch": 0.7575408391513591, + "grad_norm": 1.8867236912284175, + "learning_rate": 2.9273797716043067e-07, + "loss": 1.1257, + "step": 5588 + }, + { + "epoch": 0.7576764047990239, + "grad_norm": 2.0868667354643424, + "learning_rate": 2.9242759615429467e-07, + "loss": 1.1294, + "step": 5589 + }, + { + "epoch": 0.7578119704466888, + "grad_norm": 1.775593344988937, + "learning_rate": 2.9211735159791153e-07, + "loss": 1.1301, + "step": 5590 + }, + { + "epoch": 0.7579475360943537, + "grad_norm": 1.664504497558939, + "learning_rate": 2.918072435511093e-07, + "loss": 1.1718, + "step": 5591 + }, + { + "epoch": 0.7580831017420185, + "grad_norm": 2.1942362480468116, + "learning_rate": 2.914972720736901e-07, + "loss": 1.1502, + "step": 5592 + }, + { + "epoch": 0.7582186673896835, + "grad_norm": 2.328296732224499, + "learning_rate": 2.9118743722542937e-07, + "loss": 1.1386, + "step": 5593 + }, + { + "epoch": 0.7583542330373484, + "grad_norm": 1.8553049397457972, + "learning_rate": 2.908777390660765e-07, + 
"loss": 1.1821, + "step": 5594 + }, + { + "epoch": 0.7584897986850132, + "grad_norm": 3.022428135473794, + "learning_rate": 2.9056817765535404e-07, + "loss": 1.1374, + "step": 5595 + }, + { + "epoch": 0.7586253643326781, + "grad_norm": 2.5228635926940104, + "learning_rate": 2.9025875305295886e-07, + "loss": 1.2061, + "step": 5596 + }, + { + "epoch": 0.758760929980343, + "grad_norm": 1.596269112454787, + "learning_rate": 2.8994946531856035e-07, + "loss": 1.1609, + "step": 5597 + }, + { + "epoch": 0.7588964956280079, + "grad_norm": 1.6816949639911898, + "learning_rate": 2.8964031451180316e-07, + "loss": 1.1608, + "step": 5598 + }, + { + "epoch": 0.7590320612756728, + "grad_norm": 1.709701892216582, + "learning_rate": 2.893313006923035e-07, + "loss": 1.1242, + "step": 5599 + }, + { + "epoch": 0.7591676269233376, + "grad_norm": 1.7645694093435702, + "learning_rate": 2.8902242391965335e-07, + "loss": 1.1498, + "step": 5600 + }, + { + "epoch": 0.7593031925710025, + "grad_norm": 1.9828638615777867, + "learning_rate": 2.8871368425341634e-07, + "loss": 1.1844, + "step": 5601 + }, + { + "epoch": 0.7594387582186674, + "grad_norm": 2.0288590098755352, + "learning_rate": 2.8840508175313095e-07, + "loss": 1.1932, + "step": 5602 + }, + { + "epoch": 0.7595743238663323, + "grad_norm": 1.7120482866938, + "learning_rate": 2.880966164783084e-07, + "loss": 1.1297, + "step": 5603 + }, + { + "epoch": 0.7597098895139972, + "grad_norm": 1.6925915755160714, + "learning_rate": 2.87788288488434e-07, + "loss": 1.1185, + "step": 5604 + }, + { + "epoch": 0.759845455161662, + "grad_norm": 1.827360193486837, + "learning_rate": 2.8748009784296625e-07, + "loss": 1.1404, + "step": 5605 + }, + { + "epoch": 0.759981020809327, + "grad_norm": 2.2222601253993877, + "learning_rate": 2.871720446013374e-07, + "loss": 1.179, + "step": 5606 + }, + { + "epoch": 0.7601165864569918, + "grad_norm": 1.7559551674642848, + "learning_rate": 2.8686412882295287e-07, + "loss": 1.1142, + "step": 5607 + }, + { + "epoch": 0.7602521521046567, + "grad_norm": 2.245901359189321, + "learning_rate": 2.865563505671921e-07, + "loss": 1.1591, + "step": 5608 + }, + { + "epoch": 0.7603877177523216, + "grad_norm": 2.3649724883499585, + "learning_rate": 2.8624870989340757e-07, + "loss": 1.1263, + "step": 5609 + }, + { + "epoch": 0.7605232833999864, + "grad_norm": 1.982412918848455, + "learning_rate": 2.8594120686092515e-07, + "loss": 1.1606, + "step": 5610 + }, + { + "epoch": 0.7606588490476514, + "grad_norm": 2.1363247452498246, + "learning_rate": 2.8563384152904503e-07, + "loss": 1.1616, + "step": 5611 + }, + { + "epoch": 0.7607944146953162, + "grad_norm": 2.665269332643173, + "learning_rate": 2.8532661395703905e-07, + "loss": 1.1482, + "step": 5612 + }, + { + "epoch": 0.7609299803429811, + "grad_norm": 2.0557021675306792, + "learning_rate": 2.8501952420415486e-07, + "loss": 1.176, + "step": 5613 + }, + { + "epoch": 0.761065545990646, + "grad_norm": 1.9697427704109451, + "learning_rate": 2.847125723296111e-07, + "loss": 1.1149, + "step": 5614 + }, + { + "epoch": 0.7612011116383108, + "grad_norm": 3.8008299243671226, + "learning_rate": 2.8440575839260227e-07, + "loss": 1.1265, + "step": 5615 + }, + { + "epoch": 0.7613366772859758, + "grad_norm": 1.9308487383522361, + "learning_rate": 2.8409908245229374e-07, + "loss": 1.1401, + "step": 5616 + }, + { + "epoch": 0.7614722429336406, + "grad_norm": 1.859006237900147, + "learning_rate": 2.8379254456782685e-07, + "loss": 1.1459, + "step": 5617 + }, + { + "epoch": 0.7616078085813055, + "grad_norm": 1.628877584804959, 
+ "learning_rate": 2.8348614479831367e-07, + "loss": 1.1059, + "step": 5618 + }, + { + "epoch": 0.7617433742289704, + "grad_norm": 2.2256815421687324, + "learning_rate": 2.8317988320284223e-07, + "loss": 1.1434, + "step": 5619 + }, + { + "epoch": 0.7618789398766352, + "grad_norm": 5.303443112373101, + "learning_rate": 2.828737598404716e-07, + "loss": 1.1377, + "step": 5620 + }, + { + "epoch": 0.7620145055243002, + "grad_norm": 1.6743338862800614, + "learning_rate": 2.8256777477023617e-07, + "loss": 1.1274, + "step": 5621 + }, + { + "epoch": 0.762150071171965, + "grad_norm": 2.7573987280900307, + "learning_rate": 2.822619280511418e-07, + "loss": 1.1688, + "step": 5622 + }, + { + "epoch": 0.7622856368196299, + "grad_norm": 2.10793651059393, + "learning_rate": 2.8195621974216975e-07, + "loss": 1.1528, + "step": 5623 + }, + { + "epoch": 0.7624212024672948, + "grad_norm": 3.1697501849603107, + "learning_rate": 2.816506499022725e-07, + "loss": 1.1453, + "step": 5624 + }, + { + "epoch": 0.7625567681149596, + "grad_norm": 3.149582901461255, + "learning_rate": 2.8134521859037707e-07, + "loss": 1.1512, + "step": 5625 + }, + { + "epoch": 0.7626923337626246, + "grad_norm": 1.8889406836428067, + "learning_rate": 2.810399258653836e-07, + "loss": 1.1767, + "step": 5626 + }, + { + "epoch": 0.7628278994102894, + "grad_norm": 1.6114898052382134, + "learning_rate": 2.807347717861653e-07, + "loss": 1.1472, + "step": 5627 + }, + { + "epoch": 0.7629634650579543, + "grad_norm": 2.121053033629547, + "learning_rate": 2.8042975641156864e-07, + "loss": 1.146, + "step": 5628 + }, + { + "epoch": 0.7630990307056192, + "grad_norm": 1.5657043035802585, + "learning_rate": 2.8012487980041354e-07, + "loss": 1.1556, + "step": 5629 + }, + { + "epoch": 0.7632345963532841, + "grad_norm": 2.806641729956573, + "learning_rate": 2.798201420114931e-07, + "loss": 1.1429, + "step": 5630 + }, + { + "epoch": 0.763370162000949, + "grad_norm": 1.614985308466738, + "learning_rate": 2.795155431035735e-07, + "loss": 1.084, + "step": 5631 + }, + { + "epoch": 0.7635057276486138, + "grad_norm": 2.2244165811434993, + "learning_rate": 2.7921108313539423e-07, + "loss": 1.1943, + "step": 5632 + }, + { + "epoch": 0.7636412932962787, + "grad_norm": 1.9313145373701066, + "learning_rate": 2.78906762165668e-07, + "loss": 1.1619, + "step": 5633 + }, + { + "epoch": 0.7637768589439436, + "grad_norm": 1.666066397284968, + "learning_rate": 2.786025802530807e-07, + "loss": 1.1648, + "step": 5634 + }, + { + "epoch": 0.7639124245916085, + "grad_norm": 1.6915537028232845, + "learning_rate": 2.782985374562915e-07, + "loss": 1.1541, + "step": 5635 + }, + { + "epoch": 0.7640479902392734, + "grad_norm": 1.795037736445389, + "learning_rate": 2.779946338339325e-07, + "loss": 1.1286, + "step": 5636 + }, + { + "epoch": 0.7641835558869382, + "grad_norm": 1.8626631730954684, + "learning_rate": 2.776908694446095e-07, + "loss": 1.1274, + "step": 5637 + }, + { + "epoch": 0.7643191215346031, + "grad_norm": 1.620735365568842, + "learning_rate": 2.773872443469005e-07, + "loss": 1.1212, + "step": 5638 + }, + { + "epoch": 0.7644546871822681, + "grad_norm": 1.890108817005347, + "learning_rate": 2.770837585993575e-07, + "loss": 1.0972, + "step": 5639 + }, + { + "epoch": 0.7645902528299329, + "grad_norm": 1.5998704379933468, + "learning_rate": 2.767804122605053e-07, + "loss": 1.1172, + "step": 5640 + }, + { + "epoch": 0.7647258184775978, + "grad_norm": 1.8018402651810743, + "learning_rate": 2.764772053888419e-07, + "loss": 1.1459, + "step": 5641 + }, + { + "epoch": 
0.7648613841252626, + "grad_norm": 4.197767212903407, + "learning_rate": 2.7617413804283815e-07, + "loss": 1.1923, + "step": 5642 + }, + { + "epoch": 0.7649969497729275, + "grad_norm": 1.7395053601202708, + "learning_rate": 2.7587121028093853e-07, + "loss": 1.1345, + "step": 5643 + }, + { + "epoch": 0.7651325154205925, + "grad_norm": 2.266814917660025, + "learning_rate": 2.7556842216155996e-07, + "loss": 1.153, + "step": 5644 + }, + { + "epoch": 0.7652680810682573, + "grad_norm": 1.6184941538780244, + "learning_rate": 2.752657737430928e-07, + "loss": 1.1216, + "step": 5645 + }, + { + "epoch": 0.7654036467159222, + "grad_norm": 1.8583502678847386, + "learning_rate": 2.749632650839006e-07, + "loss": 1.1741, + "step": 5646 + }, + { + "epoch": 0.765539212363587, + "grad_norm": 4.826434988157324, + "learning_rate": 2.746608962423196e-07, + "loss": 1.1384, + "step": 5647 + }, + { + "epoch": 0.7656747780112519, + "grad_norm": 2.6324815696203965, + "learning_rate": 2.7435866727665924e-07, + "loss": 1.1318, + "step": 5648 + }, + { + "epoch": 0.7658103436589169, + "grad_norm": 2.6026478951457634, + "learning_rate": 2.74056578245202e-07, + "loss": 1.1132, + "step": 5649 + }, + { + "epoch": 0.7659459093065817, + "grad_norm": 2.0424022423854753, + "learning_rate": 2.7375462920620354e-07, + "loss": 1.1444, + "step": 5650 + }, + { + "epoch": 0.7660814749542466, + "grad_norm": 3.148647641989985, + "learning_rate": 2.7345282021789204e-07, + "loss": 1.1633, + "step": 5651 + }, + { + "epoch": 0.7662170406019114, + "grad_norm": 3.7138460469353705, + "learning_rate": 2.731511513384696e-07, + "loss": 1.106, + "step": 5652 + }, + { + "epoch": 0.7663526062495764, + "grad_norm": 2.0191210654455536, + "learning_rate": 2.7284962262610946e-07, + "loss": 1.1643, + "step": 5653 + }, + { + "epoch": 0.7664881718972413, + "grad_norm": 1.8269397608090234, + "learning_rate": 2.7254823413896056e-07, + "loss": 1.1322, + "step": 5654 + }, + { + "epoch": 0.7666237375449061, + "grad_norm": 1.5022871197101797, + "learning_rate": 2.7224698593514183e-07, + "loss": 1.158, + "step": 5655 + }, + { + "epoch": 0.766759303192571, + "grad_norm": 2.540254918418817, + "learning_rate": 2.7194587807274803e-07, + "loss": 1.1286, + "step": 5656 + }, + { + "epoch": 0.7668948688402358, + "grad_norm": 2.3972906791690143, + "learning_rate": 2.7164491060984417e-07, + "loss": 1.1498, + "step": 5657 + }, + { + "epoch": 0.7670304344879008, + "grad_norm": 1.9002242436355024, + "learning_rate": 2.713440836044705e-07, + "loss": 1.1565, + "step": 5658 + }, + { + "epoch": 0.7671660001355657, + "grad_norm": 2.1957865697320167, + "learning_rate": 2.710433971146381e-07, + "loss": 1.132, + "step": 5659 + }, + { + "epoch": 0.7673015657832305, + "grad_norm": 1.820071289792276, + "learning_rate": 2.7074285119833315e-07, + "loss": 1.1723, + "step": 5660 + }, + { + "epoch": 0.7674371314308954, + "grad_norm": 1.8152537456937403, + "learning_rate": 2.704424459135123e-07, + "loss": 1.1356, + "step": 5661 + }, + { + "epoch": 0.7675726970785602, + "grad_norm": 1.9144807276707079, + "learning_rate": 2.701421813181076e-07, + "loss": 1.1081, + "step": 5662 + }, + { + "epoch": 0.7677082627262252, + "grad_norm": 1.630291761429129, + "learning_rate": 2.6984205747002153e-07, + "loss": 1.0945, + "step": 5663 + }, + { + "epoch": 0.7678438283738901, + "grad_norm": 1.728996863668509, + "learning_rate": 2.6954207442713174e-07, + "loss": 1.1376, + "step": 5664 + }, + { + "epoch": 0.7679793940215549, + "grad_norm": 3.010512281990323, + "learning_rate": 2.692422322472866e-07, + "loss": 
1.1785, + "step": 5665 + }, + { + "epoch": 0.7681149596692198, + "grad_norm": 1.9951914787510538, + "learning_rate": 2.689425309883089e-07, + "loss": 1.126, + "step": 5666 + }, + { + "epoch": 0.7682505253168846, + "grad_norm": 2.570706300214789, + "learning_rate": 2.6864297070799336e-07, + "loss": 1.1213, + "step": 5667 + }, + { + "epoch": 0.7683860909645496, + "grad_norm": 1.913354586960707, + "learning_rate": 2.6834355146410793e-07, + "loss": 1.149, + "step": 5668 + }, + { + "epoch": 0.7685216566122145, + "grad_norm": 1.806104264020953, + "learning_rate": 2.6804427331439327e-07, + "loss": 1.127, + "step": 5669 + }, + { + "epoch": 0.7686572222598793, + "grad_norm": 6.778753192779829, + "learning_rate": 2.677451363165628e-07, + "loss": 1.1554, + "step": 5670 + }, + { + "epoch": 0.7687927879075442, + "grad_norm": 1.7999234771540795, + "learning_rate": 2.674461405283027e-07, + "loss": 1.1909, + "step": 5671 + }, + { + "epoch": 0.7689283535552092, + "grad_norm": 2.036888198445882, + "learning_rate": 2.671472860072721e-07, + "loss": 1.1293, + "step": 5672 + }, + { + "epoch": 0.769063919202874, + "grad_norm": 1.7764404164461731, + "learning_rate": 2.6684857281110286e-07, + "loss": 1.2062, + "step": 5673 + }, + { + "epoch": 0.7691994848505389, + "grad_norm": 2.454161066728569, + "learning_rate": 2.6655000099739857e-07, + "loss": 1.1233, + "step": 5674 + }, + { + "epoch": 0.7693350504982037, + "grad_norm": 2.093863764537634, + "learning_rate": 2.662515706237376e-07, + "loss": 1.104, + "step": 5675 + }, + { + "epoch": 0.7694706161458686, + "grad_norm": 1.8570002667922032, + "learning_rate": 2.6595328174766885e-07, + "loss": 1.1174, + "step": 5676 + }, + { + "epoch": 0.7696061817935336, + "grad_norm": 1.7284401402394818, + "learning_rate": 2.656551344267162e-07, + "loss": 1.1645, + "step": 5677 + }, + { + "epoch": 0.7697417474411984, + "grad_norm": 3.4420038578926913, + "learning_rate": 2.6535712871837357e-07, + "loss": 1.1211, + "step": 5678 + }, + { + "epoch": 0.7698773130888633, + "grad_norm": 1.8966344210185397, + "learning_rate": 2.6505926468011044e-07, + "loss": 1.1485, + "step": 5679 + }, + { + "epoch": 0.7700128787365281, + "grad_norm": 1.8258979430761866, + "learning_rate": 2.6476154236936643e-07, + "loss": 1.1216, + "step": 5680 + }, + { + "epoch": 0.770148444384193, + "grad_norm": 2.9747916112145223, + "learning_rate": 2.6446396184355545e-07, + "loss": 1.1234, + "step": 5681 + }, + { + "epoch": 0.770284010031858, + "grad_norm": 2.036185592479562, + "learning_rate": 2.641665231600634e-07, + "loss": 1.1533, + "step": 5682 + }, + { + "epoch": 0.7704195756795228, + "grad_norm": 2.00168395783832, + "learning_rate": 2.6386922637624906e-07, + "loss": 1.0757, + "step": 5683 + }, + { + "epoch": 0.7705551413271877, + "grad_norm": 1.677435929022345, + "learning_rate": 2.635720715494438e-07, + "loss": 1.1191, + "step": 5684 + }, + { + "epoch": 0.7706907069748525, + "grad_norm": 2.245072535458002, + "learning_rate": 2.6327505873695157e-07, + "loss": 1.2146, + "step": 5685 + }, + { + "epoch": 0.7708262726225175, + "grad_norm": 1.6170687616092716, + "learning_rate": 2.629781879960488e-07, + "loss": 1.1284, + "step": 5686 + }, + { + "epoch": 0.7709618382701824, + "grad_norm": 1.7511473765970011, + "learning_rate": 2.626814593839848e-07, + "loss": 1.1359, + "step": 5687 + }, + { + "epoch": 0.7710974039178472, + "grad_norm": 1.9181836437158368, + "learning_rate": 2.623848729579813e-07, + "loss": 1.1623, + "step": 5688 + }, + { + "epoch": 0.7712329695655121, + "grad_norm": 2.789908195715468, + 
"learning_rate": 2.620884287752327e-07, + "loss": 1.1496, + "step": 5689 + }, + { + "epoch": 0.7713685352131769, + "grad_norm": 1.7122413737964062, + "learning_rate": 2.61792126892906e-07, + "loss": 1.1429, + "step": 5690 + }, + { + "epoch": 0.7715041008608419, + "grad_norm": 2.3751294418178537, + "learning_rate": 2.614959673681404e-07, + "loss": 1.1342, + "step": 5691 + }, + { + "epoch": 0.7716396665085068, + "grad_norm": 2.05219969948566, + "learning_rate": 2.611999502580482e-07, + "loss": 1.1347, + "step": 5692 + }, + { + "epoch": 0.7717752321561716, + "grad_norm": 2.0319778452295845, + "learning_rate": 2.6090407561971405e-07, + "loss": 1.0937, + "step": 5693 + }, + { + "epoch": 0.7719107978038365, + "grad_norm": 2.17391334943846, + "learning_rate": 2.6060834351019433e-07, + "loss": 1.1468, + "step": 5694 + }, + { + "epoch": 0.7720463634515013, + "grad_norm": 1.9269888505681132, + "learning_rate": 2.6031275398651986e-07, + "loss": 1.1728, + "step": 5695 + }, + { + "epoch": 0.7721819290991663, + "grad_norm": 1.9334699963970314, + "learning_rate": 2.6001730710569123e-07, + "loss": 1.1161, + "step": 5696 + }, + { + "epoch": 0.7723174947468312, + "grad_norm": 1.7835472167627562, + "learning_rate": 2.597220029246846e-07, + "loss": 1.1288, + "step": 5697 + }, + { + "epoch": 0.772453060394496, + "grad_norm": 1.7716013290169998, + "learning_rate": 2.594268415004457e-07, + "loss": 1.1745, + "step": 5698 + }, + { + "epoch": 0.7725886260421609, + "grad_norm": 3.9681162984655876, + "learning_rate": 2.591318228898953e-07, + "loss": 1.2289, + "step": 5699 + }, + { + "epoch": 0.7727241916898258, + "grad_norm": 2.606556110885668, + "learning_rate": 2.5883694714992446e-07, + "loss": 1.1311, + "step": 5700 + }, + { + "epoch": 0.7728597573374907, + "grad_norm": 2.044029591787401, + "learning_rate": 2.5854221433739797e-07, + "loss": 1.1646, + "step": 5701 + }, + { + "epoch": 0.7729953229851556, + "grad_norm": 2.5560763074510127, + "learning_rate": 2.582476245091527e-07, + "loss": 1.1501, + "step": 5702 + }, + { + "epoch": 0.7731308886328204, + "grad_norm": 1.960337634256339, + "learning_rate": 2.579531777219981e-07, + "loss": 1.1449, + "step": 5703 + }, + { + "epoch": 0.7732664542804853, + "grad_norm": 2.218059550149924, + "learning_rate": 2.576588740327158e-07, + "loss": 1.1156, + "step": 5704 + }, + { + "epoch": 0.7734020199281502, + "grad_norm": 1.79194628599034, + "learning_rate": 2.573647134980599e-07, + "loss": 1.1337, + "step": 5705 + }, + { + "epoch": 0.7735375855758151, + "grad_norm": 2.14018915143239, + "learning_rate": 2.57070696174757e-07, + "loss": 1.1493, + "step": 5706 + }, + { + "epoch": 0.77367315122348, + "grad_norm": 1.894925420994337, + "learning_rate": 2.5677682211950604e-07, + "loss": 1.1302, + "step": 5707 + }, + { + "epoch": 0.7738087168711448, + "grad_norm": 1.8648794543426026, + "learning_rate": 2.564830913889783e-07, + "loss": 1.1423, + "step": 5708 + }, + { + "epoch": 0.7739442825188098, + "grad_norm": 1.6782201103313785, + "learning_rate": 2.561895040398173e-07, + "loss": 1.134, + "step": 5709 + }, + { + "epoch": 0.7740798481664746, + "grad_norm": 1.7730903773696625, + "learning_rate": 2.5589606012863964e-07, + "loss": 1.14, + "step": 5710 + }, + { + "epoch": 0.7742154138141395, + "grad_norm": 2.8028318933005862, + "learning_rate": 2.556027597120325e-07, + "loss": 1.1467, + "step": 5711 + }, + { + "epoch": 0.7743509794618044, + "grad_norm": 1.9398490910099533, + "learning_rate": 2.553096028465578e-07, + "loss": 1.1459, + "step": 5712 + }, + { + "epoch": 0.7744865451094692, + 
"grad_norm": 2.2968219512564576, + "learning_rate": 2.550165895887474e-07, + "loss": 1.1386, + "step": 5713 + }, + { + "epoch": 0.7746221107571342, + "grad_norm": 1.7577436337108898, + "learning_rate": 2.547237199951078e-07, + "loss": 1.1623, + "step": 5714 + }, + { + "epoch": 0.774757676404799, + "grad_norm": 2.2032496272069366, + "learning_rate": 2.5443099412211535e-07, + "loss": 1.1561, + "step": 5715 + }, + { + "epoch": 0.7748932420524639, + "grad_norm": 1.8668375825362717, + "learning_rate": 2.54138412026221e-07, + "loss": 1.1718, + "step": 5716 + }, + { + "epoch": 0.7750288077001288, + "grad_norm": 2.068862693842728, + "learning_rate": 2.5384597376384596e-07, + "loss": 1.1564, + "step": 5717 + }, + { + "epoch": 0.7751643733477936, + "grad_norm": 1.6458835923398782, + "learning_rate": 2.535536793913856e-07, + "loss": 1.1471, + "step": 5718 + }, + { + "epoch": 0.7752999389954586, + "grad_norm": 2.560237774899055, + "learning_rate": 2.532615289652055e-07, + "loss": 1.1357, + "step": 5719 + }, + { + "epoch": 0.7754355046431234, + "grad_norm": 1.8802995092611188, + "learning_rate": 2.5296952254164573e-07, + "loss": 1.1235, + "step": 5720 + }, + { + "epoch": 0.7755710702907883, + "grad_norm": 1.7194480398954257, + "learning_rate": 2.5267766017701664e-07, + "loss": 1.1017, + "step": 5721 + }, + { + "epoch": 0.7757066359384532, + "grad_norm": 2.0915383559823986, + "learning_rate": 2.5238594192760165e-07, + "loss": 1.1302, + "step": 5722 + }, + { + "epoch": 0.775842201586118, + "grad_norm": 2.3167238982494323, + "learning_rate": 2.5209436784965657e-07, + "loss": 1.138, + "step": 5723 + }, + { + "epoch": 0.775977767233783, + "grad_norm": 2.159569426283523, + "learning_rate": 2.5180293799940886e-07, + "loss": 1.1208, + "step": 5724 + }, + { + "epoch": 0.7761133328814478, + "grad_norm": 1.8461312151518516, + "learning_rate": 2.5151165243305885e-07, + "loss": 1.1261, + "step": 5725 + }, + { + "epoch": 0.7762488985291127, + "grad_norm": 2.7123817409059967, + "learning_rate": 2.512205112067783e-07, + "loss": 1.1284, + "step": 5726 + }, + { + "epoch": 0.7763844641767776, + "grad_norm": 1.9135454229907816, + "learning_rate": 2.5092951437671184e-07, + "loss": 1.1893, + "step": 5727 + }, + { + "epoch": 0.7765200298244425, + "grad_norm": 1.7836373239325416, + "learning_rate": 2.5063866199897556e-07, + "loss": 1.1308, + "step": 5728 + }, + { + "epoch": 0.7766555954721074, + "grad_norm": 1.8557433660741218, + "learning_rate": 2.5034795412965825e-07, + "loss": 1.1046, + "step": 5729 + }, + { + "epoch": 0.7767911611197722, + "grad_norm": 3.7491445688257947, + "learning_rate": 2.500573908248207e-07, + "loss": 1.1648, + "step": 5730 + }, + { + "epoch": 0.7769267267674371, + "grad_norm": 1.8997365981926968, + "learning_rate": 2.497669721404956e-07, + "loss": 1.1308, + "step": 5731 + }, + { + "epoch": 0.777062292415102, + "grad_norm": 1.5865845661587552, + "learning_rate": 2.494766981326878e-07, + "loss": 1.1399, + "step": 5732 + }, + { + "epoch": 0.7771978580627669, + "grad_norm": 2.3234439942666327, + "learning_rate": 2.4918656885737465e-07, + "loss": 1.1046, + "step": 5733 + }, + { + "epoch": 0.7773334237104318, + "grad_norm": 1.8489708820215722, + "learning_rate": 2.488965843705051e-07, + "loss": 1.1491, + "step": 5734 + }, + { + "epoch": 0.7774689893580966, + "grad_norm": 5.312355918598942, + "learning_rate": 2.4860674472800036e-07, + "loss": 1.1416, + "step": 5735 + }, + { + "epoch": 0.7776045550057615, + "grad_norm": 2.049504729474085, + "learning_rate": 2.483170499857541e-07, + "loss": 1.183, + "step": 
5736 + }, + { + "epoch": 0.7777401206534265, + "grad_norm": 2.175698817059581, + "learning_rate": 2.48027500199631e-07, + "loss": 1.1457, + "step": 5737 + }, + { + "epoch": 0.7778756863010913, + "grad_norm": 1.880103482272691, + "learning_rate": 2.477380954254689e-07, + "loss": 1.1265, + "step": 5738 + }, + { + "epoch": 0.7780112519487562, + "grad_norm": 2.087553248282905, + "learning_rate": 2.4744883571907694e-07, + "loss": 1.1226, + "step": 5739 + }, + { + "epoch": 0.778146817596421, + "grad_norm": 2.4727497038742707, + "learning_rate": 2.471597211362367e-07, + "loss": 1.1655, + "step": 5740 + }, + { + "epoch": 0.7782823832440859, + "grad_norm": 6.189103402690944, + "learning_rate": 2.468707517327019e-07, + "loss": 1.1755, + "step": 5741 + }, + { + "epoch": 0.7784179488917509, + "grad_norm": 2.9138687242290944, + "learning_rate": 2.465819275641976e-07, + "loss": 1.1434, + "step": 5742 + }, + { + "epoch": 0.7785535145394157, + "grad_norm": 4.084361506909097, + "learning_rate": 2.462932486864215e-07, + "loss": 1.1549, + "step": 5743 + }, + { + "epoch": 0.7786890801870806, + "grad_norm": 1.9447183522581681, + "learning_rate": 2.4600471515504293e-07, + "loss": 1.1016, + "step": 5744 + }, + { + "epoch": 0.7788246458347454, + "grad_norm": 2.4955017322375586, + "learning_rate": 2.4571632702570356e-07, + "loss": 1.1246, + "step": 5745 + }, + { + "epoch": 0.7789602114824103, + "grad_norm": 3.054973081550896, + "learning_rate": 2.454280843540164e-07, + "loss": 1.1599, + "step": 5746 + }, + { + "epoch": 0.7790957771300753, + "grad_norm": 1.8351679538956476, + "learning_rate": 2.4513998719556693e-07, + "loss": 1.1396, + "step": 5747 + }, + { + "epoch": 0.7792313427777401, + "grad_norm": 1.7623204363713387, + "learning_rate": 2.448520356059125e-07, + "loss": 1.1296, + "step": 5748 + }, + { + "epoch": 0.779366908425405, + "grad_norm": 1.7961588145207772, + "learning_rate": 2.4456422964058254e-07, + "loss": 1.1902, + "step": 5749 + }, + { + "epoch": 0.7795024740730698, + "grad_norm": 1.9327474753287663, + "learning_rate": 2.442765693550772e-07, + "loss": 1.1488, + "step": 5750 + }, + { + "epoch": 0.7796380397207348, + "grad_norm": 1.7899921096159364, + "learning_rate": 2.4398905480487073e-07, + "loss": 1.0917, + "step": 5751 + }, + { + "epoch": 0.7797736053683997, + "grad_norm": 2.2716165470221736, + "learning_rate": 2.4370168604540697e-07, + "loss": 1.1232, + "step": 5752 + }, + { + "epoch": 0.7799091710160645, + "grad_norm": 1.8043013245424737, + "learning_rate": 2.4341446313210365e-07, + "loss": 1.1444, + "step": 5753 + }, + { + "epoch": 0.7800447366637294, + "grad_norm": 1.9173185081653654, + "learning_rate": 2.4312738612034843e-07, + "loss": 1.1415, + "step": 5754 + }, + { + "epoch": 0.7801803023113943, + "grad_norm": 1.6856065892994403, + "learning_rate": 2.428404550655031e-07, + "loss": 1.1164, + "step": 5755 + }, + { + "epoch": 0.7803158679590592, + "grad_norm": 2.946711444314928, + "learning_rate": 2.425536700228986e-07, + "loss": 1.1403, + "step": 5756 + }, + { + "epoch": 0.7804514336067241, + "grad_norm": 3.431125456474364, + "learning_rate": 2.422670310478406e-07, + "loss": 1.1434, + "step": 5757 + }, + { + "epoch": 0.7805869992543889, + "grad_norm": 2.7635334689691953, + "learning_rate": 2.4198053819560394e-07, + "loss": 1.1315, + "step": 5758 + }, + { + "epoch": 0.7807225649020538, + "grad_norm": 18.243855658285618, + "learning_rate": 2.4169419152143766e-07, + "loss": 1.1656, + "step": 5759 + }, + { + "epoch": 0.7808581305497188, + "grad_norm": 1.6658757682040162, + "learning_rate": 
2.414079910805601e-07, + "loss": 1.1481, + "step": 5760 + }, + { + "epoch": 0.7809936961973836, + "grad_norm": 1.6153150344291225, + "learning_rate": 2.4112193692816416e-07, + "loss": 1.1483, + "step": 5761 + }, + { + "epoch": 0.7811292618450485, + "grad_norm": 1.8999712915725417, + "learning_rate": 2.4083602911941224e-07, + "loss": 1.1592, + "step": 5762 + }, + { + "epoch": 0.7812648274927133, + "grad_norm": 2.328713121604467, + "learning_rate": 2.405502677094395e-07, + "loss": 1.1138, + "step": 5763 + }, + { + "epoch": 0.7814003931403782, + "grad_norm": 2.0697583083091904, + "learning_rate": 2.4026465275335306e-07, + "loss": 1.1654, + "step": 5764 + }, + { + "epoch": 0.7815359587880432, + "grad_norm": 2.339580483866321, + "learning_rate": 2.399791843062312e-07, + "loss": 1.117, + "step": 5765 + }, + { + "epoch": 0.781671524435708, + "grad_norm": 1.5843064782377827, + "learning_rate": 2.396938624231245e-07, + "loss": 1.1143, + "step": 5766 + }, + { + "epoch": 0.7818070900833729, + "grad_norm": 1.5370255363382181, + "learning_rate": 2.3940868715905495e-07, + "loss": 1.1379, + "step": 5767 + }, + { + "epoch": 0.7819426557310377, + "grad_norm": 1.7833734968019281, + "learning_rate": 2.3912365856901627e-07, + "loss": 1.127, + "step": 5768 + }, + { + "epoch": 0.7820782213787026, + "grad_norm": 3.5742605905556117, + "learning_rate": 2.38838776707974e-07, + "loss": 1.2018, + "step": 5769 + }, + { + "epoch": 0.7822137870263676, + "grad_norm": 1.6466231117352548, + "learning_rate": 2.3855404163086556e-07, + "loss": 1.1155, + "step": 5770 + }, + { + "epoch": 0.7823493526740324, + "grad_norm": 2.8358616963440415, + "learning_rate": 2.3826945339259964e-07, + "loss": 1.1308, + "step": 5771 + }, + { + "epoch": 0.7824849183216973, + "grad_norm": 2.0279572700165995, + "learning_rate": 2.379850120480571e-07, + "loss": 1.1908, + "step": 5772 + }, + { + "epoch": 0.7826204839693621, + "grad_norm": 2.8334207996319374, + "learning_rate": 2.3770071765208956e-07, + "loss": 1.1574, + "step": 5773 + }, + { + "epoch": 0.782756049617027, + "grad_norm": 4.386188418225304, + "learning_rate": 2.3741657025952188e-07, + "loss": 1.1684, + "step": 5774 + }, + { + "epoch": 0.782891615264692, + "grad_norm": 1.8410940050130384, + "learning_rate": 2.3713256992514853e-07, + "loss": 1.1721, + "step": 5775 + }, + { + "epoch": 0.7830271809123568, + "grad_norm": 1.6902801439942803, + "learning_rate": 2.3684871670373806e-07, + "loss": 1.1798, + "step": 5776 + }, + { + "epoch": 0.7831627465600217, + "grad_norm": 1.6185247342024067, + "learning_rate": 2.365650106500282e-07, + "loss": 1.111, + "step": 5777 + }, + { + "epoch": 0.7832983122076865, + "grad_norm": 1.9544520694441552, + "learning_rate": 2.3628145181872994e-07, + "loss": 1.1312, + "step": 5778 + }, + { + "epoch": 0.7834338778553515, + "grad_norm": 2.1295587540878103, + "learning_rate": 2.359980402645253e-07, + "loss": 1.1842, + "step": 5779 + }, + { + "epoch": 0.7835694435030164, + "grad_norm": 1.962091013068739, + "learning_rate": 2.3571477604206792e-07, + "loss": 1.176, + "step": 5780 + }, + { + "epoch": 0.7837050091506812, + "grad_norm": 2.420794402119144, + "learning_rate": 2.3543165920598308e-07, + "loss": 1.156, + "step": 5781 + }, + { + "epoch": 0.7838405747983461, + "grad_norm": 1.7717571575469213, + "learning_rate": 2.3514868981086755e-07, + "loss": 1.108, + "step": 5782 + }, + { + "epoch": 0.7839761404460109, + "grad_norm": 2.019535979734364, + "learning_rate": 2.3486586791128982e-07, + "loss": 1.1904, + "step": 5783 + }, + { + "epoch": 0.7841117060936759, + 
"grad_norm": 1.7803638629310723, + "learning_rate": 2.345831935617899e-07, + "loss": 1.1263, + "step": 5784 + }, + { + "epoch": 0.7842472717413408, + "grad_norm": 1.9089122752216308, + "learning_rate": 2.3430066681687932e-07, + "loss": 1.1266, + "step": 5785 + }, + { + "epoch": 0.7843828373890056, + "grad_norm": 1.923341572720763, + "learning_rate": 2.3401828773104103e-07, + "loss": 1.11, + "step": 5786 + }, + { + "epoch": 0.7845184030366705, + "grad_norm": 1.82670096345958, + "learning_rate": 2.3373605635872972e-07, + "loss": 1.1059, + "step": 5787 + }, + { + "epoch": 0.7846539686843353, + "grad_norm": 1.6855802863203815, + "learning_rate": 2.334539727543713e-07, + "loss": 1.1431, + "step": 5788 + }, + { + "epoch": 0.7847895343320003, + "grad_norm": 3.075864271986848, + "learning_rate": 2.3317203697236353e-07, + "loss": 1.1898, + "step": 5789 + }, + { + "epoch": 0.7849250999796652, + "grad_norm": 1.957415239391203, + "learning_rate": 2.3289024906707555e-07, + "loss": 1.1619, + "step": 5790 + }, + { + "epoch": 0.78506066562733, + "grad_norm": 1.8718446772888455, + "learning_rate": 2.3260860909284773e-07, + "loss": 1.161, + "step": 5791 + }, + { + "epoch": 0.7851962312749949, + "grad_norm": 2.466055255669323, + "learning_rate": 2.3232711710399255e-07, + "loss": 1.143, + "step": 5792 + }, + { + "epoch": 0.7853317969226598, + "grad_norm": 1.7605384160167228, + "learning_rate": 2.3204577315479269e-07, + "loss": 1.1545, + "step": 5793 + }, + { + "epoch": 0.7854673625703247, + "grad_norm": 1.8150546604770832, + "learning_rate": 2.3176457729950417e-07, + "loss": 1.1166, + "step": 5794 + }, + { + "epoch": 0.7856029282179896, + "grad_norm": 2.4797940054604095, + "learning_rate": 2.3148352959235218e-07, + "loss": 1.1376, + "step": 5795 + }, + { + "epoch": 0.7857384938656544, + "grad_norm": 2.0227744006872954, + "learning_rate": 2.3120263008753582e-07, + "loss": 1.1865, + "step": 5796 + }, + { + "epoch": 0.7858740595133193, + "grad_norm": 1.8986572937728998, + "learning_rate": 2.309218788392232e-07, + "loss": 1.1416, + "step": 5797 + }, + { + "epoch": 0.7860096251609842, + "grad_norm": 1.9263120382550951, + "learning_rate": 2.3064127590155603e-07, + "loss": 1.142, + "step": 5798 + }, + { + "epoch": 0.7861451908086491, + "grad_norm": 1.9033397133489156, + "learning_rate": 2.3036082132864555e-07, + "loss": 1.147, + "step": 5799 + }, + { + "epoch": 0.786280756456314, + "grad_norm": 2.1550710575173215, + "learning_rate": 2.300805151745756e-07, + "loss": 1.1548, + "step": 5800 + }, + { + "epoch": 0.7864163221039788, + "grad_norm": 1.843571842719603, + "learning_rate": 2.2980035749340088e-07, + "loss": 1.1262, + "step": 5801 + }, + { + "epoch": 0.7865518877516438, + "grad_norm": 1.6475129662328083, + "learning_rate": 2.2952034833914757e-07, + "loss": 1.1453, + "step": 5802 + }, + { + "epoch": 0.7866874533993086, + "grad_norm": 4.15542186442814, + "learning_rate": 2.292404877658134e-07, + "loss": 1.1432, + "step": 5803 + }, + { + "epoch": 0.7868230190469735, + "grad_norm": 2.0111026721386316, + "learning_rate": 2.2896077582736705e-07, + "loss": 1.1417, + "step": 5804 + }, + { + "epoch": 0.7869585846946384, + "grad_norm": 1.888332943219119, + "learning_rate": 2.2868121257774885e-07, + "loss": 1.1298, + "step": 5805 + }, + { + "epoch": 0.7870941503423032, + "grad_norm": 2.034903342369246, + "learning_rate": 2.2840179807087044e-07, + "loss": 1.1564, + "step": 5806 + }, + { + "epoch": 0.7872297159899682, + "grad_norm": 1.5915866241886407, + "learning_rate": 2.2812253236061497e-07, + "loss": 1.1125, + "step": 
5807 + }, + { + "epoch": 0.787365281637633, + "grad_norm": 2.2149907113748344, + "learning_rate": 2.2784341550083574e-07, + "loss": 1.1259, + "step": 5808 + }, + { + "epoch": 0.7875008472852979, + "grad_norm": 4.699857604484971, + "learning_rate": 2.275644475453593e-07, + "loss": 1.1494, + "step": 5809 + }, + { + "epoch": 0.7876364129329628, + "grad_norm": 1.8843597956474176, + "learning_rate": 2.272856285479814e-07, + "loss": 1.1953, + "step": 5810 + }, + { + "epoch": 0.7877719785806276, + "grad_norm": 2.840431917046554, + "learning_rate": 2.2700695856247122e-07, + "loss": 1.1205, + "step": 5811 + }, + { + "epoch": 0.7879075442282926, + "grad_norm": 2.3415332767346833, + "learning_rate": 2.2672843764256678e-07, + "loss": 1.1271, + "step": 5812 + }, + { + "epoch": 0.7880431098759574, + "grad_norm": 2.114885372209233, + "learning_rate": 2.264500658419799e-07, + "loss": 1.1272, + "step": 5813 + }, + { + "epoch": 0.7881786755236223, + "grad_norm": 1.782165716184466, + "learning_rate": 2.261718432143912e-07, + "loss": 1.1265, + "step": 5814 + }, + { + "epoch": 0.7883142411712872, + "grad_norm": 2.571657261432421, + "learning_rate": 2.2589376981345487e-07, + "loss": 1.1009, + "step": 5815 + }, + { + "epoch": 0.788449806818952, + "grad_norm": 5.799679082335375, + "learning_rate": 2.25615845692794e-07, + "loss": 1.1402, + "step": 5816 + }, + { + "epoch": 0.788585372466617, + "grad_norm": 2.202451275898357, + "learning_rate": 2.253380709060053e-07, + "loss": 1.1332, + "step": 5817 + }, + { + "epoch": 0.7887209381142818, + "grad_norm": 1.849194143098968, + "learning_rate": 2.2506044550665438e-07, + "loss": 1.1418, + "step": 5818 + }, + { + "epoch": 0.7888565037619467, + "grad_norm": 2.3777257958217386, + "learning_rate": 2.247829695482799e-07, + "loss": 1.1232, + "step": 5819 + }, + { + "epoch": 0.7889920694096116, + "grad_norm": 1.8301072485608354, + "learning_rate": 2.2450564308439036e-07, + "loss": 1.1647, + "step": 5820 + }, + { + "epoch": 0.7891276350572765, + "grad_norm": 1.7490567410068338, + "learning_rate": 2.2422846616846613e-07, + "loss": 1.1126, + "step": 5821 + }, + { + "epoch": 0.7892632007049414, + "grad_norm": 2.5333176209026678, + "learning_rate": 2.2395143885395873e-07, + "loss": 1.1445, + "step": 5822 + }, + { + "epoch": 0.7893987663526062, + "grad_norm": 1.7713023791283222, + "learning_rate": 2.236745611942905e-07, + "loss": 1.1661, + "step": 5823 + }, + { + "epoch": 0.7895343320002711, + "grad_norm": 3.1952122522289335, + "learning_rate": 2.2339783324285523e-07, + "loss": 1.1446, + "step": 5824 + }, + { + "epoch": 0.789669897647936, + "grad_norm": 1.6406142926834908, + "learning_rate": 2.231212550530177e-07, + "loss": 1.1515, + "step": 5825 + }, + { + "epoch": 0.7898054632956009, + "grad_norm": 6.200542112750877, + "learning_rate": 2.2284482667811378e-07, + "loss": 1.1326, + "step": 5826 + }, + { + "epoch": 0.7899410289432658, + "grad_norm": 1.7200145961487867, + "learning_rate": 2.2256854817145065e-07, + "loss": 1.1441, + "step": 5827 + }, + { + "epoch": 0.7900765945909306, + "grad_norm": 2.038980289292458, + "learning_rate": 2.2229241958630617e-07, + "loss": 1.1736, + "step": 5828 + }, + { + "epoch": 0.7902121602385955, + "grad_norm": 1.8065922017604605, + "learning_rate": 2.2201644097592987e-07, + "loss": 1.1297, + "step": 5829 + }, + { + "epoch": 0.7903477258862605, + "grad_norm": 1.8317950084059942, + "learning_rate": 2.217406123935418e-07, + "loss": 1.1669, + "step": 5830 + }, + { + "epoch": 0.7904832915339253, + "grad_norm": 1.8133274541305886, + "learning_rate": 
2.2146493389233357e-07, + "loss": 1.1086, + "step": 5831 + }, + { + "epoch": 0.7906188571815902, + "grad_norm": 1.8095442879242507, + "learning_rate": 2.211894055254673e-07, + "loss": 1.1531, + "step": 5832 + }, + { + "epoch": 0.7907544228292551, + "grad_norm": 1.9879206773306697, + "learning_rate": 2.20914027346077e-07, + "loss": 1.166, + "step": 5833 + }, + { + "epoch": 0.7908899884769199, + "grad_norm": 2.291261452555658, + "learning_rate": 2.206387994072665e-07, + "loss": 1.1173, + "step": 5834 + }, + { + "epoch": 0.7910255541245849, + "grad_norm": 1.9278477263254994, + "learning_rate": 2.2036372176211148e-07, + "loss": 1.1481, + "step": 5835 + }, + { + "epoch": 0.7911611197722497, + "grad_norm": 1.8340572940571023, + "learning_rate": 2.200887944636588e-07, + "loss": 1.1355, + "step": 5836 + }, + { + "epoch": 0.7912966854199146, + "grad_norm": 1.579457133332385, + "learning_rate": 2.198140175649259e-07, + "loss": 1.1788, + "step": 5837 + }, + { + "epoch": 0.7914322510675795, + "grad_norm": 1.6469234266755413, + "learning_rate": 2.195393911189012e-07, + "loss": 1.1743, + "step": 5838 + }, + { + "epoch": 0.7915678167152443, + "grad_norm": 1.8353406744163259, + "learning_rate": 2.192649151785444e-07, + "loss": 1.1638, + "step": 5839 + }, + { + "epoch": 0.7917033823629093, + "grad_norm": 2.3107172014544215, + "learning_rate": 2.1899058979678586e-07, + "loss": 1.1445, + "step": 5840 + }, + { + "epoch": 0.7918389480105741, + "grad_norm": 3.074792663635941, + "learning_rate": 2.1871641502652728e-07, + "loss": 1.1403, + "step": 5841 + }, + { + "epoch": 0.791974513658239, + "grad_norm": 6.796220178671116, + "learning_rate": 2.1844239092064088e-07, + "loss": 1.1624, + "step": 5842 + }, + { + "epoch": 0.7921100793059039, + "grad_norm": 1.8515540234526475, + "learning_rate": 2.181685175319702e-07, + "loss": 1.1183, + "step": 5843 + }, + { + "epoch": 0.7922456449535688, + "grad_norm": 6.267115631768789, + "learning_rate": 2.1789479491332953e-07, + "loss": 1.1638, + "step": 5844 + }, + { + "epoch": 0.7923812106012337, + "grad_norm": 2.0589815778914775, + "learning_rate": 2.176212231175041e-07, + "loss": 1.1712, + "step": 5845 + }, + { + "epoch": 0.7925167762488985, + "grad_norm": 1.7430007150341373, + "learning_rate": 2.1734780219725e-07, + "loss": 1.1405, + "step": 5846 + }, + { + "epoch": 0.7926523418965634, + "grad_norm": 1.7469794159881458, + "learning_rate": 2.1707453220529448e-07, + "loss": 1.1712, + "step": 5847 + }, + { + "epoch": 0.7927879075442283, + "grad_norm": 9.925724455714414, + "learning_rate": 2.1680141319433564e-07, + "loss": 1.1435, + "step": 5848 + }, + { + "epoch": 0.7929234731918932, + "grad_norm": 2.3071663360347094, + "learning_rate": 2.165284452170415e-07, + "loss": 1.1062, + "step": 5849 + }, + { + "epoch": 0.7930590388395581, + "grad_norm": 2.7775994095851284, + "learning_rate": 2.1625562832605281e-07, + "loss": 1.159, + "step": 5850 + }, + { + "epoch": 0.7931946044872229, + "grad_norm": 2.093751652972958, + "learning_rate": 2.159829625739793e-07, + "loss": 1.1634, + "step": 5851 + }, + { + "epoch": 0.7933301701348878, + "grad_norm": 2.6568285157065077, + "learning_rate": 2.157104480134032e-07, + "loss": 1.1094, + "step": 5852 + }, + { + "epoch": 0.7934657357825528, + "grad_norm": 2.0181352703228557, + "learning_rate": 2.1543808469687596e-07, + "loss": 1.1087, + "step": 5853 + }, + { + "epoch": 0.7936013014302176, + "grad_norm": 1.6747499884517534, + "learning_rate": 2.1516587267692165e-07, + "loss": 1.0874, + "step": 5854 + }, + { + "epoch": 0.7937368670778825, + 
"grad_norm": 2.189805525754781, + "learning_rate": 2.1489381200603307e-07, + "loss": 1.1856, + "step": 5855 + }, + { + "epoch": 0.7938724327255473, + "grad_norm": 2.4600316180026387, + "learning_rate": 2.1462190273667624e-07, + "loss": 1.1286, + "step": 5856 + }, + { + "epoch": 0.7940079983732122, + "grad_norm": 1.9982339518298382, + "learning_rate": 2.1435014492128545e-07, + "loss": 1.0888, + "step": 5857 + }, + { + "epoch": 0.7941435640208772, + "grad_norm": 1.4967218691967514, + "learning_rate": 2.1407853861226833e-07, + "loss": 1.1326, + "step": 5858 + }, + { + "epoch": 0.794279129668542, + "grad_norm": 2.1749582761812, + "learning_rate": 2.1380708386200075e-07, + "loss": 1.1385, + "step": 5859 + }, + { + "epoch": 0.7944146953162069, + "grad_norm": 2.2617802424846953, + "learning_rate": 2.1353578072283175e-07, + "loss": 1.1538, + "step": 5860 + }, + { + "epoch": 0.7945502609638717, + "grad_norm": 1.8316617045651826, + "learning_rate": 2.1326462924707912e-07, + "loss": 1.1359, + "step": 5861 + }, + { + "epoch": 0.7946858266115366, + "grad_norm": 2.259592359980429, + "learning_rate": 2.129936294870327e-07, + "loss": 1.1584, + "step": 5862 + }, + { + "epoch": 0.7948213922592016, + "grad_norm": 1.99546219056723, + "learning_rate": 2.127227814949526e-07, + "loss": 1.145, + "step": 5863 + }, + { + "epoch": 0.7949569579068664, + "grad_norm": 1.9016045608649566, + "learning_rate": 2.124520853230697e-07, + "loss": 1.1568, + "step": 5864 + }, + { + "epoch": 0.7950925235545313, + "grad_norm": 2.0501162739592558, + "learning_rate": 2.1218154102358554e-07, + "loss": 1.1646, + "step": 5865 + }, + { + "epoch": 0.7952280892021961, + "grad_norm": 1.6926416199591499, + "learning_rate": 2.1191114864867255e-07, + "loss": 1.1385, + "step": 5866 + }, + { + "epoch": 0.795363654849861, + "grad_norm": 1.820064642350009, + "learning_rate": 2.1164090825047388e-07, + "loss": 1.1285, + "step": 5867 + }, + { + "epoch": 0.795499220497526, + "grad_norm": 2.0552854240953633, + "learning_rate": 2.1137081988110294e-07, + "loss": 1.1556, + "step": 5868 + }, + { + "epoch": 0.7956347861451908, + "grad_norm": 1.667453434735838, + "learning_rate": 2.1110088359264445e-07, + "loss": 1.1637, + "step": 5869 + }, + { + "epoch": 0.7957703517928557, + "grad_norm": 3.019347760948753, + "learning_rate": 2.108310994371534e-07, + "loss": 1.1564, + "step": 5870 + }, + { + "epoch": 0.7959059174405205, + "grad_norm": 2.328216997842197, + "learning_rate": 2.105614674666556e-07, + "loss": 1.0958, + "step": 5871 + }, + { + "epoch": 0.7960414830881855, + "grad_norm": 1.5911939163841635, + "learning_rate": 2.1029198773314693e-07, + "loss": 1.1572, + "step": 5872 + }, + { + "epoch": 0.7961770487358504, + "grad_norm": 1.666510738518348, + "learning_rate": 2.1002266028859539e-07, + "loss": 1.1518, + "step": 5873 + }, + { + "epoch": 0.7963126143835152, + "grad_norm": 1.6823207544764032, + "learning_rate": 2.0975348518493762e-07, + "loss": 1.142, + "step": 5874 + }, + { + "epoch": 0.7964481800311801, + "grad_norm": 1.718149114662994, + "learning_rate": 2.094844624740828e-07, + "loss": 1.1398, + "step": 5875 + }, + { + "epoch": 0.7965837456788449, + "grad_norm": 1.6391707387557288, + "learning_rate": 2.092155922079093e-07, + "loss": 1.1519, + "step": 5876 + }, + { + "epoch": 0.7967193113265099, + "grad_norm": 2.248256995201791, + "learning_rate": 2.0894687443826675e-07, + "loss": 1.1253, + "step": 5877 + }, + { + "epoch": 0.7968548769741748, + "grad_norm": 1.6720168902070662, + "learning_rate": 2.0867830921697527e-07, + "loss": 1.1678, + "step": 
5878 + }, + { + "epoch": 0.7969904426218396, + "grad_norm": 1.994184929470376, + "learning_rate": 2.0840989659582552e-07, + "loss": 1.1039, + "step": 5879 + }, + { + "epoch": 0.7971260082695045, + "grad_norm": 1.7662997757853403, + "learning_rate": 2.081416366265787e-07, + "loss": 1.1305, + "step": 5880 + }, + { + "epoch": 0.7972615739171693, + "grad_norm": 2.2588311101016147, + "learning_rate": 2.078735293609668e-07, + "loss": 1.1397, + "step": 5881 + }, + { + "epoch": 0.7973971395648343, + "grad_norm": 1.6610073799146639, + "learning_rate": 2.0760557485069208e-07, + "loss": 1.092, + "step": 5882 + }, + { + "epoch": 0.7975327052124992, + "grad_norm": 2.0293930689206805, + "learning_rate": 2.073377731474275e-07, + "loss": 1.1338, + "step": 5883 + }, + { + "epoch": 0.797668270860164, + "grad_norm": 3.1767413477791187, + "learning_rate": 2.0707012430281646e-07, + "loss": 1.1547, + "step": 5884 + }, + { + "epoch": 0.7978038365078289, + "grad_norm": 2.018027743498049, + "learning_rate": 2.0680262836847294e-07, + "loss": 1.1127, + "step": 5885 + }, + { + "epoch": 0.7979394021554937, + "grad_norm": 1.6574716543736263, + "learning_rate": 2.065352853959814e-07, + "loss": 1.1314, + "step": 5886 + }, + { + "epoch": 0.7980749678031587, + "grad_norm": 1.9231395571943544, + "learning_rate": 2.0626809543689682e-07, + "loss": 1.1335, + "step": 5887 + }, + { + "epoch": 0.7982105334508236, + "grad_norm": 1.848919930358066, + "learning_rate": 2.0600105854274474e-07, + "loss": 1.1065, + "step": 5888 + }, + { + "epoch": 0.7983460990984884, + "grad_norm": 1.8527627312161528, + "learning_rate": 2.0573417476502108e-07, + "loss": 1.1477, + "step": 5889 + }, + { + "epoch": 0.7984816647461533, + "grad_norm": 2.8761263494489495, + "learning_rate": 2.0546744415519223e-07, + "loss": 1.1032, + "step": 5890 + }, + { + "epoch": 0.7986172303938182, + "grad_norm": 1.979686754310988, + "learning_rate": 2.052008667646954e-07, + "loss": 1.169, + "step": 5891 + }, + { + "epoch": 0.7987527960414831, + "grad_norm": 2.013447540426962, + "learning_rate": 2.049344426449371e-07, + "loss": 1.0929, + "step": 5892 + }, + { + "epoch": 0.798888361689148, + "grad_norm": 1.8546759433424698, + "learning_rate": 2.0466817184729624e-07, + "loss": 1.1028, + "step": 5893 + }, + { + "epoch": 0.7990239273368128, + "grad_norm": 1.5355626445049952, + "learning_rate": 2.0440205442311987e-07, + "loss": 1.1384, + "step": 5894 + }, + { + "epoch": 0.7991594929844777, + "grad_norm": 1.82987877736223, + "learning_rate": 2.041360904237278e-07, + "loss": 1.128, + "step": 5895 + }, + { + "epoch": 0.7992950586321426, + "grad_norm": 1.7300922069125668, + "learning_rate": 2.0387027990040827e-07, + "loss": 1.1227, + "step": 5896 + }, + { + "epoch": 0.7994306242798075, + "grad_norm": 1.7466446324051927, + "learning_rate": 2.0360462290442105e-07, + "loss": 1.1045, + "step": 5897 + }, + { + "epoch": 0.7995661899274724, + "grad_norm": 1.7126478381522239, + "learning_rate": 2.033391194869959e-07, + "loss": 1.1578, + "step": 5898 + }, + { + "epoch": 0.7997017555751372, + "grad_norm": 1.9089096006575976, + "learning_rate": 2.03073769699333e-07, + "loss": 1.1519, + "step": 5899 + }, + { + "epoch": 0.7998373212228022, + "grad_norm": 1.775450690295981, + "learning_rate": 2.0280857359260316e-07, + "loss": 1.1337, + "step": 5900 + }, + { + "epoch": 0.799972886870467, + "grad_norm": 2.010070629787447, + "learning_rate": 2.025435312179472e-07, + "loss": 1.1635, + "step": 5901 + }, + { + "epoch": 0.8001084525181319, + "grad_norm": 2.7394766784730695, + "learning_rate": 
2.0227864262647664e-07, + "loss": 1.1353, + "step": 5902 + }, + { + "epoch": 0.8002440181657968, + "grad_norm": 1.9544688852240315, + "learning_rate": 2.0201390786927286e-07, + "loss": 1.139, + "step": 5903 + }, + { + "epoch": 0.8003795838134616, + "grad_norm": 2.2791984170307615, + "learning_rate": 2.017493269973881e-07, + "loss": 1.1289, + "step": 5904 + }, + { + "epoch": 0.8005151494611266, + "grad_norm": 1.7329967854599062, + "learning_rate": 2.014849000618446e-07, + "loss": 1.1299, + "step": 5905 + }, + { + "epoch": 0.8006507151087914, + "grad_norm": 1.7574594552853335, + "learning_rate": 2.012206271136353e-07, + "loss": 1.1379, + "step": 5906 + }, + { + "epoch": 0.8007862807564563, + "grad_norm": 3.437523528954408, + "learning_rate": 2.0095650820372234e-07, + "loss": 1.1566, + "step": 5907 + }, + { + "epoch": 0.8009218464041212, + "grad_norm": 1.9124188954581718, + "learning_rate": 2.006925433830401e-07, + "loss": 1.1284, + "step": 5908 + }, + { + "epoch": 0.801057412051786, + "grad_norm": 1.8520626908134528, + "learning_rate": 2.0042873270249094e-07, + "loss": 1.1505, + "step": 5909 + }, + { + "epoch": 0.801192977699451, + "grad_norm": 3.3258437235741343, + "learning_rate": 2.0016507621294975e-07, + "loss": 1.1191, + "step": 5910 + }, + { + "epoch": 0.8013285433471159, + "grad_norm": 2.3000598512994026, + "learning_rate": 1.9990157396525963e-07, + "loss": 1.1534, + "step": 5911 + }, + { + "epoch": 0.8014641089947807, + "grad_norm": 2.648723305866583, + "learning_rate": 1.9963822601023595e-07, + "loss": 1.134, + "step": 5912 + }, + { + "epoch": 0.8015996746424456, + "grad_norm": 2.0262614934006358, + "learning_rate": 1.9937503239866205e-07, + "loss": 1.153, + "step": 5913 + }, + { + "epoch": 0.8017352402901105, + "grad_norm": 2.026277300395037, + "learning_rate": 1.9911199318129403e-07, + "loss": 1.1592, + "step": 5914 + }, + { + "epoch": 0.8018708059377754, + "grad_norm": 1.8259981808907655, + "learning_rate": 1.9884910840885571e-07, + "loss": 1.1417, + "step": 5915 + }, + { + "epoch": 0.8020063715854403, + "grad_norm": 1.5839537050502466, + "learning_rate": 1.9858637813204349e-07, + "loss": 1.1433, + "step": 5916 + }, + { + "epoch": 0.8021419372331051, + "grad_norm": 3.1055783031443323, + "learning_rate": 1.983238024015217e-07, + "loss": 1.1348, + "step": 5917 + }, + { + "epoch": 0.80227750288077, + "grad_norm": 2.2927102321181914, + "learning_rate": 1.9806138126792716e-07, + "loss": 1.1696, + "step": 5918 + }, + { + "epoch": 0.8024130685284349, + "grad_norm": 1.5315971154409442, + "learning_rate": 1.9779911478186485e-07, + "loss": 1.1533, + "step": 5919 + }, + { + "epoch": 0.8025486341760998, + "grad_norm": 1.717466003127385, + "learning_rate": 1.9753700299391107e-07, + "loss": 1.1171, + "step": 5920 + }, + { + "epoch": 0.8026841998237647, + "grad_norm": 2.181441641424434, + "learning_rate": 1.9727504595461198e-07, + "loss": 1.0868, + "step": 5921 + }, + { + "epoch": 0.8028197654714295, + "grad_norm": 2.001087186256601, + "learning_rate": 1.970132437144839e-07, + "loss": 1.1508, + "step": 5922 + }, + { + "epoch": 0.8029553311190945, + "grad_norm": 1.7076583924989204, + "learning_rate": 1.967515963240135e-07, + "loss": 1.1203, + "step": 5923 + }, + { + "epoch": 0.8030908967667593, + "grad_norm": 2.3734954112616493, + "learning_rate": 1.9649010383365717e-07, + "loss": 1.143, + "step": 5924 + }, + { + "epoch": 0.8032264624144242, + "grad_norm": 1.9945139985442428, + "learning_rate": 1.962287662938419e-07, + "loss": 1.1116, + "step": 5925 + }, + { + "epoch": 0.8033620280620891, + 
"grad_norm": 1.613817584345971, + "learning_rate": 1.9596758375496435e-07, + "loss": 1.1219, + "step": 5926 + }, + { + "epoch": 0.8034975937097539, + "grad_norm": 3.6349369813488406, + "learning_rate": 1.9570655626739176e-07, + "loss": 1.1447, + "step": 5927 + }, + { + "epoch": 0.8036331593574189, + "grad_norm": 1.688148933419202, + "learning_rate": 1.9544568388146098e-07, + "loss": 1.1243, + "step": 5928 + }, + { + "epoch": 0.8037687250050837, + "grad_norm": 1.7466152298501039, + "learning_rate": 1.951849666474793e-07, + "loss": 1.1334, + "step": 5929 + }, + { + "epoch": 0.8039042906527486, + "grad_norm": 2.283676049369807, + "learning_rate": 1.9492440461572401e-07, + "loss": 1.1151, + "step": 5930 + }, + { + "epoch": 0.8040398563004135, + "grad_norm": 2.0111024091252103, + "learning_rate": 1.9466399783644249e-07, + "loss": 1.1442, + "step": 5931 + }, + { + "epoch": 0.8041754219480783, + "grad_norm": 2.7165299256472677, + "learning_rate": 1.9440374635985224e-07, + "loss": 1.1285, + "step": 5932 + }, + { + "epoch": 0.8043109875957433, + "grad_norm": 1.7646587005448195, + "learning_rate": 1.941436502361402e-07, + "loss": 1.1584, + "step": 5933 + }, + { + "epoch": 0.8044465532434081, + "grad_norm": 3.8042001459142347, + "learning_rate": 1.9388370951546428e-07, + "loss": 1.1508, + "step": 5934 + }, + { + "epoch": 0.804582118891073, + "grad_norm": 2.3690079833583444, + "learning_rate": 1.9362392424795183e-07, + "loss": 1.1506, + "step": 5935 + }, + { + "epoch": 0.8047176845387379, + "grad_norm": 1.9679930750687806, + "learning_rate": 1.933642944837004e-07, + "loss": 1.1408, + "step": 5936 + }, + { + "epoch": 0.8048532501864027, + "grad_norm": 2.3339724956223677, + "learning_rate": 1.9310482027277763e-07, + "loss": 1.1323, + "step": 5937 + }, + { + "epoch": 0.8049888158340677, + "grad_norm": 2.6644415396780703, + "learning_rate": 1.9284550166522108e-07, + "loss": 1.1374, + "step": 5938 + }, + { + "epoch": 0.8051243814817325, + "grad_norm": 1.9548566933417846, + "learning_rate": 1.9258633871103814e-07, + "loss": 1.1473, + "step": 5939 + }, + { + "epoch": 0.8052599471293974, + "grad_norm": 1.7088578685981635, + "learning_rate": 1.923273314602065e-07, + "loss": 1.146, + "step": 5940 + }, + { + "epoch": 0.8053955127770623, + "grad_norm": 2.6992844999409367, + "learning_rate": 1.920684799626736e-07, + "loss": 1.13, + "step": 5941 + }, + { + "epoch": 0.8055310784247272, + "grad_norm": 1.891082459389027, + "learning_rate": 1.9180978426835693e-07, + "loss": 1.1569, + "step": 5942 + }, + { + "epoch": 0.8056666440723921, + "grad_norm": 2.9583906545938694, + "learning_rate": 1.9155124442714387e-07, + "loss": 1.1863, + "step": 5943 + }, + { + "epoch": 0.8058022097200569, + "grad_norm": 1.8293599287289501, + "learning_rate": 1.912928604888918e-07, + "loss": 1.1109, + "step": 5944 + }, + { + "epoch": 0.8059377753677218, + "grad_norm": 2.6923900723995886, + "learning_rate": 1.91034632503428e-07, + "loss": 1.1551, + "step": 5945 + }, + { + "epoch": 0.8060733410153867, + "grad_norm": 1.7267429948688517, + "learning_rate": 1.907765605205498e-07, + "loss": 1.1653, + "step": 5946 + }, + { + "epoch": 0.8062089066630516, + "grad_norm": 4.059466826427658, + "learning_rate": 1.9051864459002454e-07, + "loss": 1.1539, + "step": 5947 + }, + { + "epoch": 0.8063444723107165, + "grad_norm": 2.0638049567941152, + "learning_rate": 1.9026088476158851e-07, + "loss": 1.116, + "step": 5948 + }, + { + "epoch": 0.8064800379583813, + "grad_norm": 1.8993406875593712, + "learning_rate": 1.9000328108494967e-07, + "loss": 1.1245, + 
"step": 5949 + }, + { + "epoch": 0.8066156036060462, + "grad_norm": 1.5330279768890922, + "learning_rate": 1.897458336097838e-07, + "loss": 1.1832, + "step": 5950 + }, + { + "epoch": 0.8067511692537112, + "grad_norm": 8.010981457889255, + "learning_rate": 1.8948854238573874e-07, + "loss": 1.1418, + "step": 5951 + }, + { + "epoch": 0.806886734901376, + "grad_norm": 2.321537337342278, + "learning_rate": 1.8923140746242994e-07, + "loss": 1.1851, + "step": 5952 + }, + { + "epoch": 0.8070223005490409, + "grad_norm": 1.5974361707149545, + "learning_rate": 1.8897442888944492e-07, + "loss": 1.1811, + "step": 5953 + }, + { + "epoch": 0.8071578661967057, + "grad_norm": 4.1879639630824235, + "learning_rate": 1.8871760671633895e-07, + "loss": 1.0691, + "step": 5954 + }, + { + "epoch": 0.8072934318443706, + "grad_norm": 1.8520296823959481, + "learning_rate": 1.884609409926391e-07, + "loss": 1.1359, + "step": 5955 + }, + { + "epoch": 0.8074289974920356, + "grad_norm": 2.018527938380092, + "learning_rate": 1.882044317678404e-07, + "loss": 1.1522, + "step": 5956 + }, + { + "epoch": 0.8075645631397004, + "grad_norm": 1.6911338405626548, + "learning_rate": 1.8794807909140963e-07, + "loss": 1.1718, + "step": 5957 + }, + { + "epoch": 0.8077001287873653, + "grad_norm": 2.49840707126161, + "learning_rate": 1.8769188301278126e-07, + "loss": 1.1905, + "step": 5958 + }, + { + "epoch": 0.8078356944350301, + "grad_norm": 2.0731008405207865, + "learning_rate": 1.8743584358136188e-07, + "loss": 1.1512, + "step": 5959 + }, + { + "epoch": 0.807971260082695, + "grad_norm": 1.7379079441210905, + "learning_rate": 1.8717996084652587e-07, + "loss": 1.1602, + "step": 5960 + }, + { + "epoch": 0.80810682573036, + "grad_norm": 5.477900254766507, + "learning_rate": 1.8692423485761833e-07, + "loss": 1.1195, + "step": 5961 + }, + { + "epoch": 0.8082423913780248, + "grad_norm": 2.2158779605309147, + "learning_rate": 1.86668665663954e-07, + "loss": 1.1299, + "step": 5962 + }, + { + "epoch": 0.8083779570256897, + "grad_norm": 2.2507257843673947, + "learning_rate": 1.8641325331481762e-07, + "loss": 1.1473, + "step": 5963 + }, + { + "epoch": 0.8085135226733545, + "grad_norm": 1.634432009748848, + "learning_rate": 1.861579978594632e-07, + "loss": 1.1093, + "step": 5964 + }, + { + "epoch": 0.8086490883210194, + "grad_norm": 1.753612732321569, + "learning_rate": 1.859028993471148e-07, + "loss": 1.115, + "step": 5965 + }, + { + "epoch": 0.8087846539686844, + "grad_norm": 1.8197715542470572, + "learning_rate": 1.8564795782696607e-07, + "loss": 1.1413, + "step": 5966 + }, + { + "epoch": 0.8089202196163492, + "grad_norm": 4.125978795793437, + "learning_rate": 1.8539317334818072e-07, + "loss": 1.1166, + "step": 5967 + }, + { + "epoch": 0.8090557852640141, + "grad_norm": 1.548973979071263, + "learning_rate": 1.8513854595989198e-07, + "loss": 1.1076, + "step": 5968 + }, + { + "epoch": 0.8091913509116789, + "grad_norm": 1.678256540322654, + "learning_rate": 1.848840757112019e-07, + "loss": 1.119, + "step": 5969 + }, + { + "epoch": 0.8093269165593439, + "grad_norm": 1.8642912029220742, + "learning_rate": 1.8462976265118436e-07, + "loss": 1.1493, + "step": 5970 + }, + { + "epoch": 0.8094624822070088, + "grad_norm": 2.4994390660630654, + "learning_rate": 1.8437560682888043e-07, + "loss": 1.1367, + "step": 5971 + }, + { + "epoch": 0.8095980478546736, + "grad_norm": 1.983212852158069, + "learning_rate": 1.8412160829330304e-07, + "loss": 1.1334, + "step": 5972 + }, + { + "epoch": 0.8097336135023385, + "grad_norm": 1.684595605370834, + "learning_rate": 
1.8386776709343278e-07, + "loss": 1.0717, + "step": 5973 + }, + { + "epoch": 0.8098691791500033, + "grad_norm": 1.6904510253905656, + "learning_rate": 1.8361408327822203e-07, + "loss": 1.1305, + "step": 5974 + }, + { + "epoch": 0.8100047447976683, + "grad_norm": 3.0740263937262906, + "learning_rate": 1.8336055689659091e-07, + "loss": 1.1465, + "step": 5975 + }, + { + "epoch": 0.8101403104453332, + "grad_norm": 2.028957726986963, + "learning_rate": 1.831071879974302e-07, + "loss": 1.1386, + "step": 5976 + }, + { + "epoch": 0.810275876092998, + "grad_norm": 3.127053253481115, + "learning_rate": 1.8285397662960022e-07, + "loss": 1.1523, + "step": 5977 + }, + { + "epoch": 0.8104114417406629, + "grad_norm": 2.757882820332549, + "learning_rate": 1.8260092284193062e-07, + "loss": 1.1295, + "step": 5978 + }, + { + "epoch": 0.8105470073883277, + "grad_norm": 2.655661441658755, + "learning_rate": 1.823480266832209e-07, + "loss": 1.1499, + "step": 5979 + }, + { + "epoch": 0.8106825730359927, + "grad_norm": 1.7422770755030752, + "learning_rate": 1.8209528820224008e-07, + "loss": 1.1521, + "step": 5980 + }, + { + "epoch": 0.8108181386836576, + "grad_norm": 1.6094222926325399, + "learning_rate": 1.8184270744772678e-07, + "loss": 1.1314, + "step": 5981 + }, + { + "epoch": 0.8109537043313224, + "grad_norm": 2.721930927327932, + "learning_rate": 1.815902844683892e-07, + "loss": 1.162, + "step": 5982 + }, + { + "epoch": 0.8110892699789873, + "grad_norm": 1.8051753060448161, + "learning_rate": 1.8133801931290516e-07, + "loss": 1.1456, + "step": 5983 + }, + { + "epoch": 0.8112248356266522, + "grad_norm": 1.5319825259792947, + "learning_rate": 1.8108591202992195e-07, + "loss": 1.153, + "step": 5984 + }, + { + "epoch": 0.8113604012743171, + "grad_norm": 1.7597872290030934, + "learning_rate": 1.808339626680565e-07, + "loss": 1.139, + "step": 5985 + }, + { + "epoch": 0.811495966921982, + "grad_norm": 2.4841676979621643, + "learning_rate": 1.8058217127589526e-07, + "loss": 1.1083, + "step": 5986 + }, + { + "epoch": 0.8116315325696468, + "grad_norm": 2.469172733952108, + "learning_rate": 1.8033053790199415e-07, + "loss": 1.1754, + "step": 5987 + }, + { + "epoch": 0.8117670982173117, + "grad_norm": 1.9981785205712508, + "learning_rate": 1.8007906259487904e-07, + "loss": 1.1309, + "step": 5988 + }, + { + "epoch": 0.8119026638649767, + "grad_norm": 2.3001389891342905, + "learning_rate": 1.7982774540304402e-07, + "loss": 1.1073, + "step": 5989 + }, + { + "epoch": 0.8120382295126415, + "grad_norm": 7.533390956578013, + "learning_rate": 1.7957658637495488e-07, + "loss": 1.1388, + "step": 5990 + }, + { + "epoch": 0.8121737951603064, + "grad_norm": 1.8124338694899689, + "learning_rate": 1.7932558555904453e-07, + "loss": 1.0958, + "step": 5991 + }, + { + "epoch": 0.8123093608079712, + "grad_norm": 14.99230289834608, + "learning_rate": 1.790747430037174e-07, + "loss": 1.1486, + "step": 5992 + }, + { + "epoch": 0.8124449264556362, + "grad_norm": 1.663144319241622, + "learning_rate": 1.7882405875734564e-07, + "loss": 1.1736, + "step": 5993 + }, + { + "epoch": 0.8125804921033011, + "grad_norm": 1.9077210691480564, + "learning_rate": 1.785735328682727e-07, + "loss": 1.1115, + "step": 5994 + }, + { + "epoch": 0.8127160577509659, + "grad_norm": 1.7115314517606777, + "learning_rate": 1.7832316538480973e-07, + "loss": 1.1244, + "step": 5995 + }, + { + "epoch": 0.8128516233986308, + "grad_norm": 2.1954628344023392, + "learning_rate": 1.7807295635523845e-07, + "loss": 1.101, + "step": 5996 + }, + { + "epoch": 0.8129871890462956, + 
"grad_norm": 1.9806198324790873, + "learning_rate": 1.7782290582780958e-07, + "loss": 1.1402, + "step": 5997 + }, + { + "epoch": 0.8131227546939606, + "grad_norm": 2.2874861245125713, + "learning_rate": 1.7757301385074342e-07, + "loss": 1.0952, + "step": 5998 + }, + { + "epoch": 0.8132583203416255, + "grad_norm": 1.6972284935114141, + "learning_rate": 1.7732328047222978e-07, + "loss": 1.1358, + "step": 5999 + }, + { + "epoch": 0.8133938859892903, + "grad_norm": 2.453498531689311, + "learning_rate": 1.7707370574042769e-07, + "loss": 1.1288, + "step": 6000 + }, + { + "epoch": 0.8135294516369552, + "grad_norm": 1.9764742908012518, + "learning_rate": 1.7682428970346553e-07, + "loss": 1.2043, + "step": 6001 + }, + { + "epoch": 0.81366501728462, + "grad_norm": 1.6642591156279227, + "learning_rate": 1.765750324094415e-07, + "loss": 1.1477, + "step": 6002 + }, + { + "epoch": 0.813800582932285, + "grad_norm": 2.6371375042527907, + "learning_rate": 1.763259339064226e-07, + "loss": 1.1419, + "step": 6003 + }, + { + "epoch": 0.8139361485799499, + "grad_norm": 4.539526993517952, + "learning_rate": 1.7607699424244582e-07, + "loss": 1.165, + "step": 6004 + }, + { + "epoch": 0.8140717142276147, + "grad_norm": 2.741445094142396, + "learning_rate": 1.7582821346551711e-07, + "loss": 1.1058, + "step": 6005 + }, + { + "epoch": 0.8142072798752796, + "grad_norm": 1.820592276520772, + "learning_rate": 1.7557959162361148e-07, + "loss": 1.0939, + "step": 6006 + }, + { + "epoch": 0.8143428455229444, + "grad_norm": 2.1530012063842436, + "learning_rate": 1.753311287646745e-07, + "loss": 1.1928, + "step": 6007 + }, + { + "epoch": 0.8144784111706094, + "grad_norm": 2.171175245984261, + "learning_rate": 1.7508282493661918e-07, + "loss": 1.1517, + "step": 6008 + }, + { + "epoch": 0.8146139768182743, + "grad_norm": 2.0385637580420233, + "learning_rate": 1.7483468018733017e-07, + "loss": 1.1152, + "step": 6009 + }, + { + "epoch": 0.8147495424659391, + "grad_norm": 2.100731404771374, + "learning_rate": 1.7458669456465914e-07, + "loss": 1.0913, + "step": 6010 + }, + { + "epoch": 0.814885108113604, + "grad_norm": 1.6647490732639882, + "learning_rate": 1.7433886811642916e-07, + "loss": 1.1211, + "step": 6011 + }, + { + "epoch": 0.8150206737612689, + "grad_norm": 5.4515680398995645, + "learning_rate": 1.740912008904305e-07, + "loss": 1.1526, + "step": 6012 + }, + { + "epoch": 0.8151562394089338, + "grad_norm": 1.7665281008189584, + "learning_rate": 1.7384369293442501e-07, + "loss": 1.1531, + "step": 6013 + }, + { + "epoch": 0.8152918050565987, + "grad_norm": 2.049702311376679, + "learning_rate": 1.7359634429614145e-07, + "loss": 1.1106, + "step": 6014 + }, + { + "epoch": 0.8154273707042635, + "grad_norm": 1.756554324926667, + "learning_rate": 1.7334915502328028e-07, + "loss": 1.1777, + "step": 6015 + }, + { + "epoch": 0.8155629363519284, + "grad_norm": 2.0222648115868775, + "learning_rate": 1.7310212516350908e-07, + "loss": 1.1563, + "step": 6016 + }, + { + "epoch": 0.8156985019995933, + "grad_norm": 1.8071799568069142, + "learning_rate": 1.7285525476446594e-07, + "loss": 1.1155, + "step": 6017 + }, + { + "epoch": 0.8158340676472582, + "grad_norm": 1.936054050606689, + "learning_rate": 1.7260854387375778e-07, + "loss": 1.1074, + "step": 6018 + }, + { + "epoch": 0.8159696332949231, + "grad_norm": 2.0030644007120753, + "learning_rate": 1.7236199253896089e-07, + "loss": 1.1403, + "step": 6019 + }, + { + "epoch": 0.8161051989425879, + "grad_norm": 1.7201766545626715, + "learning_rate": 1.7211560080762078e-07, + "loss": 1.1529, + 
"step": 6020 + }, + { + "epoch": 0.8162407645902529, + "grad_norm": 51.718628632624046, + "learning_rate": 1.718693687272521e-07, + "loss": 1.1347, + "step": 6021 + }, + { + "epoch": 0.8163763302379177, + "grad_norm": 2.4203903930414787, + "learning_rate": 1.716232963453389e-07, + "loss": 1.1738, + "step": 6022 + }, + { + "epoch": 0.8165118958855826, + "grad_norm": 1.8104239389527421, + "learning_rate": 1.7137738370933408e-07, + "loss": 1.1684, + "step": 6023 + }, + { + "epoch": 0.8166474615332475, + "grad_norm": 2.2224602876166952, + "learning_rate": 1.7113163086666016e-07, + "loss": 1.143, + "step": 6024 + }, + { + "epoch": 0.8167830271809123, + "grad_norm": 1.9836608399925866, + "learning_rate": 1.7088603786470845e-07, + "loss": 1.1113, + "step": 6025 + }, + { + "epoch": 0.8169185928285773, + "grad_norm": 1.6854075706738105, + "learning_rate": 1.7064060475083975e-07, + "loss": 1.1456, + "step": 6026 + }, + { + "epoch": 0.8170541584762421, + "grad_norm": 1.9864872059488075, + "learning_rate": 1.7039533157238394e-07, + "loss": 1.1212, + "step": 6027 + }, + { + "epoch": 0.817189724123907, + "grad_norm": 2.093287123920618, + "learning_rate": 1.7015021837663979e-07, + "loss": 1.1543, + "step": 6028 + }, + { + "epoch": 0.8173252897715719, + "grad_norm": 1.7262584611908651, + "learning_rate": 1.6990526521087567e-07, + "loss": 1.1493, + "step": 6029 + }, + { + "epoch": 0.8174608554192367, + "grad_norm": 1.8879144108234591, + "learning_rate": 1.696604721223288e-07, + "loss": 1.1441, + "step": 6030 + }, + { + "epoch": 0.8175964210669017, + "grad_norm": 2.597821540378448, + "learning_rate": 1.6941583915820578e-07, + "loss": 1.1693, + "step": 6031 + }, + { + "epoch": 0.8177319867145665, + "grad_norm": 1.7248782709541066, + "learning_rate": 1.6917136636568176e-07, + "loss": 1.1303, + "step": 6032 + }, + { + "epoch": 0.8178675523622314, + "grad_norm": 2.04911282486367, + "learning_rate": 1.6892705379190153e-07, + "loss": 1.1104, + "step": 6033 + }, + { + "epoch": 0.8180031180098963, + "grad_norm": 2.283551169983214, + "learning_rate": 1.6868290148397878e-07, + "loss": 1.1448, + "step": 6034 + }, + { + "epoch": 0.8181386836575611, + "grad_norm": 1.783880668493409, + "learning_rate": 1.6843890948899665e-07, + "loss": 1.1968, + "step": 6035 + }, + { + "epoch": 0.8182742493052261, + "grad_norm": 1.7703987553784006, + "learning_rate": 1.6819507785400677e-07, + "loss": 1.1751, + "step": 6036 + }, + { + "epoch": 0.8184098149528909, + "grad_norm": 2.045596991399769, + "learning_rate": 1.6795140662603026e-07, + "loss": 1.1422, + "step": 6037 + }, + { + "epoch": 0.8185453806005558, + "grad_norm": 4.073253909366451, + "learning_rate": 1.6770789585205725e-07, + "loss": 1.1217, + "step": 6038 + }, + { + "epoch": 0.8186809462482207, + "grad_norm": 1.7245784455541688, + "learning_rate": 1.6746454557904677e-07, + "loss": 1.0799, + "step": 6039 + }, + { + "epoch": 0.8188165118958856, + "grad_norm": 1.8769690052646233, + "learning_rate": 1.6722135585392706e-07, + "loss": 1.1514, + "step": 6040 + }, + { + "epoch": 0.8189520775435505, + "grad_norm": 3.7788252483738987, + "learning_rate": 1.6697832672359525e-07, + "loss": 1.1401, + "step": 6041 + }, + { + "epoch": 0.8190876431912153, + "grad_norm": 5.546029816564097, + "learning_rate": 1.6673545823491774e-07, + "loss": 1.1518, + "step": 6042 + }, + { + "epoch": 0.8192232088388802, + "grad_norm": 1.8187025774603909, + "learning_rate": 1.6649275043472965e-07, + "loss": 1.1497, + "step": 6043 + }, + { + "epoch": 0.8193587744865451, + "grad_norm": 1.9830044750255114, + 
"learning_rate": 1.6625020336983565e-07, + "loss": 1.1151, + "step": 6044 + }, + { + "epoch": 0.81949434013421, + "grad_norm": 1.8525638849900223, + "learning_rate": 1.6600781708700816e-07, + "loss": 1.129, + "step": 6045 + }, + { + "epoch": 0.8196299057818749, + "grad_norm": 1.8841484555084704, + "learning_rate": 1.6576559163299053e-07, + "loss": 1.1136, + "step": 6046 + }, + { + "epoch": 0.8197654714295397, + "grad_norm": 1.784069724373243, + "learning_rate": 1.6552352705449302e-07, + "loss": 1.1292, + "step": 6047 + }, + { + "epoch": 0.8199010370772046, + "grad_norm": 1.6716227174543736, + "learning_rate": 1.6528162339819685e-07, + "loss": 1.1176, + "step": 6048 + }, + { + "epoch": 0.8200366027248696, + "grad_norm": 1.8121966815779407, + "learning_rate": 1.6503988071075026e-07, + "loss": 1.148, + "step": 6049 + }, + { + "epoch": 0.8201721683725344, + "grad_norm": 2.221999518604163, + "learning_rate": 1.647982990387724e-07, + "loss": 1.1089, + "step": 6050 + }, + { + "epoch": 0.8203077340201993, + "grad_norm": 2.550427740464439, + "learning_rate": 1.6455687842884936e-07, + "loss": 1.1009, + "step": 6051 + }, + { + "epoch": 0.8204432996678641, + "grad_norm": 2.1873652901007654, + "learning_rate": 1.643156189275382e-07, + "loss": 1.1783, + "step": 6052 + }, + { + "epoch": 0.820578865315529, + "grad_norm": 2.5381263144017296, + "learning_rate": 1.6407452058136294e-07, + "loss": 1.118, + "step": 6053 + }, + { + "epoch": 0.820714430963194, + "grad_norm": 2.256516961078957, + "learning_rate": 1.6383358343681852e-07, + "loss": 1.1694, + "step": 6054 + }, + { + "epoch": 0.8208499966108588, + "grad_norm": 4.176066967279017, + "learning_rate": 1.6359280754036675e-07, + "loss": 1.1144, + "step": 6055 + }, + { + "epoch": 0.8209855622585237, + "grad_norm": 1.926252084630101, + "learning_rate": 1.6335219293844038e-07, + "loss": 1.1245, + "step": 6056 + }, + { + "epoch": 0.8211211279061885, + "grad_norm": 3.1212381273929344, + "learning_rate": 1.6311173967743918e-07, + "loss": 1.1644, + "step": 6057 + }, + { + "epoch": 0.8212566935538534, + "grad_norm": 1.8540948340977716, + "learning_rate": 1.6287144780373308e-07, + "loss": 1.1374, + "step": 6058 + }, + { + "epoch": 0.8213922592015184, + "grad_norm": 1.6851101218434452, + "learning_rate": 1.6263131736366032e-07, + "loss": 1.1684, + "step": 6059 + }, + { + "epoch": 0.8215278248491832, + "grad_norm": 1.928024532301526, + "learning_rate": 1.623913484035282e-07, + "loss": 1.1374, + "step": 6060 + }, + { + "epoch": 0.8216633904968481, + "grad_norm": 2.518696838405431, + "learning_rate": 1.6215154096961292e-07, + "loss": 1.1404, + "step": 6061 + }, + { + "epoch": 0.8217989561445129, + "grad_norm": 1.6416665728802466, + "learning_rate": 1.619118951081594e-07, + "loss": 1.1335, + "step": 6062 + }, + { + "epoch": 0.8219345217921779, + "grad_norm": 3.5168326793756184, + "learning_rate": 1.616724108653813e-07, + "loss": 1.1407, + "step": 6063 + }, + { + "epoch": 0.8220700874398428, + "grad_norm": 1.7733535856956042, + "learning_rate": 1.614330882874616e-07, + "loss": 1.1481, + "step": 6064 + }, + { + "epoch": 0.8222056530875076, + "grad_norm": 2.5942154613560287, + "learning_rate": 1.611939274205515e-07, + "loss": 1.1546, + "step": 6065 + }, + { + "epoch": 0.8223412187351725, + "grad_norm": 1.5258857398079777, + "learning_rate": 1.6095492831077128e-07, + "loss": 1.1482, + "step": 6066 + }, + { + "epoch": 0.8224767843828373, + "grad_norm": 1.6828121252431687, + "learning_rate": 1.6071609100421048e-07, + "loss": 1.14, + "step": 6067 + }, + { + "epoch": 
0.8226123500305023, + "grad_norm": 2.844061740019552, + "learning_rate": 1.6047741554692606e-07, + "loss": 1.1851, + "step": 6068 + }, + { + "epoch": 0.8227479156781672, + "grad_norm": 1.9973573999778746, + "learning_rate": 1.6023890198494584e-07, + "loss": 1.1716, + "step": 6069 + }, + { + "epoch": 0.822883481325832, + "grad_norm": 2.3955521567349014, + "learning_rate": 1.6000055036426407e-07, + "loss": 1.1529, + "step": 6070 + }, + { + "epoch": 0.8230190469734969, + "grad_norm": 1.7856677209488236, + "learning_rate": 1.5976236073084627e-07, + "loss": 1.1214, + "step": 6071 + }, + { + "epoch": 0.8231546126211619, + "grad_norm": 1.788436033520174, + "learning_rate": 1.595243331306244e-07, + "loss": 1.163, + "step": 6072 + }, + { + "epoch": 0.8232901782688267, + "grad_norm": 2.189176822650787, + "learning_rate": 1.592864676095006e-07, + "loss": 1.127, + "step": 6073 + }, + { + "epoch": 0.8234257439164916, + "grad_norm": 1.9994318811410523, + "learning_rate": 1.5904876421334534e-07, + "loss": 1.0947, + "step": 6074 + }, + { + "epoch": 0.8235613095641564, + "grad_norm": 1.9247200246605887, + "learning_rate": 1.5881122298799788e-07, + "loss": 1.1315, + "step": 6075 + }, + { + "epoch": 0.8236968752118213, + "grad_norm": 1.9052765543144352, + "learning_rate": 1.585738439792661e-07, + "loss": 1.1549, + "step": 6076 + }, + { + "epoch": 0.8238324408594863, + "grad_norm": 1.6563337887461316, + "learning_rate": 1.5833662723292662e-07, + "loss": 1.1339, + "step": 6077 + }, + { + "epoch": 0.8239680065071511, + "grad_norm": 2.823195190228193, + "learning_rate": 1.5809957279472496e-07, + "loss": 1.1342, + "step": 6078 + }, + { + "epoch": 0.824103572154816, + "grad_norm": 1.7102354095155508, + "learning_rate": 1.578626807103751e-07, + "loss": 1.1073, + "step": 6079 + }, + { + "epoch": 0.8242391378024808, + "grad_norm": 2.5495391768025315, + "learning_rate": 1.5762595102555987e-07, + "loss": 1.125, + "step": 6080 + }, + { + "epoch": 0.8243747034501457, + "grad_norm": 3.5430070446187627, + "learning_rate": 1.5738938378593068e-07, + "loss": 1.1586, + "step": 6081 + }, + { + "epoch": 0.8245102690978107, + "grad_norm": 2.228343572975163, + "learning_rate": 1.5715297903710767e-07, + "loss": 1.1466, + "step": 6082 + }, + { + "epoch": 0.8246458347454755, + "grad_norm": 3.4764271570159107, + "learning_rate": 1.5691673682467967e-07, + "loss": 1.1404, + "step": 6083 + }, + { + "epoch": 0.8247814003931404, + "grad_norm": 1.791617057306227, + "learning_rate": 1.5668065719420398e-07, + "loss": 1.1659, + "step": 6084 + }, + { + "epoch": 0.8249169660408052, + "grad_norm": 2.462481596854804, + "learning_rate": 1.564447401912069e-07, + "loss": 1.1171, + "step": 6085 + }, + { + "epoch": 0.8250525316884701, + "grad_norm": 1.6535886482000794, + "learning_rate": 1.5620898586118292e-07, + "loss": 1.1389, + "step": 6086 + }, + { + "epoch": 0.8251880973361351, + "grad_norm": 1.597754275284057, + "learning_rate": 1.5597339424959588e-07, + "loss": 1.1584, + "step": 6087 + }, + { + "epoch": 0.8253236629837999, + "grad_norm": 2.24486095784946, + "learning_rate": 1.557379654018769e-07, + "loss": 1.1464, + "step": 6088 + }, + { + "epoch": 0.8254592286314648, + "grad_norm": 3.012549867023175, + "learning_rate": 1.555026993634275e-07, + "loss": 1.1069, + "step": 6089 + }, + { + "epoch": 0.8255947942791296, + "grad_norm": 3.546965524894584, + "learning_rate": 1.5526759617961614e-07, + "loss": 1.1532, + "step": 6090 + }, + { + "epoch": 0.8257303599267946, + "grad_norm": 1.9668119412190164, + "learning_rate": 1.5503265589578128e-07, + 
"loss": 1.1825, + "step": 6091 + }, + { + "epoch": 0.8258659255744595, + "grad_norm": 3.5823253846420604, + "learning_rate": 1.5479787855722858e-07, + "loss": 1.0955, + "step": 6092 + }, + { + "epoch": 0.8260014912221243, + "grad_norm": 1.652097801042175, + "learning_rate": 1.5456326420923382e-07, + "loss": 1.1429, + "step": 6093 + }, + { + "epoch": 0.8261370568697892, + "grad_norm": 2.557641626145817, + "learning_rate": 1.543288128970399e-07, + "loss": 1.1404, + "step": 6094 + }, + { + "epoch": 0.826272622517454, + "grad_norm": 4.585339517217566, + "learning_rate": 1.5409452466585903e-07, + "loss": 1.1564, + "step": 6095 + }, + { + "epoch": 0.826408188165119, + "grad_norm": 2.7186805923899082, + "learning_rate": 1.5386039956087194e-07, + "loss": 1.1459, + "step": 6096 + }, + { + "epoch": 0.8265437538127839, + "grad_norm": 1.7006467668961174, + "learning_rate": 1.5362643762722782e-07, + "loss": 1.1243, + "step": 6097 + }, + { + "epoch": 0.8266793194604487, + "grad_norm": 2.312882498506986, + "learning_rate": 1.5339263891004427e-07, + "loss": 1.1512, + "step": 6098 + }, + { + "epoch": 0.8268148851081136, + "grad_norm": 1.8667497931485324, + "learning_rate": 1.5315900345440757e-07, + "loss": 1.1588, + "step": 6099 + }, + { + "epoch": 0.8269504507557784, + "grad_norm": 3.825162637907689, + "learning_rate": 1.5292553130537255e-07, + "loss": 1.1159, + "step": 6100 + }, + { + "epoch": 0.8270860164034434, + "grad_norm": 7.036385254549954, + "learning_rate": 1.526922225079623e-07, + "loss": 1.1361, + "step": 6101 + }, + { + "epoch": 0.8272215820511083, + "grad_norm": 1.6016021319882088, + "learning_rate": 1.524590771071691e-07, + "loss": 1.1454, + "step": 6102 + }, + { + "epoch": 0.8273571476987731, + "grad_norm": 55.14442518972997, + "learning_rate": 1.5222609514795225e-07, + "loss": 1.1268, + "step": 6103 + }, + { + "epoch": 0.827492713346438, + "grad_norm": 1.8181787434499816, + "learning_rate": 1.5199327667524154e-07, + "loss": 1.1502, + "step": 6104 + }, + { + "epoch": 0.8276282789941029, + "grad_norm": 2.1005457148407123, + "learning_rate": 1.5176062173393312e-07, + "loss": 1.1141, + "step": 6105 + }, + { + "epoch": 0.8277638446417678, + "grad_norm": 2.7403733951543785, + "learning_rate": 1.5152813036889378e-07, + "loss": 1.1299, + "step": 6106 + }, + { + "epoch": 0.8278994102894327, + "grad_norm": 1.7418110256967947, + "learning_rate": 1.5129580262495656e-07, + "loss": 1.154, + "step": 6107 + }, + { + "epoch": 0.8280349759370975, + "grad_norm": 1.7094973940662384, + "learning_rate": 1.5106363854692493e-07, + "loss": 1.1774, + "step": 6108 + }, + { + "epoch": 0.8281705415847624, + "grad_norm": 2.1833724179812135, + "learning_rate": 1.5083163817956913e-07, + "loss": 1.1386, + "step": 6109 + }, + { + "epoch": 0.8283061072324273, + "grad_norm": 1.7460194040849943, + "learning_rate": 1.5059980156762942e-07, + "loss": 1.1284, + "step": 6110 + }, + { + "epoch": 0.8284416728800922, + "grad_norm": 1.8819636125016435, + "learning_rate": 1.5036812875581274e-07, + "loss": 1.1047, + "step": 6111 + }, + { + "epoch": 0.8285772385277571, + "grad_norm": 2.5146816396003016, + "learning_rate": 1.5013661978879632e-07, + "loss": 1.1581, + "step": 6112 + }, + { + "epoch": 0.8287128041754219, + "grad_norm": 7.9298130654042325, + "learning_rate": 1.4990527471122382e-07, + "loss": 1.1123, + "step": 6113 + }, + { + "epoch": 0.8288483698230869, + "grad_norm": 1.8444495939073744, + "learning_rate": 1.4967409356770945e-07, + "loss": 1.1517, + "step": 6114 + }, + { + "epoch": 0.8289839354707517, + "grad_norm": 
2.2008811489537643, + "learning_rate": 1.4944307640283382e-07, + "loss": 1.1478, + "step": 6115 + }, + { + "epoch": 0.8291195011184166, + "grad_norm": 2.0929554685094343, + "learning_rate": 1.4921222326114692e-07, + "loss": 1.1631, + "step": 6116 + }, + { + "epoch": 0.8292550667660815, + "grad_norm": 1.612947245638032, + "learning_rate": 1.4898153418716708e-07, + "loss": 1.1451, + "step": 6117 + }, + { + "epoch": 0.8293906324137463, + "grad_norm": 23.973497423755276, + "learning_rate": 1.4875100922538087e-07, + "loss": 1.0831, + "step": 6118 + }, + { + "epoch": 0.8295261980614113, + "grad_norm": 1.9227649257591497, + "learning_rate": 1.4852064842024325e-07, + "loss": 1.1585, + "step": 6119 + }, + { + "epoch": 0.8296617637090761, + "grad_norm": 1.5266000085182776, + "learning_rate": 1.4829045181617727e-07, + "loss": 1.1387, + "step": 6120 + }, + { + "epoch": 0.829797329356741, + "grad_norm": 1.6998362314526998, + "learning_rate": 1.4806041945757474e-07, + "loss": 1.1815, + "step": 6121 + }, + { + "epoch": 0.8299328950044059, + "grad_norm": 2.6800215363843742, + "learning_rate": 1.4783055138879562e-07, + "loss": 1.1863, + "step": 6122 + }, + { + "epoch": 0.8300684606520707, + "grad_norm": 1.9459300852005739, + "learning_rate": 1.476008476541679e-07, + "loss": 1.2031, + "step": 6123 + }, + { + "epoch": 0.8302040262997357, + "grad_norm": 1.9866932347154056, + "learning_rate": 1.473713082979884e-07, + "loss": 1.0931, + "step": 6124 + }, + { + "epoch": 0.8303395919474005, + "grad_norm": 1.6849300703291292, + "learning_rate": 1.4714193336452174e-07, + "loss": 1.132, + "step": 6125 + }, + { + "epoch": 0.8304751575950654, + "grad_norm": 1.8292720288783921, + "learning_rate": 1.4691272289800115e-07, + "loss": 1.1326, + "step": 6126 + }, + { + "epoch": 0.8306107232427303, + "grad_norm": 2.25395744907266, + "learning_rate": 1.4668367694262817e-07, + "loss": 1.1884, + "step": 6127 + }, + { + "epoch": 0.8307462888903951, + "grad_norm": 2.029604244188756, + "learning_rate": 1.4645479554257267e-07, + "loss": 1.1369, + "step": 6128 + }, + { + "epoch": 0.8308818545380601, + "grad_norm": 1.878389820573412, + "learning_rate": 1.4622607874197214e-07, + "loss": 1.1287, + "step": 6129 + }, + { + "epoch": 0.8310174201857249, + "grad_norm": 2.8003445649669314, + "learning_rate": 1.4599752658493304e-07, + "loss": 1.1426, + "step": 6130 + }, + { + "epoch": 0.8311529858333898, + "grad_norm": 1.7010976358191896, + "learning_rate": 1.457691391155298e-07, + "loss": 1.1518, + "step": 6131 + }, + { + "epoch": 0.8312885514810547, + "grad_norm": 1.909828421534506, + "learning_rate": 1.4554091637780518e-07, + "loss": 1.1479, + "step": 6132 + }, + { + "epoch": 0.8314241171287196, + "grad_norm": 2.1547333502841926, + "learning_rate": 1.4531285841577024e-07, + "loss": 1.1346, + "step": 6133 + }, + { + "epoch": 0.8315596827763845, + "grad_norm": 1.9837580019262795, + "learning_rate": 1.4508496527340398e-07, + "loss": 1.0998, + "step": 6134 + }, + { + "epoch": 0.8316952484240493, + "grad_norm": 5.136294717505057, + "learning_rate": 1.448572369946539e-07, + "loss": 1.1406, + "step": 6135 + }, + { + "epoch": 0.8318308140717142, + "grad_norm": 1.7460552568405727, + "learning_rate": 1.446296736234356e-07, + "loss": 1.1232, + "step": 6136 + }, + { + "epoch": 0.8319663797193791, + "grad_norm": 2.2148559133649597, + "learning_rate": 1.444022752036328e-07, + "loss": 1.172, + "step": 6137 + }, + { + "epoch": 0.832101945367044, + "grad_norm": 2.53446576413332, + "learning_rate": 1.4417504177909767e-07, + "loss": 1.1454, + "step": 6138 + }, 
+ { + "epoch": 0.8322375110147089, + "grad_norm": 3.9987844554049317, + "learning_rate": 1.4394797339365017e-07, + "loss": 1.135, + "step": 6139 + }, + { + "epoch": 0.8323730766623737, + "grad_norm": 1.972625723885487, + "learning_rate": 1.437210700910787e-07, + "loss": 1.1564, + "step": 6140 + }, + { + "epoch": 0.8325086423100386, + "grad_norm": 2.122733007947952, + "learning_rate": 1.4349433191513994e-07, + "loss": 1.1611, + "step": 6141 + }, + { + "epoch": 0.8326442079577036, + "grad_norm": 1.9971858053620677, + "learning_rate": 1.4326775890955833e-07, + "loss": 1.1351, + "step": 6142 + }, + { + "epoch": 0.8327797736053684, + "grad_norm": 6.089973971366798, + "learning_rate": 1.4304135111802707e-07, + "loss": 1.166, + "step": 6143 + }, + { + "epoch": 0.8329153392530333, + "grad_norm": 3.457609475177327, + "learning_rate": 1.4281510858420632e-07, + "loss": 1.1446, + "step": 6144 + }, + { + "epoch": 0.8330509049006981, + "grad_norm": 2.0932428576147726, + "learning_rate": 1.4258903135172605e-07, + "loss": 1.1503, + "step": 6145 + }, + { + "epoch": 0.833186470548363, + "grad_norm": 2.071554228554578, + "learning_rate": 1.423631194641828e-07, + "loss": 1.108, + "step": 6146 + }, + { + "epoch": 0.833322036196028, + "grad_norm": 3.7023352242125616, + "learning_rate": 1.421373729651425e-07, + "loss": 1.1289, + "step": 6147 + }, + { + "epoch": 0.8334576018436928, + "grad_norm": 2.1392109627535776, + "learning_rate": 1.4191179189813796e-07, + "loss": 1.1087, + "step": 6148 + }, + { + "epoch": 0.8335931674913577, + "grad_norm": 1.6700341731963986, + "learning_rate": 1.4168637630667135e-07, + "loss": 1.1625, + "step": 6149 + }, + { + "epoch": 0.8337287331390226, + "grad_norm": 1.9604248646891644, + "learning_rate": 1.4146112623421158e-07, + "loss": 1.1543, + "step": 6150 + }, + { + "epoch": 0.8338642987866874, + "grad_norm": 2.016027814389016, + "learning_rate": 1.4123604172419713e-07, + "loss": 1.1198, + "step": 6151 + }, + { + "epoch": 0.8339998644343524, + "grad_norm": 2.399138932032356, + "learning_rate": 1.410111228200329e-07, + "loss": 1.1928, + "step": 6152 + }, + { + "epoch": 0.8341354300820172, + "grad_norm": 2.17538328360098, + "learning_rate": 1.407863695650936e-07, + "loss": 1.1546, + "step": 6153 + }, + { + "epoch": 0.8342709957296821, + "grad_norm": 1.8231408416218464, + "learning_rate": 1.405617820027204e-07, + "loss": 1.1132, + "step": 6154 + }, + { + "epoch": 0.834406561377347, + "grad_norm": 1.776430077512962, + "learning_rate": 1.4033736017622388e-07, + "loss": 1.1744, + "step": 6155 + }, + { + "epoch": 0.8345421270250118, + "grad_norm": 2.294046528556166, + "learning_rate": 1.4011310412888145e-07, + "loss": 1.165, + "step": 6156 + }, + { + "epoch": 0.8346776926726768, + "grad_norm": 2.374447136355484, + "learning_rate": 1.398890139039395e-07, + "loss": 1.1356, + "step": 6157 + }, + { + "epoch": 0.8348132583203416, + "grad_norm": 1.9195651438970642, + "learning_rate": 1.3966508954461175e-07, + "loss": 1.1118, + "step": 6158 + }, + { + "epoch": 0.8349488239680065, + "grad_norm": 2.003832326939662, + "learning_rate": 1.3944133109408053e-07, + "loss": 1.1786, + "step": 6159 + }, + { + "epoch": 0.8350843896156714, + "grad_norm": 1.6108131389314297, + "learning_rate": 1.3921773859549569e-07, + "loss": 1.1404, + "step": 6160 + }, + { + "epoch": 0.8352199552633363, + "grad_norm": 2.768327377829161, + "learning_rate": 1.389943120919753e-07, + "loss": 1.1673, + "step": 6161 + }, + { + "epoch": 0.8353555209110012, + "grad_norm": 1.9264720435847351, + "learning_rate": 
1.3877105162660564e-07, + "loss": 1.1255, + "step": 6162 + }, + { + "epoch": 0.835491086558666, + "grad_norm": 1.8747249517378752, + "learning_rate": 1.385479572424404e-07, + "loss": 1.1161, + "step": 6163 + }, + { + "epoch": 0.8356266522063309, + "grad_norm": 2.3312562554435807, + "learning_rate": 1.3832502898250174e-07, + "loss": 1.1448, + "step": 6164 + }, + { + "epoch": 0.8357622178539958, + "grad_norm": 2.5410139331907144, + "learning_rate": 1.3810226688977967e-07, + "loss": 1.1255, + "step": 6165 + }, + { + "epoch": 0.8358977835016607, + "grad_norm": 1.844911724198829, + "learning_rate": 1.378796710072322e-07, + "loss": 1.1536, + "step": 6166 + }, + { + "epoch": 0.8360333491493256, + "grad_norm": 1.7105364764686601, + "learning_rate": 1.3765724137778456e-07, + "loss": 1.1477, + "step": 6167 + }, + { + "epoch": 0.8361689147969904, + "grad_norm": 2.3831999284176346, + "learning_rate": 1.3743497804433147e-07, + "loss": 1.1286, + "step": 6168 + }, + { + "epoch": 0.8363044804446553, + "grad_norm": 1.6467922800711072, + "learning_rate": 1.3721288104973372e-07, + "loss": 1.1173, + "step": 6169 + }, + { + "epoch": 0.8364400460923203, + "grad_norm": 10.496316065750152, + "learning_rate": 1.3699095043682184e-07, + "loss": 1.1048, + "step": 6170 + }, + { + "epoch": 0.8365756117399851, + "grad_norm": 2.338743002300305, + "learning_rate": 1.3676918624839285e-07, + "loss": 1.1588, + "step": 6171 + }, + { + "epoch": 0.83671117738765, + "grad_norm": 1.7692540646604378, + "learning_rate": 1.3654758852721226e-07, + "loss": 1.1331, + "step": 6172 + }, + { + "epoch": 0.8368467430353148, + "grad_norm": 1.6564506813316853, + "learning_rate": 1.363261573160136e-07, + "loss": 1.1902, + "step": 6173 + }, + { + "epoch": 0.8369823086829797, + "grad_norm": 2.0687148250341387, + "learning_rate": 1.3610489265749801e-07, + "loss": 1.1514, + "step": 6174 + }, + { + "epoch": 0.8371178743306447, + "grad_norm": 2.1194442293471063, + "learning_rate": 1.3588379459433485e-07, + "loss": 1.0788, + "step": 6175 + }, + { + "epoch": 0.8372534399783095, + "grad_norm": 1.7875892741930335, + "learning_rate": 1.3566286316916087e-07, + "loss": 1.1458, + "step": 6176 + }, + { + "epoch": 0.8373890056259744, + "grad_norm": 1.8035058338594765, + "learning_rate": 1.354420984245811e-07, + "loss": 1.1614, + "step": 6177 + }, + { + "epoch": 0.8375245712736392, + "grad_norm": 2.081858152379622, + "learning_rate": 1.3522150040316826e-07, + "loss": 1.149, + "step": 6178 + }, + { + "epoch": 0.8376601369213041, + "grad_norm": 2.2553898787920152, + "learning_rate": 1.350010691474629e-07, + "loss": 1.1532, + "step": 6179 + }, + { + "epoch": 0.8377957025689691, + "grad_norm": 4.309213830017875, + "learning_rate": 1.3478080469997344e-07, + "loss": 1.1683, + "step": 6180 + }, + { + "epoch": 0.8379312682166339, + "grad_norm": 1.7628074977607113, + "learning_rate": 1.3456070710317624e-07, + "loss": 1.1085, + "step": 6181 + }, + { + "epoch": 0.8380668338642988, + "grad_norm": 1.6309525270441465, + "learning_rate": 1.3434077639951525e-07, + "loss": 1.1382, + "step": 6182 + }, + { + "epoch": 0.8382023995119636, + "grad_norm": 1.6243596164293685, + "learning_rate": 1.341210126314024e-07, + "loss": 1.1213, + "step": 6183 + }, + { + "epoch": 0.8383379651596286, + "grad_norm": 2.0987998316526473, + "learning_rate": 1.3390141584121772e-07, + "loss": 1.1367, + "step": 6184 + }, + { + "epoch": 0.8384735308072935, + "grad_norm": 1.7506629178437227, + "learning_rate": 1.33681986071308e-07, + "loss": 1.1509, + "step": 6185 + }, + { + "epoch": 0.8386090964549583, 
+ "grad_norm": 3.6417171689602776, + "learning_rate": 1.3346272336398934e-07, + "loss": 1.1464, + "step": 6186 + }, + { + "epoch": 0.8387446621026232, + "grad_norm": 1.636173304363034, + "learning_rate": 1.3324362776154408e-07, + "loss": 1.1548, + "step": 6187 + }, + { + "epoch": 0.838880227750288, + "grad_norm": 1.9237282314220687, + "learning_rate": 1.3302469930622383e-07, + "loss": 1.1797, + "step": 6188 + }, + { + "epoch": 0.839015793397953, + "grad_norm": 1.9168146282021248, + "learning_rate": 1.3280593804024642e-07, + "loss": 1.1396, + "step": 6189 + }, + { + "epoch": 0.8391513590456179, + "grad_norm": 1.67343170281057, + "learning_rate": 1.3258734400579908e-07, + "loss": 1.1274, + "step": 6190 + }, + { + "epoch": 0.8392869246932827, + "grad_norm": 2.025271275610353, + "learning_rate": 1.323689172450353e-07, + "loss": 1.1422, + "step": 6191 + }, + { + "epoch": 0.8394224903409476, + "grad_norm": 1.4915206182539487, + "learning_rate": 1.3215065780007718e-07, + "loss": 1.1118, + "step": 6192 + }, + { + "epoch": 0.8395580559886124, + "grad_norm": 1.9585818009460247, + "learning_rate": 1.3193256571301426e-07, + "loss": 1.1433, + "step": 6193 + }, + { + "epoch": 0.8396936216362774, + "grad_norm": 1.9629476490182411, + "learning_rate": 1.3171464102590392e-07, + "loss": 1.0962, + "step": 6194 + }, + { + "epoch": 0.8398291872839423, + "grad_norm": 2.4124374211889523, + "learning_rate": 1.3149688378077128e-07, + "loss": 1.1607, + "step": 6195 + }, + { + "epoch": 0.8399647529316071, + "grad_norm": 2.6241956664176955, + "learning_rate": 1.3127929401960903e-07, + "loss": 1.1276, + "step": 6196 + }, + { + "epoch": 0.840100318579272, + "grad_norm": 1.5021203655214086, + "learning_rate": 1.3106187178437768e-07, + "loss": 1.1432, + "step": 6197 + }, + { + "epoch": 0.8402358842269368, + "grad_norm": 1.6566756350672727, + "learning_rate": 1.3084461711700544e-07, + "loss": 1.1046, + "step": 6198 + }, + { + "epoch": 0.8403714498746018, + "grad_norm": 2.3153535380501373, + "learning_rate": 1.3062753005938798e-07, + "loss": 1.143, + "step": 6199 + }, + { + "epoch": 0.8405070155222667, + "grad_norm": 1.6599349763275335, + "learning_rate": 1.30410610653389e-07, + "loss": 1.1583, + "step": 6200 + }, + { + "epoch": 0.8406425811699315, + "grad_norm": 1.9166325458696363, + "learning_rate": 1.3019385894083988e-07, + "loss": 1.1305, + "step": 6201 + }, + { + "epoch": 0.8407781468175964, + "grad_norm": 1.6633963678381962, + "learning_rate": 1.2997727496353872e-07, + "loss": 1.1736, + "step": 6202 + }, + { + "epoch": 0.8409137124652613, + "grad_norm": 2.607575820191517, + "learning_rate": 1.2976085876325303e-07, + "loss": 1.1569, + "step": 6203 + }, + { + "epoch": 0.8410492781129262, + "grad_norm": 3.2360198165673113, + "learning_rate": 1.2954461038171603e-07, + "loss": 1.1104, + "step": 6204 + }, + { + "epoch": 0.8411848437605911, + "grad_norm": 1.9246017767133297, + "learning_rate": 1.2932852986063046e-07, + "loss": 1.1755, + "step": 6205 + }, + { + "epoch": 0.8413204094082559, + "grad_norm": 1.716797187428448, + "learning_rate": 1.2911261724166468e-07, + "loss": 1.1383, + "step": 6206 + }, + { + "epoch": 0.8414559750559208, + "grad_norm": 2.864184995885761, + "learning_rate": 1.2889687256645686e-07, + "loss": 1.1445, + "step": 6207 + }, + { + "epoch": 0.8415915407035857, + "grad_norm": 3.093442027127321, + "learning_rate": 1.286812958766106e-07, + "loss": 1.1155, + "step": 6208 + }, + { + "epoch": 0.8417271063512506, + "grad_norm": 2.0392169332095618, + "learning_rate": 1.284658872136991e-07, + "loss": 1.147, + 
"step": 6209 + }, + { + "epoch": 0.8418626719989155, + "grad_norm": 1.6062276515899507, + "learning_rate": 1.2825064661926133e-07, + "loss": 1.1595, + "step": 6210 + }, + { + "epoch": 0.8419982376465803, + "grad_norm": 1.6861428657726025, + "learning_rate": 1.280355741348056e-07, + "loss": 1.1861, + "step": 6211 + }, + { + "epoch": 0.8421338032942453, + "grad_norm": 1.6846780519654054, + "learning_rate": 1.278206698018064e-07, + "loss": 1.1357, + "step": 6212 + }, + { + "epoch": 0.8422693689419101, + "grad_norm": 1.743740200499029, + "learning_rate": 1.2760593366170635e-07, + "loss": 1.1336, + "step": 6213 + }, + { + "epoch": 0.842404934589575, + "grad_norm": 1.8912290854118503, + "learning_rate": 1.273913657559158e-07, + "loss": 1.1494, + "step": 6214 + }, + { + "epoch": 0.8425405002372399, + "grad_norm": 2.3337319659019102, + "learning_rate": 1.271769661258124e-07, + "loss": 1.1686, + "step": 6215 + }, + { + "epoch": 0.8426760658849047, + "grad_norm": 1.7632729363307744, + "learning_rate": 1.2696273481274144e-07, + "loss": 1.142, + "step": 6216 + }, + { + "epoch": 0.8428116315325697, + "grad_norm": 2.088141667583963, + "learning_rate": 1.2674867185801575e-07, + "loss": 1.1837, + "step": 6217 + }, + { + "epoch": 0.8429471971802345, + "grad_norm": 1.6474351094924657, + "learning_rate": 1.2653477730291563e-07, + "loss": 1.126, + "step": 6218 + }, + { + "epoch": 0.8430827628278994, + "grad_norm": 1.7982241845003253, + "learning_rate": 1.2632105118868896e-07, + "loss": 1.1766, + "step": 6219 + }, + { + "epoch": 0.8432183284755643, + "grad_norm": 2.365699074760233, + "learning_rate": 1.2610749355655125e-07, + "loss": 1.1539, + "step": 6220 + }, + { + "epoch": 0.8433538941232291, + "grad_norm": 4.195938116858856, + "learning_rate": 1.2589410444768522e-07, + "loss": 1.1353, + "step": 6221 + }, + { + "epoch": 0.8434894597708941, + "grad_norm": 1.877749487346024, + "learning_rate": 1.256808839032415e-07, + "loss": 1.1411, + "step": 6222 + }, + { + "epoch": 0.8436250254185589, + "grad_norm": 1.8267664270524033, + "learning_rate": 1.2546783196433774e-07, + "loss": 1.0877, + "step": 6223 + }, + { + "epoch": 0.8437605910662238, + "grad_norm": 1.802383286009516, + "learning_rate": 1.2525494867205954e-07, + "loss": 1.1877, + "step": 6224 + }, + { + "epoch": 0.8438961567138887, + "grad_norm": 4.068952578278857, + "learning_rate": 1.2504223406745963e-07, + "loss": 1.1307, + "step": 6225 + }, + { + "epoch": 0.8440317223615535, + "grad_norm": 1.9550243432563266, + "learning_rate": 1.2482968819155837e-07, + "loss": 1.103, + "step": 6226 + }, + { + "epoch": 0.8441672880092185, + "grad_norm": 1.7130013927037244, + "learning_rate": 1.2461731108534378e-07, + "loss": 1.1494, + "step": 6227 + }, + { + "epoch": 0.8443028536568834, + "grad_norm": 2.3555883849550168, + "learning_rate": 1.244051027897708e-07, + "loss": 1.1479, + "step": 6228 + }, + { + "epoch": 0.8444384193045482, + "grad_norm": 2.460791312406269, + "learning_rate": 1.2419306334576207e-07, + "loss": 1.0942, + "step": 6229 + }, + { + "epoch": 0.8445739849522131, + "grad_norm": 1.6411719151098434, + "learning_rate": 1.2398119279420793e-07, + "loss": 1.1281, + "step": 6230 + }, + { + "epoch": 0.844709550599878, + "grad_norm": 1.7553068398578118, + "learning_rate": 1.2376949117596592e-07, + "loss": 1.1317, + "step": 6231 + }, + { + "epoch": 0.8448451162475429, + "grad_norm": 1.8967769429055443, + "learning_rate": 1.2355795853186102e-07, + "loss": 1.1214, + "step": 6232 + }, + { + "epoch": 0.8449806818952078, + "grad_norm": 2.144477964702437, + 
"learning_rate": 1.233465949026855e-07, + "loss": 1.1358, + "step": 6233 + }, + { + "epoch": 0.8451162475428726, + "grad_norm": 1.8355420567476093, + "learning_rate": 1.2313540032919935e-07, + "loss": 1.1044, + "step": 6234 + }, + { + "epoch": 0.8452518131905375, + "grad_norm": 1.7272711615340095, + "learning_rate": 1.2292437485212957e-07, + "loss": 1.2039, + "step": 6235 + }, + { + "epoch": 0.8453873788382024, + "grad_norm": 1.6324798893020813, + "learning_rate": 1.2271351851217104e-07, + "loss": 1.1099, + "step": 6236 + }, + { + "epoch": 0.8455229444858673, + "grad_norm": 3.023192974658072, + "learning_rate": 1.225028313499855e-07, + "loss": 1.1449, + "step": 6237 + }, + { + "epoch": 0.8456585101335322, + "grad_norm": 4.282335992834041, + "learning_rate": 1.222923134062025e-07, + "loss": 1.1208, + "step": 6238 + }, + { + "epoch": 0.845794075781197, + "grad_norm": 3.1160075754179966, + "learning_rate": 1.220819647214185e-07, + "loss": 1.1506, + "step": 6239 + }, + { + "epoch": 0.845929641428862, + "grad_norm": 1.5753553238684093, + "learning_rate": 1.2187178533619803e-07, + "loss": 1.128, + "step": 6240 + }, + { + "epoch": 0.8460652070765268, + "grad_norm": 2.289807052059847, + "learning_rate": 1.216617752910718e-07, + "loss": 1.1341, + "step": 6241 + }, + { + "epoch": 0.8462007727241917, + "grad_norm": 1.7694217357869495, + "learning_rate": 1.2145193462653946e-07, + "loss": 1.1077, + "step": 6242 + }, + { + "epoch": 0.8463363383718566, + "grad_norm": 1.9776776881471474, + "learning_rate": 1.212422633830663e-07, + "loss": 1.1969, + "step": 6243 + }, + { + "epoch": 0.8464719040195214, + "grad_norm": 2.534932827207732, + "learning_rate": 1.2103276160108656e-07, + "loss": 1.1599, + "step": 6244 + }, + { + "epoch": 0.8466074696671864, + "grad_norm": 4.995341728113508, + "learning_rate": 1.208234293210002e-07, + "loss": 1.1286, + "step": 6245 + }, + { + "epoch": 0.8467430353148512, + "grad_norm": 1.8276705357367817, + "learning_rate": 1.2061426658317608e-07, + "loss": 1.1273, + "step": 6246 + }, + { + "epoch": 0.8468786009625161, + "grad_norm": 1.811284831556266, + "learning_rate": 1.2040527342794872e-07, + "loss": 1.1281, + "step": 6247 + }, + { + "epoch": 0.847014166610181, + "grad_norm": 2.54231508998573, + "learning_rate": 1.2019644989562184e-07, + "loss": 1.1427, + "step": 6248 + }, + { + "epoch": 0.8471497322578458, + "grad_norm": 1.9463337420541957, + "learning_rate": 1.1998779602646436e-07, + "loss": 1.1821, + "step": 6249 + }, + { + "epoch": 0.8472852979055108, + "grad_norm": 1.6794637788667484, + "learning_rate": 1.1977931186071443e-07, + "loss": 1.0817, + "step": 6250 + }, + { + "epoch": 0.8474208635531756, + "grad_norm": 1.8466439812740438, + "learning_rate": 1.1957099743857568e-07, + "loss": 1.1457, + "step": 6251 + }, + { + "epoch": 0.8475564292008405, + "grad_norm": 1.8800809019961422, + "learning_rate": 1.1936285280022096e-07, + "loss": 1.1358, + "step": 6252 + }, + { + "epoch": 0.8476919948485054, + "grad_norm": 2.2170847834003933, + "learning_rate": 1.1915487798578816e-07, + "loss": 1.1098, + "step": 6253 + }, + { + "epoch": 0.8478275604961703, + "grad_norm": 3.280509684696955, + "learning_rate": 1.1894707303538476e-07, + "loss": 1.1306, + "step": 6254 + }, + { + "epoch": 0.8479631261438352, + "grad_norm": 1.7824527666797443, + "learning_rate": 1.1873943798908336e-07, + "loss": 1.1442, + "step": 6255 + }, + { + "epoch": 0.8480986917915, + "grad_norm": 1.7442174274908473, + "learning_rate": 1.1853197288692518e-07, + "loss": 1.1653, + "step": 6256 + }, + { + "epoch": 
0.8482342574391649, + "grad_norm": 1.924292691387795, + "learning_rate": 1.183246777689182e-07, + "loss": 1.1089, + "step": 6257 + }, + { + "epoch": 0.8483698230868298, + "grad_norm": 1.81597626989229, + "learning_rate": 1.1811755267503754e-07, + "loss": 1.1497, + "step": 6258 + }, + { + "epoch": 0.8485053887344947, + "grad_norm": 2.007521843432338, + "learning_rate": 1.179105976452256e-07, + "loss": 1.1328, + "step": 6259 + }, + { + "epoch": 0.8486409543821596, + "grad_norm": 1.8304030237452635, + "learning_rate": 1.1770381271939223e-07, + "loss": 1.1753, + "step": 6260 + }, + { + "epoch": 0.8487765200298244, + "grad_norm": 1.7172665924291515, + "learning_rate": 1.1749719793741409e-07, + "loss": 1.1413, + "step": 6261 + }, + { + "epoch": 0.8489120856774893, + "grad_norm": 2.1483760249566317, + "learning_rate": 1.172907533391353e-07, + "loss": 1.1552, + "step": 6262 + }, + { + "epoch": 0.8490476513251543, + "grad_norm": 1.7193014925753212, + "learning_rate": 1.1708447896436724e-07, + "loss": 1.1047, + "step": 6263 + }, + { + "epoch": 0.8491832169728191, + "grad_norm": 2.7699525433586523, + "learning_rate": 1.1687837485288766e-07, + "loss": 1.1174, + "step": 6264 + }, + { + "epoch": 0.849318782620484, + "grad_norm": 1.9332145308831243, + "learning_rate": 1.1667244104444308e-07, + "loss": 1.0927, + "step": 6265 + }, + { + "epoch": 0.8494543482681488, + "grad_norm": 1.647036772190935, + "learning_rate": 1.1646667757874507e-07, + "loss": 1.1011, + "step": 6266 + }, + { + "epoch": 0.8495899139158137, + "grad_norm": 1.7528032300548524, + "learning_rate": 1.1626108449547467e-07, + "loss": 1.1168, + "step": 6267 + }, + { + "epoch": 0.8497254795634787, + "grad_norm": 1.957486194983612, + "learning_rate": 1.1605566183427807e-07, + "loss": 1.1286, + "step": 6268 + }, + { + "epoch": 0.8498610452111435, + "grad_norm": 2.4733106506595437, + "learning_rate": 1.1585040963476966e-07, + "loss": 1.147, + "step": 6269 + }, + { + "epoch": 0.8499966108588084, + "grad_norm": 2.9880726919665155, + "learning_rate": 1.156453279365307e-07, + "loss": 1.1144, + "step": 6270 + }, + { + "epoch": 0.8501321765064732, + "grad_norm": 2.8721022861076206, + "learning_rate": 1.1544041677910954e-07, + "loss": 1.1006, + "step": 6271 + }, + { + "epoch": 0.8502677421541381, + "grad_norm": 2.4463221359861826, + "learning_rate": 1.152356762020218e-07, + "loss": 1.1624, + "step": 6272 + }, + { + "epoch": 0.8504033078018031, + "grad_norm": 1.7957745083622927, + "learning_rate": 1.1503110624474987e-07, + "loss": 1.1151, + "step": 6273 + }, + { + "epoch": 0.8505388734494679, + "grad_norm": 1.7473221307725135, + "learning_rate": 1.1482670694674367e-07, + "loss": 1.1564, + "step": 6274 + }, + { + "epoch": 0.8506744390971328, + "grad_norm": 2.0223542782033386, + "learning_rate": 1.146224783474199e-07, + "loss": 1.0949, + "step": 6275 + }, + { + "epoch": 0.8508100047447976, + "grad_norm": 9.295657123096404, + "learning_rate": 1.1441842048616234e-07, + "loss": 1.1524, + "step": 6276 + }, + { + "epoch": 0.8509455703924625, + "grad_norm": 2.0500926154890906, + "learning_rate": 1.1421453340232213e-07, + "loss": 1.1263, + "step": 6277 + }, + { + "epoch": 0.8510811360401275, + "grad_norm": 1.8187127558349458, + "learning_rate": 1.140108171352172e-07, + "loss": 1.118, + "step": 6278 + }, + { + "epoch": 0.8512167016877923, + "grad_norm": 2.238536675242392, + "learning_rate": 1.1380727172413262e-07, + "loss": 1.1418, + "step": 6279 + }, + { + "epoch": 0.8513522673354572, + "grad_norm": 2.2348043381765277, + "learning_rate": 1.1360389720832042e-07, + 
"loss": 1.1475, + "step": 6280 + }, + { + "epoch": 0.851487832983122, + "grad_norm": 1.9298726357112934, + "learning_rate": 1.1340069362699988e-07, + "loss": 1.1528, + "step": 6281 + }, + { + "epoch": 0.851623398630787, + "grad_norm": 1.8854441908525683, + "learning_rate": 1.1319766101935724e-07, + "loss": 1.1492, + "step": 6282 + }, + { + "epoch": 0.8517589642784519, + "grad_norm": 1.6817516953479215, + "learning_rate": 1.1299479942454592e-07, + "loss": 1.1168, + "step": 6283 + }, + { + "epoch": 0.8518945299261167, + "grad_norm": 1.8212405147903672, + "learning_rate": 1.1279210888168544e-07, + "loss": 1.1305, + "step": 6284 + }, + { + "epoch": 0.8520300955737816, + "grad_norm": 1.727949354927743, + "learning_rate": 1.1258958942986396e-07, + "loss": 1.1358, + "step": 6285 + }, + { + "epoch": 0.8521656612214464, + "grad_norm": 1.9127178930728703, + "learning_rate": 1.1238724110813502e-07, + "loss": 1.1767, + "step": 6286 + }, + { + "epoch": 0.8523012268691114, + "grad_norm": 2.6842924085343918, + "learning_rate": 1.1218506395552063e-07, + "loss": 1.1135, + "step": 6287 + }, + { + "epoch": 0.8524367925167763, + "grad_norm": 2.039211911978617, + "learning_rate": 1.1198305801100827e-07, + "loss": 1.1534, + "step": 6288 + }, + { + "epoch": 0.8525723581644411, + "grad_norm": 2.1701933733010947, + "learning_rate": 1.11781223313554e-07, + "loss": 1.1736, + "step": 6289 + }, + { + "epoch": 0.852707923812106, + "grad_norm": 2.1080493759409813, + "learning_rate": 1.1157955990207946e-07, + "loss": 1.0909, + "step": 6290 + }, + { + "epoch": 0.8528434894597708, + "grad_norm": 2.6112140410293088, + "learning_rate": 1.1137806781547398e-07, + "loss": 1.1561, + "step": 6291 + }, + { + "epoch": 0.8529790551074358, + "grad_norm": 2.267228609350358, + "learning_rate": 1.1117674709259372e-07, + "loss": 1.1296, + "step": 6292 + }, + { + "epoch": 0.8531146207551007, + "grad_norm": 1.8999624451348438, + "learning_rate": 1.1097559777226196e-07, + "loss": 1.1183, + "step": 6293 + }, + { + "epoch": 0.8532501864027655, + "grad_norm": 1.640576529805139, + "learning_rate": 1.1077461989326864e-07, + "loss": 1.1585, + "step": 6294 + }, + { + "epoch": 0.8533857520504304, + "grad_norm": 1.642720406356296, + "learning_rate": 1.1057381349437067e-07, + "loss": 1.1049, + "step": 6295 + }, + { + "epoch": 0.8535213176980952, + "grad_norm": 2.2010855297926746, + "learning_rate": 1.1037317861429208e-07, + "loss": 1.1498, + "step": 6296 + }, + { + "epoch": 0.8536568833457602, + "grad_norm": 2.0168801242217196, + "learning_rate": 1.1017271529172367e-07, + "loss": 1.1334, + "step": 6297 + }, + { + "epoch": 0.8537924489934251, + "grad_norm": 2.1637484138289356, + "learning_rate": 1.0997242356532333e-07, + "loss": 1.1516, + "step": 6298 + }, + { + "epoch": 0.8539280146410899, + "grad_norm": 1.6137270292324335, + "learning_rate": 1.0977230347371568e-07, + "loss": 1.097, + "step": 6299 + }, + { + "epoch": 0.8540635802887548, + "grad_norm": 1.6439147865503327, + "learning_rate": 1.0957235505549233e-07, + "loss": 1.1388, + "step": 6300 + }, + { + "epoch": 0.8541991459364197, + "grad_norm": 1.7597881373201079, + "learning_rate": 1.0937257834921144e-07, + "loss": 1.1149, + "step": 6301 + }, + { + "epoch": 0.8543347115840846, + "grad_norm": 1.5859697004669802, + "learning_rate": 1.0917297339339892e-07, + "loss": 1.1229, + "step": 6302 + }, + { + "epoch": 0.8544702772317495, + "grad_norm": 2.5979185706241896, + "learning_rate": 1.0897354022654648e-07, + "loss": 1.123, + "step": 6303 + }, + { + "epoch": 0.8546058428794143, + "grad_norm": 
2.6648498066555324, + "learning_rate": 1.0877427888711377e-07, + "loss": 1.1139, + "step": 6304 + }, + { + "epoch": 0.8547414085270792, + "grad_norm": 2.3485306609717003, + "learning_rate": 1.0857518941352605e-07, + "loss": 1.1648, + "step": 6305 + }, + { + "epoch": 0.8548769741747442, + "grad_norm": 2.6491525351427367, + "learning_rate": 1.0837627184417697e-07, + "loss": 1.1471, + "step": 6306 + }, + { + "epoch": 0.855012539822409, + "grad_norm": 2.2598024614994814, + "learning_rate": 1.0817752621742537e-07, + "loss": 1.1234, + "step": 6307 + }, + { + "epoch": 0.8551481054700739, + "grad_norm": 1.9493624329649903, + "learning_rate": 1.0797895257159872e-07, + "loss": 1.1766, + "step": 6308 + }, + { + "epoch": 0.8552836711177387, + "grad_norm": 1.652976806590768, + "learning_rate": 1.077805509449895e-07, + "loss": 1.1925, + "step": 6309 + }, + { + "epoch": 0.8554192367654037, + "grad_norm": 1.7262496867383395, + "learning_rate": 1.0758232137585854e-07, + "loss": 1.118, + "step": 6310 + }, + { + "epoch": 0.8555548024130686, + "grad_norm": 2.182745860881029, + "learning_rate": 1.073842639024325e-07, + "loss": 1.1337, + "step": 6311 + }, + { + "epoch": 0.8556903680607334, + "grad_norm": 1.9387103730788804, + "learning_rate": 1.0718637856290525e-07, + "loss": 1.1459, + "step": 6312 + }, + { + "epoch": 0.8558259337083983, + "grad_norm": 2.228608204112476, + "learning_rate": 1.069886653954375e-07, + "loss": 1.184, + "step": 6313 + }, + { + "epoch": 0.8559614993560631, + "grad_norm": 2.034130003046483, + "learning_rate": 1.0679112443815652e-07, + "loss": 1.138, + "step": 6314 + }, + { + "epoch": 0.8560970650037281, + "grad_norm": 2.1237943233107, + "learning_rate": 1.0659375572915674e-07, + "loss": 1.1446, + "step": 6315 + }, + { + "epoch": 0.856232630651393, + "grad_norm": 1.8950296626129737, + "learning_rate": 1.0639655930649894e-07, + "loss": 1.0907, + "step": 6316 + }, + { + "epoch": 0.8563681962990578, + "grad_norm": 2.7916809527373916, + "learning_rate": 1.0619953520821112e-07, + "loss": 1.1776, + "step": 6317 + }, + { + "epoch": 0.8565037619467227, + "grad_norm": 5.287518816389908, + "learning_rate": 1.0600268347228757e-07, + "loss": 1.1694, + "step": 6318 + }, + { + "epoch": 0.8566393275943875, + "grad_norm": 1.6729945935898913, + "learning_rate": 1.0580600413668983e-07, + "loss": 1.1423, + "step": 6319 + }, + { + "epoch": 0.8567748932420525, + "grad_norm": 2.770426669090396, + "learning_rate": 1.0560949723934587e-07, + "loss": 1.183, + "step": 6320 + }, + { + "epoch": 0.8569104588897174, + "grad_norm": 1.7512747480072508, + "learning_rate": 1.0541316281815038e-07, + "loss": 1.1713, + "step": 6321 + }, + { + "epoch": 0.8570460245373822, + "grad_norm": 1.7871830063210152, + "learning_rate": 1.0521700091096508e-07, + "loss": 1.1351, + "step": 6322 + }, + { + "epoch": 0.8571815901850471, + "grad_norm": 1.7631729800111149, + "learning_rate": 1.0502101155561816e-07, + "loss": 1.1439, + "step": 6323 + }, + { + "epoch": 0.857317155832712, + "grad_norm": 1.8119621020388827, + "learning_rate": 1.0482519478990481e-07, + "loss": 1.1473, + "step": 6324 + }, + { + "epoch": 0.8574527214803769, + "grad_norm": 2.334020157749086, + "learning_rate": 1.0462955065158618e-07, + "loss": 1.1236, + "step": 6325 + }, + { + "epoch": 0.8575882871280418, + "grad_norm": 1.8947347085363038, + "learning_rate": 1.0443407917839141e-07, + "loss": 1.1209, + "step": 6326 + }, + { + "epoch": 0.8577238527757066, + "grad_norm": 1.7131857980766547, + "learning_rate": 1.0423878040801514e-07, + "loss": 1.1301, + "step": 6327 + }, 
+ { + "epoch": 0.8578594184233715, + "grad_norm": 1.8981221249679556, + "learning_rate": 1.0404365437811946e-07, + "loss": 1.109, + "step": 6328 + }, + { + "epoch": 0.8579949840710364, + "grad_norm": 1.908887112063313, + "learning_rate": 1.0384870112633271e-07, + "loss": 1.0993, + "step": 6329 + }, + { + "epoch": 0.8581305497187013, + "grad_norm": 1.9356801337366973, + "learning_rate": 1.0365392069025014e-07, + "loss": 1.1399, + "step": 6330 + }, + { + "epoch": 0.8582661153663662, + "grad_norm": 1.5456649197934267, + "learning_rate": 1.034593131074336e-07, + "loss": 1.1224, + "step": 6331 + }, + { + "epoch": 0.858401681014031, + "grad_norm": 1.671843529343042, + "learning_rate": 1.0326487841541176e-07, + "loss": 1.1489, + "step": 6332 + }, + { + "epoch": 0.858537246661696, + "grad_norm": 1.979078078300221, + "learning_rate": 1.030706166516796e-07, + "loss": 1.1337, + "step": 6333 + }, + { + "epoch": 0.8586728123093608, + "grad_norm": 2.3494944416391523, + "learning_rate": 1.0287652785369916e-07, + "loss": 1.1924, + "step": 6334 + }, + { + "epoch": 0.8588083779570257, + "grad_norm": 2.5432337749061253, + "learning_rate": 1.0268261205889894e-07, + "loss": 1.0903, + "step": 6335 + }, + { + "epoch": 0.8589439436046906, + "grad_norm": 5.047328447886946, + "learning_rate": 1.0248886930467393e-07, + "loss": 1.1013, + "step": 6336 + }, + { + "epoch": 0.8590795092523554, + "grad_norm": 1.9769971433693727, + "learning_rate": 1.022952996283859e-07, + "loss": 1.1262, + "step": 6337 + }, + { + "epoch": 0.8592150749000204, + "grad_norm": 12.156495624158573, + "learning_rate": 1.0210190306736333e-07, + "loss": 1.123, + "step": 6338 + }, + { + "epoch": 0.8593506405476852, + "grad_norm": 1.7220724770437112, + "learning_rate": 1.0190867965890137e-07, + "loss": 1.124, + "step": 6339 + }, + { + "epoch": 0.8594862061953501, + "grad_norm": 1.4939919567755284, + "learning_rate": 1.0171562944026102e-07, + "loss": 1.1365, + "step": 6340 + }, + { + "epoch": 0.859621771843015, + "grad_norm": 2.563604328399515, + "learning_rate": 1.0152275244867137e-07, + "loss": 1.1684, + "step": 6341 + }, + { + "epoch": 0.8597573374906798, + "grad_norm": 1.9506413264923679, + "learning_rate": 1.0133004872132623e-07, + "loss": 1.1471, + "step": 6342 + }, + { + "epoch": 0.8598929031383448, + "grad_norm": 1.7410399188833907, + "learning_rate": 1.0113751829538808e-07, + "loss": 1.1647, + "step": 6343 + }, + { + "epoch": 0.8600284687860096, + "grad_norm": 2.562891671835255, + "learning_rate": 1.009451612079838e-07, + "loss": 1.1061, + "step": 6344 + }, + { + "epoch": 0.8601640344336745, + "grad_norm": 2.4672548432147985, + "learning_rate": 1.0075297749620904e-07, + "loss": 1.1389, + "step": 6345 + }, + { + "epoch": 0.8602996000813394, + "grad_norm": 1.6836298840475836, + "learning_rate": 1.0056096719712382e-07, + "loss": 1.1167, + "step": 6346 + }, + { + "epoch": 0.8604351657290042, + "grad_norm": 1.9656701517726958, + "learning_rate": 1.0036913034775673e-07, + "loss": 1.1301, + "step": 6347 + }, + { + "epoch": 0.8605707313766692, + "grad_norm": 3.1893518778136367, + "learning_rate": 1.0017746698510122e-07, + "loss": 1.1567, + "step": 6348 + }, + { + "epoch": 0.860706297024334, + "grad_norm": 1.925427382965406, + "learning_rate": 9.998597714611889e-08, + "loss": 1.1473, + "step": 6349 + }, + { + "epoch": 0.8608418626719989, + "grad_norm": 1.7141553342379825, + "learning_rate": 9.979466086773614e-08, + "loss": 1.1483, + "step": 6350 + }, + { + "epoch": 0.8609774283196638, + "grad_norm": 1.7563413633965042, + "learning_rate": 
9.960351818684764e-08, + "loss": 1.0986, + "step": 6351 + }, + { + "epoch": 0.8611129939673287, + "grad_norm": 3.7580000847532107, + "learning_rate": 9.941254914031316e-08, + "loss": 1.1517, + "step": 6352 + }, + { + "epoch": 0.8612485596149936, + "grad_norm": 1.6457730348851856, + "learning_rate": 9.922175376495979e-08, + "loss": 1.112, + "step": 6353 + }, + { + "epoch": 0.8613841252626584, + "grad_norm": 1.7960243856879712, + "learning_rate": 9.903113209758096e-08, + "loss": 1.1715, + "step": 6354 + }, + { + "epoch": 0.8615196909103233, + "grad_norm": 2.161177162574391, + "learning_rate": 9.88406841749364e-08, + "loss": 1.1403, + "step": 6355 + }, + { + "epoch": 0.8616552565579882, + "grad_norm": 2.477857675136244, + "learning_rate": 9.865041003375263e-08, + "loss": 1.1454, + "step": 6356 + }, + { + "epoch": 0.8617908222056531, + "grad_norm": 2.5247939801819057, + "learning_rate": 9.846030971072239e-08, + "loss": 1.1434, + "step": 6357 + }, + { + "epoch": 0.861926387853318, + "grad_norm": 1.9442886068815097, + "learning_rate": 9.827038324250514e-08, + "loss": 1.1194, + "step": 6358 + }, + { + "epoch": 0.8620619535009828, + "grad_norm": 1.8617089346367677, + "learning_rate": 9.80806306657267e-08, + "loss": 1.1524, + "step": 6359 + }, + { + "epoch": 0.8621975191486477, + "grad_norm": 1.6561041663966538, + "learning_rate": 9.789105201697923e-08, + "loss": 1.1015, + "step": 6360 + }, + { + "epoch": 0.8623330847963127, + "grad_norm": 2.1258325479319837, + "learning_rate": 9.77016473328216e-08, + "loss": 1.1289, + "step": 6361 + }, + { + "epoch": 0.8624686504439775, + "grad_norm": 1.8304525955725597, + "learning_rate": 9.751241664977927e-08, + "loss": 1.1697, + "step": 6362 + }, + { + "epoch": 0.8626042160916424, + "grad_norm": 1.8581396037777824, + "learning_rate": 9.732336000434304e-08, + "loss": 1.16, + "step": 6363 + }, + { + "epoch": 0.8627397817393072, + "grad_norm": 1.9263089716043587, + "learning_rate": 9.713447743297198e-08, + "loss": 1.1357, + "step": 6364 + }, + { + "epoch": 0.8628753473869721, + "grad_norm": 2.107126157156885, + "learning_rate": 9.694576897208984e-08, + "loss": 1.1295, + "step": 6365 + }, + { + "epoch": 0.8630109130346371, + "grad_norm": 4.461028066826279, + "learning_rate": 9.675723465808827e-08, + "loss": 1.1426, + "step": 6366 + }, + { + "epoch": 0.8631464786823019, + "grad_norm": 1.6190079135649131, + "learning_rate": 9.656887452732399e-08, + "loss": 1.1409, + "step": 6367 + }, + { + "epoch": 0.8632820443299668, + "grad_norm": 1.616603430940407, + "learning_rate": 9.638068861612091e-08, + "loss": 1.1181, + "step": 6368 + }, + { + "epoch": 0.8634176099776316, + "grad_norm": 1.7239947410629752, + "learning_rate": 9.619267696076938e-08, + "loss": 1.1152, + "step": 6369 + }, + { + "epoch": 0.8635531756252965, + "grad_norm": 1.9222811970751839, + "learning_rate": 9.600483959752592e-08, + "loss": 1.1572, + "step": 6370 + }, + { + "epoch": 0.8636887412729615, + "grad_norm": 3.2935946669604848, + "learning_rate": 9.581717656261335e-08, + "loss": 1.1393, + "step": 6371 + }, + { + "epoch": 0.8638243069206263, + "grad_norm": 1.7907081805018734, + "learning_rate": 9.562968789222114e-08, + "loss": 1.1251, + "step": 6372 + }, + { + "epoch": 0.8639598725682912, + "grad_norm": 1.978845878170512, + "learning_rate": 9.544237362250495e-08, + "loss": 1.1556, + "step": 6373 + }, + { + "epoch": 0.864095438215956, + "grad_norm": 1.6714797109457122, + "learning_rate": 9.525523378958688e-08, + "loss": 1.0916, + "step": 6374 + }, + { + "epoch": 0.864231003863621, + "grad_norm": 
3.801261959337403, + "learning_rate": 9.50682684295554e-08, + "loss": 1.0988, + "step": 6375 + }, + { + "epoch": 0.8643665695112859, + "grad_norm": 1.5863661813721055, + "learning_rate": 9.488147757846521e-08, + "loss": 1.1187, + "step": 6376 + }, + { + "epoch": 0.8645021351589507, + "grad_norm": 2.6155321489987498, + "learning_rate": 9.46948612723375e-08, + "loss": 1.1439, + "step": 6377 + }, + { + "epoch": 0.8646377008066156, + "grad_norm": 1.7861517339697166, + "learning_rate": 9.450841954715971e-08, + "loss": 1.1687, + "step": 6378 + }, + { + "epoch": 0.8647732664542804, + "grad_norm": 1.758134399325192, + "learning_rate": 9.432215243888575e-08, + "loss": 1.1275, + "step": 6379 + }, + { + "epoch": 0.8649088321019454, + "grad_norm": 1.9052458152073284, + "learning_rate": 9.413605998343566e-08, + "loss": 1.1231, + "step": 6380 + }, + { + "epoch": 0.8650443977496103, + "grad_norm": 2.184565117915111, + "learning_rate": 9.395014221669595e-08, + "loss": 1.1287, + "step": 6381 + }, + { + "epoch": 0.8651799633972751, + "grad_norm": 1.5854843609678646, + "learning_rate": 9.376439917451962e-08, + "loss": 1.1479, + "step": 6382 + }, + { + "epoch": 0.86531552904494, + "grad_norm": 2.4572859755015704, + "learning_rate": 9.357883089272512e-08, + "loss": 1.1473, + "step": 6383 + }, + { + "epoch": 0.8654510946926048, + "grad_norm": 1.6240183306111986, + "learning_rate": 9.33934374070986e-08, + "loss": 1.1356, + "step": 6384 + }, + { + "epoch": 0.8655866603402698, + "grad_norm": 1.6546709782261257, + "learning_rate": 9.320821875339091e-08, + "loss": 1.1265, + "step": 6385 + }, + { + "epoch": 0.8657222259879347, + "grad_norm": 1.6998025954797622, + "learning_rate": 9.302317496732092e-08, + "loss": 1.1219, + "step": 6386 + }, + { + "epoch": 0.8658577916355995, + "grad_norm": 1.6895620247078484, + "learning_rate": 9.283830608457199e-08, + "loss": 1.1269, + "step": 6387 + }, + { + "epoch": 0.8659933572832644, + "grad_norm": 1.9937628616838519, + "learning_rate": 9.265361214079548e-08, + "loss": 1.1623, + "step": 6388 + }, + { + "epoch": 0.8661289229309294, + "grad_norm": 2.0546351581087414, + "learning_rate": 9.246909317160744e-08, + "loss": 1.1147, + "step": 6389 + }, + { + "epoch": 0.8662644885785942, + "grad_norm": 2.323054289802963, + "learning_rate": 9.228474921259121e-08, + "loss": 1.1745, + "step": 6390 + }, + { + "epoch": 0.8664000542262591, + "grad_norm": 1.6546365065438091, + "learning_rate": 9.210058029929602e-08, + "loss": 1.1462, + "step": 6391 + }, + { + "epoch": 0.8665356198739239, + "grad_norm": 2.9392586721436054, + "learning_rate": 9.191658646723732e-08, + "loss": 1.1575, + "step": 6392 + }, + { + "epoch": 0.8666711855215888, + "grad_norm": 1.9747817240295014, + "learning_rate": 9.173276775189709e-08, + "loss": 1.1228, + "step": 6393 + }, + { + "epoch": 0.8668067511692538, + "grad_norm": 2.340148325850363, + "learning_rate": 9.154912418872306e-08, + "loss": 1.1681, + "step": 6394 + }, + { + "epoch": 0.8669423168169186, + "grad_norm": 5.148897261292847, + "learning_rate": 9.136565581312961e-08, + "loss": 1.1328, + "step": 6395 + }, + { + "epoch": 0.8670778824645835, + "grad_norm": 1.914943682037645, + "learning_rate": 9.118236266049705e-08, + "loss": 1.1304, + "step": 6396 + }, + { + "epoch": 0.8672134481122483, + "grad_norm": 2.999006545862084, + "learning_rate": 9.099924476617216e-08, + "loss": 1.1147, + "step": 6397 + }, + { + "epoch": 0.8673490137599132, + "grad_norm": 1.928247393388114, + "learning_rate": 9.081630216546766e-08, + "loss": 1.153, + "step": 6398 + }, + { + "epoch": 
0.8674845794075782, + "grad_norm": 3.042587689529908, + "learning_rate": 9.063353489366287e-08, + "loss": 1.1467, + "step": 6399 + }, + { + "epoch": 0.867620145055243, + "grad_norm": 2.340097759721643, + "learning_rate": 9.045094298600232e-08, + "loss": 1.1697, + "step": 6400 + }, + { + "epoch": 0.8677557107029079, + "grad_norm": 2.028703833735949, + "learning_rate": 9.026852647769822e-08, + "loss": 1.1349, + "step": 6401 + }, + { + "epoch": 0.8678912763505727, + "grad_norm": 1.76698013665226, + "learning_rate": 9.008628540392749e-08, + "loss": 1.1278, + "step": 6402 + }, + { + "epoch": 0.8680268419982377, + "grad_norm": 1.7879757371794536, + "learning_rate": 8.990421979983465e-08, + "loss": 1.1557, + "step": 6403 + }, + { + "epoch": 0.8681624076459026, + "grad_norm": 1.989340046430742, + "learning_rate": 8.972232970052873e-08, + "loss": 1.1216, + "step": 6404 + }, + { + "epoch": 0.8682979732935674, + "grad_norm": 1.782787088586076, + "learning_rate": 8.954061514108657e-08, + "loss": 1.1597, + "step": 6405 + }, + { + "epoch": 0.8684335389412323, + "grad_norm": 1.7513996925916937, + "learning_rate": 8.93590761565497e-08, + "loss": 1.1471, + "step": 6406 + }, + { + "epoch": 0.8685691045888971, + "grad_norm": 1.5457106643258431, + "learning_rate": 8.917771278192709e-08, + "loss": 1.1228, + "step": 6407 + }, + { + "epoch": 0.8687046702365621, + "grad_norm": 1.7618666465964525, + "learning_rate": 8.899652505219279e-08, + "loss": 1.0734, + "step": 6408 + }, + { + "epoch": 0.868840235884227, + "grad_norm": 5.1874254865094285, + "learning_rate": 8.881551300228785e-08, + "loss": 1.1244, + "step": 6409 + }, + { + "epoch": 0.8689758015318918, + "grad_norm": 2.3620641852772435, + "learning_rate": 8.863467666711865e-08, + "loss": 1.1262, + "step": 6410 + }, + { + "epoch": 0.8691113671795567, + "grad_norm": 2.655324727132359, + "learning_rate": 8.845401608155822e-08, + "loss": 1.1479, + "step": 6411 + }, + { + "epoch": 0.8692469328272215, + "grad_norm": 1.692606843146902, + "learning_rate": 8.827353128044535e-08, + "loss": 1.1562, + "step": 6412 + }, + { + "epoch": 0.8693824984748865, + "grad_norm": 1.8377327306897078, + "learning_rate": 8.809322229858529e-08, + "loss": 1.1265, + "step": 6413 + }, + { + "epoch": 0.8695180641225514, + "grad_norm": 2.6947676231757844, + "learning_rate": 8.791308917074925e-08, + "loss": 1.1436, + "step": 6414 + }, + { + "epoch": 0.8696536297702162, + "grad_norm": 2.2116898279113957, + "learning_rate": 8.773313193167431e-08, + "loss": 1.0977, + "step": 6415 + }, + { + "epoch": 0.8697891954178811, + "grad_norm": 1.6582952821541452, + "learning_rate": 8.755335061606383e-08, + "loss": 1.0865, + "step": 6416 + }, + { + "epoch": 0.869924761065546, + "grad_norm": 1.8544930496686594, + "learning_rate": 8.737374525858743e-08, + "loss": 1.1405, + "step": 6417 + }, + { + "epoch": 0.8700603267132109, + "grad_norm": 2.0228968406510592, + "learning_rate": 8.719431589388026e-08, + "loss": 1.0845, + "step": 6418 + }, + { + "epoch": 0.8701958923608758, + "grad_norm": 1.7360877045597363, + "learning_rate": 8.701506255654411e-08, + "loss": 1.1575, + "step": 6419 + }, + { + "epoch": 0.8703314580085406, + "grad_norm": 1.9700291786797395, + "learning_rate": 8.683598528114644e-08, + "loss": 1.1129, + "step": 6420 + }, + { + "epoch": 0.8704670236562055, + "grad_norm": 1.9096262675802422, + "learning_rate": 8.665708410222095e-08, + "loss": 1.1391, + "step": 6421 + }, + { + "epoch": 0.8706025893038704, + "grad_norm": 2.465454519869327, + "learning_rate": 8.647835905426726e-08, + "loss": 1.1359, + 
"step": 6422 + }, + { + "epoch": 0.8707381549515353, + "grad_norm": 5.225017206238359, + "learning_rate": 8.629981017175136e-08, + "loss": 1.1373, + "step": 6423 + }, + { + "epoch": 0.8708737205992002, + "grad_norm": 1.839816978124513, + "learning_rate": 8.61214374891045e-08, + "loss": 1.1901, + "step": 6424 + }, + { + "epoch": 0.871009286246865, + "grad_norm": 2.042411079942866, + "learning_rate": 8.59432410407248e-08, + "loss": 1.1793, + "step": 6425 + }, + { + "epoch": 0.87114485189453, + "grad_norm": 1.7334815080156525, + "learning_rate": 8.576522086097593e-08, + "loss": 1.1328, + "step": 6426 + }, + { + "epoch": 0.8712804175421948, + "grad_norm": 1.95895992997314, + "learning_rate": 8.55873769841876e-08, + "loss": 1.1145, + "step": 6427 + }, + { + "epoch": 0.8714159831898597, + "grad_norm": 1.8693976532888532, + "learning_rate": 8.540970944465575e-08, + "loss": 1.1327, + "step": 6428 + }, + { + "epoch": 0.8715515488375246, + "grad_norm": 1.5958874915520316, + "learning_rate": 8.523221827664206e-08, + "loss": 1.1345, + "step": 6429 + }, + { + "epoch": 0.8716871144851894, + "grad_norm": 2.0626016191590355, + "learning_rate": 8.505490351437438e-08, + "loss": 1.1304, + "step": 6430 + }, + { + "epoch": 0.8718226801328544, + "grad_norm": 1.828040802424155, + "learning_rate": 8.487776519204637e-08, + "loss": 1.0561, + "step": 6431 + }, + { + "epoch": 0.8719582457805192, + "grad_norm": 1.8124759363376035, + "learning_rate": 8.470080334381791e-08, + "loss": 1.1193, + "step": 6432 + }, + { + "epoch": 0.8720938114281841, + "grad_norm": 2.9629653713029476, + "learning_rate": 8.452401800381448e-08, + "loss": 1.1498, + "step": 6433 + }, + { + "epoch": 0.872229377075849, + "grad_norm": 2.672421978927302, + "learning_rate": 8.434740920612792e-08, + "loss": 1.0985, + "step": 6434 + }, + { + "epoch": 0.8723649427235138, + "grad_norm": 2.2946858429954116, + "learning_rate": 8.417097698481568e-08, + "loss": 1.131, + "step": 6435 + }, + { + "epoch": 0.8725005083711788, + "grad_norm": 1.6538508084236532, + "learning_rate": 8.399472137390152e-08, + "loss": 1.135, + "step": 6436 + }, + { + "epoch": 0.8726360740188436, + "grad_norm": 1.7191586861848696, + "learning_rate": 8.38186424073748e-08, + "loss": 1.1387, + "step": 6437 + }, + { + "epoch": 0.8727716396665085, + "grad_norm": 1.664695984829714, + "learning_rate": 8.364274011919114e-08, + "loss": 1.1394, + "step": 6438 + }, + { + "epoch": 0.8729072053141734, + "grad_norm": 1.9198441124150405, + "learning_rate": 8.346701454327143e-08, + "loss": 1.1767, + "step": 6439 + }, + { + "epoch": 0.8730427709618382, + "grad_norm": 1.6533982080318481, + "learning_rate": 8.329146571350365e-08, + "loss": 1.1278, + "step": 6440 + }, + { + "epoch": 0.8731783366095032, + "grad_norm": 4.208373676930938, + "learning_rate": 8.311609366374028e-08, + "loss": 1.1658, + "step": 6441 + }, + { + "epoch": 0.873313902257168, + "grad_norm": 2.164922492604288, + "learning_rate": 8.294089842780117e-08, + "loss": 1.1291, + "step": 6442 + }, + { + "epoch": 0.8734494679048329, + "grad_norm": 3.826807379393319, + "learning_rate": 8.27658800394706e-08, + "loss": 1.0853, + "step": 6443 + }, + { + "epoch": 0.8735850335524978, + "grad_norm": 1.749656638081257, + "learning_rate": 8.259103853250027e-08, + "loss": 1.1235, + "step": 6444 + }, + { + "epoch": 0.8737205992001627, + "grad_norm": 1.747059267138261, + "learning_rate": 8.241637394060619e-08, + "loss": 1.1236, + "step": 6445 + }, + { + "epoch": 0.8738561648478276, + "grad_norm": 1.7856003386082375, + "learning_rate": 8.224188629747175e-08, 
+ "loss": 1.0881, + "step": 6446 + }, + { + "epoch": 0.8739917304954924, + "grad_norm": 2.7039887116565833, + "learning_rate": 8.206757563674493e-08, + "loss": 1.1514, + "step": 6447 + }, + { + "epoch": 0.8741272961431573, + "grad_norm": 1.6562534908284436, + "learning_rate": 8.189344199204073e-08, + "loss": 1.1662, + "step": 6448 + }, + { + "epoch": 0.8742628617908222, + "grad_norm": 1.7498281442363768, + "learning_rate": 8.171948539693874e-08, + "loss": 1.1381, + "step": 6449 + }, + { + "epoch": 0.8743984274384871, + "grad_norm": 1.957646897840696, + "learning_rate": 8.154570588498599e-08, + "loss": 1.1354, + "step": 6450 + }, + { + "epoch": 0.874533993086152, + "grad_norm": 2.1346861163603745, + "learning_rate": 8.13721034896938e-08, + "loss": 1.117, + "step": 6451 + }, + { + "epoch": 0.8746695587338168, + "grad_norm": 1.7782920560225477, + "learning_rate": 8.119867824454018e-08, + "loss": 1.1549, + "step": 6452 + }, + { + "epoch": 0.8748051243814817, + "grad_norm": 2.585917265718191, + "learning_rate": 8.102543018296892e-08, + "loss": 1.1417, + "step": 6453 + }, + { + "epoch": 0.8749406900291467, + "grad_norm": 1.8424330122494461, + "learning_rate": 8.085235933838952e-08, + "loss": 1.1012, + "step": 6454 + }, + { + "epoch": 0.8750762556768115, + "grad_norm": 1.7549300894634328, + "learning_rate": 8.067946574417739e-08, + "loss": 1.0997, + "step": 6455 + }, + { + "epoch": 0.8752118213244764, + "grad_norm": 2.1083379016790214, + "learning_rate": 8.050674943367352e-08, + "loss": 1.1596, + "step": 6456 + }, + { + "epoch": 0.8753473869721412, + "grad_norm": 2.252809396009884, + "learning_rate": 8.033421044018496e-08, + "loss": 1.15, + "step": 6457 + }, + { + "epoch": 0.8754829526198061, + "grad_norm": 2.703081794641011, + "learning_rate": 8.016184879698462e-08, + "loss": 1.1404, + "step": 6458 + }, + { + "epoch": 0.8756185182674711, + "grad_norm": 1.995642659279966, + "learning_rate": 7.998966453731093e-08, + "loss": 1.1399, + "step": 6459 + }, + { + "epoch": 0.8757540839151359, + "grad_norm": 1.6484780989781318, + "learning_rate": 7.981765769436833e-08, + "loss": 1.1395, + "step": 6460 + }, + { + "epoch": 0.8758896495628008, + "grad_norm": 1.712596129098464, + "learning_rate": 7.964582830132704e-08, + "loss": 1.1274, + "step": 6461 + }, + { + "epoch": 0.8760252152104656, + "grad_norm": 2.4580521017153676, + "learning_rate": 7.94741763913227e-08, + "loss": 1.1698, + "step": 6462 + }, + { + "epoch": 0.8761607808581305, + "grad_norm": 1.6399134766925867, + "learning_rate": 7.930270199745748e-08, + "loss": 1.1398, + "step": 6463 + }, + { + "epoch": 0.8762963465057955, + "grad_norm": 2.9045496830493387, + "learning_rate": 7.913140515279837e-08, + "loss": 1.1309, + "step": 6464 + }, + { + "epoch": 0.8764319121534603, + "grad_norm": 1.853291485456398, + "learning_rate": 7.896028589037929e-08, + "loss": 1.148, + "step": 6465 + }, + { + "epoch": 0.8765674778011252, + "grad_norm": 1.7591583079051485, + "learning_rate": 7.87893442431985e-08, + "loss": 1.1579, + "step": 6466 + }, + { + "epoch": 0.8767030434487901, + "grad_norm": 2.45903421938785, + "learning_rate": 7.86185802442212e-08, + "loss": 1.1294, + "step": 6467 + }, + { + "epoch": 0.876838609096455, + "grad_norm": 1.822865532701106, + "learning_rate": 7.844799392637769e-08, + "loss": 1.1757, + "step": 6468 + }, + { + "epoch": 0.8769741747441199, + "grad_norm": 2.417590478302076, + "learning_rate": 7.827758532256435e-08, + "loss": 1.1259, + "step": 6469 + }, + { + "epoch": 0.8771097403917847, + "grad_norm": 2.150272970636816, + "learning_rate": 
7.810735446564298e-08, + "loss": 1.1693, + "step": 6470 + }, + { + "epoch": 0.8772453060394496, + "grad_norm": 2.0816057276308984, + "learning_rate": 7.793730138844134e-08, + "loss": 1.1531, + "step": 6471 + }, + { + "epoch": 0.8773808716871145, + "grad_norm": 1.6591630582596637, + "learning_rate": 7.776742612375275e-08, + "loss": 1.1265, + "step": 6472 + }, + { + "epoch": 0.8775164373347794, + "grad_norm": 3.0536936167163784, + "learning_rate": 7.759772870433645e-08, + "loss": 1.1219, + "step": 6473 + }, + { + "epoch": 0.8776520029824443, + "grad_norm": 2.5883130488882418, + "learning_rate": 7.742820916291714e-08, + "loss": 1.1584, + "step": 6474 + }, + { + "epoch": 0.8777875686301091, + "grad_norm": 3.4460120924764244, + "learning_rate": 7.725886753218536e-08, + "loss": 1.1225, + "step": 6475 + }, + { + "epoch": 0.877923134277774, + "grad_norm": 1.611288174099806, + "learning_rate": 7.708970384479729e-08, + "loss": 1.0814, + "step": 6476 + }, + { + "epoch": 0.878058699925439, + "grad_norm": 2.7711323805834653, + "learning_rate": 7.692071813337487e-08, + "loss": 1.1814, + "step": 6477 + }, + { + "epoch": 0.8781942655731038, + "grad_norm": 3.747710880594771, + "learning_rate": 7.675191043050556e-08, + "loss": 1.1443, + "step": 6478 + }, + { + "epoch": 0.8783298312207687, + "grad_norm": 1.7617536711277373, + "learning_rate": 7.658328076874287e-08, + "loss": 1.0902, + "step": 6479 + }, + { + "epoch": 0.8784653968684335, + "grad_norm": 2.6537771193229887, + "learning_rate": 7.641482918060504e-08, + "loss": 1.1921, + "step": 6480 + }, + { + "epoch": 0.8786009625160984, + "grad_norm": 1.6796081608072557, + "learning_rate": 7.624655569857751e-08, + "loss": 1.1006, + "step": 6481 + }, + { + "epoch": 0.8787365281637634, + "grad_norm": 1.6613723215359724, + "learning_rate": 7.607846035510957e-08, + "loss": 1.16, + "step": 6482 + }, + { + "epoch": 0.8788720938114282, + "grad_norm": 2.660110974262545, + "learning_rate": 7.591054318261802e-08, + "loss": 1.1796, + "step": 6483 + }, + { + "epoch": 0.8790076594590931, + "grad_norm": 2.1156025813259536, + "learning_rate": 7.574280421348356e-08, + "loss": 1.1657, + "step": 6484 + }, + { + "epoch": 0.8791432251067579, + "grad_norm": 1.7269722587174925, + "learning_rate": 7.557524348005395e-08, + "loss": 1.114, + "step": 6485 + }, + { + "epoch": 0.8792787907544228, + "grad_norm": 1.8842104957438535, + "learning_rate": 7.540786101464136e-08, + "loss": 1.1473, + "step": 6486 + }, + { + "epoch": 0.8794143564020878, + "grad_norm": 2.65859677639849, + "learning_rate": 7.524065684952475e-08, + "loss": 1.1478, + "step": 6487 + }, + { + "epoch": 0.8795499220497526, + "grad_norm": 1.8070017846741304, + "learning_rate": 7.507363101694775e-08, + "loss": 1.134, + "step": 6488 + }, + { + "epoch": 0.8796854876974175, + "grad_norm": 4.020223199639944, + "learning_rate": 7.490678354912006e-08, + "loss": 1.1729, + "step": 6489 + }, + { + "epoch": 0.8798210533450823, + "grad_norm": 2.0598342631581263, + "learning_rate": 7.474011447821704e-08, + "loss": 1.1356, + "step": 6490 + }, + { + "epoch": 0.8799566189927472, + "grad_norm": 15.825380685217736, + "learning_rate": 7.457362383637922e-08, + "loss": 1.1101, + "step": 6491 + }, + { + "epoch": 0.8800921846404122, + "grad_norm": 1.7537746165714558, + "learning_rate": 7.440731165571323e-08, + "loss": 1.1293, + "step": 6492 + }, + { + "epoch": 0.880227750288077, + "grad_norm": 1.668143101935234, + "learning_rate": 7.42411779682911e-08, + "loss": 1.1393, + "step": 6493 + }, + { + "epoch": 0.8803633159357419, + "grad_norm": 
1.5690818359806484, + "learning_rate": 7.407522280615019e-08, + "loss": 1.1292, + "step": 6494 + }, + { + "epoch": 0.8804988815834067, + "grad_norm": 1.4823237802353644, + "learning_rate": 7.39094462012938e-08, + "loss": 1.1852, + "step": 6495 + }, + { + "epoch": 0.8806344472310716, + "grad_norm": 3.0407982709052104, + "learning_rate": 7.374384818569069e-08, + "loss": 1.1428, + "step": 6496 + }, + { + "epoch": 0.8807700128787366, + "grad_norm": 1.8365776074610398, + "learning_rate": 7.357842879127474e-08, + "loss": 1.1465, + "step": 6497 + }, + { + "epoch": 0.8809055785264014, + "grad_norm": 1.9846734856987747, + "learning_rate": 7.341318804994645e-08, + "loss": 1.1587, + "step": 6498 + }, + { + "epoch": 0.8810411441740663, + "grad_norm": 1.654483831605577, + "learning_rate": 7.324812599357044e-08, + "loss": 1.1083, + "step": 6499 + }, + { + "epoch": 0.8811767098217311, + "grad_norm": 1.7875430980706335, + "learning_rate": 7.308324265397836e-08, + "loss": 1.1476, + "step": 6500 + }, + { + "epoch": 0.8813122754693961, + "grad_norm": 3.2934615211142644, + "learning_rate": 7.291853806296599e-08, + "loss": 1.1415, + "step": 6501 + }, + { + "epoch": 0.881447841117061, + "grad_norm": 1.914934823902239, + "learning_rate": 7.275401225229583e-08, + "loss": 1.1257, + "step": 6502 + }, + { + "epoch": 0.8815834067647258, + "grad_norm": 2.618572614627152, + "learning_rate": 7.258966525369492e-08, + "loss": 1.1563, + "step": 6503 + }, + { + "epoch": 0.8817189724123907, + "grad_norm": 1.737876806064117, + "learning_rate": 7.242549709885693e-08, + "loss": 1.0892, + "step": 6504 + }, + { + "epoch": 0.8818545380600555, + "grad_norm": 1.6641941863735326, + "learning_rate": 7.226150781943963e-08, + "loss": 1.1151, + "step": 6505 + }, + { + "epoch": 0.8819901037077205, + "grad_norm": 1.9977681429618914, + "learning_rate": 7.209769744706772e-08, + "loss": 1.0984, + "step": 6506 + }, + { + "epoch": 0.8821256693553854, + "grad_norm": 2.7593099042880653, + "learning_rate": 7.193406601333018e-08, + "loss": 1.1552, + "step": 6507 + }, + { + "epoch": 0.8822612350030502, + "grad_norm": 2.350168531900302, + "learning_rate": 7.177061354978242e-08, + "loss": 1.1416, + "step": 6508 + }, + { + "epoch": 0.8823968006507151, + "grad_norm": 2.1763178973000095, + "learning_rate": 7.160734008794489e-08, + "loss": 1.1387, + "step": 6509 + }, + { + "epoch": 0.8825323662983799, + "grad_norm": 2.278993588683687, + "learning_rate": 7.144424565930341e-08, + "loss": 1.1355, + "step": 6510 + }, + { + "epoch": 0.8826679319460449, + "grad_norm": 2.447666937518223, + "learning_rate": 7.128133029530969e-08, + "loss": 1.1083, + "step": 6511 + }, + { + "epoch": 0.8828034975937098, + "grad_norm": 2.114484884173671, + "learning_rate": 7.111859402738052e-08, + "loss": 1.1544, + "step": 6512 + }, + { + "epoch": 0.8829390632413746, + "grad_norm": 1.5719108514590139, + "learning_rate": 7.095603688689833e-08, + "loss": 1.1123, + "step": 6513 + }, + { + "epoch": 0.8830746288890395, + "grad_norm": 1.7742523309225513, + "learning_rate": 7.079365890521106e-08, + "loss": 1.1135, + "step": 6514 + }, + { + "epoch": 0.8832101945367044, + "grad_norm": 1.9651746407183057, + "learning_rate": 7.063146011363186e-08, + "loss": 1.1094, + "step": 6515 + }, + { + "epoch": 0.8833457601843693, + "grad_norm": 2.615209881918816, + "learning_rate": 7.046944054343961e-08, + "loss": 1.1547, + "step": 6516 + }, + { + "epoch": 0.8834813258320342, + "grad_norm": 1.907433134097656, + "learning_rate": 7.030760022587856e-08, + "loss": 1.1615, + "step": 6517 + }, + { + "epoch": 
0.883616891479699, + "grad_norm": 2.0790802240771304, + "learning_rate": 7.014593919215816e-08, + "loss": 1.1269, + "step": 6518 + }, + { + "epoch": 0.8837524571273639, + "grad_norm": 1.908075534540681, + "learning_rate": 6.998445747345371e-08, + "loss": 1.1272, + "step": 6519 + }, + { + "epoch": 0.8838880227750288, + "grad_norm": 2.1986979576862575, + "learning_rate": 6.982315510090542e-08, + "loss": 1.1604, + "step": 6520 + }, + { + "epoch": 0.8840235884226937, + "grad_norm": 3.3911291429686705, + "learning_rate": 6.966203210561927e-08, + "loss": 1.113, + "step": 6521 + }, + { + "epoch": 0.8841591540703586, + "grad_norm": 1.7389265840070545, + "learning_rate": 6.950108851866687e-08, + "loss": 1.1339, + "step": 6522 + }, + { + "epoch": 0.8842947197180234, + "grad_norm": 2.192035720373438, + "learning_rate": 6.934032437108439e-08, + "loss": 1.1524, + "step": 6523 + }, + { + "epoch": 0.8844302853656884, + "grad_norm": 3.066136874404219, + "learning_rate": 6.917973969387424e-08, + "loss": 1.1399, + "step": 6524 + }, + { + "epoch": 0.8845658510133532, + "grad_norm": 2.0224746005259373, + "learning_rate": 6.901933451800379e-08, + "loss": 1.1472, + "step": 6525 + }, + { + "epoch": 0.8847014166610181, + "grad_norm": 1.97895826145527, + "learning_rate": 6.885910887440593e-08, + "loss": 1.1356, + "step": 6526 + }, + { + "epoch": 0.884836982308683, + "grad_norm": 1.601090248128626, + "learning_rate": 6.869906279397897e-08, + "loss": 1.1434, + "step": 6527 + }, + { + "epoch": 0.8849725479563478, + "grad_norm": 2.1940856573417697, + "learning_rate": 6.853919630758653e-08, + "loss": 1.1499, + "step": 6528 + }, + { + "epoch": 0.8851081136040128, + "grad_norm": 1.824841480578177, + "learning_rate": 6.837950944605763e-08, + "loss": 1.1116, + "step": 6529 + }, + { + "epoch": 0.8852436792516776, + "grad_norm": 1.7979957538415279, + "learning_rate": 6.822000224018653e-08, + "loss": 1.1198, + "step": 6530 + }, + { + "epoch": 0.8853792448993425, + "grad_norm": 2.003561416760149, + "learning_rate": 6.806067472073296e-08, + "loss": 1.152, + "step": 6531 + }, + { + "epoch": 0.8855148105470074, + "grad_norm": 2.996048103030528, + "learning_rate": 6.790152691842199e-08, + "loss": 1.1658, + "step": 6532 + }, + { + "epoch": 0.8856503761946722, + "grad_norm": 1.788291637102259, + "learning_rate": 6.774255886394397e-08, + "loss": 1.1058, + "step": 6533 + }, + { + "epoch": 0.8857859418423372, + "grad_norm": 2.753692137293281, + "learning_rate": 6.758377058795473e-08, + "loss": 1.1255, + "step": 6534 + }, + { + "epoch": 0.885921507490002, + "grad_norm": 2.479453658977365, + "learning_rate": 6.742516212107541e-08, + "loss": 1.1605, + "step": 6535 + }, + { + "epoch": 0.8860570731376669, + "grad_norm": 1.6564878468605677, + "learning_rate": 6.726673349389201e-08, + "loss": 1.1567, + "step": 6536 + }, + { + "epoch": 0.8861926387853318, + "grad_norm": 3.3952464847088426, + "learning_rate": 6.710848473695674e-08, + "loss": 1.1343, + "step": 6537 + }, + { + "epoch": 0.8863282044329966, + "grad_norm": 1.6629858712160441, + "learning_rate": 6.69504158807862e-08, + "loss": 1.1342, + "step": 6538 + }, + { + "epoch": 0.8864637700806616, + "grad_norm": 2.996834246674295, + "learning_rate": 6.679252695586312e-08, + "loss": 1.1402, + "step": 6539 + }, + { + "epoch": 0.8865993357283264, + "grad_norm": 1.8320408840940319, + "learning_rate": 6.663481799263471e-08, + "loss": 1.0753, + "step": 6540 + }, + { + "epoch": 0.8867349013759913, + "grad_norm": 1.799416343602753, + "learning_rate": 6.647728902151428e-08, + "loss": 1.1264, + "step": 
6541 + }, + { + "epoch": 0.8868704670236562, + "grad_norm": 5.644921978614418, + "learning_rate": 6.631994007287966e-08, + "loss": 1.1412, + "step": 6542 + }, + { + "epoch": 0.887006032671321, + "grad_norm": 1.9467233823520789, + "learning_rate": 6.616277117707492e-08, + "loss": 1.1327, + "step": 6543 + }, + { + "epoch": 0.887141598318986, + "grad_norm": 2.752631504938017, + "learning_rate": 6.600578236440812e-08, + "loss": 1.212, + "step": 6544 + }, + { + "epoch": 0.8872771639666509, + "grad_norm": 1.8650024815627557, + "learning_rate": 6.584897366515407e-08, + "loss": 1.1349, + "step": 6545 + }, + { + "epoch": 0.8874127296143157, + "grad_norm": 1.8721084327969897, + "learning_rate": 6.569234510955135e-08, + "loss": 1.1636, + "step": 6546 + }, + { + "epoch": 0.8875482952619806, + "grad_norm": 1.8147260254436504, + "learning_rate": 6.553589672780524e-08, + "loss": 1.1354, + "step": 6547 + }, + { + "epoch": 0.8876838609096455, + "grad_norm": 2.2622499676130774, + "learning_rate": 6.537962855008483e-08, + "loss": 1.1304, + "step": 6548 + }, + { + "epoch": 0.8878194265573104, + "grad_norm": 3.6992883622526893, + "learning_rate": 6.522354060652602e-08, + "loss": 1.149, + "step": 6549 + }, + { + "epoch": 0.8879549922049753, + "grad_norm": 1.7523591993937109, + "learning_rate": 6.50676329272285e-08, + "loss": 1.109, + "step": 6550 + }, + { + "epoch": 0.8880905578526401, + "grad_norm": 1.8396252667655812, + "learning_rate": 6.491190554225811e-08, + "loss": 1.0851, + "step": 6551 + }, + { + "epoch": 0.888226123500305, + "grad_norm": 2.227804033864536, + "learning_rate": 6.475635848164562e-08, + "loss": 1.1275, + "step": 6552 + }, + { + "epoch": 0.8883616891479699, + "grad_norm": 1.9614080338482218, + "learning_rate": 6.460099177538703e-08, + "loss": 1.1461, + "step": 6553 + }, + { + "epoch": 0.8884972547956348, + "grad_norm": 2.8523487398204694, + "learning_rate": 6.444580545344358e-08, + "loss": 1.1215, + "step": 6554 + }, + { + "epoch": 0.8886328204432997, + "grad_norm": 1.5581155224962762, + "learning_rate": 6.429079954574168e-08, + "loss": 1.1296, + "step": 6555 + }, + { + "epoch": 0.8887683860909645, + "grad_norm": 2.189555394286035, + "learning_rate": 6.413597408217309e-08, + "loss": 1.1358, + "step": 6556 + }, + { + "epoch": 0.8889039517386295, + "grad_norm": 1.870775255331432, + "learning_rate": 6.398132909259457e-08, + "loss": 1.0901, + "step": 6557 + }, + { + "epoch": 0.8890395173862943, + "grad_norm": 1.7080183908461486, + "learning_rate": 6.382686460682851e-08, + "loss": 1.143, + "step": 6558 + }, + { + "epoch": 0.8891750830339592, + "grad_norm": 1.6640441388971368, + "learning_rate": 6.367258065466152e-08, + "loss": 1.1282, + "step": 6559 + }, + { + "epoch": 0.8893106486816241, + "grad_norm": 2.4341167603733562, + "learning_rate": 6.35184772658468e-08, + "loss": 1.1042, + "step": 6560 + }, + { + "epoch": 0.8894462143292889, + "grad_norm": 2.2262508271214743, + "learning_rate": 6.336455447010126e-08, + "loss": 1.1583, + "step": 6561 + }, + { + "epoch": 0.8895817799769539, + "grad_norm": 1.7132675678293754, + "learning_rate": 6.321081229710834e-08, + "loss": 1.1387, + "step": 6562 + }, + { + "epoch": 0.8897173456246187, + "grad_norm": 1.699293696407783, + "learning_rate": 6.305725077651558e-08, + "loss": 1.1474, + "step": 6563 + }, + { + "epoch": 0.8898529112722836, + "grad_norm": 2.3138363174585588, + "learning_rate": 6.290386993793617e-08, + "loss": 1.1239, + "step": 6564 + }, + { + "epoch": 0.8899884769199485, + "grad_norm": 7.938951443039966, + "learning_rate": 
6.275066981094857e-08, + "loss": 1.1251, + "step": 6565 + }, + { + "epoch": 0.8901240425676133, + "grad_norm": 1.731927865901536, + "learning_rate": 6.259765042509602e-08, + "loss": 1.1302, + "step": 6566 + }, + { + "epoch": 0.8902596082152783, + "grad_norm": 1.8137787024852847, + "learning_rate": 6.244481180988714e-08, + "loss": 1.1249, + "step": 6567 + }, + { + "epoch": 0.8903951738629431, + "grad_norm": 1.5768021847889244, + "learning_rate": 6.229215399479582e-08, + "loss": 1.1555, + "step": 6568 + }, + { + "epoch": 0.890530739510608, + "grad_norm": 2.1022876264655435, + "learning_rate": 6.213967700926071e-08, + "loss": 1.1296, + "step": 6569 + }, + { + "epoch": 0.8906663051582729, + "grad_norm": 1.707687202205395, + "learning_rate": 6.198738088268585e-08, + "loss": 1.1416, + "step": 6570 + }, + { + "epoch": 0.8908018708059378, + "grad_norm": 1.525734732124275, + "learning_rate": 6.183526564444042e-08, + "loss": 1.1454, + "step": 6571 + }, + { + "epoch": 0.8909374364536027, + "grad_norm": 1.8254856744949666, + "learning_rate": 6.16833313238585e-08, + "loss": 1.1194, + "step": 6572 + }, + { + "epoch": 0.8910730021012675, + "grad_norm": 2.124334112601339, + "learning_rate": 6.153157795023956e-08, + "loss": 1.1732, + "step": 6573 + }, + { + "epoch": 0.8912085677489324, + "grad_norm": 1.630922183688529, + "learning_rate": 6.138000555284806e-08, + "loss": 1.1212, + "step": 6574 + }, + { + "epoch": 0.8913441333965973, + "grad_norm": 3.094909473005925, + "learning_rate": 6.12286141609134e-08, + "loss": 1.1421, + "step": 6575 + }, + { + "epoch": 0.8914796990442622, + "grad_norm": 1.9596434735988246, + "learning_rate": 6.107740380363036e-08, + "loss": 1.1387, + "step": 6576 + }, + { + "epoch": 0.8916152646919271, + "grad_norm": 1.6462386860690867, + "learning_rate": 6.092637451015847e-08, + "loss": 1.2163, + "step": 6577 + }, + { + "epoch": 0.8917508303395919, + "grad_norm": 1.9798856243725116, + "learning_rate": 6.07755263096229e-08, + "loss": 1.1617, + "step": 6578 + }, + { + "epoch": 0.8918863959872568, + "grad_norm": 3.4886553427978915, + "learning_rate": 6.062485923111293e-08, + "loss": 1.1305, + "step": 6579 + }, + { + "epoch": 0.8920219616349218, + "grad_norm": 2.215085539219178, + "learning_rate": 6.047437330368421e-08, + "loss": 1.1555, + "step": 6580 + }, + { + "epoch": 0.8921575272825866, + "grad_norm": 2.1693624595950016, + "learning_rate": 6.032406855635619e-08, + "loss": 1.1912, + "step": 6581 + }, + { + "epoch": 0.8922930929302515, + "grad_norm": 2.086360772035384, + "learning_rate": 6.017394501811445e-08, + "loss": 1.1865, + "step": 6582 + }, + { + "epoch": 0.8924286585779163, + "grad_norm": 1.9044077046260828, + "learning_rate": 6.002400271790864e-08, + "loss": 1.0685, + "step": 6583 + }, + { + "epoch": 0.8925642242255812, + "grad_norm": 2.718787072555157, + "learning_rate": 5.987424168465439e-08, + "loss": 1.1633, + "step": 6584 + }, + { + "epoch": 0.8926997898732462, + "grad_norm": 1.8032154851226005, + "learning_rate": 5.972466194723159e-08, + "loss": 1.1509, + "step": 6585 + }, + { + "epoch": 0.892835355520911, + "grad_norm": 1.81157372697028, + "learning_rate": 5.957526353448572e-08, + "loss": 1.1664, + "step": 6586 + }, + { + "epoch": 0.8929709211685759, + "grad_norm": 3.1461924368521355, + "learning_rate": 5.9426046475226975e-08, + "loss": 1.1433, + "step": 6587 + }, + { + "epoch": 0.8931064868162407, + "grad_norm": 1.776911051631953, + "learning_rate": 5.9277010798230666e-08, + "loss": 1.1449, + "step": 6588 + }, + { + "epoch": 0.8932420524639056, + "grad_norm": 
1.6399770431389162, + "learning_rate": 5.912815653223724e-08, + "loss": 1.0813, + "step": 6589 + }, + { + "epoch": 0.8933776181115706, + "grad_norm": 1.7016064454636086, + "learning_rate": 5.897948370595207e-08, + "loss": 1.1082, + "step": 6590 + }, + { + "epoch": 0.8935131837592354, + "grad_norm": 3.5845028027790433, + "learning_rate": 5.8830992348045563e-08, + "loss": 1.1306, + "step": 6591 + }, + { + "epoch": 0.8936487494069003, + "grad_norm": 1.9945663226670722, + "learning_rate": 5.8682682487152915e-08, + "loss": 1.1133, + "step": 6592 + }, + { + "epoch": 0.8937843150545651, + "grad_norm": 3.2356562446346175, + "learning_rate": 5.8534554151874805e-08, + "loss": 1.1439, + "step": 6593 + }, + { + "epoch": 0.89391988070223, + "grad_norm": 1.6965032774543347, + "learning_rate": 5.8386607370776274e-08, + "loss": 1.1237, + "step": 6594 + }, + { + "epoch": 0.894055446349895, + "grad_norm": 2.1087063876931573, + "learning_rate": 5.823884217238817e-08, + "loss": 1.1156, + "step": 6595 + }, + { + "epoch": 0.8941910119975598, + "grad_norm": 1.9755978937932837, + "learning_rate": 5.809125858520514e-08, + "loss": 1.1152, + "step": 6596 + }, + { + "epoch": 0.8943265776452247, + "grad_norm": 1.757876591401235, + "learning_rate": 5.794385663768819e-08, + "loss": 1.1371, + "step": 6597 + }, + { + "epoch": 0.8944621432928895, + "grad_norm": 2.0345296005755253, + "learning_rate": 5.7796636358262155e-08, + "loss": 1.138, + "step": 6598 + }, + { + "epoch": 0.8945977089405545, + "grad_norm": 1.8091866729637018, + "learning_rate": 5.764959777531775e-08, + "loss": 1.1408, + "step": 6599 + }, + { + "epoch": 0.8947332745882194, + "grad_norm": 1.8656612439043583, + "learning_rate": 5.750274091720964e-08, + "loss": 1.1412, + "step": 6600 + }, + { + "epoch": 0.8948688402358842, + "grad_norm": 2.378265630060403, + "learning_rate": 5.7356065812258604e-08, + "loss": 1.1238, + "step": 6601 + }, + { + "epoch": 0.8950044058835491, + "grad_norm": 1.7607569548391533, + "learning_rate": 5.720957248874925e-08, + "loss": 1.1316, + "step": 6602 + }, + { + "epoch": 0.8951399715312139, + "grad_norm": 1.731516356115962, + "learning_rate": 5.706326097493219e-08, + "loss": 1.1048, + "step": 6603 + }, + { + "epoch": 0.8952755371788789, + "grad_norm": 1.7360666607420787, + "learning_rate": 5.691713129902187e-08, + "loss": 1.1437, + "step": 6604 + }, + { + "epoch": 0.8954111028265438, + "grad_norm": 1.6870836886615719, + "learning_rate": 5.677118348919874e-08, + "loss": 1.1271, + "step": 6605 + }, + { + "epoch": 0.8955466684742086, + "grad_norm": 1.6535337401169583, + "learning_rate": 5.662541757360739e-08, + "loss": 1.151, + "step": 6606 + }, + { + "epoch": 0.8956822341218735, + "grad_norm": 1.8917660025169987, + "learning_rate": 5.6479833580357796e-08, + "loss": 1.1336, + "step": 6607 + }, + { + "epoch": 0.8958177997695383, + "grad_norm": 2.3574670123271377, + "learning_rate": 5.633443153752448e-08, + "loss": 1.1002, + "step": 6608 + }, + { + "epoch": 0.8959533654172033, + "grad_norm": 1.859487138721219, + "learning_rate": 5.6189211473147256e-08, + "loss": 1.1258, + "step": 6609 + }, + { + "epoch": 0.8960889310648682, + "grad_norm": 1.6828467303582308, + "learning_rate": 5.60441734152306e-08, + "loss": 1.1537, + "step": 6610 + }, + { + "epoch": 0.896224496712533, + "grad_norm": 1.941420502964534, + "learning_rate": 5.5899317391744025e-08, + "loss": 1.1464, + "step": 6611 + }, + { + "epoch": 0.8963600623601979, + "grad_norm": 2.3126265533807855, + "learning_rate": 5.575464343062175e-08, + "loss": 1.1542, + "step": 6612 + }, + { + 
"epoch": 0.8964956280078628, + "grad_norm": 2.9879123217769843, + "learning_rate": 5.561015155976312e-08, + "loss": 1.1537, + "step": 6613 + }, + { + "epoch": 0.8966311936555277, + "grad_norm": 2.0612251808162534, + "learning_rate": 5.546584180703207e-08, + "loss": 1.1561, + "step": 6614 + }, + { + "epoch": 0.8967667593031926, + "grad_norm": 1.862534153261607, + "learning_rate": 5.5321714200257884e-08, + "loss": 1.1223, + "step": 6615 + }, + { + "epoch": 0.8969023249508574, + "grad_norm": 1.8822944636441217, + "learning_rate": 5.5177768767234236e-08, + "loss": 1.123, + "step": 6616 + }, + { + "epoch": 0.8970378905985223, + "grad_norm": 1.9569473239671715, + "learning_rate": 5.50340055357198e-08, + "loss": 1.1445, + "step": 6617 + }, + { + "epoch": 0.8971734562461872, + "grad_norm": 2.354764987177449, + "learning_rate": 5.4890424533438394e-08, + "loss": 1.1376, + "step": 6618 + }, + { + "epoch": 0.8973090218938521, + "grad_norm": 1.9025231715447235, + "learning_rate": 5.4747025788078546e-08, + "loss": 1.1693, + "step": 6619 + }, + { + "epoch": 0.897444587541517, + "grad_norm": 1.5893995717027236, + "learning_rate": 5.460380932729303e-08, + "loss": 1.1371, + "step": 6620 + }, + { + "epoch": 0.8975801531891818, + "grad_norm": 2.6239962199826343, + "learning_rate": 5.4460775178700736e-08, + "loss": 1.111, + "step": 6621 + }, + { + "epoch": 0.8977157188368468, + "grad_norm": 2.298915627580696, + "learning_rate": 5.431792336988417e-08, + "loss": 1.1295, + "step": 6622 + }, + { + "epoch": 0.8978512844845117, + "grad_norm": 1.7387633093914854, + "learning_rate": 5.417525392839129e-08, + "loss": 1.1583, + "step": 6623 + }, + { + "epoch": 0.8979868501321765, + "grad_norm": 1.6690530045072867, + "learning_rate": 5.4032766881734745e-08, + "loss": 1.1219, + "step": 6624 + }, + { + "epoch": 0.8981224157798414, + "grad_norm": 2.2031981665906573, + "learning_rate": 5.3890462257392246e-08, + "loss": 1.1371, + "step": 6625 + }, + { + "epoch": 0.8982579814275062, + "grad_norm": 1.6231712047336095, + "learning_rate": 5.3748340082805824e-08, + "loss": 1.127, + "step": 6626 + }, + { + "epoch": 0.8983935470751712, + "grad_norm": 1.745747735170393, + "learning_rate": 5.360640038538278e-08, + "loss": 1.1212, + "step": 6627 + }, + { + "epoch": 0.8985291127228361, + "grad_norm": 1.6966457687880216, + "learning_rate": 5.3464643192495104e-08, + "loss": 1.1492, + "step": 6628 + }, + { + "epoch": 0.8986646783705009, + "grad_norm": 1.723631814815818, + "learning_rate": 5.33230685314795e-08, + "loss": 1.1192, + "step": 6629 + }, + { + "epoch": 0.8988002440181658, + "grad_norm": 2.321686097519481, + "learning_rate": 5.3181676429637447e-08, + "loss": 1.1068, + "step": 6630 + }, + { + "epoch": 0.8989358096658306, + "grad_norm": 1.6923789046262308, + "learning_rate": 5.304046691423536e-08, + "loss": 1.1435, + "step": 6631 + }, + { + "epoch": 0.8990713753134956, + "grad_norm": 2.309557836247023, + "learning_rate": 5.289944001250446e-08, + "loss": 1.164, + "step": 6632 + }, + { + "epoch": 0.8992069409611605, + "grad_norm": 2.0585721584948176, + "learning_rate": 5.275859575164054e-08, + "loss": 1.1867, + "step": 6633 + }, + { + "epoch": 0.8993425066088253, + "grad_norm": 2.3722781670914945, + "learning_rate": 5.2617934158804557e-08, + "loss": 1.158, + "step": 6634 + }, + { + "epoch": 0.8994780722564902, + "grad_norm": 1.7682808199280535, + "learning_rate": 5.247745526112146e-08, + "loss": 1.1421, + "step": 6635 + }, + { + "epoch": 0.899613637904155, + "grad_norm": 2.3593079564831156, + "learning_rate": 5.233715908568215e-08, + 
"loss": 1.122, + "step": 6636 + }, + { + "epoch": 0.89974920355182, + "grad_norm": 1.7275945470365435, + "learning_rate": 5.219704565954097e-08, + "loss": 1.1234, + "step": 6637 + }, + { + "epoch": 0.8998847691994849, + "grad_norm": 3.0519549920112072, + "learning_rate": 5.2057115009718434e-08, + "loss": 1.1504, + "step": 6638 + }, + { + "epoch": 0.9000203348471497, + "grad_norm": 1.8173020718100557, + "learning_rate": 5.191736716319828e-08, + "loss": 1.16, + "step": 6639 + }, + { + "epoch": 0.9001559004948146, + "grad_norm": 1.7239638711748004, + "learning_rate": 5.17778021469305e-08, + "loss": 1.1528, + "step": 6640 + }, + { + "epoch": 0.9002914661424795, + "grad_norm": 1.8103000578313733, + "learning_rate": 5.1638419987828365e-08, + "loss": 1.1489, + "step": 6641 + }, + { + "epoch": 0.9004270317901444, + "grad_norm": 2.2245950545600643, + "learning_rate": 5.149922071277146e-08, + "loss": 1.0947, + "step": 6642 + }, + { + "epoch": 0.9005625974378093, + "grad_norm": 2.6871025901088754, + "learning_rate": 5.136020434860244e-08, + "loss": 1.1131, + "step": 6643 + }, + { + "epoch": 0.9006981630854741, + "grad_norm": 1.8487771105279036, + "learning_rate": 5.122137092213019e-08, + "loss": 1.1335, + "step": 6644 + }, + { + "epoch": 0.900833728733139, + "grad_norm": 2.3664450257730154, + "learning_rate": 5.108272046012718e-08, + "loss": 1.1772, + "step": 6645 + }, + { + "epoch": 0.9009692943808039, + "grad_norm": 2.026134079257399, + "learning_rate": 5.094425298933136e-08, + "loss": 1.1628, + "step": 6646 + }, + { + "epoch": 0.9011048600284688, + "grad_norm": 1.7585068220152709, + "learning_rate": 5.080596853644492e-08, + "loss": 1.1084, + "step": 6647 + }, + { + "epoch": 0.9012404256761337, + "grad_norm": 2.1189971760224235, + "learning_rate": 5.066786712813498e-08, + "loss": 1.1733, + "step": 6648 + }, + { + "epoch": 0.9013759913237985, + "grad_norm": 2.0238727074249496, + "learning_rate": 5.052994879103323e-08, + "loss": 1.139, + "step": 6649 + }, + { + "epoch": 0.9015115569714635, + "grad_norm": 2.1351305583368076, + "learning_rate": 5.0392213551736176e-08, + "loss": 1.1357, + "step": 6650 + }, + { + "epoch": 0.9016471226191283, + "grad_norm": 1.7154285718391653, + "learning_rate": 5.0254661436805015e-08, + "loss": 1.1385, + "step": 6651 + }, + { + "epoch": 0.9017826882667932, + "grad_norm": 1.963005739881801, + "learning_rate": 5.0117292472765635e-08, + "loss": 1.1263, + "step": 6652 + }, + { + "epoch": 0.9019182539144581, + "grad_norm": 1.736027938415211, + "learning_rate": 4.9980106686108416e-08, + "loss": 1.1519, + "step": 6653 + }, + { + "epoch": 0.9020538195621229, + "grad_norm": 1.8306759757714315, + "learning_rate": 4.9843104103288625e-08, + "loss": 1.1221, + "step": 6654 + }, + { + "epoch": 0.9021893852097879, + "grad_norm": 1.8532542764763318, + "learning_rate": 4.9706284750726135e-08, + "loss": 1.1411, + "step": 6655 + }, + { + "epoch": 0.9023249508574527, + "grad_norm": 2.098411706455503, + "learning_rate": 4.956964865480551e-08, + "loss": 1.1311, + "step": 6656 + }, + { + "epoch": 0.9024605165051176, + "grad_norm": 1.7431500243874734, + "learning_rate": 4.9433195841875995e-08, + "loss": 1.1206, + "step": 6657 + }, + { + "epoch": 0.9025960821527825, + "grad_norm": 1.6952908325892682, + "learning_rate": 4.9296926338251e-08, + "loss": 1.0982, + "step": 6658 + }, + { + "epoch": 0.9027316478004473, + "grad_norm": 1.5796687856348512, + "learning_rate": 4.916084017020972e-08, + "loss": 1.1244, + "step": 6659 + }, + { + "epoch": 0.9028672134481123, + "grad_norm": 1.7577827531138124, + 
"learning_rate": 4.9024937363994714e-08, + "loss": 1.1885, + "step": 6660 + }, + { + "epoch": 0.9030027790957771, + "grad_norm": 1.7390054401446535, + "learning_rate": 4.888921794581424e-08, + "loss": 1.1274, + "step": 6661 + }, + { + "epoch": 0.903138344743442, + "grad_norm": 1.6856054831505694, + "learning_rate": 4.875368194184026e-08, + "loss": 1.1243, + "step": 6662 + }, + { + "epoch": 0.9032739103911069, + "grad_norm": 2.2853822516100557, + "learning_rate": 4.8618329378210085e-08, + "loss": 1.1656, + "step": 6663 + }, + { + "epoch": 0.9034094760387718, + "grad_norm": 1.9345093206253006, + "learning_rate": 4.848316028102539e-08, + "loss": 1.1297, + "step": 6664 + }, + { + "epoch": 0.9035450416864367, + "grad_norm": 1.8683874020510776, + "learning_rate": 4.834817467635233e-08, + "loss": 1.164, + "step": 6665 + }, + { + "epoch": 0.9036806073341015, + "grad_norm": 1.854262606286493, + "learning_rate": 4.821337259022196e-08, + "loss": 1.0837, + "step": 6666 + }, + { + "epoch": 0.9038161729817664, + "grad_norm": 1.8333406241112102, + "learning_rate": 4.807875404862971e-08, + "loss": 1.1454, + "step": 6667 + }, + { + "epoch": 0.9039517386294313, + "grad_norm": 1.9806527548373372, + "learning_rate": 4.794431907753571e-08, + "loss": 1.1246, + "step": 6668 + }, + { + "epoch": 0.9040873042770962, + "grad_norm": 2.299833609640615, + "learning_rate": 4.781006770286478e-08, + "loss": 1.151, + "step": 6669 + }, + { + "epoch": 0.9042228699247611, + "grad_norm": 1.7364376403884978, + "learning_rate": 4.767599995050609e-08, + "loss": 1.1491, + "step": 6670 + }, + { + "epoch": 0.9043584355724259, + "grad_norm": 1.584234057952154, + "learning_rate": 4.7542115846313734e-08, + "loss": 1.1154, + "step": 6671 + }, + { + "epoch": 0.9044940012200908, + "grad_norm": 1.5648652550606708, + "learning_rate": 4.740841541610596e-08, + "loss": 1.1326, + "step": 6672 + }, + { + "epoch": 0.9046295668677558, + "grad_norm": 1.7835695816855208, + "learning_rate": 4.727489868566603e-08, + "loss": 1.149, + "step": 6673 + }, + { + "epoch": 0.9047651325154206, + "grad_norm": 1.7652105146796984, + "learning_rate": 4.714156568074157e-08, + "loss": 1.1349, + "step": 6674 + }, + { + "epoch": 0.9049006981630855, + "grad_norm": 2.239524077208324, + "learning_rate": 4.700841642704478e-08, + "loss": 1.1229, + "step": 6675 + }, + { + "epoch": 0.9050362638107503, + "grad_norm": 3.199177820411182, + "learning_rate": 4.687545095025225e-08, + "loss": 1.1078, + "step": 6676 + }, + { + "epoch": 0.9051718294584152, + "grad_norm": 1.8660126754235602, + "learning_rate": 4.6742669276005786e-08, + "loss": 1.1642, + "step": 6677 + }, + { + "epoch": 0.9053073951060802, + "grad_norm": 1.923243061284446, + "learning_rate": 4.661007142991069e-08, + "loss": 1.1152, + "step": 6678 + }, + { + "epoch": 0.905442960753745, + "grad_norm": 3.730911195337825, + "learning_rate": 4.6477657437537953e-08, + "loss": 1.1006, + "step": 6679 + }, + { + "epoch": 0.9055785264014099, + "grad_norm": 3.690428975327078, + "learning_rate": 4.634542732442204e-08, + "loss": 1.1282, + "step": 6680 + }, + { + "epoch": 0.9057140920490747, + "grad_norm": 1.7304968224329345, + "learning_rate": 4.62133811160631e-08, + "loss": 1.1213, + "step": 6681 + }, + { + "epoch": 0.9058496576967396, + "grad_norm": 1.7510334547655368, + "learning_rate": 4.608151883792466e-08, + "loss": 1.0972, + "step": 6682 + }, + { + "epoch": 0.9059852233444046, + "grad_norm": 1.9058729126314788, + "learning_rate": 4.5949840515435715e-08, + "loss": 1.1669, + "step": 6683 + }, + { + "epoch": 0.9061207889920694, 
+ "grad_norm": 2.2011611555978488, + "learning_rate": 4.581834617398916e-08, + "loss": 1.1201, + "step": 6684 + }, + { + "epoch": 0.9062563546397343, + "grad_norm": 1.7202256590857774, + "learning_rate": 4.568703583894262e-08, + "loss": 1.1095, + "step": 6685 + }, + { + "epoch": 0.9063919202873991, + "grad_norm": 2.2890908640002343, + "learning_rate": 4.555590953561839e-08, + "loss": 1.1361, + "step": 6686 + }, + { + "epoch": 0.906527485935064, + "grad_norm": 2.1365483409288495, + "learning_rate": 4.542496728930301e-08, + "loss": 1.1272, + "step": 6687 + }, + { + "epoch": 0.906663051582729, + "grad_norm": 1.6485012261521166, + "learning_rate": 4.529420912524773e-08, + "loss": 1.1104, + "step": 6688 + }, + { + "epoch": 0.9067986172303938, + "grad_norm": 3.389315682837512, + "learning_rate": 4.516363506866827e-08, + "loss": 1.1127, + "step": 6689 + }, + { + "epoch": 0.9069341828780587, + "grad_norm": 2.1028663023278624, + "learning_rate": 4.503324514474483e-08, + "loss": 1.16, + "step": 6690 + }, + { + "epoch": 0.9070697485257235, + "grad_norm": 1.7187201784905757, + "learning_rate": 4.4903039378621945e-08, + "loss": 1.1042, + "step": 6691 + }, + { + "epoch": 0.9072053141733885, + "grad_norm": 1.8931452796830697, + "learning_rate": 4.477301779540887e-08, + "loss": 1.1448, + "step": 6692 + }, + { + "epoch": 0.9073408798210534, + "grad_norm": 2.188479136327171, + "learning_rate": 4.4643180420179113e-08, + "loss": 1.1197, + "step": 6693 + }, + { + "epoch": 0.9074764454687182, + "grad_norm": 1.7264198386628034, + "learning_rate": 4.451352727797109e-08, + "loss": 1.1114, + "step": 6694 + }, + { + "epoch": 0.9076120111163831, + "grad_norm": 1.6235782545948454, + "learning_rate": 4.4384058393786895e-08, + "loss": 1.1578, + "step": 6695 + }, + { + "epoch": 0.9077475767640479, + "grad_norm": 3.435551300599336, + "learning_rate": 4.425477379259424e-08, + "loss": 1.126, + "step": 6696 + }, + { + "epoch": 0.9078831424117129, + "grad_norm": 3.0014782373326767, + "learning_rate": 4.412567349932384e-08, + "loss": 1.1509, + "step": 6697 + }, + { + "epoch": 0.9080187080593778, + "grad_norm": 1.6276857092930952, + "learning_rate": 4.399675753887244e-08, + "loss": 1.1398, + "step": 6698 + }, + { + "epoch": 0.9081542737070426, + "grad_norm": 1.8086736734419662, + "learning_rate": 4.386802593609984e-08, + "loss": 1.1539, + "step": 6699 + }, + { + "epoch": 0.9082898393547075, + "grad_norm": 2.7401486873946843, + "learning_rate": 4.37394787158315e-08, + "loss": 1.118, + "step": 6700 + }, + { + "epoch": 0.9084254050023723, + "grad_norm": 2.2251449161672534, + "learning_rate": 4.3611115902856044e-08, + "loss": 1.1441, + "step": 6701 + }, + { + "epoch": 0.9085609706500373, + "grad_norm": 1.971573713303151, + "learning_rate": 4.3482937521928e-08, + "loss": 1.1613, + "step": 6702 + }, + { + "epoch": 0.9086965362977022, + "grad_norm": 2.5312224507395458, + "learning_rate": 4.335494359776493e-08, + "loss": 1.1531, + "step": 6703 + }, + { + "epoch": 0.908832101945367, + "grad_norm": 2.029078211829093, + "learning_rate": 4.322713415504975e-08, + "loss": 1.1199, + "step": 6704 + }, + { + "epoch": 0.9089676675930319, + "grad_norm": 2.3247817531358907, + "learning_rate": 4.3099509218429416e-08, + "loss": 1.1056, + "step": 6705 + }, + { + "epoch": 0.9091032332406969, + "grad_norm": 2.1365702865126552, + "learning_rate": 4.297206881251547e-08, + "loss": 1.1376, + "step": 6706 + }, + { + "epoch": 0.9092387988883617, + "grad_norm": 2.1250640793205022, + "learning_rate": 4.284481296188369e-08, + "loss": 1.1532, + "step": 6707 + }, 
+ { + "epoch": 0.9093743645360266, + "grad_norm": 4.45908423288674, + "learning_rate": 4.271774169107445e-08, + "loss": 1.1219, + "step": 6708 + }, + { + "epoch": 0.9095099301836914, + "grad_norm": 1.8942913999265902, + "learning_rate": 4.259085502459236e-08, + "loss": 1.1496, + "step": 6709 + }, + { + "epoch": 0.9096454958313563, + "grad_norm": 2.224857629050478, + "learning_rate": 4.246415298690653e-08, + "loss": 1.1341, + "step": 6710 + }, + { + "epoch": 0.9097810614790213, + "grad_norm": 3.370239913957756, + "learning_rate": 4.2337635602450514e-08, + "loss": 1.1087, + "step": 6711 + }, + { + "epoch": 0.9099166271266861, + "grad_norm": 2.051396100650791, + "learning_rate": 4.2211302895622136e-08, + "loss": 1.1361, + "step": 6712 + }, + { + "epoch": 0.910052192774351, + "grad_norm": 1.8729967885577476, + "learning_rate": 4.208515489078368e-08, + "loss": 1.1241, + "step": 6713 + }, + { + "epoch": 0.9101877584220158, + "grad_norm": 2.6766205245820878, + "learning_rate": 4.19591916122618e-08, + "loss": 1.1442, + "step": 6714 + }, + { + "epoch": 0.9103233240696808, + "grad_norm": 2.842391656991963, + "learning_rate": 4.18334130843474e-08, + "loss": 1.1198, + "step": 6715 + }, + { + "epoch": 0.9104588897173457, + "grad_norm": 1.939529707770244, + "learning_rate": 4.1707819331296076e-08, + "loss": 1.1715, + "step": 6716 + }, + { + "epoch": 0.9105944553650105, + "grad_norm": 4.389129614873665, + "learning_rate": 4.158241037732746e-08, + "loss": 1.146, + "step": 6717 + }, + { + "epoch": 0.9107300210126754, + "grad_norm": 3.2246100130502358, + "learning_rate": 4.1457186246625863e-08, + "loss": 1.1404, + "step": 6718 + }, + { + "epoch": 0.9108655866603402, + "grad_norm": 1.6882622199562056, + "learning_rate": 4.133214696333942e-08, + "loss": 1.1248, + "step": 6719 + }, + { + "epoch": 0.9110011523080052, + "grad_norm": 1.7982039672286283, + "learning_rate": 4.1207292551581284e-08, + "loss": 1.1367, + "step": 6720 + }, + { + "epoch": 0.9111367179556701, + "grad_norm": 1.9894656594772344, + "learning_rate": 4.1082623035428424e-08, + "loss": 1.1476, + "step": 6721 + }, + { + "epoch": 0.9112722836033349, + "grad_norm": 1.7034763843194192, + "learning_rate": 4.095813843892259e-08, + "loss": 1.1274, + "step": 6722 + }, + { + "epoch": 0.9114078492509998, + "grad_norm": 1.8586218884658252, + "learning_rate": 4.08338387860695e-08, + "loss": 1.1491, + "step": 6723 + }, + { + "epoch": 0.9115434148986646, + "grad_norm": 2.525110793124299, + "learning_rate": 4.0709724100839395e-08, + "loss": 1.1541, + "step": 6724 + }, + { + "epoch": 0.9116789805463296, + "grad_norm": 3.152723550151487, + "learning_rate": 4.058579440716681e-08, + "loss": 1.1186, + "step": 6725 + }, + { + "epoch": 0.9118145461939945, + "grad_norm": 5.891425790530256, + "learning_rate": 4.046204972895062e-08, + "loss": 1.1497, + "step": 6726 + }, + { + "epoch": 0.9119501118416593, + "grad_norm": 2.1838556790804766, + "learning_rate": 4.0338490090053966e-08, + "loss": 1.153, + "step": 6727 + }, + { + "epoch": 0.9120856774893242, + "grad_norm": 1.92062773796606, + "learning_rate": 4.0215115514304456e-08, + "loss": 1.1506, + "step": 6728 + }, + { + "epoch": 0.912221243136989, + "grad_norm": 1.9982728606552977, + "learning_rate": 4.009192602549383e-08, + "loss": 1.1499, + "step": 6729 + }, + { + "epoch": 0.912356808784654, + "grad_norm": 2.5402870776406266, + "learning_rate": 3.996892164737819e-08, + "loss": 1.1419, + "step": 6730 + }, + { + "epoch": 0.9124923744323189, + "grad_norm": 1.95310097232544, + "learning_rate": 3.9846102403678027e-08, + 
"loss": 1.1215, + "step": 6731 + }, + { + "epoch": 0.9126279400799837, + "grad_norm": 1.9404171046598466, + "learning_rate": 3.972346831807793e-08, + "loss": 1.1826, + "step": 6732 + }, + { + "epoch": 0.9127635057276486, + "grad_norm": 10.177903367878796, + "learning_rate": 3.960101941422711e-08, + "loss": 1.1235, + "step": 6733 + }, + { + "epoch": 0.9128990713753135, + "grad_norm": 2.319576829504312, + "learning_rate": 3.947875571573867e-08, + "loss": 1.1401, + "step": 6734 + }, + { + "epoch": 0.9130346370229784, + "grad_norm": 6.382250682287488, + "learning_rate": 3.93566772461904e-08, + "loss": 1.154, + "step": 6735 + }, + { + "epoch": 0.9131702026706433, + "grad_norm": 3.106609920965618, + "learning_rate": 3.923478402912395e-08, + "loss": 1.1428, + "step": 6736 + }, + { + "epoch": 0.9133057683183081, + "grad_norm": 2.150533133388726, + "learning_rate": 3.911307608804582e-08, + "loss": 1.103, + "step": 6737 + }, + { + "epoch": 0.913441333965973, + "grad_norm": 1.5631573668021042, + "learning_rate": 3.899155344642579e-08, + "loss": 1.1731, + "step": 6738 + }, + { + "epoch": 0.9135768996136379, + "grad_norm": 1.8374147334362587, + "learning_rate": 3.887021612769936e-08, + "loss": 1.1423, + "step": 6739 + }, + { + "epoch": 0.9137124652613028, + "grad_norm": 1.5724103422967883, + "learning_rate": 3.8749064155264685e-08, + "loss": 1.1267, + "step": 6740 + }, + { + "epoch": 0.9138480309089677, + "grad_norm": 1.5937695420165996, + "learning_rate": 3.862809755248564e-08, + "loss": 1.1376, + "step": 6741 + }, + { + "epoch": 0.9139835965566325, + "grad_norm": 1.9586743358670977, + "learning_rate": 3.850731634268911e-08, + "loss": 1.1185, + "step": 6742 + }, + { + "epoch": 0.9141191622042975, + "grad_norm": 2.0042061126677035, + "learning_rate": 3.838672054916725e-08, + "loss": 1.1577, + "step": 6743 + }, + { + "epoch": 0.9142547278519623, + "grad_norm": 2.0969132607015277, + "learning_rate": 3.826631019517568e-08, + "loss": 1.1373, + "step": 6744 + }, + { + "epoch": 0.9143902934996272, + "grad_norm": 2.5644128888915603, + "learning_rate": 3.814608530393493e-08, + "loss": 1.1771, + "step": 6745 + }, + { + "epoch": 0.9145258591472921, + "grad_norm": 2.6036624015569116, + "learning_rate": 3.802604589862912e-08, + "loss": 1.1467, + "step": 6746 + }, + { + "epoch": 0.9146614247949569, + "grad_norm": 1.670205811967, + "learning_rate": 3.790619200240697e-08, + "loss": 1.098, + "step": 6747 + }, + { + "epoch": 0.9147969904426219, + "grad_norm": 1.7127575021340928, + "learning_rate": 3.7786523638381306e-08, + "loss": 1.1417, + "step": 6748 + }, + { + "epoch": 0.9149325560902867, + "grad_norm": 1.6776741841280904, + "learning_rate": 3.766704082962935e-08, + "loss": 1.1599, + "step": 6749 + }, + { + "epoch": 0.9150681217379516, + "grad_norm": 1.807898442740659, + "learning_rate": 3.754774359919244e-08, + "loss": 1.1622, + "step": 6750 + }, + { + "epoch": 0.9152036873856165, + "grad_norm": 1.6467925965655623, + "learning_rate": 3.7428631970076065e-08, + "loss": 1.1574, + "step": 6751 + }, + { + "epoch": 0.9153392530332813, + "grad_norm": 1.917071630693933, + "learning_rate": 3.730970596524985e-08, + "loss": 1.1353, + "step": 6752 + }, + { + "epoch": 0.9154748186809463, + "grad_norm": 2.046378326439752, + "learning_rate": 3.719096560764778e-08, + "loss": 1.1107, + "step": 6753 + }, + { + "epoch": 0.9156103843286111, + "grad_norm": 2.0481151952768686, + "learning_rate": 3.707241092016811e-08, + "loss": 1.1154, + "step": 6754 + }, + { + "epoch": 0.915745949976276, + "grad_norm": 1.7127272574589487, + 
"learning_rate": 3.69540419256732e-08, + "loss": 1.1679, + "step": 6755 + }, + { + "epoch": 0.9158815156239409, + "grad_norm": 1.7772297205583394, + "learning_rate": 3.683585864698946e-08, + "loss": 1.17, + "step": 6756 + }, + { + "epoch": 0.9160170812716057, + "grad_norm": 1.6159590743781607, + "learning_rate": 3.6717861106907447e-08, + "loss": 1.1487, + "step": 6757 + }, + { + "epoch": 0.9161526469192707, + "grad_norm": 1.8259846693584356, + "learning_rate": 3.66000493281825e-08, + "loss": 1.1042, + "step": 6758 + }, + { + "epoch": 0.9162882125669355, + "grad_norm": 1.9623037182725351, + "learning_rate": 3.648242333353324e-08, + "loss": 1.142, + "step": 6759 + }, + { + "epoch": 0.9164237782146004, + "grad_norm": 3.2422427216206575, + "learning_rate": 3.6364983145643066e-08, + "loss": 1.1425, + "step": 6760 + }, + { + "epoch": 0.9165593438622653, + "grad_norm": 2.1503040391854777, + "learning_rate": 3.624772878715954e-08, + "loss": 1.1378, + "step": 6761 + }, + { + "epoch": 0.9166949095099302, + "grad_norm": 2.3688964824272816, + "learning_rate": 3.6130660280694005e-08, + "loss": 1.0864, + "step": 6762 + }, + { + "epoch": 0.9168304751575951, + "grad_norm": 2.2323044522078486, + "learning_rate": 3.6013777648822406e-08, + "loss": 1.0992, + "step": 6763 + }, + { + "epoch": 0.9169660408052599, + "grad_norm": 1.811161660560687, + "learning_rate": 3.58970809140845e-08, + "loss": 1.1435, + "step": 6764 + }, + { + "epoch": 0.9171016064529248, + "grad_norm": 1.7565840607616288, + "learning_rate": 3.5780570098984273e-08, + "loss": 1.1552, + "step": 6765 + }, + { + "epoch": 0.9172371721005897, + "grad_norm": 2.8560792460807276, + "learning_rate": 3.5664245225990206e-08, + "loss": 1.142, + "step": 6766 + }, + { + "epoch": 0.9173727377482546, + "grad_norm": 3.052114712161826, + "learning_rate": 3.554810631753436e-08, + "loss": 1.1406, + "step": 6767 + }, + { + "epoch": 0.9175083033959195, + "grad_norm": 1.579131655898127, + "learning_rate": 3.543215339601324e-08, + "loss": 1.1301, + "step": 6768 + }, + { + "epoch": 0.9176438690435843, + "grad_norm": 1.6184266285785587, + "learning_rate": 3.531638648378754e-08, + "loss": 1.1077, + "step": 6769 + }, + { + "epoch": 0.9177794346912492, + "grad_norm": 2.237218027919577, + "learning_rate": 3.520080560318195e-08, + "loss": 1.1424, + "step": 6770 + }, + { + "epoch": 0.9179150003389142, + "grad_norm": 1.7733242148302064, + "learning_rate": 3.508541077648541e-08, + "loss": 1.1859, + "step": 6771 + }, + { + "epoch": 0.918050565986579, + "grad_norm": 1.8756202306463075, + "learning_rate": 3.497020202595069e-08, + "loss": 1.1288, + "step": 6772 + }, + { + "epoch": 0.9181861316342439, + "grad_norm": 1.5612943356511677, + "learning_rate": 3.485517937379512e-08, + "loss": 1.1188, + "step": 6773 + }, + { + "epoch": 0.9183216972819087, + "grad_norm": 1.9822898368698996, + "learning_rate": 3.474034284219995e-08, + "loss": 1.1259, + "step": 6774 + }, + { + "epoch": 0.9184572629295736, + "grad_norm": 1.8980818055545816, + "learning_rate": 3.462569245331004e-08, + "loss": 1.1383, + "step": 6775 + }, + { + "epoch": 0.9185928285772386, + "grad_norm": 1.7659213500819209, + "learning_rate": 3.451122822923547e-08, + "loss": 1.1124, + "step": 6776 + }, + { + "epoch": 0.9187283942249034, + "grad_norm": 2.1420286865167, + "learning_rate": 3.4396950192049134e-08, + "loss": 1.0907, + "step": 6777 + }, + { + "epoch": 0.9188639598725683, + "grad_norm": 1.7397366423522125, + "learning_rate": 3.4282858363789194e-08, + "loss": 1.1104, + "step": 6778 + }, + { + "epoch": 
0.9189995255202331, + "grad_norm": 4.048698982777152, + "learning_rate": 3.4168952766456924e-08, + "loss": 1.134, + "step": 6779 + }, + { + "epoch": 0.919135091167898, + "grad_norm": 2.3218781437866847, + "learning_rate": 3.405523342201855e-08, + "loss": 1.1227, + "step": 6780 + }, + { + "epoch": 0.919270656815563, + "grad_norm": 2.97876038378886, + "learning_rate": 3.39417003524034e-08, + "loss": 1.1656, + "step": 6781 + }, + { + "epoch": 0.9194062224632278, + "grad_norm": 1.8402537013654527, + "learning_rate": 3.3828353579505975e-08, + "loss": 1.1236, + "step": 6782 + }, + { + "epoch": 0.9195417881108927, + "grad_norm": 1.751309305858762, + "learning_rate": 3.3715193125184005e-08, + "loss": 1.16, + "step": 6783 + }, + { + "epoch": 0.9196773537585576, + "grad_norm": 2.044409515282348, + "learning_rate": 3.3602219011259595e-08, + "loss": 1.1229, + "step": 6784 + }, + { + "epoch": 0.9198129194062225, + "grad_norm": 1.951260092938747, + "learning_rate": 3.3489431259518975e-08, + "loss": 1.1342, + "step": 6785 + }, + { + "epoch": 0.9199484850538874, + "grad_norm": 2.186052273908093, + "learning_rate": 3.337682989171242e-08, + "loss": 1.1753, + "step": 6786 + }, + { + "epoch": 0.9200840507015522, + "grad_norm": 2.427809461362353, + "learning_rate": 3.326441492955412e-08, + "loss": 1.0976, + "step": 6787 + }, + { + "epoch": 0.9202196163492171, + "grad_norm": 1.990865647139709, + "learning_rate": 3.3152186394722506e-08, + "loss": 1.1411, + "step": 6788 + }, + { + "epoch": 0.920355181996882, + "grad_norm": 2.2732316454494046, + "learning_rate": 3.304014430885982e-08, + "loss": 1.1544, + "step": 6789 + }, + { + "epoch": 0.9204907476445469, + "grad_norm": 1.6686973102120475, + "learning_rate": 3.292828869357267e-08, + "loss": 1.1686, + "step": 6790 + }, + { + "epoch": 0.9206263132922118, + "grad_norm": 1.5426195116262198, + "learning_rate": 3.281661957043147e-08, + "loss": 1.1588, + "step": 6791 + }, + { + "epoch": 0.9207618789398766, + "grad_norm": 1.7696359493689175, + "learning_rate": 3.270513696097055e-08, + "loss": 1.1492, + "step": 6792 + }, + { + "epoch": 0.9208974445875415, + "grad_norm": 1.750511394979674, + "learning_rate": 3.2593840886688815e-08, + "loss": 1.1289, + "step": 6793 + }, + { + "epoch": 0.9210330102352065, + "grad_norm": 2.848300440391529, + "learning_rate": 3.248273136904844e-08, + "loss": 1.1257, + "step": 6794 + }, + { + "epoch": 0.9211685758828713, + "grad_norm": 1.98093897571148, + "learning_rate": 3.23718084294764e-08, + "loss": 1.1397, + "step": 6795 + }, + { + "epoch": 0.9213041415305362, + "grad_norm": 1.6123166048598938, + "learning_rate": 3.226107208936279e-08, + "loss": 1.1115, + "step": 6796 + }, + { + "epoch": 0.921439707178201, + "grad_norm": 2.0196969107777507, + "learning_rate": 3.2150522370062886e-08, + "loss": 1.1136, + "step": 6797 + }, + { + "epoch": 0.9215752728258659, + "grad_norm": 1.8613624302227993, + "learning_rate": 3.204015929289483e-08, + "loss": 1.1422, + "step": 6798 + }, + { + "epoch": 0.9217108384735309, + "grad_norm": 1.7810486443904543, + "learning_rate": 3.1929982879141613e-08, + "loss": 1.1419, + "step": 6799 + }, + { + "epoch": 0.9218464041211957, + "grad_norm": 1.7129776309861038, + "learning_rate": 3.181999315004946e-08, + "loss": 1.1091, + "step": 6800 + }, + { + "epoch": 0.9219819697688606, + "grad_norm": 1.7204309514291232, + "learning_rate": 3.171019012682952e-08, + "loss": 1.1579, + "step": 6801 + }, + { + "epoch": 0.9221175354165254, + "grad_norm": 2.22523272633456, + "learning_rate": 3.160057383065606e-08, + "loss": 1.1164, + 
"step": 6802 + }, + { + "epoch": 0.9222531010641903, + "grad_norm": 2.0148327423351775, + "learning_rate": 3.149114428266786e-08, + "loss": 1.1498, + "step": 6803 + }, + { + "epoch": 0.9223886667118553, + "grad_norm": 1.776210426508428, + "learning_rate": 3.138190150396758e-08, + "loss": 1.1331, + "step": 6804 + }, + { + "epoch": 0.9225242323595201, + "grad_norm": 1.7950243100049472, + "learning_rate": 3.1272845515621816e-08, + "loss": 1.1786, + "step": 6805 + }, + { + "epoch": 0.922659798007185, + "grad_norm": 1.9109759083167919, + "learning_rate": 3.116397633866108e-08, + "loss": 1.1465, + "step": 6806 + }, + { + "epoch": 0.9227953636548498, + "grad_norm": 1.771218505107996, + "learning_rate": 3.1055293994080024e-08, + "loss": 1.1497, + "step": 6807 + }, + { + "epoch": 0.9229309293025147, + "grad_norm": 2.437395680128072, + "learning_rate": 3.09467985028371e-08, + "loss": 1.1814, + "step": 6808 + }, + { + "epoch": 0.9230664949501797, + "grad_norm": 2.2110223094988495, + "learning_rate": 3.08384898858548e-08, + "loss": 1.1417, + "step": 6809 + }, + { + "epoch": 0.9232020605978445, + "grad_norm": 2.0199221953603197, + "learning_rate": 3.073036816401975e-08, + "loss": 1.1444, + "step": 6810 + }, + { + "epoch": 0.9233376262455094, + "grad_norm": 4.3340064602782675, + "learning_rate": 3.062243335818215e-08, + "loss": 1.116, + "step": 6811 + }, + { + "epoch": 0.9234731918931742, + "grad_norm": 1.726484029838967, + "learning_rate": 3.051468548915648e-08, + "loss": 1.1371, + "step": 6812 + }, + { + "epoch": 0.9236087575408392, + "grad_norm": 1.8185202444177868, + "learning_rate": 3.04071245777211e-08, + "loss": 1.1411, + "step": 6813 + }, + { + "epoch": 0.9237443231885041, + "grad_norm": 3.3429491674859237, + "learning_rate": 3.0299750644618205e-08, + "loss": 1.1076, + "step": 6814 + }, + { + "epoch": 0.9238798888361689, + "grad_norm": 2.5038977779091525, + "learning_rate": 3.019256371055423e-08, + "loss": 1.1458, + "step": 6815 + }, + { + "epoch": 0.9240154544838338, + "grad_norm": 2.4657391865545706, + "learning_rate": 3.0085563796198866e-08, + "loss": 1.1735, + "step": 6816 + }, + { + "epoch": 0.9241510201314986, + "grad_norm": 1.9884787925324277, + "learning_rate": 2.997875092218671e-08, + "loss": 1.1331, + "step": 6817 + }, + { + "epoch": 0.9242865857791636, + "grad_norm": 1.7716327427950362, + "learning_rate": 2.987212510911541e-08, + "loss": 1.1348, + "step": 6818 + }, + { + "epoch": 0.9244221514268285, + "grad_norm": 6.269151175891285, + "learning_rate": 2.976568637754717e-08, + "loss": 1.1293, + "step": 6819 + }, + { + "epoch": 0.9245577170744933, + "grad_norm": 2.047245238254759, + "learning_rate": 2.9659434748007696e-08, + "loss": 1.0671, + "step": 6820 + }, + { + "epoch": 0.9246932827221582, + "grad_norm": 1.6816327095543238, + "learning_rate": 2.9553370240986808e-08, + "loss": 1.116, + "step": 6821 + }, + { + "epoch": 0.924828848369823, + "grad_norm": 2.327560928863812, + "learning_rate": 2.944749287693815e-08, + "loss": 1.1438, + "step": 6822 + }, + { + "epoch": 0.924964414017488, + "grad_norm": 2.9062100778391637, + "learning_rate": 2.9341802676279505e-08, + "loss": 1.1396, + "step": 6823 + }, + { + "epoch": 0.9250999796651529, + "grad_norm": 1.8039429394412034, + "learning_rate": 2.923629965939234e-08, + "loss": 1.1216, + "step": 6824 + }, + { + "epoch": 0.9252355453128177, + "grad_norm": 1.8127809456494959, + "learning_rate": 2.913098384662205e-08, + "loss": 1.0967, + "step": 6825 + }, + { + "epoch": 0.9253711109604826, + "grad_norm": 1.8277794425893759, + "learning_rate": 
2.902585525827783e-08, + "loss": 1.1411, + "step": 6826 + }, + { + "epoch": 0.9255066766081474, + "grad_norm": 3.8828209027243474, + "learning_rate": 2.8920913914633138e-08, + "loss": 1.1316, + "step": 6827 + }, + { + "epoch": 0.9256422422558124, + "grad_norm": 3.178378394793029, + "learning_rate": 2.881615983592489e-08, + "loss": 1.1547, + "step": 6828 + }, + { + "epoch": 0.9257778079034773, + "grad_norm": 1.6133305815922394, + "learning_rate": 2.8711593042354154e-08, + "loss": 1.1213, + "step": 6829 + }, + { + "epoch": 0.9259133735511421, + "grad_norm": 1.8087989045534467, + "learning_rate": 2.8607213554086018e-08, + "loss": 1.1318, + "step": 6830 + }, + { + "epoch": 0.926048939198807, + "grad_norm": 2.866951810474464, + "learning_rate": 2.8503021391248718e-08, + "loss": 1.1466, + "step": 6831 + }, + { + "epoch": 0.9261845048464719, + "grad_norm": 2.082200122855003, + "learning_rate": 2.839901657393551e-08, + "loss": 1.1625, + "step": 6832 + }, + { + "epoch": 0.9263200704941368, + "grad_norm": 1.5106208523740738, + "learning_rate": 2.829519912220235e-08, + "loss": 1.1207, + "step": 6833 + }, + { + "epoch": 0.9264556361418017, + "grad_norm": 1.5158302703618614, + "learning_rate": 2.819156905607012e-08, + "loss": 1.1832, + "step": 6834 + }, + { + "epoch": 0.9265912017894665, + "grad_norm": 2.084230626346103, + "learning_rate": 2.8088126395522495e-08, + "loss": 1.1349, + "step": 6835 + }, + { + "epoch": 0.9267267674371314, + "grad_norm": 1.8562191838890199, + "learning_rate": 2.7984871160508185e-08, + "loss": 1.0957, + "step": 6836 + }, + { + "epoch": 0.9268623330847963, + "grad_norm": 2.088886053574346, + "learning_rate": 2.7881803370938595e-08, + "loss": 1.1272, + "step": 6837 + }, + { + "epoch": 0.9269978987324612, + "grad_norm": 1.5848438232636406, + "learning_rate": 2.777892304669005e-08, + "loss": 1.1504, + "step": 6838 + }, + { + "epoch": 0.9271334643801261, + "grad_norm": 1.831691446207229, + "learning_rate": 2.7676230207601793e-08, + "loss": 1.1905, + "step": 6839 + }, + { + "epoch": 0.9272690300277909, + "grad_norm": 3.28745843023518, + "learning_rate": 2.757372487347753e-08, + "loss": 1.1293, + "step": 6840 + }, + { + "epoch": 0.9274045956754559, + "grad_norm": 1.6548176560435084, + "learning_rate": 2.747140706408446e-08, + "loss": 1.1547, + "step": 6841 + }, + { + "epoch": 0.9275401613231207, + "grad_norm": 2.128323728474654, + "learning_rate": 2.7369276799154017e-08, + "loss": 1.156, + "step": 6842 + }, + { + "epoch": 0.9276757269707856, + "grad_norm": 1.9018231023899768, + "learning_rate": 2.7267334098381e-08, + "loss": 1.1291, + "step": 6843 + }, + { + "epoch": 0.9278112926184505, + "grad_norm": 1.8560482886284062, + "learning_rate": 2.7165578981424354e-08, + "loss": 1.1583, + "step": 6844 + }, + { + "epoch": 0.9279468582661153, + "grad_norm": 1.95529296343071, + "learning_rate": 2.70640114679066e-08, + "loss": 1.1368, + "step": 6845 + }, + { + "epoch": 0.9280824239137803, + "grad_norm": 1.6269638532701818, + "learning_rate": 2.696263157741441e-08, + "loss": 1.1686, + "step": 6846 + }, + { + "epoch": 0.9282179895614451, + "grad_norm": 1.675691674172511, + "learning_rate": 2.6861439329498026e-08, + "loss": 1.1527, + "step": 6847 + }, + { + "epoch": 0.92835355520911, + "grad_norm": 1.9707502021410623, + "learning_rate": 2.6760434743671623e-08, + "loss": 1.1375, + "step": 6848 + }, + { + "epoch": 0.9284891208567749, + "grad_norm": 1.7404616024522532, + "learning_rate": 2.665961783941306e-08, + "loss": 1.151, + "step": 6849 + }, + { + "epoch": 0.9286246865044397, + "grad_norm": 
1.835193279327664, + "learning_rate": 2.6558988636164127e-08, + "loss": 1.1391, + "step": 6850 + }, + { + "epoch": 0.9287602521521047, + "grad_norm": 1.6991882002803618, + "learning_rate": 2.645854715333029e-08, + "loss": 1.1539, + "step": 6851 + }, + { + "epoch": 0.9288958177997695, + "grad_norm": 1.962438970804226, + "learning_rate": 2.6358293410281062e-08, + "loss": 1.1386, + "step": 6852 + }, + { + "epoch": 0.9290313834474344, + "grad_norm": 2.178803325814055, + "learning_rate": 2.6258227426349533e-08, + "loss": 1.1405, + "step": 6853 + }, + { + "epoch": 0.9291669490950993, + "grad_norm": 1.762549554054437, + "learning_rate": 2.6158349220832375e-08, + "loss": 1.1296, + "step": 6854 + }, + { + "epoch": 0.9293025147427642, + "grad_norm": 3.4493844744436912, + "learning_rate": 2.605865881299074e-08, + "loss": 1.1039, + "step": 6855 + }, + { + "epoch": 0.9294380803904291, + "grad_norm": 1.925040637453337, + "learning_rate": 2.5959156222048805e-08, + "loss": 1.1568, + "step": 6856 + }, + { + "epoch": 0.9295736460380939, + "grad_norm": 4.34685512108383, + "learning_rate": 2.585984146719511e-08, + "loss": 1.1309, + "step": 6857 + }, + { + "epoch": 0.9297092116857588, + "grad_norm": 3.3031479786372784, + "learning_rate": 2.5760714567581554e-08, + "loss": 1.1608, + "step": 6858 + }, + { + "epoch": 0.9298447773334237, + "grad_norm": 1.739838771242186, + "learning_rate": 2.566177554232396e-08, + "loss": 1.1134, + "step": 6859 + }, + { + "epoch": 0.9299803429810886, + "grad_norm": 1.9571368010333459, + "learning_rate": 2.5563024410501954e-08, + "loss": 1.1505, + "step": 6860 + }, + { + "epoch": 0.9301159086287535, + "grad_norm": 1.717404957919265, + "learning_rate": 2.546446119115908e-08, + "loss": 1.1669, + "step": 6861 + }, + { + "epoch": 0.9302514742764184, + "grad_norm": 2.537868329941339, + "learning_rate": 2.5366085903302247e-08, + "loss": 1.1656, + "step": 6862 + }, + { + "epoch": 0.9303870399240832, + "grad_norm": 1.9643660254556317, + "learning_rate": 2.5267898565902503e-08, + "loss": 1.0956, + "step": 6863 + }, + { + "epoch": 0.9305226055717482, + "grad_norm": 1.7869932097113084, + "learning_rate": 2.5169899197894363e-08, + "loss": 1.1183, + "step": 6864 + }, + { + "epoch": 0.930658171219413, + "grad_norm": 1.901733352324411, + "learning_rate": 2.507208781817638e-08, + "loss": 1.1464, + "step": 6865 + }, + { + "epoch": 0.9307937368670779, + "grad_norm": 2.3651970921871714, + "learning_rate": 2.4974464445610688e-08, + "loss": 1.1511, + "step": 6866 + }, + { + "epoch": 0.9309293025147428, + "grad_norm": 1.8103947369626654, + "learning_rate": 2.4877029099023116e-08, + "loss": 1.1262, + "step": 6867 + }, + { + "epoch": 0.9310648681624076, + "grad_norm": 1.8280264985419534, + "learning_rate": 2.4779781797203303e-08, + "loss": 1.1345, + "step": 6868 + }, + { + "epoch": 0.9312004338100726, + "grad_norm": 1.8218317969386808, + "learning_rate": 2.468272255890469e-08, + "loss": 1.1009, + "step": 6869 + }, + { + "epoch": 0.9313359994577374, + "grad_norm": 1.7932746614402184, + "learning_rate": 2.4585851402844305e-08, + "loss": 1.1107, + "step": 6870 + }, + { + "epoch": 0.9314715651054023, + "grad_norm": 2.542615210765025, + "learning_rate": 2.4489168347703093e-08, + "loss": 1.1533, + "step": 6871 + }, + { + "epoch": 0.9316071307530672, + "grad_norm": 1.7745052020229946, + "learning_rate": 2.4392673412125476e-08, + "loss": 1.1304, + "step": 6872 + }, + { + "epoch": 0.931742696400732, + "grad_norm": 2.078624031567002, + "learning_rate": 2.429636661472001e-08, + "loss": 1.1403, + "step": 6873 + }, + { 
+ "epoch": 0.931878262048397, + "grad_norm": 2.7216472384038726, + "learning_rate": 2.4200247974058175e-08, + "loss": 1.1289, + "step": 6874 + }, + { + "epoch": 0.9320138276960618, + "grad_norm": 1.711170297561451, + "learning_rate": 2.4104317508676363e-08, + "loss": 1.1503, + "step": 6875 + }, + { + "epoch": 0.9321493933437267, + "grad_norm": 1.862530584499513, + "learning_rate": 2.4008575237073335e-08, + "loss": 1.156, + "step": 6876 + }, + { + "epoch": 0.9322849589913916, + "grad_norm": 2.2103852909149557, + "learning_rate": 2.3913021177712876e-08, + "loss": 1.1451, + "step": 6877 + }, + { + "epoch": 0.9324205246390564, + "grad_norm": 2.4456831800011845, + "learning_rate": 2.3817655349021247e-08, + "loss": 1.1445, + "step": 6878 + }, + { + "epoch": 0.9325560902867214, + "grad_norm": 1.9492494143784334, + "learning_rate": 2.3722477769389515e-08, + "loss": 1.1304, + "step": 6879 + }, + { + "epoch": 0.9326916559343862, + "grad_norm": 2.4696454206086953, + "learning_rate": 2.362748845717155e-08, + "loss": 1.143, + "step": 6880 + }, + { + "epoch": 0.9328272215820511, + "grad_norm": 2.363083109531296, + "learning_rate": 2.3532687430685373e-08, + "loss": 1.1087, + "step": 6881 + }, + { + "epoch": 0.932962787229716, + "grad_norm": 1.9165571429815318, + "learning_rate": 2.3438074708212795e-08, + "loss": 1.144, + "step": 6882 + }, + { + "epoch": 0.9330983528773809, + "grad_norm": 1.6980258655677667, + "learning_rate": 2.3343650307998896e-08, + "loss": 1.1076, + "step": 6883 + }, + { + "epoch": 0.9332339185250458, + "grad_norm": 2.2308221747683175, + "learning_rate": 2.3249414248252775e-08, + "loss": 1.1799, + "step": 6884 + }, + { + "epoch": 0.9333694841727106, + "grad_norm": 1.6699595482536234, + "learning_rate": 2.3155366547147115e-08, + "loss": 1.1463, + "step": 6885 + }, + { + "epoch": 0.9335050498203755, + "grad_norm": 2.452585204522459, + "learning_rate": 2.30615072228183e-08, + "loss": 1.1203, + "step": 6886 + }, + { + "epoch": 0.9336406154680404, + "grad_norm": 1.7403767397338177, + "learning_rate": 2.2967836293366405e-08, + "loss": 1.0882, + "step": 6887 + }, + { + "epoch": 0.9337761811157053, + "grad_norm": 1.7467343294237476, + "learning_rate": 2.287435377685498e-08, + "loss": 1.1397, + "step": 6888 + }, + { + "epoch": 0.9339117467633702, + "grad_norm": 1.9557078302268331, + "learning_rate": 2.2781059691311498e-08, + "loss": 1.1608, + "step": 6889 + }, + { + "epoch": 0.934047312411035, + "grad_norm": 2.902787786547701, + "learning_rate": 2.268795405472701e-08, + "loss": 1.1576, + "step": 6890 + }, + { + "epoch": 0.9341828780586999, + "grad_norm": 2.290168789714428, + "learning_rate": 2.259503688505593e-08, + "loss": 1.1644, + "step": 6891 + }, + { + "epoch": 0.9343184437063649, + "grad_norm": 1.8939687482467362, + "learning_rate": 2.2502308200217037e-08, + "loss": 1.1184, + "step": 6892 + }, + { + "epoch": 0.9344540093540297, + "grad_norm": 1.6797150465376367, + "learning_rate": 2.2409768018092024e-08, + "loss": 1.1189, + "step": 6893 + }, + { + "epoch": 0.9345895750016946, + "grad_norm": 1.8576885246249426, + "learning_rate": 2.231741635652673e-08, + "loss": 1.116, + "step": 6894 + }, + { + "epoch": 0.9347251406493594, + "grad_norm": 1.535018694439724, + "learning_rate": 2.222525323333013e-08, + "loss": 1.1118, + "step": 6895 + }, + { + "epoch": 0.9348607062970243, + "grad_norm": 2.1188028927618285, + "learning_rate": 2.2133278666275567e-08, + "loss": 1.1472, + "step": 6896 + }, + { + "epoch": 0.9349962719446893, + "grad_norm": 1.8670034207217914, + "learning_rate": 
2.2041492673099182e-08, + "loss": 1.1507, + "step": 6897 + }, + { + "epoch": 0.9351318375923541, + "grad_norm": 1.8946151025241826, + "learning_rate": 2.1949895271501596e-08, + "loss": 1.1046, + "step": 6898 + }, + { + "epoch": 0.935267403240019, + "grad_norm": 2.6975917300713292, + "learning_rate": 2.1858486479146344e-08, + "loss": 1.1294, + "step": 6899 + }, + { + "epoch": 0.9354029688876838, + "grad_norm": 1.735319367240506, + "learning_rate": 2.1767266313661102e-08, + "loss": 1.1483, + "step": 6900 + }, + { + "epoch": 0.9355385345353487, + "grad_norm": 1.7995553989055044, + "learning_rate": 2.1676234792636693e-08, + "loss": 1.145, + "step": 6901 + }, + { + "epoch": 0.9356741001830137, + "grad_norm": 1.7732586655078162, + "learning_rate": 2.1585391933628073e-08, + "loss": 1.1067, + "step": 6902 + }, + { + "epoch": 0.9358096658306785, + "grad_norm": 2.0226701692941944, + "learning_rate": 2.1494737754153558e-08, + "loss": 1.173, + "step": 6903 + }, + { + "epoch": 0.9359452314783434, + "grad_norm": 1.7891232941482291, + "learning_rate": 2.1404272271694945e-08, + "loss": 1.1586, + "step": 6904 + }, + { + "epoch": 0.9360807971260082, + "grad_norm": 1.5961686547486846, + "learning_rate": 2.1313995503697833e-08, + "loss": 1.1458, + "step": 6905 + }, + { + "epoch": 0.9362163627736731, + "grad_norm": 3.000187444611773, + "learning_rate": 2.122390746757141e-08, + "loss": 1.1498, + "step": 6906 + }, + { + "epoch": 0.9363519284213381, + "grad_norm": 1.9142283780630402, + "learning_rate": 2.1134008180688445e-08, + "loss": 1.1318, + "step": 6907 + }, + { + "epoch": 0.9364874940690029, + "grad_norm": 1.6876385532871037, + "learning_rate": 2.1044297660385292e-08, + "loss": 1.1549, + "step": 6908 + }, + { + "epoch": 0.9366230597166678, + "grad_norm": 1.4891656539128921, + "learning_rate": 2.0954775923961997e-08, + "loss": 1.1349, + "step": 6909 + }, + { + "epoch": 0.9367586253643326, + "grad_norm": 1.862896655631254, + "learning_rate": 2.086544298868198e-08, + "loss": 1.1643, + "step": 6910 + }, + { + "epoch": 0.9368941910119976, + "grad_norm": 1.8456062238871969, + "learning_rate": 2.077629887177257e-08, + "loss": 1.1662, + "step": 6911 + }, + { + "epoch": 0.9370297566596625, + "grad_norm": 1.924071625404914, + "learning_rate": 2.0687343590424232e-08, + "loss": 1.1072, + "step": 6912 + }, + { + "epoch": 0.9371653223073273, + "grad_norm": 1.7592654598353512, + "learning_rate": 2.0598577161791587e-08, + "loss": 1.1237, + "step": 6913 + }, + { + "epoch": 0.9373008879549922, + "grad_norm": 2.369608943034893, + "learning_rate": 2.050999960299249e-08, + "loss": 1.1699, + "step": 6914 + }, + { + "epoch": 0.937436453602657, + "grad_norm": 1.8822085786386176, + "learning_rate": 2.0421610931108168e-08, + "loss": 1.191, + "step": 6915 + }, + { + "epoch": 0.937572019250322, + "grad_norm": 1.9877700447920412, + "learning_rate": 2.033341116318399e-08, + "loss": 1.1198, + "step": 6916 + }, + { + "epoch": 0.9377075848979869, + "grad_norm": 2.0870354093419787, + "learning_rate": 2.0245400316228344e-08, + "loss": 1.1495, + "step": 6917 + }, + { + "epoch": 0.9378431505456517, + "grad_norm": 2.4777632547636173, + "learning_rate": 2.015757840721366e-08, + "loss": 1.1302, + "step": 6918 + }, + { + "epoch": 0.9379787161933166, + "grad_norm": 1.7908763790691276, + "learning_rate": 2.006994545307539e-08, + "loss": 1.119, + "step": 6919 + }, + { + "epoch": 0.9381142818409814, + "grad_norm": 3.774737456799614, + "learning_rate": 1.998250147071323e-08, + "loss": 1.1272, + "step": 6920 + }, + { + "epoch": 0.9382498474886464, + 
"grad_norm": 2.8303396722741243, + "learning_rate": 1.9895246476989703e-08, + "loss": 1.1463, + "step": 6921 + }, + { + "epoch": 0.9383854131363113, + "grad_norm": 1.8664219190330837, + "learning_rate": 1.9808180488731564e-08, + "loss": 1.1539, + "step": 6922 + }, + { + "epoch": 0.9385209787839761, + "grad_norm": 1.688992586889786, + "learning_rate": 1.9721303522728605e-08, + "loss": 1.1579, + "step": 6923 + }, + { + "epoch": 0.938656544431641, + "grad_norm": 1.8772836316430965, + "learning_rate": 1.9634615595734316e-08, + "loss": 1.1422, + "step": 6924 + }, + { + "epoch": 0.9387921100793059, + "grad_norm": 1.7489675838451046, + "learning_rate": 1.954811672446599e-08, + "loss": 1.132, + "step": 6925 + }, + { + "epoch": 0.9389276757269708, + "grad_norm": 2.26611186751172, + "learning_rate": 1.9461806925604064e-08, + "loss": 1.1551, + "step": 6926 + }, + { + "epoch": 0.9390632413746357, + "grad_norm": 2.103901723259491, + "learning_rate": 1.9375686215792886e-08, + "loss": 1.1057, + "step": 6927 + }, + { + "epoch": 0.9391988070223005, + "grad_norm": 2.713724088499566, + "learning_rate": 1.9289754611639954e-08, + "loss": 1.1359, + "step": 6928 + }, + { + "epoch": 0.9393343726699654, + "grad_norm": 2.033421066145491, + "learning_rate": 1.9204012129716672e-08, + "loss": 1.1317, + "step": 6929 + }, + { + "epoch": 0.9394699383176303, + "grad_norm": 2.2059877715820635, + "learning_rate": 1.911845878655749e-08, + "loss": 1.1398, + "step": 6930 + }, + { + "epoch": 0.9396055039652952, + "grad_norm": 1.7752555490244546, + "learning_rate": 1.9033094598661204e-08, + "loss": 1.1076, + "step": 6931 + }, + { + "epoch": 0.9397410696129601, + "grad_norm": 3.2471027977685547, + "learning_rate": 1.89479195824892e-08, + "loss": 1.1489, + "step": 6932 + }, + { + "epoch": 0.9398766352606249, + "grad_norm": 5.440286132529354, + "learning_rate": 1.8862933754467013e-08, + "loss": 1.1839, + "step": 6933 + }, + { + "epoch": 0.9400122009082899, + "grad_norm": 1.7355356364704846, + "learning_rate": 1.8778137130983307e-08, + "loss": 1.1501, + "step": 6934 + }, + { + "epoch": 0.9401477665559547, + "grad_norm": 1.919402699262792, + "learning_rate": 1.8693529728390667e-08, + "loss": 1.173, + "step": 6935 + }, + { + "epoch": 0.9402833322036196, + "grad_norm": 1.7316093233571355, + "learning_rate": 1.860911156300482e-08, + "loss": 1.098, + "step": 6936 + }, + { + "epoch": 0.9404188978512845, + "grad_norm": 3.284965697888466, + "learning_rate": 1.8524882651105188e-08, + "loss": 1.0817, + "step": 6937 + }, + { + "epoch": 0.9405544634989493, + "grad_norm": 2.1472700068977026, + "learning_rate": 1.844084300893456e-08, + "loss": 1.1013, + "step": 6938 + }, + { + "epoch": 0.9406900291466143, + "grad_norm": 1.9840750576210957, + "learning_rate": 1.835699265269963e-08, + "loss": 1.1463, + "step": 6939 + }, + { + "epoch": 0.9408255947942791, + "grad_norm": 2.1738906294926346, + "learning_rate": 1.827333159856981e-08, + "loss": 1.1719, + "step": 6940 + }, + { + "epoch": 0.940961160441944, + "grad_norm": 2.0308316207471933, + "learning_rate": 1.8189859862678848e-08, + "loss": 1.1608, + "step": 6941 + }, + { + "epoch": 0.9410967260896089, + "grad_norm": 1.9370863539363992, + "learning_rate": 1.8106577461123428e-08, + "loss": 1.1122, + "step": 6942 + }, + { + "epoch": 0.9412322917372737, + "grad_norm": 2.203339329233159, + "learning_rate": 1.802348440996393e-08, + "loss": 1.1362, + "step": 6943 + }, + { + "epoch": 0.9413678573849387, + "grad_norm": 1.9416454441657267, + "learning_rate": 1.794058072522431e-08, + "loss": 1.1469, + "step": 
6944 + }, + { + "epoch": 0.9415034230326036, + "grad_norm": 2.846051036813186, + "learning_rate": 1.7857866422891665e-08, + "loss": 1.1662, + "step": 6945 + }, + { + "epoch": 0.9416389886802684, + "grad_norm": 1.779798083380636, + "learning_rate": 1.777534151891702e-08, + "loss": 1.1596, + "step": 6946 + }, + { + "epoch": 0.9417745543279333, + "grad_norm": 2.363044545138007, + "learning_rate": 1.7693006029214418e-08, + "loss": 1.1329, + "step": 6947 + }, + { + "epoch": 0.9419101199755981, + "grad_norm": 2.7061875879634267, + "learning_rate": 1.7610859969661827e-08, + "loss": 1.1166, + "step": 6948 + }, + { + "epoch": 0.9420456856232631, + "grad_norm": 2.194515223505337, + "learning_rate": 1.7528903356100466e-08, + "loss": 1.125, + "step": 6949 + }, + { + "epoch": 0.942181251270928, + "grad_norm": 1.8195470780970924, + "learning_rate": 1.74471362043348e-08, + "loss": 1.135, + "step": 6950 + }, + { + "epoch": 0.9423168169185928, + "grad_norm": 2.7612413581298583, + "learning_rate": 1.7365558530133218e-08, + "loss": 1.1218, + "step": 6951 + }, + { + "epoch": 0.9424523825662577, + "grad_norm": 3.2266513314366225, + "learning_rate": 1.7284170349227246e-08, + "loss": 1.1172, + "step": 6952 + }, + { + "epoch": 0.9425879482139226, + "grad_norm": 3.5159798617210134, + "learning_rate": 1.7202971677311774e-08, + "loss": 1.0933, + "step": 6953 + }, + { + "epoch": 0.9427235138615875, + "grad_norm": 3.5327038355153424, + "learning_rate": 1.712196253004572e-08, + "loss": 1.1615, + "step": 6954 + }, + { + "epoch": 0.9428590795092524, + "grad_norm": 1.7585772719278248, + "learning_rate": 1.704114292305059e-08, + "loss": 1.1531, + "step": 6955 + }, + { + "epoch": 0.9429946451569172, + "grad_norm": 1.86877238984571, + "learning_rate": 1.6960512871912246e-08, + "loss": 1.1168, + "step": 6956 + }, + { + "epoch": 0.9431302108045821, + "grad_norm": 1.6819469991223563, + "learning_rate": 1.6880072392179146e-08, + "loss": 1.1307, + "step": 6957 + }, + { + "epoch": 0.943265776452247, + "grad_norm": 2.7747726876746706, + "learning_rate": 1.6799821499363987e-08, + "loss": 1.145, + "step": 6958 + }, + { + "epoch": 0.9434013420999119, + "grad_norm": 1.7290762828703408, + "learning_rate": 1.671976020894228e-08, + "loss": 1.1357, + "step": 6959 + }, + { + "epoch": 0.9435369077475768, + "grad_norm": 1.8228748559075059, + "learning_rate": 1.663988853635323e-08, + "loss": 1.1295, + "step": 6960 + }, + { + "epoch": 0.9436724733952416, + "grad_norm": 2.2142945637365887, + "learning_rate": 1.6560206496999517e-08, + "loss": 1.1658, + "step": 6961 + }, + { + "epoch": 0.9438080390429066, + "grad_norm": 3.020636043307101, + "learning_rate": 1.6480714106247186e-08, + "loss": 1.1178, + "step": 6962 + }, + { + "epoch": 0.9439436046905714, + "grad_norm": 1.6941810237411519, + "learning_rate": 1.6401411379425746e-08, + "loss": 1.1615, + "step": 6963 + }, + { + "epoch": 0.9440791703382363, + "grad_norm": 1.5446260933398057, + "learning_rate": 1.6322298331827967e-08, + "loss": 1.0874, + "step": 6964 + }, + { + "epoch": 0.9442147359859012, + "grad_norm": 1.8234866414264035, + "learning_rate": 1.624337497871042e-08, + "loss": 1.1801, + "step": 6965 + }, + { + "epoch": 0.944350301633566, + "grad_norm": 1.754968326877625, + "learning_rate": 1.6164641335292606e-08, + "loss": 1.0911, + "step": 6966 + }, + { + "epoch": 0.944485867281231, + "grad_norm": 1.8336802197995663, + "learning_rate": 1.6086097416757816e-08, + "loss": 1.1802, + "step": 6967 + }, + { + "epoch": 0.9446214329288958, + "grad_norm": 2.0802067344766404, + "learning_rate": 
1.60077432382526e-08, + "loss": 1.1052, + "step": 6968 + }, + { + "epoch": 0.9447569985765607, + "grad_norm": 4.72101787165864, + "learning_rate": 1.5929578814886878e-08, + "loss": 1.106, + "step": 6969 + }, + { + "epoch": 0.9448925642242256, + "grad_norm": 1.8141055611958274, + "learning_rate": 1.5851604161734256e-08, + "loss": 1.1445, + "step": 6970 + }, + { + "epoch": 0.9450281298718904, + "grad_norm": 1.8216916805665189, + "learning_rate": 1.5773819293831148e-08, + "loss": 1.1252, + "step": 6971 + }, + { + "epoch": 0.9451636955195554, + "grad_norm": 2.2372030621021133, + "learning_rate": 1.5696224226178224e-08, + "loss": 1.12, + "step": 6972 + }, + { + "epoch": 0.9452992611672202, + "grad_norm": 2.205413499010483, + "learning_rate": 1.5618818973738625e-08, + "loss": 1.1779, + "step": 6973 + }, + { + "epoch": 0.9454348268148851, + "grad_norm": 1.9440829633253416, + "learning_rate": 1.554160355143974e-08, + "loss": 1.0964, + "step": 6974 + }, + { + "epoch": 0.94557039246255, + "grad_norm": 2.002965530141726, + "learning_rate": 1.5464577974171554e-08, + "loss": 1.1126, + "step": 6975 + }, + { + "epoch": 0.9457059581102149, + "grad_norm": 1.7901586764242787, + "learning_rate": 1.5387742256788294e-08, + "loss": 1.1808, + "step": 6976 + }, + { + "epoch": 0.9458415237578798, + "grad_norm": 2.2703274387346837, + "learning_rate": 1.531109641410666e-08, + "loss": 1.1412, + "step": 6977 + }, + { + "epoch": 0.9459770894055446, + "grad_norm": 1.6001567846534281, + "learning_rate": 1.523464046090761e-08, + "loss": 1.1217, + "step": 6978 + }, + { + "epoch": 0.9461126550532095, + "grad_norm": 1.6990621409752729, + "learning_rate": 1.5158374411934793e-08, + "loss": 1.1257, + "step": 6979 + }, + { + "epoch": 0.9462482207008744, + "grad_norm": 1.669524103608102, + "learning_rate": 1.5082298281895666e-08, + "loss": 1.1106, + "step": 6980 + }, + { + "epoch": 0.9463837863485393, + "grad_norm": 1.5863239390742812, + "learning_rate": 1.500641208546072e-08, + "loss": 1.113, + "step": 6981 + }, + { + "epoch": 0.9465193519962042, + "grad_norm": 4.280678976862245, + "learning_rate": 1.493071583726424e-08, + "loss": 1.1523, + "step": 6982 + }, + { + "epoch": 0.946654917643869, + "grad_norm": 2.0488304767090084, + "learning_rate": 1.4855209551903559e-08, + "loss": 1.1463, + "step": 6983 + }, + { + "epoch": 0.9467904832915339, + "grad_norm": 2.0439372347140323, + "learning_rate": 1.4779893243939356e-08, + "loss": 1.1437, + "step": 6984 + }, + { + "epoch": 0.9469260489391989, + "grad_norm": 3.0399210180305802, + "learning_rate": 1.4704766927895907e-08, + "loss": 1.1714, + "step": 6985 + }, + { + "epoch": 0.9470616145868637, + "grad_norm": 1.9515203524608462, + "learning_rate": 1.462983061826084e-08, + "loss": 1.0996, + "step": 6986 + }, + { + "epoch": 0.9471971802345286, + "grad_norm": 2.5086691871497613, + "learning_rate": 1.4555084329484713e-08, + "loss": 1.1608, + "step": 6987 + }, + { + "epoch": 0.9473327458821934, + "grad_norm": 1.8233217193573024, + "learning_rate": 1.4480528075982102e-08, + "loss": 1.1496, + "step": 6988 + }, + { + "epoch": 0.9474683115298583, + "grad_norm": 2.0718632925361526, + "learning_rate": 1.4406161872130396e-08, + "loss": 1.1581, + "step": 6989 + }, + { + "epoch": 0.9476038771775233, + "grad_norm": 1.7648878389501153, + "learning_rate": 1.4331985732270457e-08, + "loss": 1.154, + "step": 6990 + }, + { + "epoch": 0.9477394428251881, + "grad_norm": 1.848753289491071, + "learning_rate": 1.4257999670706844e-08, + "loss": 1.1686, + "step": 6991 + }, + { + "epoch": 0.947875008472853, + 
"grad_norm": 1.6152853496607054, + "learning_rate": 1.418420370170681e-08, + "loss": 1.0974, + "step": 6992 + }, + { + "epoch": 0.9480105741205178, + "grad_norm": 2.013960122478299, + "learning_rate": 1.4110597839501748e-08, + "loss": 1.1417, + "step": 6993 + }, + { + "epoch": 0.9481461397681827, + "grad_norm": 2.684719916886598, + "learning_rate": 1.4037182098285639e-08, + "loss": 1.1446, + "step": 6994 + }, + { + "epoch": 0.9482817054158477, + "grad_norm": 3.008734320871962, + "learning_rate": 1.3963956492216377e-08, + "loss": 1.1754, + "step": 6995 + }, + { + "epoch": 0.9484172710635125, + "grad_norm": 3.4852794351577012, + "learning_rate": 1.389092103541456e-08, + "loss": 1.1248, + "step": 6996 + }, + { + "epoch": 0.9485528367111774, + "grad_norm": 2.176884639495488, + "learning_rate": 1.3818075741965029e-08, + "loss": 1.1595, + "step": 6997 + }, + { + "epoch": 0.9486884023588422, + "grad_norm": 1.655017368704315, + "learning_rate": 1.3745420625914995e-08, + "loss": 1.1259, + "step": 6998 + }, + { + "epoch": 0.9488239680065071, + "grad_norm": 1.9455270821178177, + "learning_rate": 1.3672955701275579e-08, + "loss": 1.137, + "step": 6999 + }, + { + "epoch": 0.9489595336541721, + "grad_norm": 1.6853734385937251, + "learning_rate": 1.360068098202105e-08, + "loss": 1.0789, + "step": 7000 + }, + { + "epoch": 0.9490950993018369, + "grad_norm": 1.5498830572300537, + "learning_rate": 1.3528596482089039e-08, + "loss": 1.0966, + "step": 7001 + }, + { + "epoch": 0.9492306649495018, + "grad_norm": 2.1913145722527463, + "learning_rate": 1.3456702215380534e-08, + "loss": 1.1122, + "step": 7002 + }, + { + "epoch": 0.9493662305971666, + "grad_norm": 1.8057344012744119, + "learning_rate": 1.3384998195759667e-08, + "loss": 1.1315, + "step": 7003 + }, + { + "epoch": 0.9495017962448316, + "grad_norm": 2.0470968559223945, + "learning_rate": 1.3313484437053935e-08, + "loss": 1.1677, + "step": 7004 + }, + { + "epoch": 0.9496373618924965, + "grad_norm": 4.187847604262615, + "learning_rate": 1.3242160953054415e-08, + "loss": 1.1236, + "step": 7005 + }, + { + "epoch": 0.9497729275401613, + "grad_norm": 2.120356601142119, + "learning_rate": 1.3171027757515107e-08, + "loss": 1.1432, + "step": 7006 + }, + { + "epoch": 0.9499084931878262, + "grad_norm": 2.532041528004807, + "learning_rate": 1.3100084864153593e-08, + "loss": 1.1749, + "step": 7007 + }, + { + "epoch": 0.950044058835491, + "grad_norm": 12.448740258487252, + "learning_rate": 1.3029332286650596e-08, + "loss": 1.1305, + "step": 7008 + }, + { + "epoch": 0.950179624483156, + "grad_norm": 1.9639389849595956, + "learning_rate": 1.295877003865009e-08, + "loss": 1.0989, + "step": 7009 + }, + { + "epoch": 0.9503151901308209, + "grad_norm": 1.9346069528753618, + "learning_rate": 1.2888398133759637e-08, + "loss": 1.1565, + "step": 7010 + }, + { + "epoch": 0.9504507557784857, + "grad_norm": 2.063197309211194, + "learning_rate": 1.2818216585549824e-08, + "loss": 1.0972, + "step": 7011 + }, + { + "epoch": 0.9505863214261506, + "grad_norm": 2.0533877688805315, + "learning_rate": 1.2748225407554603e-08, + "loss": 1.148, + "step": 7012 + }, + { + "epoch": 0.9507218870738154, + "grad_norm": 1.928102401267399, + "learning_rate": 1.2678424613271288e-08, + "loss": 1.1556, + "step": 7013 + }, + { + "epoch": 0.9508574527214804, + "grad_norm": 3.5676462342355593, + "learning_rate": 1.2608814216160223e-08, + "loss": 1.1349, + "step": 7014 + }, + { + "epoch": 0.9509930183691453, + "grad_norm": 2.271542622335616, + "learning_rate": 1.253939422964545e-08, + "loss": 1.1115, + 
"step": 7015 + }, + { + "epoch": 0.9511285840168101, + "grad_norm": 2.1706975919175737, + "learning_rate": 1.2470164667113926e-08, + "loss": 1.1308, + "step": 7016 + }, + { + "epoch": 0.951264149664475, + "grad_norm": 1.7003750051179638, + "learning_rate": 1.2401125541915968e-08, + "loss": 1.1331, + "step": 7017 + }, + { + "epoch": 0.9513997153121398, + "grad_norm": 1.7264547477997738, + "learning_rate": 1.2332276867365377e-08, + "loss": 1.1136, + "step": 7018 + }, + { + "epoch": 0.9515352809598048, + "grad_norm": 1.5997464145241345, + "learning_rate": 1.2263618656739083e-08, + "loss": 1.1337, + "step": 7019 + }, + { + "epoch": 0.9516708466074697, + "grad_norm": 2.089422058263286, + "learning_rate": 1.2195150923277054e-08, + "loss": 1.119, + "step": 7020 + }, + { + "epoch": 0.9518064122551345, + "grad_norm": 2.5145646361873872, + "learning_rate": 1.2126873680183058e-08, + "loss": 1.1369, + "step": 7021 + }, + { + "epoch": 0.9519419779027994, + "grad_norm": 1.5617611334829617, + "learning_rate": 1.2058786940623678e-08, + "loss": 1.1163, + "step": 7022 + }, + { + "epoch": 0.9520775435504644, + "grad_norm": 2.5738030675386194, + "learning_rate": 1.1990890717728852e-08, + "loss": 1.1376, + "step": 7023 + }, + { + "epoch": 0.9522131091981292, + "grad_norm": 2.254749918130236, + "learning_rate": 1.1923185024591775e-08, + "loss": 1.1313, + "step": 7024 + }, + { + "epoch": 0.9523486748457941, + "grad_norm": 4.805540605508889, + "learning_rate": 1.1855669874269225e-08, + "loss": 1.1316, + "step": 7025 + }, + { + "epoch": 0.9524842404934589, + "grad_norm": 2.1431881314145502, + "learning_rate": 1.1788345279780786e-08, + "loss": 1.1268, + "step": 7026 + }, + { + "epoch": 0.9526198061411238, + "grad_norm": 1.6988868700391506, + "learning_rate": 1.1721211254109408e-08, + "loss": 1.1375, + "step": 7027 + }, + { + "epoch": 0.9527553717887888, + "grad_norm": 1.8625418956702569, + "learning_rate": 1.1654267810201512e-08, + "loss": 1.1463, + "step": 7028 + }, + { + "epoch": 0.9528909374364536, + "grad_norm": 2.10671041466994, + "learning_rate": 1.1587514960966437e-08, + "loss": 1.1725, + "step": 7029 + }, + { + "epoch": 0.9530265030841185, + "grad_norm": 2.0519737179686826, + "learning_rate": 1.1520952719277222e-08, + "loss": 1.1809, + "step": 7030 + }, + { + "epoch": 0.9531620687317833, + "grad_norm": 1.8939482730639412, + "learning_rate": 1.1454581097969595e-08, + "loss": 1.1478, + "step": 7031 + }, + { + "epoch": 0.9532976343794483, + "grad_norm": 2.694049841480372, + "learning_rate": 1.1388400109842878e-08, + "loss": 1.1177, + "step": 7032 + }, + { + "epoch": 0.9534332000271132, + "grad_norm": 1.8262563504496143, + "learning_rate": 1.1322409767659525e-08, + "loss": 1.17, + "step": 7033 + }, + { + "epoch": 0.953568765674778, + "grad_norm": 1.6412469747657927, + "learning_rate": 1.1256610084145468e-08, + "loss": 1.0999, + "step": 7034 + }, + { + "epoch": 0.9537043313224429, + "grad_norm": 1.6932266549242765, + "learning_rate": 1.1191001071989336e-08, + "loss": 1.1452, + "step": 7035 + }, + { + "epoch": 0.9538398969701077, + "grad_norm": 1.5147238692773428, + "learning_rate": 1.1125582743843564e-08, + "loss": 1.1096, + "step": 7036 + }, + { + "epoch": 0.9539754626177727, + "grad_norm": 2.710436923581871, + "learning_rate": 1.1060355112323395e-08, + "loss": 1.2032, + "step": 7037 + }, + { + "epoch": 0.9541110282654376, + "grad_norm": 2.78132438712321, + "learning_rate": 1.0995318190007652e-08, + "loss": 1.1311, + "step": 7038 + }, + { + "epoch": 0.9542465939131024, + "grad_norm": 2.1467123783105637, + 
"learning_rate": 1.0930471989437862e-08, + "loss": 1.0945, + "step": 7039 + }, + { + "epoch": 0.9543821595607673, + "grad_norm": 1.7779671001667414, + "learning_rate": 1.0865816523119464e-08, + "loss": 1.1604, + "step": 7040 + }, + { + "epoch": 0.9545177252084321, + "grad_norm": 2.0125435249022874, + "learning_rate": 1.0801351803520598e-08, + "loss": 1.1206, + "step": 7041 + }, + { + "epoch": 0.9546532908560971, + "grad_norm": 3.3483019286061086, + "learning_rate": 1.0737077843072762e-08, + "loss": 1.125, + "step": 7042 + }, + { + "epoch": 0.954788856503762, + "grad_norm": 2.0787074272651647, + "learning_rate": 1.0672994654170598e-08, + "loss": 1.134, + "step": 7043 + }, + { + "epoch": 0.9549244221514268, + "grad_norm": 2.048604796744923, + "learning_rate": 1.060910224917222e-08, + "loss": 1.1235, + "step": 7044 + }, + { + "epoch": 0.9550599877990917, + "grad_norm": 1.6932143382881781, + "learning_rate": 1.054540064039866e-08, + "loss": 1.1256, + "step": 7045 + }, + { + "epoch": 0.9551955534467566, + "grad_norm": 2.0393500565562226, + "learning_rate": 1.0481889840134428e-08, + "loss": 1.1276, + "step": 7046 + }, + { + "epoch": 0.9553311190944215, + "grad_norm": 1.5907907406506145, + "learning_rate": 1.0418569860626836e-08, + "loss": 1.1293, + "step": 7047 + }, + { + "epoch": 0.9554666847420864, + "grad_norm": 1.8284772667785045, + "learning_rate": 1.0355440714086782e-08, + "loss": 1.1553, + "step": 7048 + }, + { + "epoch": 0.9556022503897512, + "grad_norm": 1.8160024045330152, + "learning_rate": 1.0292502412688198e-08, + "loss": 1.1325, + "step": 7049 + }, + { + "epoch": 0.9557378160374161, + "grad_norm": 1.7053576567700564, + "learning_rate": 1.0229754968568261e-08, + "loss": 1.1313, + "step": 7050 + }, + { + "epoch": 0.955873381685081, + "grad_norm": 1.7939137779330394, + "learning_rate": 1.0167198393827403e-08, + "loss": 1.1288, + "step": 7051 + }, + { + "epoch": 0.9560089473327459, + "grad_norm": 2.6539030120886644, + "learning_rate": 1.0104832700528975e-08, + "loss": 1.1226, + "step": 7052 + }, + { + "epoch": 0.9561445129804108, + "grad_norm": 1.6180374547004746, + "learning_rate": 1.0042657900699803e-08, + "loss": 1.1548, + "step": 7053 + }, + { + "epoch": 0.9562800786280756, + "grad_norm": 1.9190843815959049, + "learning_rate": 9.980674006329848e-09, + "loss": 1.1062, + "step": 7054 + }, + { + "epoch": 0.9564156442757406, + "grad_norm": 2.005584241952583, + "learning_rate": 9.918881029372106e-09, + "loss": 1.1107, + "step": 7055 + }, + { + "epoch": 0.9565512099234054, + "grad_norm": 4.002874096915869, + "learning_rate": 9.857278981742934e-09, + "loss": 1.1757, + "step": 7056 + }, + { + "epoch": 0.9566867755710703, + "grad_norm": 1.6416117741463765, + "learning_rate": 9.795867875321829e-09, + "loss": 1.1396, + "step": 7057 + }, + { + "epoch": 0.9568223412187352, + "grad_norm": 1.9659243723326478, + "learning_rate": 9.734647721951427e-09, + "loss": 1.1456, + "step": 7058 + }, + { + "epoch": 0.9569579068664, + "grad_norm": 3.9065235680995123, + "learning_rate": 9.673618533437511e-09, + "loss": 1.1548, + "step": 7059 + }, + { + "epoch": 0.957093472514065, + "grad_norm": 2.533238201143718, + "learning_rate": 9.612780321549108e-09, + "loss": 1.1579, + "step": 7060 + }, + { + "epoch": 0.9572290381617298, + "grad_norm": 1.7745794174447374, + "learning_rate": 9.552133098018389e-09, + "loss": 1.1512, + "step": 7061 + }, + { + "epoch": 0.9573646038093947, + "grad_norm": 1.7936810324167212, + "learning_rate": 9.491676874540666e-09, + "loss": 1.1082, + "step": 7062 + }, + { + "epoch": 
0.9575001694570596, + "grad_norm": 2.348406464551202, + "learning_rate": 9.431411662774502e-09, + "loss": 1.1663, + "step": 7063 + }, + { + "epoch": 0.9576357351047244, + "grad_norm": 5.033606966428257, + "learning_rate": 9.37133747434149e-09, + "loss": 1.1517, + "step": 7064 + }, + { + "epoch": 0.9577713007523894, + "grad_norm": 3.3816640842392656, + "learning_rate": 9.311454320826473e-09, + "loss": 1.1556, + "step": 7065 + }, + { + "epoch": 0.9579068664000542, + "grad_norm": 1.8597851995454844, + "learning_rate": 9.251762213777437e-09, + "loss": 1.1495, + "step": 7066 + }, + { + "epoch": 0.9580424320477191, + "grad_norm": 1.6675096774145546, + "learning_rate": 9.192261164705617e-09, + "loss": 1.1452, + "step": 7067 + }, + { + "epoch": 0.958177997695384, + "grad_norm": 1.9690953592060858, + "learning_rate": 9.132951185085281e-09, + "loss": 1.1351, + "step": 7068 + }, + { + "epoch": 0.9583135633430488, + "grad_norm": 1.6543699912164465, + "learning_rate": 9.073832286353944e-09, + "loss": 1.1672, + "step": 7069 + }, + { + "epoch": 0.9584491289907138, + "grad_norm": 3.9189126623209325, + "learning_rate": 9.014904479912044e-09, + "loss": 1.1521, + "step": 7070 + }, + { + "epoch": 0.9585846946383786, + "grad_norm": 4.398596819852975, + "learning_rate": 8.956167777123602e-09, + "loss": 1.1183, + "step": 7071 + }, + { + "epoch": 0.9587202602860435, + "grad_norm": 2.112354013330604, + "learning_rate": 8.897622189315224e-09, + "loss": 1.1456, + "step": 7072 + }, + { + "epoch": 0.9588558259337084, + "grad_norm": 4.05104311964079, + "learning_rate": 8.839267727777211e-09, + "loss": 1.1726, + "step": 7073 + }, + { + "epoch": 0.9589913915813733, + "grad_norm": 1.7150371875444694, + "learning_rate": 8.781104403762563e-09, + "loss": 1.1015, + "step": 7074 + }, + { + "epoch": 0.9591269572290382, + "grad_norm": 1.829558904246851, + "learning_rate": 8.723132228487861e-09, + "loss": 1.1663, + "step": 7075 + }, + { + "epoch": 0.959262522876703, + "grad_norm": 1.8298592462057324, + "learning_rate": 8.665351213132278e-09, + "loss": 1.1338, + "step": 7076 + }, + { + "epoch": 0.9593980885243679, + "grad_norm": 2.0884126933675873, + "learning_rate": 8.607761368838785e-09, + "loss": 1.1184, + "step": 7077 + }, + { + "epoch": 0.9595336541720328, + "grad_norm": 1.8630947558960034, + "learning_rate": 8.550362706712832e-09, + "loss": 1.1416, + "step": 7078 + }, + { + "epoch": 0.9596692198196977, + "grad_norm": 1.9705393429666025, + "learning_rate": 8.493155237823347e-09, + "loss": 1.1645, + "step": 7079 + }, + { + "epoch": 0.9598047854673626, + "grad_norm": 1.7985469285680158, + "learning_rate": 8.4361389732025e-09, + "loss": 1.1238, + "step": 7080 + }, + { + "epoch": 0.9599403511150274, + "grad_norm": 1.7620251976349552, + "learning_rate": 8.379313923845277e-09, + "loss": 1.1234, + "step": 7081 + }, + { + "epoch": 0.9600759167626923, + "grad_norm": 1.9278140813188984, + "learning_rate": 8.322680100710022e-09, + "loss": 1.1209, + "step": 7082 + }, + { + "epoch": 0.9602114824103573, + "grad_norm": 1.7798828450277955, + "learning_rate": 8.266237514718e-09, + "loss": 1.1512, + "step": 7083 + }, + { + "epoch": 0.9603470480580221, + "grad_norm": 1.6484551187322167, + "learning_rate": 8.209986176753948e-09, + "loss": 1.1071, + "step": 7084 + }, + { + "epoch": 0.960482613705687, + "grad_norm": 4.37035681128153, + "learning_rate": 8.153926097665186e-09, + "loss": 1.1393, + "step": 7085 + }, + { + "epoch": 0.9606181793533518, + "grad_norm": 2.3075110115934123, + "learning_rate": 8.098057288262738e-09, + "loss": 1.1399, + 
"step": 7086 + }, + { + "epoch": 0.9607537450010167, + "grad_norm": 2.591754966749492, + "learning_rate": 8.042379759320317e-09, + "loss": 1.1649, + "step": 7087 + }, + { + "epoch": 0.9608893106486817, + "grad_norm": 2.184420257654729, + "learning_rate": 7.986893521574888e-09, + "loss": 1.1488, + "step": 7088 + }, + { + "epoch": 0.9610248762963465, + "grad_norm": 1.6451961343757149, + "learning_rate": 7.931598585726562e-09, + "loss": 1.1378, + "step": 7089 + }, + { + "epoch": 0.9611604419440114, + "grad_norm": 1.7786967170302748, + "learning_rate": 7.876494962438585e-09, + "loss": 1.1601, + "step": 7090 + }, + { + "epoch": 0.9612960075916762, + "grad_norm": 2.229346964287714, + "learning_rate": 7.821582662337123e-09, + "loss": 1.1298, + "step": 7091 + }, + { + "epoch": 0.9614315732393411, + "grad_norm": 1.8324337023153168, + "learning_rate": 7.766861696011816e-09, + "loss": 1.1244, + "step": 7092 + }, + { + "epoch": 0.9615671388870061, + "grad_norm": 1.7570643809727389, + "learning_rate": 7.712332074014893e-09, + "loss": 1.1234, + "step": 7093 + }, + { + "epoch": 0.9617027045346709, + "grad_norm": 1.8768396203950253, + "learning_rate": 7.657993806862162e-09, + "loss": 1.1525, + "step": 7094 + }, + { + "epoch": 0.9618382701823358, + "grad_norm": 2.411277975340872, + "learning_rate": 7.603846905032129e-09, + "loss": 1.1287, + "step": 7095 + }, + { + "epoch": 0.9619738358300006, + "grad_norm": 1.8432216788968894, + "learning_rate": 7.549891378966888e-09, + "loss": 1.1529, + "step": 7096 + }, + { + "epoch": 0.9621094014776655, + "grad_norm": 1.9746781783553498, + "learning_rate": 7.496127239071003e-09, + "loss": 1.1491, + "step": 7097 + }, + { + "epoch": 0.9622449671253305, + "grad_norm": 6.841311442388723, + "learning_rate": 7.442554495712738e-09, + "loss": 1.1324, + "step": 7098 + }, + { + "epoch": 0.9623805327729953, + "grad_norm": 1.6406583294499049, + "learning_rate": 7.3891731592230496e-09, + "loss": 1.1028, + "step": 7099 + }, + { + "epoch": 0.9625160984206602, + "grad_norm": 1.8905034352281835, + "learning_rate": 7.335983239896148e-09, + "loss": 1.1378, + "step": 7100 + }, + { + "epoch": 0.9626516640683251, + "grad_norm": 1.7391519239719095, + "learning_rate": 7.282984747989163e-09, + "loss": 1.136, + "step": 7101 + }, + { + "epoch": 0.96278722971599, + "grad_norm": 2.011011054578179, + "learning_rate": 7.230177693722583e-09, + "loss": 1.125, + "step": 7102 + }, + { + "epoch": 0.9629227953636549, + "grad_norm": 9.23959867767666, + "learning_rate": 7.17756208727982e-09, + "loss": 1.14, + "step": 7103 + }, + { + "epoch": 0.9630583610113197, + "grad_norm": 1.7181093561135274, + "learning_rate": 7.125137938807424e-09, + "loss": 1.1171, + "step": 7104 + }, + { + "epoch": 0.9631939266589846, + "grad_norm": 2.2596503734716826, + "learning_rate": 7.072905258414752e-09, + "loss": 1.1278, + "step": 7105 + }, + { + "epoch": 0.9633294923066495, + "grad_norm": 2.2982495973583084, + "learning_rate": 7.020864056174635e-09, + "loss": 1.1042, + "step": 7106 + }, + { + "epoch": 0.9634650579543144, + "grad_norm": 2.0891636476055138, + "learning_rate": 6.969014342122825e-09, + "loss": 1.1174, + "step": 7107 + }, + { + "epoch": 0.9636006236019793, + "grad_norm": 2.1825350728760387, + "learning_rate": 6.9173561262581e-09, + "loss": 1.1622, + "step": 7108 + }, + { + "epoch": 0.9637361892496441, + "grad_norm": 2.1683471333804345, + "learning_rate": 6.86588941854227e-09, + "loss": 1.1407, + "step": 7109 + }, + { + "epoch": 0.963871754897309, + "grad_norm": 2.0036892537934046, + "learning_rate": 
6.814614228900506e-09, + "loss": 1.1416, + "step": 7110 + }, + { + "epoch": 0.964007320544974, + "grad_norm": 1.568422220924218, + "learning_rate": 6.763530567220455e-09, + "loss": 1.1753, + "step": 7111 + }, + { + "epoch": 0.9641428861926388, + "grad_norm": 1.996888215992166, + "learning_rate": 6.712638443353569e-09, + "loss": 1.1345, + "step": 7112 + }, + { + "epoch": 0.9642784518403037, + "grad_norm": 1.6323792018612153, + "learning_rate": 6.661937867113665e-09, + "loss": 1.1533, + "step": 7113 + }, + { + "epoch": 0.9644140174879685, + "grad_norm": 1.6909594469680715, + "learning_rate": 6.611428848278256e-09, + "loss": 1.1412, + "step": 7114 + }, + { + "epoch": 0.9645495831356334, + "grad_norm": 2.9148001494258606, + "learning_rate": 6.5611113965873265e-09, + "loss": 1.1922, + "step": 7115 + }, + { + "epoch": 0.9646851487832984, + "grad_norm": 2.037237811421171, + "learning_rate": 6.51098552174445e-09, + "loss": 1.1923, + "step": 7116 + }, + { + "epoch": 0.9648207144309632, + "grad_norm": 1.6622369457882873, + "learning_rate": 6.461051233415782e-09, + "loss": 1.1359, + "step": 7117 + }, + { + "epoch": 0.9649562800786281, + "grad_norm": 1.9031471637359685, + "learning_rate": 6.4113085412309535e-09, + "loss": 1.1415, + "step": 7118 + }, + { + "epoch": 0.9650918457262929, + "grad_norm": 1.5614359585616724, + "learning_rate": 6.361757454782291e-09, + "loss": 1.1275, + "step": 7119 + }, + { + "epoch": 0.9652274113739578, + "grad_norm": 2.115348997815399, + "learning_rate": 6.312397983625483e-09, + "loss": 1.1613, + "step": 7120 + }, + { + "epoch": 0.9653629770216228, + "grad_norm": 1.6600504512769036, + "learning_rate": 6.2632301372789185e-09, + "loss": 1.1417, + "step": 7121 + }, + { + "epoch": 0.9654985426692876, + "grad_norm": 2.1429278196362067, + "learning_rate": 6.214253925224455e-09, + "loss": 1.1242, + "step": 7122 + }, + { + "epoch": 0.9656341083169525, + "grad_norm": 1.9775991056243183, + "learning_rate": 6.165469356906539e-09, + "loss": 1.1435, + "step": 7123 + }, + { + "epoch": 0.9657696739646173, + "grad_norm": 1.747268269195497, + "learning_rate": 6.116876441733087e-09, + "loss": 1.1618, + "step": 7124 + }, + { + "epoch": 0.9659052396122823, + "grad_norm": 2.77042327408537, + "learning_rate": 6.068475189074829e-09, + "loss": 1.1458, + "step": 7125 + }, + { + "epoch": 0.9660408052599472, + "grad_norm": 1.6845251987792689, + "learning_rate": 6.020265608265407e-09, + "loss": 1.142, + "step": 7126 + }, + { + "epoch": 0.966176370907612, + "grad_norm": 2.9162894668172514, + "learning_rate": 5.97224770860183e-09, + "loss": 1.1971, + "step": 7127 + }, + { + "epoch": 0.9663119365552769, + "grad_norm": 2.5544552243037026, + "learning_rate": 5.924421499343801e-09, + "loss": 1.1243, + "step": 7128 + }, + { + "epoch": 0.9664475022029417, + "grad_norm": 1.9602520992841506, + "learning_rate": 5.8767869897145e-09, + "loss": 1.1742, + "step": 7129 + }, + { + "epoch": 0.9665830678506067, + "grad_norm": 3.186700305856264, + "learning_rate": 5.8293441888994655e-09, + "loss": 1.1163, + "step": 7130 + }, + { + "epoch": 0.9667186334982716, + "grad_norm": 2.0264628189611567, + "learning_rate": 5.7820931060481585e-09, + "loss": 1.1191, + "step": 7131 + }, + { + "epoch": 0.9668541991459364, + "grad_norm": 1.8193651508859305, + "learning_rate": 5.735033750272067e-09, + "loss": 1.1146, + "step": 7132 + }, + { + "epoch": 0.9669897647936013, + "grad_norm": 2.173192138214751, + "learning_rate": 5.68816613064671e-09, + "loss": 1.1632, + "step": 7133 + }, + { + "epoch": 0.9671253304412661, + "grad_norm": 
1.8991509152845314, + "learning_rate": 5.6414902562096356e-09, + "loss": 1.1556, + "step": 7134 + }, + { + "epoch": 0.9672608960889311, + "grad_norm": 2.2766238620158945, + "learning_rate": 5.595006135962421e-09, + "loss": 1.1464, + "step": 7135 + }, + { + "epoch": 0.967396461736596, + "grad_norm": 1.735284475341264, + "learning_rate": 5.548713778868786e-09, + "loss": 1.1549, + "step": 7136 + }, + { + "epoch": 0.9675320273842608, + "grad_norm": 1.7073978818400524, + "learning_rate": 5.502613193856031e-09, + "loss": 1.1192, + "step": 7137 + }, + { + "epoch": 0.9676675930319257, + "grad_norm": 1.855572696679519, + "learning_rate": 5.45670438981416e-09, + "loss": 1.1459, + "step": 7138 + }, + { + "epoch": 0.9678031586795905, + "grad_norm": 4.02562793628154, + "learning_rate": 5.4109873755964205e-09, + "loss": 1.1434, + "step": 7139 + }, + { + "epoch": 0.9679387243272555, + "grad_norm": 1.7214924596199257, + "learning_rate": 5.365462160018985e-09, + "loss": 1.1421, + "step": 7140 + }, + { + "epoch": 0.9680742899749204, + "grad_norm": 2.7245005876692425, + "learning_rate": 5.3201287518610525e-09, + "loss": 1.1191, + "step": 7141 + }, + { + "epoch": 0.9682098556225852, + "grad_norm": 15.150072059147984, + "learning_rate": 5.274987159864741e-09, + "loss": 1.1365, + "step": 7142 + }, + { + "epoch": 0.9683454212702501, + "grad_norm": 2.3386527463763653, + "learning_rate": 5.2300373927351984e-09, + "loss": 1.1901, + "step": 7143 + }, + { + "epoch": 0.968480986917915, + "grad_norm": 2.299623358277266, + "learning_rate": 5.185279459140823e-09, + "loss": 1.1644, + "step": 7144 + }, + { + "epoch": 0.9686165525655799, + "grad_norm": 2.0329961132726826, + "learning_rate": 5.140713367712601e-09, + "loss": 1.1404, + "step": 7145 + }, + { + "epoch": 0.9687521182132448, + "grad_norm": 1.7832082021786617, + "learning_rate": 5.09633912704488e-09, + "loss": 1.1598, + "step": 7146 + }, + { + "epoch": 0.9688876838609096, + "grad_norm": 1.6376291365491504, + "learning_rate": 5.052156745694924e-09, + "loss": 1.1294, + "step": 7147 + }, + { + "epoch": 0.9690232495085745, + "grad_norm": 1.7365066134442486, + "learning_rate": 5.00816623218292e-09, + "loss": 1.18, + "step": 7148 + }, + { + "epoch": 0.9691588151562394, + "grad_norm": 1.8663383800459186, + "learning_rate": 4.964367594991969e-09, + "loss": 1.1361, + "step": 7149 + }, + { + "epoch": 0.9692943808039043, + "grad_norm": 2.328539794063838, + "learning_rate": 4.920760842568539e-09, + "loss": 1.1334, + "step": 7150 + }, + { + "epoch": 0.9694299464515692, + "grad_norm": 1.5827489966837187, + "learning_rate": 4.877345983321568e-09, + "loss": 1.1668, + "step": 7151 + }, + { + "epoch": 0.969565512099234, + "grad_norm": 1.8536877591821892, + "learning_rate": 4.834123025623471e-09, + "loss": 1.1714, + "step": 7152 + }, + { + "epoch": 0.969701077746899, + "grad_norm": 2.086608826575171, + "learning_rate": 4.791091977809358e-09, + "loss": 1.1257, + "step": 7153 + }, + { + "epoch": 0.9698366433945638, + "grad_norm": 1.6747684108715788, + "learning_rate": 4.7482528481774805e-09, + "loss": 1.0766, + "step": 7154 + }, + { + "epoch": 0.9699722090422287, + "grad_norm": 1.6105800102303338, + "learning_rate": 4.705605644988897e-09, + "loss": 1.1536, + "step": 7155 + }, + { + "epoch": 0.9701077746898936, + "grad_norm": 5.469234824667813, + "learning_rate": 4.663150376468028e-09, + "loss": 1.1249, + "step": 7156 + }, + { + "epoch": 0.9702433403375584, + "grad_norm": 4.82574201765184, + "learning_rate": 4.62088705080177e-09, + "loss": 1.1883, + "step": 7157 + }, + { + "epoch": 
0.9703789059852234, + "grad_norm": 1.563313250387079, + "learning_rate": 4.5788156761404906e-09, + "loss": 1.149, + "step": 7158 + }, + { + "epoch": 0.9705144716328882, + "grad_norm": 2.010942505398286, + "learning_rate": 4.536936260597257e-09, + "loss": 1.1481, + "step": 7159 + }, + { + "epoch": 0.9706500372805531, + "grad_norm": 1.6562810574586375, + "learning_rate": 4.495248812248054e-09, + "loss": 1.1312, + "step": 7160 + }, + { + "epoch": 0.970785602928218, + "grad_norm": 1.7368394035724355, + "learning_rate": 4.453753339132116e-09, + "loss": 1.1245, + "step": 7161 + }, + { + "epoch": 0.9709211685758828, + "grad_norm": 2.0801761688424865, + "learning_rate": 4.412449849251598e-09, + "loss": 1.122, + "step": 7162 + }, + { + "epoch": 0.9710567342235478, + "grad_norm": 3.081142690999182, + "learning_rate": 4.371338350571352e-09, + "loss": 1.1387, + "step": 7163 + }, + { + "epoch": 0.9711922998712126, + "grad_norm": 5.002942842803807, + "learning_rate": 4.3304188510194795e-09, + "loss": 1.1518, + "step": 7164 + }, + { + "epoch": 0.9713278655188775, + "grad_norm": 2.1917339871372996, + "learning_rate": 4.289691358486891e-09, + "loss": 1.1814, + "step": 7165 + }, + { + "epoch": 0.9714634311665424, + "grad_norm": 1.7099846884508807, + "learning_rate": 4.249155880827859e-09, + "loss": 1.1433, + "step": 7166 + }, + { + "epoch": 0.9715989968142072, + "grad_norm": 3.937469043120042, + "learning_rate": 4.2088124258590205e-09, + "loss": 1.1404, + "step": 7167 + }, + { + "epoch": 0.9717345624618722, + "grad_norm": 3.0733420626476455, + "learning_rate": 4.168661001360485e-09, + "loss": 1.1241, + "step": 7168 + }, + { + "epoch": 0.971870128109537, + "grad_norm": 2.6151311712438052, + "learning_rate": 4.128701615074947e-09, + "loss": 1.1448, + "step": 7169 + }, + { + "epoch": 0.9720056937572019, + "grad_norm": 1.7002443906561184, + "learning_rate": 4.088934274708466e-09, + "loss": 1.1288, + "step": 7170 + }, + { + "epoch": 0.9721412594048668, + "grad_norm": 1.840219349913103, + "learning_rate": 4.049358987929685e-09, + "loss": 1.1314, + "step": 7171 + }, + { + "epoch": 0.9722768250525317, + "grad_norm": 1.8241169536750832, + "learning_rate": 4.00997576237061e-09, + "loss": 1.1554, + "step": 7172 + }, + { + "epoch": 0.9724123907001966, + "grad_norm": 2.243689171142017, + "learning_rate": 3.970784605625721e-09, + "loss": 1.1247, + "step": 7173 + }, + { + "epoch": 0.9725479563478614, + "grad_norm": 1.5712422843457416, + "learning_rate": 3.931785525252862e-09, + "loss": 1.1137, + "step": 7174 + }, + { + "epoch": 0.9726835219955263, + "grad_norm": 1.8665461514830444, + "learning_rate": 3.892978528772684e-09, + "loss": 1.1219, + "step": 7175 + }, + { + "epoch": 0.9728190876431912, + "grad_norm": 7.598621427516209, + "learning_rate": 3.854363623668866e-09, + "loss": 1.1379, + "step": 7176 + }, + { + "epoch": 0.9729546532908561, + "grad_norm": 1.7772543613432263, + "learning_rate": 3.815940817387786e-09, + "loss": 1.1406, + "step": 7177 + }, + { + "epoch": 0.973090218938521, + "grad_norm": 1.8250275959089104, + "learning_rate": 3.777710117339183e-09, + "loss": 1.1109, + "step": 7178 + }, + { + "epoch": 0.9732257845861859, + "grad_norm": 1.7492700774593921, + "learning_rate": 3.739671530895605e-09, + "loss": 1.1359, + "step": 7179 + }, + { + "epoch": 0.9733613502338507, + "grad_norm": 3.2421072488956537, + "learning_rate": 3.7018250653921834e-09, + "loss": 1.1345, + "step": 7180 + }, + { + "epoch": 0.9734969158815157, + "grad_norm": 1.768868032608592, + "learning_rate": 3.6641707281276357e-09, + "loss": 1.139, 
+ "step": 7181 + }, + { + "epoch": 0.9736324815291805, + "grad_norm": 2.08672006186654, + "learning_rate": 3.6267085263631537e-09, + "loss": 1.1188, + "step": 7182 + }, + { + "epoch": 0.9737680471768454, + "grad_norm": 1.62689356155613, + "learning_rate": 3.589438467322958e-09, + "loss": 1.1465, + "step": 7183 + }, + { + "epoch": 0.9739036128245103, + "grad_norm": 1.935427220439092, + "learning_rate": 3.5523605581944115e-09, + "loss": 1.1515, + "step": 7184 + }, + { + "epoch": 0.9740391784721751, + "grad_norm": 1.6480547118391864, + "learning_rate": 3.5154748061276828e-09, + "loss": 1.1515, + "step": 7185 + }, + { + "epoch": 0.9741747441198401, + "grad_norm": 2.9121013438995917, + "learning_rate": 3.47878121823586e-09, + "loss": 1.15, + "step": 7186 + }, + { + "epoch": 0.9743103097675049, + "grad_norm": 1.8425909909589555, + "learning_rate": 3.4422798015949496e-09, + "loss": 1.1258, + "step": 7187 + }, + { + "epoch": 0.9744458754151698, + "grad_norm": 2.01591511697042, + "learning_rate": 3.405970563244098e-09, + "loss": 1.1297, + "step": 7188 + }, + { + "epoch": 0.9745814410628347, + "grad_norm": 1.7836883475748668, + "learning_rate": 3.36985351018515e-09, + "loss": 1.0689, + "step": 7189 + }, + { + "epoch": 0.9747170067104995, + "grad_norm": 1.6944162355889227, + "learning_rate": 3.3339286493830886e-09, + "loss": 1.1639, + "step": 7190 + }, + { + "epoch": 0.9748525723581645, + "grad_norm": 1.5172132881764009, + "learning_rate": 3.2981959877657063e-09, + "loss": 1.1323, + "step": 7191 + }, + { + "epoch": 0.9749881380058293, + "grad_norm": 1.6102555959423932, + "learning_rate": 3.2626555322236014e-09, + "loss": 1.1528, + "step": 7192 + }, + { + "epoch": 0.9751237036534942, + "grad_norm": 1.9318904294140031, + "learning_rate": 3.227307289610737e-09, + "loss": 1.104, + "step": 7193 + }, + { + "epoch": 0.9752592693011591, + "grad_norm": 1.8177695185402667, + "learning_rate": 3.192151266743548e-09, + "loss": 1.1657, + "step": 7194 + }, + { + "epoch": 0.975394834948824, + "grad_norm": 2.070692220488257, + "learning_rate": 3.157187470401723e-09, + "loss": 1.1623, + "step": 7195 + }, + { + "epoch": 0.9755304005964889, + "grad_norm": 1.7506594018289297, + "learning_rate": 3.122415907327647e-09, + "loss": 1.1579, + "step": 7196 + }, + { + "epoch": 0.9756659662441537, + "grad_norm": 2.355842709171441, + "learning_rate": 3.0878365842268437e-09, + "loss": 1.1569, + "step": 7197 + }, + { + "epoch": 0.9758015318918186, + "grad_norm": 1.8490888226452398, + "learning_rate": 3.053449507767536e-09, + "loss": 1.1419, + "step": 7198 + }, + { + "epoch": 0.9759370975394835, + "grad_norm": 1.9370376576799004, + "learning_rate": 3.019254684581085e-09, + "loss": 1.1693, + "step": 7199 + }, + { + "epoch": 0.9760726631871484, + "grad_norm": 2.058146745789947, + "learning_rate": 2.985252121261661e-09, + "loss": 1.1405, + "step": 7200 + }, + { + "epoch": 0.9762082288348133, + "grad_norm": 1.7413677137156276, + "learning_rate": 2.951441824366463e-09, + "loss": 1.1212, + "step": 7201 + }, + { + "epoch": 0.9763437944824781, + "grad_norm": 1.7497241526906495, + "learning_rate": 2.9178238004154975e-09, + "loss": 1.1222, + "step": 7202 + }, + { + "epoch": 0.976479360130143, + "grad_norm": 2.0478553777516164, + "learning_rate": 2.88439805589169e-09, + "loss": 1.1257, + "step": 7203 + }, + { + "epoch": 0.976614925777808, + "grad_norm": 1.7930925409406233, + "learning_rate": 2.851164597240996e-09, + "loss": 1.1452, + "step": 7204 + }, + { + "epoch": 0.9767504914254728, + "grad_norm": 1.9023445158329266, + "learning_rate": 
2.8181234308721767e-09, + "loss": 1.1023, + "step": 7205 + }, + { + "epoch": 0.9768860570731377, + "grad_norm": 1.7676241118100382, + "learning_rate": 2.7852745631570253e-09, + "loss": 1.1207, + "step": 7206 + }, + { + "epoch": 0.9770216227208025, + "grad_norm": 2.2835700435704442, + "learning_rate": 2.7526180004300294e-09, + "loss": 1.1477, + "step": 7207 + }, + { + "epoch": 0.9771571883684674, + "grad_norm": 1.993779861409688, + "learning_rate": 2.720153748988929e-09, + "loss": 1.139, + "step": 7208 + }, + { + "epoch": 0.9772927540161324, + "grad_norm": 2.0564120065020686, + "learning_rate": 2.6878818150941616e-09, + "loss": 1.1561, + "step": 7209 + }, + { + "epoch": 0.9774283196637972, + "grad_norm": 1.7617558992976297, + "learning_rate": 2.655802204968971e-09, + "loss": 1.1534, + "step": 7210 + }, + { + "epoch": 0.9775638853114621, + "grad_norm": 1.569259650089582, + "learning_rate": 2.6239149247999635e-09, + "loss": 1.1249, + "step": 7211 + }, + { + "epoch": 0.9776994509591269, + "grad_norm": 2.0583119171328765, + "learning_rate": 2.592219980735999e-09, + "loss": 1.1342, + "step": 7212 + }, + { + "epoch": 0.9778350166067918, + "grad_norm": 1.9190665853981144, + "learning_rate": 2.5607173788894097e-09, + "loss": 1.1431, + "step": 7213 + }, + { + "epoch": 0.9779705822544568, + "grad_norm": 2.1520084360330562, + "learning_rate": 2.5294071253351146e-09, + "loss": 1.1414, + "step": 7214 + }, + { + "epoch": 0.9781061479021216, + "grad_norm": 1.8943750862736246, + "learning_rate": 2.498289226111061e-09, + "loss": 1.1195, + "step": 7215 + }, + { + "epoch": 0.9782417135497865, + "grad_norm": 1.7986542565288057, + "learning_rate": 2.467363687218227e-09, + "loss": 1.1712, + "step": 7216 + }, + { + "epoch": 0.9783772791974513, + "grad_norm": 1.6732476417496203, + "learning_rate": 2.436630514620286e-09, + "loss": 1.1181, + "step": 7217 + }, + { + "epoch": 0.9785128448451162, + "grad_norm": 1.7831274982927694, + "learning_rate": 2.4060897142438308e-09, + "loss": 1.1453, + "step": 7218 + }, + { + "epoch": 0.9786484104927812, + "grad_norm": 2.430034644895944, + "learning_rate": 2.3757412919783725e-09, + "loss": 1.1635, + "step": 7219 + }, + { + "epoch": 0.978783976140446, + "grad_norm": 3.3884050202263265, + "learning_rate": 2.345585253676452e-09, + "loss": 1.1488, + "step": 7220 + }, + { + "epoch": 0.9789195417881109, + "grad_norm": 2.0490940308181074, + "learning_rate": 2.3156216051535284e-09, + "loss": 1.1019, + "step": 7221 + }, + { + "epoch": 0.9790551074357757, + "grad_norm": 1.8160320765347415, + "learning_rate": 2.285850352187646e-09, + "loss": 1.1228, + "step": 7222 + }, + { + "epoch": 0.9791906730834407, + "grad_norm": 1.8512312364129713, + "learning_rate": 2.2562715005201016e-09, + "loss": 1.1459, + "step": 7223 + }, + { + "epoch": 0.9793262387311056, + "grad_norm": 3.220731267524111, + "learning_rate": 2.226885055854777e-09, + "loss": 1.183, + "step": 7224 + }, + { + "epoch": 0.9794618043787704, + "grad_norm": 1.8846093023551764, + "learning_rate": 2.1976910238588055e-09, + "loss": 1.163, + "step": 7225 + }, + { + "epoch": 0.9795973700264353, + "grad_norm": 1.6958886312172636, + "learning_rate": 2.168689410162017e-09, + "loss": 1.1115, + "step": 7226 + }, + { + "epoch": 0.9797329356741001, + "grad_norm": 1.6840520503070422, + "learning_rate": 2.1398802203569375e-09, + "loss": 1.1538, + "step": 7227 + }, + { + "epoch": 0.9798685013217651, + "grad_norm": 1.6883993770277839, + "learning_rate": 2.111263459999457e-09, + "loss": 1.1176, + "step": 7228 + }, + { + "epoch": 0.98000406696943, + 
"grad_norm": 1.7843039146594757, + "learning_rate": 2.0828391346078277e-09, + "loss": 1.1384, + "step": 7229 + }, + { + "epoch": 0.9801396326170948, + "grad_norm": 15.317713192365474, + "learning_rate": 2.054607249663665e-09, + "loss": 1.1477, + "step": 7230 + }, + { + "epoch": 0.9802751982647597, + "grad_norm": 1.6680572031121492, + "learning_rate": 2.0265678106111685e-09, + "loss": 1.1646, + "step": 7231 + }, + { + "epoch": 0.9804107639124245, + "grad_norm": 1.9666698837543557, + "learning_rate": 1.9987208228575693e-09, + "loss": 1.1283, + "step": 7232 + }, + { + "epoch": 0.9805463295600895, + "grad_norm": 1.9311813803018716, + "learning_rate": 1.971066291772905e-09, + "loss": 1.1627, + "step": 7233 + }, + { + "epoch": 0.9806818952077544, + "grad_norm": 4.642162521894147, + "learning_rate": 1.9436042226901315e-09, + "loss": 1.1566, + "step": 7234 + }, + { + "epoch": 0.9808174608554192, + "grad_norm": 1.6951795124326932, + "learning_rate": 1.9163346209051246e-09, + "loss": 1.1238, + "step": 7235 + }, + { + "epoch": 0.9809530265030841, + "grad_norm": 1.6884053593944854, + "learning_rate": 1.889257491676677e-09, + "loss": 1.1048, + "step": 7236 + }, + { + "epoch": 0.981088592150749, + "grad_norm": 1.7251300380453687, + "learning_rate": 1.8623728402261674e-09, + "loss": 1.1208, + "step": 7237 + }, + { + "epoch": 0.9812241577984139, + "grad_norm": 1.8092506902857715, + "learning_rate": 1.8356806717383377e-09, + "loss": 1.1598, + "step": 7238 + }, + { + "epoch": 0.9813597234460788, + "grad_norm": 2.032709813093778, + "learning_rate": 1.809180991360404e-09, + "loss": 1.1369, + "step": 7239 + }, + { + "epoch": 0.9814952890937436, + "grad_norm": 1.783943213788109, + "learning_rate": 1.7828738042027225e-09, + "loss": 1.1527, + "step": 7240 + }, + { + "epoch": 0.9816308547414085, + "grad_norm": 2.0389027175904038, + "learning_rate": 1.7567591153383466e-09, + "loss": 1.1571, + "step": 7241 + }, + { + "epoch": 0.9817664203890734, + "grad_norm": 1.991990803078623, + "learning_rate": 1.7308369298033587e-09, + "loss": 1.1526, + "step": 7242 + }, + { + "epoch": 0.9819019860367383, + "grad_norm": 1.8966558248654597, + "learning_rate": 1.7051072525965382e-09, + "loss": 1.1442, + "step": 7243 + }, + { + "epoch": 0.9820375516844032, + "grad_norm": 2.027849315381984, + "learning_rate": 1.6795700886798049e-09, + "loss": 1.1397, + "step": 7244 + }, + { + "epoch": 0.982173117332068, + "grad_norm": 1.7155614056182429, + "learning_rate": 1.6542254429776636e-09, + "loss": 1.072, + "step": 7245 + }, + { + "epoch": 0.982308682979733, + "grad_norm": 2.108213829059701, + "learning_rate": 1.6290733203776497e-09, + "loss": 1.1672, + "step": 7246 + }, + { + "epoch": 0.9824442486273978, + "grad_norm": 2.25294253488452, + "learning_rate": 1.6041137257303272e-09, + "loss": 1.1269, + "step": 7247 + }, + { + "epoch": 0.9825798142750627, + "grad_norm": 1.7647821601430227, + "learning_rate": 1.5793466638486242e-09, + "loss": 1.1407, + "step": 7248 + }, + { + "epoch": 0.9827153799227276, + "grad_norm": 1.547335322668067, + "learning_rate": 1.554772139509053e-09, + "loss": 1.1228, + "step": 7249 + }, + { + "epoch": 0.9828509455703924, + "grad_norm": 3.062829255690299, + "learning_rate": 1.5303901574502675e-09, + "loss": 1.1137, + "step": 7250 + }, + { + "epoch": 0.9829865112180574, + "grad_norm": 2.023128309062777, + "learning_rate": 1.5062007223743956e-09, + "loss": 1.1518, + "step": 7251 + }, + { + "epoch": 0.9831220768657222, + "grad_norm": 1.8375182714604354, + "learning_rate": 1.482203838946039e-09, + "loss": 1.1236, + 
"step": 7252 + }, + { + "epoch": 0.9832576425133871, + "grad_norm": 1.8198285258749562, + "learning_rate": 1.4583995117929404e-09, + "loss": 1.1734, + "step": 7253 + }, + { + "epoch": 0.983393208161052, + "grad_norm": 2.0084987846373803, + "learning_rate": 1.434787745505317e-09, + "loss": 1.1601, + "step": 7254 + }, + { + "epoch": 0.9835287738087168, + "grad_norm": 1.826919425880202, + "learning_rate": 1.4113685446368595e-09, + "loss": 1.1061, + "step": 7255 + }, + { + "epoch": 0.9836643394563818, + "grad_norm": 2.382496547237129, + "learning_rate": 1.388141913703511e-09, + "loss": 1.1447, + "step": 7256 + }, + { + "epoch": 0.9837999051040466, + "grad_norm": 1.6502329430617906, + "learning_rate": 1.3651078571844664e-09, + "loss": 1.1184, + "step": 7257 + }, + { + "epoch": 0.9839354707517115, + "grad_norm": 2.0212168422761905, + "learning_rate": 1.3422663795215062e-09, + "loss": 1.1398, + "step": 7258 + }, + { + "epoch": 0.9840710363993764, + "grad_norm": 1.8317893266416365, + "learning_rate": 1.3196174851196617e-09, + "loss": 1.1098, + "step": 7259 + }, + { + "epoch": 0.9842066020470412, + "grad_norm": 1.6702671674806389, + "learning_rate": 1.2971611783465507e-09, + "loss": 1.1038, + "step": 7260 + }, + { + "epoch": 0.9843421676947062, + "grad_norm": 2.0421262488195278, + "learning_rate": 1.274897463532487e-09, + "loss": 1.1452, + "step": 7261 + }, + { + "epoch": 0.9844777333423711, + "grad_norm": 1.8575980645149806, + "learning_rate": 1.2528263449710363e-09, + "loss": 1.1478, + "step": 7262 + }, + { + "epoch": 0.9846132989900359, + "grad_norm": 1.7988368281987863, + "learning_rate": 1.2309478269184602e-09, + "loss": 1.1487, + "step": 7263 + }, + { + "epoch": 0.9847488646377008, + "grad_norm": 1.8415272450298705, + "learning_rate": 1.2092619135937177e-09, + "loss": 1.1803, + "step": 7264 + }, + { + "epoch": 0.9848844302853657, + "grad_norm": 1.9597125053735975, + "learning_rate": 1.1877686091787963e-09, + "loss": 1.1236, + "step": 7265 + }, + { + "epoch": 0.9850199959330306, + "grad_norm": 1.8605610168095932, + "learning_rate": 1.1664679178186032e-09, + "loss": 1.0957, + "step": 7266 + }, + { + "epoch": 0.9851555615806955, + "grad_norm": 2.016151484730992, + "learning_rate": 1.1453598436208522e-09, + "loss": 1.1614, + "step": 7267 + }, + { + "epoch": 0.9852911272283603, + "grad_norm": 1.670927310419065, + "learning_rate": 1.1244443906558432e-09, + "loss": 1.1411, + "step": 7268 + }, + { + "epoch": 0.9854266928760252, + "grad_norm": 2.1507624467807163, + "learning_rate": 1.1037215629571272e-09, + "loss": 1.1573, + "step": 7269 + }, + { + "epoch": 0.9855622585236901, + "grad_norm": 1.7214105884849327, + "learning_rate": 1.0831913645209522e-09, + "loss": 1.1489, + "step": 7270 + }, + { + "epoch": 0.985697824171355, + "grad_norm": 2.231917277772765, + "learning_rate": 1.0628537993063736e-09, + "loss": 1.1921, + "step": 7271 + }, + { + "epoch": 0.9858333898190199, + "grad_norm": 2.338680453383897, + "learning_rate": 1.042708871235143e-09, + "loss": 1.1505, + "step": 7272 + }, + { + "epoch": 0.9859689554666847, + "grad_norm": 1.8540238924447394, + "learning_rate": 1.0227565841923746e-09, + "loss": 1.1156, + "step": 7273 + }, + { + "epoch": 0.9861045211143497, + "grad_norm": 1.9919953016226941, + "learning_rate": 1.002996942025547e-09, + "loss": 1.1387, + "step": 7274 + }, + { + "epoch": 0.9862400867620145, + "grad_norm": 1.7873706147698667, + "learning_rate": 9.834299485450559e-10, + "loss": 1.1572, + "step": 7275 + }, + { + "epoch": 0.9863756524096794, + "grad_norm": 2.19883412918489, + 
"learning_rate": 9.640556075244388e-10, + "loss": 1.1572, + "step": 7276 + }, + { + "epoch": 0.9865112180573443, + "grad_norm": 2.1226419794684905, + "learning_rate": 9.448739226997072e-10, + "loss": 1.1509, + "step": 7277 + }, + { + "epoch": 0.9866467837050091, + "grad_norm": 10.332641516010355, + "learning_rate": 9.258848977700129e-10, + "loss": 1.1448, + "step": 7278 + }, + { + "epoch": 0.9867823493526741, + "grad_norm": 1.994524055982041, + "learning_rate": 9.070885363972047e-10, + "loss": 1.1823, + "step": 7279 + }, + { + "epoch": 0.9869179150003389, + "grad_norm": 2.752814520179471, + "learning_rate": 8.884848422060498e-10, + "loss": 1.1727, + "step": 7280 + }, + { + "epoch": 0.9870534806480038, + "grad_norm": 2.765542193530241, + "learning_rate": 8.700738187840118e-10, + "loss": 1.154, + "step": 7281 + }, + { + "epoch": 0.9871890462956687, + "grad_norm": 2.735748624346094, + "learning_rate": 8.518554696815838e-10, + "loss": 1.1809, + "step": 7282 + }, + { + "epoch": 0.9873246119433335, + "grad_norm": 2.6563484526647363, + "learning_rate": 8.338297984121778e-10, + "loss": 1.1173, + "step": 7283 + }, + { + "epoch": 0.9874601775909985, + "grad_norm": 1.564464218201518, + "learning_rate": 8.159968084515689e-10, + "loss": 1.1186, + "step": 7284 + }, + { + "epoch": 0.9875957432386633, + "grad_norm": 2.604208185989026, + "learning_rate": 7.983565032390061e-10, + "loss": 1.1092, + "step": 7285 + }, + { + "epoch": 0.9877313088863282, + "grad_norm": 1.735012107279987, + "learning_rate": 7.809088861762125e-10, + "loss": 1.1177, + "step": 7286 + }, + { + "epoch": 0.9878668745339931, + "grad_norm": 1.9791858862464926, + "learning_rate": 7.636539606277192e-10, + "loss": 1.1861, + "step": 7287 + }, + { + "epoch": 0.988002440181658, + "grad_norm": 1.9247409444314534, + "learning_rate": 7.465917299210866e-10, + "loss": 1.1234, + "step": 7288 + }, + { + "epoch": 0.9881380058293229, + "grad_norm": 1.9628582241886547, + "learning_rate": 7.297221973465717e-10, + "loss": 1.1364, + "step": 7289 + }, + { + "epoch": 0.9882735714769877, + "grad_norm": 2.631197619377059, + "learning_rate": 7.130453661573499e-10, + "loss": 1.1391, + "step": 7290 + }, + { + "epoch": 0.9884091371246526, + "grad_norm": 2.02425886436325, + "learning_rate": 6.965612395695153e-10, + "loss": 1.1688, + "step": 7291 + }, + { + "epoch": 0.9885447027723175, + "grad_norm": 2.0019577618428297, + "learning_rate": 6.802698207617474e-10, + "loss": 1.1384, + "step": 7292 + }, + { + "epoch": 0.9886802684199824, + "grad_norm": 4.282581386472536, + "learning_rate": 6.641711128758665e-10, + "loss": 1.1112, + "step": 7293 + }, + { + "epoch": 0.9888158340676473, + "grad_norm": 1.8654958013145784, + "learning_rate": 6.48265119016278e-10, + "loss": 1.1049, + "step": 7294 + }, + { + "epoch": 0.9889513997153121, + "grad_norm": 1.9482769424608362, + "learning_rate": 6.325518422503063e-10, + "loss": 1.1388, + "step": 7295 + }, + { + "epoch": 0.989086965362977, + "grad_norm": 1.8414429459403316, + "learning_rate": 6.170312856083048e-10, + "loss": 1.164, + "step": 7296 + }, + { + "epoch": 0.989222531010642, + "grad_norm": 1.7505508049535863, + "learning_rate": 6.017034520831021e-10, + "loss": 1.183, + "step": 7297 + }, + { + "epoch": 0.9893580966583068, + "grad_norm": 1.5063407863257874, + "learning_rate": 5.865683446305558e-10, + "loss": 1.1336, + "step": 7298 + }, + { + "epoch": 0.9894936623059717, + "grad_norm": 3.163983140138284, + "learning_rate": 5.716259661695533e-10, + "loss": 1.1496, + "step": 7299 + }, + { + "epoch": 0.9896292279536365, + 
"grad_norm": 1.9081258299409787, + "learning_rate": 5.568763195813453e-10, + "loss": 1.1385, + "step": 7300 + }, + { + "epoch": 0.9897647936013014, + "grad_norm": 1.8103920104841729, + "learning_rate": 5.423194077104343e-10, + "loss": 1.0931, + "step": 7301 + }, + { + "epoch": 0.9899003592489664, + "grad_norm": 2.1155749207718206, + "learning_rate": 5.279552333640191e-10, + "loss": 1.0998, + "step": 7302 + }, + { + "epoch": 0.9900359248966312, + "grad_norm": 1.9374002587149286, + "learning_rate": 5.137837993121064e-10, + "loss": 1.1158, + "step": 7303 + }, + { + "epoch": 0.9901714905442961, + "grad_norm": 3.2854455112470444, + "learning_rate": 4.998051082875099e-10, + "loss": 1.2179, + "step": 7304 + }, + { + "epoch": 0.9903070561919609, + "grad_norm": 1.8362492058751527, + "learning_rate": 4.860191629859623e-10, + "loss": 1.1625, + "step": 7305 + }, + { + "epoch": 0.9904426218396258, + "grad_norm": 3.182519325635339, + "learning_rate": 4.724259660658924e-10, + "loss": 1.1587, + "step": 7306 + }, + { + "epoch": 0.9905781874872908, + "grad_norm": 1.6508731591149417, + "learning_rate": 4.5902552014864815e-10, + "loss": 1.1127, + "step": 7307 + }, + { + "epoch": 0.9907137531349556, + "grad_norm": 1.7845776181795554, + "learning_rate": 4.458178278184954e-10, + "loss": 1.1242, + "step": 7308 + }, + { + "epoch": 0.9908493187826205, + "grad_norm": 2.2883477616474845, + "learning_rate": 4.328028916222859e-10, + "loss": 1.1448, + "step": 7309 + }, + { + "epoch": 0.9909848844302853, + "grad_norm": 1.9515104839537334, + "learning_rate": 4.199807140700118e-10, + "loss": 1.1545, + "step": 7310 + }, + { + "epoch": 0.9911204500779502, + "grad_norm": 2.9219158373296468, + "learning_rate": 4.073512976342508e-10, + "loss": 1.1227, + "step": 7311 + }, + { + "epoch": 0.9912560157256152, + "grad_norm": 1.846093623520437, + "learning_rate": 3.9491464475049916e-10, + "loss": 1.1635, + "step": 7312 + }, + { + "epoch": 0.99139158137328, + "grad_norm": 1.907541158713146, + "learning_rate": 3.826707578170607e-10, + "loss": 1.1462, + "step": 7313 + }, + { + "epoch": 0.9915271470209449, + "grad_norm": 1.6876607120411955, + "learning_rate": 3.7061963919504667e-10, + "loss": 1.1498, + "step": 7314 + }, + { + "epoch": 0.9916627126686097, + "grad_norm": 1.786196291472633, + "learning_rate": 3.5876129120837596e-10, + "loss": 1.1479, + "step": 7315 + }, + { + "epoch": 0.9917982783162747, + "grad_norm": 1.757647514034416, + "learning_rate": 3.470957161439969e-10, + "loss": 1.1249, + "step": 7316 + }, + { + "epoch": 0.9919338439639396, + "grad_norm": 1.9396051907071463, + "learning_rate": 3.3562291625133245e-10, + "loss": 1.1518, + "step": 7317 + }, + { + "epoch": 0.9920694096116044, + "grad_norm": 1.9590896526900903, + "learning_rate": 3.24342893742946e-10, + "loss": 1.126, + "step": 7318 + }, + { + "epoch": 0.9922049752592693, + "grad_norm": 6.783881307879676, + "learning_rate": 3.1325565079409755e-10, + "loss": 1.138, + "step": 7319 + }, + { + "epoch": 0.9923405409069341, + "grad_norm": 1.6412255667183278, + "learning_rate": 3.023611895428546e-10, + "loss": 1.1097, + "step": 7320 + }, + { + "epoch": 0.9924761065545991, + "grad_norm": 1.771098771953571, + "learning_rate": 2.9165951209020325e-10, + "loss": 1.124, + "step": 7321 + }, + { + "epoch": 0.992611672202264, + "grad_norm": 1.9584679213800282, + "learning_rate": 2.8115062049971493e-10, + "loss": 1.1096, + "step": 7322 + }, + { + "epoch": 0.9927472378499288, + "grad_norm": 2.320416204196344, + "learning_rate": 2.7083451679799084e-10, + "loss": 1.0953, + "step": 7323 + 
}, + { + "epoch": 0.9928828034975937, + "grad_norm": 2.535446348754648, + "learning_rate": 2.6071120297443963e-10, + "loss": 1.1678, + "step": 7324 + }, + { + "epoch": 0.9930183691452585, + "grad_norm": 1.7399411007265313, + "learning_rate": 2.507806809813884e-10, + "loss": 1.1228, + "step": 7325 + }, + { + "epoch": 0.9931539347929235, + "grad_norm": 5.753471712813689, + "learning_rate": 2.410429527336388e-10, + "loss": 1.146, + "step": 7326 + }, + { + "epoch": 0.9932895004405884, + "grad_norm": 2.1087133633289263, + "learning_rate": 2.3149802010913322e-10, + "loss": 1.1507, + "step": 7327 + }, + { + "epoch": 0.9934250660882532, + "grad_norm": 1.792617709652166, + "learning_rate": 2.221458849486213e-10, + "loss": 1.1665, + "step": 7328 + }, + { + "epoch": 0.9935606317359181, + "grad_norm": 1.8087370367430278, + "learning_rate": 2.1298654905543834e-10, + "loss": 1.1247, + "step": 7329 + }, + { + "epoch": 0.993696197383583, + "grad_norm": 3.500286553444128, + "learning_rate": 2.0402001419594917e-10, + "loss": 1.1313, + "step": 7330 + }, + { + "epoch": 0.9938317630312479, + "grad_norm": 2.2688191222990337, + "learning_rate": 1.9524628209943718e-10, + "loss": 1.1653, + "step": 7331 + }, + { + "epoch": 0.9939673286789128, + "grad_norm": 1.7817587538255701, + "learning_rate": 1.8666535445754917e-10, + "loss": 1.1399, + "step": 7332 + }, + { + "epoch": 0.9941028943265776, + "grad_norm": 1.6594429755653881, + "learning_rate": 1.7827723292518358e-10, + "loss": 1.129, + "step": 7333 + }, + { + "epoch": 0.9942384599742425, + "grad_norm": 1.7153080301246038, + "learning_rate": 1.7008191912004645e-10, + "loss": 1.1355, + "step": 7334 + }, + { + "epoch": 0.9943740256219074, + "grad_norm": 2.3809254266349353, + "learning_rate": 1.6207941462242912e-10, + "loss": 1.1346, + "step": 7335 + }, + { + "epoch": 0.9945095912695723, + "grad_norm": 1.9925602755589376, + "learning_rate": 1.5426972097543068e-10, + "loss": 1.1427, + "step": 7336 + }, + { + "epoch": 0.9946451569172372, + "grad_norm": 2.366951558052229, + "learning_rate": 1.4665283968529062e-10, + "loss": 1.0949, + "step": 7337 + }, + { + "epoch": 0.994780722564902, + "grad_norm": 2.043757166157033, + "learning_rate": 1.3922877222083407e-10, + "loss": 1.1532, + "step": 7338 + }, + { + "epoch": 0.994916288212567, + "grad_norm": 1.7999573284973465, + "learning_rate": 1.3199752001369359e-10, + "loss": 1.1647, + "step": 7339 + }, + { + "epoch": 0.9950518538602319, + "grad_norm": 1.7607229207042296, + "learning_rate": 1.2495908445830928e-10, + "loss": 1.1164, + "step": 7340 + }, + { + "epoch": 0.9951874195078967, + "grad_norm": 2.090543528327042, + "learning_rate": 1.1811346691203982e-10, + "loss": 1.1102, + "step": 7341 + }, + { + "epoch": 0.9953229851555616, + "grad_norm": 2.223970705835262, + "learning_rate": 1.1146066869494042e-10, + "loss": 1.1517, + "step": 7342 + }, + { + "epoch": 0.9954585508032264, + "grad_norm": 1.7421977433070037, + "learning_rate": 1.0500069109009579e-10, + "loss": 1.1492, + "step": 7343 + }, + { + "epoch": 0.9955941164508914, + "grad_norm": 2.010284720460006, + "learning_rate": 9.873353534317619e-11, + "loss": 1.1195, + "step": 7344 + }, + { + "epoch": 0.9957296820985563, + "grad_norm": 1.9021729551988797, + "learning_rate": 9.265920266265936e-11, + "loss": 1.1314, + "step": 7345 + }, + { + "epoch": 0.9958652477462211, + "grad_norm": 2.557912110139075, + "learning_rate": 8.677769422005266e-11, + "loss": 1.1595, + "step": 7346 + }, + { + "epoch": 0.996000813393886, + "grad_norm": 1.9705092337859373, + "learning_rate": 
8.108901114955991e-11, + "loss": 1.1384, + "step": 7347 + }, + { + "epoch": 0.9961363790415508, + "grad_norm": 2.317661464057254, + "learning_rate": 7.559315454819249e-11, + "loss": 1.116, + "step": 7348 + }, + { + "epoch": 0.9962719446892158, + "grad_norm": 4.255659747032378, + "learning_rate": 7.029012547576929e-11, + "loss": 1.143, + "step": 7349 + }, + { + "epoch": 0.9964075103368807, + "grad_norm": 3.088343846915634, + "learning_rate": 6.517992495491676e-11, + "loss": 1.1476, + "step": 7350 + }, + { + "epoch": 0.9965430759845455, + "grad_norm": 2.4496356914929382, + "learning_rate": 6.026255397106884e-11, + "loss": 1.1338, + "step": 7351 + }, + { + "epoch": 0.9966786416322104, + "grad_norm": 1.75491575677336, + "learning_rate": 5.553801347257803e-11, + "loss": 1.1599, + "step": 7352 + }, + { + "epoch": 0.9968142072798752, + "grad_norm": 5.763547571866006, + "learning_rate": 5.1006304370493355e-11, + "loss": 1.1271, + "step": 7353 + }, + { + "epoch": 0.9969497729275402, + "grad_norm": 1.8186002182049237, + "learning_rate": 4.6667427538782386e-11, + "loss": 1.1188, + "step": 7354 + }, + { + "epoch": 0.9970853385752051, + "grad_norm": 1.6047628504113367, + "learning_rate": 4.252138381399817e-11, + "loss": 1.145, + "step": 7355 + }, + { + "epoch": 0.9972209042228699, + "grad_norm": 5.34619219346154, + "learning_rate": 3.856817399594536e-11, + "loss": 1.1313, + "step": 7356 + }, + { + "epoch": 0.9973564698705348, + "grad_norm": 1.8252090449795022, + "learning_rate": 3.4807798846681055e-11, + "loss": 1.1717, + "step": 7357 + }, + { + "epoch": 0.9974920355181996, + "grad_norm": 1.8016472093910534, + "learning_rate": 3.124025909151395e-11, + "loss": 1.157, + "step": 7358 + }, + { + "epoch": 0.9976276011658646, + "grad_norm": 2.234540081987162, + "learning_rate": 2.7865555418338238e-11, + "loss": 1.1902, + "step": 7359 + }, + { + "epoch": 0.9977631668135295, + "grad_norm": 1.8174943252200455, + "learning_rate": 2.4683688477966647e-11, + "loss": 1.1498, + "step": 7360 + }, + { + "epoch": 0.9978987324611943, + "grad_norm": 2.0443003966003768, + "learning_rate": 2.1694658884130468e-11, + "loss": 1.1655, + "step": 7361 + }, + { + "epoch": 0.9980342981088592, + "grad_norm": 1.603413057242596, + "learning_rate": 1.8898467213146473e-11, + "loss": 1.1536, + "step": 7362 + }, + { + "epoch": 0.9981698637565241, + "grad_norm": 2.642116028144747, + "learning_rate": 1.6295114004138965e-11, + "loss": 1.1717, + "step": 7363 + }, + { + "epoch": 0.998305429404189, + "grad_norm": 2.057298997204857, + "learning_rate": 1.3884599759261818e-11, + "loss": 1.1629, + "step": 7364 + }, + { + "epoch": 0.9984409950518539, + "grad_norm": 1.642314821711257, + "learning_rate": 1.1666924943254386e-11, + "loss": 1.1393, + "step": 7365 + }, + { + "epoch": 0.9985765606995187, + "grad_norm": 2.635002491447392, + "learning_rate": 9.642089983885604e-12, + "loss": 1.1493, + "step": 7366 + }, + { + "epoch": 0.9987121263471836, + "grad_norm": 2.2070672294804945, + "learning_rate": 7.810095271620908e-12, + "loss": 1.1404, + "step": 7367 + }, + { + "epoch": 0.9988476919948485, + "grad_norm": 1.9558195291189124, + "learning_rate": 6.170941159733267e-12, + "loss": 1.1353, + "step": 7368 + }, + { + "epoch": 0.9989832576425134, + "grad_norm": 2.1382536372668706, + "learning_rate": 4.724627964303174e-12, + "loss": 1.1286, + "step": 7369 + }, + { + "epoch": 0.9991188232901783, + "grad_norm": 1.5819700923347995, + "learning_rate": 3.4711559642186527e-12, + "loss": 1.1216, + "step": 7370 + }, + { + "epoch": 0.9992543889378431, + "grad_norm": 
1.9338220648810571, + "learning_rate": 2.4105254012862784e-12, + "loss": 1.1138, + "step": 7371 + }, + { + "epoch": 0.9993899545855081, + "grad_norm": 1.6863376048661736, + "learning_rate": 1.5427364800091325e-12, + "loss": 1.1568, + "step": 7372 + }, + { + "epoch": 0.9995255202331729, + "grad_norm": 1.862167391360977, + "learning_rate": 8.67789367586802e-13, + "loss": 1.1171, + "step": 7373 + }, + { + "epoch": 0.9996610858808378, + "grad_norm": 2.3849054097089897, + "learning_rate": 3.856841943594702e-13, + "loss": 1.156, + "step": 7374 + }, + { + "epoch": 0.9997966515285027, + "grad_norm": 2.225893687063196, + "learning_rate": 9.642105325280425e-14, + "loss": 1.1113, + "step": 7375 + }, + { + "epoch": 0.9999322171761675, + "grad_norm": 1.7501244366708446, + "learning_rate": 0.0, + "loss": 1.1698, + "step": 7376 + }, + { + "epoch": 0.9999322171761675, + "step": 7376, + "total_flos": 4.331831325234299e+17, + "train_loss": 1.1986228312303084, + "train_runtime": 68441.7673, + "train_samples_per_second": 10.347, + "train_steps_per_second": 0.108 + } + ], + "logging_steps": 1.0, + "max_steps": 7376, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.331831325234299e+17, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}