diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24733 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.99992916341999, + "eval_steps": 500, + "global_step": 3529, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 5.6118285147712355, + "learning_rate": 1.8867924528301887e-08, + "loss": 0.846, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 5.713571723217765, + "learning_rate": 3.7735849056603774e-08, + "loss": 0.813, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 6.189090904191137, + "learning_rate": 5.660377358490566e-08, + "loss": 0.8901, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 7.135987059921496, + "learning_rate": 7.547169811320755e-08, + "loss": 0.9084, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 6.9487575503764125, + "learning_rate": 9.433962264150943e-08, + "loss": 0.868, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 6.531405547529674, + "learning_rate": 1.1320754716981131e-07, + "loss": 0.8339, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 5.794428405177802, + "learning_rate": 1.320754716981132e-07, + "loss": 0.8867, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 6.383676757195207, + "learning_rate": 1.509433962264151e-07, + "loss": 0.8464, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 6.3067158349421915, + "learning_rate": 1.6981132075471695e-07, + "loss": 0.876, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 5.879065954925436, + "learning_rate": 1.8867924528301886e-07, + "loss": 0.8613, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 7.289965054557051, + "learning_rate": 2.0754716981132074e-07, + "loss": 0.8581, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 6.250628393462324, + "learning_rate": 2.2641509433962263e-07, + "loss": 0.8659, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 4.897669106714332, + "learning_rate": 2.452830188679245e-07, + "loss": 0.8234, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 6.899340907296569, + "learning_rate": 2.641509433962264e-07, + "loss": 0.9095, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 6.991040839353428, + "learning_rate": 2.830188679245283e-07, + "loss": 0.8476, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 5.18007982067284, + "learning_rate": 3.018867924528302e-07, + "loss": 0.7656, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 4.900634108194426, + "learning_rate": 3.2075471698113204e-07, + "loss": 0.797, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 5.70859024316582, + "learning_rate": 3.396226415094339e-07, + "loss": 0.8428, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 5.980094728579612, + "learning_rate": 3.584905660377358e-07, + "loss": 0.8573, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 5.852389477273713, + "learning_rate": 3.773584905660377e-07, + "loss": 0.7921, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 4.661373849976143, + "learning_rate": 3.9622641509433963e-07, + "loss": 0.718, + "step": 21 + }, + { + "epoch": 0.01, + "grad_norm": 6.23274723855891, + "learning_rate": 4.150943396226415e-07, + "loss": 0.8623, + "step": 22 + }, + { + "epoch": 0.01, + "grad_norm": 6.0704700228127635, + "learning_rate": 4.339622641509434e-07, + "loss": 0.8782, + "step": 23 + }, + { + "epoch": 0.01, + "grad_norm": 5.700893588615956, + "learning_rate": 4.5283018867924526e-07, + "loss": 0.8101, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 4.998135432510591, + "learning_rate": 4.7169811320754717e-07, + "loss": 0.7511, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 5.217486426924381, + "learning_rate": 4.90566037735849e-07, + "loss": 0.8268, + "step": 26 + }, + { + "epoch": 0.01, + "grad_norm": 5.0541443638950305, + "learning_rate": 5.094339622641509e-07, + "loss": 0.7494, + "step": 27 + }, + { + "epoch": 0.01, + "grad_norm": 5.205266954508169, + "learning_rate": 5.283018867924528e-07, + "loss": 0.8054, + "step": 28 + }, + { + "epoch": 0.01, + "grad_norm": 11.080559809465251, + "learning_rate": 5.471698113207546e-07, + "loss": 0.7088, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 5.230197181619572, + "learning_rate": 5.660377358490566e-07, + "loss": 0.7552, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 4.988538877448871, + "learning_rate": 5.849056603773585e-07, + "loss": 0.7455, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 4.321562944292788, + "learning_rate": 6.037735849056604e-07, + "loss": 0.7366, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 4.520956670013252, + "learning_rate": 6.226415094339622e-07, + "loss": 0.7305, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 4.677218814857416, + "learning_rate": 6.415094339622641e-07, + "loss": 0.6962, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 4.393544219908874, + "learning_rate": 6.60377358490566e-07, + "loss": 0.7401, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 4.64954547771705, + "learning_rate": 6.792452830188678e-07, + "loss": 0.706, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 4.274230930931085, + "learning_rate": 6.981132075471697e-07, + "loss": 0.7262, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 3.907454657653952, + "learning_rate": 7.169811320754716e-07, + "loss": 0.6775, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 4.169754686936093, + "learning_rate": 7.358490566037735e-07, + "loss": 0.6674, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 4.464499900362431, + "learning_rate": 7.547169811320754e-07, + "loss": 0.6806, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 4.097582890478711, + "learning_rate": 7.735849056603774e-07, + "loss": 0.7274, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 3.697964008533405, + "learning_rate": 7.924528301886793e-07, + "loss": 0.6486, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 3.7930403767612777, + "learning_rate": 8.113207547169812e-07, + "loss": 0.6157, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 3.6371272191211172, + "learning_rate": 8.30188679245283e-07, + "loss": 0.5929, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 3.801258780572625, + "learning_rate": 8.490566037735849e-07, + "loss": 0.5322, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 3.301139979984435, + "learning_rate": 8.679245283018868e-07, + "loss": 0.5699, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 3.498909620163726, + "learning_rate": 8.867924528301887e-07, + "loss": 0.5538, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 3.994048255712527, + "learning_rate": 9.056603773584905e-07, + "loss": 0.5087, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 3.5927147782012816, + "learning_rate": 9.245283018867924e-07, + "loss": 0.6009, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 4.500724906785562, + "learning_rate": 9.433962264150943e-07, + "loss": 0.5618, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 3.356984138931124, + "learning_rate": 9.622641509433961e-07, + "loss": 0.5287, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 3.3735668806204133, + "learning_rate": 9.81132075471698e-07, + "loss": 0.5447, + "step": 52 + }, + { + "epoch": 0.02, + "grad_norm": 3.3326495713070416, + "learning_rate": 1e-06, + "loss": 0.5317, + "step": 53 + }, + { + "epoch": 0.02, + "grad_norm": 3.489263896611546, + "learning_rate": 1.0188679245283019e-06, + "loss": 0.5549, + "step": 54 + }, + { + "epoch": 0.02, + "grad_norm": 3.1415448527151857, + "learning_rate": 1.0377358490566038e-06, + "loss": 0.5329, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 3.1401388087716526, + "learning_rate": 1.0566037735849057e-06, + "loss": 0.5074, + "step": 56 + }, + { + "epoch": 0.02, + "grad_norm": 3.2267416931558985, + "learning_rate": 1.0754716981132074e-06, + "loss": 0.5431, + "step": 57 + }, + { + "epoch": 0.02, + "grad_norm": 2.978898769392214, + "learning_rate": 1.0943396226415093e-06, + "loss": 0.5433, + "step": 58 + }, + { + "epoch": 0.02, + "grad_norm": 3.0011201623997574, + "learning_rate": 1.1132075471698112e-06, + "loss": 0.5155, + "step": 59 + }, + { + "epoch": 0.02, + "grad_norm": 3.0273145674095665, + "learning_rate": 1.1320754716981131e-06, + "loss": 0.4837, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 4.213734749879202, + "learning_rate": 1.150943396226415e-06, + "loss": 0.5608, + "step": 61 + }, + { + "epoch": 0.02, + "grad_norm": 5.958909244082287, + "learning_rate": 1.169811320754717e-06, + "loss": 0.4858, + "step": 62 + }, + { + "epoch": 0.02, + "grad_norm": 2.8152544386137444, + "learning_rate": 1.1886792452830188e-06, + "loss": 0.5165, + "step": 63 + }, + { + "epoch": 0.02, + "grad_norm": 3.6384403894331143, + "learning_rate": 1.2075471698113208e-06, + "loss": 0.5016, + "step": 64 + }, + { + "epoch": 0.02, + "grad_norm": 2.87730885494883, + "learning_rate": 1.2264150943396225e-06, + "loss": 0.4738, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 3.113820837217724, + "learning_rate": 1.2452830188679244e-06, + "loss": 0.4554, + "step": 66 + }, + { + "epoch": 0.02, + "grad_norm": 2.7669377573492886, + "learning_rate": 1.2641509433962263e-06, + "loss": 0.466, + "step": 67 + }, + { + "epoch": 0.02, + "grad_norm": 3.2823683218193014, + "learning_rate": 1.2830188679245282e-06, + "loss": 0.4613, + "step": 68 + }, + { + "epoch": 0.02, + "grad_norm": 4.673456593571675, + "learning_rate": 1.30188679245283e-06, + "loss": 0.4792, + "step": 69 + }, + { + "epoch": 0.02, + "grad_norm": 2.792640744843451, + "learning_rate": 1.320754716981132e-06, + "loss": 0.4595, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 3.8347185566760507, + "learning_rate": 1.339622641509434e-06, + "loss": 0.4462, + "step": 71 + }, + { + "epoch": 0.02, + "grad_norm": 3.089668904583872, + "learning_rate": 1.3584905660377356e-06, + "loss": 0.5026, + "step": 72 + }, + { + "epoch": 0.02, + "grad_norm": 3.2532898025993413, + "learning_rate": 1.3773584905660375e-06, + "loss": 0.4678, + "step": 73 + }, + { + "epoch": 0.02, + "grad_norm": 3.9484661367267835, + "learning_rate": 1.3962264150943394e-06, + "loss": 0.488, + "step": 74 + }, + { + "epoch": 0.02, + "grad_norm": 3.1818768846061185, + "learning_rate": 1.4150943396226413e-06, + "loss": 0.5166, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 3.1426225216317043, + "learning_rate": 1.4339622641509432e-06, + "loss": 0.4996, + "step": 76 + }, + { + "epoch": 0.02, + "grad_norm": 2.9238328814899393, + "learning_rate": 1.4528301886792452e-06, + "loss": 0.4289, + "step": 77 + }, + { + "epoch": 0.02, + "grad_norm": 2.8360366336829568, + "learning_rate": 1.471698113207547e-06, + "loss": 0.4543, + "step": 78 + }, + { + "epoch": 0.02, + "grad_norm": 3.6249196520915064, + "learning_rate": 1.490566037735849e-06, + "loss": 0.5045, + "step": 79 + }, + { + "epoch": 0.02, + "grad_norm": 5.299329223688592, + "learning_rate": 1.5094339622641509e-06, + "loss": 0.4843, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 2.787829361897964, + "learning_rate": 1.5283018867924528e-06, + "loss": 0.4455, + "step": 81 + }, + { + "epoch": 0.02, + "grad_norm": 2.7277354760339874, + "learning_rate": 1.5471698113207547e-06, + "loss": 0.4194, + "step": 82 + }, + { + "epoch": 0.02, + "grad_norm": 2.6600416644163922, + "learning_rate": 1.5660377358490566e-06, + "loss": 0.4048, + "step": 83 + }, + { + "epoch": 0.02, + "grad_norm": 3.1774992527477286, + "learning_rate": 1.5849056603773585e-06, + "loss": 0.4497, + "step": 84 + }, + { + "epoch": 0.02, + "grad_norm": 2.9439067645554107, + "learning_rate": 1.6037735849056604e-06, + "loss": 0.474, + "step": 85 + }, + { + "epoch": 0.02, + "grad_norm": 2.8642919262683084, + "learning_rate": 1.6226415094339623e-06, + "loss": 0.4249, + "step": 86 + }, + { + "epoch": 0.02, + "grad_norm": 2.774887911245674, + "learning_rate": 1.6415094339622643e-06, + "loss": 0.4294, + "step": 87 + }, + { + "epoch": 0.02, + "grad_norm": 2.9695757117046395, + "learning_rate": 1.660377358490566e-06, + "loss": 0.4764, + "step": 88 + }, + { + "epoch": 0.03, + "grad_norm": 2.945368803651741, + "learning_rate": 1.6792452830188679e-06, + "loss": 0.4517, + "step": 89 + }, + { + "epoch": 0.03, + "grad_norm": 2.857389843927476, + "learning_rate": 1.6981132075471698e-06, + "loss": 0.4651, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 3.0764711362725805, + "learning_rate": 1.7169811320754717e-06, + "loss": 0.4735, + "step": 91 + }, + { + "epoch": 0.03, + "grad_norm": 2.887559820554778, + "learning_rate": 1.7358490566037736e-06, + "loss": 0.408, + "step": 92 + }, + { + "epoch": 0.03, + "grad_norm": 3.7339438704382357, + "learning_rate": 1.7547169811320755e-06, + "loss": 0.4428, + "step": 93 + }, + { + "epoch": 0.03, + "grad_norm": 2.725029573154123, + "learning_rate": 1.7735849056603774e-06, + "loss": 0.4066, + "step": 94 + }, + { + "epoch": 0.03, + "grad_norm": 2.785369107959244, + "learning_rate": 1.7924528301886791e-06, + "loss": 0.4325, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 2.693163705078504, + "learning_rate": 1.811320754716981e-06, + "loss": 0.4303, + "step": 96 + }, + { + "epoch": 0.03, + "grad_norm": 2.815163383579835, + "learning_rate": 1.830188679245283e-06, + "loss": 0.409, + "step": 97 + }, + { + "epoch": 0.03, + "grad_norm": 3.1482120663466073, + "learning_rate": 1.8490566037735848e-06, + "loss": 0.4413, + "step": 98 + }, + { + "epoch": 0.03, + "grad_norm": 2.7451340037459535, + "learning_rate": 1.8679245283018868e-06, + "loss": 0.3909, + "step": 99 + }, + { + "epoch": 0.03, + "grad_norm": 2.668547433739357, + "learning_rate": 1.8867924528301887e-06, + "loss": 0.4245, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 3.4097066689240445, + "learning_rate": 1.9056603773584906e-06, + "loss": 0.4171, + "step": 101 + }, + { + "epoch": 0.03, + "grad_norm": 2.7500799535422535, + "learning_rate": 1.9245283018867923e-06, + "loss": 0.4316, + "step": 102 + }, + { + "epoch": 0.03, + "grad_norm": 2.772565736820654, + "learning_rate": 1.9433962264150944e-06, + "loss": 0.4308, + "step": 103 + }, + { + "epoch": 0.03, + "grad_norm": 2.9388045054687497, + "learning_rate": 1.962264150943396e-06, + "loss": 0.3946, + "step": 104 + }, + { + "epoch": 0.03, + "grad_norm": 2.6971415729142567, + "learning_rate": 1.981132075471698e-06, + "loss": 0.3971, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 2.565759147084175, + "learning_rate": 2e-06, + "loss": 0.4335, + "step": 106 + }, + { + "epoch": 0.03, + "grad_norm": 2.6533331797170074, + "learning_rate": 1.9999995788314622e-06, + "loss": 0.4027, + "step": 107 + }, + { + "epoch": 0.03, + "grad_norm": 2.6400141953211516, + "learning_rate": 1.9999983153262037e-06, + "loss": 0.3566, + "step": 108 + }, + { + "epoch": 0.03, + "grad_norm": 2.869629191400056, + "learning_rate": 1.9999962094852885e-06, + "loss": 0.4321, + "step": 109 + }, + { + "epoch": 0.03, + "grad_norm": 3.1890954493242303, + "learning_rate": 1.999993261310491e-06, + "loss": 0.4243, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 2.771919519749318, + "learning_rate": 1.999989470804294e-06, + "loss": 0.4053, + "step": 111 + }, + { + "epoch": 0.03, + "grad_norm": 3.3182106784918077, + "learning_rate": 1.9999848379698903e-06, + "loss": 0.4299, + "step": 112 + }, + { + "epoch": 0.03, + "grad_norm": 2.809056771173883, + "learning_rate": 1.999979362811183e-06, + "loss": 0.4323, + "step": 113 + }, + { + "epoch": 0.03, + "grad_norm": 2.8114945027835807, + "learning_rate": 1.9999730453327834e-06, + "loss": 0.4382, + "step": 114 + }, + { + "epoch": 0.03, + "grad_norm": 2.7106485289513396, + "learning_rate": 1.9999658855400133e-06, + "loss": 0.4195, + "step": 115 + }, + { + "epoch": 0.03, + "grad_norm": 3.0700900731543523, + "learning_rate": 1.9999578834389034e-06, + "loss": 0.4007, + "step": 116 + }, + { + "epoch": 0.03, + "grad_norm": 2.581255352359972, + "learning_rate": 1.9999490390361944e-06, + "loss": 0.4003, + "step": 117 + }, + { + "epoch": 0.03, + "grad_norm": 2.730583061252001, + "learning_rate": 1.9999393523393364e-06, + "loss": 0.3988, + "step": 118 + }, + { + "epoch": 0.03, + "grad_norm": 2.953836584220443, + "learning_rate": 1.999928823356488e-06, + "loss": 0.4003, + "step": 119 + }, + { + "epoch": 0.03, + "grad_norm": 2.540162372030467, + "learning_rate": 1.9999174520965193e-06, + "loss": 0.4193, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 2.8247216227110754, + "learning_rate": 1.9999052385690078e-06, + "loss": 0.4106, + "step": 121 + }, + { + "epoch": 0.03, + "grad_norm": 2.9800895294923313, + "learning_rate": 1.999892182784242e-06, + "loss": 0.425, + "step": 122 + }, + { + "epoch": 0.03, + "grad_norm": 2.9580274438434895, + "learning_rate": 1.9998782847532195e-06, + "loss": 0.4222, + "step": 123 + }, + { + "epoch": 0.04, + "grad_norm": 2.978550415936864, + "learning_rate": 1.9998635444876458e-06, + "loss": 0.4031, + "step": 124 + }, + { + "epoch": 0.04, + "grad_norm": 2.8082748015723213, + "learning_rate": 1.999847961999939e-06, + "loss": 0.4056, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 2.8595783375198818, + "learning_rate": 1.9998315373032237e-06, + "loss": 0.4224, + "step": 126 + }, + { + "epoch": 0.04, + "grad_norm": 2.6890260114894686, + "learning_rate": 1.9998142704113346e-06, + "loss": 0.3496, + "step": 127 + }, + { + "epoch": 0.04, + "grad_norm": 2.7450486041972693, + "learning_rate": 1.9997961613388173e-06, + "loss": 0.3901, + "step": 128 + }, + { + "epoch": 0.04, + "grad_norm": 3.109712683225939, + "learning_rate": 1.9997772101009253e-06, + "loss": 0.4559, + "step": 129 + }, + { + "epoch": 0.04, + "grad_norm": 2.8203576670652284, + "learning_rate": 1.9997574167136223e-06, + "loss": 0.3763, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 2.5062427163533143, + "learning_rate": 1.9997367811935805e-06, + "loss": 0.3815, + "step": 131 + }, + { + "epoch": 0.04, + "grad_norm": 2.7117271095711204, + "learning_rate": 1.999715303558182e-06, + "loss": 0.3941, + "step": 132 + }, + { + "epoch": 0.04, + "grad_norm": 2.55987610733269, + "learning_rate": 1.999692983825518e-06, + "loss": 0.3927, + "step": 133 + }, + { + "epoch": 0.04, + "grad_norm": 3.3555921974167564, + "learning_rate": 1.99966982201439e-06, + "loss": 0.3758, + "step": 134 + }, + { + "epoch": 0.04, + "grad_norm": 2.808757885402719, + "learning_rate": 1.999645818144307e-06, + "loss": 0.4095, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 2.68635260987047, + "learning_rate": 1.9996209722354897e-06, + "loss": 0.3608, + "step": 136 + }, + { + "epoch": 0.04, + "grad_norm": 2.9268085169131046, + "learning_rate": 1.9995952843088656e-06, + "loss": 0.4159, + "step": 137 + }, + { + "epoch": 0.04, + "grad_norm": 2.9484082206625333, + "learning_rate": 1.9995687543860728e-06, + "loss": 0.4132, + "step": 138 + }, + { + "epoch": 0.04, + "grad_norm": 2.646373861612375, + "learning_rate": 1.999541382489459e-06, + "loss": 0.3747, + "step": 139 + }, + { + "epoch": 0.04, + "grad_norm": 2.646592594686166, + "learning_rate": 1.9995131686420798e-06, + "loss": 0.3996, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 2.7303459670357704, + "learning_rate": 1.9994841128677014e-06, + "loss": 0.3988, + "step": 141 + }, + { + "epoch": 0.04, + "grad_norm": 2.799829009143507, + "learning_rate": 1.9994542151907985e-06, + "loss": 0.4117, + "step": 142 + }, + { + "epoch": 0.04, + "grad_norm": 2.6018316616627173, + "learning_rate": 1.9994234756365546e-06, + "loss": 0.3918, + "step": 143 + }, + { + "epoch": 0.04, + "grad_norm": 2.8528313633077835, + "learning_rate": 1.9993918942308625e-06, + "loss": 0.3901, + "step": 144 + }, + { + "epoch": 0.04, + "grad_norm": 3.206126131387853, + "learning_rate": 1.999359471000326e-06, + "loss": 0.3858, + "step": 145 + }, + { + "epoch": 0.04, + "grad_norm": 2.6182854163185243, + "learning_rate": 1.9993262059722546e-06, + "loss": 0.4106, + "step": 146 + }, + { + "epoch": 0.04, + "grad_norm": 3.002116351588365, + "learning_rate": 1.9992920991746695e-06, + "loss": 0.4153, + "step": 147 + }, + { + "epoch": 0.04, + "grad_norm": 2.744545644279597, + "learning_rate": 1.9992571506362995e-06, + "loss": 0.3962, + "step": 148 + }, + { + "epoch": 0.04, + "grad_norm": 2.9977459472060466, + "learning_rate": 1.999221360386584e-06, + "loss": 0.3931, + "step": 149 + }, + { + "epoch": 0.04, + "grad_norm": 2.6424581529573303, + "learning_rate": 1.99918472845567e-06, + "loss": 0.3706, + "step": 150 + }, + { + "epoch": 0.04, + "grad_norm": 2.7069491277467383, + "learning_rate": 1.999147254874414e-06, + "loss": 0.4147, + "step": 151 + }, + { + "epoch": 0.04, + "grad_norm": 2.6035821918848363, + "learning_rate": 1.9991089396743805e-06, + "loss": 0.4249, + "step": 152 + }, + { + "epoch": 0.04, + "grad_norm": 2.8388566955132943, + "learning_rate": 1.999069782887845e-06, + "loss": 0.3695, + "step": 153 + }, + { + "epoch": 0.04, + "grad_norm": 2.7846849561486064, + "learning_rate": 1.999029784547791e-06, + "loss": 0.4046, + "step": 154 + }, + { + "epoch": 0.04, + "grad_norm": 2.5672347312163266, + "learning_rate": 1.998988944687909e-06, + "loss": 0.3784, + "step": 155 + }, + { + "epoch": 0.04, + "grad_norm": 2.5262276088682456, + "learning_rate": 1.998947263342601e-06, + "loss": 0.3624, + "step": 156 + }, + { + "epoch": 0.04, + "grad_norm": 2.760289710063222, + "learning_rate": 1.998904740546977e-06, + "loss": 0.3976, + "step": 157 + }, + { + "epoch": 0.04, + "grad_norm": 2.6197959186468402, + "learning_rate": 1.9988613763368545e-06, + "loss": 0.3852, + "step": 158 + }, + { + "epoch": 0.05, + "grad_norm": 2.9089432590991646, + "learning_rate": 1.998817170748762e-06, + "loss": 0.3799, + "step": 159 + }, + { + "epoch": 0.05, + "grad_norm": 2.506679497101866, + "learning_rate": 1.9987721238199343e-06, + "loss": 0.3535, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 2.825720790926751, + "learning_rate": 1.9987262355883173e-06, + "loss": 0.3744, + "step": 161 + }, + { + "epoch": 0.05, + "grad_norm": 2.906788715253664, + "learning_rate": 1.9986795060925633e-06, + "loss": 0.4155, + "step": 162 + }, + { + "epoch": 0.05, + "grad_norm": 3.099980135318557, + "learning_rate": 1.998631935372035e-06, + "loss": 0.4039, + "step": 163 + }, + { + "epoch": 0.05, + "grad_norm": 2.473077408954635, + "learning_rate": 1.9985835234668023e-06, + "loss": 0.3994, + "step": 164 + }, + { + "epoch": 0.05, + "grad_norm": 2.7379960293103736, + "learning_rate": 1.998534270417645e-06, + "loss": 0.4067, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 2.934946021108072, + "learning_rate": 1.9984841762660503e-06, + "loss": 0.3915, + "step": 166 + }, + { + "epoch": 0.05, + "grad_norm": 2.595844563943777, + "learning_rate": 1.998433241054215e-06, + "loss": 0.3917, + "step": 167 + }, + { + "epoch": 0.05, + "grad_norm": 2.7121653357980597, + "learning_rate": 1.998381464825043e-06, + "loss": 0.3971, + "step": 168 + }, + { + "epoch": 0.05, + "grad_norm": 2.584381509862202, + "learning_rate": 1.998328847622148e-06, + "loss": 0.3716, + "step": 169 + }, + { + "epoch": 0.05, + "grad_norm": 2.6766288563674414, + "learning_rate": 1.9982753894898506e-06, + "loss": 0.3798, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 2.5571445545817957, + "learning_rate": 1.9982210904731812e-06, + "loss": 0.3643, + "step": 171 + }, + { + "epoch": 0.05, + "grad_norm": 2.5407475772937818, + "learning_rate": 1.9981659506178776e-06, + "loss": 0.3839, + "step": 172 + }, + { + "epoch": 0.05, + "grad_norm": 2.5544944850006597, + "learning_rate": 1.9981099699703864e-06, + "loss": 0.368, + "step": 173 + }, + { + "epoch": 0.05, + "grad_norm": 2.531812353121465, + "learning_rate": 1.998053148577862e-06, + "loss": 0.3964, + "step": 174 + }, + { + "epoch": 0.05, + "grad_norm": 3.046992089059834, + "learning_rate": 1.997995486488167e-06, + "loss": 0.3844, + "step": 175 + }, + { + "epoch": 0.05, + "grad_norm": 2.822735955792875, + "learning_rate": 1.9979369837498727e-06, + "loss": 0.3991, + "step": 176 + }, + { + "epoch": 0.05, + "grad_norm": 2.9314069703471657, + "learning_rate": 1.997877640412258e-06, + "loss": 0.401, + "step": 177 + }, + { + "epoch": 0.05, + "grad_norm": 2.504699351071927, + "learning_rate": 1.9978174565253095e-06, + "loss": 0.3805, + "step": 178 + }, + { + "epoch": 0.05, + "grad_norm": 2.566177801931028, + "learning_rate": 1.9977564321397233e-06, + "loss": 0.437, + "step": 179 + }, + { + "epoch": 0.05, + "grad_norm": 2.6868768656926396, + "learning_rate": 1.9976945673069015e-06, + "loss": 0.3873, + "step": 180 + }, + { + "epoch": 0.05, + "grad_norm": 2.456556033093542, + "learning_rate": 1.9976318620789557e-06, + "loss": 0.3185, + "step": 181 + }, + { + "epoch": 0.05, + "grad_norm": 2.590060188752798, + "learning_rate": 1.9975683165087047e-06, + "loss": 0.3522, + "step": 182 + }, + { + "epoch": 0.05, + "grad_norm": 2.840213817668187, + "learning_rate": 1.9975039306496755e-06, + "loss": 0.3667, + "step": 183 + }, + { + "epoch": 0.05, + "grad_norm": 2.8933404886544967, + "learning_rate": 1.997438704556102e-06, + "loss": 0.37, + "step": 184 + }, + { + "epoch": 0.05, + "grad_norm": 2.7526331247943334, + "learning_rate": 1.997372638282928e-06, + "loss": 0.3968, + "step": 185 + }, + { + "epoch": 0.05, + "grad_norm": 2.6258818690706627, + "learning_rate": 1.9973057318858017e-06, + "loss": 0.3279, + "step": 186 + }, + { + "epoch": 0.05, + "grad_norm": 2.7649018240097503, + "learning_rate": 1.9972379854210823e-06, + "loss": 0.3689, + "step": 187 + }, + { + "epoch": 0.05, + "grad_norm": 2.7999331736072066, + "learning_rate": 1.9971693989458345e-06, + "loss": 0.3441, + "step": 188 + }, + { + "epoch": 0.05, + "grad_norm": 2.7258205465442513, + "learning_rate": 1.997099972517831e-06, + "loss": 0.3396, + "step": 189 + }, + { + "epoch": 0.05, + "grad_norm": 2.536508380657664, + "learning_rate": 1.997029706195553e-06, + "loss": 0.3346, + "step": 190 + }, + { + "epoch": 0.05, + "grad_norm": 2.958855867705031, + "learning_rate": 1.9969586000381883e-06, + "loss": 0.3903, + "step": 191 + }, + { + "epoch": 0.05, + "grad_norm": 2.7156228414424066, + "learning_rate": 1.9968866541056313e-06, + "loss": 0.3706, + "step": 192 + }, + { + "epoch": 0.05, + "grad_norm": 2.665030631253663, + "learning_rate": 1.996813868458486e-06, + "loss": 0.3883, + "step": 193 + }, + { + "epoch": 0.05, + "grad_norm": 2.6313799332939225, + "learning_rate": 1.9967402431580617e-06, + "loss": 0.3527, + "step": 194 + }, + { + "epoch": 0.06, + "grad_norm": 2.7364262838936404, + "learning_rate": 1.996665778266376e-06, + "loss": 0.3676, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 2.810935136720703, + "learning_rate": 1.996590473846153e-06, + "loss": 0.3341, + "step": 196 + }, + { + "epoch": 0.06, + "grad_norm": 2.9192563229960538, + "learning_rate": 1.996514329960825e-06, + "loss": 0.3536, + "step": 197 + }, + { + "epoch": 0.06, + "grad_norm": 2.8207777067651287, + "learning_rate": 1.9964373466745307e-06, + "loss": 0.3828, + "step": 198 + }, + { + "epoch": 0.06, + "grad_norm": 3.1120567282258955, + "learning_rate": 1.9963595240521156e-06, + "loss": 0.386, + "step": 199 + }, + { + "epoch": 0.06, + "grad_norm": 2.6529947961693527, + "learning_rate": 1.996280862159133e-06, + "loss": 0.3888, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 3.0993864773394, + "learning_rate": 1.996201361061842e-06, + "loss": 0.4037, + "step": 201 + }, + { + "epoch": 0.06, + "grad_norm": 2.5729470692760734, + "learning_rate": 1.9961210208272105e-06, + "loss": 0.3721, + "step": 202 + }, + { + "epoch": 0.06, + "grad_norm": 2.8037555614398397, + "learning_rate": 1.996039841522911e-06, + "loss": 0.3861, + "step": 203 + }, + { + "epoch": 0.06, + "grad_norm": 2.502805445419668, + "learning_rate": 1.9959578232173245e-06, + "loss": 0.3421, + "step": 204 + }, + { + "epoch": 0.06, + "grad_norm": 2.6892317392586462, + "learning_rate": 1.995874965979538e-06, + "loss": 0.3703, + "step": 205 + }, + { + "epoch": 0.06, + "grad_norm": 2.649856195234877, + "learning_rate": 1.9957912698793447e-06, + "loss": 0.3965, + "step": 206 + }, + { + "epoch": 0.06, + "grad_norm": 2.6385637045784005, + "learning_rate": 1.9957067349872456e-06, + "loss": 0.355, + "step": 207 + }, + { + "epoch": 0.06, + "grad_norm": 2.6041098458460032, + "learning_rate": 1.995621361374447e-06, + "loss": 0.4005, + "step": 208 + }, + { + "epoch": 0.06, + "grad_norm": 2.791585851603161, + "learning_rate": 1.995535149112862e-06, + "loss": 0.3828, + "step": 209 + }, + { + "epoch": 0.06, + "grad_norm": 2.56933881840692, + "learning_rate": 1.995448098275112e-06, + "loss": 0.3715, + "step": 210 + }, + { + "epoch": 0.06, + "grad_norm": 2.7409053496946076, + "learning_rate": 1.9953602089345213e-06, + "loss": 0.385, + "step": 211 + }, + { + "epoch": 0.06, + "grad_norm": 2.916462909652704, + "learning_rate": 1.995271481165123e-06, + "loss": 0.3491, + "step": 212 + }, + { + "epoch": 0.06, + "grad_norm": 2.509746621164135, + "learning_rate": 1.9951819150416564e-06, + "loss": 0.3333, + "step": 213 + }, + { + "epoch": 0.06, + "grad_norm": 2.491097202916147, + "learning_rate": 1.9950915106395654e-06, + "loss": 0.3349, + "step": 214 + }, + { + "epoch": 0.06, + "grad_norm": 2.842582458464046, + "learning_rate": 1.9950002680350016e-06, + "loss": 0.3447, + "step": 215 + }, + { + "epoch": 0.06, + "grad_norm": 2.6721882016425553, + "learning_rate": 1.994908187304822e-06, + "loss": 0.3676, + "step": 216 + }, + { + "epoch": 0.06, + "grad_norm": 2.5656168078227406, + "learning_rate": 1.9948152685265892e-06, + "loss": 0.3781, + "step": 217 + }, + { + "epoch": 0.06, + "grad_norm": 2.625977722419885, + "learning_rate": 1.9947215117785727e-06, + "loss": 0.3553, + "step": 218 + }, + { + "epoch": 0.06, + "grad_norm": 2.845684023469767, + "learning_rate": 1.9946269171397465e-06, + "loss": 0.3676, + "step": 219 + }, + { + "epoch": 0.06, + "grad_norm": 3.006378204445308, + "learning_rate": 1.994531484689792e-06, + "loss": 0.3609, + "step": 220 + }, + { + "epoch": 0.06, + "grad_norm": 2.734959302329953, + "learning_rate": 1.994435214509095e-06, + "loss": 0.3979, + "step": 221 + }, + { + "epoch": 0.06, + "grad_norm": 2.8769028978485194, + "learning_rate": 1.994338106678748e-06, + "loss": 0.3968, + "step": 222 + }, + { + "epoch": 0.06, + "grad_norm": 2.7379521677523693, + "learning_rate": 1.9942401612805477e-06, + "loss": 0.433, + "step": 223 + }, + { + "epoch": 0.06, + "grad_norm": 2.6234046844572374, + "learning_rate": 1.9941413783969976e-06, + "loss": 0.3595, + "step": 224 + }, + { + "epoch": 0.06, + "grad_norm": 2.554494207301178, + "learning_rate": 1.994041758111306e-06, + "loss": 0.382, + "step": 225 + }, + { + "epoch": 0.06, + "grad_norm": 2.579323406962154, + "learning_rate": 1.993941300507387e-06, + "loss": 0.3521, + "step": 226 + }, + { + "epoch": 0.06, + "grad_norm": 2.7795940548748352, + "learning_rate": 1.9938400056698595e-06, + "loss": 0.3617, + "step": 227 + }, + { + "epoch": 0.06, + "grad_norm": 3.0176541561507246, + "learning_rate": 1.9937378736840483e-06, + "loss": 0.4024, + "step": 228 + }, + { + "epoch": 0.06, + "grad_norm": 2.7077194486209333, + "learning_rate": 1.993634904635983e-06, + "loss": 0.3594, + "step": 229 + }, + { + "epoch": 0.07, + "grad_norm": 2.4884581742716905, + "learning_rate": 1.9935310986123976e-06, + "loss": 0.3308, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 2.4268425735478183, + "learning_rate": 1.993426455700732e-06, + "loss": 0.3419, + "step": 231 + }, + { + "epoch": 0.07, + "grad_norm": 2.665186874653042, + "learning_rate": 1.993320975989131e-06, + "loss": 0.3426, + "step": 232 + }, + { + "epoch": 0.07, + "grad_norm": 2.552819929036354, + "learning_rate": 1.9932146595664446e-06, + "loss": 0.387, + "step": 233 + }, + { + "epoch": 0.07, + "grad_norm": 3.4824302393777335, + "learning_rate": 1.993107506522226e-06, + "loss": 0.3213, + "step": 234 + }, + { + "epoch": 0.07, + "grad_norm": 3.1076257763007455, + "learning_rate": 1.9929995169467344e-06, + "loss": 0.3862, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 2.7868572688950537, + "learning_rate": 1.992890690930934e-06, + "loss": 0.3902, + "step": 236 + }, + { + "epoch": 0.07, + "grad_norm": 2.527196091690139, + "learning_rate": 1.9927810285664927e-06, + "loss": 0.3258, + "step": 237 + }, + { + "epoch": 0.07, + "grad_norm": 2.5385263313762674, + "learning_rate": 1.9926705299457827e-06, + "loss": 0.3925, + "step": 238 + }, + { + "epoch": 0.07, + "grad_norm": 2.6446574691522353, + "learning_rate": 1.992559195161882e-06, + "loss": 0.37, + "step": 239 + }, + { + "epoch": 0.07, + "grad_norm": 2.4843804067774076, + "learning_rate": 1.9924470243085712e-06, + "loss": 0.3525, + "step": 240 + }, + { + "epoch": 0.07, + "grad_norm": 2.463079722299446, + "learning_rate": 1.9923340174803367e-06, + "loss": 0.328, + "step": 241 + }, + { + "epoch": 0.07, + "grad_norm": 2.5139165394391894, + "learning_rate": 1.9922201747723677e-06, + "loss": 0.3499, + "step": 242 + }, + { + "epoch": 0.07, + "grad_norm": 2.6807859647502177, + "learning_rate": 1.9921054962805585e-06, + "loss": 0.3419, + "step": 243 + }, + { + "epoch": 0.07, + "grad_norm": 2.9749725307944406, + "learning_rate": 1.9919899821015063e-06, + "loss": 0.3248, + "step": 244 + }, + { + "epoch": 0.07, + "grad_norm": 2.521006728376108, + "learning_rate": 1.9918736323325142e-06, + "loss": 0.3565, + "step": 245 + }, + { + "epoch": 0.07, + "grad_norm": 2.690164522320936, + "learning_rate": 1.9917564470715872e-06, + "loss": 0.3598, + "step": 246 + }, + { + "epoch": 0.07, + "grad_norm": 2.6434030217227464, + "learning_rate": 1.991638426417435e-06, + "loss": 0.3628, + "step": 247 + }, + { + "epoch": 0.07, + "grad_norm": 2.406254480523089, + "learning_rate": 1.991519570469471e-06, + "loss": 0.3412, + "step": 248 + }, + { + "epoch": 0.07, + "grad_norm": 2.8193314112019814, + "learning_rate": 1.9913998793278113e-06, + "loss": 0.367, + "step": 249 + }, + { + "epoch": 0.07, + "grad_norm": 2.4988815520050247, + "learning_rate": 1.9912793530932764e-06, + "loss": 0.3622, + "step": 250 + }, + { + "epoch": 0.07, + "grad_norm": 2.461527020103545, + "learning_rate": 1.9911579918673904e-06, + "loss": 0.3299, + "step": 251 + }, + { + "epoch": 0.07, + "grad_norm": 3.773311619061041, + "learning_rate": 1.99103579575238e-06, + "loss": 0.3449, + "step": 252 + }, + { + "epoch": 0.07, + "grad_norm": 2.5782349089610825, + "learning_rate": 1.9909127648511754e-06, + "loss": 0.3594, + "step": 253 + }, + { + "epoch": 0.07, + "grad_norm": 2.4446994500795194, + "learning_rate": 1.990788899267411e-06, + "loss": 0.344, + "step": 254 + }, + { + "epoch": 0.07, + "grad_norm": 2.6855473726715022, + "learning_rate": 1.9906641991054222e-06, + "loss": 0.4129, + "step": 255 + }, + { + "epoch": 0.07, + "grad_norm": 2.5485938248498092, + "learning_rate": 1.9905386644702493e-06, + "loss": 0.3543, + "step": 256 + }, + { + "epoch": 0.07, + "grad_norm": 2.5080000647003278, + "learning_rate": 1.9904122954676345e-06, + "loss": 0.3202, + "step": 257 + }, + { + "epoch": 0.07, + "grad_norm": 2.457262360426059, + "learning_rate": 1.9902850922040227e-06, + "loss": 0.3579, + "step": 258 + }, + { + "epoch": 0.07, + "grad_norm": 2.541677938349451, + "learning_rate": 1.9901570547865627e-06, + "loss": 0.3584, + "step": 259 + }, + { + "epoch": 0.07, + "grad_norm": 2.6969360626762677, + "learning_rate": 1.990028183323105e-06, + "loss": 0.3887, + "step": 260 + }, + { + "epoch": 0.07, + "grad_norm": 2.713143817573316, + "learning_rate": 1.9898984779222025e-06, + "loss": 0.3841, + "step": 261 + }, + { + "epoch": 0.07, + "grad_norm": 2.5477445113524815, + "learning_rate": 1.9897679386931113e-06, + "loss": 0.3435, + "step": 262 + }, + { + "epoch": 0.07, + "grad_norm": 3.2229779357171524, + "learning_rate": 1.9896365657457887e-06, + "loss": 0.3744, + "step": 263 + }, + { + "epoch": 0.07, + "grad_norm": 2.613400938132588, + "learning_rate": 1.989504359190896e-06, + "loss": 0.3374, + "step": 264 + }, + { + "epoch": 0.08, + "grad_norm": 2.80802734327446, + "learning_rate": 1.989371319139794e-06, + "loss": 0.3772, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 2.4470246183799156, + "learning_rate": 1.9892374457045494e-06, + "loss": 0.3553, + "step": 266 + }, + { + "epoch": 0.08, + "grad_norm": 2.6800638145741353, + "learning_rate": 1.9891027389979278e-06, + "loss": 0.3706, + "step": 267 + }, + { + "epoch": 0.08, + "grad_norm": 2.913022801860264, + "learning_rate": 1.9889671991333976e-06, + "loss": 0.3766, + "step": 268 + }, + { + "epoch": 0.08, + "grad_norm": 3.0319933246387856, + "learning_rate": 1.9888308262251284e-06, + "loss": 0.3663, + "step": 269 + }, + { + "epoch": 0.08, + "grad_norm": 2.4223388136877886, + "learning_rate": 1.9886936203879935e-06, + "loss": 0.3418, + "step": 270 + }, + { + "epoch": 0.08, + "grad_norm": 2.7509386532674815, + "learning_rate": 1.9885555817375654e-06, + "loss": 0.4096, + "step": 271 + }, + { + "epoch": 0.08, + "grad_norm": 2.639110162050167, + "learning_rate": 1.9884167103901194e-06, + "loss": 0.39, + "step": 272 + }, + { + "epoch": 0.08, + "grad_norm": 2.7803037778659814, + "learning_rate": 1.9882770064626328e-06, + "loss": 0.3795, + "step": 273 + }, + { + "epoch": 0.08, + "grad_norm": 2.5513683067438753, + "learning_rate": 1.988136470072782e-06, + "loss": 0.3469, + "step": 274 + }, + { + "epoch": 0.08, + "grad_norm": 2.535630029496729, + "learning_rate": 1.987995101338947e-06, + "loss": 0.3517, + "step": 275 + }, + { + "epoch": 0.08, + "grad_norm": 2.6139021814657037, + "learning_rate": 1.9878529003802084e-06, + "loss": 0.3976, + "step": 276 + }, + { + "epoch": 0.08, + "grad_norm": 2.483456591372453, + "learning_rate": 1.987709867316346e-06, + "loss": 0.3631, + "step": 277 + }, + { + "epoch": 0.08, + "grad_norm": 2.5809926151280314, + "learning_rate": 1.9875660022678424e-06, + "loss": 0.3685, + "step": 278 + }, + { + "epoch": 0.08, + "grad_norm": 2.545111593100663, + "learning_rate": 1.9874213053558804e-06, + "loss": 0.3427, + "step": 279 + }, + { + "epoch": 0.08, + "grad_norm": 2.617356456075554, + "learning_rate": 1.987275776702344e-06, + "loss": 0.3395, + "step": 280 + }, + { + "epoch": 0.08, + "grad_norm": 2.8211042738249743, + "learning_rate": 1.987129416429817e-06, + "loss": 0.3571, + "step": 281 + }, + { + "epoch": 0.08, + "grad_norm": 7.370289089811191, + "learning_rate": 1.986982224661584e-06, + "loss": 0.3474, + "step": 282 + }, + { + "epoch": 0.08, + "grad_norm": 2.5171030275465305, + "learning_rate": 1.986834201521631e-06, + "loss": 0.3306, + "step": 283 + }, + { + "epoch": 0.08, + "grad_norm": 2.5654163958906837, + "learning_rate": 1.9866853471346417e-06, + "loss": 0.3568, + "step": 284 + }, + { + "epoch": 0.08, + "grad_norm": 3.4580167416924765, + "learning_rate": 1.986535661626003e-06, + "loss": 0.368, + "step": 285 + }, + { + "epoch": 0.08, + "grad_norm": 2.6005433341670887, + "learning_rate": 1.9863851451218003e-06, + "loss": 0.3518, + "step": 286 + }, + { + "epoch": 0.08, + "grad_norm": 2.699225097426378, + "learning_rate": 1.986233797748819e-06, + "loss": 0.3784, + "step": 287 + }, + { + "epoch": 0.08, + "grad_norm": 2.732736459682785, + "learning_rate": 1.986081619634545e-06, + "loss": 0.3752, + "step": 288 + }, + { + "epoch": 0.08, + "grad_norm": 2.7938860793855747, + "learning_rate": 1.9859286109071623e-06, + "loss": 0.3502, + "step": 289 + }, + { + "epoch": 0.08, + "grad_norm": 2.5092963016547123, + "learning_rate": 1.985774771695558e-06, + "loss": 0.3709, + "step": 290 + }, + { + "epoch": 0.08, + "grad_norm": 2.8384058342986496, + "learning_rate": 1.9856201021293148e-06, + "loss": 0.3702, + "step": 291 + }, + { + "epoch": 0.08, + "grad_norm": 2.7336616375679426, + "learning_rate": 1.985464602338717e-06, + "loss": 0.363, + "step": 292 + }, + { + "epoch": 0.08, + "grad_norm": 2.5955531896252, + "learning_rate": 1.9853082724547476e-06, + "loss": 0.3036, + "step": 293 + }, + { + "epoch": 0.08, + "grad_norm": 2.571917020052252, + "learning_rate": 1.9851511126090906e-06, + "loss": 0.361, + "step": 294 + }, + { + "epoch": 0.08, + "grad_norm": 2.660892685172359, + "learning_rate": 1.9849931229341256e-06, + "loss": 0.325, + "step": 295 + }, + { + "epoch": 0.08, + "grad_norm": 2.3648358197445933, + "learning_rate": 1.9848343035629343e-06, + "loss": 0.3532, + "step": 296 + }, + { + "epoch": 0.08, + "grad_norm": 2.594871579879767, + "learning_rate": 1.9846746546292955e-06, + "loss": 0.3413, + "step": 297 + }, + { + "epoch": 0.08, + "grad_norm": 2.5793998255691766, + "learning_rate": 1.9845141762676882e-06, + "loss": 0.3335, + "step": 298 + }, + { + "epoch": 0.08, + "grad_norm": 2.591077271608269, + "learning_rate": 1.984352868613289e-06, + "loss": 0.3675, + "step": 299 + }, + { + "epoch": 0.09, + "grad_norm": 3.285884192859506, + "learning_rate": 1.9841907318019724e-06, + "loss": 0.3539, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 2.5448166312144047, + "learning_rate": 1.9840277659703137e-06, + "loss": 0.3542, + "step": 301 + }, + { + "epoch": 0.09, + "grad_norm": 2.470915634055657, + "learning_rate": 1.9838639712555838e-06, + "loss": 0.3646, + "step": 302 + }, + { + "epoch": 0.09, + "grad_norm": 2.5716228188887387, + "learning_rate": 1.9836993477957536e-06, + "loss": 0.3493, + "step": 303 + }, + { + "epoch": 0.09, + "grad_norm": 2.643931078864786, + "learning_rate": 1.983533895729492e-06, + "loss": 0.3195, + "step": 304 + }, + { + "epoch": 0.09, + "grad_norm": 2.911939697435264, + "learning_rate": 1.9833676151961647e-06, + "loss": 0.3654, + "step": 305 + }, + { + "epoch": 0.09, + "grad_norm": 2.3724724165880784, + "learning_rate": 1.9832005063358366e-06, + "loss": 0.3603, + "step": 306 + }, + { + "epoch": 0.09, + "grad_norm": 2.5039300276236034, + "learning_rate": 1.9830325692892687e-06, + "loss": 0.3552, + "step": 307 + }, + { + "epoch": 0.09, + "grad_norm": 2.5223232113937843, + "learning_rate": 1.9828638041979216e-06, + "loss": 0.3327, + "step": 308 + }, + { + "epoch": 0.09, + "grad_norm": 2.9855800223800464, + "learning_rate": 1.982694211203952e-06, + "loss": 0.3829, + "step": 309 + }, + { + "epoch": 0.09, + "grad_norm": 2.521775265390254, + "learning_rate": 1.9825237904502143e-06, + "loss": 0.3408, + "step": 310 + }, + { + "epoch": 0.09, + "grad_norm": 2.7867318408681685, + "learning_rate": 1.98235254208026e-06, + "loss": 0.3465, + "step": 311 + }, + { + "epoch": 0.09, + "grad_norm": 2.617705586883914, + "learning_rate": 1.9821804662383385e-06, + "loss": 0.3625, + "step": 312 + }, + { + "epoch": 0.09, + "grad_norm": 3.1720347849017676, + "learning_rate": 1.982007563069395e-06, + "loss": 0.3589, + "step": 313 + }, + { + "epoch": 0.09, + "grad_norm": 2.575250459612025, + "learning_rate": 1.9818338327190735e-06, + "loss": 0.341, + "step": 314 + }, + { + "epoch": 0.09, + "grad_norm": 2.878553661168574, + "learning_rate": 1.981659275333712e-06, + "loss": 0.3398, + "step": 315 + }, + { + "epoch": 0.09, + "grad_norm": 2.552552883120015, + "learning_rate": 1.981483891060348e-06, + "loss": 0.3343, + "step": 316 + }, + { + "epoch": 0.09, + "grad_norm": 2.5137157940173256, + "learning_rate": 1.981307680046713e-06, + "loss": 0.3576, + "step": 317 + }, + { + "epoch": 0.09, + "grad_norm": 2.403478162113704, + "learning_rate": 1.9811306424412368e-06, + "loss": 0.3313, + "step": 318 + }, + { + "epoch": 0.09, + "grad_norm": 2.5219076719388847, + "learning_rate": 1.9809527783930442e-06, + "loss": 0.3498, + "step": 319 + }, + { + "epoch": 0.09, + "grad_norm": 2.46937865375049, + "learning_rate": 1.980774088051957e-06, + "loss": 0.3305, + "step": 320 + }, + { + "epoch": 0.09, + "grad_norm": 2.4507535819330233, + "learning_rate": 1.980594571568493e-06, + "loss": 0.3154, + "step": 321 + }, + { + "epoch": 0.09, + "grad_norm": 2.5843540111215284, + "learning_rate": 1.980414229093865e-06, + "loss": 0.3583, + "step": 322 + }, + { + "epoch": 0.09, + "grad_norm": 2.442105603481713, + "learning_rate": 1.980233060779983e-06, + "loss": 0.3347, + "step": 323 + }, + { + "epoch": 0.09, + "grad_norm": 2.5856362077055297, + "learning_rate": 1.9800510667794508e-06, + "loss": 0.3265, + "step": 324 + }, + { + "epoch": 0.09, + "grad_norm": 2.780782385254357, + "learning_rate": 1.979868247245569e-06, + "loss": 0.3502, + "step": 325 + }, + { + "epoch": 0.09, + "grad_norm": 2.5778120808206366, + "learning_rate": 1.9796846023323335e-06, + "loss": 0.3259, + "step": 326 + }, + { + "epoch": 0.09, + "grad_norm": 3.454138511692687, + "learning_rate": 1.979500132194435e-06, + "loss": 0.3468, + "step": 327 + }, + { + "epoch": 0.09, + "grad_norm": 2.4070384710440167, + "learning_rate": 1.97931483698726e-06, + "loss": 0.3279, + "step": 328 + }, + { + "epoch": 0.09, + "grad_norm": 2.834842983135936, + "learning_rate": 1.979128716866889e-06, + "loss": 0.3356, + "step": 329 + }, + { + "epoch": 0.09, + "grad_norm": 2.578244568214759, + "learning_rate": 1.978941771990098e-06, + "loss": 0.3452, + "step": 330 + }, + { + "epoch": 0.09, + "grad_norm": 2.628476335289947, + "learning_rate": 1.9787540025143576e-06, + "loss": 0.3495, + "step": 331 + }, + { + "epoch": 0.09, + "grad_norm": 2.5526394737282962, + "learning_rate": 1.9785654085978328e-06, + "loss": 0.3528, + "step": 332 + }, + { + "epoch": 0.09, + "grad_norm": 2.671559049988823, + "learning_rate": 1.978375990399384e-06, + "loss": 0.3705, + "step": 333 + }, + { + "epoch": 0.09, + "grad_norm": 2.5473894352720237, + "learning_rate": 1.9781857480785644e-06, + "loss": 0.3192, + "step": 334 + }, + { + "epoch": 0.09, + "grad_norm": 2.5989571364649473, + "learning_rate": 1.9779946817956223e-06, + "loss": 0.3616, + "step": 335 + }, + { + "epoch": 0.1, + "grad_norm": 2.795031591430215, + "learning_rate": 1.9778027917115005e-06, + "loss": 0.3821, + "step": 336 + }, + { + "epoch": 0.1, + "grad_norm": 2.4775590847102986, + "learning_rate": 1.9776100779878343e-06, + "loss": 0.3527, + "step": 337 + }, + { + "epoch": 0.1, + "grad_norm": 2.7144133030062814, + "learning_rate": 1.9774165407869535e-06, + "loss": 0.3803, + "step": 338 + }, + { + "epoch": 0.1, + "grad_norm": 2.7376378600974713, + "learning_rate": 1.977222180271883e-06, + "loss": 0.3447, + "step": 339 + }, + { + "epoch": 0.1, + "grad_norm": 2.5409133266582233, + "learning_rate": 1.9770269966063388e-06, + "loss": 0.3393, + "step": 340 + }, + { + "epoch": 0.1, + "grad_norm": 2.6836638084477937, + "learning_rate": 1.976830989954731e-06, + "loss": 0.3491, + "step": 341 + }, + { + "epoch": 0.1, + "grad_norm": 2.642805261995146, + "learning_rate": 1.9766341604821643e-06, + "loss": 0.3471, + "step": 342 + }, + { + "epoch": 0.1, + "grad_norm": 2.598394203262401, + "learning_rate": 1.976436508354435e-06, + "loss": 0.3603, + "step": 343 + }, + { + "epoch": 0.1, + "grad_norm": 2.382257852100494, + "learning_rate": 1.9762380337380325e-06, + "loss": 0.3502, + "step": 344 + }, + { + "epoch": 0.1, + "grad_norm": 2.632050658022369, + "learning_rate": 1.97603873680014e-06, + "loss": 0.3648, + "step": 345 + }, + { + "epoch": 0.1, + "grad_norm": 2.750819827854785, + "learning_rate": 1.975838617708632e-06, + "loss": 0.3334, + "step": 346 + }, + { + "epoch": 0.1, + "grad_norm": 3.410781673584302, + "learning_rate": 1.975637676632077e-06, + "loss": 0.3417, + "step": 347 + }, + { + "epoch": 0.1, + "grad_norm": 2.629343266531438, + "learning_rate": 1.975435913739734e-06, + "loss": 0.3896, + "step": 348 + }, + { + "epoch": 0.1, + "grad_norm": 2.472893659048711, + "learning_rate": 1.9752333292015564e-06, + "loss": 0.3457, + "step": 349 + }, + { + "epoch": 0.1, + "grad_norm": 2.6266932605282713, + "learning_rate": 1.9750299231881882e-06, + "loss": 0.369, + "step": 350 + }, + { + "epoch": 0.1, + "grad_norm": 2.655745774986425, + "learning_rate": 1.974825695870966e-06, + "loss": 0.3468, + "step": 351 + }, + { + "epoch": 0.1, + "grad_norm": 2.4865033854087613, + "learning_rate": 1.974620647421918e-06, + "loss": 0.3326, + "step": 352 + }, + { + "epoch": 0.1, + "grad_norm": 2.536995229766425, + "learning_rate": 1.9744147780137644e-06, + "loss": 0.3384, + "step": 353 + }, + { + "epoch": 0.1, + "grad_norm": 2.5297575083663606, + "learning_rate": 1.9742080878199155e-06, + "loss": 0.3284, + "step": 354 + }, + { + "epoch": 0.1, + "grad_norm": 2.3922716483502056, + "learning_rate": 1.9740005770144757e-06, + "loss": 0.3624, + "step": 355 + }, + { + "epoch": 0.1, + "grad_norm": 2.4959650916452087, + "learning_rate": 1.973792245772238e-06, + "loss": 0.3541, + "step": 356 + }, + { + "epoch": 0.1, + "grad_norm": 2.7223874523007403, + "learning_rate": 1.9735830942686877e-06, + "loss": 0.3142, + "step": 357 + }, + { + "epoch": 0.1, + "grad_norm": 2.636175207703715, + "learning_rate": 1.973373122680001e-06, + "loss": 0.3735, + "step": 358 + }, + { + "epoch": 0.1, + "grad_norm": 2.5918184199941066, + "learning_rate": 1.973162331183045e-06, + "loss": 0.3524, + "step": 359 + }, + { + "epoch": 0.1, + "grad_norm": 2.4828658779223347, + "learning_rate": 1.9729507199553766e-06, + "loss": 0.3092, + "step": 360 + }, + { + "epoch": 0.1, + "grad_norm": 2.4424967985660193, + "learning_rate": 1.9727382891752444e-06, + "loss": 0.3256, + "step": 361 + }, + { + "epoch": 0.1, + "grad_norm": 2.74882206494836, + "learning_rate": 1.972525039021586e-06, + "loss": 0.3465, + "step": 362 + }, + { + "epoch": 0.1, + "grad_norm": 2.5803729023474986, + "learning_rate": 1.9723109696740307e-06, + "loss": 0.3697, + "step": 363 + }, + { + "epoch": 0.1, + "grad_norm": 2.5314535016182607, + "learning_rate": 1.9720960813128963e-06, + "loss": 0.385, + "step": 364 + }, + { + "epoch": 0.1, + "grad_norm": 2.5334837256809553, + "learning_rate": 1.971880374119192e-06, + "loss": 0.3436, + "step": 365 + }, + { + "epoch": 0.1, + "grad_norm": 2.838981459900644, + "learning_rate": 1.971663848274615e-06, + "loss": 0.3547, + "step": 366 + }, + { + "epoch": 0.1, + "grad_norm": 2.785342579119847, + "learning_rate": 1.971446503961554e-06, + "loss": 0.3829, + "step": 367 + }, + { + "epoch": 0.1, + "grad_norm": 2.4462261571298303, + "learning_rate": 1.9712283413630863e-06, + "loss": 0.3461, + "step": 368 + }, + { + "epoch": 0.1, + "grad_norm": 2.5548288894996096, + "learning_rate": 1.9710093606629773e-06, + "loss": 0.3542, + "step": 369 + }, + { + "epoch": 0.1, + "grad_norm": 3.2724105962356234, + "learning_rate": 1.970789562045683e-06, + "loss": 0.3621, + "step": 370 + }, + { + "epoch": 0.11, + "grad_norm": 2.593756143692595, + "learning_rate": 1.970568945696348e-06, + "loss": 0.347, + "step": 371 + }, + { + "epoch": 0.11, + "grad_norm": 2.4998898782445944, + "learning_rate": 1.970347511800806e-06, + "loss": 0.3494, + "step": 372 + }, + { + "epoch": 0.11, + "grad_norm": 2.429214718940558, + "learning_rate": 1.9701252605455783e-06, + "loss": 0.3171, + "step": 373 + }, + { + "epoch": 0.11, + "grad_norm": 2.7538511396476717, + "learning_rate": 1.969902192117876e-06, + "loss": 0.3328, + "step": 374 + }, + { + "epoch": 0.11, + "grad_norm": 2.6048648567873838, + "learning_rate": 1.9696783067055978e-06, + "loss": 0.3158, + "step": 375 + }, + { + "epoch": 0.11, + "grad_norm": 2.5474822093572587, + "learning_rate": 1.96945360449733e-06, + "loss": 0.332, + "step": 376 + }, + { + "epoch": 0.11, + "grad_norm": 2.5836149604108907, + "learning_rate": 1.9692280856823484e-06, + "loss": 0.3482, + "step": 377 + }, + { + "epoch": 0.11, + "grad_norm": 3.0864417183648216, + "learning_rate": 1.9690017504506155e-06, + "loss": 0.3451, + "step": 378 + }, + { + "epoch": 0.11, + "grad_norm": 2.6700788713740815, + "learning_rate": 1.9687745989927824e-06, + "loss": 0.3512, + "step": 379 + }, + { + "epoch": 0.11, + "grad_norm": 2.763428118675348, + "learning_rate": 1.968546631500186e-06, + "loss": 0.3783, + "step": 380 + }, + { + "epoch": 0.11, + "grad_norm": 2.6666814788251756, + "learning_rate": 1.9683178481648527e-06, + "loss": 0.3135, + "step": 381 + }, + { + "epoch": 0.11, + "grad_norm": 2.410152816700471, + "learning_rate": 1.968088249179495e-06, + "loss": 0.3304, + "step": 382 + }, + { + "epoch": 0.11, + "grad_norm": 2.514956754492286, + "learning_rate": 1.967857834737513e-06, + "loss": 0.3086, + "step": 383 + }, + { + "epoch": 0.11, + "grad_norm": 2.403693065158659, + "learning_rate": 1.9676266050329922e-06, + "loss": 0.3082, + "step": 384 + }, + { + "epoch": 0.11, + "grad_norm": 2.5751977044730494, + "learning_rate": 1.967394560260707e-06, + "loss": 0.3369, + "step": 385 + }, + { + "epoch": 0.11, + "grad_norm": 2.5583174802608997, + "learning_rate": 1.967161700616117e-06, + "loss": 0.3181, + "step": 386 + }, + { + "epoch": 0.11, + "grad_norm": 2.6557837593729072, + "learning_rate": 1.966928026295369e-06, + "loss": 0.3771, + "step": 387 + }, + { + "epoch": 0.11, + "grad_norm": 3.5991375063276094, + "learning_rate": 1.9666935374952944e-06, + "loss": 0.3262, + "step": 388 + }, + { + "epoch": 0.11, + "grad_norm": 2.6405353323745278, + "learning_rate": 1.9664582344134127e-06, + "loss": 0.3636, + "step": 389 + }, + { + "epoch": 0.11, + "grad_norm": 2.4486516403012857, + "learning_rate": 1.9662221172479282e-06, + "loss": 0.3404, + "step": 390 + }, + { + "epoch": 0.11, + "grad_norm": 2.708565335228964, + "learning_rate": 1.965985186197731e-06, + "loss": 0.3737, + "step": 391 + }, + { + "epoch": 0.11, + "grad_norm": 2.6111093488966133, + "learning_rate": 1.965747441462397e-06, + "loss": 0.3577, + "step": 392 + }, + { + "epoch": 0.11, + "grad_norm": 2.4352883871602913, + "learning_rate": 1.965508883242188e-06, + "loss": 0.3583, + "step": 393 + }, + { + "epoch": 0.11, + "grad_norm": 2.6020531043744404, + "learning_rate": 1.965269511738049e-06, + "loss": 0.3169, + "step": 394 + }, + { + "epoch": 0.11, + "grad_norm": 2.841460231837411, + "learning_rate": 1.965029327151613e-06, + "loss": 0.3649, + "step": 395 + }, + { + "epoch": 0.11, + "grad_norm": 2.25286900315995, + "learning_rate": 1.9647883296851956e-06, + "loss": 0.3119, + "step": 396 + }, + { + "epoch": 0.11, + "grad_norm": 2.522094707467322, + "learning_rate": 1.964546519541798e-06, + "loss": 0.3345, + "step": 397 + }, + { + "epoch": 0.11, + "grad_norm": 2.443044826914561, + "learning_rate": 1.9643038969251062e-06, + "loss": 0.3387, + "step": 398 + }, + { + "epoch": 0.11, + "grad_norm": 2.4333924844267014, + "learning_rate": 1.9640604620394894e-06, + "loss": 0.3527, + "step": 399 + }, + { + "epoch": 0.11, + "grad_norm": 2.6188283476819763, + "learning_rate": 1.9638162150900025e-06, + "loss": 0.3755, + "step": 400 + }, + { + "epoch": 0.11, + "grad_norm": 3.1294437997674893, + "learning_rate": 1.963571156282384e-06, + "loss": 0.3146, + "step": 401 + }, + { + "epoch": 0.11, + "grad_norm": 2.462574569192702, + "learning_rate": 1.963325285823055e-06, + "loss": 0.3475, + "step": 402 + }, + { + "epoch": 0.11, + "grad_norm": 2.424115414008482, + "learning_rate": 1.9630786039191225e-06, + "loss": 0.3551, + "step": 403 + }, + { + "epoch": 0.11, + "grad_norm": 2.7183638186252166, + "learning_rate": 1.962831110778375e-06, + "loss": 0.3235, + "step": 404 + }, + { + "epoch": 0.11, + "grad_norm": 2.707298913677284, + "learning_rate": 1.9625828066092854e-06, + "loss": 0.3626, + "step": 405 + }, + { + "epoch": 0.12, + "grad_norm": 3.051683309395247, + "learning_rate": 1.9623336916210096e-06, + "loss": 0.3174, + "step": 406 + }, + { + "epoch": 0.12, + "grad_norm": 2.6084332006277027, + "learning_rate": 1.962083766023386e-06, + "loss": 0.3158, + "step": 407 + }, + { + "epoch": 0.12, + "grad_norm": 2.351302315368594, + "learning_rate": 1.961833030026937e-06, + "loss": 0.284, + "step": 408 + }, + { + "epoch": 0.12, + "grad_norm": 2.3788504441528806, + "learning_rate": 1.961581483842866e-06, + "loss": 0.3278, + "step": 409 + }, + { + "epoch": 0.12, + "grad_norm": 2.8946442484890107, + "learning_rate": 1.96132912768306e-06, + "loss": 0.3163, + "step": 410 + }, + { + "epoch": 0.12, + "grad_norm": 2.728285381971823, + "learning_rate": 1.961075961760088e-06, + "loss": 0.3418, + "step": 411 + }, + { + "epoch": 0.12, + "grad_norm": 2.640098159654451, + "learning_rate": 1.9608219862872008e-06, + "loss": 0.3264, + "step": 412 + }, + { + "epoch": 0.12, + "grad_norm": 2.5466803175986557, + "learning_rate": 1.960567201478332e-06, + "loss": 0.3122, + "step": 413 + }, + { + "epoch": 0.12, + "grad_norm": 2.5571107605288512, + "learning_rate": 1.9603116075480955e-06, + "loss": 0.3582, + "step": 414 + }, + { + "epoch": 0.12, + "grad_norm": 2.6335058487995013, + "learning_rate": 1.960055204711788e-06, + "loss": 0.3319, + "step": 415 + }, + { + "epoch": 0.12, + "grad_norm": 2.499711088348462, + "learning_rate": 1.959797993185387e-06, + "loss": 0.3553, + "step": 416 + }, + { + "epoch": 0.12, + "grad_norm": 2.3778807191941955, + "learning_rate": 1.959539973185551e-06, + "loss": 0.332, + "step": 417 + }, + { + "epoch": 0.12, + "grad_norm": 2.6344006456744054, + "learning_rate": 1.9592811449296206e-06, + "loss": 0.3572, + "step": 418 + }, + { + "epoch": 0.12, + "grad_norm": 2.4980884179301603, + "learning_rate": 1.9590215086356155e-06, + "loss": 0.2792, + "step": 419 + }, + { + "epoch": 0.12, + "grad_norm": 2.7675481866770877, + "learning_rate": 1.9587610645222377e-06, + "loss": 0.3385, + "step": 420 + }, + { + "epoch": 0.12, + "grad_norm": 2.4770423980229457, + "learning_rate": 1.9584998128088683e-06, + "loss": 0.341, + "step": 421 + }, + { + "epoch": 0.12, + "grad_norm": 2.6038105597999173, + "learning_rate": 1.9582377537155703e-06, + "loss": 0.3269, + "step": 422 + }, + { + "epoch": 0.12, + "grad_norm": 2.618927817603957, + "learning_rate": 1.9579748874630846e-06, + "loss": 0.332, + "step": 423 + }, + { + "epoch": 0.12, + "grad_norm": 2.5224312088303065, + "learning_rate": 1.9577112142728337e-06, + "loss": 0.3191, + "step": 424 + }, + { + "epoch": 0.12, + "grad_norm": 2.410236418801495, + "learning_rate": 1.95744673436692e-06, + "loss": 0.3188, + "step": 425 + }, + { + "epoch": 0.12, + "grad_norm": 2.5211092188943076, + "learning_rate": 1.9571814479681233e-06, + "loss": 0.3545, + "step": 426 + }, + { + "epoch": 0.12, + "grad_norm": 2.6366198905561125, + "learning_rate": 1.9569153552999053e-06, + "loss": 0.3413, + "step": 427 + }, + { + "epoch": 0.12, + "grad_norm": 2.3273141341492978, + "learning_rate": 1.9566484565864056e-06, + "loss": 0.3178, + "step": 428 + }, + { + "epoch": 0.12, + "grad_norm": 2.464296948249515, + "learning_rate": 1.9563807520524424e-06, + "loss": 0.3227, + "step": 429 + }, + { + "epoch": 0.12, + "grad_norm": 2.5770345250979836, + "learning_rate": 1.9561122419235133e-06, + "loss": 0.3275, + "step": 430 + }, + { + "epoch": 0.12, + "grad_norm": 2.806790500176456, + "learning_rate": 1.9558429264257946e-06, + "loss": 0.3654, + "step": 431 + }, + { + "epoch": 0.12, + "grad_norm": 2.6954362891186325, + "learning_rate": 1.955572805786141e-06, + "loss": 0.2983, + "step": 432 + }, + { + "epoch": 0.12, + "grad_norm": 2.5608755561769727, + "learning_rate": 1.955301880232084e-06, + "loss": 0.3729, + "step": 433 + }, + { + "epoch": 0.12, + "grad_norm": 2.578420783299757, + "learning_rate": 1.9550301499918353e-06, + "loss": 0.328, + "step": 434 + }, + { + "epoch": 0.12, + "grad_norm": 2.6160850303665457, + "learning_rate": 1.9547576152942825e-06, + "loss": 0.3917, + "step": 435 + }, + { + "epoch": 0.12, + "grad_norm": 2.582252428734102, + "learning_rate": 1.9544842763689928e-06, + "loss": 0.3299, + "step": 436 + }, + { + "epoch": 0.12, + "grad_norm": 2.634871783199364, + "learning_rate": 1.9542101334462086e-06, + "loss": 0.3125, + "step": 437 + }, + { + "epoch": 0.12, + "grad_norm": 2.6720251585718655, + "learning_rate": 1.9539351867568515e-06, + "loss": 0.3618, + "step": 438 + }, + { + "epoch": 0.12, + "grad_norm": 2.701084613749105, + "learning_rate": 1.953659436532519e-06, + "loss": 0.3332, + "step": 439 + }, + { + "epoch": 0.12, + "grad_norm": 2.586301761737129, + "learning_rate": 1.953382883005485e-06, + "loss": 0.3434, + "step": 440 + }, + { + "epoch": 0.12, + "grad_norm": 2.5520563354383428, + "learning_rate": 1.953105526408702e-06, + "loss": 0.3136, + "step": 441 + }, + { + "epoch": 0.13, + "grad_norm": 2.7054210509581953, + "learning_rate": 1.952827366975797e-06, + "loss": 0.3614, + "step": 442 + }, + { + "epoch": 0.13, + "grad_norm": 2.377738480595085, + "learning_rate": 1.9525484049410745e-06, + "loss": 0.3406, + "step": 443 + }, + { + "epoch": 0.13, + "grad_norm": 2.393036741968658, + "learning_rate": 1.952268640539514e-06, + "loss": 0.3094, + "step": 444 + }, + { + "epoch": 0.13, + "grad_norm": 2.8081828257349892, + "learning_rate": 1.951988074006772e-06, + "loss": 0.313, + "step": 445 + }, + { + "epoch": 0.13, + "grad_norm": 2.4443193318497127, + "learning_rate": 1.951706705579179e-06, + "loss": 0.3388, + "step": 446 + }, + { + "epoch": 0.13, + "grad_norm": 2.6083418173169153, + "learning_rate": 1.9514245354937434e-06, + "loss": 0.3416, + "step": 447 + }, + { + "epoch": 0.13, + "grad_norm": 2.482240802319246, + "learning_rate": 1.951141563988147e-06, + "loss": 0.3184, + "step": 448 + }, + { + "epoch": 0.13, + "grad_norm": 2.573942965254348, + "learning_rate": 1.9508577913007472e-06, + "loss": 0.3288, + "step": 449 + }, + { + "epoch": 0.13, + "grad_norm": 2.520122116076561, + "learning_rate": 1.9505732176705762e-06, + "loss": 0.3395, + "step": 450 + }, + { + "epoch": 0.13, + "grad_norm": 2.5535464035474194, + "learning_rate": 1.9502878433373404e-06, + "loss": 0.3257, + "step": 451 + }, + { + "epoch": 0.13, + "grad_norm": 2.5785359969521116, + "learning_rate": 1.9500016685414223e-06, + "loss": 0.3444, + "step": 452 + }, + { + "epoch": 0.13, + "grad_norm": 2.6932421940193616, + "learning_rate": 1.9497146935238767e-06, + "loss": 0.3103, + "step": 453 + }, + { + "epoch": 0.13, + "grad_norm": 2.5976063721804574, + "learning_rate": 1.949426918526434e-06, + "loss": 0.3376, + "step": 454 + }, + { + "epoch": 0.13, + "grad_norm": 2.768636398677477, + "learning_rate": 1.9491383437914964e-06, + "loss": 0.3263, + "step": 455 + }, + { + "epoch": 0.13, + "grad_norm": 2.4290109643427487, + "learning_rate": 1.9488489695621427e-06, + "loss": 0.3428, + "step": 456 + }, + { + "epoch": 0.13, + "grad_norm": 2.356160466466906, + "learning_rate": 1.9485587960821227e-06, + "loss": 0.2786, + "step": 457 + }, + { + "epoch": 0.13, + "grad_norm": 2.410697728515682, + "learning_rate": 1.9482678235958605e-06, + "loss": 0.3311, + "step": 458 + }, + { + "epoch": 0.13, + "grad_norm": 2.7601789133630934, + "learning_rate": 1.9479760523484526e-06, + "loss": 0.3668, + "step": 459 + }, + { + "epoch": 0.13, + "grad_norm": 3.2475066691008267, + "learning_rate": 1.9476834825856695e-06, + "loss": 0.3236, + "step": 460 + }, + { + "epoch": 0.13, + "grad_norm": 2.7004065801678983, + "learning_rate": 1.947390114553953e-06, + "loss": 0.3747, + "step": 461 + }, + { + "epoch": 0.13, + "grad_norm": 2.4762707217582633, + "learning_rate": 1.947095948500418e-06, + "loss": 0.3298, + "step": 462 + }, + { + "epoch": 0.13, + "grad_norm": 2.359836990811463, + "learning_rate": 1.946800984672851e-06, + "loss": 0.3274, + "step": 463 + }, + { + "epoch": 0.13, + "grad_norm": 2.475164850692892, + "learning_rate": 1.946505223319712e-06, + "loss": 0.3076, + "step": 464 + }, + { + "epoch": 0.13, + "grad_norm": 2.497063237032677, + "learning_rate": 1.946208664690131e-06, + "loss": 0.3366, + "step": 465 + }, + { + "epoch": 0.13, + "grad_norm": 2.5993339212468367, + "learning_rate": 1.9459113090339107e-06, + "loss": 0.3546, + "step": 466 + }, + { + "epoch": 0.13, + "grad_norm": 2.522352594749957, + "learning_rate": 1.945613156601524e-06, + "loss": 0.3454, + "step": 467 + }, + { + "epoch": 0.13, + "grad_norm": 2.4833483392268674, + "learning_rate": 1.945314207644117e-06, + "loss": 0.3262, + "step": 468 + }, + { + "epoch": 0.13, + "grad_norm": 2.715300511368663, + "learning_rate": 1.9450144624135047e-06, + "loss": 0.3532, + "step": 469 + }, + { + "epoch": 0.13, + "grad_norm": 2.668453574347607, + "learning_rate": 1.944713921162174e-06, + "loss": 0.3488, + "step": 470 + }, + { + "epoch": 0.13, + "grad_norm": 2.401790874153332, + "learning_rate": 1.9444125841432814e-06, + "loss": 0.3355, + "step": 471 + }, + { + "epoch": 0.13, + "grad_norm": 2.5611493361085746, + "learning_rate": 1.944110451610655e-06, + "loss": 0.3469, + "step": 472 + }, + { + "epoch": 0.13, + "grad_norm": 2.649852662518354, + "learning_rate": 1.9438075238187914e-06, + "loss": 0.3461, + "step": 473 + }, + { + "epoch": 0.13, + "grad_norm": 2.5142573134371973, + "learning_rate": 1.9435038010228583e-06, + "loss": 0.3577, + "step": 474 + }, + { + "epoch": 0.13, + "grad_norm": 2.6489862422940784, + "learning_rate": 1.9431992834786925e-06, + "loss": 0.3052, + "step": 475 + }, + { + "epoch": 0.13, + "grad_norm": 16.540757693889148, + "learning_rate": 1.9428939714428008e-06, + "loss": 0.3543, + "step": 476 + }, + { + "epoch": 0.14, + "grad_norm": 2.630121182626526, + "learning_rate": 1.9425878651723587e-06, + "loss": 0.3327, + "step": 477 + }, + { + "epoch": 0.14, + "grad_norm": 2.6496095013669674, + "learning_rate": 1.9422809649252107e-06, + "loss": 0.3809, + "step": 478 + }, + { + "epoch": 0.14, + "grad_norm": 2.5789313072631086, + "learning_rate": 1.9419732709598705e-06, + "loss": 0.3517, + "step": 479 + }, + { + "epoch": 0.14, + "grad_norm": 2.5779662665291494, + "learning_rate": 1.94166478353552e-06, + "loss": 0.3393, + "step": 480 + }, + { + "epoch": 0.14, + "grad_norm": 2.463068866924456, + "learning_rate": 1.9413555029120096e-06, + "loss": 0.3126, + "step": 481 + }, + { + "epoch": 0.14, + "grad_norm": 2.4759062085258847, + "learning_rate": 1.9410454293498573e-06, + "loss": 0.3286, + "step": 482 + }, + { + "epoch": 0.14, + "grad_norm": 2.376938714024481, + "learning_rate": 1.9407345631102507e-06, + "loss": 0.3082, + "step": 483 + }, + { + "epoch": 0.14, + "grad_norm": 2.480444918068485, + "learning_rate": 1.940422904455043e-06, + "loss": 0.3557, + "step": 484 + }, + { + "epoch": 0.14, + "grad_norm": 2.653839263450258, + "learning_rate": 1.9401104536467562e-06, + "loss": 0.3158, + "step": 485 + }, + { + "epoch": 0.14, + "grad_norm": 2.6730631024757936, + "learning_rate": 1.93979721094858e-06, + "loss": 0.3582, + "step": 486 + }, + { + "epoch": 0.14, + "grad_norm": 2.516049593333345, + "learning_rate": 1.9394831766243685e-06, + "loss": 0.3389, + "step": 487 + }, + { + "epoch": 0.14, + "grad_norm": 2.8083636392126374, + "learning_rate": 1.9391683509386457e-06, + "loss": 0.3247, + "step": 488 + }, + { + "epoch": 0.14, + "grad_norm": 2.2678950733089147, + "learning_rate": 1.9388527341566008e-06, + "loss": 0.3186, + "step": 489 + }, + { + "epoch": 0.14, + "grad_norm": 2.3716071271350714, + "learning_rate": 1.9385363265440895e-06, + "loss": 0.2982, + "step": 490 + }, + { + "epoch": 0.14, + "grad_norm": 2.4167013747271455, + "learning_rate": 1.9382191283676333e-06, + "loss": 0.3442, + "step": 491 + }, + { + "epoch": 0.14, + "grad_norm": 2.4025041454549023, + "learning_rate": 1.9379011398944207e-06, + "loss": 0.3188, + "step": 492 + }, + { + "epoch": 0.14, + "grad_norm": 2.515862732197645, + "learning_rate": 1.9375823613923047e-06, + "loss": 0.3273, + "step": 493 + }, + { + "epoch": 0.14, + "grad_norm": 2.613914145600926, + "learning_rate": 1.937262793129804e-06, + "loss": 0.3309, + "step": 494 + }, + { + "epoch": 0.14, + "grad_norm": 2.578527306014049, + "learning_rate": 1.9369424353761033e-06, + "loss": 0.3301, + "step": 495 + }, + { + "epoch": 0.14, + "grad_norm": 2.536979840648795, + "learning_rate": 1.936621288401052e-06, + "loss": 0.3524, + "step": 496 + }, + { + "epoch": 0.14, + "grad_norm": 2.537778567527244, + "learning_rate": 1.9362993524751632e-06, + "loss": 0.3478, + "step": 497 + }, + { + "epoch": 0.14, + "grad_norm": 2.767407350887436, + "learning_rate": 1.9359766278696163e-06, + "loss": 0.3498, + "step": 498 + }, + { + "epoch": 0.14, + "grad_norm": 2.3407625140305086, + "learning_rate": 1.9356531148562537e-06, + "loss": 0.2906, + "step": 499 + }, + { + "epoch": 0.14, + "grad_norm": 2.21835283736995, + "learning_rate": 1.9353288137075827e-06, + "loss": 0.3053, + "step": 500 + }, + { + "epoch": 0.14, + "grad_norm": 2.6764139240001645, + "learning_rate": 1.935003724696774e-06, + "loss": 0.3378, + "step": 501 + }, + { + "epoch": 0.14, + "grad_norm": 2.848173934867263, + "learning_rate": 1.9346778480976625e-06, + "loss": 0.3188, + "step": 502 + }, + { + "epoch": 0.14, + "grad_norm": 2.721824420607853, + "learning_rate": 1.9343511841847455e-06, + "loss": 0.3694, + "step": 503 + }, + { + "epoch": 0.14, + "grad_norm": 2.4137958474476844, + "learning_rate": 1.9340237332331844e-06, + "loss": 0.3081, + "step": 504 + }, + { + "epoch": 0.14, + "grad_norm": 2.304436601106214, + "learning_rate": 1.933695495518804e-06, + "loss": 0.3085, + "step": 505 + }, + { + "epoch": 0.14, + "grad_norm": 2.660920458664747, + "learning_rate": 1.9333664713180897e-06, + "loss": 0.3328, + "step": 506 + }, + { + "epoch": 0.14, + "grad_norm": 2.675836492265816, + "learning_rate": 1.933036660908192e-06, + "loss": 0.3514, + "step": 507 + }, + { + "epoch": 0.14, + "grad_norm": 2.592652582435975, + "learning_rate": 1.932706064566922e-06, + "loss": 0.3551, + "step": 508 + }, + { + "epoch": 0.14, + "grad_norm": 2.45395931857169, + "learning_rate": 1.932374682572753e-06, + "loss": 0.3036, + "step": 509 + }, + { + "epoch": 0.14, + "grad_norm": 2.495991500842149, + "learning_rate": 1.9320425152048202e-06, + "loss": 0.3299, + "step": 510 + }, + { + "epoch": 0.14, + "grad_norm": 2.5722593523020048, + "learning_rate": 1.9317095627429214e-06, + "loss": 0.3039, + "step": 511 + }, + { + "epoch": 0.15, + "grad_norm": 2.699696114708531, + "learning_rate": 1.931375825467514e-06, + "loss": 0.3371, + "step": 512 + }, + { + "epoch": 0.15, + "grad_norm": 2.550768314398241, + "learning_rate": 1.9310413036597178e-06, + "loss": 0.336, + "step": 513 + }, + { + "epoch": 0.15, + "grad_norm": 2.415423337002907, + "learning_rate": 1.9307059976013125e-06, + "loss": 0.3084, + "step": 514 + }, + { + "epoch": 0.15, + "grad_norm": 3.3333411433452578, + "learning_rate": 1.930369907574739e-06, + "loss": 0.3311, + "step": 515 + }, + { + "epoch": 0.15, + "grad_norm": 2.6221986764230607, + "learning_rate": 1.9300330338630982e-06, + "loss": 0.3579, + "step": 516 + }, + { + "epoch": 0.15, + "grad_norm": 2.6611984074627193, + "learning_rate": 1.929695376750152e-06, + "loss": 0.3324, + "step": 517 + }, + { + "epoch": 0.15, + "grad_norm": 2.3464416133083468, + "learning_rate": 1.9293569365203202e-06, + "loss": 0.3218, + "step": 518 + }, + { + "epoch": 0.15, + "grad_norm": 2.397852657010271, + "learning_rate": 1.9290177134586847e-06, + "loss": 0.3483, + "step": 519 + }, + { + "epoch": 0.15, + "grad_norm": 2.507353885884589, + "learning_rate": 1.9286777078509856e-06, + "loss": 0.3258, + "step": 520 + }, + { + "epoch": 0.15, + "grad_norm": 2.5193974148260367, + "learning_rate": 1.928336919983622e-06, + "loss": 0.3549, + "step": 521 + }, + { + "epoch": 0.15, + "grad_norm": 2.5488782265641943, + "learning_rate": 1.9279953501436516e-06, + "loss": 0.3011, + "step": 522 + }, + { + "epoch": 0.15, + "grad_norm": 2.5085603852646186, + "learning_rate": 1.927652998618792e-06, + "loss": 0.322, + "step": 523 + }, + { + "epoch": 0.15, + "grad_norm": 2.4263847131574203, + "learning_rate": 1.927309865697419e-06, + "loss": 0.3101, + "step": 524 + }, + { + "epoch": 0.15, + "grad_norm": 2.3616115639087862, + "learning_rate": 1.926965951668565e-06, + "loss": 0.3095, + "step": 525 + }, + { + "epoch": 0.15, + "grad_norm": 2.594453166745246, + "learning_rate": 1.926621256821922e-06, + "loss": 0.3514, + "step": 526 + }, + { + "epoch": 0.15, + "grad_norm": 2.567525367017643, + "learning_rate": 1.9262757814478397e-06, + "loss": 0.3423, + "step": 527 + }, + { + "epoch": 0.15, + "grad_norm": 2.4618364897492793, + "learning_rate": 1.925929525837324e-06, + "loss": 0.3388, + "step": 528 + }, + { + "epoch": 0.15, + "grad_norm": 2.622140958588784, + "learning_rate": 1.92558249028204e-06, + "loss": 0.3613, + "step": 529 + }, + { + "epoch": 0.15, + "grad_norm": 2.45449141356188, + "learning_rate": 1.925234675074308e-06, + "loss": 0.3137, + "step": 530 + }, + { + "epoch": 0.15, + "grad_norm": 2.3719475881683727, + "learning_rate": 1.9248860805071054e-06, + "loss": 0.3193, + "step": 531 + }, + { + "epoch": 0.15, + "grad_norm": 2.489830142014237, + "learning_rate": 1.924536706874066e-06, + "loss": 0.3379, + "step": 532 + }, + { + "epoch": 0.15, + "grad_norm": 2.4421392911819737, + "learning_rate": 1.9241865544694814e-06, + "loss": 0.3191, + "step": 533 + }, + { + "epoch": 0.15, + "grad_norm": 3.157619123600503, + "learning_rate": 1.923835623588297e-06, + "loss": 0.3257, + "step": 534 + }, + { + "epoch": 0.15, + "grad_norm": 2.385666421992085, + "learning_rate": 1.9234839145261152e-06, + "loss": 0.2854, + "step": 535 + }, + { + "epoch": 0.15, + "grad_norm": 2.9066780150364364, + "learning_rate": 1.923131427579193e-06, + "loss": 0.3441, + "step": 536 + }, + { + "epoch": 0.15, + "grad_norm": 2.573671704791437, + "learning_rate": 1.9227781630444444e-06, + "loss": 0.3418, + "step": 537 + }, + { + "epoch": 0.15, + "grad_norm": 2.395871838099826, + "learning_rate": 1.9224241212194363e-06, + "loss": 0.2987, + "step": 538 + }, + { + "epoch": 0.15, + "grad_norm": 2.3230103292952684, + "learning_rate": 1.9220693024023915e-06, + "loss": 0.3357, + "step": 539 + }, + { + "epoch": 0.15, + "grad_norm": 2.486607024771495, + "learning_rate": 1.921713706892187e-06, + "loss": 0.352, + "step": 540 + }, + { + "epoch": 0.15, + "grad_norm": 2.395039371562957, + "learning_rate": 1.9213573349883544e-06, + "loss": 0.3222, + "step": 541 + }, + { + "epoch": 0.15, + "grad_norm": 2.3744101515616243, + "learning_rate": 1.9210001869910785e-06, + "loss": 0.297, + "step": 542 + }, + { + "epoch": 0.15, + "grad_norm": 2.8587800610376006, + "learning_rate": 1.9206422632011987e-06, + "loss": 0.3287, + "step": 543 + }, + { + "epoch": 0.15, + "grad_norm": 3.3743691210117297, + "learning_rate": 1.920283563920207e-06, + "loss": 0.3331, + "step": 544 + }, + { + "epoch": 0.15, + "grad_norm": 2.5527534687842115, + "learning_rate": 1.9199240894502497e-06, + "loss": 0.3361, + "step": 545 + }, + { + "epoch": 0.15, + "grad_norm": 2.6134344899024025, + "learning_rate": 1.919563840094125e-06, + "loss": 0.3363, + "step": 546 + }, + { + "epoch": 0.15, + "grad_norm": 2.321694668053294, + "learning_rate": 1.9192028161552843e-06, + "loss": 0.3191, + "step": 547 + }, + { + "epoch": 0.16, + "grad_norm": 2.5342449599959207, + "learning_rate": 1.918841017937832e-06, + "loss": 0.3174, + "step": 548 + }, + { + "epoch": 0.16, + "grad_norm": 2.413833093027609, + "learning_rate": 1.9184784457465236e-06, + "loss": 0.3133, + "step": 549 + }, + { + "epoch": 0.16, + "grad_norm": 2.6117314393101876, + "learning_rate": 1.918115099886767e-06, + "loss": 0.2876, + "step": 550 + }, + { + "epoch": 0.16, + "grad_norm": 2.5391976887583554, + "learning_rate": 1.9177509806646224e-06, + "loss": 0.334, + "step": 551 + }, + { + "epoch": 0.16, + "grad_norm": 2.4016170632334592, + "learning_rate": 1.9173860883868005e-06, + "loss": 0.334, + "step": 552 + }, + { + "epoch": 0.16, + "grad_norm": 3.6440124440068837, + "learning_rate": 1.9170204233606638e-06, + "loss": 0.3077, + "step": 553 + }, + { + "epoch": 0.16, + "grad_norm": 2.482547368266431, + "learning_rate": 1.9166539858942254e-06, + "loss": 0.3164, + "step": 554 + }, + { + "epoch": 0.16, + "grad_norm": 2.4041117231577642, + "learning_rate": 1.9162867762961495e-06, + "loss": 0.328, + "step": 555 + }, + { + "epoch": 0.16, + "grad_norm": 2.564868045958623, + "learning_rate": 1.91591879487575e-06, + "loss": 0.3859, + "step": 556 + }, + { + "epoch": 0.16, + "grad_norm": 2.4954454983835226, + "learning_rate": 1.9155500419429915e-06, + "loss": 0.3196, + "step": 557 + }, + { + "epoch": 0.16, + "grad_norm": 2.2622655946584604, + "learning_rate": 1.9151805178084877e-06, + "loss": 0.2938, + "step": 558 + }, + { + "epoch": 0.16, + "grad_norm": 2.5314647358932616, + "learning_rate": 1.9148102227835032e-06, + "loss": 0.3244, + "step": 559 + }, + { + "epoch": 0.16, + "grad_norm": 2.378307639689325, + "learning_rate": 1.9144391571799508e-06, + "loss": 0.3212, + "step": 560 + }, + { + "epoch": 0.16, + "grad_norm": 2.486662946691639, + "learning_rate": 1.914067321310393e-06, + "loss": 0.3252, + "step": 561 + }, + { + "epoch": 0.16, + "grad_norm": 2.5296589870880304, + "learning_rate": 1.9136947154880413e-06, + "loss": 0.356, + "step": 562 + }, + { + "epoch": 0.16, + "grad_norm": 2.308279577860115, + "learning_rate": 1.9133213400267547e-06, + "loss": 0.2777, + "step": 563 + }, + { + "epoch": 0.16, + "grad_norm": 2.5577799075870113, + "learning_rate": 1.9129471952410416e-06, + "loss": 0.316, + "step": 564 + }, + { + "epoch": 0.16, + "grad_norm": 2.4022008859887922, + "learning_rate": 1.912572281446058e-06, + "loss": 0.293, + "step": 565 + }, + { + "epoch": 0.16, + "grad_norm": 2.7900327421429663, + "learning_rate": 1.9121965989576074e-06, + "loss": 0.3221, + "step": 566 + }, + { + "epoch": 0.16, + "grad_norm": 2.302655008983288, + "learning_rate": 1.9118201480921414e-06, + "loss": 0.2883, + "step": 567 + }, + { + "epoch": 0.16, + "grad_norm": 2.7137787292734403, + "learning_rate": 1.911442929166758e-06, + "loss": 0.3416, + "step": 568 + }, + { + "epoch": 0.16, + "grad_norm": 2.4111683606362626, + "learning_rate": 1.911064942499204e-06, + "loss": 0.2932, + "step": 569 + }, + { + "epoch": 0.16, + "grad_norm": 2.5828661156738106, + "learning_rate": 1.91068618840787e-06, + "loss": 0.341, + "step": 570 + }, + { + "epoch": 0.16, + "grad_norm": 2.589921775852251, + "learning_rate": 1.9103066672117954e-06, + "loss": 0.3259, + "step": 571 + }, + { + "epoch": 0.16, + "grad_norm": 2.6417147099420015, + "learning_rate": 1.909926379230665e-06, + "loss": 0.376, + "step": 572 + }, + { + "epoch": 0.16, + "grad_norm": 2.388934604814071, + "learning_rate": 1.9095453247848097e-06, + "loss": 0.3073, + "step": 573 + }, + { + "epoch": 0.16, + "grad_norm": 2.6524201982252116, + "learning_rate": 1.909163504195205e-06, + "loss": 0.3319, + "step": 574 + }, + { + "epoch": 0.16, + "grad_norm": 2.4339126604553596, + "learning_rate": 1.9087809177834733e-06, + "loss": 0.3259, + "step": 575 + }, + { + "epoch": 0.16, + "grad_norm": 2.5826745669652147, + "learning_rate": 1.9083975658718804e-06, + "loss": 0.362, + "step": 576 + }, + { + "epoch": 0.16, + "grad_norm": 2.5029175314188823, + "learning_rate": 1.908013448783339e-06, + "loss": 0.3201, + "step": 577 + }, + { + "epoch": 0.16, + "grad_norm": 2.3953466865243507, + "learning_rate": 1.9076285668414042e-06, + "loss": 0.3018, + "step": 578 + }, + { + "epoch": 0.16, + "grad_norm": 2.432260404365039, + "learning_rate": 1.907242920370277e-06, + "loss": 0.3351, + "step": 579 + }, + { + "epoch": 0.16, + "grad_norm": 2.421859217898423, + "learning_rate": 1.9068565096948014e-06, + "loss": 0.3264, + "step": 580 + }, + { + "epoch": 0.16, + "grad_norm": 2.6404787924858732, + "learning_rate": 1.9064693351404655e-06, + "loss": 0.3268, + "step": 581 + }, + { + "epoch": 0.16, + "grad_norm": 2.8951196157160384, + "learning_rate": 1.9060813970334006e-06, + "loss": 0.3217, + "step": 582 + }, + { + "epoch": 0.17, + "grad_norm": 2.5423693712583666, + "learning_rate": 1.9056926957003818e-06, + "loss": 0.325, + "step": 583 + }, + { + "epoch": 0.17, + "grad_norm": 2.6348173425693346, + "learning_rate": 1.9053032314688261e-06, + "loss": 0.3266, + "step": 584 + }, + { + "epoch": 0.17, + "grad_norm": 2.335526260808841, + "learning_rate": 1.904913004666794e-06, + "loss": 0.3154, + "step": 585 + }, + { + "epoch": 0.17, + "grad_norm": 2.379364811568399, + "learning_rate": 1.904522015622988e-06, + "loss": 0.3142, + "step": 586 + }, + { + "epoch": 0.17, + "grad_norm": 3.378276249403308, + "learning_rate": 1.9041302646667526e-06, + "loss": 0.3054, + "step": 587 + }, + { + "epoch": 0.17, + "grad_norm": 2.6371331096990946, + "learning_rate": 1.903737752128074e-06, + "loss": 0.3344, + "step": 588 + }, + { + "epoch": 0.17, + "grad_norm": 2.567548383563427, + "learning_rate": 1.9033444783375804e-06, + "loss": 0.313, + "step": 589 + }, + { + "epoch": 0.17, + "grad_norm": 2.666816454240892, + "learning_rate": 1.9029504436265405e-06, + "loss": 0.3282, + "step": 590 + }, + { + "epoch": 0.17, + "grad_norm": 2.6100809481220226, + "learning_rate": 1.9025556483268646e-06, + "loss": 0.3456, + "step": 591 + }, + { + "epoch": 0.17, + "grad_norm": 2.465309639776073, + "learning_rate": 1.9021600927711035e-06, + "loss": 0.3375, + "step": 592 + }, + { + "epoch": 0.17, + "grad_norm": 2.3866245619525572, + "learning_rate": 1.901763777292448e-06, + "loss": 0.3103, + "step": 593 + }, + { + "epoch": 0.17, + "grad_norm": 2.506109281722593, + "learning_rate": 1.9013667022247295e-06, + "loss": 0.3546, + "step": 594 + }, + { + "epoch": 0.17, + "grad_norm": 2.4005233786171747, + "learning_rate": 1.9009688679024189e-06, + "loss": 0.3456, + "step": 595 + }, + { + "epoch": 0.17, + "grad_norm": 2.252931620591618, + "learning_rate": 1.900570274660627e-06, + "loss": 0.3204, + "step": 596 + }, + { + "epoch": 0.17, + "grad_norm": 2.5630832843404354, + "learning_rate": 1.900170922835104e-06, + "loss": 0.3014, + "step": 597 + }, + { + "epoch": 0.17, + "grad_norm": 2.776046404382503, + "learning_rate": 1.899770812762238e-06, + "loss": 0.3166, + "step": 598 + }, + { + "epoch": 0.17, + "grad_norm": 2.4769903180988444, + "learning_rate": 1.8993699447790573e-06, + "loss": 0.3285, + "step": 599 + }, + { + "epoch": 0.17, + "grad_norm": 2.6638868801905664, + "learning_rate": 1.8989683192232274e-06, + "loss": 0.3336, + "step": 600 + }, + { + "epoch": 0.17, + "grad_norm": 2.553206602922454, + "learning_rate": 1.898565936433052e-06, + "loss": 0.3442, + "step": 601 + }, + { + "epoch": 0.17, + "grad_norm": 2.5268981785339535, + "learning_rate": 1.8981627967474738e-06, + "loss": 0.3122, + "step": 602 + }, + { + "epoch": 0.17, + "grad_norm": 2.5220239590274005, + "learning_rate": 1.8977589005060722e-06, + "loss": 0.3039, + "step": 603 + }, + { + "epoch": 0.17, + "grad_norm": 2.31394471539144, + "learning_rate": 1.8973542480490634e-06, + "loss": 0.3123, + "step": 604 + }, + { + "epoch": 0.17, + "grad_norm": 2.391150021660033, + "learning_rate": 1.8969488397173018e-06, + "loss": 0.3363, + "step": 605 + }, + { + "epoch": 0.17, + "grad_norm": 2.472462582484426, + "learning_rate": 1.8965426758522779e-06, + "loss": 0.306, + "step": 606 + }, + { + "epoch": 0.17, + "grad_norm": 2.587624273773328, + "learning_rate": 1.8961357567961178e-06, + "loss": 0.3533, + "step": 607 + }, + { + "epoch": 0.17, + "grad_norm": 2.347674821776924, + "learning_rate": 1.8957280828915853e-06, + "loss": 0.3416, + "step": 608 + }, + { + "epoch": 0.17, + "grad_norm": 2.36594618155158, + "learning_rate": 1.8953196544820789e-06, + "loss": 0.3117, + "step": 609 + }, + { + "epoch": 0.17, + "grad_norm": 2.3850526329845225, + "learning_rate": 1.894910471911633e-06, + "loss": 0.3458, + "step": 610 + }, + { + "epoch": 0.17, + "grad_norm": 2.435275666201602, + "learning_rate": 1.8945005355249175e-06, + "loss": 0.3243, + "step": 611 + }, + { + "epoch": 0.17, + "grad_norm": 2.4942240390673143, + "learning_rate": 1.8940898456672368e-06, + "loss": 0.3712, + "step": 612 + }, + { + "epoch": 0.17, + "grad_norm": 2.6755213116047805, + "learning_rate": 1.89367840268453e-06, + "loss": 0.3011, + "step": 613 + }, + { + "epoch": 0.17, + "grad_norm": 2.5560943374584237, + "learning_rate": 1.8932662069233713e-06, + "loss": 0.3243, + "step": 614 + }, + { + "epoch": 0.17, + "grad_norm": 2.3265956737861715, + "learning_rate": 1.892853258730968e-06, + "loss": 0.3042, + "step": 615 + }, + { + "epoch": 0.17, + "grad_norm": 2.387187177414009, + "learning_rate": 1.892439558455162e-06, + "loss": 0.3214, + "step": 616 + }, + { + "epoch": 0.17, + "grad_norm": 2.592245862879042, + "learning_rate": 1.892025106444428e-06, + "loss": 0.3066, + "step": 617 + }, + { + "epoch": 0.18, + "grad_norm": 2.334983691084106, + "learning_rate": 1.8916099030478746e-06, + "loss": 0.322, + "step": 618 + }, + { + "epoch": 0.18, + "grad_norm": 2.4200119260792414, + "learning_rate": 1.891193948615243e-06, + "loss": 0.3414, + "step": 619 + }, + { + "epoch": 0.18, + "grad_norm": 2.341872649046259, + "learning_rate": 1.890777243496907e-06, + "loss": 0.3028, + "step": 620 + }, + { + "epoch": 0.18, + "grad_norm": 2.699959166733786, + "learning_rate": 1.8903597880438727e-06, + "loss": 0.3372, + "step": 621 + }, + { + "epoch": 0.18, + "grad_norm": 2.6037992641562195, + "learning_rate": 1.8899415826077783e-06, + "loss": 0.3287, + "step": 622 + }, + { + "epoch": 0.18, + "grad_norm": 2.5600828901372816, + "learning_rate": 1.8895226275408937e-06, + "loss": 0.3081, + "step": 623 + }, + { + "epoch": 0.18, + "grad_norm": 2.47503393467837, + "learning_rate": 1.8891029231961207e-06, + "loss": 0.3134, + "step": 624 + }, + { + "epoch": 0.18, + "grad_norm": 2.475124180370412, + "learning_rate": 1.8886824699269912e-06, + "loss": 0.3327, + "step": 625 + }, + { + "epoch": 0.18, + "grad_norm": 2.4558851933431, + "learning_rate": 1.8882612680876689e-06, + "loss": 0.3099, + "step": 626 + }, + { + "epoch": 0.18, + "grad_norm": 2.4377750490408023, + "learning_rate": 1.887839318032948e-06, + "loss": 0.3065, + "step": 627 + }, + { + "epoch": 0.18, + "grad_norm": 2.4888520087900017, + "learning_rate": 1.8874166201182522e-06, + "loss": 0.2961, + "step": 628 + }, + { + "epoch": 0.18, + "grad_norm": 2.429511339842882, + "learning_rate": 1.8869931746996358e-06, + "loss": 0.2766, + "step": 629 + }, + { + "epoch": 0.18, + "grad_norm": 2.517207986809363, + "learning_rate": 1.8865689821337825e-06, + "loss": 0.3325, + "step": 630 + }, + { + "epoch": 0.18, + "grad_norm": 2.5414615608781967, + "learning_rate": 1.8861440427780058e-06, + "loss": 0.3545, + "step": 631 + }, + { + "epoch": 0.18, + "grad_norm": 6.389922704628554, + "learning_rate": 1.8857183569902473e-06, + "loss": 0.3134, + "step": 632 + }, + { + "epoch": 0.18, + "grad_norm": 2.469158788196905, + "learning_rate": 1.8852919251290783e-06, + "loss": 0.3327, + "step": 633 + }, + { + "epoch": 0.18, + "grad_norm": 2.4225156287436476, + "learning_rate": 1.884864747553698e-06, + "loss": 0.3232, + "step": 634 + }, + { + "epoch": 0.18, + "grad_norm": 2.541582435994063, + "learning_rate": 1.884436824623934e-06, + "loss": 0.3067, + "step": 635 + }, + { + "epoch": 0.18, + "grad_norm": 2.324430056321435, + "learning_rate": 1.8840081567002417e-06, + "loss": 0.3344, + "step": 636 + }, + { + "epoch": 0.18, + "grad_norm": 2.4445507644563325, + "learning_rate": 1.883578744143704e-06, + "loss": 0.322, + "step": 637 + }, + { + "epoch": 0.18, + "grad_norm": 2.398784855142958, + "learning_rate": 1.8831485873160312e-06, + "loss": 0.2996, + "step": 638 + }, + { + "epoch": 0.18, + "grad_norm": 2.490567840851795, + "learning_rate": 1.8827176865795596e-06, + "loss": 0.3261, + "step": 639 + }, + { + "epoch": 0.18, + "grad_norm": 2.736732509195614, + "learning_rate": 1.8822860422972534e-06, + "loss": 0.3633, + "step": 640 + }, + { + "epoch": 0.18, + "grad_norm": 2.298233280198889, + "learning_rate": 1.8818536548327026e-06, + "loss": 0.3252, + "step": 641 + }, + { + "epoch": 0.18, + "grad_norm": 2.381279510885023, + "learning_rate": 1.8814205245501234e-06, + "loss": 0.3223, + "step": 642 + }, + { + "epoch": 0.18, + "grad_norm": 2.6425786131254654, + "learning_rate": 1.880986651814357e-06, + "loss": 0.2965, + "step": 643 + }, + { + "epoch": 0.18, + "grad_norm": 2.608150395709567, + "learning_rate": 1.8805520369908705e-06, + "loss": 0.3117, + "step": 644 + }, + { + "epoch": 0.18, + "grad_norm": 2.528769452573672, + "learning_rate": 1.8801166804457568e-06, + "loss": 0.3365, + "step": 645 + }, + { + "epoch": 0.18, + "grad_norm": 2.593082547173156, + "learning_rate": 1.879680582545732e-06, + "loss": 0.3417, + "step": 646 + }, + { + "epoch": 0.18, + "grad_norm": 2.235518751649165, + "learning_rate": 1.879243743658138e-06, + "loss": 0.2968, + "step": 647 + }, + { + "epoch": 0.18, + "grad_norm": 2.512840027266576, + "learning_rate": 1.8788061641509398e-06, + "loss": 0.331, + "step": 648 + }, + { + "epoch": 0.18, + "grad_norm": 2.623875669173926, + "learning_rate": 1.878367844392728e-06, + "loss": 0.3278, + "step": 649 + }, + { + "epoch": 0.18, + "grad_norm": 2.613013341049209, + "learning_rate": 1.8779287847527146e-06, + "loss": 0.3081, + "step": 650 + }, + { + "epoch": 0.18, + "grad_norm": 2.3967165560552526, + "learning_rate": 1.877488985600736e-06, + "loss": 0.2915, + "step": 651 + }, + { + "epoch": 0.18, + "grad_norm": 2.461584837861662, + "learning_rate": 1.8770484473072517e-06, + "loss": 0.3174, + "step": 652 + }, + { + "epoch": 0.19, + "grad_norm": 2.5909734569213563, + "learning_rate": 1.8766071702433427e-06, + "loss": 0.3462, + "step": 653 + }, + { + "epoch": 0.19, + "grad_norm": 2.3003987637657093, + "learning_rate": 1.8761651547807142e-06, + "loss": 0.2864, + "step": 654 + }, + { + "epoch": 0.19, + "grad_norm": 2.3914632891284517, + "learning_rate": 1.875722401291691e-06, + "loss": 0.3229, + "step": 655 + }, + { + "epoch": 0.19, + "grad_norm": 2.745394637477216, + "learning_rate": 1.8752789101492214e-06, + "loss": 0.3379, + "step": 656 + }, + { + "epoch": 0.19, + "grad_norm": 2.637442893764372, + "learning_rate": 1.8748346817268745e-06, + "loss": 0.2811, + "step": 657 + }, + { + "epoch": 0.19, + "grad_norm": 2.7011526479877643, + "learning_rate": 1.87438971639884e-06, + "loss": 0.3486, + "step": 658 + }, + { + "epoch": 0.19, + "grad_norm": 2.37855781182186, + "learning_rate": 1.8739440145399293e-06, + "loss": 0.3502, + "step": 659 + }, + { + "epoch": 0.19, + "grad_norm": 2.4520047039317117, + "learning_rate": 1.873497576525573e-06, + "loss": 0.3201, + "step": 660 + }, + { + "epoch": 0.19, + "grad_norm": 2.634743516274746, + "learning_rate": 1.873050402731822e-06, + "loss": 0.32, + "step": 661 + }, + { + "epoch": 0.19, + "grad_norm": 2.5434914673155924, + "learning_rate": 1.8726024935353487e-06, + "loss": 0.3073, + "step": 662 + }, + { + "epoch": 0.19, + "grad_norm": 2.405470557161396, + "learning_rate": 1.8721538493134425e-06, + "loss": 0.3465, + "step": 663 + }, + { + "epoch": 0.19, + "grad_norm": 2.5890725900604377, + "learning_rate": 1.8717044704440137e-06, + "loss": 0.3216, + "step": 664 + }, + { + "epoch": 0.19, + "grad_norm": 2.6171547730436733, + "learning_rate": 1.8712543573055903e-06, + "loss": 0.3311, + "step": 665 + }, + { + "epoch": 0.19, + "grad_norm": 2.6274679971113475, + "learning_rate": 1.8708035102773196e-06, + "loss": 0.3092, + "step": 666 + }, + { + "epoch": 0.19, + "grad_norm": 3.0832936101998927, + "learning_rate": 1.8703519297389667e-06, + "loss": 0.3331, + "step": 667 + }, + { + "epoch": 0.19, + "grad_norm": 2.370456160821706, + "learning_rate": 1.8698996160709146e-06, + "loss": 0.2874, + "step": 668 + }, + { + "epoch": 0.19, + "grad_norm": 2.4773772224277013, + "learning_rate": 1.8694465696541639e-06, + "loss": 0.32, + "step": 669 + }, + { + "epoch": 0.19, + "grad_norm": 2.4624072608294227, + "learning_rate": 1.8689927908703322e-06, + "loss": 0.3001, + "step": 670 + }, + { + "epoch": 0.19, + "grad_norm": 2.4253560050409084, + "learning_rate": 1.8685382801016547e-06, + "loss": 0.3164, + "step": 671 + }, + { + "epoch": 0.19, + "grad_norm": 2.5349779292713572, + "learning_rate": 1.868083037730982e-06, + "loss": 0.3408, + "step": 672 + }, + { + "epoch": 0.19, + "grad_norm": 2.98022002253498, + "learning_rate": 1.8676270641417821e-06, + "loss": 0.3569, + "step": 673 + }, + { + "epoch": 0.19, + "grad_norm": 2.4999640201078237, + "learning_rate": 1.8671703597181383e-06, + "loss": 0.326, + "step": 674 + }, + { + "epoch": 0.19, + "grad_norm": 2.602274939593711, + "learning_rate": 1.8667129248447497e-06, + "loss": 0.3487, + "step": 675 + }, + { + "epoch": 0.19, + "grad_norm": 2.650254061550966, + "learning_rate": 1.8662547599069308e-06, + "loss": 0.3179, + "step": 676 + }, + { + "epoch": 0.19, + "grad_norm": 2.6539404492903316, + "learning_rate": 1.8657958652906106e-06, + "loss": 0.3066, + "step": 677 + }, + { + "epoch": 0.19, + "grad_norm": 2.4099554187480887, + "learning_rate": 1.8653362413823331e-06, + "loss": 0.3084, + "step": 678 + }, + { + "epoch": 0.19, + "grad_norm": 2.611577650482648, + "learning_rate": 1.8648758885692569e-06, + "loss": 0.3539, + "step": 679 + }, + { + "epoch": 0.19, + "grad_norm": 2.2722703116941463, + "learning_rate": 1.8644148072391537e-06, + "loss": 0.3013, + "step": 680 + }, + { + "epoch": 0.19, + "grad_norm": 2.641577905268663, + "learning_rate": 1.86395299778041e-06, + "loss": 0.3231, + "step": 681 + }, + { + "epoch": 0.19, + "grad_norm": 3.2759503960375165, + "learning_rate": 1.8634904605820244e-06, + "loss": 0.3255, + "step": 682 + }, + { + "epoch": 0.19, + "grad_norm": 2.4768927319494694, + "learning_rate": 1.8630271960336096e-06, + "loss": 0.3297, + "step": 683 + }, + { + "epoch": 0.19, + "grad_norm": 2.5489379391148748, + "learning_rate": 1.8625632045253905e-06, + "loss": 0.3336, + "step": 684 + }, + { + "epoch": 0.19, + "grad_norm": 2.67838618655016, + "learning_rate": 1.8620984864482042e-06, + "loss": 0.3079, + "step": 685 + }, + { + "epoch": 0.19, + "grad_norm": 2.586620003188253, + "learning_rate": 1.8616330421935001e-06, + "loss": 0.3386, + "step": 686 + }, + { + "epoch": 0.19, + "grad_norm": 2.339499160262079, + "learning_rate": 1.861166872153339e-06, + "loss": 0.3196, + "step": 687 + }, + { + "epoch": 0.19, + "grad_norm": 2.3721624029710053, + "learning_rate": 1.860699976720393e-06, + "loss": 0.3084, + "step": 688 + }, + { + "epoch": 0.2, + "grad_norm": 2.4086625407569775, + "learning_rate": 1.8602323562879461e-06, + "loss": 0.3253, + "step": 689 + }, + { + "epoch": 0.2, + "grad_norm": 2.599196154030309, + "learning_rate": 1.8597640112498914e-06, + "loss": 0.3298, + "step": 690 + }, + { + "epoch": 0.2, + "grad_norm": 2.439632928413326, + "learning_rate": 1.859294942000734e-06, + "loss": 0.3522, + "step": 691 + }, + { + "epoch": 0.2, + "grad_norm": 2.2742388509467504, + "learning_rate": 1.8588251489355882e-06, + "loss": 0.3085, + "step": 692 + }, + { + "epoch": 0.2, + "grad_norm": 2.6673271769516163, + "learning_rate": 1.8583546324501781e-06, + "loss": 0.2883, + "step": 693 + }, + { + "epoch": 0.2, + "grad_norm": 2.517797219208855, + "learning_rate": 1.857883392940837e-06, + "loss": 0.3105, + "step": 694 + }, + { + "epoch": 0.2, + "grad_norm": 2.386077222115691, + "learning_rate": 1.8574114308045074e-06, + "loss": 0.3316, + "step": 695 + }, + { + "epoch": 0.2, + "grad_norm": 2.628302002056542, + "learning_rate": 1.856938746438741e-06, + "loss": 0.328, + "step": 696 + }, + { + "epoch": 0.2, + "grad_norm": 2.368441850241122, + "learning_rate": 1.8564653402416968e-06, + "loss": 0.2803, + "step": 697 + }, + { + "epoch": 0.2, + "grad_norm": 2.4692437770109708, + "learning_rate": 1.8559912126121424e-06, + "loss": 0.3359, + "step": 698 + }, + { + "epoch": 0.2, + "grad_norm": 2.5386494618133373, + "learning_rate": 1.8555163639494534e-06, + "loss": 0.3291, + "step": 699 + }, + { + "epoch": 0.2, + "grad_norm": 9.033454716840346, + "learning_rate": 1.8550407946536123e-06, + "loss": 0.3158, + "step": 700 + }, + { + "epoch": 0.2, + "grad_norm": 2.5159179694463285, + "learning_rate": 1.854564505125209e-06, + "loss": 0.3187, + "step": 701 + }, + { + "epoch": 0.2, + "grad_norm": 2.5230000058408466, + "learning_rate": 1.8540874957654396e-06, + "loss": 0.3, + "step": 702 + }, + { + "epoch": 0.2, + "grad_norm": 2.7927177428869077, + "learning_rate": 1.8536097669761064e-06, + "loss": 0.3026, + "step": 703 + }, + { + "epoch": 0.2, + "grad_norm": 2.4839005856698795, + "learning_rate": 1.8531313191596186e-06, + "loss": 0.3017, + "step": 704 + }, + { + "epoch": 0.2, + "grad_norm": 2.800891099137517, + "learning_rate": 1.8526521527189903e-06, + "loss": 0.3359, + "step": 705 + }, + { + "epoch": 0.2, + "grad_norm": 2.2846064092871186, + "learning_rate": 1.8521722680578411e-06, + "loss": 0.3122, + "step": 706 + }, + { + "epoch": 0.2, + "grad_norm": 2.233088715040393, + "learning_rate": 1.851691665580396e-06, + "loss": 0.2861, + "step": 707 + }, + { + "epoch": 0.2, + "grad_norm": 2.538036700824122, + "learning_rate": 1.851210345691484e-06, + "loss": 0.3084, + "step": 708 + }, + { + "epoch": 0.2, + "grad_norm": 2.45913218863696, + "learning_rate": 1.8507283087965387e-06, + "loss": 0.3205, + "step": 709 + }, + { + "epoch": 0.2, + "grad_norm": 2.5787057825261046, + "learning_rate": 1.8502455553015976e-06, + "loss": 0.3406, + "step": 710 + }, + { + "epoch": 0.2, + "grad_norm": 2.8389237764621162, + "learning_rate": 1.8497620856133019e-06, + "loss": 0.352, + "step": 711 + }, + { + "epoch": 0.2, + "grad_norm": 2.4819341553311967, + "learning_rate": 1.8492779001388964e-06, + "loss": 0.3211, + "step": 712 + }, + { + "epoch": 0.2, + "grad_norm": 2.489319817396486, + "learning_rate": 1.848792999286228e-06, + "loss": 0.3089, + "step": 713 + }, + { + "epoch": 0.2, + "grad_norm": 2.5388004718059367, + "learning_rate": 1.8483073834637467e-06, + "loss": 0.3115, + "step": 714 + }, + { + "epoch": 0.2, + "grad_norm": 2.7326112989715554, + "learning_rate": 1.847821053080505e-06, + "loss": 0.3376, + "step": 715 + }, + { + "epoch": 0.2, + "grad_norm": 2.5047349342201994, + "learning_rate": 1.8473340085461567e-06, + "loss": 0.3138, + "step": 716 + }, + { + "epoch": 0.2, + "grad_norm": 2.804498503318447, + "learning_rate": 1.8468462502709577e-06, + "loss": 0.35, + "step": 717 + }, + { + "epoch": 0.2, + "grad_norm": 2.439841721380218, + "learning_rate": 1.8463577786657649e-06, + "loss": 0.3395, + "step": 718 + }, + { + "epoch": 0.2, + "grad_norm": 2.3401956581086893, + "learning_rate": 1.8458685941420358e-06, + "loss": 0.2785, + "step": 719 + }, + { + "epoch": 0.2, + "grad_norm": 2.394895809355412, + "learning_rate": 1.8453786971118287e-06, + "loss": 0.3223, + "step": 720 + }, + { + "epoch": 0.2, + "grad_norm": 2.4660553191758, + "learning_rate": 1.8448880879878024e-06, + "loss": 0.3619, + "step": 721 + }, + { + "epoch": 0.2, + "grad_norm": 2.301475320635097, + "learning_rate": 1.8443967671832148e-06, + "loss": 0.2969, + "step": 722 + }, + { + "epoch": 0.2, + "grad_norm": 3.0721556769159895, + "learning_rate": 1.843904735111924e-06, + "loss": 0.3455, + "step": 723 + }, + { + "epoch": 0.21, + "grad_norm": 2.1798953175310274, + "learning_rate": 1.8434119921883861e-06, + "loss": 0.2835, + "step": 724 + }, + { + "epoch": 0.21, + "grad_norm": 2.3614911044919316, + "learning_rate": 1.8429185388276576e-06, + "loss": 0.3089, + "step": 725 + }, + { + "epoch": 0.21, + "grad_norm": 2.6367517629764587, + "learning_rate": 1.8424243754453919e-06, + "loss": 0.3234, + "step": 726 + }, + { + "epoch": 0.21, + "grad_norm": 2.4323439863930796, + "learning_rate": 1.8419295024578416e-06, + "loss": 0.3071, + "step": 727 + }, + { + "epoch": 0.21, + "grad_norm": 2.636030318990423, + "learning_rate": 1.8414339202818562e-06, + "loss": 0.3645, + "step": 728 + }, + { + "epoch": 0.21, + "grad_norm": 2.5239856793906656, + "learning_rate": 1.8409376293348834e-06, + "loss": 0.299, + "step": 729 + }, + { + "epoch": 0.21, + "grad_norm": 2.3291366949117034, + "learning_rate": 1.840440630034967e-06, + "loss": 0.3238, + "step": 730 + }, + { + "epoch": 0.21, + "grad_norm": 2.3344355021750274, + "learning_rate": 1.8399429228007483e-06, + "loss": 0.2983, + "step": 731 + }, + { + "epoch": 0.21, + "grad_norm": 2.5558353060456525, + "learning_rate": 1.8394445080514642e-06, + "loss": 0.2869, + "step": 732 + }, + { + "epoch": 0.21, + "grad_norm": 2.4247971738192935, + "learning_rate": 1.838945386206948e-06, + "loss": 0.292, + "step": 733 + }, + { + "epoch": 0.21, + "grad_norm": 2.5825090055477484, + "learning_rate": 1.8384455576876288e-06, + "loss": 0.3063, + "step": 734 + }, + { + "epoch": 0.21, + "grad_norm": 2.296180432216192, + "learning_rate": 1.8379450229145305e-06, + "loss": 0.3052, + "step": 735 + }, + { + "epoch": 0.21, + "grad_norm": 2.358017059445579, + "learning_rate": 1.8374437823092722e-06, + "loss": 0.259, + "step": 736 + }, + { + "epoch": 0.21, + "grad_norm": 2.3676231371995446, + "learning_rate": 1.8369418362940673e-06, + "loss": 0.3186, + "step": 737 + }, + { + "epoch": 0.21, + "grad_norm": 2.220845042724821, + "learning_rate": 1.8364391852917235e-06, + "loss": 0.3032, + "step": 738 + }, + { + "epoch": 0.21, + "grad_norm": 2.726016115467376, + "learning_rate": 1.8359358297256427e-06, + "loss": 0.3386, + "step": 739 + }, + { + "epoch": 0.21, + "grad_norm": 2.5838509942685897, + "learning_rate": 1.8354317700198196e-06, + "loss": 0.327, + "step": 740 + }, + { + "epoch": 0.21, + "grad_norm": 2.6091104337763684, + "learning_rate": 1.8349270065988427e-06, + "loss": 0.3253, + "step": 741 + }, + { + "epoch": 0.21, + "grad_norm": 2.714238298218788, + "learning_rate": 1.8344215398878924e-06, + "loss": 0.2946, + "step": 742 + }, + { + "epoch": 0.21, + "grad_norm": 2.380436878366774, + "learning_rate": 1.8339153703127428e-06, + "loss": 0.3225, + "step": 743 + }, + { + "epoch": 0.21, + "grad_norm": 2.2474161169587887, + "learning_rate": 1.8334084982997586e-06, + "loss": 0.2879, + "step": 744 + }, + { + "epoch": 0.21, + "grad_norm": 2.4824008936563606, + "learning_rate": 1.8329009242758975e-06, + "loss": 0.3321, + "step": 745 + }, + { + "epoch": 0.21, + "grad_norm": 2.387839320021625, + "learning_rate": 1.8323926486687073e-06, + "loss": 0.3115, + "step": 746 + }, + { + "epoch": 0.21, + "grad_norm": 2.4975073563611065, + "learning_rate": 1.8318836719063277e-06, + "loss": 0.2931, + "step": 747 + }, + { + "epoch": 0.21, + "grad_norm": 2.6069296054765014, + "learning_rate": 1.831373994417489e-06, + "loss": 0.3373, + "step": 748 + }, + { + "epoch": 0.21, + "grad_norm": 2.6304583634758507, + "learning_rate": 1.830863616631511e-06, + "loss": 0.3244, + "step": 749 + }, + { + "epoch": 0.21, + "grad_norm": 2.4945219891074273, + "learning_rate": 1.830352538978304e-06, + "loss": 0.3045, + "step": 750 + }, + { + "epoch": 0.21, + "grad_norm": 2.5943190229679503, + "learning_rate": 1.8298407618883677e-06, + "loss": 0.3132, + "step": 751 + }, + { + "epoch": 0.21, + "grad_norm": 2.451462512381113, + "learning_rate": 1.8293282857927909e-06, + "loss": 0.3306, + "step": 752 + }, + { + "epoch": 0.21, + "grad_norm": 2.802381941279302, + "learning_rate": 1.828815111123251e-06, + "loss": 0.3325, + "step": 753 + }, + { + "epoch": 0.21, + "grad_norm": 2.2958989566865715, + "learning_rate": 1.8283012383120145e-06, + "loss": 0.2997, + "step": 754 + }, + { + "epoch": 0.21, + "grad_norm": 2.4742741886127257, + "learning_rate": 1.827786667791935e-06, + "loss": 0.3466, + "step": 755 + }, + { + "epoch": 0.21, + "grad_norm": 2.3330203880683333, + "learning_rate": 1.8272713999964546e-06, + "loss": 0.2964, + "step": 756 + }, + { + "epoch": 0.21, + "grad_norm": 2.372071762014812, + "learning_rate": 1.8267554353596024e-06, + "loss": 0.3035, + "step": 757 + }, + { + "epoch": 0.21, + "grad_norm": 2.54065702879972, + "learning_rate": 1.8262387743159948e-06, + "loss": 0.3268, + "step": 758 + }, + { + "epoch": 0.22, + "grad_norm": 2.3382390623791394, + "learning_rate": 1.8257214173008344e-06, + "loss": 0.3051, + "step": 759 + }, + { + "epoch": 0.22, + "grad_norm": 2.4031055066754603, + "learning_rate": 1.8252033647499099e-06, + "loss": 0.3317, + "step": 760 + }, + { + "epoch": 0.22, + "grad_norm": 2.4032306983074845, + "learning_rate": 1.8246846170995961e-06, + "loss": 0.3015, + "step": 761 + }, + { + "epoch": 0.22, + "grad_norm": 3.1664023828151944, + "learning_rate": 1.8241651747868541e-06, + "loss": 0.3408, + "step": 762 + }, + { + "epoch": 0.22, + "grad_norm": 2.4245747041798946, + "learning_rate": 1.823645038249229e-06, + "loss": 0.3301, + "step": 763 + }, + { + "epoch": 0.22, + "grad_norm": 2.307622301322068, + "learning_rate": 1.823124207924851e-06, + "loss": 0.3273, + "step": 764 + }, + { + "epoch": 0.22, + "grad_norm": 2.300710489645234, + "learning_rate": 1.822602684252435e-06, + "loss": 0.2982, + "step": 765 + }, + { + "epoch": 0.22, + "grad_norm": 2.548236181983721, + "learning_rate": 1.8220804676712794e-06, + "loss": 0.3127, + "step": 766 + }, + { + "epoch": 0.22, + "grad_norm": 2.405523364038282, + "learning_rate": 1.8215575586212667e-06, + "loss": 0.3216, + "step": 767 + }, + { + "epoch": 0.22, + "grad_norm": 2.290131171904868, + "learning_rate": 1.821033957542863e-06, + "loss": 0.2858, + "step": 768 + }, + { + "epoch": 0.22, + "grad_norm": 2.408680649010481, + "learning_rate": 1.8205096648771163e-06, + "loss": 0.3249, + "step": 769 + }, + { + "epoch": 0.22, + "grad_norm": 2.194918371382472, + "learning_rate": 1.8199846810656583e-06, + "loss": 0.2824, + "step": 770 + }, + { + "epoch": 0.22, + "grad_norm": 3.2665129154156993, + "learning_rate": 1.819459006550702e-06, + "loss": 0.3252, + "step": 771 + }, + { + "epoch": 0.22, + "grad_norm": 2.4877798522357626, + "learning_rate": 1.8189326417750426e-06, + "loss": 0.2929, + "step": 772 + }, + { + "epoch": 0.22, + "grad_norm": 2.4438405001800634, + "learning_rate": 1.8184055871820565e-06, + "loss": 0.3092, + "step": 773 + }, + { + "epoch": 0.22, + "grad_norm": 2.471072912113562, + "learning_rate": 1.8178778432157014e-06, + "loss": 0.3262, + "step": 774 + }, + { + "epoch": 0.22, + "grad_norm": 2.520311374184934, + "learning_rate": 1.8173494103205158e-06, + "loss": 0.2878, + "step": 775 + }, + { + "epoch": 0.22, + "grad_norm": 2.4873941242668485, + "learning_rate": 1.8168202889416182e-06, + "loss": 0.2937, + "step": 776 + }, + { + "epoch": 0.22, + "grad_norm": 2.4169244067397506, + "learning_rate": 1.8162904795247074e-06, + "loss": 0.3231, + "step": 777 + }, + { + "epoch": 0.22, + "grad_norm": 2.4212965608339387, + "learning_rate": 1.8157599825160607e-06, + "loss": 0.3153, + "step": 778 + }, + { + "epoch": 0.22, + "grad_norm": 2.58524729253949, + "learning_rate": 1.8152287983625365e-06, + "loss": 0.3141, + "step": 779 + }, + { + "epoch": 0.22, + "grad_norm": 2.648579898819257, + "learning_rate": 1.8146969275115701e-06, + "loss": 0.3006, + "step": 780 + }, + { + "epoch": 0.22, + "grad_norm": 2.4588512176786788, + "learning_rate": 1.8141643704111767e-06, + "loss": 0.3369, + "step": 781 + }, + { + "epoch": 0.22, + "grad_norm": 3.388867011332001, + "learning_rate": 1.8136311275099484e-06, + "loss": 0.3207, + "step": 782 + }, + { + "epoch": 0.22, + "grad_norm": 2.3763493849662827, + "learning_rate": 1.8130971992570552e-06, + "loss": 0.3433, + "step": 783 + }, + { + "epoch": 0.22, + "grad_norm": 2.3814908591670236, + "learning_rate": 1.8125625861022454e-06, + "loss": 0.3218, + "step": 784 + }, + { + "epoch": 0.22, + "grad_norm": 2.2169228847674773, + "learning_rate": 1.812027288495843e-06, + "loss": 0.2853, + "step": 785 + }, + { + "epoch": 0.22, + "grad_norm": 2.5600068690152775, + "learning_rate": 1.8114913068887493e-06, + "loss": 0.3017, + "step": 786 + }, + { + "epoch": 0.22, + "grad_norm": 2.3564897528452327, + "learning_rate": 1.810954641732441e-06, + "loss": 0.3025, + "step": 787 + }, + { + "epoch": 0.22, + "grad_norm": 2.7185301584909722, + "learning_rate": 1.8104172934789715e-06, + "loss": 0.3002, + "step": 788 + }, + { + "epoch": 0.22, + "grad_norm": 2.394601217070245, + "learning_rate": 1.8098792625809689e-06, + "loss": 0.3127, + "step": 789 + }, + { + "epoch": 0.22, + "grad_norm": 2.919708059528672, + "learning_rate": 1.8093405494916372e-06, + "loss": 0.3342, + "step": 790 + }, + { + "epoch": 0.22, + "grad_norm": 2.5018439752505226, + "learning_rate": 1.8088011546647533e-06, + "loss": 0.3036, + "step": 791 + }, + { + "epoch": 0.22, + "grad_norm": 3.0270280866919808, + "learning_rate": 1.8082610785546706e-06, + "loss": 0.3237, + "step": 792 + }, + { + "epoch": 0.22, + "grad_norm": 2.5871694707198345, + "learning_rate": 1.8077203216163143e-06, + "loss": 0.3229, + "step": 793 + }, + { + "epoch": 0.22, + "grad_norm": 2.229119769181344, + "learning_rate": 1.8071788843051848e-06, + "loss": 0.2927, + "step": 794 + }, + { + "epoch": 0.23, + "grad_norm": 2.2796518406154402, + "learning_rate": 1.806636767077354e-06, + "loss": 0.3144, + "step": 795 + }, + { + "epoch": 0.23, + "grad_norm": 2.4914832529542728, + "learning_rate": 1.8060939703894682e-06, + "loss": 0.3326, + "step": 796 + }, + { + "epoch": 0.23, + "grad_norm": 2.3781369431977692, + "learning_rate": 1.8055504946987447e-06, + "loss": 0.3312, + "step": 797 + }, + { + "epoch": 0.23, + "grad_norm": 2.6910642279410624, + "learning_rate": 1.8050063404629732e-06, + "loss": 0.3322, + "step": 798 + }, + { + "epoch": 0.23, + "grad_norm": 2.5232510110504314, + "learning_rate": 1.8044615081405151e-06, + "loss": 0.2967, + "step": 799 + }, + { + "epoch": 0.23, + "grad_norm": 2.377745599667623, + "learning_rate": 1.8039159981903027e-06, + "loss": 0.289, + "step": 800 + }, + { + "epoch": 0.23, + "grad_norm": 2.354831110869895, + "learning_rate": 1.8033698110718394e-06, + "loss": 0.3094, + "step": 801 + }, + { + "epoch": 0.23, + "grad_norm": 2.5266561213394017, + "learning_rate": 1.802822947245199e-06, + "loss": 0.3095, + "step": 802 + }, + { + "epoch": 0.23, + "grad_norm": 2.5283457627776005, + "learning_rate": 1.8022754071710252e-06, + "loss": 0.3077, + "step": 803 + }, + { + "epoch": 0.23, + "grad_norm": 2.507549698347303, + "learning_rate": 1.8017271913105306e-06, + "loss": 0.3138, + "step": 804 + }, + { + "epoch": 0.23, + "grad_norm": 2.625393318911374, + "learning_rate": 1.8011783001254988e-06, + "loss": 0.3556, + "step": 805 + }, + { + "epoch": 0.23, + "grad_norm": 2.5556149198490132, + "learning_rate": 1.8006287340782805e-06, + "loss": 0.2912, + "step": 806 + }, + { + "epoch": 0.23, + "grad_norm": 2.698644031831514, + "learning_rate": 1.8000784936317957e-06, + "loss": 0.2907, + "step": 807 + }, + { + "epoch": 0.23, + "grad_norm": 2.4656465189485064, + "learning_rate": 1.7995275792495324e-06, + "loss": 0.2793, + "step": 808 + }, + { + "epoch": 0.23, + "grad_norm": 2.411006777212944, + "learning_rate": 1.7989759913955463e-06, + "loss": 0.3178, + "step": 809 + }, + { + "epoch": 0.23, + "grad_norm": 2.4414409226078426, + "learning_rate": 1.7984237305344601e-06, + "loss": 0.3266, + "step": 810 + }, + { + "epoch": 0.23, + "grad_norm": 2.4708127483607822, + "learning_rate": 1.7978707971314636e-06, + "loss": 0.3158, + "step": 811 + }, + { + "epoch": 0.23, + "grad_norm": 2.843055534073164, + "learning_rate": 1.7973171916523131e-06, + "loss": 0.2984, + "step": 812 + }, + { + "epoch": 0.23, + "grad_norm": 2.4782911871163895, + "learning_rate": 1.796762914563331e-06, + "loss": 0.3261, + "step": 813 + }, + { + "epoch": 0.23, + "grad_norm": 2.559804468307838, + "learning_rate": 1.7962079663314058e-06, + "loss": 0.3027, + "step": 814 + }, + { + "epoch": 0.23, + "grad_norm": 2.3474605814667004, + "learning_rate": 1.7956523474239907e-06, + "loss": 0.2762, + "step": 815 + }, + { + "epoch": 0.23, + "grad_norm": 2.3212862035195525, + "learning_rate": 1.7950960583091042e-06, + "loss": 0.3334, + "step": 816 + }, + { + "epoch": 0.23, + "grad_norm": 2.5219516348949926, + "learning_rate": 1.794539099455329e-06, + "loss": 0.3364, + "step": 817 + }, + { + "epoch": 0.23, + "grad_norm": 2.2878948873749803, + "learning_rate": 1.7939814713318122e-06, + "loss": 0.3063, + "step": 818 + }, + { + "epoch": 0.23, + "grad_norm": 2.7420790975532734, + "learning_rate": 1.7934231744082649e-06, + "loss": 0.3206, + "step": 819 + }, + { + "epoch": 0.23, + "grad_norm": 2.3798967487705602, + "learning_rate": 1.7928642091549612e-06, + "loss": 0.2839, + "step": 820 + }, + { + "epoch": 0.23, + "grad_norm": 2.593899829352958, + "learning_rate": 1.7923045760427384e-06, + "loss": 0.3206, + "step": 821 + }, + { + "epoch": 0.23, + "grad_norm": 2.4143042994636197, + "learning_rate": 1.791744275542996e-06, + "loss": 0.3312, + "step": 822 + }, + { + "epoch": 0.23, + "grad_norm": 2.7810821685273153, + "learning_rate": 1.7911833081276958e-06, + "loss": 0.335, + "step": 823 + }, + { + "epoch": 0.23, + "grad_norm": 2.4949557139361316, + "learning_rate": 1.7906216742693619e-06, + "loss": 0.3108, + "step": 824 + }, + { + "epoch": 0.23, + "grad_norm": 2.392727307987833, + "learning_rate": 1.7900593744410789e-06, + "loss": 0.2839, + "step": 825 + }, + { + "epoch": 0.23, + "grad_norm": 2.454105906481232, + "learning_rate": 1.7894964091164928e-06, + "loss": 0.311, + "step": 826 + }, + { + "epoch": 0.23, + "grad_norm": 2.3359712297438464, + "learning_rate": 1.7889327787698103e-06, + "loss": 0.3052, + "step": 827 + }, + { + "epoch": 0.23, + "grad_norm": 2.3578175704575868, + "learning_rate": 1.7883684838757982e-06, + "loss": 0.2843, + "step": 828 + }, + { + "epoch": 0.23, + "grad_norm": 2.3546703645487783, + "learning_rate": 1.787803524909783e-06, + "loss": 0.3116, + "step": 829 + }, + { + "epoch": 0.24, + "grad_norm": 2.3540766735466523, + "learning_rate": 1.7872379023476506e-06, + "loss": 0.3029, + "step": 830 + }, + { + "epoch": 0.24, + "grad_norm": 2.4561967275434085, + "learning_rate": 1.7866716166658455e-06, + "loss": 0.3164, + "step": 831 + }, + { + "epoch": 0.24, + "grad_norm": 2.3804728957034103, + "learning_rate": 1.7861046683413714e-06, + "loss": 0.3422, + "step": 832 + }, + { + "epoch": 0.24, + "grad_norm": 2.5253167982397517, + "learning_rate": 1.78553705785179e-06, + "loss": 0.3434, + "step": 833 + }, + { + "epoch": 0.24, + "grad_norm": 2.4647339440108627, + "learning_rate": 1.7849687856752206e-06, + "loss": 0.3414, + "step": 834 + }, + { + "epoch": 0.24, + "grad_norm": 2.3099561921610197, + "learning_rate": 1.7843998522903401e-06, + "loss": 0.287, + "step": 835 + }, + { + "epoch": 0.24, + "grad_norm": 3.169187560233085, + "learning_rate": 1.7838302581763815e-06, + "loss": 0.3013, + "step": 836 + }, + { + "epoch": 0.24, + "grad_norm": 2.4697580845690608, + "learning_rate": 1.7832600038131358e-06, + "loss": 0.3017, + "step": 837 + }, + { + "epoch": 0.24, + "grad_norm": 2.270074862967853, + "learning_rate": 1.782689089680949e-06, + "loss": 0.312, + "step": 838 + }, + { + "epoch": 0.24, + "grad_norm": 2.834517930537921, + "learning_rate": 1.7821175162607234e-06, + "loss": 0.3683, + "step": 839 + }, + { + "epoch": 0.24, + "grad_norm": 2.3820423992048503, + "learning_rate": 1.7815452840339162e-06, + "loss": 0.3103, + "step": 840 + }, + { + "epoch": 0.24, + "grad_norm": 4.134181679241243, + "learning_rate": 1.7809723934825402e-06, + "loss": 0.3091, + "step": 841 + }, + { + "epoch": 0.24, + "grad_norm": 2.512480635786526, + "learning_rate": 1.7803988450891626e-06, + "loss": 0.3315, + "step": 842 + }, + { + "epoch": 0.24, + "grad_norm": 2.3133473585428055, + "learning_rate": 1.7798246393369033e-06, + "loss": 0.3219, + "step": 843 + }, + { + "epoch": 0.24, + "grad_norm": 2.337263848790207, + "learning_rate": 1.7792497767094381e-06, + "loss": 0.3013, + "step": 844 + }, + { + "epoch": 0.24, + "grad_norm": 2.5037976885693185, + "learning_rate": 1.7786742576909952e-06, + "loss": 0.3077, + "step": 845 + }, + { + "epoch": 0.24, + "grad_norm": 2.4742609810228995, + "learning_rate": 1.778098082766355e-06, + "loss": 0.3193, + "step": 846 + }, + { + "epoch": 0.24, + "grad_norm": 2.4435964981750127, + "learning_rate": 1.7775212524208512e-06, + "loss": 0.3154, + "step": 847 + }, + { + "epoch": 0.24, + "grad_norm": 2.350815158082994, + "learning_rate": 1.7769437671403696e-06, + "loss": 0.3046, + "step": 848 + }, + { + "epoch": 0.24, + "grad_norm": 2.443996984457143, + "learning_rate": 1.7763656274113473e-06, + "loss": 0.3378, + "step": 849 + }, + { + "epoch": 0.24, + "grad_norm": 2.4900482903481054, + "learning_rate": 1.775786833720773e-06, + "loss": 0.3177, + "step": 850 + }, + { + "epoch": 0.24, + "grad_norm": 2.3204585475536352, + "learning_rate": 1.7752073865561856e-06, + "loss": 0.3452, + "step": 851 + }, + { + "epoch": 0.24, + "grad_norm": 2.3326934912673125, + "learning_rate": 1.7746272864056752e-06, + "loss": 0.3015, + "step": 852 + }, + { + "epoch": 0.24, + "grad_norm": 2.170865654693707, + "learning_rate": 1.774046533757882e-06, + "loss": 0.2929, + "step": 853 + }, + { + "epoch": 0.24, + "grad_norm": 2.353397028015931, + "learning_rate": 1.7734651291019953e-06, + "loss": 0.2848, + "step": 854 + }, + { + "epoch": 0.24, + "grad_norm": 2.3162206156297156, + "learning_rate": 1.7728830729277537e-06, + "loss": 0.2981, + "step": 855 + }, + { + "epoch": 0.24, + "grad_norm": 2.440390255244275, + "learning_rate": 1.7723003657254444e-06, + "loss": 0.3082, + "step": 856 + }, + { + "epoch": 0.24, + "grad_norm": 2.484031656364762, + "learning_rate": 1.7717170079859039e-06, + "loss": 0.2997, + "step": 857 + }, + { + "epoch": 0.24, + "grad_norm": 2.4625882477489385, + "learning_rate": 1.7711330002005155e-06, + "loss": 0.3039, + "step": 858 + }, + { + "epoch": 0.24, + "grad_norm": 2.2641087407974614, + "learning_rate": 1.770548342861211e-06, + "loss": 0.2964, + "step": 859 + }, + { + "epoch": 0.24, + "grad_norm": 2.4624242786779154, + "learning_rate": 1.7699630364604687e-06, + "loss": 0.3124, + "step": 860 + }, + { + "epoch": 0.24, + "grad_norm": 2.4068039255630387, + "learning_rate": 1.769377081491314e-06, + "loss": 0.3017, + "step": 861 + }, + { + "epoch": 0.24, + "grad_norm": 2.5898207729162475, + "learning_rate": 1.7687904784473186e-06, + "loss": 0.3304, + "step": 862 + }, + { + "epoch": 0.24, + "grad_norm": 2.4943816806666663, + "learning_rate": 1.7682032278226e-06, + "loss": 0.301, + "step": 863 + }, + { + "epoch": 0.24, + "grad_norm": 2.2735550324194502, + "learning_rate": 1.7676153301118206e-06, + "loss": 0.2942, + "step": 864 + }, + { + "epoch": 0.25, + "grad_norm": 2.542751669795743, + "learning_rate": 1.7670267858101892e-06, + "loss": 0.3213, + "step": 865 + }, + { + "epoch": 0.25, + "grad_norm": 2.6233292164559674, + "learning_rate": 1.7664375954134582e-06, + "loss": 0.3246, + "step": 866 + }, + { + "epoch": 0.25, + "grad_norm": 2.406407165535525, + "learning_rate": 1.7658477594179244e-06, + "loss": 0.2849, + "step": 867 + }, + { + "epoch": 0.25, + "grad_norm": 2.2742014416952006, + "learning_rate": 1.7652572783204284e-06, + "loss": 0.3338, + "step": 868 + }, + { + "epoch": 0.25, + "grad_norm": 2.3357411639986996, + "learning_rate": 1.7646661526183549e-06, + "loss": 0.304, + "step": 869 + }, + { + "epoch": 0.25, + "grad_norm": 2.3725262940496337, + "learning_rate": 1.7640743828096305e-06, + "loss": 0.2939, + "step": 870 + }, + { + "epoch": 0.25, + "grad_norm": 2.2632492249675016, + "learning_rate": 1.7634819693927252e-06, + "loss": 0.3038, + "step": 871 + }, + { + "epoch": 0.25, + "grad_norm": 2.367959700282048, + "learning_rate": 1.7628889128666501e-06, + "loss": 0.313, + "step": 872 + }, + { + "epoch": 0.25, + "grad_norm": 2.6123170953069517, + "learning_rate": 1.7622952137309595e-06, + "loss": 0.3111, + "step": 873 + }, + { + "epoch": 0.25, + "grad_norm": 2.419626125941722, + "learning_rate": 1.7617008724857478e-06, + "loss": 0.2916, + "step": 874 + }, + { + "epoch": 0.25, + "grad_norm": 2.38908803713079, + "learning_rate": 1.7611058896316507e-06, + "loss": 0.3049, + "step": 875 + }, + { + "epoch": 0.25, + "grad_norm": 2.5552448508204053, + "learning_rate": 1.7605102656698442e-06, + "loss": 0.2781, + "step": 876 + }, + { + "epoch": 0.25, + "grad_norm": 2.4428995873799524, + "learning_rate": 1.7599140011020448e-06, + "loss": 0.3321, + "step": 877 + }, + { + "epoch": 0.25, + "grad_norm": 2.4239545273760847, + "learning_rate": 1.7593170964305077e-06, + "loss": 0.3077, + "step": 878 + }, + { + "epoch": 0.25, + "grad_norm": 2.1915135477858714, + "learning_rate": 1.7587195521580288e-06, + "loss": 0.2939, + "step": 879 + }, + { + "epoch": 0.25, + "grad_norm": 2.4431326002218743, + "learning_rate": 1.7581213687879405e-06, + "loss": 0.3382, + "step": 880 + }, + { + "epoch": 0.25, + "grad_norm": 2.586725605255059, + "learning_rate": 1.757522546824116e-06, + "loss": 0.3251, + "step": 881 + }, + { + "epoch": 0.25, + "grad_norm": 2.5557207299582285, + "learning_rate": 1.7569230867709645e-06, + "loss": 0.33, + "step": 882 + }, + { + "epoch": 0.25, + "grad_norm": 2.5666833510025726, + "learning_rate": 1.7563229891334336e-06, + "loss": 0.3349, + "step": 883 + }, + { + "epoch": 0.25, + "grad_norm": 2.2615897690980273, + "learning_rate": 1.7557222544170081e-06, + "loss": 0.3446, + "step": 884 + }, + { + "epoch": 0.25, + "grad_norm": 2.192243862822083, + "learning_rate": 1.7551208831277088e-06, + "loss": 0.3127, + "step": 885 + }, + { + "epoch": 0.25, + "grad_norm": 2.5533807430869837, + "learning_rate": 1.754518875772093e-06, + "loss": 0.2985, + "step": 886 + }, + { + "epoch": 0.25, + "grad_norm": 2.6127071592959825, + "learning_rate": 1.7539162328572542e-06, + "loss": 0.3152, + "step": 887 + }, + { + "epoch": 0.25, + "grad_norm": 2.24436359208922, + "learning_rate": 1.7533129548908203e-06, + "loss": 0.2826, + "step": 888 + }, + { + "epoch": 0.25, + "grad_norm": 2.21650724397911, + "learning_rate": 1.752709042380955e-06, + "loss": 0.3139, + "step": 889 + }, + { + "epoch": 0.25, + "grad_norm": 2.240179325579028, + "learning_rate": 1.7521044958363565e-06, + "loss": 0.3201, + "step": 890 + }, + { + "epoch": 0.25, + "grad_norm": 2.6497870152313414, + "learning_rate": 1.7514993157662561e-06, + "loss": 0.3042, + "step": 891 + }, + { + "epoch": 0.25, + "grad_norm": 2.4528906426154102, + "learning_rate": 1.75089350268042e-06, + "loss": 0.2924, + "step": 892 + }, + { + "epoch": 0.25, + "grad_norm": 2.3758599539597336, + "learning_rate": 1.7502870570891468e-06, + "loss": 0.3152, + "step": 893 + }, + { + "epoch": 0.25, + "grad_norm": 2.3761524990008973, + "learning_rate": 1.749679979503268e-06, + "loss": 0.3026, + "step": 894 + }, + { + "epoch": 0.25, + "grad_norm": 2.438579519539714, + "learning_rate": 1.749072270434148e-06, + "loss": 0.3014, + "step": 895 + }, + { + "epoch": 0.25, + "grad_norm": 2.2771364984262332, + "learning_rate": 1.7484639303936822e-06, + "loss": 0.2879, + "step": 896 + }, + { + "epoch": 0.25, + "grad_norm": 2.699643333300088, + "learning_rate": 1.747854959894298e-06, + "loss": 0.3181, + "step": 897 + }, + { + "epoch": 0.25, + "grad_norm": 2.3392193675944903, + "learning_rate": 1.7472453594489538e-06, + "loss": 0.307, + "step": 898 + }, + { + "epoch": 0.25, + "grad_norm": 2.370792659232408, + "learning_rate": 1.746635129571139e-06, + "loss": 0.2849, + "step": 899 + }, + { + "epoch": 0.26, + "grad_norm": 3.4045887373466757, + "learning_rate": 1.7460242707748728e-06, + "loss": 0.3112, + "step": 900 + }, + { + "epoch": 0.26, + "grad_norm": 2.1885494563042704, + "learning_rate": 1.7454127835747037e-06, + "loss": 0.3049, + "step": 901 + }, + { + "epoch": 0.26, + "grad_norm": 2.719256164872196, + "learning_rate": 1.7448006684857106e-06, + "loss": 0.2924, + "step": 902 + }, + { + "epoch": 0.26, + "grad_norm": 2.3275588349254432, + "learning_rate": 1.7441879260235007e-06, + "loss": 0.2916, + "step": 903 + }, + { + "epoch": 0.26, + "grad_norm": 2.6433671842254447, + "learning_rate": 1.7435745567042094e-06, + "loss": 0.3004, + "step": 904 + }, + { + "epoch": 0.26, + "grad_norm": 2.3788139054652286, + "learning_rate": 1.7429605610445004e-06, + "loss": 0.2855, + "step": 905 + }, + { + "epoch": 0.26, + "grad_norm": 2.3669128953610983, + "learning_rate": 1.7423459395615652e-06, + "loss": 0.3005, + "step": 906 + }, + { + "epoch": 0.26, + "grad_norm": 2.3977847313289256, + "learning_rate": 1.7417306927731223e-06, + "loss": 0.3056, + "step": 907 + }, + { + "epoch": 0.26, + "grad_norm": 2.634258954737695, + "learning_rate": 1.7411148211974167e-06, + "loss": 0.2984, + "step": 908 + }, + { + "epoch": 0.26, + "grad_norm": 2.549418667244922, + "learning_rate": 1.7404983253532202e-06, + "loss": 0.3313, + "step": 909 + }, + { + "epoch": 0.26, + "grad_norm": 2.229251686531007, + "learning_rate": 1.7398812057598298e-06, + "loss": 0.2907, + "step": 910 + }, + { + "epoch": 0.26, + "grad_norm": 2.363913384681548, + "learning_rate": 1.7392634629370681e-06, + "loss": 0.3069, + "step": 911 + }, + { + "epoch": 0.26, + "grad_norm": 2.635330690255088, + "learning_rate": 1.7386450974052832e-06, + "loss": 0.3306, + "step": 912 + }, + { + "epoch": 0.26, + "grad_norm": 2.454322989683554, + "learning_rate": 1.738026109685347e-06, + "loss": 0.3071, + "step": 913 + }, + { + "epoch": 0.26, + "grad_norm": 2.559993279354754, + "learning_rate": 1.7374065002986557e-06, + "loss": 0.3025, + "step": 914 + }, + { + "epoch": 0.26, + "grad_norm": 2.3497500860175626, + "learning_rate": 1.7367862697671299e-06, + "loss": 0.3064, + "step": 915 + }, + { + "epoch": 0.26, + "grad_norm": 2.5312837946915137, + "learning_rate": 1.7361654186132117e-06, + "loss": 0.3307, + "step": 916 + }, + { + "epoch": 0.26, + "grad_norm": 2.3373983973812016, + "learning_rate": 1.735543947359868e-06, + "loss": 0.2739, + "step": 917 + }, + { + "epoch": 0.26, + "grad_norm": 2.4244408158663995, + "learning_rate": 1.7349218565305867e-06, + "loss": 0.2939, + "step": 918 + }, + { + "epoch": 0.26, + "grad_norm": 2.4878585942575686, + "learning_rate": 1.7342991466493784e-06, + "loss": 0.2913, + "step": 919 + }, + { + "epoch": 0.26, + "grad_norm": 2.288423305700923, + "learning_rate": 1.7336758182407737e-06, + "loss": 0.3096, + "step": 920 + }, + { + "epoch": 0.26, + "grad_norm": 2.502154794481118, + "learning_rate": 1.733051871829826e-06, + "loss": 0.3025, + "step": 921 + }, + { + "epoch": 0.26, + "grad_norm": 2.478412947406026, + "learning_rate": 1.7324273079421088e-06, + "loss": 0.3002, + "step": 922 + }, + { + "epoch": 0.26, + "grad_norm": 2.6007090817241703, + "learning_rate": 1.7318021271037146e-06, + "loss": 0.3246, + "step": 923 + }, + { + "epoch": 0.26, + "grad_norm": 2.1730136205293205, + "learning_rate": 1.7311763298412569e-06, + "loss": 0.309, + "step": 924 + }, + { + "epoch": 0.26, + "grad_norm": 2.328540172131703, + "learning_rate": 1.7305499166818679e-06, + "loss": 0.3198, + "step": 925 + }, + { + "epoch": 0.26, + "grad_norm": 2.3787220615358797, + "learning_rate": 1.7299228881531982e-06, + "loss": 0.2649, + "step": 926 + }, + { + "epoch": 0.26, + "grad_norm": 2.522766022357985, + "learning_rate": 1.729295244783418e-06, + "loss": 0.3404, + "step": 927 + }, + { + "epoch": 0.26, + "grad_norm": 2.235899600554452, + "learning_rate": 1.7286669871012135e-06, + "loss": 0.3235, + "step": 928 + }, + { + "epoch": 0.26, + "grad_norm": 3.0892728394383817, + "learning_rate": 1.7280381156357904e-06, + "loss": 0.2768, + "step": 929 + }, + { + "epoch": 0.26, + "grad_norm": 2.5467713290166145, + "learning_rate": 1.7274086309168701e-06, + "loss": 0.2989, + "step": 930 + }, + { + "epoch": 0.26, + "grad_norm": 2.2375743713758394, + "learning_rate": 1.7267785334746907e-06, + "loss": 0.2926, + "step": 931 + }, + { + "epoch": 0.26, + "grad_norm": 2.392561762822056, + "learning_rate": 1.7261478238400068e-06, + "loss": 0.2985, + "step": 932 + }, + { + "epoch": 0.26, + "grad_norm": 2.4450577681450616, + "learning_rate": 1.725516502544089e-06, + "loss": 0.3386, + "step": 933 + }, + { + "epoch": 0.26, + "grad_norm": 2.2772591477186124, + "learning_rate": 1.7248845701187218e-06, + "loss": 0.2864, + "step": 934 + }, + { + "epoch": 0.26, + "grad_norm": 2.616420026072073, + "learning_rate": 1.7242520270962057e-06, + "loss": 0.333, + "step": 935 + }, + { + "epoch": 0.27, + "grad_norm": 2.5407530283716717, + "learning_rate": 1.7236188740093554e-06, + "loss": 0.3247, + "step": 936 + }, + { + "epoch": 0.27, + "grad_norm": 2.837835871101234, + "learning_rate": 1.7229851113914986e-06, + "loss": 0.3547, + "step": 937 + }, + { + "epoch": 0.27, + "grad_norm": 2.438027917952889, + "learning_rate": 1.7223507397764778e-06, + "loss": 0.3063, + "step": 938 + }, + { + "epoch": 0.27, + "grad_norm": 2.2202556858343114, + "learning_rate": 1.721715759698647e-06, + "loss": 0.2684, + "step": 939 + }, + { + "epoch": 0.27, + "grad_norm": 2.4880759861493327, + "learning_rate": 1.721080171692874e-06, + "loss": 0.3131, + "step": 940 + }, + { + "epoch": 0.27, + "grad_norm": 2.4325603890985956, + "learning_rate": 1.720443976294538e-06, + "loss": 0.3066, + "step": 941 + }, + { + "epoch": 0.27, + "grad_norm": 2.598743028566477, + "learning_rate": 1.7198071740395298e-06, + "loss": 0.3196, + "step": 942 + }, + { + "epoch": 0.27, + "grad_norm": 2.447618969814215, + "learning_rate": 1.7191697654642515e-06, + "loss": 0.3208, + "step": 943 + }, + { + "epoch": 0.27, + "grad_norm": 2.398233275246693, + "learning_rate": 1.7185317511056163e-06, + "loss": 0.2895, + "step": 944 + }, + { + "epoch": 0.27, + "grad_norm": 2.455998669650308, + "learning_rate": 1.717893131501047e-06, + "loss": 0.3088, + "step": 945 + }, + { + "epoch": 0.27, + "grad_norm": 2.55553833781246, + "learning_rate": 1.717253907188477e-06, + "loss": 0.311, + "step": 946 + }, + { + "epoch": 0.27, + "grad_norm": 2.331976956659491, + "learning_rate": 1.7166140787063484e-06, + "loss": 0.2975, + "step": 947 + }, + { + "epoch": 0.27, + "grad_norm": 2.5443708751804843, + "learning_rate": 1.7159736465936122e-06, + "loss": 0.3312, + "step": 948 + }, + { + "epoch": 0.27, + "grad_norm": 2.4646620518131765, + "learning_rate": 1.7153326113897285e-06, + "loss": 0.2983, + "step": 949 + }, + { + "epoch": 0.27, + "grad_norm": 2.700520256461987, + "learning_rate": 1.7146909736346649e-06, + "loss": 0.3422, + "step": 950 + }, + { + "epoch": 0.27, + "grad_norm": 2.365208695404597, + "learning_rate": 1.7140487338688964e-06, + "loss": 0.3125, + "step": 951 + }, + { + "epoch": 0.27, + "grad_norm": 2.342799021500254, + "learning_rate": 1.7134058926334061e-06, + "loss": 0.2851, + "step": 952 + }, + { + "epoch": 0.27, + "grad_norm": 2.472473882462489, + "learning_rate": 1.712762450469682e-06, + "loss": 0.2833, + "step": 953 + }, + { + "epoch": 0.27, + "grad_norm": 2.924949206264284, + "learning_rate": 1.7121184079197199e-06, + "loss": 0.2867, + "step": 954 + }, + { + "epoch": 0.27, + "grad_norm": 2.446605549606646, + "learning_rate": 1.7114737655260209e-06, + "loss": 0.3125, + "step": 955 + }, + { + "epoch": 0.27, + "grad_norm": 2.4578380201775114, + "learning_rate": 1.710828523831591e-06, + "loss": 0.304, + "step": 956 + }, + { + "epoch": 0.27, + "grad_norm": 2.278217100685124, + "learning_rate": 1.7101826833799408e-06, + "loss": 0.3097, + "step": 957 + }, + { + "epoch": 0.27, + "grad_norm": 2.634826805932555, + "learning_rate": 1.7095362447150863e-06, + "loss": 0.2935, + "step": 958 + }, + { + "epoch": 0.27, + "grad_norm": 3.378656134708543, + "learning_rate": 1.708889208381546e-06, + "loss": 0.2915, + "step": 959 + }, + { + "epoch": 0.27, + "grad_norm": 2.4000239757418482, + "learning_rate": 1.7082415749243434e-06, + "loss": 0.2999, + "step": 960 + }, + { + "epoch": 0.27, + "grad_norm": 2.2573289490776585, + "learning_rate": 1.7075933448890036e-06, + "loss": 0.277, + "step": 961 + }, + { + "epoch": 0.27, + "grad_norm": 2.346166395763324, + "learning_rate": 1.706944518821555e-06, + "loss": 0.3021, + "step": 962 + }, + { + "epoch": 0.27, + "grad_norm": 2.4124728615991455, + "learning_rate": 1.7062950972685276e-06, + "loss": 0.3128, + "step": 963 + }, + { + "epoch": 0.27, + "grad_norm": 2.826711761487159, + "learning_rate": 1.705645080776954e-06, + "loss": 0.313, + "step": 964 + }, + { + "epoch": 0.27, + "grad_norm": 2.4047963495047906, + "learning_rate": 1.7049944698943666e-06, + "loss": 0.3371, + "step": 965 + }, + { + "epoch": 0.27, + "grad_norm": 2.389614674773777, + "learning_rate": 1.7043432651687985e-06, + "loss": 0.3249, + "step": 966 + }, + { + "epoch": 0.27, + "grad_norm": 2.436635418792066, + "learning_rate": 1.7036914671487849e-06, + "loss": 0.2986, + "step": 967 + }, + { + "epoch": 0.27, + "grad_norm": 2.3717874245845163, + "learning_rate": 1.7030390763833586e-06, + "loss": 0.3382, + "step": 968 + }, + { + "epoch": 0.27, + "grad_norm": 2.497520195958812, + "learning_rate": 1.7023860934220529e-06, + "loss": 0.3302, + "step": 969 + }, + { + "epoch": 0.27, + "grad_norm": 2.4602529075200508, + "learning_rate": 1.701732518814899e-06, + "loss": 0.2964, + "step": 970 + }, + { + "epoch": 0.28, + "grad_norm": 2.496811408865411, + "learning_rate": 1.7010783531124276e-06, + "loss": 0.2837, + "step": 971 + }, + { + "epoch": 0.28, + "grad_norm": 2.2217169162660713, + "learning_rate": 1.7004235968656663e-06, + "loss": 0.3015, + "step": 972 + }, + { + "epoch": 0.28, + "grad_norm": 2.499163380974902, + "learning_rate": 1.6997682506261408e-06, + "loss": 0.3124, + "step": 973 + }, + { + "epoch": 0.28, + "grad_norm": 2.4447537512361546, + "learning_rate": 1.6991123149458738e-06, + "loss": 0.2945, + "step": 974 + }, + { + "epoch": 0.28, + "grad_norm": 2.362267004446206, + "learning_rate": 1.698455790377384e-06, + "loss": 0.2863, + "step": 975 + }, + { + "epoch": 0.28, + "grad_norm": 2.2546056397623135, + "learning_rate": 1.6977986774736856e-06, + "loss": 0.2822, + "step": 976 + }, + { + "epoch": 0.28, + "grad_norm": 2.2691350851857135, + "learning_rate": 1.6971409767882908e-06, + "loss": 0.2974, + "step": 977 + }, + { + "epoch": 0.28, + "grad_norm": 2.445416620705216, + "learning_rate": 1.6964826888752036e-06, + "loss": 0.2852, + "step": 978 + }, + { + "epoch": 0.28, + "grad_norm": 2.384942731710928, + "learning_rate": 1.6958238142889256e-06, + "loss": 0.2925, + "step": 979 + }, + { + "epoch": 0.28, + "grad_norm": 2.2714288291286295, + "learning_rate": 1.6951643535844507e-06, + "loss": 0.2746, + "step": 980 + }, + { + "epoch": 0.28, + "grad_norm": 2.2375520872504486, + "learning_rate": 1.6945043073172669e-06, + "loss": 0.3002, + "step": 981 + }, + { + "epoch": 0.28, + "grad_norm": 2.4784951066376597, + "learning_rate": 1.6938436760433563e-06, + "loss": 0.3209, + "step": 982 + }, + { + "epoch": 0.28, + "grad_norm": 2.340283512510198, + "learning_rate": 1.6931824603191924e-06, + "loss": 0.3039, + "step": 983 + }, + { + "epoch": 0.28, + "grad_norm": 2.30737661460916, + "learning_rate": 1.692520660701742e-06, + "loss": 0.2923, + "step": 984 + }, + { + "epoch": 0.28, + "grad_norm": 2.437715351903717, + "learning_rate": 1.691858277748464e-06, + "loss": 0.3014, + "step": 985 + }, + { + "epoch": 0.28, + "grad_norm": 2.5154337603104495, + "learning_rate": 1.6911953120173072e-06, + "loss": 0.2792, + "step": 986 + }, + { + "epoch": 0.28, + "grad_norm": 2.4797671932366554, + "learning_rate": 1.690531764066713e-06, + "loss": 0.2949, + "step": 987 + }, + { + "epoch": 0.28, + "grad_norm": 2.3547640609593965, + "learning_rate": 1.6898676344556116e-06, + "loss": 0.2787, + "step": 988 + }, + { + "epoch": 0.28, + "grad_norm": 2.3656668511347525, + "learning_rate": 1.6892029237434247e-06, + "loss": 0.3285, + "step": 989 + }, + { + "epoch": 0.28, + "grad_norm": 2.4444266223198388, + "learning_rate": 1.6885376324900627e-06, + "loss": 0.314, + "step": 990 + }, + { + "epoch": 0.28, + "grad_norm": 2.3855107129962514, + "learning_rate": 1.6878717612559248e-06, + "loss": 0.3095, + "step": 991 + }, + { + "epoch": 0.28, + "grad_norm": 2.5693108001100673, + "learning_rate": 1.6872053106018994e-06, + "loss": 0.2997, + "step": 992 + }, + { + "epoch": 0.28, + "grad_norm": 2.1970928719836182, + "learning_rate": 1.686538281089362e-06, + "loss": 0.3192, + "step": 993 + }, + { + "epoch": 0.28, + "grad_norm": 2.3599486714531084, + "learning_rate": 1.6858706732801765e-06, + "loss": 0.2893, + "step": 994 + }, + { + "epoch": 0.28, + "grad_norm": 2.297010635268803, + "learning_rate": 1.6852024877366944e-06, + "loss": 0.2971, + "step": 995 + }, + { + "epoch": 0.28, + "grad_norm": 2.3034348135405156, + "learning_rate": 1.6845337250217524e-06, + "loss": 0.3109, + "step": 996 + }, + { + "epoch": 0.28, + "grad_norm": 2.967206227562683, + "learning_rate": 1.6838643856986742e-06, + "loss": 0.3199, + "step": 997 + }, + { + "epoch": 0.28, + "grad_norm": 2.623339369359266, + "learning_rate": 1.6831944703312692e-06, + "loss": 0.2834, + "step": 998 + }, + { + "epoch": 0.28, + "grad_norm": 2.53117931729609, + "learning_rate": 1.6825239794838325e-06, + "loss": 0.3006, + "step": 999 + }, + { + "epoch": 0.28, + "grad_norm": 2.3882720935542516, + "learning_rate": 1.6818529137211426e-06, + "loss": 0.3082, + "step": 1000 + }, + { + "epoch": 0.28, + "grad_norm": 2.55440787423396, + "learning_rate": 1.6811812736084634e-06, + "loss": 0.3204, + "step": 1001 + }, + { + "epoch": 0.28, + "grad_norm": 2.321158053976048, + "learning_rate": 1.6805090597115424e-06, + "loss": 0.319, + "step": 1002 + }, + { + "epoch": 0.28, + "grad_norm": 2.4872668248685, + "learning_rate": 1.67983627259661e-06, + "loss": 0.2799, + "step": 1003 + }, + { + "epoch": 0.28, + "grad_norm": 2.4245140920050603, + "learning_rate": 1.67916291283038e-06, + "loss": 0.3063, + "step": 1004 + }, + { + "epoch": 0.28, + "grad_norm": 2.6435579373630245, + "learning_rate": 1.678488980980048e-06, + "loss": 0.3093, + "step": 1005 + }, + { + "epoch": 0.29, + "grad_norm": 2.258406979497761, + "learning_rate": 1.6778144776132922e-06, + "loss": 0.268, + "step": 1006 + }, + { + "epoch": 0.29, + "grad_norm": 2.9135073664027664, + "learning_rate": 1.6771394032982715e-06, + "loss": 0.3402, + "step": 1007 + }, + { + "epoch": 0.29, + "grad_norm": 2.3278350848972984, + "learning_rate": 1.6764637586036262e-06, + "loss": 0.2839, + "step": 1008 + }, + { + "epoch": 0.29, + "grad_norm": 2.3248056744204986, + "learning_rate": 1.6757875440984765e-06, + "loss": 0.2983, + "step": 1009 + }, + { + "epoch": 0.29, + "grad_norm": 2.439826013803833, + "learning_rate": 1.6751107603524234e-06, + "loss": 0.3344, + "step": 1010 + }, + { + "epoch": 0.29, + "grad_norm": 2.2802148297417255, + "learning_rate": 1.6744334079355468e-06, + "loss": 0.2897, + "step": 1011 + }, + { + "epoch": 0.29, + "grad_norm": 2.6263572428855366, + "learning_rate": 1.6737554874184054e-06, + "loss": 0.2902, + "step": 1012 + }, + { + "epoch": 0.29, + "grad_norm": 2.2664433077616817, + "learning_rate": 1.6730769993720374e-06, + "loss": 0.2768, + "step": 1013 + }, + { + "epoch": 0.29, + "grad_norm": 2.429423006143885, + "learning_rate": 1.6723979443679581e-06, + "loss": 0.3299, + "step": 1014 + }, + { + "epoch": 0.29, + "grad_norm": 2.416722381378333, + "learning_rate": 1.6717183229781608e-06, + "loss": 0.295, + "step": 1015 + }, + { + "epoch": 0.29, + "grad_norm": 2.8299828280374224, + "learning_rate": 1.6710381357751153e-06, + "loss": 0.3108, + "step": 1016 + }, + { + "epoch": 0.29, + "grad_norm": 3.0380148513953085, + "learning_rate": 1.6703573833317695e-06, + "loss": 0.3334, + "step": 1017 + }, + { + "epoch": 0.29, + "grad_norm": 2.315449694497384, + "learning_rate": 1.6696760662215454e-06, + "loss": 0.3075, + "step": 1018 + }, + { + "epoch": 0.29, + "grad_norm": 2.493817939261923, + "learning_rate": 1.6689941850183423e-06, + "loss": 0.3229, + "step": 1019 + }, + { + "epoch": 0.29, + "grad_norm": 2.501749294264062, + "learning_rate": 1.668311740296534e-06, + "loss": 0.3106, + "step": 1020 + }, + { + "epoch": 0.29, + "grad_norm": 2.289914719495963, + "learning_rate": 1.6676287326309684e-06, + "loss": 0.309, + "step": 1021 + }, + { + "epoch": 0.29, + "grad_norm": 2.718396677832358, + "learning_rate": 1.666945162596969e-06, + "loss": 0.3284, + "step": 1022 + }, + { + "epoch": 0.29, + "grad_norm": 2.3457700683897236, + "learning_rate": 1.6662610307703315e-06, + "loss": 0.3157, + "step": 1023 + }, + { + "epoch": 0.29, + "grad_norm": 2.5448441504576618, + "learning_rate": 1.6655763377273258e-06, + "loss": 0.3019, + "step": 1024 + }, + { + "epoch": 0.29, + "grad_norm": 2.4890666738130007, + "learning_rate": 1.6648910840446945e-06, + "loss": 0.3275, + "step": 1025 + }, + { + "epoch": 0.29, + "grad_norm": 2.4894253493778624, + "learning_rate": 1.6642052702996518e-06, + "loss": 0.3149, + "step": 1026 + }, + { + "epoch": 0.29, + "grad_norm": 2.353041250698328, + "learning_rate": 1.663518897069884e-06, + "loss": 0.2886, + "step": 1027 + }, + { + "epoch": 0.29, + "grad_norm": 2.4392572033533066, + "learning_rate": 1.662831964933549e-06, + "loss": 0.3412, + "step": 1028 + }, + { + "epoch": 0.29, + "grad_norm": 2.3188088866561403, + "learning_rate": 1.662144474469275e-06, + "loss": 0.2947, + "step": 1029 + }, + { + "epoch": 0.29, + "grad_norm": 2.341100743164467, + "learning_rate": 1.6614564262561608e-06, + "loss": 0.2728, + "step": 1030 + }, + { + "epoch": 0.29, + "grad_norm": 2.2101531499352434, + "learning_rate": 1.660767820873775e-06, + "loss": 0.3123, + "step": 1031 + }, + { + "epoch": 0.29, + "grad_norm": 2.475747069138636, + "learning_rate": 1.6600786589021552e-06, + "loss": 0.3403, + "step": 1032 + }, + { + "epoch": 0.29, + "grad_norm": 2.459042145691644, + "learning_rate": 1.6593889409218082e-06, + "loss": 0.312, + "step": 1033 + }, + { + "epoch": 0.29, + "grad_norm": 2.5295519497711707, + "learning_rate": 1.6586986675137092e-06, + "loss": 0.2986, + "step": 1034 + }, + { + "epoch": 0.29, + "grad_norm": 2.450928118563158, + "learning_rate": 1.658007839259301e-06, + "loss": 0.3285, + "step": 1035 + }, + { + "epoch": 0.29, + "grad_norm": 2.1898774628428628, + "learning_rate": 1.6573164567404935e-06, + "loss": 0.2827, + "step": 1036 + }, + { + "epoch": 0.29, + "grad_norm": 2.427219719465487, + "learning_rate": 1.6566245205396645e-06, + "loss": 0.299, + "step": 1037 + }, + { + "epoch": 0.29, + "grad_norm": 2.3570225217372567, + "learning_rate": 1.655932031239657e-06, + "loss": 0.2951, + "step": 1038 + }, + { + "epoch": 0.29, + "grad_norm": 2.3402537457095547, + "learning_rate": 1.6552389894237805e-06, + "loss": 0.3171, + "step": 1039 + }, + { + "epoch": 0.29, + "grad_norm": 2.354873029829454, + "learning_rate": 1.6545453956758095e-06, + "loss": 0.2935, + "step": 1040 + }, + { + "epoch": 0.29, + "grad_norm": 2.25090458483495, + "learning_rate": 1.6538512505799846e-06, + "loss": 0.3108, + "step": 1041 + }, + { + "epoch": 0.3, + "grad_norm": 2.55047815912428, + "learning_rate": 1.6531565547210091e-06, + "loss": 0.3013, + "step": 1042 + }, + { + "epoch": 0.3, + "grad_norm": 2.4440385164546696, + "learning_rate": 1.6524613086840518e-06, + "loss": 0.3105, + "step": 1043 + }, + { + "epoch": 0.3, + "grad_norm": 2.391941906752125, + "learning_rate": 1.6517655130547435e-06, + "loss": 0.2957, + "step": 1044 + }, + { + "epoch": 0.3, + "grad_norm": 2.435031979636581, + "learning_rate": 1.6510691684191792e-06, + "loss": 0.3012, + "step": 1045 + }, + { + "epoch": 0.3, + "grad_norm": 2.289453395649587, + "learning_rate": 1.6503722753639152e-06, + "loss": 0.2879, + "step": 1046 + }, + { + "epoch": 0.3, + "grad_norm": 2.7232938391584423, + "learning_rate": 1.6496748344759711e-06, + "loss": 0.2899, + "step": 1047 + }, + { + "epoch": 0.3, + "grad_norm": 2.3480597213292884, + "learning_rate": 1.6489768463428271e-06, + "loss": 0.2844, + "step": 1048 + }, + { + "epoch": 0.3, + "grad_norm": 2.2274801914857827, + "learning_rate": 1.6482783115524236e-06, + "loss": 0.3026, + "step": 1049 + }, + { + "epoch": 0.3, + "grad_norm": 2.350786936769416, + "learning_rate": 1.6475792306931634e-06, + "loss": 0.3025, + "step": 1050 + }, + { + "epoch": 0.3, + "grad_norm": 2.455094703349108, + "learning_rate": 1.646879604353908e-06, + "loss": 0.279, + "step": 1051 + }, + { + "epoch": 0.3, + "grad_norm": 2.749072216613041, + "learning_rate": 1.6461794331239784e-06, + "loss": 0.2868, + "step": 1052 + }, + { + "epoch": 0.3, + "grad_norm": 2.3961592307243933, + "learning_rate": 1.6454787175931545e-06, + "loss": 0.3241, + "step": 1053 + }, + { + "epoch": 0.3, + "grad_norm": 2.3802912636459324, + "learning_rate": 1.6447774583516757e-06, + "loss": 0.3184, + "step": 1054 + }, + { + "epoch": 0.3, + "grad_norm": 2.3578994334145595, + "learning_rate": 1.6440756559902378e-06, + "loss": 0.2992, + "step": 1055 + }, + { + "epoch": 0.3, + "grad_norm": 2.669831279613677, + "learning_rate": 1.6433733110999955e-06, + "loss": 0.3343, + "step": 1056 + }, + { + "epoch": 0.3, + "grad_norm": 2.3566063200933205, + "learning_rate": 1.64267042427256e-06, + "loss": 0.2962, + "step": 1057 + }, + { + "epoch": 0.3, + "grad_norm": 2.9095379633908793, + "learning_rate": 1.6419669960999988e-06, + "loss": 0.3049, + "step": 1058 + }, + { + "epoch": 0.3, + "grad_norm": 2.4085390520917045, + "learning_rate": 1.6412630271748353e-06, + "loss": 0.3121, + "step": 1059 + }, + { + "epoch": 0.3, + "grad_norm": 2.3603280046033523, + "learning_rate": 1.640558518090049e-06, + "loss": 0.2943, + "step": 1060 + }, + { + "epoch": 0.3, + "grad_norm": 2.486479238370381, + "learning_rate": 1.6398534694390738e-06, + "loss": 0.3191, + "step": 1061 + }, + { + "epoch": 0.3, + "grad_norm": 2.3336082601675407, + "learning_rate": 1.6391478818157984e-06, + "loss": 0.3227, + "step": 1062 + }, + { + "epoch": 0.3, + "grad_norm": 2.40928700934904, + "learning_rate": 1.6384417558145653e-06, + "loss": 0.2902, + "step": 1063 + }, + { + "epoch": 0.3, + "grad_norm": 2.320113452541132, + "learning_rate": 1.637735092030171e-06, + "loss": 0.294, + "step": 1064 + }, + { + "epoch": 0.3, + "grad_norm": 2.1722153868801457, + "learning_rate": 1.637027891057864e-06, + "loss": 0.2874, + "step": 1065 + }, + { + "epoch": 0.3, + "grad_norm": 2.4440111925983836, + "learning_rate": 1.6363201534933461e-06, + "loss": 0.3141, + "step": 1066 + }, + { + "epoch": 0.3, + "grad_norm": 2.3338478949662225, + "learning_rate": 1.6356118799327714e-06, + "loss": 0.3092, + "step": 1067 + }, + { + "epoch": 0.3, + "grad_norm": 2.5121442311511917, + "learning_rate": 1.634903070972744e-06, + "loss": 0.3397, + "step": 1068 + }, + { + "epoch": 0.3, + "grad_norm": 2.5781463798701902, + "learning_rate": 1.634193727210321e-06, + "loss": 0.3104, + "step": 1069 + }, + { + "epoch": 0.3, + "grad_norm": 2.5961627840525114, + "learning_rate": 1.6334838492430083e-06, + "loss": 0.3101, + "step": 1070 + }, + { + "epoch": 0.3, + "grad_norm": 2.267179058947479, + "learning_rate": 1.6327734376687627e-06, + "loss": 0.283, + "step": 1071 + }, + { + "epoch": 0.3, + "grad_norm": 2.3444605330977066, + "learning_rate": 1.6320624930859904e-06, + "loss": 0.3122, + "step": 1072 + }, + { + "epoch": 0.3, + "grad_norm": 2.993127129778634, + "learning_rate": 1.6313510160935456e-06, + "loss": 0.3471, + "step": 1073 + }, + { + "epoch": 0.3, + "grad_norm": 2.2872650945211332, + "learning_rate": 1.6306390072907325e-06, + "loss": 0.3169, + "step": 1074 + }, + { + "epoch": 0.3, + "grad_norm": 2.724604471623292, + "learning_rate": 1.6299264672773023e-06, + "loss": 0.3031, + "step": 1075 + }, + { + "epoch": 0.3, + "grad_norm": 2.913751858855762, + "learning_rate": 1.6292133966534538e-06, + "loss": 0.284, + "step": 1076 + }, + { + "epoch": 0.31, + "grad_norm": 2.286247266050023, + "learning_rate": 1.6284997960198327e-06, + "loss": 0.3257, + "step": 1077 + }, + { + "epoch": 0.31, + "grad_norm": 2.3860053312723952, + "learning_rate": 1.6277856659775318e-06, + "loss": 0.315, + "step": 1078 + }, + { + "epoch": 0.31, + "grad_norm": 2.453199624252134, + "learning_rate": 1.6270710071280886e-06, + "loss": 0.3262, + "step": 1079 + }, + { + "epoch": 0.31, + "grad_norm": 2.622679958245868, + "learning_rate": 1.6263558200734874e-06, + "loss": 0.2872, + "step": 1080 + }, + { + "epoch": 0.31, + "grad_norm": 2.3486987427194266, + "learning_rate": 1.6256401054161564e-06, + "loss": 0.3164, + "step": 1081 + }, + { + "epoch": 0.31, + "grad_norm": 2.217339421310054, + "learning_rate": 1.6249238637589686e-06, + "loss": 0.2927, + "step": 1082 + }, + { + "epoch": 0.31, + "grad_norm": 2.350362880482745, + "learning_rate": 1.6242070957052408e-06, + "loss": 0.327, + "step": 1083 + }, + { + "epoch": 0.31, + "grad_norm": 2.291927833619107, + "learning_rate": 1.6234898018587336e-06, + "loss": 0.281, + "step": 1084 + }, + { + "epoch": 0.31, + "grad_norm": 2.3219095991433525, + "learning_rate": 1.62277198282365e-06, + "loss": 0.292, + "step": 1085 + }, + { + "epoch": 0.31, + "grad_norm": 2.3916455188615386, + "learning_rate": 1.6220536392046355e-06, + "loss": 0.2984, + "step": 1086 + }, + { + "epoch": 0.31, + "grad_norm": 2.858586614338182, + "learning_rate": 1.621334771606778e-06, + "loss": 0.3256, + "step": 1087 + }, + { + "epoch": 0.31, + "grad_norm": 2.563926714028576, + "learning_rate": 1.620615380635606e-06, + "loss": 0.298, + "step": 1088 + }, + { + "epoch": 0.31, + "grad_norm": 2.3780229371631334, + "learning_rate": 1.6198954668970892e-06, + "loss": 0.3113, + "step": 1089 + }, + { + "epoch": 0.31, + "grad_norm": 2.3717718934908736, + "learning_rate": 1.6191750309976375e-06, + "loss": 0.3097, + "step": 1090 + }, + { + "epoch": 0.31, + "grad_norm": 2.26879452699283, + "learning_rate": 1.6184540735441011e-06, + "loss": 0.2758, + "step": 1091 + }, + { + "epoch": 0.31, + "grad_norm": 2.3545499465443673, + "learning_rate": 1.617732595143769e-06, + "loss": 0.2951, + "step": 1092 + }, + { + "epoch": 0.31, + "grad_norm": 2.244857733670842, + "learning_rate": 1.6170105964043693e-06, + "loss": 0.2951, + "step": 1093 + }, + { + "epoch": 0.31, + "grad_norm": 2.3294945365100497, + "learning_rate": 1.6162880779340685e-06, + "loss": 0.2943, + "step": 1094 + }, + { + "epoch": 0.31, + "grad_norm": 2.3426596108235653, + "learning_rate": 1.61556504034147e-06, + "loss": 0.3315, + "step": 1095 + }, + { + "epoch": 0.31, + "grad_norm": 2.4575241805037504, + "learning_rate": 1.6148414842356157e-06, + "loss": 0.3015, + "step": 1096 + }, + { + "epoch": 0.31, + "grad_norm": 2.4324516901036253, + "learning_rate": 1.6141174102259835e-06, + "loss": 0.2792, + "step": 1097 + }, + { + "epoch": 0.31, + "grad_norm": 2.4290612852471347, + "learning_rate": 1.6133928189224886e-06, + "loss": 0.3176, + "step": 1098 + }, + { + "epoch": 0.31, + "grad_norm": 2.370996130318332, + "learning_rate": 1.61266771093548e-06, + "loss": 0.2853, + "step": 1099 + }, + { + "epoch": 0.31, + "grad_norm": 2.770906603532933, + "learning_rate": 1.6119420868757429e-06, + "loss": 0.3126, + "step": 1100 + }, + { + "epoch": 0.31, + "grad_norm": 2.3587875090211288, + "learning_rate": 1.6112159473544988e-06, + "loss": 0.2996, + "step": 1101 + }, + { + "epoch": 0.31, + "grad_norm": 2.213960910839405, + "learning_rate": 1.6104892929834006e-06, + "loss": 0.2793, + "step": 1102 + }, + { + "epoch": 0.31, + "grad_norm": 2.453113592013504, + "learning_rate": 1.6097621243745369e-06, + "loss": 0.3057, + "step": 1103 + }, + { + "epoch": 0.31, + "grad_norm": 2.458016611313274, + "learning_rate": 1.6090344421404285e-06, + "loss": 0.2673, + "step": 1104 + }, + { + "epoch": 0.31, + "grad_norm": 2.473355007432182, + "learning_rate": 1.6083062468940294e-06, + "loss": 0.3012, + "step": 1105 + }, + { + "epoch": 0.31, + "grad_norm": 2.336731158238361, + "learning_rate": 1.607577539248725e-06, + "loss": 0.2982, + "step": 1106 + }, + { + "epoch": 0.31, + "grad_norm": 2.431007377897008, + "learning_rate": 1.606848319818333e-06, + "loss": 0.3029, + "step": 1107 + }, + { + "epoch": 0.31, + "grad_norm": 2.484485654028409, + "learning_rate": 1.6061185892171021e-06, + "loss": 0.3235, + "step": 1108 + }, + { + "epoch": 0.31, + "grad_norm": 2.1469506553648987, + "learning_rate": 1.6053883480597112e-06, + "loss": 0.2604, + "step": 1109 + }, + { + "epoch": 0.31, + "grad_norm": 2.3229347997347713, + "learning_rate": 1.60465759696127e-06, + "loss": 0.3256, + "step": 1110 + }, + { + "epoch": 0.31, + "grad_norm": 2.802832295421606, + "learning_rate": 1.6039263365373167e-06, + "loss": 0.2955, + "step": 1111 + }, + { + "epoch": 0.32, + "grad_norm": 2.2623099141043825, + "learning_rate": 1.6031945674038188e-06, + "loss": 0.2703, + "step": 1112 + }, + { + "epoch": 0.32, + "grad_norm": 2.3600259797934546, + "learning_rate": 1.6024622901771734e-06, + "loss": 0.2909, + "step": 1113 + }, + { + "epoch": 0.32, + "grad_norm": 2.4897006659817715, + "learning_rate": 1.6017295054742044e-06, + "loss": 0.2968, + "step": 1114 + }, + { + "epoch": 0.32, + "grad_norm": 2.684689300603362, + "learning_rate": 1.6009962139121634e-06, + "loss": 0.2989, + "step": 1115 + }, + { + "epoch": 0.32, + "grad_norm": 2.4044760003269716, + "learning_rate": 1.600262416108729e-06, + "loss": 0.2997, + "step": 1116 + }, + { + "epoch": 0.32, + "grad_norm": 2.3201320285410962, + "learning_rate": 1.5995281126820066e-06, + "loss": 0.3071, + "step": 1117 + }, + { + "epoch": 0.32, + "grad_norm": 2.6468821662036515, + "learning_rate": 1.598793304250527e-06, + "loss": 0.286, + "step": 1118 + }, + { + "epoch": 0.32, + "grad_norm": 2.3639503488566613, + "learning_rate": 1.5980579914332465e-06, + "loss": 0.2914, + "step": 1119 + }, + { + "epoch": 0.32, + "grad_norm": 2.6193329887287207, + "learning_rate": 1.5973221748495468e-06, + "loss": 0.2952, + "step": 1120 + }, + { + "epoch": 0.32, + "grad_norm": 2.679692415018323, + "learning_rate": 1.5965858551192327e-06, + "loss": 0.2953, + "step": 1121 + }, + { + "epoch": 0.32, + "grad_norm": 2.3437150532341953, + "learning_rate": 1.5958490328625347e-06, + "loss": 0.3032, + "step": 1122 + }, + { + "epoch": 0.32, + "grad_norm": 2.3239952811715567, + "learning_rate": 1.5951117087001046e-06, + "loss": 0.2854, + "step": 1123 + }, + { + "epoch": 0.32, + "grad_norm": 2.385086510499303, + "learning_rate": 1.5943738832530182e-06, + "loss": 0.2979, + "step": 1124 + }, + { + "epoch": 0.32, + "grad_norm": 2.4806154461787604, + "learning_rate": 1.5936355571427733e-06, + "loss": 0.2966, + "step": 1125 + }, + { + "epoch": 0.32, + "grad_norm": 2.216112548739291, + "learning_rate": 1.5928967309912888e-06, + "loss": 0.3029, + "step": 1126 + }, + { + "epoch": 0.32, + "grad_norm": 2.572934206919917, + "learning_rate": 1.5921574054209063e-06, + "loss": 0.3056, + "step": 1127 + }, + { + "epoch": 0.32, + "grad_norm": 2.5508527353207016, + "learning_rate": 1.5914175810543866e-06, + "loss": 0.2833, + "step": 1128 + }, + { + "epoch": 0.32, + "grad_norm": 2.6384165447425425, + "learning_rate": 1.590677258514911e-06, + "loss": 0.3049, + "step": 1129 + }, + { + "epoch": 0.32, + "grad_norm": 2.509989832211171, + "learning_rate": 1.5899364384260811e-06, + "loss": 0.2929, + "step": 1130 + }, + { + "epoch": 0.32, + "grad_norm": 2.5231713344758324, + "learning_rate": 1.5891951214119165e-06, + "loss": 0.294, + "step": 1131 + }, + { + "epoch": 0.32, + "grad_norm": 2.281755230252254, + "learning_rate": 1.5884533080968569e-06, + "loss": 0.2919, + "step": 1132 + }, + { + "epoch": 0.32, + "grad_norm": 2.3669782426311925, + "learning_rate": 1.5877109991057587e-06, + "loss": 0.3073, + "step": 1133 + }, + { + "epoch": 0.32, + "grad_norm": 2.362565174713835, + "learning_rate": 1.5869681950638959e-06, + "loss": 0.2966, + "step": 1134 + }, + { + "epoch": 0.32, + "grad_norm": 2.438987608826221, + "learning_rate": 1.5862248965969603e-06, + "loss": 0.2823, + "step": 1135 + }, + { + "epoch": 0.32, + "grad_norm": 2.3746503313743537, + "learning_rate": 1.5854811043310596e-06, + "loss": 0.2849, + "step": 1136 + }, + { + "epoch": 0.32, + "grad_norm": 2.5131572164621208, + "learning_rate": 1.5847368188927179e-06, + "loss": 0.2863, + "step": 1137 + }, + { + "epoch": 0.32, + "grad_norm": 2.5408742994134594, + "learning_rate": 1.5839920409088743e-06, + "loss": 0.2736, + "step": 1138 + }, + { + "epoch": 0.32, + "grad_norm": 2.3560444580090807, + "learning_rate": 1.5832467710068824e-06, + "loss": 0.2994, + "step": 1139 + }, + { + "epoch": 0.32, + "grad_norm": 2.4639874108624364, + "learning_rate": 1.5825010098145116e-06, + "loss": 0.3127, + "step": 1140 + }, + { + "epoch": 0.32, + "grad_norm": 2.1704217818052713, + "learning_rate": 1.5817547579599432e-06, + "loss": 0.2887, + "step": 1141 + }, + { + "epoch": 0.32, + "grad_norm": 2.4916333471595937, + "learning_rate": 1.5810080160717734e-06, + "loss": 0.2873, + "step": 1142 + }, + { + "epoch": 0.32, + "grad_norm": 2.240153940362432, + "learning_rate": 1.5802607847790107e-06, + "loss": 0.278, + "step": 1143 + }, + { + "epoch": 0.32, + "grad_norm": 2.379603346081565, + "learning_rate": 1.5795130647110753e-06, + "loss": 0.3027, + "step": 1144 + }, + { + "epoch": 0.32, + "grad_norm": 3.356341932955021, + "learning_rate": 1.5787648564977998e-06, + "loss": 0.3085, + "step": 1145 + }, + { + "epoch": 0.32, + "grad_norm": 2.4227890292727765, + "learning_rate": 1.5780161607694275e-06, + "loss": 0.3099, + "step": 1146 + }, + { + "epoch": 0.32, + "grad_norm": 2.6094467312618272, + "learning_rate": 1.577266978156613e-06, + "loss": 0.3307, + "step": 1147 + }, + { + "epoch": 0.33, + "grad_norm": 2.4273403176635058, + "learning_rate": 1.5765173092904201e-06, + "loss": 0.2923, + "step": 1148 + }, + { + "epoch": 0.33, + "grad_norm": 2.4037099389004934, + "learning_rate": 1.5757671548023228e-06, + "loss": 0.3346, + "step": 1149 + }, + { + "epoch": 0.33, + "grad_norm": 2.1568447136717457, + "learning_rate": 1.5750165153242044e-06, + "loss": 0.2583, + "step": 1150 + }, + { + "epoch": 0.33, + "grad_norm": 2.28801464145824, + "learning_rate": 1.5742653914883558e-06, + "loss": 0.2924, + "step": 1151 + }, + { + "epoch": 0.33, + "grad_norm": 2.529364561744506, + "learning_rate": 1.5735137839274773e-06, + "loss": 0.2886, + "step": 1152 + }, + { + "epoch": 0.33, + "grad_norm": 2.327560334688552, + "learning_rate": 1.5727616932746746e-06, + "loss": 0.285, + "step": 1153 + }, + { + "epoch": 0.33, + "grad_norm": 2.4283275210499173, + "learning_rate": 1.5720091201634627e-06, + "loss": 0.299, + "step": 1154 + }, + { + "epoch": 0.33, + "grad_norm": 2.5399568436656264, + "learning_rate": 1.5712560652277609e-06, + "loss": 0.2833, + "step": 1155 + }, + { + "epoch": 0.33, + "grad_norm": 2.3646366116477813, + "learning_rate": 1.570502529101896e-06, + "loss": 0.3011, + "step": 1156 + }, + { + "epoch": 0.33, + "grad_norm": 2.94102514169586, + "learning_rate": 1.5697485124205989e-06, + "loss": 0.3032, + "step": 1157 + }, + { + "epoch": 0.33, + "grad_norm": 2.36875518092041, + "learning_rate": 1.568994015819006e-06, + "loss": 0.2749, + "step": 1158 + }, + { + "epoch": 0.33, + "grad_norm": 2.534169488455715, + "learning_rate": 1.5682390399326582e-06, + "loss": 0.3512, + "step": 1159 + }, + { + "epoch": 0.33, + "grad_norm": 2.3107725895023434, + "learning_rate": 1.567483585397499e-06, + "loss": 0.2657, + "step": 1160 + }, + { + "epoch": 0.33, + "grad_norm": 2.6748111833306107, + "learning_rate": 1.5667276528498763e-06, + "loss": 0.3012, + "step": 1161 + }, + { + "epoch": 0.33, + "grad_norm": 2.2766830635131297, + "learning_rate": 1.56597124292654e-06, + "loss": 0.284, + "step": 1162 + }, + { + "epoch": 0.33, + "grad_norm": 2.542009323160178, + "learning_rate": 1.5652143562646413e-06, + "loss": 0.2992, + "step": 1163 + }, + { + "epoch": 0.33, + "grad_norm": 2.316537583857316, + "learning_rate": 1.5644569935017355e-06, + "loss": 0.2771, + "step": 1164 + }, + { + "epoch": 0.33, + "grad_norm": 2.5038538788727873, + "learning_rate": 1.563699155275776e-06, + "loss": 0.3007, + "step": 1165 + }, + { + "epoch": 0.33, + "grad_norm": 2.4159983582420788, + "learning_rate": 1.5629408422251192e-06, + "loss": 0.3034, + "step": 1166 + }, + { + "epoch": 0.33, + "grad_norm": 2.4341610746771356, + "learning_rate": 1.562182054988519e-06, + "loss": 0.3088, + "step": 1167 + }, + { + "epoch": 0.33, + "grad_norm": 2.357812160161354, + "learning_rate": 1.5614227942051307e-06, + "loss": 0.2786, + "step": 1168 + }, + { + "epoch": 0.33, + "grad_norm": 2.3163132363746737, + "learning_rate": 1.5606630605145081e-06, + "loss": 0.3147, + "step": 1169 + }, + { + "epoch": 0.33, + "grad_norm": 2.2351859561010263, + "learning_rate": 1.5599028545566026e-06, + "loss": 0.3012, + "step": 1170 + }, + { + "epoch": 0.33, + "grad_norm": 2.4339611497994285, + "learning_rate": 1.5591421769717642e-06, + "loss": 0.2826, + "step": 1171 + }, + { + "epoch": 0.33, + "grad_norm": 2.624244763187848, + "learning_rate": 1.5583810284007393e-06, + "loss": 0.3324, + "step": 1172 + }, + { + "epoch": 0.33, + "grad_norm": 2.547134698048815, + "learning_rate": 1.5576194094846722e-06, + "loss": 0.3079, + "step": 1173 + }, + { + "epoch": 0.33, + "grad_norm": 2.527816531870315, + "learning_rate": 1.5568573208651023e-06, + "loss": 0.2957, + "step": 1174 + }, + { + "epoch": 0.33, + "grad_norm": 2.2182054977613723, + "learning_rate": 1.5560947631839652e-06, + "loss": 0.2692, + "step": 1175 + }, + { + "epoch": 0.33, + "grad_norm": 2.265694858532665, + "learning_rate": 1.5553317370835913e-06, + "loss": 0.2751, + "step": 1176 + }, + { + "epoch": 0.33, + "grad_norm": 2.443214358088767, + "learning_rate": 1.5545682432067063e-06, + "loss": 0.3132, + "step": 1177 + }, + { + "epoch": 0.33, + "grad_norm": 2.5864134329362356, + "learning_rate": 1.5538042821964292e-06, + "loss": 0.275, + "step": 1178 + }, + { + "epoch": 0.33, + "grad_norm": 2.492520363058183, + "learning_rate": 1.5530398546962729e-06, + "loss": 0.3168, + "step": 1179 + }, + { + "epoch": 0.33, + "grad_norm": 2.6570124876738745, + "learning_rate": 1.5522749613501423e-06, + "loss": 0.2994, + "step": 1180 + }, + { + "epoch": 0.33, + "grad_norm": 2.4265104474117507, + "learning_rate": 1.5515096028023359e-06, + "loss": 0.3041, + "step": 1181 + }, + { + "epoch": 0.33, + "grad_norm": 2.807562758250271, + "learning_rate": 1.5507437796975434e-06, + "loss": 0.2973, + "step": 1182 + }, + { + "epoch": 0.34, + "grad_norm": 2.3246387128523778, + "learning_rate": 1.5499774926808464e-06, + "loss": 0.2884, + "step": 1183 + }, + { + "epoch": 0.34, + "grad_norm": 2.2932871149447402, + "learning_rate": 1.5492107423977166e-06, + "loss": 0.2855, + "step": 1184 + }, + { + "epoch": 0.34, + "grad_norm": 2.435228543816678, + "learning_rate": 1.548443529494016e-06, + "loss": 0.3097, + "step": 1185 + }, + { + "epoch": 0.34, + "grad_norm": 2.3510824680139604, + "learning_rate": 1.5476758546159966e-06, + "loss": 0.2742, + "step": 1186 + }, + { + "epoch": 0.34, + "grad_norm": 2.392760312472469, + "learning_rate": 1.5469077184102996e-06, + "loss": 0.3302, + "step": 1187 + }, + { + "epoch": 0.34, + "grad_norm": 2.343553702909353, + "learning_rate": 1.5461391215239545e-06, + "loss": 0.2465, + "step": 1188 + }, + { + "epoch": 0.34, + "grad_norm": 2.248803321374722, + "learning_rate": 1.545370064604379e-06, + "loss": 0.2789, + "step": 1189 + }, + { + "epoch": 0.34, + "grad_norm": 2.2905096127758915, + "learning_rate": 1.544600548299378e-06, + "loss": 0.2953, + "step": 1190 + }, + { + "epoch": 0.34, + "grad_norm": 2.158849950842058, + "learning_rate": 1.5438305732571442e-06, + "loss": 0.2784, + "step": 1191 + }, + { + "epoch": 0.34, + "grad_norm": 2.7014072777834843, + "learning_rate": 1.543060140126255e-06, + "loss": 0.2989, + "step": 1192 + }, + { + "epoch": 0.34, + "grad_norm": 2.4406118157452497, + "learning_rate": 1.5422892495556764e-06, + "loss": 0.2828, + "step": 1193 + }, + { + "epoch": 0.34, + "grad_norm": 2.309427539691521, + "learning_rate": 1.5415179021947565e-06, + "loss": 0.2881, + "step": 1194 + }, + { + "epoch": 0.34, + "grad_norm": 2.3087869140253527, + "learning_rate": 1.5407460986932309e-06, + "loss": 0.3104, + "step": 1195 + }, + { + "epoch": 0.34, + "grad_norm": 2.4365948516037017, + "learning_rate": 1.5399738397012176e-06, + "loss": 0.3117, + "step": 1196 + }, + { + "epoch": 0.34, + "grad_norm": 2.472925420715825, + "learning_rate": 1.5392011258692197e-06, + "loss": 0.297, + "step": 1197 + }, + { + "epoch": 0.34, + "grad_norm": 2.4954071083244327, + "learning_rate": 1.538427957848122e-06, + "loss": 0.2885, + "step": 1198 + }, + { + "epoch": 0.34, + "grad_norm": 2.370015598392829, + "learning_rate": 1.5376543362891932e-06, + "loss": 0.3013, + "step": 1199 + }, + { + "epoch": 0.34, + "grad_norm": 2.352657234394544, + "learning_rate": 1.5368802618440829e-06, + "loss": 0.2781, + "step": 1200 + }, + { + "epoch": 0.34, + "grad_norm": 2.413009305045956, + "learning_rate": 1.5361057351648228e-06, + "loss": 0.294, + "step": 1201 + }, + { + "epoch": 0.34, + "grad_norm": 2.4136284994593225, + "learning_rate": 1.5353307569038254e-06, + "loss": 0.3016, + "step": 1202 + }, + { + "epoch": 0.34, + "grad_norm": 2.5447870948805607, + "learning_rate": 1.5345553277138846e-06, + "loss": 0.3485, + "step": 1203 + }, + { + "epoch": 0.34, + "grad_norm": 2.230921132142122, + "learning_rate": 1.5337794482481714e-06, + "loss": 0.2858, + "step": 1204 + }, + { + "epoch": 0.34, + "grad_norm": 2.38145420246431, + "learning_rate": 1.5330031191602393e-06, + "loss": 0.2674, + "step": 1205 + }, + { + "epoch": 0.34, + "grad_norm": 2.275552437256254, + "learning_rate": 1.5322263411040185e-06, + "loss": 0.2731, + "step": 1206 + }, + { + "epoch": 0.34, + "grad_norm": 2.5017524293312565, + "learning_rate": 1.5314491147338178e-06, + "loss": 0.276, + "step": 1207 + }, + { + "epoch": 0.34, + "grad_norm": 2.3438144876797264, + "learning_rate": 1.530671440704324e-06, + "loss": 0.2789, + "step": 1208 + }, + { + "epoch": 0.34, + "grad_norm": 2.2085904618308208, + "learning_rate": 1.5298933196706008e-06, + "loss": 0.2342, + "step": 1209 + }, + { + "epoch": 0.34, + "grad_norm": 2.4759768263587394, + "learning_rate": 1.5291147522880884e-06, + "loss": 0.2941, + "step": 1210 + }, + { + "epoch": 0.34, + "grad_norm": 2.4202423096646886, + "learning_rate": 1.528335739212603e-06, + "loss": 0.3279, + "step": 1211 + }, + { + "epoch": 0.34, + "grad_norm": 2.3966984192951535, + "learning_rate": 1.5275562811003363e-06, + "loss": 0.2772, + "step": 1212 + }, + { + "epoch": 0.34, + "grad_norm": 2.199944532742987, + "learning_rate": 1.5267763786078541e-06, + "loss": 0.2842, + "step": 1213 + }, + { + "epoch": 0.34, + "grad_norm": 2.508334908719426, + "learning_rate": 1.525996032392098e-06, + "loss": 0.2911, + "step": 1214 + }, + { + "epoch": 0.34, + "grad_norm": 2.2262275947096417, + "learning_rate": 1.525215243110382e-06, + "loss": 0.292, + "step": 1215 + }, + { + "epoch": 0.34, + "grad_norm": 2.2895553897355208, + "learning_rate": 1.5244340114203943e-06, + "loss": 0.2798, + "step": 1216 + }, + { + "epoch": 0.34, + "grad_norm": 2.2505915029751153, + "learning_rate": 1.5236523379801951e-06, + "loss": 0.2813, + "step": 1217 + }, + { + "epoch": 0.35, + "grad_norm": 2.5301472784695105, + "learning_rate": 1.522870223448217e-06, + "loss": 0.3055, + "step": 1218 + }, + { + "epoch": 0.35, + "grad_norm": 2.467104811055958, + "learning_rate": 1.5220876684832638e-06, + "loss": 0.3138, + "step": 1219 + }, + { + "epoch": 0.35, + "grad_norm": 2.4211238944654565, + "learning_rate": 1.5213046737445105e-06, + "loss": 0.2889, + "step": 1220 + }, + { + "epoch": 0.35, + "grad_norm": 2.4038218910709306, + "learning_rate": 1.5205212398915032e-06, + "loss": 0.2955, + "step": 1221 + }, + { + "epoch": 0.35, + "grad_norm": 2.233445129687268, + "learning_rate": 1.5197373675841569e-06, + "loss": 0.2641, + "step": 1222 + }, + { + "epoch": 0.35, + "grad_norm": 2.6361531058162115, + "learning_rate": 1.5189530574827565e-06, + "loss": 0.2966, + "step": 1223 + }, + { + "epoch": 0.35, + "grad_norm": 2.4076912906486867, + "learning_rate": 1.518168310247955e-06, + "loss": 0.3003, + "step": 1224 + }, + { + "epoch": 0.35, + "grad_norm": 2.4118387221922744, + "learning_rate": 1.5173831265407747e-06, + "loss": 0.311, + "step": 1225 + }, + { + "epoch": 0.35, + "grad_norm": 2.8245527076546093, + "learning_rate": 1.5165975070226043e-06, + "loss": 0.3364, + "step": 1226 + }, + { + "epoch": 0.35, + "grad_norm": 2.602810109791664, + "learning_rate": 1.515811452355201e-06, + "loss": 0.3041, + "step": 1227 + }, + { + "epoch": 0.35, + "grad_norm": 2.347826721773315, + "learning_rate": 1.5150249632006868e-06, + "loss": 0.275, + "step": 1228 + }, + { + "epoch": 0.35, + "grad_norm": 2.4794242356386733, + "learning_rate": 1.5142380402215518e-06, + "loss": 0.3054, + "step": 1229 + }, + { + "epoch": 0.35, + "grad_norm": 2.577687203706355, + "learning_rate": 1.5134506840806496e-06, + "loss": 0.3037, + "step": 1230 + }, + { + "epoch": 0.35, + "grad_norm": 2.3642776332377244, + "learning_rate": 1.5126628954411999e-06, + "loss": 0.3193, + "step": 1231 + }, + { + "epoch": 0.35, + "grad_norm": 2.8048821645424717, + "learning_rate": 1.5118746749667862e-06, + "loss": 0.3046, + "step": 1232 + }, + { + "epoch": 0.35, + "grad_norm": 2.344113545456005, + "learning_rate": 1.5110860233213554e-06, + "loss": 0.2815, + "step": 1233 + }, + { + "epoch": 0.35, + "grad_norm": 2.281481302849905, + "learning_rate": 1.5102969411692183e-06, + "loss": 0.3236, + "step": 1234 + }, + { + "epoch": 0.35, + "grad_norm": 2.276485093800136, + "learning_rate": 1.5095074291750485e-06, + "loss": 0.2774, + "step": 1235 + }, + { + "epoch": 0.35, + "grad_norm": 2.338175236841784, + "learning_rate": 1.5087174880038806e-06, + "loss": 0.2859, + "step": 1236 + }, + { + "epoch": 0.35, + "grad_norm": 2.3029685841528056, + "learning_rate": 1.5079271183211116e-06, + "loss": 0.2911, + "step": 1237 + }, + { + "epoch": 0.35, + "grad_norm": 2.407994836006956, + "learning_rate": 1.5071363207924992e-06, + "loss": 0.3106, + "step": 1238 + }, + { + "epoch": 0.35, + "grad_norm": 2.5374727320190633, + "learning_rate": 1.5063450960841614e-06, + "loss": 0.2836, + "step": 1239 + }, + { + "epoch": 0.35, + "grad_norm": 2.3499467670805023, + "learning_rate": 1.5055534448625764e-06, + "loss": 0.2795, + "step": 1240 + }, + { + "epoch": 0.35, + "grad_norm": 2.4266429590953495, + "learning_rate": 1.5047613677945808e-06, + "loss": 0.2749, + "step": 1241 + }, + { + "epoch": 0.35, + "grad_norm": 2.4657170841490976, + "learning_rate": 1.503968865547371e-06, + "loss": 0.2837, + "step": 1242 + }, + { + "epoch": 0.35, + "grad_norm": 5.0976536509730455, + "learning_rate": 1.5031759387885007e-06, + "loss": 0.3269, + "step": 1243 + }, + { + "epoch": 0.35, + "grad_norm": 2.628231359188434, + "learning_rate": 1.5023825881858818e-06, + "loss": 0.3152, + "step": 1244 + }, + { + "epoch": 0.35, + "grad_norm": 2.4756832958452573, + "learning_rate": 1.5015888144077824e-06, + "loss": 0.3073, + "step": 1245 + }, + { + "epoch": 0.35, + "grad_norm": 2.298512467341184, + "learning_rate": 1.5007946181228283e-06, + "loss": 0.3061, + "step": 1246 + }, + { + "epoch": 0.35, + "grad_norm": 2.52945706170183, + "learning_rate": 1.5e-06, + "loss": 0.3178, + "step": 1247 + }, + { + "epoch": 0.35, + "grad_norm": 2.5724615855504926, + "learning_rate": 1.4992049607086339e-06, + "loss": 0.3417, + "step": 1248 + }, + { + "epoch": 0.35, + "grad_norm": 2.727288725787042, + "learning_rate": 1.4984095009184212e-06, + "loss": 0.3349, + "step": 1249 + }, + { + "epoch": 0.35, + "grad_norm": 2.350197638440752, + "learning_rate": 1.497613621299407e-06, + "loss": 0.3041, + "step": 1250 + }, + { + "epoch": 0.35, + "grad_norm": 2.7132496007296156, + "learning_rate": 1.4968173225219901e-06, + "loss": 0.313, + "step": 1251 + }, + { + "epoch": 0.35, + "grad_norm": 2.5199034855021836, + "learning_rate": 1.496020605256923e-06, + "loss": 0.3085, + "step": 1252 + }, + { + "epoch": 0.36, + "grad_norm": 2.2147023860745105, + "learning_rate": 1.4952234701753095e-06, + "loss": 0.2737, + "step": 1253 + }, + { + "epoch": 0.36, + "grad_norm": 2.297883065371963, + "learning_rate": 1.4944259179486065e-06, + "loss": 0.2783, + "step": 1254 + }, + { + "epoch": 0.36, + "grad_norm": 2.3449804782716774, + "learning_rate": 1.493627949248622e-06, + "loss": 0.314, + "step": 1255 + }, + { + "epoch": 0.36, + "grad_norm": 2.4267078408027443, + "learning_rate": 1.492829564747514e-06, + "loss": 0.3169, + "step": 1256 + }, + { + "epoch": 0.36, + "grad_norm": 2.6432791420563784, + "learning_rate": 1.492030765117792e-06, + "loss": 0.3107, + "step": 1257 + }, + { + "epoch": 0.36, + "grad_norm": 2.7012532465585966, + "learning_rate": 1.4912315510323137e-06, + "loss": 0.329, + "step": 1258 + }, + { + "epoch": 0.36, + "grad_norm": 2.486228397791336, + "learning_rate": 1.4904319231642876e-06, + "loss": 0.3027, + "step": 1259 + }, + { + "epoch": 0.36, + "grad_norm": 2.41871838980173, + "learning_rate": 1.4896318821872696e-06, + "loss": 0.3126, + "step": 1260 + }, + { + "epoch": 0.36, + "grad_norm": 2.5420371517098084, + "learning_rate": 1.4888314287751638e-06, + "loss": 0.3158, + "step": 1261 + }, + { + "epoch": 0.36, + "grad_norm": 2.3839764981648215, + "learning_rate": 1.488030563602222e-06, + "loss": 0.3023, + "step": 1262 + }, + { + "epoch": 0.36, + "grad_norm": 2.4532950147848407, + "learning_rate": 1.4872292873430424e-06, + "loss": 0.2901, + "step": 1263 + }, + { + "epoch": 0.36, + "grad_norm": 2.5054399027116503, + "learning_rate": 1.4864276006725698e-06, + "loss": 0.3057, + "step": 1264 + }, + { + "epoch": 0.36, + "grad_norm": 2.477920531834612, + "learning_rate": 1.4856255042660943e-06, + "loss": 0.2954, + "step": 1265 + }, + { + "epoch": 0.36, + "grad_norm": 3.935540643162236, + "learning_rate": 1.484822998799252e-06, + "loss": 0.2844, + "step": 1266 + }, + { + "epoch": 0.36, + "grad_norm": 2.314201674178338, + "learning_rate": 1.4840200849480225e-06, + "loss": 0.2863, + "step": 1267 + }, + { + "epoch": 0.36, + "grad_norm": 2.3731413875042193, + "learning_rate": 1.4832167633887305e-06, + "loss": 0.3051, + "step": 1268 + }, + { + "epoch": 0.36, + "grad_norm": 2.3892817055908777, + "learning_rate": 1.482413034798043e-06, + "loss": 0.3129, + "step": 1269 + }, + { + "epoch": 0.36, + "grad_norm": 2.302263982367759, + "learning_rate": 1.4816088998529706e-06, + "loss": 0.2664, + "step": 1270 + }, + { + "epoch": 0.36, + "grad_norm": 2.6512705403947354, + "learning_rate": 1.480804359230866e-06, + "loss": 0.2965, + "step": 1271 + }, + { + "epoch": 0.36, + "grad_norm": 2.347288821858566, + "learning_rate": 1.4799994136094232e-06, + "loss": 0.301, + "step": 1272 + }, + { + "epoch": 0.36, + "grad_norm": 2.5333151896115873, + "learning_rate": 1.4791940636666782e-06, + "loss": 0.3012, + "step": 1273 + }, + { + "epoch": 0.36, + "grad_norm": 2.5264247786210148, + "learning_rate": 1.4783883100810073e-06, + "loss": 0.3196, + "step": 1274 + }, + { + "epoch": 0.36, + "grad_norm": 2.462515547861249, + "learning_rate": 1.4775821535311259e-06, + "loss": 0.3057, + "step": 1275 + }, + { + "epoch": 0.36, + "grad_norm": 2.447574287258765, + "learning_rate": 1.47677559469609e-06, + "loss": 0.3101, + "step": 1276 + }, + { + "epoch": 0.36, + "grad_norm": 2.4266565567789664, + "learning_rate": 1.4759686342552943e-06, + "loss": 0.271, + "step": 1277 + }, + { + "epoch": 0.36, + "grad_norm": 2.2897497408615908, + "learning_rate": 1.475161272888471e-06, + "loss": 0.2681, + "step": 1278 + }, + { + "epoch": 0.36, + "grad_norm": 2.0860403283083073, + "learning_rate": 1.4743535112756908e-06, + "loss": 0.2544, + "step": 1279 + }, + { + "epoch": 0.36, + "grad_norm": 2.392018732079389, + "learning_rate": 1.4735453500973609e-06, + "loss": 0.2836, + "step": 1280 + }, + { + "epoch": 0.36, + "grad_norm": 2.23012471953278, + "learning_rate": 1.4727367900342258e-06, + "loss": 0.2767, + "step": 1281 + }, + { + "epoch": 0.36, + "grad_norm": 2.272102233160078, + "learning_rate": 1.4719278317673654e-06, + "loss": 0.3001, + "step": 1282 + }, + { + "epoch": 0.36, + "grad_norm": 2.39901218259605, + "learning_rate": 1.4711184759781953e-06, + "loss": 0.2574, + "step": 1283 + }, + { + "epoch": 0.36, + "grad_norm": 2.5600352664145354, + "learning_rate": 1.4703087233484659e-06, + "loss": 0.3206, + "step": 1284 + }, + { + "epoch": 0.36, + "grad_norm": 3.9198388613999184, + "learning_rate": 1.469498574560262e-06, + "loss": 0.3024, + "step": 1285 + }, + { + "epoch": 0.36, + "grad_norm": 2.4103965434314567, + "learning_rate": 1.4686880302960018e-06, + "loss": 0.3014, + "step": 1286 + }, + { + "epoch": 0.36, + "grad_norm": 2.209789038709257, + "learning_rate": 1.4678770912384368e-06, + "loss": 0.249, + "step": 1287 + }, + { + "epoch": 0.36, + "grad_norm": 3.3618412034454384, + "learning_rate": 1.467065758070651e-06, + "loss": 0.2966, + "step": 1288 + }, + { + "epoch": 0.37, + "grad_norm": 2.3019549434185347, + "learning_rate": 1.4662540314760605e-06, + "loss": 0.2788, + "step": 1289 + }, + { + "epoch": 0.37, + "grad_norm": 2.4662661882754224, + "learning_rate": 1.4654419121384126e-06, + "loss": 0.2831, + "step": 1290 + }, + { + "epoch": 0.37, + "grad_norm": 2.454442520748841, + "learning_rate": 1.4646294007417856e-06, + "loss": 0.302, + "step": 1291 + }, + { + "epoch": 0.37, + "grad_norm": 2.340806467057121, + "learning_rate": 1.463816497970588e-06, + "loss": 0.329, + "step": 1292 + }, + { + "epoch": 0.37, + "grad_norm": 2.449099525994602, + "learning_rate": 1.4630032045095579e-06, + "loss": 0.3047, + "step": 1293 + }, + { + "epoch": 0.37, + "grad_norm": 2.6306339717083382, + "learning_rate": 1.4621895210437625e-06, + "loss": 0.3269, + "step": 1294 + }, + { + "epoch": 0.37, + "grad_norm": 2.2548608028685004, + "learning_rate": 1.4613754482585977e-06, + "loss": 0.2985, + "step": 1295 + }, + { + "epoch": 0.37, + "grad_norm": 2.37377254194116, + "learning_rate": 1.4605609868397872e-06, + "loss": 0.2989, + "step": 1296 + }, + { + "epoch": 0.37, + "grad_norm": 2.2530252554578647, + "learning_rate": 1.4597461374733815e-06, + "loss": 0.3155, + "step": 1297 + }, + { + "epoch": 0.37, + "grad_norm": 2.6798879482553373, + "learning_rate": 1.4589309008457594e-06, + "loss": 0.2738, + "step": 1298 + }, + { + "epoch": 0.37, + "grad_norm": 2.4502889632471945, + "learning_rate": 1.4581152776436238e-06, + "loss": 0.3192, + "step": 1299 + }, + { + "epoch": 0.37, + "grad_norm": 2.6298976641211786, + "learning_rate": 1.4572992685540056e-06, + "loss": 0.3167, + "step": 1300 + }, + { + "epoch": 0.37, + "grad_norm": 2.3641713607691983, + "learning_rate": 1.4564828742642583e-06, + "loss": 0.289, + "step": 1301 + }, + { + "epoch": 0.37, + "grad_norm": 2.367238462783568, + "learning_rate": 1.455666095462062e-06, + "loss": 0.2841, + "step": 1302 + }, + { + "epoch": 0.37, + "grad_norm": 2.494597823178741, + "learning_rate": 1.4548489328354194e-06, + "loss": 0.3052, + "step": 1303 + }, + { + "epoch": 0.37, + "grad_norm": 2.549571518755505, + "learning_rate": 1.4540313870726568e-06, + "loss": 0.3206, + "step": 1304 + }, + { + "epoch": 0.37, + "grad_norm": 2.34529117759076, + "learning_rate": 1.4532134588624233e-06, + "loss": 0.2947, + "step": 1305 + }, + { + "epoch": 0.37, + "grad_norm": 2.3384051528527823, + "learning_rate": 1.4523951488936903e-06, + "loss": 0.3108, + "step": 1306 + }, + { + "epoch": 0.37, + "grad_norm": 2.223749125445581, + "learning_rate": 1.451576457855751e-06, + "loss": 0.2693, + "step": 1307 + }, + { + "epoch": 0.37, + "grad_norm": 2.466587040514915, + "learning_rate": 1.4507573864382186e-06, + "loss": 0.2897, + "step": 1308 + }, + { + "epoch": 0.37, + "grad_norm": 2.4013650330596032, + "learning_rate": 1.4499379353310272e-06, + "loss": 0.3095, + "step": 1309 + }, + { + "epoch": 0.37, + "grad_norm": 2.329526517251253, + "learning_rate": 1.4491181052244315e-06, + "loss": 0.3053, + "step": 1310 + }, + { + "epoch": 0.37, + "grad_norm": 2.2325386851804963, + "learning_rate": 1.4482978968090043e-06, + "loss": 0.2799, + "step": 1311 + }, + { + "epoch": 0.37, + "grad_norm": 2.739730141975504, + "learning_rate": 1.4474773107756378e-06, + "loss": 0.2819, + "step": 1312 + }, + { + "epoch": 0.37, + "grad_norm": 2.4059311840397424, + "learning_rate": 1.446656347815542e-06, + "loss": 0.3184, + "step": 1313 + }, + { + "epoch": 0.37, + "grad_norm": 2.5043427586314624, + "learning_rate": 1.4458350086202442e-06, + "loss": 0.3358, + "step": 1314 + }, + { + "epoch": 0.37, + "grad_norm": 2.5523646241136073, + "learning_rate": 1.4450132938815893e-06, + "loss": 0.324, + "step": 1315 + }, + { + "epoch": 0.37, + "grad_norm": 2.6696043592655574, + "learning_rate": 1.4441912042917378e-06, + "loss": 0.3198, + "step": 1316 + }, + { + "epoch": 0.37, + "grad_norm": 2.4038993670802613, + "learning_rate": 1.4433687405431661e-06, + "loss": 0.3094, + "step": 1317 + }, + { + "epoch": 0.37, + "grad_norm": 2.2708458215823537, + "learning_rate": 1.4425459033286663e-06, + "loss": 0.2929, + "step": 1318 + }, + { + "epoch": 0.37, + "grad_norm": 2.228885189442795, + "learning_rate": 1.4417226933413445e-06, + "loss": 0.2684, + "step": 1319 + }, + { + "epoch": 0.37, + "grad_norm": 3.3560738300099113, + "learning_rate": 1.4408991112746209e-06, + "loss": 0.3066, + "step": 1320 + }, + { + "epoch": 0.37, + "grad_norm": 2.3347511327984707, + "learning_rate": 1.4400751578222293e-06, + "loss": 0.2655, + "step": 1321 + }, + { + "epoch": 0.37, + "grad_norm": 2.4636897945354117, + "learning_rate": 1.4392508336782165e-06, + "loss": 0.2594, + "step": 1322 + }, + { + "epoch": 0.37, + "grad_norm": 2.3437567986546166, + "learning_rate": 1.4384261395369405e-06, + "loss": 0.2944, + "step": 1323 + }, + { + "epoch": 0.38, + "grad_norm": 2.430515865063001, + "learning_rate": 1.4376010760930727e-06, + "loss": 0.3145, + "step": 1324 + }, + { + "epoch": 0.38, + "grad_norm": 2.648183336932284, + "learning_rate": 1.436775644041594e-06, + "loss": 0.3171, + "step": 1325 + }, + { + "epoch": 0.38, + "grad_norm": 2.1948854220827525, + "learning_rate": 1.4359498440777969e-06, + "loss": 0.2954, + "step": 1326 + }, + { + "epoch": 0.38, + "grad_norm": 2.424356777798155, + "learning_rate": 1.4351236768972827e-06, + "loss": 0.295, + "step": 1327 + }, + { + "epoch": 0.38, + "grad_norm": 2.3561806922810296, + "learning_rate": 1.4342971431959633e-06, + "loss": 0.2942, + "step": 1328 + }, + { + "epoch": 0.38, + "grad_norm": 2.2325216270315376, + "learning_rate": 1.4334702436700582e-06, + "loss": 0.2743, + "step": 1329 + }, + { + "epoch": 0.38, + "grad_norm": 2.388209633531802, + "learning_rate": 1.4326429790160957e-06, + "loss": 0.3036, + "step": 1330 + }, + { + "epoch": 0.38, + "grad_norm": 2.518564073310508, + "learning_rate": 1.4318153499309115e-06, + "loss": 0.2809, + "step": 1331 + }, + { + "epoch": 0.38, + "grad_norm": 2.3524718365011186, + "learning_rate": 1.4309873571116484e-06, + "loss": 0.2934, + "step": 1332 + }, + { + "epoch": 0.38, + "grad_norm": 2.4008119110054373, + "learning_rate": 1.4301590012557552e-06, + "loss": 0.2948, + "step": 1333 + }, + { + "epoch": 0.38, + "grad_norm": 2.657182854931203, + "learning_rate": 1.4293302830609869e-06, + "loss": 0.2982, + "step": 1334 + }, + { + "epoch": 0.38, + "grad_norm": 2.4741787245997418, + "learning_rate": 1.4285012032254033e-06, + "loss": 0.3052, + "step": 1335 + }, + { + "epoch": 0.38, + "grad_norm": 2.266535481706884, + "learning_rate": 1.4276717624473695e-06, + "loss": 0.2751, + "step": 1336 + }, + { + "epoch": 0.38, + "grad_norm": 2.499984914973482, + "learning_rate": 1.4268419614255543e-06, + "loss": 0.3218, + "step": 1337 + }, + { + "epoch": 0.38, + "grad_norm": 2.167267266773434, + "learning_rate": 1.4260118008589293e-06, + "loss": 0.2849, + "step": 1338 + }, + { + "epoch": 0.38, + "grad_norm": 2.3809186742123614, + "learning_rate": 1.42518128144677e-06, + "loss": 0.3186, + "step": 1339 + }, + { + "epoch": 0.38, + "grad_norm": 2.276436866987727, + "learning_rate": 1.4243504038886528e-06, + "loss": 0.2769, + "step": 1340 + }, + { + "epoch": 0.38, + "grad_norm": 2.7771729871088713, + "learning_rate": 1.4235191688844583e-06, + "loss": 0.3143, + "step": 1341 + }, + { + "epoch": 0.38, + "grad_norm": 2.5976626344842813, + "learning_rate": 1.4226875771343654e-06, + "loss": 0.3003, + "step": 1342 + }, + { + "epoch": 0.38, + "grad_norm": 2.3154780781114837, + "learning_rate": 1.4218556293388547e-06, + "loss": 0.2912, + "step": 1343 + }, + { + "epoch": 0.38, + "grad_norm": 2.3516866583648466, + "learning_rate": 1.4210233261987069e-06, + "loss": 0.2951, + "step": 1344 + }, + { + "epoch": 0.38, + "grad_norm": 2.6133293177904156, + "learning_rate": 1.4201906684150019e-06, + "loss": 0.32, + "step": 1345 + }, + { + "epoch": 0.38, + "grad_norm": 2.349221085671353, + "learning_rate": 1.4193576566891179e-06, + "loss": 0.2934, + "step": 1346 + }, + { + "epoch": 0.38, + "grad_norm": 2.3372859553928187, + "learning_rate": 1.418524291722732e-06, + "loss": 0.2875, + "step": 1347 + }, + { + "epoch": 0.38, + "grad_norm": 2.9969867158207175, + "learning_rate": 1.4176905742178178e-06, + "loss": 0.2858, + "step": 1348 + }, + { + "epoch": 0.38, + "grad_norm": 2.469367670387917, + "learning_rate": 1.4168565048766473e-06, + "loss": 0.3216, + "step": 1349 + }, + { + "epoch": 0.38, + "grad_norm": 2.433107326687296, + "learning_rate": 1.4160220844017873e-06, + "loss": 0.3011, + "step": 1350 + }, + { + "epoch": 0.38, + "grad_norm": 2.44869944836565, + "learning_rate": 1.4151873134961011e-06, + "loss": 0.2823, + "step": 1351 + }, + { + "epoch": 0.38, + "grad_norm": 2.3356654813249977, + "learning_rate": 1.4143521928627477e-06, + "loss": 0.2743, + "step": 1352 + }, + { + "epoch": 0.38, + "grad_norm": 2.4618631831211477, + "learning_rate": 1.41351672320518e-06, + "loss": 0.3103, + "step": 1353 + }, + { + "epoch": 0.38, + "grad_norm": 2.3728132538728643, + "learning_rate": 1.4126809052271451e-06, + "loss": 0.2894, + "step": 1354 + }, + { + "epoch": 0.38, + "grad_norm": 2.1921637488839747, + "learning_rate": 1.411844739632683e-06, + "loss": 0.2991, + "step": 1355 + }, + { + "epoch": 0.38, + "grad_norm": 2.2401465864522305, + "learning_rate": 1.4110082271261277e-06, + "loss": 0.2916, + "step": 1356 + }, + { + "epoch": 0.38, + "grad_norm": 2.1374788781683236, + "learning_rate": 1.410171368412104e-06, + "loss": 0.27, + "step": 1357 + }, + { + "epoch": 0.38, + "grad_norm": 2.2226597845776324, + "learning_rate": 1.4093341641955296e-06, + "loss": 0.313, + "step": 1358 + }, + { + "epoch": 0.39, + "grad_norm": 2.2209502544829482, + "learning_rate": 1.4084966151816122e-06, + "loss": 0.3006, + "step": 1359 + }, + { + "epoch": 0.39, + "grad_norm": 2.2293956956621597, + "learning_rate": 1.4076587220758508e-06, + "loss": 0.2945, + "step": 1360 + }, + { + "epoch": 0.39, + "grad_norm": 2.469630659844368, + "learning_rate": 1.4068204855840336e-06, + "loss": 0.2862, + "step": 1361 + }, + { + "epoch": 0.39, + "grad_norm": 2.7961374088455777, + "learning_rate": 1.405981906412238e-06, + "loss": 0.3437, + "step": 1362 + }, + { + "epoch": 0.39, + "grad_norm": 2.422097199933655, + "learning_rate": 1.4051429852668311e-06, + "loss": 0.2894, + "step": 1363 + }, + { + "epoch": 0.39, + "grad_norm": 2.6518743875649915, + "learning_rate": 1.4043037228544665e-06, + "loss": 0.3017, + "step": 1364 + }, + { + "epoch": 0.39, + "grad_norm": 2.5012891328415634, + "learning_rate": 1.4034641198820865e-06, + "loss": 0.3169, + "step": 1365 + }, + { + "epoch": 0.39, + "grad_norm": 2.3793187249527525, + "learning_rate": 1.4026241770569196e-06, + "loss": 0.2922, + "step": 1366 + }, + { + "epoch": 0.39, + "grad_norm": 2.4635007290354376, + "learning_rate": 1.4017838950864806e-06, + "loss": 0.3042, + "step": 1367 + }, + { + "epoch": 0.39, + "grad_norm": 2.4776118195788914, + "learning_rate": 1.4009432746785709e-06, + "loss": 0.2756, + "step": 1368 + }, + { + "epoch": 0.39, + "grad_norm": 2.3652753400904305, + "learning_rate": 1.4001023165412753e-06, + "loss": 0.3089, + "step": 1369 + }, + { + "epoch": 0.39, + "grad_norm": 2.4321634002156793, + "learning_rate": 1.3992610213829648e-06, + "loss": 0.3024, + "step": 1370 + }, + { + "epoch": 0.39, + "grad_norm": 2.3618306833159903, + "learning_rate": 1.3984193899122932e-06, + "loss": 0.2753, + "step": 1371 + }, + { + "epoch": 0.39, + "grad_norm": 2.462533985172125, + "learning_rate": 1.3975774228381974e-06, + "loss": 0.3393, + "step": 1372 + }, + { + "epoch": 0.39, + "grad_norm": 2.3130478385308115, + "learning_rate": 1.3967351208698982e-06, + "loss": 0.2861, + "step": 1373 + }, + { + "epoch": 0.39, + "grad_norm": 2.4529100901465632, + "learning_rate": 1.3958924847168977e-06, + "loss": 0.3029, + "step": 1374 + }, + { + "epoch": 0.39, + "grad_norm": 2.424126592675069, + "learning_rate": 1.3950495150889793e-06, + "loss": 0.3359, + "step": 1375 + }, + { + "epoch": 0.39, + "grad_norm": 2.3408333339200533, + "learning_rate": 1.3942062126962075e-06, + "loss": 0.2858, + "step": 1376 + }, + { + "epoch": 0.39, + "grad_norm": 2.5154163908242535, + "learning_rate": 1.3933625782489274e-06, + "loss": 0.3013, + "step": 1377 + }, + { + "epoch": 0.39, + "grad_norm": 2.1942197379970936, + "learning_rate": 1.3925186124577637e-06, + "loss": 0.2779, + "step": 1378 + }, + { + "epoch": 0.39, + "grad_norm": 2.573288722683661, + "learning_rate": 1.3916743160336195e-06, + "loss": 0.2878, + "step": 1379 + }, + { + "epoch": 0.39, + "grad_norm": 2.3597427022892417, + "learning_rate": 1.3908296896876776e-06, + "loss": 0.2953, + "step": 1380 + }, + { + "epoch": 0.39, + "grad_norm": 2.6721873029691863, + "learning_rate": 1.389984734131398e-06, + "loss": 0.2873, + "step": 1381 + }, + { + "epoch": 0.39, + "grad_norm": 3.1997325497798492, + "learning_rate": 1.389139450076518e-06, + "loss": 0.2886, + "step": 1382 + }, + { + "epoch": 0.39, + "grad_norm": 2.829582003955937, + "learning_rate": 1.388293838235051e-06, + "loss": 0.2966, + "step": 1383 + }, + { + "epoch": 0.39, + "grad_norm": 2.407857097243499, + "learning_rate": 1.3874478993192885e-06, + "loss": 0.2855, + "step": 1384 + }, + { + "epoch": 0.39, + "grad_norm": 2.3886683252466736, + "learning_rate": 1.3866016340417951e-06, + "loss": 0.2789, + "step": 1385 + }, + { + "epoch": 0.39, + "grad_norm": 2.5464389702961645, + "learning_rate": 1.385755043115412e-06, + "loss": 0.3101, + "step": 1386 + }, + { + "epoch": 0.39, + "grad_norm": 2.316650930929904, + "learning_rate": 1.3849081272532544e-06, + "loss": 0.3025, + "step": 1387 + }, + { + "epoch": 0.39, + "grad_norm": 2.4580772632167855, + "learning_rate": 1.3840608871687102e-06, + "loss": 0.2894, + "step": 1388 + }, + { + "epoch": 0.39, + "grad_norm": 2.5338692072902522, + "learning_rate": 1.3832133235754415e-06, + "loss": 0.2948, + "step": 1389 + }, + { + "epoch": 0.39, + "grad_norm": 2.417527611985879, + "learning_rate": 1.3823654371873825e-06, + "loss": 0.2873, + "step": 1390 + }, + { + "epoch": 0.39, + "grad_norm": 2.4989649967624388, + "learning_rate": 1.3815172287187393e-06, + "loss": 0.2881, + "step": 1391 + }, + { + "epoch": 0.39, + "grad_norm": 2.5528812441155635, + "learning_rate": 1.3806686988839896e-06, + "loss": 0.2971, + "step": 1392 + }, + { + "epoch": 0.39, + "grad_norm": 2.5574316457528656, + "learning_rate": 1.3798198483978813e-06, + "loss": 0.2848, + "step": 1393 + }, + { + "epoch": 0.39, + "grad_norm": 2.50493766439196, + "learning_rate": 1.3789706779754324e-06, + "loss": 0.308, + "step": 1394 + }, + { + "epoch": 0.4, + "grad_norm": 2.2752842924157215, + "learning_rate": 1.3781211883319312e-06, + "loss": 0.2848, + "step": 1395 + }, + { + "epoch": 0.4, + "grad_norm": 2.381552220394106, + "learning_rate": 1.3772713801829336e-06, + "loss": 0.2649, + "step": 1396 + }, + { + "epoch": 0.4, + "grad_norm": 2.319700989497983, + "learning_rate": 1.3764212542442655e-06, + "loss": 0.2681, + "step": 1397 + }, + { + "epoch": 0.4, + "grad_norm": 2.3685340252808373, + "learning_rate": 1.3755708112320185e-06, + "loss": 0.3195, + "step": 1398 + }, + { + "epoch": 0.4, + "grad_norm": 2.356275959928643, + "learning_rate": 1.3747200518625529e-06, + "loss": 0.3112, + "step": 1399 + }, + { + "epoch": 0.4, + "grad_norm": 2.4613648289880636, + "learning_rate": 1.3738689768524944e-06, + "loss": 0.2868, + "step": 1400 + }, + { + "epoch": 0.4, + "grad_norm": 2.636826673880524, + "learning_rate": 1.3730175869187356e-06, + "loss": 0.3307, + "step": 1401 + }, + { + "epoch": 0.4, + "grad_norm": 2.185141789325747, + "learning_rate": 1.3721658827784333e-06, + "loss": 0.277, + "step": 1402 + }, + { + "epoch": 0.4, + "grad_norm": 2.284884901630232, + "learning_rate": 1.37131386514901e-06, + "loss": 0.2662, + "step": 1403 + }, + { + "epoch": 0.4, + "grad_norm": 2.5056304950275905, + "learning_rate": 1.370461534748151e-06, + "loss": 0.2736, + "step": 1404 + }, + { + "epoch": 0.4, + "grad_norm": 2.6112929606416824, + "learning_rate": 1.3696088922938063e-06, + "loss": 0.3041, + "step": 1405 + }, + { + "epoch": 0.4, + "grad_norm": 2.518370343635079, + "learning_rate": 1.3687559385041883e-06, + "loss": 0.2924, + "step": 1406 + }, + { + "epoch": 0.4, + "grad_norm": 2.452505799257892, + "learning_rate": 1.3679026740977716e-06, + "loss": 0.3219, + "step": 1407 + }, + { + "epoch": 0.4, + "grad_norm": 2.3448228996764517, + "learning_rate": 1.367049099793292e-06, + "loss": 0.2965, + "step": 1408 + }, + { + "epoch": 0.4, + "grad_norm": 2.5390555374689074, + "learning_rate": 1.3661952163097472e-06, + "loss": 0.2962, + "step": 1409 + }, + { + "epoch": 0.4, + "grad_norm": 2.481322721652436, + "learning_rate": 1.3653410243663951e-06, + "loss": 0.2905, + "step": 1410 + }, + { + "epoch": 0.4, + "grad_norm": 2.686838626482608, + "learning_rate": 1.3644865246827527e-06, + "loss": 0.3123, + "step": 1411 + }, + { + "epoch": 0.4, + "grad_norm": 2.384974185059098, + "learning_rate": 1.363631717978597e-06, + "loss": 0.2874, + "step": 1412 + }, + { + "epoch": 0.4, + "grad_norm": 2.5990639729189606, + "learning_rate": 1.3627766049739633e-06, + "loss": 0.3079, + "step": 1413 + }, + { + "epoch": 0.4, + "grad_norm": 2.3712178342874344, + "learning_rate": 1.3619211863891456e-06, + "loss": 0.3038, + "step": 1414 + }, + { + "epoch": 0.4, + "grad_norm": 2.524361510330734, + "learning_rate": 1.3610654629446936e-06, + "loss": 0.336, + "step": 1415 + }, + { + "epoch": 0.4, + "grad_norm": 2.0728242696264614, + "learning_rate": 1.3602094353614158e-06, + "loss": 0.2532, + "step": 1416 + }, + { + "epoch": 0.4, + "grad_norm": 2.3929540560511, + "learning_rate": 1.3593531043603755e-06, + "loss": 0.2928, + "step": 1417 + }, + { + "epoch": 0.4, + "grad_norm": 2.2288066754224736, + "learning_rate": 1.3584964706628921e-06, + "loss": 0.2806, + "step": 1418 + }, + { + "epoch": 0.4, + "grad_norm": 2.3635592025311642, + "learning_rate": 1.35763953499054e-06, + "loss": 0.2875, + "step": 1419 + }, + { + "epoch": 0.4, + "grad_norm": 2.3988417976914302, + "learning_rate": 1.356782298065148e-06, + "loss": 0.2665, + "step": 1420 + }, + { + "epoch": 0.4, + "grad_norm": 2.3297542920898744, + "learning_rate": 1.3559247606087984e-06, + "loss": 0.2878, + "step": 1421 + }, + { + "epoch": 0.4, + "grad_norm": 2.326689143317108, + "learning_rate": 1.355066923343827e-06, + "loss": 0.2892, + "step": 1422 + }, + { + "epoch": 0.4, + "grad_norm": 2.3010274507614974, + "learning_rate": 1.3542087869928213e-06, + "loss": 0.2607, + "step": 1423 + }, + { + "epoch": 0.4, + "grad_norm": 2.6613557163218498, + "learning_rate": 1.3533503522786223e-06, + "loss": 0.3039, + "step": 1424 + }, + { + "epoch": 0.4, + "grad_norm": 2.4466499629846106, + "learning_rate": 1.3524916199243208e-06, + "loss": 0.3095, + "step": 1425 + }, + { + "epoch": 0.4, + "grad_norm": 2.547045510727559, + "learning_rate": 1.351632590653259e-06, + "loss": 0.28, + "step": 1426 + }, + { + "epoch": 0.4, + "grad_norm": 2.2667498039894647, + "learning_rate": 1.3507732651890291e-06, + "loss": 0.2992, + "step": 1427 + }, + { + "epoch": 0.4, + "grad_norm": 2.49008724136304, + "learning_rate": 1.349913644255473e-06, + "loss": 0.279, + "step": 1428 + }, + { + "epoch": 0.4, + "grad_norm": 2.719503341700533, + "learning_rate": 1.3490537285766808e-06, + "loss": 0.2948, + "step": 1429 + }, + { + "epoch": 0.41, + "grad_norm": 2.4092554173724507, + "learning_rate": 1.3481935188769917e-06, + "loss": 0.2996, + "step": 1430 + }, + { + "epoch": 0.41, + "grad_norm": 2.4216625872037585, + "learning_rate": 1.3473330158809924e-06, + "loss": 0.282, + "step": 1431 + }, + { + "epoch": 0.41, + "grad_norm": 2.1808475970040537, + "learning_rate": 1.346472220313516e-06, + "loss": 0.2887, + "step": 1432 + }, + { + "epoch": 0.41, + "grad_norm": 2.3738790822075817, + "learning_rate": 1.3456111328996428e-06, + "loss": 0.2629, + "step": 1433 + }, + { + "epoch": 0.41, + "grad_norm": 2.461742161075914, + "learning_rate": 1.344749754364699e-06, + "loss": 0.3071, + "step": 1434 + }, + { + "epoch": 0.41, + "grad_norm": 2.458297651337914, + "learning_rate": 1.343888085434255e-06, + "loss": 0.3058, + "step": 1435 + }, + { + "epoch": 0.41, + "grad_norm": 2.3528781827973866, + "learning_rate": 1.343026126834127e-06, + "loss": 0.2796, + "step": 1436 + }, + { + "epoch": 0.41, + "grad_norm": 2.4443019851876113, + "learning_rate": 1.3421638792903743e-06, + "loss": 0.3051, + "step": 1437 + }, + { + "epoch": 0.41, + "grad_norm": 2.4157361534282247, + "learning_rate": 1.3413013435293002e-06, + "loss": 0.3129, + "step": 1438 + }, + { + "epoch": 0.41, + "grad_norm": 2.6256696001134663, + "learning_rate": 1.3404385202774504e-06, + "loss": 0.3422, + "step": 1439 + }, + { + "epoch": 0.41, + "grad_norm": 2.494653539752748, + "learning_rate": 1.3395754102616133e-06, + "loss": 0.3017, + "step": 1440 + }, + { + "epoch": 0.41, + "grad_norm": 2.533547772461133, + "learning_rate": 1.338712014208818e-06, + "loss": 0.3079, + "step": 1441 + }, + { + "epoch": 0.41, + "grad_norm": 2.2545374483944847, + "learning_rate": 1.3378483328463351e-06, + "loss": 0.2881, + "step": 1442 + }, + { + "epoch": 0.41, + "grad_norm": 2.2384615081575245, + "learning_rate": 1.3369843669016756e-06, + "loss": 0.2581, + "step": 1443 + }, + { + "epoch": 0.41, + "grad_norm": 2.3973511771959712, + "learning_rate": 1.33612011710259e-06, + "loss": 0.2767, + "step": 1444 + }, + { + "epoch": 0.41, + "grad_norm": 2.6139147945528984, + "learning_rate": 1.335255584177068e-06, + "loss": 0.3135, + "step": 1445 + }, + { + "epoch": 0.41, + "grad_norm": 2.5360027379877477, + "learning_rate": 1.3343907688533375e-06, + "loss": 0.2835, + "step": 1446 + }, + { + "epoch": 0.41, + "grad_norm": 2.338124638027161, + "learning_rate": 1.333525671859865e-06, + "loss": 0.2852, + "step": 1447 + }, + { + "epoch": 0.41, + "grad_norm": 2.4812362246871094, + "learning_rate": 1.332660293925353e-06, + "loss": 0.3102, + "step": 1448 + }, + { + "epoch": 0.41, + "grad_norm": 2.444894602358509, + "learning_rate": 1.3317946357787424e-06, + "loss": 0.302, + "step": 1449 + }, + { + "epoch": 0.41, + "grad_norm": 2.3811549830187424, + "learning_rate": 1.3309286981492082e-06, + "loss": 0.2827, + "step": 1450 + }, + { + "epoch": 0.41, + "grad_norm": 4.15688641411048, + "learning_rate": 1.3300624817661626e-06, + "loss": 0.2674, + "step": 1451 + }, + { + "epoch": 0.41, + "grad_norm": 2.365390398206374, + "learning_rate": 1.3291959873592507e-06, + "loss": 0.2953, + "step": 1452 + }, + { + "epoch": 0.41, + "grad_norm": 2.3143132163507927, + "learning_rate": 1.328329215658354e-06, + "loss": 0.2969, + "step": 1453 + }, + { + "epoch": 0.41, + "grad_norm": 2.3680841714427667, + "learning_rate": 1.327462167393586e-06, + "loss": 0.2858, + "step": 1454 + }, + { + "epoch": 0.41, + "grad_norm": 2.4333096082055556, + "learning_rate": 1.3265948432952934e-06, + "loss": 0.3119, + "step": 1455 + }, + { + "epoch": 0.41, + "grad_norm": 2.3979577441652404, + "learning_rate": 1.3257272440940556e-06, + "loss": 0.2716, + "step": 1456 + }, + { + "epoch": 0.41, + "grad_norm": 2.780283482227355, + "learning_rate": 1.3248593705206837e-06, + "loss": 0.3097, + "step": 1457 + }, + { + "epoch": 0.41, + "grad_norm": 2.4727707069996874, + "learning_rate": 1.3239912233062195e-06, + "loss": 0.2949, + "step": 1458 + }, + { + "epoch": 0.41, + "grad_norm": 2.605144018890558, + "learning_rate": 1.3231228031819358e-06, + "loss": 0.3038, + "step": 1459 + }, + { + "epoch": 0.41, + "grad_norm": 2.30781291279654, + "learning_rate": 1.322254110879335e-06, + "loss": 0.3085, + "step": 1460 + }, + { + "epoch": 0.41, + "grad_norm": 2.4557382603995013, + "learning_rate": 1.321385147130149e-06, + "loss": 0.3176, + "step": 1461 + }, + { + "epoch": 0.41, + "grad_norm": 2.480506129867659, + "learning_rate": 1.320515912666338e-06, + "loss": 0.3305, + "step": 1462 + }, + { + "epoch": 0.41, + "grad_norm": 2.5727163800612325, + "learning_rate": 1.3196464082200901e-06, + "loss": 0.2969, + "step": 1463 + }, + { + "epoch": 0.41, + "grad_norm": 2.1255081785167396, + "learning_rate": 1.318776634523822e-06, + "loss": 0.2373, + "step": 1464 + }, + { + "epoch": 0.42, + "grad_norm": 2.5116755100727595, + "learning_rate": 1.3179065923101757e-06, + "loss": 0.2783, + "step": 1465 + }, + { + "epoch": 0.42, + "grad_norm": 2.2406852442419862, + "learning_rate": 1.3170362823120202e-06, + "loss": 0.2865, + "step": 1466 + }, + { + "epoch": 0.42, + "grad_norm": 2.357655391327485, + "learning_rate": 1.3161657052624496e-06, + "loss": 0.2699, + "step": 1467 + }, + { + "epoch": 0.42, + "grad_norm": 2.4125721643155313, + "learning_rate": 1.3152948618947836e-06, + "loss": 0.315, + "step": 1468 + }, + { + "epoch": 0.42, + "grad_norm": 2.5164883344248845, + "learning_rate": 1.3144237529425652e-06, + "loss": 0.3061, + "step": 1469 + }, + { + "epoch": 0.42, + "grad_norm": 2.3188123708680113, + "learning_rate": 1.313552379139563e-06, + "loss": 0.2862, + "step": 1470 + }, + { + "epoch": 0.42, + "grad_norm": 2.3745493691138244, + "learning_rate": 1.3126807412197664e-06, + "loss": 0.3067, + "step": 1471 + }, + { + "epoch": 0.42, + "grad_norm": 2.50456059069696, + "learning_rate": 1.3118088399173886e-06, + "loss": 0.2483, + "step": 1472 + }, + { + "epoch": 0.42, + "grad_norm": 2.4560490015824437, + "learning_rate": 1.3109366759668646e-06, + "loss": 0.2723, + "step": 1473 + }, + { + "epoch": 0.42, + "grad_norm": 2.5134191311678173, + "learning_rate": 1.31006425010285e-06, + "loss": 0.3037, + "step": 1474 + }, + { + "epoch": 0.42, + "grad_norm": 2.3293536029879425, + "learning_rate": 1.3091915630602222e-06, + "loss": 0.2851, + "step": 1475 + }, + { + "epoch": 0.42, + "grad_norm": 2.3190242087963773, + "learning_rate": 1.308318615574077e-06, + "loss": 0.2658, + "step": 1476 + }, + { + "epoch": 0.42, + "grad_norm": 2.3274324770049466, + "learning_rate": 1.3074454083797307e-06, + "loss": 0.2773, + "step": 1477 + }, + { + "epoch": 0.42, + "grad_norm": 2.4771135962414292, + "learning_rate": 1.3065719422127185e-06, + "loss": 0.302, + "step": 1478 + }, + { + "epoch": 0.42, + "grad_norm": 2.3441652217311866, + "learning_rate": 1.3056982178087933e-06, + "loss": 0.291, + "step": 1479 + }, + { + "epoch": 0.42, + "grad_norm": 2.5380690988427403, + "learning_rate": 1.3048242359039247e-06, + "loss": 0.3169, + "step": 1480 + }, + { + "epoch": 0.42, + "grad_norm": 2.2385674340764274, + "learning_rate": 1.303949997234301e-06, + "loss": 0.2914, + "step": 1481 + }, + { + "epoch": 0.42, + "grad_norm": 2.5364022580181365, + "learning_rate": 1.3030755025363255e-06, + "loss": 0.306, + "step": 1482 + }, + { + "epoch": 0.42, + "grad_norm": 2.446531539254504, + "learning_rate": 1.3022007525466179e-06, + "loss": 0.336, + "step": 1483 + }, + { + "epoch": 0.42, + "grad_norm": 2.4545408603051793, + "learning_rate": 1.3013257480020114e-06, + "loss": 0.2919, + "step": 1484 + }, + { + "epoch": 0.42, + "grad_norm": 2.3489981204127623, + "learning_rate": 1.3004504896395562e-06, + "loss": 0.2909, + "step": 1485 + }, + { + "epoch": 0.42, + "grad_norm": 3.3814888503667264, + "learning_rate": 1.2995749781965136e-06, + "loss": 0.2893, + "step": 1486 + }, + { + "epoch": 0.42, + "grad_norm": 2.419910649136688, + "learning_rate": 1.2986992144103606e-06, + "loss": 0.2879, + "step": 1487 + }, + { + "epoch": 0.42, + "grad_norm": 2.619557910339679, + "learning_rate": 1.2978231990187847e-06, + "loss": 0.2802, + "step": 1488 + }, + { + "epoch": 0.42, + "grad_norm": 2.2204889549475744, + "learning_rate": 1.2969469327596859e-06, + "loss": 0.2897, + "step": 1489 + }, + { + "epoch": 0.42, + "grad_norm": 2.38645753292471, + "learning_rate": 1.2960704163711766e-06, + "loss": 0.2963, + "step": 1490 + }, + { + "epoch": 0.42, + "grad_norm": 2.47586048779542, + "learning_rate": 1.2951936505915781e-06, + "loss": 0.2775, + "step": 1491 + }, + { + "epoch": 0.42, + "grad_norm": 2.406803415941365, + "learning_rate": 1.294316636159424e-06, + "loss": 0.2906, + "step": 1492 + }, + { + "epoch": 0.42, + "grad_norm": 2.3808072486060525, + "learning_rate": 1.2934393738134546e-06, + "loss": 0.299, + "step": 1493 + }, + { + "epoch": 0.42, + "grad_norm": 2.5512356401014133, + "learning_rate": 1.2925618642926218e-06, + "loss": 0.3172, + "step": 1494 + }, + { + "epoch": 0.42, + "grad_norm": 2.160246048496873, + "learning_rate": 1.2916841083360834e-06, + "loss": 0.268, + "step": 1495 + }, + { + "epoch": 0.42, + "grad_norm": 2.537746401420283, + "learning_rate": 1.2908061066832063e-06, + "loss": 0.2998, + "step": 1496 + }, + { + "epoch": 0.42, + "grad_norm": 2.522608636766397, + "learning_rate": 1.289927860073564e-06, + "loss": 0.2946, + "step": 1497 + }, + { + "epoch": 0.42, + "grad_norm": 2.371012661692452, + "learning_rate": 1.2890493692469356e-06, + "loss": 0.3024, + "step": 1498 + }, + { + "epoch": 0.42, + "grad_norm": 2.450688539105884, + "learning_rate": 1.2881706349433067e-06, + "loss": 0.2836, + "step": 1499 + }, + { + "epoch": 0.43, + "grad_norm": 2.4682318267739607, + "learning_rate": 1.2872916579028684e-06, + "loss": 0.3102, + "step": 1500 + }, + { + "epoch": 0.43, + "grad_norm": 2.43757520685359, + "learning_rate": 1.2864124388660146e-06, + "loss": 0.2905, + "step": 1501 + }, + { + "epoch": 0.43, + "grad_norm": 2.790709881138362, + "learning_rate": 1.2855329785733452e-06, + "loss": 0.3042, + "step": 1502 + }, + { + "epoch": 0.43, + "grad_norm": 2.4479829710987713, + "learning_rate": 1.2846532777656612e-06, + "loss": 0.2789, + "step": 1503 + }, + { + "epoch": 0.43, + "grad_norm": 2.375342138833745, + "learning_rate": 1.2837733371839678e-06, + "loss": 0.2797, + "step": 1504 + }, + { + "epoch": 0.43, + "grad_norm": 2.2951167449970327, + "learning_rate": 1.2828931575694718e-06, + "loss": 0.2895, + "step": 1505 + }, + { + "epoch": 0.43, + "grad_norm": 2.4220238351531425, + "learning_rate": 1.2820127396635801e-06, + "loss": 0.2767, + "step": 1506 + }, + { + "epoch": 0.43, + "grad_norm": 2.390590180143157, + "learning_rate": 1.2811320842079026e-06, + "loss": 0.2845, + "step": 1507 + }, + { + "epoch": 0.43, + "grad_norm": 2.227092280010057, + "learning_rate": 1.2802511919442468e-06, + "loss": 0.2727, + "step": 1508 + }, + { + "epoch": 0.43, + "grad_norm": 2.9350175829138374, + "learning_rate": 1.279370063614622e-06, + "loss": 0.3165, + "step": 1509 + }, + { + "epoch": 0.43, + "grad_norm": 2.3556655543461025, + "learning_rate": 1.2784886999612347e-06, + "loss": 0.2972, + "step": 1510 + }, + { + "epoch": 0.43, + "grad_norm": 2.4145727803902792, + "learning_rate": 1.2776071017264907e-06, + "loss": 0.3229, + "step": 1511 + }, + { + "epoch": 0.43, + "grad_norm": 2.2701438676974655, + "learning_rate": 1.276725269652992e-06, + "loss": 0.2972, + "step": 1512 + }, + { + "epoch": 0.43, + "grad_norm": 2.279322336462779, + "learning_rate": 1.275843204483539e-06, + "loss": 0.2831, + "step": 1513 + }, + { + "epoch": 0.43, + "grad_norm": 2.2549254803317544, + "learning_rate": 1.274960906961128e-06, + "loss": 0.2961, + "step": 1514 + }, + { + "epoch": 0.43, + "grad_norm": 2.410217984272276, + "learning_rate": 1.2740783778289505e-06, + "loss": 0.3046, + "step": 1515 + }, + { + "epoch": 0.43, + "grad_norm": 2.5502149712168185, + "learning_rate": 1.273195617830394e-06, + "loss": 0.2885, + "step": 1516 + }, + { + "epoch": 0.43, + "grad_norm": 2.1287324402745083, + "learning_rate": 1.2723126277090395e-06, + "loss": 0.2714, + "step": 1517 + }, + { + "epoch": 0.43, + "grad_norm": 2.5024724957081674, + "learning_rate": 1.2714294082086627e-06, + "loss": 0.3027, + "step": 1518 + }, + { + "epoch": 0.43, + "grad_norm": 2.403173741361718, + "learning_rate": 1.2705459600732317e-06, + "loss": 0.3082, + "step": 1519 + }, + { + "epoch": 0.43, + "grad_norm": 2.338355201173731, + "learning_rate": 1.2696622840469081e-06, + "loss": 0.2848, + "step": 1520 + }, + { + "epoch": 0.43, + "grad_norm": 2.3525510876983664, + "learning_rate": 1.2687783808740448e-06, + "loss": 0.2715, + "step": 1521 + }, + { + "epoch": 0.43, + "grad_norm": 2.280452801928687, + "learning_rate": 1.2678942512991864e-06, + "loss": 0.272, + "step": 1522 + }, + { + "epoch": 0.43, + "grad_norm": 2.2859179747537994, + "learning_rate": 1.2670098960670674e-06, + "loss": 0.2747, + "step": 1523 + }, + { + "epoch": 0.43, + "grad_norm": 2.303075929644119, + "learning_rate": 1.2661253159226138e-06, + "loss": 0.2929, + "step": 1524 + }, + { + "epoch": 0.43, + "grad_norm": 2.3018200914697458, + "learning_rate": 1.2652405116109393e-06, + "loss": 0.3046, + "step": 1525 + }, + { + "epoch": 0.43, + "grad_norm": 2.8220611835937754, + "learning_rate": 1.2643554838773486e-06, + "loss": 0.3014, + "step": 1526 + }, + { + "epoch": 0.43, + "grad_norm": 2.3777860887932234, + "learning_rate": 1.263470233467332e-06, + "loss": 0.3024, + "step": 1527 + }, + { + "epoch": 0.43, + "grad_norm": 2.397804821536525, + "learning_rate": 1.26258476112657e-06, + "loss": 0.3062, + "step": 1528 + }, + { + "epoch": 0.43, + "grad_norm": 2.218905959145441, + "learning_rate": 1.261699067600928e-06, + "loss": 0.2794, + "step": 1529 + }, + { + "epoch": 0.43, + "grad_norm": 2.463662208750161, + "learning_rate": 1.2608131536364589e-06, + "loss": 0.285, + "step": 1530 + }, + { + "epoch": 0.43, + "grad_norm": 2.336920618973469, + "learning_rate": 1.2599270199794006e-06, + "loss": 0.2871, + "step": 1531 + }, + { + "epoch": 0.43, + "grad_norm": 2.4511676946774426, + "learning_rate": 1.259040667376176e-06, + "loss": 0.2744, + "step": 1532 + }, + { + "epoch": 0.43, + "grad_norm": 2.445524281717222, + "learning_rate": 1.2581540965733939e-06, + "loss": 0.3044, + "step": 1533 + }, + { + "epoch": 0.43, + "grad_norm": 2.481605303885608, + "learning_rate": 1.2572673083178447e-06, + "loss": 0.2677, + "step": 1534 + }, + { + "epoch": 0.43, + "grad_norm": 2.3038645377681823, + "learning_rate": 1.2563803033565032e-06, + "loss": 0.2978, + "step": 1535 + }, + { + "epoch": 0.44, + "grad_norm": 2.521409954147198, + "learning_rate": 1.255493082436527e-06, + "loss": 0.3202, + "step": 1536 + }, + { + "epoch": 0.44, + "grad_norm": 2.3247669770557176, + "learning_rate": 1.2546056463052548e-06, + "loss": 0.2797, + "step": 1537 + }, + { + "epoch": 0.44, + "grad_norm": 2.284580868757689, + "learning_rate": 1.2537179957102074e-06, + "loss": 0.2746, + "step": 1538 + }, + { + "epoch": 0.44, + "grad_norm": 3.3220609927468523, + "learning_rate": 1.2528301313990853e-06, + "loss": 0.2953, + "step": 1539 + }, + { + "epoch": 0.44, + "grad_norm": 2.3514220186854273, + "learning_rate": 1.2519420541197693e-06, + "loss": 0.2994, + "step": 1540 + }, + { + "epoch": 0.44, + "grad_norm": 2.6177756144523845, + "learning_rate": 1.2510537646203207e-06, + "loss": 0.2957, + "step": 1541 + }, + { + "epoch": 0.44, + "grad_norm": 2.3045538164229953, + "learning_rate": 1.2501652636489778e-06, + "loss": 0.2742, + "step": 1542 + }, + { + "epoch": 0.44, + "grad_norm": 2.5292210544986977, + "learning_rate": 1.249276551954159e-06, + "loss": 0.2926, + "step": 1543 + }, + { + "epoch": 0.44, + "grad_norm": 2.397955383256873, + "learning_rate": 1.2483876302844578e-06, + "loss": 0.31, + "step": 1544 + }, + { + "epoch": 0.44, + "grad_norm": 2.4427886972425847, + "learning_rate": 1.2474984993886465e-06, + "loss": 0.3102, + "step": 1545 + }, + { + "epoch": 0.44, + "grad_norm": 2.494204157300474, + "learning_rate": 1.2466091600156733e-06, + "loss": 0.2729, + "step": 1546 + }, + { + "epoch": 0.44, + "grad_norm": 2.332220076619459, + "learning_rate": 1.2457196129146615e-06, + "loss": 0.2446, + "step": 1547 + }, + { + "epoch": 0.44, + "grad_norm": 2.2296539060987866, + "learning_rate": 1.2448298588349096e-06, + "loss": 0.2756, + "step": 1548 + }, + { + "epoch": 0.44, + "grad_norm": 2.4852006453223505, + "learning_rate": 1.2439398985258897e-06, + "loss": 0.3134, + "step": 1549 + }, + { + "epoch": 0.44, + "grad_norm": 2.4450105242836138, + "learning_rate": 1.24304973273725e-06, + "loss": 0.2894, + "step": 1550 + }, + { + "epoch": 0.44, + "grad_norm": 2.1789569148926184, + "learning_rate": 1.2421593622188086e-06, + "loss": 0.2926, + "step": 1551 + }, + { + "epoch": 0.44, + "grad_norm": 2.125580836250774, + "learning_rate": 1.2412687877205585e-06, + "loss": 0.2825, + "step": 1552 + }, + { + "epoch": 0.44, + "grad_norm": 2.419826962755061, + "learning_rate": 1.2403780099926633e-06, + "loss": 0.2975, + "step": 1553 + }, + { + "epoch": 0.44, + "grad_norm": 2.335424495673667, + "learning_rate": 1.2394870297854581e-06, + "loss": 0.2841, + "step": 1554 + }, + { + "epoch": 0.44, + "grad_norm": 2.1971134981537985, + "learning_rate": 1.2385958478494484e-06, + "loss": 0.3257, + "step": 1555 + }, + { + "epoch": 0.44, + "grad_norm": 2.6420619435757744, + "learning_rate": 1.2377044649353102e-06, + "loss": 0.3272, + "step": 1556 + }, + { + "epoch": 0.44, + "grad_norm": 2.6924042663801266, + "learning_rate": 1.2368128817938882e-06, + "loss": 0.2752, + "step": 1557 + }, + { + "epoch": 0.44, + "grad_norm": 2.3389639420764436, + "learning_rate": 1.2359210991761956e-06, + "loss": 0.2989, + "step": 1558 + }, + { + "epoch": 0.44, + "grad_norm": 2.466397462305249, + "learning_rate": 1.2350291178334144e-06, + "loss": 0.2997, + "step": 1559 + }, + { + "epoch": 0.44, + "grad_norm": 2.448220145119163, + "learning_rate": 1.2341369385168935e-06, + "loss": 0.2712, + "step": 1560 + }, + { + "epoch": 0.44, + "grad_norm": 2.45491872341095, + "learning_rate": 1.2332445619781489e-06, + "loss": 0.2982, + "step": 1561 + }, + { + "epoch": 0.44, + "grad_norm": 2.397589962456345, + "learning_rate": 1.2323519889688614e-06, + "loss": 0.2792, + "step": 1562 + }, + { + "epoch": 0.44, + "grad_norm": 2.3142801661017898, + "learning_rate": 1.2314592202408795e-06, + "loss": 0.2755, + "step": 1563 + }, + { + "epoch": 0.44, + "grad_norm": 2.527326221140043, + "learning_rate": 1.2305662565462144e-06, + "loss": 0.2994, + "step": 1564 + }, + { + "epoch": 0.44, + "grad_norm": 2.3697314137556154, + "learning_rate": 1.2296730986370436e-06, + "loss": 0.2737, + "step": 1565 + }, + { + "epoch": 0.44, + "grad_norm": 2.8244358950556676, + "learning_rate": 1.2287797472657063e-06, + "loss": 0.2652, + "step": 1566 + }, + { + "epoch": 0.44, + "grad_norm": 2.5992271577632464, + "learning_rate": 1.2278862031847059e-06, + "loss": 0.3089, + "step": 1567 + }, + { + "epoch": 0.44, + "grad_norm": 2.2422228745894803, + "learning_rate": 1.2269924671467073e-06, + "loss": 0.2793, + "step": 1568 + }, + { + "epoch": 0.44, + "grad_norm": 2.6203714518020855, + "learning_rate": 1.226098539904538e-06, + "loss": 0.317, + "step": 1569 + }, + { + "epoch": 0.44, + "grad_norm": 2.616453062567964, + "learning_rate": 1.2252044222111857e-06, + "loss": 0.2953, + "step": 1570 + }, + { + "epoch": 0.45, + "grad_norm": 2.160665725191548, + "learning_rate": 1.2243101148197989e-06, + "loss": 0.2658, + "step": 1571 + }, + { + "epoch": 0.45, + "grad_norm": 2.2669549608244024, + "learning_rate": 1.223415618483686e-06, + "loss": 0.2762, + "step": 1572 + }, + { + "epoch": 0.45, + "grad_norm": 2.1309593576379817, + "learning_rate": 1.2225209339563143e-06, + "loss": 0.2691, + "step": 1573 + }, + { + "epoch": 0.45, + "grad_norm": 2.502580046773122, + "learning_rate": 1.22162606199131e-06, + "loss": 0.3064, + "step": 1574 + }, + { + "epoch": 0.45, + "grad_norm": 2.183455320755513, + "learning_rate": 1.2207310033424566e-06, + "loss": 0.2644, + "step": 1575 + }, + { + "epoch": 0.45, + "grad_norm": 2.0960195827035477, + "learning_rate": 1.2198357587636956e-06, + "loss": 0.2653, + "step": 1576 + }, + { + "epoch": 0.45, + "grad_norm": 2.8250176006360355, + "learning_rate": 1.2189403290091244e-06, + "loss": 0.3265, + "step": 1577 + }, + { + "epoch": 0.45, + "grad_norm": 2.3678674723497974, + "learning_rate": 1.218044714832997e-06, + "loss": 0.2833, + "step": 1578 + }, + { + "epoch": 0.45, + "grad_norm": 2.2569556247463827, + "learning_rate": 1.2171489169897215e-06, + "loss": 0.2895, + "step": 1579 + }, + { + "epoch": 0.45, + "grad_norm": 2.812494247020307, + "learning_rate": 1.2162529362338631e-06, + "loss": 0.3053, + "step": 1580 + }, + { + "epoch": 0.45, + "grad_norm": 2.2127372881951244, + "learning_rate": 1.2153567733201383e-06, + "loss": 0.2867, + "step": 1581 + }, + { + "epoch": 0.45, + "grad_norm": 2.426303976237553, + "learning_rate": 1.214460429003419e-06, + "loss": 0.2647, + "step": 1582 + }, + { + "epoch": 0.45, + "grad_norm": 2.39029073034326, + "learning_rate": 1.213563904038729e-06, + "loss": 0.3349, + "step": 1583 + }, + { + "epoch": 0.45, + "grad_norm": 2.3123384990964833, + "learning_rate": 1.2126671991812447e-06, + "loss": 0.2903, + "step": 1584 + }, + { + "epoch": 0.45, + "grad_norm": 2.3166156240565745, + "learning_rate": 1.2117703151862939e-06, + "loss": 0.2993, + "step": 1585 + }, + { + "epoch": 0.45, + "grad_norm": 2.4677299405381614, + "learning_rate": 1.2108732528093549e-06, + "loss": 0.3073, + "step": 1586 + }, + { + "epoch": 0.45, + "grad_norm": 2.4192821491062335, + "learning_rate": 1.209976012806057e-06, + "loss": 0.299, + "step": 1587 + }, + { + "epoch": 0.45, + "grad_norm": 2.4357059330636432, + "learning_rate": 1.2090785959321781e-06, + "loss": 0.2646, + "step": 1588 + }, + { + "epoch": 0.45, + "grad_norm": 5.522395881502279, + "learning_rate": 1.2081810029436468e-06, + "loss": 0.2937, + "step": 1589 + }, + { + "epoch": 0.45, + "grad_norm": 2.321652531861596, + "learning_rate": 1.207283234596538e-06, + "loss": 0.3025, + "step": 1590 + }, + { + "epoch": 0.45, + "grad_norm": 2.446193398181088, + "learning_rate": 1.2063852916470753e-06, + "loss": 0.3041, + "step": 1591 + }, + { + "epoch": 0.45, + "grad_norm": 2.551171821649562, + "learning_rate": 1.20548717485163e-06, + "loss": 0.3013, + "step": 1592 + }, + { + "epoch": 0.45, + "grad_norm": 2.2392588623142875, + "learning_rate": 1.2045888849667185e-06, + "loss": 0.2993, + "step": 1593 + }, + { + "epoch": 0.45, + "grad_norm": 2.377514804296935, + "learning_rate": 1.2036904227490041e-06, + "loss": 0.3237, + "step": 1594 + }, + { + "epoch": 0.45, + "grad_norm": 2.4836421658846763, + "learning_rate": 1.202791788955295e-06, + "loss": 0.2805, + "step": 1595 + }, + { + "epoch": 0.45, + "grad_norm": 2.4812149257453022, + "learning_rate": 1.2018929843425427e-06, + "loss": 0.29, + "step": 1596 + }, + { + "epoch": 0.45, + "grad_norm": 2.7616563865201336, + "learning_rate": 1.200994009667845e-06, + "loss": 0.2956, + "step": 1597 + }, + { + "epoch": 0.45, + "grad_norm": 2.75907227360528, + "learning_rate": 1.2000948656884407e-06, + "loss": 0.3059, + "step": 1598 + }, + { + "epoch": 0.45, + "grad_norm": 2.4106964339523826, + "learning_rate": 1.1991955531617123e-06, + "loss": 0.304, + "step": 1599 + }, + { + "epoch": 0.45, + "grad_norm": 2.4572321494745086, + "learning_rate": 1.1982960728451845e-06, + "loss": 0.3002, + "step": 1600 + }, + { + "epoch": 0.45, + "grad_norm": 2.3147931678781486, + "learning_rate": 1.1973964254965223e-06, + "loss": 0.2984, + "step": 1601 + }, + { + "epoch": 0.45, + "grad_norm": 2.162842212080869, + "learning_rate": 1.196496611873533e-06, + "loss": 0.2708, + "step": 1602 + }, + { + "epoch": 0.45, + "grad_norm": 2.439223299123657, + "learning_rate": 1.1955966327341613e-06, + "loss": 0.2816, + "step": 1603 + }, + { + "epoch": 0.45, + "grad_norm": 2.270140418104315, + "learning_rate": 1.1946964888364947e-06, + "loss": 0.2834, + "step": 1604 + }, + { + "epoch": 0.45, + "grad_norm": 2.2872796726584594, + "learning_rate": 1.1937961809387567e-06, + "loss": 0.2927, + "step": 1605 + }, + { + "epoch": 0.46, + "grad_norm": 2.430768331384572, + "learning_rate": 1.192895709799311e-06, + "loss": 0.2615, + "step": 1606 + }, + { + "epoch": 0.46, + "grad_norm": 2.5802386722827864, + "learning_rate": 1.1919950761766567e-06, + "loss": 0.2601, + "step": 1607 + }, + { + "epoch": 0.46, + "grad_norm": 2.303315690496668, + "learning_rate": 1.1910942808294313e-06, + "loss": 0.3003, + "step": 1608 + }, + { + "epoch": 0.46, + "grad_norm": 2.401745013844011, + "learning_rate": 1.1901933245164084e-06, + "loss": 0.2738, + "step": 1609 + }, + { + "epoch": 0.46, + "grad_norm": 2.2269512971196703, + "learning_rate": 1.189292207996497e-06, + "loss": 0.2859, + "step": 1610 + }, + { + "epoch": 0.46, + "grad_norm": 2.5530381476933846, + "learning_rate": 1.1883909320287403e-06, + "loss": 0.3276, + "step": 1611 + }, + { + "epoch": 0.46, + "grad_norm": 2.552181480306632, + "learning_rate": 1.1874894973723171e-06, + "loss": 0.2945, + "step": 1612 + }, + { + "epoch": 0.46, + "grad_norm": 2.4277394972003403, + "learning_rate": 1.1865879047865389e-06, + "loss": 0.3273, + "step": 1613 + }, + { + "epoch": 0.46, + "grad_norm": 2.304486641989715, + "learning_rate": 1.1856861550308506e-06, + "loss": 0.2731, + "step": 1614 + }, + { + "epoch": 0.46, + "grad_norm": 2.6788843226324546, + "learning_rate": 1.1847842488648294e-06, + "loss": 0.3032, + "step": 1615 + }, + { + "epoch": 0.46, + "grad_norm": 2.3709010265119104, + "learning_rate": 1.1838821870481846e-06, + "loss": 0.2952, + "step": 1616 + }, + { + "epoch": 0.46, + "grad_norm": 2.3812997004982766, + "learning_rate": 1.1829799703407562e-06, + "loss": 0.2854, + "step": 1617 + }, + { + "epoch": 0.46, + "grad_norm": 2.4542133115233544, + "learning_rate": 1.1820775995025146e-06, + "loss": 0.3001, + "step": 1618 + }, + { + "epoch": 0.46, + "grad_norm": 2.2291860735070586, + "learning_rate": 1.1811750752935604e-06, + "loss": 0.2829, + "step": 1619 + }, + { + "epoch": 0.46, + "grad_norm": 2.3315720405872233, + "learning_rate": 1.1802723984741227e-06, + "loss": 0.2916, + "step": 1620 + }, + { + "epoch": 0.46, + "grad_norm": 2.284866993118778, + "learning_rate": 1.1793695698045605e-06, + "loss": 0.2921, + "step": 1621 + }, + { + "epoch": 0.46, + "grad_norm": 2.38603967727056, + "learning_rate": 1.1784665900453592e-06, + "loss": 0.3035, + "step": 1622 + }, + { + "epoch": 0.46, + "grad_norm": 2.3086528982281633, + "learning_rate": 1.1775634599571325e-06, + "loss": 0.2809, + "step": 1623 + }, + { + "epoch": 0.46, + "grad_norm": 2.355332925064751, + "learning_rate": 1.1766601803006201e-06, + "loss": 0.2996, + "step": 1624 + }, + { + "epoch": 0.46, + "grad_norm": 2.897899160265012, + "learning_rate": 1.1757567518366883e-06, + "loss": 0.2976, + "step": 1625 + }, + { + "epoch": 0.46, + "grad_norm": 2.366508867197264, + "learning_rate": 1.174853175326328e-06, + "loss": 0.2954, + "step": 1626 + }, + { + "epoch": 0.46, + "grad_norm": 2.4458650264482884, + "learning_rate": 1.1739494515306552e-06, + "loss": 0.2743, + "step": 1627 + }, + { + "epoch": 0.46, + "grad_norm": 3.878447090985886, + "learning_rate": 1.17304558121091e-06, + "loss": 0.2948, + "step": 1628 + }, + { + "epoch": 0.46, + "grad_norm": 2.3919404467561116, + "learning_rate": 1.1721415651284564e-06, + "loss": 0.296, + "step": 1629 + }, + { + "epoch": 0.46, + "grad_norm": 2.4843121301414333, + "learning_rate": 1.1712374040447801e-06, + "loss": 0.2939, + "step": 1630 + }, + { + "epoch": 0.46, + "grad_norm": 2.443680664040836, + "learning_rate": 1.1703330987214896e-06, + "loss": 0.2894, + "step": 1631 + }, + { + "epoch": 0.46, + "grad_norm": 2.4771653111875067, + "learning_rate": 1.1694286499203147e-06, + "loss": 0.3024, + "step": 1632 + }, + { + "epoch": 0.46, + "grad_norm": 2.492661747284556, + "learning_rate": 1.1685240584031067e-06, + "loss": 0.2502, + "step": 1633 + }, + { + "epoch": 0.46, + "grad_norm": 2.422281617887777, + "learning_rate": 1.1676193249318358e-06, + "loss": 0.2729, + "step": 1634 + }, + { + "epoch": 0.46, + "grad_norm": 2.2196613564768586, + "learning_rate": 1.166714450268593e-06, + "loss": 0.261, + "step": 1635 + }, + { + "epoch": 0.46, + "grad_norm": 2.410642664529523, + "learning_rate": 1.165809435175588e-06, + "loss": 0.2686, + "step": 1636 + }, + { + "epoch": 0.46, + "grad_norm": 2.4144309428272397, + "learning_rate": 1.164904280415148e-06, + "loss": 0.2752, + "step": 1637 + }, + { + "epoch": 0.46, + "grad_norm": 3.6694333385746716, + "learning_rate": 1.163998986749719e-06, + "loss": 0.2774, + "step": 1638 + }, + { + "epoch": 0.46, + "grad_norm": 2.304174111118199, + "learning_rate": 1.1630935549418626e-06, + "loss": 0.2836, + "step": 1639 + }, + { + "epoch": 0.46, + "grad_norm": 2.4381870026370938, + "learning_rate": 1.1621879857542585e-06, + "loss": 0.281, + "step": 1640 + }, + { + "epoch": 0.46, + "grad_norm": 3.435408423357395, + "learning_rate": 1.1612822799497005e-06, + "loss": 0.2797, + "step": 1641 + }, + { + "epoch": 0.47, + "grad_norm": 2.1859105113726507, + "learning_rate": 1.1603764382910988e-06, + "loss": 0.2905, + "step": 1642 + }, + { + "epoch": 0.47, + "grad_norm": 2.478183699921075, + "learning_rate": 1.1594704615414768e-06, + "loss": 0.2683, + "step": 1643 + }, + { + "epoch": 0.47, + "grad_norm": 2.3890470350734123, + "learning_rate": 1.1585643504639725e-06, + "loss": 0.2967, + "step": 1644 + }, + { + "epoch": 0.47, + "grad_norm": 2.5253388824450242, + "learning_rate": 1.1576581058218372e-06, + "loss": 0.3048, + "step": 1645 + }, + { + "epoch": 0.47, + "grad_norm": 2.295525160535059, + "learning_rate": 1.1567517283784343e-06, + "loss": 0.2689, + "step": 1646 + }, + { + "epoch": 0.47, + "grad_norm": 2.4418315746970265, + "learning_rate": 1.1558452188972384e-06, + "loss": 0.2702, + "step": 1647 + }, + { + "epoch": 0.47, + "grad_norm": 2.4257726301731095, + "learning_rate": 1.154938578141837e-06, + "loss": 0.2566, + "step": 1648 + }, + { + "epoch": 0.47, + "grad_norm": 2.4880540461286063, + "learning_rate": 1.1540318068759268e-06, + "loss": 0.2707, + "step": 1649 + }, + { + "epoch": 0.47, + "grad_norm": 2.180860676840754, + "learning_rate": 1.1531249058633147e-06, + "loss": 0.2975, + "step": 1650 + }, + { + "epoch": 0.47, + "grad_norm": 2.3279730202534044, + "learning_rate": 1.152217875867917e-06, + "loss": 0.3109, + "step": 1651 + }, + { + "epoch": 0.47, + "grad_norm": 2.2238195956771283, + "learning_rate": 1.151310717653759e-06, + "loss": 0.2528, + "step": 1652 + }, + { + "epoch": 0.47, + "grad_norm": 2.443841308625575, + "learning_rate": 1.150403431984974e-06, + "loss": 0.3063, + "step": 1653 + }, + { + "epoch": 0.47, + "grad_norm": 2.6189577221458014, + "learning_rate": 1.1494960196258015e-06, + "loss": 0.2925, + "step": 1654 + }, + { + "epoch": 0.47, + "grad_norm": 2.382589203135882, + "learning_rate": 1.1485884813405891e-06, + "loss": 0.3003, + "step": 1655 + }, + { + "epoch": 0.47, + "grad_norm": 2.4163619522350466, + "learning_rate": 1.1476808178937898e-06, + "loss": 0.3021, + "step": 1656 + }, + { + "epoch": 0.47, + "grad_norm": 2.801855906017032, + "learning_rate": 1.1467730300499624e-06, + "loss": 0.2966, + "step": 1657 + }, + { + "epoch": 0.47, + "grad_norm": 2.072560421336879, + "learning_rate": 1.1458651185737702e-06, + "loss": 0.2517, + "step": 1658 + }, + { + "epoch": 0.47, + "grad_norm": 2.3776984976754605, + "learning_rate": 1.1449570842299803e-06, + "loss": 0.2902, + "step": 1659 + }, + { + "epoch": 0.47, + "grad_norm": 2.272720150972345, + "learning_rate": 1.1440489277834645e-06, + "loss": 0.2967, + "step": 1660 + }, + { + "epoch": 0.47, + "grad_norm": 3.1113527639819614, + "learning_rate": 1.1431406499991953e-06, + "loss": 0.2833, + "step": 1661 + }, + { + "epoch": 0.47, + "grad_norm": 2.124494330550425, + "learning_rate": 1.1422322516422505e-06, + "loss": 0.2549, + "step": 1662 + }, + { + "epoch": 0.47, + "grad_norm": 2.2187918477352206, + "learning_rate": 1.1413237334778064e-06, + "loss": 0.2724, + "step": 1663 + }, + { + "epoch": 0.47, + "grad_norm": 2.338783727753721, + "learning_rate": 1.1404150962711416e-06, + "loss": 0.2768, + "step": 1664 + }, + { + "epoch": 0.47, + "grad_norm": 2.268969273911241, + "learning_rate": 1.1395063407876358e-06, + "loss": 0.307, + "step": 1665 + }, + { + "epoch": 0.47, + "grad_norm": 2.3939662990886927, + "learning_rate": 1.1385974677927665e-06, + "loss": 0.2903, + "step": 1666 + }, + { + "epoch": 0.47, + "grad_norm": 2.3441718507444174, + "learning_rate": 1.1376884780521116e-06, + "loss": 0.2631, + "step": 1667 + }, + { + "epoch": 0.47, + "grad_norm": 2.5386815434886256, + "learning_rate": 1.1367793723313468e-06, + "loss": 0.3002, + "step": 1668 + }, + { + "epoch": 0.47, + "grad_norm": 2.9721204904382397, + "learning_rate": 1.1358701513962454e-06, + "loss": 0.2851, + "step": 1669 + }, + { + "epoch": 0.47, + "grad_norm": 2.961021970694507, + "learning_rate": 1.1349608160126783e-06, + "loss": 0.3089, + "step": 1670 + }, + { + "epoch": 0.47, + "grad_norm": 2.281055601400845, + "learning_rate": 1.1340513669466119e-06, + "loss": 0.2629, + "step": 1671 + }, + { + "epoch": 0.47, + "grad_norm": 2.3781198642027723, + "learning_rate": 1.133141804964109e-06, + "loss": 0.3255, + "step": 1672 + }, + { + "epoch": 0.47, + "grad_norm": 2.2183111482419444, + "learning_rate": 1.1322321308313277e-06, + "loss": 0.2706, + "step": 1673 + }, + { + "epoch": 0.47, + "grad_norm": 2.251887744888779, + "learning_rate": 1.13132234531452e-06, + "loss": 0.2522, + "step": 1674 + }, + { + "epoch": 0.47, + "grad_norm": 2.3499624185539907, + "learning_rate": 1.130412449180032e-06, + "loss": 0.306, + "step": 1675 + }, + { + "epoch": 0.47, + "grad_norm": 2.428096073028077, + "learning_rate": 1.1295024431943028e-06, + "loss": 0.2778, + "step": 1676 + }, + { + "epoch": 0.48, + "grad_norm": 2.2110536927365874, + "learning_rate": 1.1285923281238646e-06, + "loss": 0.2743, + "step": 1677 + }, + { + "epoch": 0.48, + "grad_norm": 2.634455891084864, + "learning_rate": 1.1276821047353401e-06, + "loss": 0.3085, + "step": 1678 + }, + { + "epoch": 0.48, + "grad_norm": 2.8355465814347784, + "learning_rate": 1.1267717737954458e-06, + "loss": 0.2992, + "step": 1679 + }, + { + "epoch": 0.48, + "grad_norm": 2.3347658200547374, + "learning_rate": 1.1258613360709858e-06, + "loss": 0.2813, + "step": 1680 + }, + { + "epoch": 0.48, + "grad_norm": 2.491899736342909, + "learning_rate": 1.1249507923288561e-06, + "loss": 0.2773, + "step": 1681 + }, + { + "epoch": 0.48, + "grad_norm": 2.4424959895263525, + "learning_rate": 1.1240401433360417e-06, + "loss": 0.2641, + "step": 1682 + }, + { + "epoch": 0.48, + "grad_norm": 2.425874752306347, + "learning_rate": 1.1231293898596153e-06, + "loss": 0.2927, + "step": 1683 + }, + { + "epoch": 0.48, + "grad_norm": 2.373837340184451, + "learning_rate": 1.1222185326667387e-06, + "loss": 0.2625, + "step": 1684 + }, + { + "epoch": 0.48, + "grad_norm": 2.329312562974794, + "learning_rate": 1.121307572524661e-06, + "loss": 0.2582, + "step": 1685 + }, + { + "epoch": 0.48, + "grad_norm": 2.3640722972664494, + "learning_rate": 1.1203965102007173e-06, + "loss": 0.2603, + "step": 1686 + }, + { + "epoch": 0.48, + "grad_norm": 2.396425754577803, + "learning_rate": 1.1194853464623293e-06, + "loss": 0.2633, + "step": 1687 + }, + { + "epoch": 0.48, + "grad_norm": 2.381954406906367, + "learning_rate": 1.118574082077004e-06, + "loss": 0.2834, + "step": 1688 + }, + { + "epoch": 0.48, + "grad_norm": 2.3566730696052085, + "learning_rate": 1.117662717812333e-06, + "loss": 0.2863, + "step": 1689 + }, + { + "epoch": 0.48, + "grad_norm": 2.569134886147644, + "learning_rate": 1.1167512544359927e-06, + "loss": 0.3116, + "step": 1690 + }, + { + "epoch": 0.48, + "grad_norm": 2.2502650418462724, + "learning_rate": 1.115839692715742e-06, + "loss": 0.2557, + "step": 1691 + }, + { + "epoch": 0.48, + "grad_norm": 2.124319603820594, + "learning_rate": 1.1149280334194235e-06, + "loss": 0.2509, + "step": 1692 + }, + { + "epoch": 0.48, + "grad_norm": 2.3197689998035793, + "learning_rate": 1.114016277314961e-06, + "loss": 0.2843, + "step": 1693 + }, + { + "epoch": 0.48, + "grad_norm": 2.3081780964531617, + "learning_rate": 1.1131044251703615e-06, + "loss": 0.2712, + "step": 1694 + }, + { + "epoch": 0.48, + "grad_norm": 2.6681397012058037, + "learning_rate": 1.1121924777537107e-06, + "loss": 0.2887, + "step": 1695 + }, + { + "epoch": 0.48, + "grad_norm": 2.570603638430538, + "learning_rate": 1.1112804358331765e-06, + "loss": 0.2973, + "step": 1696 + }, + { + "epoch": 0.48, + "grad_norm": 2.3482739754496023, + "learning_rate": 1.1103683001770055e-06, + "loss": 0.3073, + "step": 1697 + }, + { + "epoch": 0.48, + "grad_norm": 2.5409530486963754, + "learning_rate": 1.109456071553523e-06, + "loss": 0.2721, + "step": 1698 + }, + { + "epoch": 0.48, + "grad_norm": 2.515926218025391, + "learning_rate": 1.1085437507311338e-06, + "loss": 0.3055, + "step": 1699 + }, + { + "epoch": 0.48, + "grad_norm": 2.297386792787777, + "learning_rate": 1.1076313384783182e-06, + "loss": 0.2715, + "step": 1700 + }, + { + "epoch": 0.48, + "grad_norm": 2.2796020266024217, + "learning_rate": 1.1067188355636366e-06, + "loss": 0.2703, + "step": 1701 + }, + { + "epoch": 0.48, + "grad_norm": 2.324970919155172, + "learning_rate": 1.1058062427557228e-06, + "loss": 0.2629, + "step": 1702 + }, + { + "epoch": 0.48, + "grad_norm": 2.832132949007624, + "learning_rate": 1.1048935608232878e-06, + "loss": 0.3345, + "step": 1703 + }, + { + "epoch": 0.48, + "grad_norm": 2.5880321537200777, + "learning_rate": 1.1039807905351176e-06, + "loss": 0.2845, + "step": 1704 + }, + { + "epoch": 0.48, + "grad_norm": 2.444606820574314, + "learning_rate": 1.1030679326600725e-06, + "loss": 0.2943, + "step": 1705 + }, + { + "epoch": 0.48, + "grad_norm": 2.216027659045018, + "learning_rate": 1.1021549879670864e-06, + "loss": 0.277, + "step": 1706 + }, + { + "epoch": 0.48, + "grad_norm": 2.36616902049424, + "learning_rate": 1.1012419572251663e-06, + "loss": 0.2683, + "step": 1707 + }, + { + "epoch": 0.48, + "grad_norm": 2.520596375978376, + "learning_rate": 1.1003288412033923e-06, + "loss": 0.2901, + "step": 1708 + }, + { + "epoch": 0.48, + "grad_norm": 2.3720873886683127, + "learning_rate": 1.0994156406709153e-06, + "loss": 0.2916, + "step": 1709 + }, + { + "epoch": 0.48, + "grad_norm": 2.550717392273682, + "learning_rate": 1.0985023563969584e-06, + "loss": 0.2813, + "step": 1710 + }, + { + "epoch": 0.48, + "grad_norm": 2.2948039992926934, + "learning_rate": 1.0975889891508147e-06, + "loss": 0.2802, + "step": 1711 + }, + { + "epoch": 0.49, + "grad_norm": 2.302247130619214, + "learning_rate": 1.0966755397018472e-06, + "loss": 0.2835, + "step": 1712 + }, + { + "epoch": 0.49, + "grad_norm": 2.1751534784301647, + "learning_rate": 1.0957620088194883e-06, + "loss": 0.2757, + "step": 1713 + }, + { + "epoch": 0.49, + "grad_norm": 2.4138394233404137, + "learning_rate": 1.0948483972732395e-06, + "loss": 0.2824, + "step": 1714 + }, + { + "epoch": 0.49, + "grad_norm": 2.203928367452892, + "learning_rate": 1.0939347058326681e-06, + "loss": 0.2812, + "step": 1715 + }, + { + "epoch": 0.49, + "grad_norm": 2.388362231374219, + "learning_rate": 1.0930209352674123e-06, + "loss": 0.3166, + "step": 1716 + }, + { + "epoch": 0.49, + "grad_norm": 2.3369407790264196, + "learning_rate": 1.0921070863471732e-06, + "loss": 0.2883, + "step": 1717 + }, + { + "epoch": 0.49, + "grad_norm": 2.463710226139802, + "learning_rate": 1.0911931598417209e-06, + "loss": 0.2929, + "step": 1718 + }, + { + "epoch": 0.49, + "grad_norm": 2.5950571987671425, + "learning_rate": 1.0902791565208886e-06, + "loss": 0.2898, + "step": 1719 + }, + { + "epoch": 0.49, + "grad_norm": 2.4638825288289783, + "learning_rate": 1.0893650771545756e-06, + "loss": 0.2853, + "step": 1720 + }, + { + "epoch": 0.49, + "grad_norm": 2.4506770789310464, + "learning_rate": 1.0884509225127451e-06, + "loss": 0.3009, + "step": 1721 + }, + { + "epoch": 0.49, + "grad_norm": 2.2545658493991563, + "learning_rate": 1.0875366933654231e-06, + "loss": 0.2552, + "step": 1722 + }, + { + "epoch": 0.49, + "grad_norm": 2.568482399258855, + "learning_rate": 1.0866223904826989e-06, + "loss": 0.256, + "step": 1723 + }, + { + "epoch": 0.49, + "grad_norm": 2.3860221317855297, + "learning_rate": 1.0857080146347236e-06, + "loss": 0.2599, + "step": 1724 + }, + { + "epoch": 0.49, + "grad_norm": 2.324595881561982, + "learning_rate": 1.0847935665917098e-06, + "loss": 0.2739, + "step": 1725 + }, + { + "epoch": 0.49, + "grad_norm": 2.8118642447032007, + "learning_rate": 1.0838790471239311e-06, + "loss": 0.3026, + "step": 1726 + }, + { + "epoch": 0.49, + "grad_norm": 2.384500504570064, + "learning_rate": 1.0829644570017211e-06, + "loss": 0.2669, + "step": 1727 + }, + { + "epoch": 0.49, + "grad_norm": 2.523962612237143, + "learning_rate": 1.0820497969954731e-06, + "loss": 0.295, + "step": 1728 + }, + { + "epoch": 0.49, + "grad_norm": 2.281525461610133, + "learning_rate": 1.0811350678756391e-06, + "loss": 0.2788, + "step": 1729 + }, + { + "epoch": 0.49, + "grad_norm": 2.2065386887892777, + "learning_rate": 1.0802202704127292e-06, + "loss": 0.2514, + "step": 1730 + }, + { + "epoch": 0.49, + "grad_norm": 2.335004256597714, + "learning_rate": 1.0793054053773117e-06, + "loss": 0.2886, + "step": 1731 + }, + { + "epoch": 0.49, + "grad_norm": 2.422474441053778, + "learning_rate": 1.0783904735400102e-06, + "loss": 0.2989, + "step": 1732 + }, + { + "epoch": 0.49, + "grad_norm": 2.3667128250142393, + "learning_rate": 1.0774754756715071e-06, + "loss": 0.2674, + "step": 1733 + }, + { + "epoch": 0.49, + "grad_norm": 2.4721285965550446, + "learning_rate": 1.0765604125425381e-06, + "loss": 0.275, + "step": 1734 + }, + { + "epoch": 0.49, + "grad_norm": 2.6296653925230213, + "learning_rate": 1.0756452849238953e-06, + "loss": 0.2941, + "step": 1735 + }, + { + "epoch": 0.49, + "grad_norm": 2.3439950039178763, + "learning_rate": 1.0747300935864243e-06, + "loss": 0.2858, + "step": 1736 + }, + { + "epoch": 0.49, + "grad_norm": 2.417870316163827, + "learning_rate": 1.0738148393010249e-06, + "loss": 0.2902, + "step": 1737 + }, + { + "epoch": 0.49, + "grad_norm": 2.3262702827912864, + "learning_rate": 1.0728995228386495e-06, + "loss": 0.275, + "step": 1738 + }, + { + "epoch": 0.49, + "grad_norm": 2.58841745004525, + "learning_rate": 1.0719841449703033e-06, + "loss": 0.2811, + "step": 1739 + }, + { + "epoch": 0.49, + "grad_norm": 2.380777299893438, + "learning_rate": 1.071068706467043e-06, + "loss": 0.2861, + "step": 1740 + }, + { + "epoch": 0.49, + "grad_norm": 2.4556940889456893, + "learning_rate": 1.070153208099976e-06, + "loss": 0.3134, + "step": 1741 + }, + { + "epoch": 0.49, + "grad_norm": 2.6412055647062096, + "learning_rate": 1.0692376506402613e-06, + "loss": 0.3369, + "step": 1742 + }, + { + "epoch": 0.49, + "grad_norm": 2.560785153064596, + "learning_rate": 1.068322034859106e-06, + "loss": 0.29, + "step": 1743 + }, + { + "epoch": 0.49, + "grad_norm": 2.531656496883173, + "learning_rate": 1.067406361527768e-06, + "loss": 0.2713, + "step": 1744 + }, + { + "epoch": 0.49, + "grad_norm": 2.5381548769556077, + "learning_rate": 1.0664906314175524e-06, + "loss": 0.305, + "step": 1745 + }, + { + "epoch": 0.49, + "grad_norm": 2.5206232944748304, + "learning_rate": 1.0655748452998127e-06, + "loss": 0.2925, + "step": 1746 + }, + { + "epoch": 0.5, + "grad_norm": 2.592877659877174, + "learning_rate": 1.0646590039459499e-06, + "loss": 0.3254, + "step": 1747 + }, + { + "epoch": 0.5, + "grad_norm": 2.415535301875715, + "learning_rate": 1.0637431081274107e-06, + "loss": 0.2762, + "step": 1748 + }, + { + "epoch": 0.5, + "grad_norm": 2.450107923002262, + "learning_rate": 1.0628271586156878e-06, + "loss": 0.2704, + "step": 1749 + }, + { + "epoch": 0.5, + "grad_norm": 2.414014331879564, + "learning_rate": 1.0619111561823206e-06, + "loss": 0.2876, + "step": 1750 + }, + { + "epoch": 0.5, + "grad_norm": 2.3011402092648536, + "learning_rate": 1.0609951015988904e-06, + "loss": 0.2916, + "step": 1751 + }, + { + "epoch": 0.5, + "grad_norm": 2.296789517012798, + "learning_rate": 1.0600789956370253e-06, + "loss": 0.285, + "step": 1752 + }, + { + "epoch": 0.5, + "grad_norm": 2.1437783622474975, + "learning_rate": 1.0591628390683945e-06, + "loss": 0.259, + "step": 1753 + }, + { + "epoch": 0.5, + "grad_norm": 2.396711139395835, + "learning_rate": 1.0582466326647109e-06, + "loss": 0.2865, + "step": 1754 + }, + { + "epoch": 0.5, + "grad_norm": 2.411830911114145, + "learning_rate": 1.0573303771977288e-06, + "loss": 0.2833, + "step": 1755 + }, + { + "epoch": 0.5, + "grad_norm": 2.40099593001122, + "learning_rate": 1.0564140734392445e-06, + "loss": 0.2596, + "step": 1756 + }, + { + "epoch": 0.5, + "grad_norm": 2.3296913537453428, + "learning_rate": 1.0554977221610948e-06, + "loss": 0.2708, + "step": 1757 + }, + { + "epoch": 0.5, + "grad_norm": 2.536138383027427, + "learning_rate": 1.0545813241351558e-06, + "loss": 0.2789, + "step": 1758 + }, + { + "epoch": 0.5, + "grad_norm": 2.3747634534579163, + "learning_rate": 1.053664880133344e-06, + "loss": 0.2982, + "step": 1759 + }, + { + "epoch": 0.5, + "grad_norm": 2.5618665119525854, + "learning_rate": 1.0527483909276142e-06, + "loss": 0.317, + "step": 1760 + }, + { + "epoch": 0.5, + "grad_norm": 2.4399883429176024, + "learning_rate": 1.051831857289959e-06, + "loss": 0.2937, + "step": 1761 + }, + { + "epoch": 0.5, + "grad_norm": 2.466100915953789, + "learning_rate": 1.0509152799924084e-06, + "loss": 0.3105, + "step": 1762 + }, + { + "epoch": 0.5, + "grad_norm": 2.2869814441583154, + "learning_rate": 1.0499986598070301e-06, + "loss": 0.2699, + "step": 1763 + }, + { + "epoch": 0.5, + "grad_norm": 2.3649098477306936, + "learning_rate": 1.0490819975059267e-06, + "loss": 0.2624, + "step": 1764 + }, + { + "epoch": 0.5, + "grad_norm": 2.2203716949643275, + "learning_rate": 1.0481652938612372e-06, + "loss": 0.2749, + "step": 1765 + }, + { + "epoch": 0.5, + "grad_norm": 2.218764342181374, + "learning_rate": 1.0472485496451347e-06, + "loss": 0.2587, + "step": 1766 + }, + { + "epoch": 0.5, + "grad_norm": 2.1979106434585445, + "learning_rate": 1.0463317656298272e-06, + "loss": 0.2564, + "step": 1767 + }, + { + "epoch": 0.5, + "grad_norm": 2.598649617483703, + "learning_rate": 1.0454149425875558e-06, + "loss": 0.307, + "step": 1768 + }, + { + "epoch": 0.5, + "grad_norm": 2.4256848758871676, + "learning_rate": 1.0444980812905944e-06, + "loss": 0.2598, + "step": 1769 + }, + { + "epoch": 0.5, + "grad_norm": 2.799918173511259, + "learning_rate": 1.0435811825112496e-06, + "loss": 0.296, + "step": 1770 + }, + { + "epoch": 0.5, + "grad_norm": 2.825152276633744, + "learning_rate": 1.0426642470218585e-06, + "loss": 0.2572, + "step": 1771 + }, + { + "epoch": 0.5, + "grad_norm": 2.2875491695406134, + "learning_rate": 1.0417472755947908e-06, + "loss": 0.2697, + "step": 1772 + }, + { + "epoch": 0.5, + "grad_norm": 2.338928267426616, + "learning_rate": 1.0408302690024446e-06, + "loss": 0.2905, + "step": 1773 + }, + { + "epoch": 0.5, + "grad_norm": 2.377571283990241, + "learning_rate": 1.0399132280172493e-06, + "loss": 0.2888, + "step": 1774 + }, + { + "epoch": 0.5, + "grad_norm": 2.494627386822755, + "learning_rate": 1.038996153411662e-06, + "loss": 0.3092, + "step": 1775 + }, + { + "epoch": 0.5, + "grad_norm": 2.4803657304664886, + "learning_rate": 1.0380790459581694e-06, + "loss": 0.2933, + "step": 1776 + }, + { + "epoch": 0.5, + "grad_norm": 2.599391249742488, + "learning_rate": 1.0371619064292842e-06, + "loss": 0.2987, + "step": 1777 + }, + { + "epoch": 0.5, + "grad_norm": 2.4867542348412894, + "learning_rate": 1.0362447355975475e-06, + "loss": 0.2618, + "step": 1778 + }, + { + "epoch": 0.5, + "grad_norm": 2.345277800331825, + "learning_rate": 1.0353275342355262e-06, + "loss": 0.2586, + "step": 1779 + }, + { + "epoch": 0.5, + "grad_norm": 2.2953003810671726, + "learning_rate": 1.034410303115813e-06, + "loss": 0.2999, + "step": 1780 + }, + { + "epoch": 0.5, + "grad_norm": 2.40069451279197, + "learning_rate": 1.0334930430110256e-06, + "loss": 0.2897, + "step": 1781 + }, + { + "epoch": 0.5, + "grad_norm": 2.4034742727959304, + "learning_rate": 1.0325757546938066e-06, + "loss": 0.3252, + "step": 1782 + }, + { + "epoch": 0.51, + "grad_norm": 2.6249771671947753, + "learning_rate": 1.0316584389368212e-06, + "loss": 0.2917, + "step": 1783 + }, + { + "epoch": 0.51, + "grad_norm": 2.521429682335649, + "learning_rate": 1.0307410965127594e-06, + "loss": 0.266, + "step": 1784 + }, + { + "epoch": 0.51, + "grad_norm": 2.4288395224203296, + "learning_rate": 1.029823728194332e-06, + "loss": 0.2769, + "step": 1785 + }, + { + "epoch": 0.51, + "grad_norm": 2.278568446029088, + "learning_rate": 1.0289063347542726e-06, + "loss": 0.2921, + "step": 1786 + }, + { + "epoch": 0.51, + "grad_norm": 2.3818620321327395, + "learning_rate": 1.0279889169653359e-06, + "loss": 0.2805, + "step": 1787 + }, + { + "epoch": 0.51, + "grad_norm": 2.438023465912851, + "learning_rate": 1.0270714756002965e-06, + "loss": 0.3057, + "step": 1788 + }, + { + "epoch": 0.51, + "grad_norm": 2.2677418993032314, + "learning_rate": 1.0261540114319497e-06, + "loss": 0.3, + "step": 1789 + }, + { + "epoch": 0.51, + "grad_norm": 2.204566597722765, + "learning_rate": 1.0252365252331092e-06, + "loss": 0.2801, + "step": 1790 + }, + { + "epoch": 0.51, + "grad_norm": 2.4901371783556514, + "learning_rate": 1.0243190177766084e-06, + "loss": 0.2966, + "step": 1791 + }, + { + "epoch": 0.51, + "grad_norm": 2.3665442696463894, + "learning_rate": 1.0234014898352965e-06, + "loss": 0.2915, + "step": 1792 + }, + { + "epoch": 0.51, + "grad_norm": 2.3177504690621924, + "learning_rate": 1.0224839421820426e-06, + "loss": 0.2512, + "step": 1793 + }, + { + "epoch": 0.51, + "grad_norm": 2.423196847728315, + "learning_rate": 1.0215663755897306e-06, + "loss": 0.2942, + "step": 1794 + }, + { + "epoch": 0.51, + "grad_norm": 2.3565389077019434, + "learning_rate": 1.0206487908312607e-06, + "loss": 0.2896, + "step": 1795 + }, + { + "epoch": 0.51, + "grad_norm": 2.5055170626548535, + "learning_rate": 1.0197311886795485e-06, + "loss": 0.2973, + "step": 1796 + }, + { + "epoch": 0.51, + "grad_norm": 2.303580120594449, + "learning_rate": 1.018813569907525e-06, + "loss": 0.2792, + "step": 1797 + }, + { + "epoch": 0.51, + "grad_norm": 2.2318800840205935, + "learning_rate": 1.0178959352881335e-06, + "loss": 0.2664, + "step": 1798 + }, + { + "epoch": 0.51, + "grad_norm": 2.404582126143902, + "learning_rate": 1.0169782855943326e-06, + "loss": 0.28, + "step": 1799 + }, + { + "epoch": 0.51, + "grad_norm": 2.3223672277058265, + "learning_rate": 1.016060621599092e-06, + "loss": 0.2784, + "step": 1800 + }, + { + "epoch": 0.51, + "grad_norm": 2.211504993025335, + "learning_rate": 1.0151429440753948e-06, + "loss": 0.2583, + "step": 1801 + }, + { + "epoch": 0.51, + "grad_norm": 2.1652312860171645, + "learning_rate": 1.0142252537962338e-06, + "loss": 0.2483, + "step": 1802 + }, + { + "epoch": 0.51, + "grad_norm": 2.5690040507347525, + "learning_rate": 1.0133075515346147e-06, + "loss": 0.3247, + "step": 1803 + }, + { + "epoch": 0.51, + "grad_norm": 2.3512850501355937, + "learning_rate": 1.0123898380635514e-06, + "loss": 0.2846, + "step": 1804 + }, + { + "epoch": 0.51, + "grad_norm": 2.5307997781444462, + "learning_rate": 1.0114721141560678e-06, + "loss": 0.3021, + "step": 1805 + }, + { + "epoch": 0.51, + "grad_norm": 2.3755600706842928, + "learning_rate": 1.0105543805851975e-06, + "loss": 0.2707, + "step": 1806 + }, + { + "epoch": 0.51, + "grad_norm": 2.510828516893168, + "learning_rate": 1.0096366381239806e-06, + "loss": 0.2939, + "step": 1807 + }, + { + "epoch": 0.51, + "grad_norm": 2.2898278280269504, + "learning_rate": 1.0087188875454668e-06, + "loss": 0.2451, + "step": 1808 + }, + { + "epoch": 0.51, + "grad_norm": 2.449184623548092, + "learning_rate": 1.0078011296227103e-06, + "loss": 0.2667, + "step": 1809 + }, + { + "epoch": 0.51, + "grad_norm": 2.4197885773422607, + "learning_rate": 1.0068833651287733e-06, + "loss": 0.2951, + "step": 1810 + }, + { + "epoch": 0.51, + "grad_norm": 2.536061699800912, + "learning_rate": 1.0059655948367228e-06, + "loss": 0.2738, + "step": 1811 + }, + { + "epoch": 0.51, + "grad_norm": 2.29823211800711, + "learning_rate": 1.0050478195196302e-06, + "loss": 0.3026, + "step": 1812 + }, + { + "epoch": 0.51, + "grad_norm": 2.131609519143086, + "learning_rate": 1.0041300399505724e-06, + "loss": 0.2666, + "step": 1813 + }, + { + "epoch": 0.51, + "grad_norm": 2.2837693911189096, + "learning_rate": 1.0032122569026281e-06, + "loss": 0.2944, + "step": 1814 + }, + { + "epoch": 0.51, + "grad_norm": 2.382827289671159, + "learning_rate": 1.0022944711488816e-06, + "loss": 0.2907, + "step": 1815 + }, + { + "epoch": 0.51, + "grad_norm": 2.3398481320326656, + "learning_rate": 1.0013766834624167e-06, + "loss": 0.2928, + "step": 1816 + }, + { + "epoch": 0.51, + "grad_norm": 2.3567810281368526, + "learning_rate": 1.0004588946163202e-06, + "loss": 0.278, + "step": 1817 + }, + { + "epoch": 0.52, + "grad_norm": 2.3774697747428126, + "learning_rate": 9.995411053836797e-07, + "loss": 0.317, + "step": 1818 + }, + { + "epoch": 0.52, + "grad_norm": 2.5589790947949296, + "learning_rate": 9.986233165375836e-07, + "loss": 0.2779, + "step": 1819 + }, + { + "epoch": 0.52, + "grad_norm": 2.408172535398655, + "learning_rate": 9.977055288511181e-07, + "loss": 0.2812, + "step": 1820 + }, + { + "epoch": 0.52, + "grad_norm": 3.1184684834613012, + "learning_rate": 9.967877430973716e-07, + "loss": 0.296, + "step": 1821 + }, + { + "epoch": 0.52, + "grad_norm": 2.3555209555113175, + "learning_rate": 9.958699600494277e-07, + "loss": 0.3002, + "step": 1822 + }, + { + "epoch": 0.52, + "grad_norm": 2.2792496340161654, + "learning_rate": 9.949521804803697e-07, + "loss": 0.2675, + "step": 1823 + }, + { + "epoch": 0.52, + "grad_norm": 2.444601373370277, + "learning_rate": 9.940344051632776e-07, + "loss": 0.2925, + "step": 1824 + }, + { + "epoch": 0.52, + "grad_norm": 2.4852374720528982, + "learning_rate": 9.931166348712266e-07, + "loss": 0.2627, + "step": 1825 + }, + { + "epoch": 0.52, + "grad_norm": 2.7780433494306975, + "learning_rate": 9.921988703772896e-07, + "loss": 0.3006, + "step": 1826 + }, + { + "epoch": 0.52, + "grad_norm": 2.446401074333341, + "learning_rate": 9.912811124545332e-07, + "loss": 0.2667, + "step": 1827 + }, + { + "epoch": 0.52, + "grad_norm": 2.3409932053933957, + "learning_rate": 9.903633618760193e-07, + "loss": 0.2746, + "step": 1828 + }, + { + "epoch": 0.52, + "grad_norm": 2.407708241012865, + "learning_rate": 9.894456194148028e-07, + "loss": 0.2602, + "step": 1829 + }, + { + "epoch": 0.52, + "grad_norm": 2.22125022141639, + "learning_rate": 9.885278858439321e-07, + "loss": 0.2698, + "step": 1830 + }, + { + "epoch": 0.52, + "grad_norm": 2.6680271799964284, + "learning_rate": 9.876101619364487e-07, + "loss": 0.2795, + "step": 1831 + }, + { + "epoch": 0.52, + "grad_norm": 2.4811890516289075, + "learning_rate": 9.866924484653855e-07, + "loss": 0.2979, + "step": 1832 + }, + { + "epoch": 0.52, + "grad_norm": 2.2735370919447107, + "learning_rate": 9.85774746203766e-07, + "loss": 0.2601, + "step": 1833 + }, + { + "epoch": 0.52, + "grad_norm": 2.3429662880966524, + "learning_rate": 9.848570559246053e-07, + "loss": 0.2813, + "step": 1834 + }, + { + "epoch": 0.52, + "grad_norm": 2.449821577681452, + "learning_rate": 9.839393784009076e-07, + "loss": 0.2968, + "step": 1835 + }, + { + "epoch": 0.52, + "grad_norm": 2.302224969895475, + "learning_rate": 9.830217144056673e-07, + "loss": 0.2868, + "step": 1836 + }, + { + "epoch": 0.52, + "grad_norm": 3.6831658109868886, + "learning_rate": 9.821040647118664e-07, + "loss": 0.2874, + "step": 1837 + }, + { + "epoch": 0.52, + "grad_norm": 2.3434640511937035, + "learning_rate": 9.811864300924752e-07, + "loss": 0.2616, + "step": 1838 + }, + { + "epoch": 0.52, + "grad_norm": 2.337482263937847, + "learning_rate": 9.802688113204516e-07, + "loss": 0.2761, + "step": 1839 + }, + { + "epoch": 0.52, + "grad_norm": 3.089170981209531, + "learning_rate": 9.793512091687394e-07, + "loss": 0.2862, + "step": 1840 + }, + { + "epoch": 0.52, + "grad_norm": 2.4252580066175775, + "learning_rate": 9.784336244102695e-07, + "loss": 0.3009, + "step": 1841 + }, + { + "epoch": 0.52, + "grad_norm": 2.259680761735452, + "learning_rate": 9.775160578179573e-07, + "loss": 0.297, + "step": 1842 + }, + { + "epoch": 0.52, + "grad_norm": 2.325015796073353, + "learning_rate": 9.765985101647034e-07, + "loss": 0.3074, + "step": 1843 + }, + { + "epoch": 0.52, + "grad_norm": 2.298833983394814, + "learning_rate": 9.75680982223392e-07, + "loss": 0.2627, + "step": 1844 + }, + { + "epoch": 0.52, + "grad_norm": 2.5713981868206135, + "learning_rate": 9.747634747668905e-07, + "loss": 0.2899, + "step": 1845 + }, + { + "epoch": 0.52, + "grad_norm": 2.419747640232926, + "learning_rate": 9.738459885680502e-07, + "loss": 0.283, + "step": 1846 + }, + { + "epoch": 0.52, + "grad_norm": 2.355604327545243, + "learning_rate": 9.729285243997036e-07, + "loss": 0.2893, + "step": 1847 + }, + { + "epoch": 0.52, + "grad_norm": 2.8604074410743534, + "learning_rate": 9.720110830346642e-07, + "loss": 0.2752, + "step": 1848 + }, + { + "epoch": 0.52, + "grad_norm": 2.351280307326712, + "learning_rate": 9.710936652457275e-07, + "loss": 0.2744, + "step": 1849 + }, + { + "epoch": 0.52, + "grad_norm": 2.392326574811654, + "learning_rate": 9.70176271805668e-07, + "loss": 0.2886, + "step": 1850 + }, + { + "epoch": 0.52, + "grad_norm": 2.4560054450403612, + "learning_rate": 9.692589034872408e-07, + "loss": 0.2949, + "step": 1851 + }, + { + "epoch": 0.52, + "grad_norm": 2.333345087894509, + "learning_rate": 9.683415610631787e-07, + "loss": 0.3006, + "step": 1852 + }, + { + "epoch": 0.53, + "grad_norm": 2.384442083807228, + "learning_rate": 9.674242453061935e-07, + "loss": 0.2591, + "step": 1853 + }, + { + "epoch": 0.53, + "grad_norm": 2.3865601767906246, + "learning_rate": 9.66506956988974e-07, + "loss": 0.2539, + "step": 1854 + }, + { + "epoch": 0.53, + "grad_norm": 2.3401087584534266, + "learning_rate": 9.655896968841872e-07, + "loss": 0.2816, + "step": 1855 + }, + { + "epoch": 0.53, + "grad_norm": 2.4510382444906633, + "learning_rate": 9.64672465764474e-07, + "loss": 0.2776, + "step": 1856 + }, + { + "epoch": 0.53, + "grad_norm": 2.6222410730947283, + "learning_rate": 9.637552644024526e-07, + "loss": 0.2724, + "step": 1857 + }, + { + "epoch": 0.53, + "grad_norm": 2.0489271774956777, + "learning_rate": 9.62838093570716e-07, + "loss": 0.2194, + "step": 1858 + }, + { + "epoch": 0.53, + "grad_norm": 2.4784055906636544, + "learning_rate": 9.619209540418306e-07, + "loss": 0.2921, + "step": 1859 + }, + { + "epoch": 0.53, + "grad_norm": 2.1713605870866304, + "learning_rate": 9.610038465883376e-07, + "loss": 0.2496, + "step": 1860 + }, + { + "epoch": 0.53, + "grad_norm": 2.39693101648829, + "learning_rate": 9.600867719827506e-07, + "loss": 0.2992, + "step": 1861 + }, + { + "epoch": 0.53, + "grad_norm": 2.229116705886729, + "learning_rate": 9.591697309975555e-07, + "loss": 0.2797, + "step": 1862 + }, + { + "epoch": 0.53, + "grad_norm": 2.4326025652791423, + "learning_rate": 9.582527244052094e-07, + "loss": 0.2756, + "step": 1863 + }, + { + "epoch": 0.53, + "grad_norm": 2.3303058887008707, + "learning_rate": 9.573357529781414e-07, + "loss": 0.2905, + "step": 1864 + }, + { + "epoch": 0.53, + "grad_norm": 2.234063548100955, + "learning_rate": 9.564188174887503e-07, + "loss": 0.266, + "step": 1865 + }, + { + "epoch": 0.53, + "grad_norm": 2.2828800276675207, + "learning_rate": 9.555019187094057e-07, + "loss": 0.2989, + "step": 1866 + }, + { + "epoch": 0.53, + "grad_norm": 2.6430152997452225, + "learning_rate": 9.545850574124443e-07, + "loss": 0.2702, + "step": 1867 + }, + { + "epoch": 0.53, + "grad_norm": 2.4975260565196784, + "learning_rate": 9.536682343701728e-07, + "loss": 0.29, + "step": 1868 + }, + { + "epoch": 0.53, + "grad_norm": 2.2169356324072207, + "learning_rate": 9.527514503548651e-07, + "loss": 0.2595, + "step": 1869 + }, + { + "epoch": 0.53, + "grad_norm": 2.4171320151902154, + "learning_rate": 9.518347061387627e-07, + "loss": 0.288, + "step": 1870 + }, + { + "epoch": 0.53, + "grad_norm": 2.4017512468814295, + "learning_rate": 9.509180024940734e-07, + "loss": 0.3166, + "step": 1871 + }, + { + "epoch": 0.53, + "grad_norm": 2.417658702684582, + "learning_rate": 9.500013401929701e-07, + "loss": 0.2598, + "step": 1872 + }, + { + "epoch": 0.53, + "grad_norm": 2.4955418694217633, + "learning_rate": 9.490847200075917e-07, + "loss": 0.3007, + "step": 1873 + }, + { + "epoch": 0.53, + "grad_norm": 2.407401323867513, + "learning_rate": 9.48168142710041e-07, + "loss": 0.2828, + "step": 1874 + }, + { + "epoch": 0.53, + "grad_norm": 2.3317221443277116, + "learning_rate": 9.472516090723859e-07, + "loss": 0.268, + "step": 1875 + }, + { + "epoch": 0.53, + "grad_norm": 2.633088826167416, + "learning_rate": 9.463351198666559e-07, + "loss": 0.3279, + "step": 1876 + }, + { + "epoch": 0.53, + "grad_norm": 2.4694512144741725, + "learning_rate": 9.454186758648443e-07, + "loss": 0.3348, + "step": 1877 + }, + { + "epoch": 0.53, + "grad_norm": 2.427311040283762, + "learning_rate": 9.445022778389056e-07, + "loss": 0.2883, + "step": 1878 + }, + { + "epoch": 0.53, + "grad_norm": 2.254665154293914, + "learning_rate": 9.435859265607554e-07, + "loss": 0.2615, + "step": 1879 + }, + { + "epoch": 0.53, + "grad_norm": 2.4736029632574907, + "learning_rate": 9.426696228022713e-07, + "loss": 0.2653, + "step": 1880 + }, + { + "epoch": 0.53, + "grad_norm": 2.358484562573563, + "learning_rate": 9.417533673352893e-07, + "loss": 0.2898, + "step": 1881 + }, + { + "epoch": 0.53, + "grad_norm": 2.3562987141421656, + "learning_rate": 9.408371609316058e-07, + "loss": 0.2452, + "step": 1882 + }, + { + "epoch": 0.53, + "grad_norm": 2.256556790697905, + "learning_rate": 9.39921004362975e-07, + "loss": 0.2823, + "step": 1883 + }, + { + "epoch": 0.53, + "grad_norm": 2.5861784387547755, + "learning_rate": 9.390048984011094e-07, + "loss": 0.2502, + "step": 1884 + }, + { + "epoch": 0.53, + "grad_norm": 2.5766242409891094, + "learning_rate": 9.380888438176795e-07, + "loss": 0.3105, + "step": 1885 + }, + { + "epoch": 0.53, + "grad_norm": 2.4315420005305888, + "learning_rate": 9.37172841384312e-07, + "loss": 0.282, + "step": 1886 + }, + { + "epoch": 0.53, + "grad_norm": 2.7588805558007934, + "learning_rate": 9.362568918725895e-07, + "loss": 0.277, + "step": 1887 + }, + { + "epoch": 0.53, + "grad_norm": 2.420971784915736, + "learning_rate": 9.353409960540505e-07, + "loss": 0.3012, + "step": 1888 + }, + { + "epoch": 0.54, + "grad_norm": 2.2747877360470765, + "learning_rate": 9.344251547001871e-07, + "loss": 0.2797, + "step": 1889 + }, + { + "epoch": 0.54, + "grad_norm": 2.4181692064226716, + "learning_rate": 9.335093685824476e-07, + "loss": 0.2631, + "step": 1890 + }, + { + "epoch": 0.54, + "grad_norm": 2.340474100192766, + "learning_rate": 9.325936384722321e-07, + "loss": 0.2648, + "step": 1891 + }, + { + "epoch": 0.54, + "grad_norm": 2.4228559382516854, + "learning_rate": 9.316779651408939e-07, + "loss": 0.2925, + "step": 1892 + }, + { + "epoch": 0.54, + "grad_norm": 2.523273040640516, + "learning_rate": 9.307623493597387e-07, + "loss": 0.2793, + "step": 1893 + }, + { + "epoch": 0.54, + "grad_norm": 2.7145037184668444, + "learning_rate": 9.29846791900024e-07, + "loss": 0.315, + "step": 1894 + }, + { + "epoch": 0.54, + "grad_norm": 2.4016932182830613, + "learning_rate": 9.289312935329572e-07, + "loss": 0.2689, + "step": 1895 + }, + { + "epoch": 0.54, + "grad_norm": 2.5004609561755666, + "learning_rate": 9.280158550296968e-07, + "loss": 0.2841, + "step": 1896 + }, + { + "epoch": 0.54, + "grad_norm": 2.2912353853003617, + "learning_rate": 9.271004771613508e-07, + "loss": 0.2632, + "step": 1897 + }, + { + "epoch": 0.54, + "grad_norm": 2.490373208906252, + "learning_rate": 9.261851606989753e-07, + "loss": 0.2796, + "step": 1898 + }, + { + "epoch": 0.54, + "grad_norm": 2.3708068137158524, + "learning_rate": 9.252699064135758e-07, + "loss": 0.2935, + "step": 1899 + }, + { + "epoch": 0.54, + "grad_norm": 2.211291701138569, + "learning_rate": 9.243547150761046e-07, + "loss": 0.2753, + "step": 1900 + }, + { + "epoch": 0.54, + "grad_norm": 2.310505963420102, + "learning_rate": 9.23439587457462e-07, + "loss": 0.2807, + "step": 1901 + }, + { + "epoch": 0.54, + "grad_norm": 2.2820307173012457, + "learning_rate": 9.22524524328493e-07, + "loss": 0.2828, + "step": 1902 + }, + { + "epoch": 0.54, + "grad_norm": 2.1476374326857575, + "learning_rate": 9.216095264599894e-07, + "loss": 0.2673, + "step": 1903 + }, + { + "epoch": 0.54, + "grad_norm": 2.4323240408845486, + "learning_rate": 9.206945946226883e-07, + "loss": 0.29, + "step": 1904 + }, + { + "epoch": 0.54, + "grad_norm": 2.6627412851874475, + "learning_rate": 9.197797295872708e-07, + "loss": 0.2769, + "step": 1905 + }, + { + "epoch": 0.54, + "grad_norm": 2.530655011578067, + "learning_rate": 9.188649321243609e-07, + "loss": 0.3114, + "step": 1906 + }, + { + "epoch": 0.54, + "grad_norm": 2.3937515846923914, + "learning_rate": 9.179502030045269e-07, + "loss": 0.2785, + "step": 1907 + }, + { + "epoch": 0.54, + "grad_norm": 2.323366450374401, + "learning_rate": 9.170355429982787e-07, + "loss": 0.282, + "step": 1908 + }, + { + "epoch": 0.54, + "grad_norm": 2.354831302609529, + "learning_rate": 9.161209528760689e-07, + "loss": 0.2577, + "step": 1909 + }, + { + "epoch": 0.54, + "grad_norm": 2.71004074994221, + "learning_rate": 9.152064334082903e-07, + "loss": 0.2879, + "step": 1910 + }, + { + "epoch": 0.54, + "grad_norm": 2.4742529942991918, + "learning_rate": 9.142919853652765e-07, + "loss": 0.2716, + "step": 1911 + }, + { + "epoch": 0.54, + "grad_norm": 2.1048299871316023, + "learning_rate": 9.133776095173013e-07, + "loss": 0.2557, + "step": 1912 + }, + { + "epoch": 0.54, + "grad_norm": 2.578248531071737, + "learning_rate": 9.124633066345768e-07, + "loss": 0.2945, + "step": 1913 + }, + { + "epoch": 0.54, + "grad_norm": 2.916797832424238, + "learning_rate": 9.115490774872549e-07, + "loss": 0.2802, + "step": 1914 + }, + { + "epoch": 0.54, + "grad_norm": 2.392195902194485, + "learning_rate": 9.106349228454242e-07, + "loss": 0.287, + "step": 1915 + }, + { + "epoch": 0.54, + "grad_norm": 2.37893310147143, + "learning_rate": 9.097208434791116e-07, + "loss": 0.2834, + "step": 1916 + }, + { + "epoch": 0.54, + "grad_norm": 2.287075599664048, + "learning_rate": 9.088068401582795e-07, + "loss": 0.2554, + "step": 1917 + }, + { + "epoch": 0.54, + "grad_norm": 2.416346774146093, + "learning_rate": 9.078929136528267e-07, + "loss": 0.2581, + "step": 1918 + }, + { + "epoch": 0.54, + "grad_norm": 2.353920820538036, + "learning_rate": 9.069790647325878e-07, + "loss": 0.2793, + "step": 1919 + }, + { + "epoch": 0.54, + "grad_norm": 2.3798498329467104, + "learning_rate": 9.060652941673317e-07, + "loss": 0.2813, + "step": 1920 + }, + { + "epoch": 0.54, + "grad_norm": 2.4914236732800843, + "learning_rate": 9.05151602726761e-07, + "loss": 0.3002, + "step": 1921 + }, + { + "epoch": 0.54, + "grad_norm": 2.4316041066701413, + "learning_rate": 9.042379911805116e-07, + "loss": 0.2804, + "step": 1922 + }, + { + "epoch": 0.54, + "grad_norm": 2.639715234179413, + "learning_rate": 9.033244602981525e-07, + "loss": 0.2862, + "step": 1923 + }, + { + "epoch": 0.55, + "grad_norm": 2.241581435626847, + "learning_rate": 9.024110108491853e-07, + "loss": 0.2774, + "step": 1924 + }, + { + "epoch": 0.55, + "grad_norm": 2.856997759615627, + "learning_rate": 9.014976436030416e-07, + "loss": 0.2824, + "step": 1925 + }, + { + "epoch": 0.55, + "grad_norm": 2.2585837498129946, + "learning_rate": 9.005843593290847e-07, + "loss": 0.2812, + "step": 1926 + }, + { + "epoch": 0.55, + "grad_norm": 2.6046519639500647, + "learning_rate": 8.996711587966077e-07, + "loss": 0.2876, + "step": 1927 + }, + { + "epoch": 0.55, + "grad_norm": 2.4420328746980484, + "learning_rate": 8.987580427748335e-07, + "loss": 0.2839, + "step": 1928 + }, + { + "epoch": 0.55, + "grad_norm": 2.3624950251203214, + "learning_rate": 8.978450120329137e-07, + "loss": 0.3064, + "step": 1929 + }, + { + "epoch": 0.55, + "grad_norm": 2.3272518802728523, + "learning_rate": 8.969320673399276e-07, + "loss": 0.2786, + "step": 1930 + }, + { + "epoch": 0.55, + "grad_norm": 2.2731877501718016, + "learning_rate": 8.960192094648826e-07, + "loss": 0.2765, + "step": 1931 + }, + { + "epoch": 0.55, + "grad_norm": 2.419747659121121, + "learning_rate": 8.951064391767119e-07, + "loss": 0.2528, + "step": 1932 + }, + { + "epoch": 0.55, + "grad_norm": 2.442953839847466, + "learning_rate": 8.941937572442773e-07, + "loss": 0.2231, + "step": 1933 + }, + { + "epoch": 0.55, + "grad_norm": 2.387408496361725, + "learning_rate": 8.932811644363635e-07, + "loss": 0.2664, + "step": 1934 + }, + { + "epoch": 0.55, + "grad_norm": 2.4922207588724437, + "learning_rate": 8.923686615216816e-07, + "loss": 0.321, + "step": 1935 + }, + { + "epoch": 0.55, + "grad_norm": 2.4252013919135114, + "learning_rate": 8.914562492688666e-07, + "loss": 0.3129, + "step": 1936 + }, + { + "epoch": 0.55, + "grad_norm": 2.1682085963308153, + "learning_rate": 8.905439284464769e-07, + "loss": 0.2873, + "step": 1937 + }, + { + "epoch": 0.55, + "grad_norm": 2.416834129768717, + "learning_rate": 8.896316998229946e-07, + "loss": 0.2461, + "step": 1938 + }, + { + "epoch": 0.55, + "grad_norm": 2.195335513474169, + "learning_rate": 8.887195641668234e-07, + "loss": 0.2644, + "step": 1939 + }, + { + "epoch": 0.55, + "grad_norm": 2.2883878593039366, + "learning_rate": 8.878075222462895e-07, + "loss": 0.2704, + "step": 1940 + }, + { + "epoch": 0.55, + "grad_norm": 2.787774022686844, + "learning_rate": 8.86895574829639e-07, + "loss": 0.2796, + "step": 1941 + }, + { + "epoch": 0.55, + "grad_norm": 2.3051247213959707, + "learning_rate": 8.859837226850388e-07, + "loss": 0.2558, + "step": 1942 + }, + { + "epoch": 0.55, + "grad_norm": 2.3063454413012825, + "learning_rate": 8.850719665805766e-07, + "loss": 0.2837, + "step": 1943 + }, + { + "epoch": 0.55, + "grad_norm": 2.3273022288303054, + "learning_rate": 8.841603072842581e-07, + "loss": 0.2783, + "step": 1944 + }, + { + "epoch": 0.55, + "grad_norm": 2.255821650288142, + "learning_rate": 8.832487455640074e-07, + "loss": 0.2783, + "step": 1945 + }, + { + "epoch": 0.55, + "grad_norm": 2.4717382690009164, + "learning_rate": 8.823372821876671e-07, + "loss": 0.2977, + "step": 1946 + }, + { + "epoch": 0.55, + "grad_norm": 2.257346400197328, + "learning_rate": 8.814259179229959e-07, + "loss": 0.2843, + "step": 1947 + }, + { + "epoch": 0.55, + "grad_norm": 2.222491787939881, + "learning_rate": 8.805146535376708e-07, + "loss": 0.2756, + "step": 1948 + }, + { + "epoch": 0.55, + "grad_norm": 2.7443046514917997, + "learning_rate": 8.796034897992828e-07, + "loss": 0.3251, + "step": 1949 + }, + { + "epoch": 0.55, + "grad_norm": 2.3189270403797373, + "learning_rate": 8.78692427475339e-07, + "loss": 0.2681, + "step": 1950 + }, + { + "epoch": 0.55, + "grad_norm": 2.319905410278452, + "learning_rate": 8.777814673332614e-07, + "loss": 0.2999, + "step": 1951 + }, + { + "epoch": 0.55, + "grad_norm": 2.3315871825933216, + "learning_rate": 8.768706101403847e-07, + "loss": 0.2713, + "step": 1952 + }, + { + "epoch": 0.55, + "grad_norm": 2.585690032496667, + "learning_rate": 8.759598566639586e-07, + "loss": 0.3387, + "step": 1953 + }, + { + "epoch": 0.55, + "grad_norm": 2.2257813847959205, + "learning_rate": 8.750492076711439e-07, + "loss": 0.2633, + "step": 1954 + }, + { + "epoch": 0.55, + "grad_norm": 2.3438552082983675, + "learning_rate": 8.741386639290144e-07, + "loss": 0.2721, + "step": 1955 + }, + { + "epoch": 0.55, + "grad_norm": 2.381209291439057, + "learning_rate": 8.732282262045545e-07, + "loss": 0.294, + "step": 1956 + }, + { + "epoch": 0.55, + "grad_norm": 2.383293998015045, + "learning_rate": 8.723178952646595e-07, + "loss": 0.2625, + "step": 1957 + }, + { + "epoch": 0.55, + "grad_norm": 2.3888215246027644, + "learning_rate": 8.714076718761355e-07, + "loss": 0.2833, + "step": 1958 + }, + { + "epoch": 0.56, + "grad_norm": 2.484679974607535, + "learning_rate": 8.704975568056974e-07, + "loss": 0.2762, + "step": 1959 + }, + { + "epoch": 0.56, + "grad_norm": 2.7262266793563943, + "learning_rate": 8.695875508199682e-07, + "loss": 0.2734, + "step": 1960 + }, + { + "epoch": 0.56, + "grad_norm": 2.4748011500503755, + "learning_rate": 8.686776546854799e-07, + "loss": 0.2844, + "step": 1961 + }, + { + "epoch": 0.56, + "grad_norm": 2.4158474466530744, + "learning_rate": 8.677678691686721e-07, + "loss": 0.2933, + "step": 1962 + }, + { + "epoch": 0.56, + "grad_norm": 2.365395676527773, + "learning_rate": 8.668581950358909e-07, + "loss": 0.2819, + "step": 1963 + }, + { + "epoch": 0.56, + "grad_norm": 2.308616132611635, + "learning_rate": 8.659486330533881e-07, + "loss": 0.2717, + "step": 1964 + }, + { + "epoch": 0.56, + "grad_norm": 2.1834999486117583, + "learning_rate": 8.650391839873217e-07, + "loss": 0.2787, + "step": 1965 + }, + { + "epoch": 0.56, + "grad_norm": 2.461144597727408, + "learning_rate": 8.641298486037543e-07, + "loss": 0.3018, + "step": 1966 + }, + { + "epoch": 0.56, + "grad_norm": 2.80762751416142, + "learning_rate": 8.632206276686532e-07, + "loss": 0.292, + "step": 1967 + }, + { + "epoch": 0.56, + "grad_norm": 2.250135073102463, + "learning_rate": 8.623115219478884e-07, + "loss": 0.2788, + "step": 1968 + }, + { + "epoch": 0.56, + "grad_norm": 2.248508050847324, + "learning_rate": 8.614025322072336e-07, + "loss": 0.2968, + "step": 1969 + }, + { + "epoch": 0.56, + "grad_norm": 2.1565106717006195, + "learning_rate": 8.604936592123646e-07, + "loss": 0.2758, + "step": 1970 + }, + { + "epoch": 0.56, + "grad_norm": 2.3513480691328814, + "learning_rate": 8.595849037288581e-07, + "loss": 0.25, + "step": 1971 + }, + { + "epoch": 0.56, + "grad_norm": 2.564839835756159, + "learning_rate": 8.586762665221938e-07, + "loss": 0.2919, + "step": 1972 + }, + { + "epoch": 0.56, + "grad_norm": 2.5031125040120625, + "learning_rate": 8.577677483577496e-07, + "loss": 0.3025, + "step": 1973 + }, + { + "epoch": 0.56, + "grad_norm": 2.3757684711077345, + "learning_rate": 8.568593500008046e-07, + "loss": 0.2957, + "step": 1974 + }, + { + "epoch": 0.56, + "grad_norm": 2.4814182838221437, + "learning_rate": 8.559510722165359e-07, + "loss": 0.296, + "step": 1975 + }, + { + "epoch": 0.56, + "grad_norm": 2.1058424660903623, + "learning_rate": 8.550429157700195e-07, + "loss": 0.2587, + "step": 1976 + }, + { + "epoch": 0.56, + "grad_norm": 2.3051111370602984, + "learning_rate": 8.541348814262297e-07, + "loss": 0.2773, + "step": 1977 + }, + { + "epoch": 0.56, + "grad_norm": 2.379656401763318, + "learning_rate": 8.532269699500376e-07, + "loss": 0.278, + "step": 1978 + }, + { + "epoch": 0.56, + "grad_norm": 2.128141672834981, + "learning_rate": 8.523191821062101e-07, + "loss": 0.2422, + "step": 1979 + }, + { + "epoch": 0.56, + "grad_norm": 2.4154798555989068, + "learning_rate": 8.51411518659411e-07, + "loss": 0.3067, + "step": 1980 + }, + { + "epoch": 0.56, + "grad_norm": 2.4147085149757053, + "learning_rate": 8.505039803741985e-07, + "loss": 0.285, + "step": 1981 + }, + { + "epoch": 0.56, + "grad_norm": 2.260022512654864, + "learning_rate": 8.49596568015026e-07, + "loss": 0.2586, + "step": 1982 + }, + { + "epoch": 0.56, + "grad_norm": 2.327455245309484, + "learning_rate": 8.486892823462409e-07, + "loss": 0.286, + "step": 1983 + }, + { + "epoch": 0.56, + "grad_norm": 2.500033776529244, + "learning_rate": 8.47782124132083e-07, + "loss": 0.2992, + "step": 1984 + }, + { + "epoch": 0.56, + "grad_norm": 2.444891884703191, + "learning_rate": 8.468750941366858e-07, + "loss": 0.2911, + "step": 1985 + }, + { + "epoch": 0.56, + "grad_norm": 2.284667592281283, + "learning_rate": 8.459681931240732e-07, + "loss": 0.2759, + "step": 1986 + }, + { + "epoch": 0.56, + "grad_norm": 2.4878491891950203, + "learning_rate": 8.45061421858163e-07, + "loss": 0.2881, + "step": 1987 + }, + { + "epoch": 0.56, + "grad_norm": 2.3148743275286594, + "learning_rate": 8.441547811027614e-07, + "loss": 0.2717, + "step": 1988 + }, + { + "epoch": 0.56, + "grad_norm": 2.246090838636193, + "learning_rate": 8.432482716215661e-07, + "loss": 0.2686, + "step": 1989 + }, + { + "epoch": 0.56, + "grad_norm": 2.4017844104732218, + "learning_rate": 8.423418941781628e-07, + "loss": 0.2761, + "step": 1990 + }, + { + "epoch": 0.56, + "grad_norm": 2.4218579296866674, + "learning_rate": 8.414356495360273e-07, + "loss": 0.3065, + "step": 1991 + }, + { + "epoch": 0.56, + "grad_norm": 2.341440126862159, + "learning_rate": 8.405295384585231e-07, + "loss": 0.2606, + "step": 1992 + }, + { + "epoch": 0.56, + "grad_norm": 2.292885993151981, + "learning_rate": 8.396235617089012e-07, + "loss": 0.2555, + "step": 1993 + }, + { + "epoch": 0.56, + "grad_norm": 2.341814398483769, + "learning_rate": 8.387177200502995e-07, + "loss": 0.266, + "step": 1994 + }, + { + "epoch": 0.57, + "grad_norm": 2.296872307085636, + "learning_rate": 8.378120142457414e-07, + "loss": 0.2583, + "step": 1995 + }, + { + "epoch": 0.57, + "grad_norm": 2.3580787443847817, + "learning_rate": 8.369064450581372e-07, + "loss": 0.3061, + "step": 1996 + }, + { + "epoch": 0.57, + "grad_norm": 2.3394656647930487, + "learning_rate": 8.360010132502811e-07, + "loss": 0.2603, + "step": 1997 + }, + { + "epoch": 0.57, + "grad_norm": 2.326461942946112, + "learning_rate": 8.35095719584852e-07, + "loss": 0.2937, + "step": 1998 + }, + { + "epoch": 0.57, + "grad_norm": 2.9975048170259715, + "learning_rate": 8.34190564824412e-07, + "loss": 0.2967, + "step": 1999 + }, + { + "epoch": 0.57, + "grad_norm": 2.1577062774756857, + "learning_rate": 8.332855497314066e-07, + "loss": 0.249, + "step": 2000 + }, + { + "epoch": 0.57, + "grad_norm": 2.2756120324360114, + "learning_rate": 8.32380675068164e-07, + "loss": 0.2726, + "step": 2001 + }, + { + "epoch": 0.57, + "grad_norm": 2.6199877483436027, + "learning_rate": 8.314759415968935e-07, + "loss": 0.2773, + "step": 2002 + }, + { + "epoch": 0.57, + "grad_norm": 2.4368100866555142, + "learning_rate": 8.305713500796851e-07, + "loss": 0.2852, + "step": 2003 + }, + { + "epoch": 0.57, + "grad_norm": 2.3744985075641285, + "learning_rate": 8.296669012785104e-07, + "loss": 0.2776, + "step": 2004 + }, + { + "epoch": 0.57, + "grad_norm": 5.917189729911316, + "learning_rate": 8.287625959552198e-07, + "loss": 0.2689, + "step": 2005 + }, + { + "epoch": 0.57, + "grad_norm": 2.355544152202805, + "learning_rate": 8.278584348715436e-07, + "loss": 0.2588, + "step": 2006 + }, + { + "epoch": 0.57, + "grad_norm": 2.355190741627796, + "learning_rate": 8.269544187890898e-07, + "loss": 0.2782, + "step": 2007 + }, + { + "epoch": 0.57, + "grad_norm": 2.4253597317377102, + "learning_rate": 8.260505484693448e-07, + "loss": 0.2703, + "step": 2008 + }, + { + "epoch": 0.57, + "grad_norm": 2.433583526241199, + "learning_rate": 8.251468246736724e-07, + "loss": 0.2638, + "step": 2009 + }, + { + "epoch": 0.57, + "grad_norm": 2.199948379924113, + "learning_rate": 8.242432481633118e-07, + "loss": 0.261, + "step": 2010 + }, + { + "epoch": 0.57, + "grad_norm": 2.5886116191736344, + "learning_rate": 8.233398196993798e-07, + "loss": 0.2925, + "step": 2011 + }, + { + "epoch": 0.57, + "grad_norm": 2.2534242765762365, + "learning_rate": 8.224365400428674e-07, + "loss": 0.2674, + "step": 2012 + }, + { + "epoch": 0.57, + "grad_norm": 2.272714618279941, + "learning_rate": 8.215334099546409e-07, + "loss": 0.29, + "step": 2013 + }, + { + "epoch": 0.57, + "grad_norm": 2.430180054552849, + "learning_rate": 8.206304301954396e-07, + "loss": 0.2953, + "step": 2014 + }, + { + "epoch": 0.57, + "grad_norm": 2.352628694363435, + "learning_rate": 8.197276015258772e-07, + "loss": 0.26, + "step": 2015 + }, + { + "epoch": 0.57, + "grad_norm": 2.416566753780975, + "learning_rate": 8.188249247064398e-07, + "loss": 0.3085, + "step": 2016 + }, + { + "epoch": 0.57, + "grad_norm": 2.3952528996210893, + "learning_rate": 8.179224004974856e-07, + "loss": 0.27, + "step": 2017 + }, + { + "epoch": 0.57, + "grad_norm": 2.4678308055327314, + "learning_rate": 8.17020029659244e-07, + "loss": 0.2753, + "step": 2018 + }, + { + "epoch": 0.57, + "grad_norm": 2.2562591329701824, + "learning_rate": 8.161178129518154e-07, + "loss": 0.2535, + "step": 2019 + }, + { + "epoch": 0.57, + "grad_norm": 2.429259179192542, + "learning_rate": 8.152157511351703e-07, + "loss": 0.2736, + "step": 2020 + }, + { + "epoch": 0.57, + "grad_norm": 2.4260375660546294, + "learning_rate": 8.143138449691495e-07, + "loss": 0.2932, + "step": 2021 + }, + { + "epoch": 0.57, + "grad_norm": 2.3616449561041235, + "learning_rate": 8.134120952134613e-07, + "loss": 0.2741, + "step": 2022 + }, + { + "epoch": 0.57, + "grad_norm": 2.3250813299091413, + "learning_rate": 8.125105026276831e-07, + "loss": 0.2833, + "step": 2023 + }, + { + "epoch": 0.57, + "grad_norm": 2.252788985570452, + "learning_rate": 8.116090679712599e-07, + "loss": 0.2829, + "step": 2024 + }, + { + "epoch": 0.57, + "grad_norm": 2.5183270412019643, + "learning_rate": 8.107077920035031e-07, + "loss": 0.2796, + "step": 2025 + }, + { + "epoch": 0.57, + "grad_norm": 2.37778294908015, + "learning_rate": 8.098066754835915e-07, + "loss": 0.2816, + "step": 2026 + }, + { + "epoch": 0.57, + "grad_norm": 2.300737037482741, + "learning_rate": 8.089057191705686e-07, + "loss": 0.258, + "step": 2027 + }, + { + "epoch": 0.57, + "grad_norm": 2.679227795168687, + "learning_rate": 8.080049238233438e-07, + "loss": 0.2379, + "step": 2028 + }, + { + "epoch": 0.57, + "grad_norm": 2.7613834098675243, + "learning_rate": 8.071042902006895e-07, + "loss": 0.328, + "step": 2029 + }, + { + "epoch": 0.58, + "grad_norm": 2.3468717263559764, + "learning_rate": 8.06203819061243e-07, + "loss": 0.2647, + "step": 2030 + }, + { + "epoch": 0.58, + "grad_norm": 2.3531096490494323, + "learning_rate": 8.053035111635053e-07, + "loss": 0.2864, + "step": 2031 + }, + { + "epoch": 0.58, + "grad_norm": 2.414012788939637, + "learning_rate": 8.044033672658386e-07, + "loss": 0.292, + "step": 2032 + }, + { + "epoch": 0.58, + "grad_norm": 2.2777315812282772, + "learning_rate": 8.035033881264674e-07, + "loss": 0.2596, + "step": 2033 + }, + { + "epoch": 0.58, + "grad_norm": 2.4517357307262144, + "learning_rate": 8.026035745034773e-07, + "loss": 0.2746, + "step": 2034 + }, + { + "epoch": 0.58, + "grad_norm": 2.4070537452132297, + "learning_rate": 8.017039271548154e-07, + "loss": 0.2971, + "step": 2035 + }, + { + "epoch": 0.58, + "grad_norm": 2.247094822024942, + "learning_rate": 8.008044468382876e-07, + "loss": 0.2794, + "step": 2036 + }, + { + "epoch": 0.58, + "grad_norm": 2.306978962744317, + "learning_rate": 7.999051343115595e-07, + "loss": 0.2655, + "step": 2037 + }, + { + "epoch": 0.58, + "grad_norm": 2.2123055046356663, + "learning_rate": 7.990059903321552e-07, + "loss": 0.2664, + "step": 2038 + }, + { + "epoch": 0.58, + "grad_norm": 2.2889931885830315, + "learning_rate": 7.981070156574571e-07, + "loss": 0.2636, + "step": 2039 + }, + { + "epoch": 0.58, + "grad_norm": 2.478118432621666, + "learning_rate": 7.972082110447051e-07, + "loss": 0.276, + "step": 2040 + }, + { + "epoch": 0.58, + "grad_norm": 2.2579090871085845, + "learning_rate": 7.963095772509959e-07, + "loss": 0.2579, + "step": 2041 + }, + { + "epoch": 0.58, + "grad_norm": 2.236066241421122, + "learning_rate": 7.954111150332814e-07, + "loss": 0.244, + "step": 2042 + }, + { + "epoch": 0.58, + "grad_norm": 2.3660082996037533, + "learning_rate": 7.945128251483702e-07, + "loss": 0.2734, + "step": 2043 + }, + { + "epoch": 0.58, + "grad_norm": 2.359454646610675, + "learning_rate": 7.936147083529243e-07, + "loss": 0.2573, + "step": 2044 + }, + { + "epoch": 0.58, + "grad_norm": 2.236496257553659, + "learning_rate": 7.927167654034621e-07, + "loss": 0.2801, + "step": 2045 + }, + { + "epoch": 0.58, + "grad_norm": 3.133062542389784, + "learning_rate": 7.918189970563534e-07, + "loss": 0.2848, + "step": 2046 + }, + { + "epoch": 0.58, + "grad_norm": 2.2905808901519253, + "learning_rate": 7.909214040678219e-07, + "loss": 0.3012, + "step": 2047 + }, + { + "epoch": 0.58, + "grad_norm": 2.3380895198629887, + "learning_rate": 7.900239871939434e-07, + "loss": 0.278, + "step": 2048 + }, + { + "epoch": 0.58, + "grad_norm": 2.2480017735021103, + "learning_rate": 7.891267471906451e-07, + "loss": 0.2631, + "step": 2049 + }, + { + "epoch": 0.58, + "grad_norm": 2.4311189635822004, + "learning_rate": 7.882296848137063e-07, + "loss": 0.2801, + "step": 2050 + }, + { + "epoch": 0.58, + "grad_norm": 2.5144487494732086, + "learning_rate": 7.873328008187553e-07, + "loss": 0.3047, + "step": 2051 + }, + { + "epoch": 0.58, + "grad_norm": 2.1876903102504475, + "learning_rate": 7.864360959612713e-07, + "loss": 0.2638, + "step": 2052 + }, + { + "epoch": 0.58, + "grad_norm": 2.2277035817996653, + "learning_rate": 7.855395709965813e-07, + "loss": 0.2514, + "step": 2053 + }, + { + "epoch": 0.58, + "grad_norm": 2.491217536073013, + "learning_rate": 7.846432266798618e-07, + "loss": 0.2838, + "step": 2054 + }, + { + "epoch": 0.58, + "grad_norm": 2.3602798122236064, + "learning_rate": 7.83747063766137e-07, + "loss": 0.2715, + "step": 2055 + }, + { + "epoch": 0.58, + "grad_norm": 2.5464354937602813, + "learning_rate": 7.828510830102784e-07, + "loss": 0.2894, + "step": 2056 + }, + { + "epoch": 0.58, + "grad_norm": 2.2870974956544288, + "learning_rate": 7.819552851670032e-07, + "loss": 0.2697, + "step": 2057 + }, + { + "epoch": 0.58, + "grad_norm": 2.3234172641559288, + "learning_rate": 7.810596709908758e-07, + "loss": 0.2723, + "step": 2058 + }, + { + "epoch": 0.58, + "grad_norm": 2.414991920888712, + "learning_rate": 7.801642412363041e-07, + "loss": 0.274, + "step": 2059 + }, + { + "epoch": 0.58, + "grad_norm": 2.444712542812734, + "learning_rate": 7.792689966575432e-07, + "loss": 0.2814, + "step": 2060 + }, + { + "epoch": 0.58, + "grad_norm": 2.47079382630738, + "learning_rate": 7.7837393800869e-07, + "loss": 0.2675, + "step": 2061 + }, + { + "epoch": 0.58, + "grad_norm": 2.1684754178540895, + "learning_rate": 7.774790660436857e-07, + "loss": 0.2669, + "step": 2062 + }, + { + "epoch": 0.58, + "grad_norm": 2.460092218537868, + "learning_rate": 7.765843815163142e-07, + "loss": 0.2804, + "step": 2063 + }, + { + "epoch": 0.58, + "grad_norm": 2.433323079334218, + "learning_rate": 7.756898851802012e-07, + "loss": 0.2714, + "step": 2064 + }, + { + "epoch": 0.59, + "grad_norm": 2.537186553166969, + "learning_rate": 7.747955777888144e-07, + "loss": 0.2474, + "step": 2065 + }, + { + "epoch": 0.59, + "grad_norm": 2.267636696071962, + "learning_rate": 7.739014600954621e-07, + "loss": 0.2699, + "step": 2066 + }, + { + "epoch": 0.59, + "grad_norm": 2.430074544000473, + "learning_rate": 7.730075328532929e-07, + "loss": 0.3004, + "step": 2067 + }, + { + "epoch": 0.59, + "grad_norm": 2.4201031125203274, + "learning_rate": 7.721137968152943e-07, + "loss": 0.2731, + "step": 2068 + }, + { + "epoch": 0.59, + "grad_norm": 2.290956179149082, + "learning_rate": 7.712202527342936e-07, + "loss": 0.2916, + "step": 2069 + }, + { + "epoch": 0.59, + "grad_norm": 2.4089321122557297, + "learning_rate": 7.703269013629563e-07, + "loss": 0.2728, + "step": 2070 + }, + { + "epoch": 0.59, + "grad_norm": 2.276232550384149, + "learning_rate": 7.694337434537855e-07, + "loss": 0.2448, + "step": 2071 + }, + { + "epoch": 0.59, + "grad_norm": 2.496023360365651, + "learning_rate": 7.685407797591207e-07, + "loss": 0.3067, + "step": 2072 + }, + { + "epoch": 0.59, + "grad_norm": 2.4890362420157524, + "learning_rate": 7.676480110311384e-07, + "loss": 0.2688, + "step": 2073 + }, + { + "epoch": 0.59, + "grad_norm": 2.5525509171952554, + "learning_rate": 7.667554380218512e-07, + "loss": 0.2882, + "step": 2074 + }, + { + "epoch": 0.59, + "grad_norm": 2.4611262943609726, + "learning_rate": 7.658630614831064e-07, + "loss": 0.2698, + "step": 2075 + }, + { + "epoch": 0.59, + "grad_norm": 2.4864295885236416, + "learning_rate": 7.649708821665855e-07, + "loss": 0.2881, + "step": 2076 + }, + { + "epoch": 0.59, + "grad_norm": 2.2472701497969103, + "learning_rate": 7.640789008238044e-07, + "loss": 0.2778, + "step": 2077 + }, + { + "epoch": 0.59, + "grad_norm": 2.195989671921942, + "learning_rate": 7.631871182061117e-07, + "loss": 0.2975, + "step": 2078 + }, + { + "epoch": 0.59, + "grad_norm": 2.476679902794854, + "learning_rate": 7.622955350646898e-07, + "loss": 0.2998, + "step": 2079 + }, + { + "epoch": 0.59, + "grad_norm": 2.696686446155788, + "learning_rate": 7.614041521505517e-07, + "loss": 0.3025, + "step": 2080 + }, + { + "epoch": 0.59, + "grad_norm": 2.3976222679749406, + "learning_rate": 7.605129702145421e-07, + "loss": 0.2662, + "step": 2081 + }, + { + "epoch": 0.59, + "grad_norm": 2.3738204956727182, + "learning_rate": 7.59621990007337e-07, + "loss": 0.2548, + "step": 2082 + }, + { + "epoch": 0.59, + "grad_norm": 2.3700727964945068, + "learning_rate": 7.587312122794413e-07, + "loss": 0.3058, + "step": 2083 + }, + { + "epoch": 0.59, + "grad_norm": 2.265968246116298, + "learning_rate": 7.578406377811914e-07, + "loss": 0.3043, + "step": 2084 + }, + { + "epoch": 0.59, + "grad_norm": 2.3290623178260126, + "learning_rate": 7.569502672627502e-07, + "loss": 0.2747, + "step": 2085 + }, + { + "epoch": 0.59, + "grad_norm": 2.233856682563935, + "learning_rate": 7.560601014741101e-07, + "loss": 0.271, + "step": 2086 + }, + { + "epoch": 0.59, + "grad_norm": 2.394115009432413, + "learning_rate": 7.551701411650908e-07, + "loss": 0.2599, + "step": 2087 + }, + { + "epoch": 0.59, + "grad_norm": 2.313562738069459, + "learning_rate": 7.542803870853385e-07, + "loss": 0.2798, + "step": 2088 + }, + { + "epoch": 0.59, + "grad_norm": 2.4791031646621047, + "learning_rate": 7.533908399843265e-07, + "loss": 0.3062, + "step": 2089 + }, + { + "epoch": 0.59, + "grad_norm": 2.2719449030756977, + "learning_rate": 7.525015006113536e-07, + "loss": 0.2697, + "step": 2090 + }, + { + "epoch": 0.59, + "grad_norm": 2.288872717976602, + "learning_rate": 7.516123697155423e-07, + "loss": 0.2292, + "step": 2091 + }, + { + "epoch": 0.59, + "grad_norm": 2.3190728757400314, + "learning_rate": 7.507234480458413e-07, + "loss": 0.2436, + "step": 2092 + }, + { + "epoch": 0.59, + "grad_norm": 2.437882538102759, + "learning_rate": 7.498347363510219e-07, + "loss": 0.2779, + "step": 2093 + }, + { + "epoch": 0.59, + "grad_norm": 2.2839625441224283, + "learning_rate": 7.489462353796792e-07, + "loss": 0.2652, + "step": 2094 + }, + { + "epoch": 0.59, + "grad_norm": 2.461952777175989, + "learning_rate": 7.480579458802307e-07, + "loss": 0.2964, + "step": 2095 + }, + { + "epoch": 0.59, + "grad_norm": 2.2354629485291837, + "learning_rate": 7.471698686009149e-07, + "loss": 0.2695, + "step": 2096 + }, + { + "epoch": 0.59, + "grad_norm": 2.4767741728643218, + "learning_rate": 7.46282004289793e-07, + "loss": 0.2797, + "step": 2097 + }, + { + "epoch": 0.59, + "grad_norm": 2.435906585416531, + "learning_rate": 7.453943536947449e-07, + "loss": 0.2383, + "step": 2098 + }, + { + "epoch": 0.59, + "grad_norm": 2.3346098559962103, + "learning_rate": 7.44506917563473e-07, + "loss": 0.278, + "step": 2099 + }, + { + "epoch": 0.6, + "grad_norm": 2.6073668597304467, + "learning_rate": 7.436196966434967e-07, + "loss": 0.3092, + "step": 2100 + }, + { + "epoch": 0.6, + "grad_norm": 2.342933109587325, + "learning_rate": 7.427326916821557e-07, + "loss": 0.2889, + "step": 2101 + }, + { + "epoch": 0.6, + "grad_norm": 2.410578090840819, + "learning_rate": 7.41845903426606e-07, + "loss": 0.3085, + "step": 2102 + }, + { + "epoch": 0.6, + "grad_norm": 2.525339056164691, + "learning_rate": 7.409593326238238e-07, + "loss": 0.2591, + "step": 2103 + }, + { + "epoch": 0.6, + "grad_norm": 2.2445227765026416, + "learning_rate": 7.400729800205996e-07, + "loss": 0.2499, + "step": 2104 + }, + { + "epoch": 0.6, + "grad_norm": 2.3990383814636984, + "learning_rate": 7.391868463635412e-07, + "loss": 0.2688, + "step": 2105 + }, + { + "epoch": 0.6, + "grad_norm": 2.2685711781633047, + "learning_rate": 7.383009323990722e-07, + "loss": 0.2827, + "step": 2106 + }, + { + "epoch": 0.6, + "grad_norm": 2.3055301604055773, + "learning_rate": 7.3741523887343e-07, + "loss": 0.2671, + "step": 2107 + }, + { + "epoch": 0.6, + "grad_norm": 2.557224320802981, + "learning_rate": 7.365297665326677e-07, + "loss": 0.2837, + "step": 2108 + }, + { + "epoch": 0.6, + "grad_norm": 2.534642480368157, + "learning_rate": 7.356445161226515e-07, + "loss": 0.2691, + "step": 2109 + }, + { + "epoch": 0.6, + "grad_norm": 2.356329998491783, + "learning_rate": 7.347594883890607e-07, + "loss": 0.2751, + "step": 2110 + }, + { + "epoch": 0.6, + "grad_norm": 2.381325655091747, + "learning_rate": 7.338746840773865e-07, + "loss": 0.2565, + "step": 2111 + }, + { + "epoch": 0.6, + "grad_norm": 2.329511980402188, + "learning_rate": 7.329901039329325e-07, + "loss": 0.263, + "step": 2112 + }, + { + "epoch": 0.6, + "grad_norm": 2.315894476466652, + "learning_rate": 7.321057487008135e-07, + "loss": 0.3127, + "step": 2113 + }, + { + "epoch": 0.6, + "grad_norm": 2.2856850537832103, + "learning_rate": 7.312216191259551e-07, + "loss": 0.2547, + "step": 2114 + }, + { + "epoch": 0.6, + "grad_norm": 2.421444178537356, + "learning_rate": 7.303377159530918e-07, + "loss": 0.3091, + "step": 2115 + }, + { + "epoch": 0.6, + "grad_norm": 2.243237220702003, + "learning_rate": 7.294540399267682e-07, + "loss": 0.2723, + "step": 2116 + }, + { + "epoch": 0.6, + "grad_norm": 2.5157378247007247, + "learning_rate": 7.285705917913372e-07, + "loss": 0.272, + "step": 2117 + }, + { + "epoch": 0.6, + "grad_norm": 2.453951629479585, + "learning_rate": 7.276873722909604e-07, + "loss": 0.2891, + "step": 2118 + }, + { + "epoch": 0.6, + "grad_norm": 2.300751946901452, + "learning_rate": 7.268043821696062e-07, + "loss": 0.2803, + "step": 2119 + }, + { + "epoch": 0.6, + "grad_norm": 2.5899121934419096, + "learning_rate": 7.259216221710495e-07, + "loss": 0.2943, + "step": 2120 + }, + { + "epoch": 0.6, + "grad_norm": 2.2701438074597586, + "learning_rate": 7.250390930388723e-07, + "loss": 0.2895, + "step": 2121 + }, + { + "epoch": 0.6, + "grad_norm": 3.732218819170221, + "learning_rate": 7.241567955164609e-07, + "loss": 0.2743, + "step": 2122 + }, + { + "epoch": 0.6, + "grad_norm": 2.3387796014090907, + "learning_rate": 7.232747303470081e-07, + "loss": 0.2776, + "step": 2123 + }, + { + "epoch": 0.6, + "grad_norm": 2.5764272321852286, + "learning_rate": 7.223928982735095e-07, + "loss": 0.2892, + "step": 2124 + }, + { + "epoch": 0.6, + "grad_norm": 2.383832218103511, + "learning_rate": 7.215113000387653e-07, + "loss": 0.2517, + "step": 2125 + }, + { + "epoch": 0.6, + "grad_norm": 2.464831541878814, + "learning_rate": 7.206299363853781e-07, + "loss": 0.2689, + "step": 2126 + }, + { + "epoch": 0.6, + "grad_norm": 2.471104492666639, + "learning_rate": 7.19748808055753e-07, + "loss": 0.2722, + "step": 2127 + }, + { + "epoch": 0.6, + "grad_norm": 3.329307106617614, + "learning_rate": 7.188679157920976e-07, + "loss": 0.2766, + "step": 2128 + }, + { + "epoch": 0.6, + "grad_norm": 2.3363762963465784, + "learning_rate": 7.179872603364199e-07, + "loss": 0.2662, + "step": 2129 + }, + { + "epoch": 0.6, + "grad_norm": 2.517632149515168, + "learning_rate": 7.171068424305286e-07, + "loss": 0.2746, + "step": 2130 + }, + { + "epoch": 0.6, + "grad_norm": 2.5706870497329652, + "learning_rate": 7.162266628160322e-07, + "loss": 0.2633, + "step": 2131 + }, + { + "epoch": 0.6, + "grad_norm": 2.5354944977016265, + "learning_rate": 7.153467222343386e-07, + "loss": 0.2902, + "step": 2132 + }, + { + "epoch": 0.6, + "grad_norm": 2.7456345440150387, + "learning_rate": 7.144670214266551e-07, + "loss": 0.287, + "step": 2133 + }, + { + "epoch": 0.6, + "grad_norm": 2.3894076576548855, + "learning_rate": 7.135875611339853e-07, + "loss": 0.3036, + "step": 2134 + }, + { + "epoch": 0.6, + "grad_norm": 2.569731455695933, + "learning_rate": 7.127083420971319e-07, + "loss": 0.2747, + "step": 2135 + }, + { + "epoch": 0.61, + "grad_norm": 2.2928767146566758, + "learning_rate": 7.11829365056693e-07, + "loss": 0.2583, + "step": 2136 + }, + { + "epoch": 0.61, + "grad_norm": 2.673719868392139, + "learning_rate": 7.109506307530645e-07, + "loss": 0.2716, + "step": 2137 + }, + { + "epoch": 0.61, + "grad_norm": 2.3579020039258674, + "learning_rate": 7.100721399264362e-07, + "loss": 0.2868, + "step": 2138 + }, + { + "epoch": 0.61, + "grad_norm": 2.266348724055388, + "learning_rate": 7.091938933167936e-07, + "loss": 0.2455, + "step": 2139 + }, + { + "epoch": 0.61, + "grad_norm": 2.227236593653924, + "learning_rate": 7.083158916639168e-07, + "loss": 0.2457, + "step": 2140 + }, + { + "epoch": 0.61, + "grad_norm": 2.332130191191915, + "learning_rate": 7.074381357073781e-07, + "loss": 0.2814, + "step": 2141 + }, + { + "epoch": 0.61, + "grad_norm": 2.3944392677131745, + "learning_rate": 7.065606261865452e-07, + "loss": 0.2994, + "step": 2142 + }, + { + "epoch": 0.61, + "grad_norm": 2.4322935118276092, + "learning_rate": 7.056833638405761e-07, + "loss": 0.3136, + "step": 2143 + }, + { + "epoch": 0.61, + "grad_norm": 2.5671930014493745, + "learning_rate": 7.048063494084218e-07, + "loss": 0.2926, + "step": 2144 + }, + { + "epoch": 0.61, + "grad_norm": 2.318564278312186, + "learning_rate": 7.039295836288237e-07, + "loss": 0.2545, + "step": 2145 + }, + { + "epoch": 0.61, + "grad_norm": 2.464705863159734, + "learning_rate": 7.030530672403138e-07, + "loss": 0.2708, + "step": 2146 + }, + { + "epoch": 0.61, + "grad_norm": 2.19609942264482, + "learning_rate": 7.021768009812155e-07, + "loss": 0.2596, + "step": 2147 + }, + { + "epoch": 0.61, + "grad_norm": 2.48676163657945, + "learning_rate": 7.013007855896396e-07, + "loss": 0.2627, + "step": 2148 + }, + { + "epoch": 0.61, + "grad_norm": 2.3182899968588386, + "learning_rate": 7.004250218034863e-07, + "loss": 0.279, + "step": 2149 + }, + { + "epoch": 0.61, + "grad_norm": 2.3442020814482314, + "learning_rate": 6.99549510360444e-07, + "loss": 0.2704, + "step": 2150 + }, + { + "epoch": 0.61, + "grad_norm": 2.22906721368112, + "learning_rate": 6.986742519979883e-07, + "loss": 0.2606, + "step": 2151 + }, + { + "epoch": 0.61, + "grad_norm": 2.2962396809815453, + "learning_rate": 6.977992474533823e-07, + "loss": 0.2615, + "step": 2152 + }, + { + "epoch": 0.61, + "grad_norm": 2.385100652981511, + "learning_rate": 6.969244974636744e-07, + "loss": 0.3269, + "step": 2153 + }, + { + "epoch": 0.61, + "grad_norm": 2.5104597172170724, + "learning_rate": 6.960500027656989e-07, + "loss": 0.2623, + "step": 2154 + }, + { + "epoch": 0.61, + "grad_norm": 2.343953006786303, + "learning_rate": 6.951757640960753e-07, + "loss": 0.2832, + "step": 2155 + }, + { + "epoch": 0.61, + "grad_norm": 2.3828926943026008, + "learning_rate": 6.943017821912068e-07, + "loss": 0.3251, + "step": 2156 + }, + { + "epoch": 0.61, + "grad_norm": 2.989832235987735, + "learning_rate": 6.934280577872813e-07, + "loss": 0.2892, + "step": 2157 + }, + { + "epoch": 0.61, + "grad_norm": 2.724481526794439, + "learning_rate": 6.925545916202691e-07, + "loss": 0.2734, + "step": 2158 + }, + { + "epoch": 0.61, + "grad_norm": 2.428029211150564, + "learning_rate": 6.916813844259233e-07, + "loss": 0.3051, + "step": 2159 + }, + { + "epoch": 0.61, + "grad_norm": 2.259379386342519, + "learning_rate": 6.908084369397782e-07, + "loss": 0.2725, + "step": 2160 + }, + { + "epoch": 0.61, + "grad_norm": 2.402931976669789, + "learning_rate": 6.899357498971499e-07, + "loss": 0.3116, + "step": 2161 + }, + { + "epoch": 0.61, + "grad_norm": 2.316897428090023, + "learning_rate": 6.890633240331353e-07, + "loss": 0.2725, + "step": 2162 + }, + { + "epoch": 0.61, + "grad_norm": 2.2528634833638503, + "learning_rate": 6.881911600826114e-07, + "loss": 0.2364, + "step": 2163 + }, + { + "epoch": 0.61, + "grad_norm": 2.3508074001500647, + "learning_rate": 6.873192587802339e-07, + "loss": 0.273, + "step": 2164 + }, + { + "epoch": 0.61, + "grad_norm": 2.460979430802097, + "learning_rate": 6.864476208604373e-07, + "loss": 0.2938, + "step": 2165 + }, + { + "epoch": 0.61, + "grad_norm": 2.2576345269648708, + "learning_rate": 6.855762470574344e-07, + "loss": 0.2741, + "step": 2166 + }, + { + "epoch": 0.61, + "grad_norm": 2.4334518552088227, + "learning_rate": 6.847051381052165e-07, + "loss": 0.2845, + "step": 2167 + }, + { + "epoch": 0.61, + "grad_norm": 2.3696143758251127, + "learning_rate": 6.838342947375506e-07, + "loss": 0.2722, + "step": 2168 + }, + { + "epoch": 0.61, + "grad_norm": 2.32490402082567, + "learning_rate": 6.829637176879801e-07, + "loss": 0.2925, + "step": 2169 + }, + { + "epoch": 0.61, + "grad_norm": 2.5248417813499, + "learning_rate": 6.820934076898246e-07, + "loss": 0.2985, + "step": 2170 + }, + { + "epoch": 0.62, + "grad_norm": 2.445099598380385, + "learning_rate": 6.812233654761779e-07, + "loss": 0.3044, + "step": 2171 + }, + { + "epoch": 0.62, + "grad_norm": 2.214657241657144, + "learning_rate": 6.803535917799097e-07, + "loss": 0.2414, + "step": 2172 + }, + { + "epoch": 0.62, + "grad_norm": 2.3655223286007603, + "learning_rate": 6.794840873336622e-07, + "loss": 0.2806, + "step": 2173 + }, + { + "epoch": 0.62, + "grad_norm": 2.4011629273315696, + "learning_rate": 6.786148528698511e-07, + "loss": 0.298, + "step": 2174 + }, + { + "epoch": 0.62, + "grad_norm": 2.3645904484074682, + "learning_rate": 6.777458891206647e-07, + "loss": 0.2872, + "step": 2175 + }, + { + "epoch": 0.62, + "grad_norm": 2.3303417451654598, + "learning_rate": 6.768771968180642e-07, + "loss": 0.2827, + "step": 2176 + }, + { + "epoch": 0.62, + "grad_norm": 2.42116804321365, + "learning_rate": 6.760087766937806e-07, + "loss": 0.287, + "step": 2177 + }, + { + "epoch": 0.62, + "grad_norm": 2.382779946387148, + "learning_rate": 6.751406294793165e-07, + "loss": 0.2552, + "step": 2178 + }, + { + "epoch": 0.62, + "grad_norm": 2.477672356064572, + "learning_rate": 6.742727559059447e-07, + "loss": 0.3088, + "step": 2179 + }, + { + "epoch": 0.62, + "grad_norm": 2.455414571828824, + "learning_rate": 6.734051567047067e-07, + "loss": 0.2795, + "step": 2180 + }, + { + "epoch": 0.62, + "grad_norm": 2.351414062300336, + "learning_rate": 6.72537832606414e-07, + "loss": 0.282, + "step": 2181 + }, + { + "epoch": 0.62, + "grad_norm": 2.22111778868477, + "learning_rate": 6.716707843416459e-07, + "loss": 0.2564, + "step": 2182 + }, + { + "epoch": 0.62, + "grad_norm": 2.175827676310216, + "learning_rate": 6.708040126407492e-07, + "loss": 0.2521, + "step": 2183 + }, + { + "epoch": 0.62, + "grad_norm": 2.5166909133215754, + "learning_rate": 6.699375182338378e-07, + "loss": 0.2835, + "step": 2184 + }, + { + "epoch": 0.62, + "grad_norm": 2.267037067854833, + "learning_rate": 6.690713018507916e-07, + "loss": 0.262, + "step": 2185 + }, + { + "epoch": 0.62, + "grad_norm": 2.288265638185063, + "learning_rate": 6.682053642212575e-07, + "loss": 0.2488, + "step": 2186 + }, + { + "epoch": 0.62, + "grad_norm": 2.2731622558940687, + "learning_rate": 6.673397060746469e-07, + "loss": 0.2665, + "step": 2187 + }, + { + "epoch": 0.62, + "grad_norm": 2.4917477923811013, + "learning_rate": 6.664743281401351e-07, + "loss": 0.292, + "step": 2188 + }, + { + "epoch": 0.62, + "grad_norm": 2.3827017764759604, + "learning_rate": 6.656092311466623e-07, + "loss": 0.2527, + "step": 2189 + }, + { + "epoch": 0.62, + "grad_norm": 2.224388121091182, + "learning_rate": 6.647444158229318e-07, + "loss": 0.2426, + "step": 2190 + }, + { + "epoch": 0.62, + "grad_norm": 2.2232829090554005, + "learning_rate": 6.638798828974099e-07, + "loss": 0.2419, + "step": 2191 + }, + { + "epoch": 0.62, + "grad_norm": 2.5867977881897732, + "learning_rate": 6.630156330983243e-07, + "loss": 0.2956, + "step": 2192 + }, + { + "epoch": 0.62, + "grad_norm": 2.4204940802285067, + "learning_rate": 6.621516671536649e-07, + "loss": 0.3057, + "step": 2193 + }, + { + "epoch": 0.62, + "grad_norm": 3.348152749985618, + "learning_rate": 6.612879857911824e-07, + "loss": 0.294, + "step": 2194 + }, + { + "epoch": 0.62, + "grad_norm": 2.3451848527753048, + "learning_rate": 6.604245897383869e-07, + "loss": 0.2748, + "step": 2195 + }, + { + "epoch": 0.62, + "grad_norm": 2.8676379377626224, + "learning_rate": 6.595614797225496e-07, + "loss": 0.2689, + "step": 2196 + }, + { + "epoch": 0.62, + "grad_norm": 2.3611535849479055, + "learning_rate": 6.586986564706998e-07, + "loss": 0.2612, + "step": 2197 + }, + { + "epoch": 0.62, + "grad_norm": 2.7435701756217252, + "learning_rate": 6.57836120709626e-07, + "loss": 0.3083, + "step": 2198 + }, + { + "epoch": 0.62, + "grad_norm": 2.529965900786767, + "learning_rate": 6.569738731658734e-07, + "loss": 0.3059, + "step": 2199 + }, + { + "epoch": 0.62, + "grad_norm": 2.2625128317474608, + "learning_rate": 6.56111914565745e-07, + "loss": 0.2622, + "step": 2200 + }, + { + "epoch": 0.62, + "grad_norm": 2.3355623198620297, + "learning_rate": 6.552502456353011e-07, + "loss": 0.2772, + "step": 2201 + }, + { + "epoch": 0.62, + "grad_norm": 2.426170884306247, + "learning_rate": 6.543888671003572e-07, + "loss": 0.2864, + "step": 2202 + }, + { + "epoch": 0.62, + "grad_norm": 2.40017155684168, + "learning_rate": 6.535277796864841e-07, + "loss": 0.2741, + "step": 2203 + }, + { + "epoch": 0.62, + "grad_norm": 2.52362240442778, + "learning_rate": 6.526669841190078e-07, + "loss": 0.2956, + "step": 2204 + }, + { + "epoch": 0.62, + "grad_norm": 2.356002953270973, + "learning_rate": 6.518064811230082e-07, + "loss": 0.2837, + "step": 2205 + }, + { + "epoch": 0.63, + "grad_norm": 2.6796883261044893, + "learning_rate": 6.509462714233193e-07, + "loss": 0.3092, + "step": 2206 + }, + { + "epoch": 0.63, + "grad_norm": 2.3176863434708137, + "learning_rate": 6.500863557445273e-07, + "loss": 0.2456, + "step": 2207 + }, + { + "epoch": 0.63, + "grad_norm": 2.6021904172206427, + "learning_rate": 6.49226734810971e-07, + "loss": 0.2975, + "step": 2208 + }, + { + "epoch": 0.63, + "grad_norm": 2.4935328971745925, + "learning_rate": 6.483674093467408e-07, + "loss": 0.2831, + "step": 2209 + }, + { + "epoch": 0.63, + "grad_norm": 2.3839197112529766, + "learning_rate": 6.475083800756791e-07, + "loss": 0.2826, + "step": 2210 + }, + { + "epoch": 0.63, + "grad_norm": 2.299681443388202, + "learning_rate": 6.466496477213776e-07, + "loss": 0.2215, + "step": 2211 + }, + { + "epoch": 0.63, + "grad_norm": 2.2836145958696776, + "learning_rate": 6.457912130071785e-07, + "loss": 0.2875, + "step": 2212 + }, + { + "epoch": 0.63, + "grad_norm": 2.437927993707982, + "learning_rate": 6.449330766561733e-07, + "loss": 0.3069, + "step": 2213 + }, + { + "epoch": 0.63, + "grad_norm": 2.416185295716576, + "learning_rate": 6.440752393912015e-07, + "loss": 0.2728, + "step": 2214 + }, + { + "epoch": 0.63, + "grad_norm": 2.2801588482913173, + "learning_rate": 6.43217701934852e-07, + "loss": 0.2855, + "step": 2215 + }, + { + "epoch": 0.63, + "grad_norm": 2.2867858493533832, + "learning_rate": 6.4236046500946e-07, + "loss": 0.2627, + "step": 2216 + }, + { + "epoch": 0.63, + "grad_norm": 2.587876434898805, + "learning_rate": 6.41503529337108e-07, + "loss": 0.293, + "step": 2217 + }, + { + "epoch": 0.63, + "grad_norm": 2.3088016451229136, + "learning_rate": 6.406468956396249e-07, + "loss": 0.2776, + "step": 2218 + }, + { + "epoch": 0.63, + "grad_norm": 2.3788309790734488, + "learning_rate": 6.397905646385844e-07, + "loss": 0.2834, + "step": 2219 + }, + { + "epoch": 0.63, + "grad_norm": 2.278441350078684, + "learning_rate": 6.389345370553064e-07, + "loss": 0.2475, + "step": 2220 + }, + { + "epoch": 0.63, + "grad_norm": 3.9142417532273233, + "learning_rate": 6.380788136108546e-07, + "loss": 0.2945, + "step": 2221 + }, + { + "epoch": 0.63, + "grad_norm": 2.344305602286148, + "learning_rate": 6.372233950260367e-07, + "loss": 0.2612, + "step": 2222 + }, + { + "epoch": 0.63, + "grad_norm": 2.33848755529304, + "learning_rate": 6.363682820214031e-07, + "loss": 0.25, + "step": 2223 + }, + { + "epoch": 0.63, + "grad_norm": 2.429139743401095, + "learning_rate": 6.355134753172473e-07, + "loss": 0.2767, + "step": 2224 + }, + { + "epoch": 0.63, + "grad_norm": 2.5080203198990803, + "learning_rate": 6.34658975633605e-07, + "loss": 0.293, + "step": 2225 + }, + { + "epoch": 0.63, + "grad_norm": 2.488305071582111, + "learning_rate": 6.338047836902527e-07, + "loss": 0.2923, + "step": 2226 + }, + { + "epoch": 0.63, + "grad_norm": 2.6386594684053755, + "learning_rate": 6.329509002067079e-07, + "loss": 0.2638, + "step": 2227 + }, + { + "epoch": 0.63, + "grad_norm": 2.499223534801197, + "learning_rate": 6.320973259022286e-07, + "loss": 0.2789, + "step": 2228 + }, + { + "epoch": 0.63, + "grad_norm": 2.349727343761741, + "learning_rate": 6.312440614958114e-07, + "loss": 0.3011, + "step": 2229 + }, + { + "epoch": 0.63, + "grad_norm": 2.399789289121343, + "learning_rate": 6.303911077061937e-07, + "loss": 0.2913, + "step": 2230 + }, + { + "epoch": 0.63, + "grad_norm": 2.468975784925108, + "learning_rate": 6.29538465251849e-07, + "loss": 0.2514, + "step": 2231 + }, + { + "epoch": 0.63, + "grad_norm": 2.3046212635405547, + "learning_rate": 6.286861348509902e-07, + "loss": 0.267, + "step": 2232 + }, + { + "epoch": 0.63, + "grad_norm": 2.2861395547404557, + "learning_rate": 6.278341172215669e-07, + "loss": 0.2329, + "step": 2233 + }, + { + "epoch": 0.63, + "grad_norm": 2.2936158988880835, + "learning_rate": 6.269824130812644e-07, + "loss": 0.2568, + "step": 2234 + }, + { + "epoch": 0.63, + "grad_norm": 2.241563640965623, + "learning_rate": 6.261310231475054e-07, + "loss": 0.2582, + "step": 2235 + }, + { + "epoch": 0.63, + "grad_norm": 2.4414301771540305, + "learning_rate": 6.252799481374472e-07, + "loss": 0.287, + "step": 2236 + }, + { + "epoch": 0.63, + "grad_norm": 2.2552834975225498, + "learning_rate": 6.244291887679818e-07, + "loss": 0.2436, + "step": 2237 + }, + { + "epoch": 0.63, + "grad_norm": 2.380287911860163, + "learning_rate": 6.235787457557349e-07, + "loss": 0.266, + "step": 2238 + }, + { + "epoch": 0.63, + "grad_norm": 2.3759044308925366, + "learning_rate": 6.227286198170662e-07, + "loss": 0.299, + "step": 2239 + }, + { + "epoch": 0.63, + "grad_norm": 2.477400443290136, + "learning_rate": 6.218788116680689e-07, + "loss": 0.2634, + "step": 2240 + }, + { + "epoch": 0.63, + "grad_norm": 2.4706236064839318, + "learning_rate": 6.210293220245677e-07, + "loss": 0.3339, + "step": 2241 + }, + { + "epoch": 0.64, + "grad_norm": 2.4793553558938775, + "learning_rate": 6.201801516021189e-07, + "loss": 0.3025, + "step": 2242 + }, + { + "epoch": 0.64, + "grad_norm": 2.228421467160145, + "learning_rate": 6.193313011160103e-07, + "loss": 0.2664, + "step": 2243 + }, + { + "epoch": 0.64, + "grad_norm": 2.437534621664695, + "learning_rate": 6.184827712812603e-07, + "loss": 0.2625, + "step": 2244 + }, + { + "epoch": 0.64, + "grad_norm": 2.3956581186100396, + "learning_rate": 6.176345628126175e-07, + "loss": 0.2898, + "step": 2245 + }, + { + "epoch": 0.64, + "grad_norm": 2.239063252739338, + "learning_rate": 6.167866764245586e-07, + "loss": 0.2541, + "step": 2246 + }, + { + "epoch": 0.64, + "grad_norm": 2.476757025154476, + "learning_rate": 6.159391128312899e-07, + "loss": 0.3004, + "step": 2247 + }, + { + "epoch": 0.64, + "grad_norm": 2.2569840619493933, + "learning_rate": 6.150918727467454e-07, + "loss": 0.275, + "step": 2248 + }, + { + "epoch": 0.64, + "grad_norm": 2.2133733010199363, + "learning_rate": 6.142449568845877e-07, + "loss": 0.287, + "step": 2249 + }, + { + "epoch": 0.64, + "grad_norm": 2.364274466964281, + "learning_rate": 6.133983659582047e-07, + "loss": 0.2928, + "step": 2250 + }, + { + "epoch": 0.64, + "grad_norm": 2.531639474882001, + "learning_rate": 6.125521006807115e-07, + "loss": 0.2825, + "step": 2251 + }, + { + "epoch": 0.64, + "grad_norm": 2.300776299226663, + "learning_rate": 6.11706161764949e-07, + "loss": 0.2593, + "step": 2252 + }, + { + "epoch": 0.64, + "grad_norm": 2.1737940033605505, + "learning_rate": 6.10860549923482e-07, + "loss": 0.2513, + "step": 2253 + }, + { + "epoch": 0.64, + "grad_norm": 2.465815246077779, + "learning_rate": 6.10015265868602e-07, + "loss": 0.2757, + "step": 2254 + }, + { + "epoch": 0.64, + "grad_norm": 2.2752963971548117, + "learning_rate": 6.091703103123222e-07, + "loss": 0.255, + "step": 2255 + }, + { + "epoch": 0.64, + "grad_norm": 2.501046208396167, + "learning_rate": 6.083256839663806e-07, + "loss": 0.2861, + "step": 2256 + }, + { + "epoch": 0.64, + "grad_norm": 2.5397860645465875, + "learning_rate": 6.074813875422365e-07, + "loss": 0.3324, + "step": 2257 + }, + { + "epoch": 0.64, + "grad_norm": 2.3053544026215484, + "learning_rate": 6.066374217510724e-07, + "loss": 0.2752, + "step": 2258 + }, + { + "epoch": 0.64, + "grad_norm": 2.3995880048556217, + "learning_rate": 6.057937873037924e-07, + "loss": 0.29, + "step": 2259 + }, + { + "epoch": 0.64, + "grad_norm": 2.523188766633019, + "learning_rate": 6.04950484911021e-07, + "loss": 0.2783, + "step": 2260 + }, + { + "epoch": 0.64, + "grad_norm": 2.228431929782206, + "learning_rate": 6.041075152831025e-07, + "loss": 0.2707, + "step": 2261 + }, + { + "epoch": 0.64, + "grad_norm": 2.9934974905244562, + "learning_rate": 6.032648791301018e-07, + "loss": 0.2906, + "step": 2262 + }, + { + "epoch": 0.64, + "grad_norm": 2.3233178461924853, + "learning_rate": 6.024225771618023e-07, + "loss": 0.2635, + "step": 2263 + }, + { + "epoch": 0.64, + "grad_norm": 2.723255393523626, + "learning_rate": 6.015806100877069e-07, + "loss": 0.2819, + "step": 2264 + }, + { + "epoch": 0.64, + "grad_norm": 2.571178926895352, + "learning_rate": 6.007389786170354e-07, + "loss": 0.3109, + "step": 2265 + }, + { + "epoch": 0.64, + "grad_norm": 2.2559815923268878, + "learning_rate": 5.998976834587246e-07, + "loss": 0.2704, + "step": 2266 + }, + { + "epoch": 0.64, + "grad_norm": 2.49947468441889, + "learning_rate": 5.990567253214295e-07, + "loss": 0.2818, + "step": 2267 + }, + { + "epoch": 0.64, + "grad_norm": 2.5920054788059637, + "learning_rate": 5.98216104913519e-07, + "loss": 0.2952, + "step": 2268 + }, + { + "epoch": 0.64, + "grad_norm": 3.999502386879307, + "learning_rate": 5.973758229430805e-07, + "loss": 0.2542, + "step": 2269 + }, + { + "epoch": 0.64, + "grad_norm": 2.282738274536259, + "learning_rate": 5.965358801179137e-07, + "loss": 0.2721, + "step": 2270 + }, + { + "epoch": 0.64, + "grad_norm": 2.2313925933260172, + "learning_rate": 5.956962771455337e-07, + "loss": 0.2787, + "step": 2271 + }, + { + "epoch": 0.64, + "grad_norm": 2.2704746180326767, + "learning_rate": 5.948570147331692e-07, + "loss": 0.2731, + "step": 2272 + }, + { + "epoch": 0.64, + "grad_norm": 2.5502486483761717, + "learning_rate": 5.940180935877619e-07, + "loss": 0.3083, + "step": 2273 + }, + { + "epoch": 0.64, + "grad_norm": 2.4309385928987153, + "learning_rate": 5.931795144159665e-07, + "loss": 0.2857, + "step": 2274 + }, + { + "epoch": 0.64, + "grad_norm": 2.1658776156389146, + "learning_rate": 5.923412779241492e-07, + "loss": 0.2644, + "step": 2275 + }, + { + "epoch": 0.64, + "grad_norm": 2.494804348637084, + "learning_rate": 5.91503384818388e-07, + "loss": 0.2412, + "step": 2276 + }, + { + "epoch": 0.65, + "grad_norm": 2.2382077913458502, + "learning_rate": 5.906658358044703e-07, + "loss": 0.2652, + "step": 2277 + }, + { + "epoch": 0.65, + "grad_norm": 2.2962584594206747, + "learning_rate": 5.89828631587896e-07, + "loss": 0.2807, + "step": 2278 + }, + { + "epoch": 0.65, + "grad_norm": 2.188276662228928, + "learning_rate": 5.889917728738724e-07, + "loss": 0.2486, + "step": 2279 + }, + { + "epoch": 0.65, + "grad_norm": 2.527543431519013, + "learning_rate": 5.88155260367317e-07, + "loss": 0.2352, + "step": 2280 + }, + { + "epoch": 0.65, + "grad_norm": 2.302003991338731, + "learning_rate": 5.873190947728551e-07, + "loss": 0.2543, + "step": 2281 + }, + { + "epoch": 0.65, + "grad_norm": 2.2480317567146004, + "learning_rate": 5.864832767948198e-07, + "loss": 0.2514, + "step": 2282 + }, + { + "epoch": 0.65, + "grad_norm": 2.524084873276555, + "learning_rate": 5.85647807137252e-07, + "loss": 0.2942, + "step": 2283 + }, + { + "epoch": 0.65, + "grad_norm": 2.369980013342227, + "learning_rate": 5.848126865038989e-07, + "loss": 0.2793, + "step": 2284 + }, + { + "epoch": 0.65, + "grad_norm": 2.8317055902232497, + "learning_rate": 5.83977915598213e-07, + "loss": 0.2724, + "step": 2285 + }, + { + "epoch": 0.65, + "grad_norm": 2.4188479265192777, + "learning_rate": 5.83143495123353e-07, + "loss": 0.2642, + "step": 2286 + }, + { + "epoch": 0.65, + "grad_norm": 2.6498348586511487, + "learning_rate": 5.823094257821821e-07, + "loss": 0.2817, + "step": 2287 + }, + { + "epoch": 0.65, + "grad_norm": 2.568128866402004, + "learning_rate": 5.814757082772682e-07, + "loss": 0.2934, + "step": 2288 + }, + { + "epoch": 0.65, + "grad_norm": 2.2940019570209067, + "learning_rate": 5.806423433108821e-07, + "loss": 0.2577, + "step": 2289 + }, + { + "epoch": 0.65, + "grad_norm": 2.348630798206626, + "learning_rate": 5.798093315849983e-07, + "loss": 0.2721, + "step": 2290 + }, + { + "epoch": 0.65, + "grad_norm": 2.394079405948414, + "learning_rate": 5.789766738012931e-07, + "loss": 0.2801, + "step": 2291 + }, + { + "epoch": 0.65, + "grad_norm": 2.6282642371955447, + "learning_rate": 5.781443706611454e-07, + "loss": 0.289, + "step": 2292 + }, + { + "epoch": 0.65, + "grad_norm": 2.3702665835924117, + "learning_rate": 5.773124228656348e-07, + "loss": 0.2578, + "step": 2293 + }, + { + "epoch": 0.65, + "grad_norm": 2.8168042286446826, + "learning_rate": 5.764808311155418e-07, + "loss": 0.3191, + "step": 2294 + }, + { + "epoch": 0.65, + "grad_norm": 2.4400086616895784, + "learning_rate": 5.756495961113468e-07, + "loss": 0.2866, + "step": 2295 + }, + { + "epoch": 0.65, + "grad_norm": 2.304434447879119, + "learning_rate": 5.748187185532305e-07, + "loss": 0.2867, + "step": 2296 + }, + { + "epoch": 0.65, + "grad_norm": 2.4909981692133747, + "learning_rate": 5.739881991410707e-07, + "loss": 0.2543, + "step": 2297 + }, + { + "epoch": 0.65, + "grad_norm": 2.2463457224543433, + "learning_rate": 5.731580385744457e-07, + "loss": 0.2313, + "step": 2298 + }, + { + "epoch": 0.65, + "grad_norm": 2.857451260559649, + "learning_rate": 5.723282375526302e-07, + "loss": 0.2663, + "step": 2299 + }, + { + "epoch": 0.65, + "grad_norm": 2.2112224349538265, + "learning_rate": 5.714987967745967e-07, + "loss": 0.2684, + "step": 2300 + }, + { + "epoch": 0.65, + "grad_norm": 2.3854594089360917, + "learning_rate": 5.706697169390134e-07, + "loss": 0.2865, + "step": 2301 + }, + { + "epoch": 0.65, + "grad_norm": 2.6668868136628197, + "learning_rate": 5.698409987442448e-07, + "loss": 0.257, + "step": 2302 + }, + { + "epoch": 0.65, + "grad_norm": 2.137828218966821, + "learning_rate": 5.690126428883515e-07, + "loss": 0.276, + "step": 2303 + }, + { + "epoch": 0.65, + "grad_norm": 2.486527199626064, + "learning_rate": 5.681846500690884e-07, + "loss": 0.2888, + "step": 2304 + }, + { + "epoch": 0.65, + "grad_norm": 2.35238133058172, + "learning_rate": 5.673570209839045e-07, + "loss": 0.2749, + "step": 2305 + }, + { + "epoch": 0.65, + "grad_norm": 3.2026123661227297, + "learning_rate": 5.66529756329942e-07, + "loss": 0.2537, + "step": 2306 + }, + { + "epoch": 0.65, + "grad_norm": 2.2395741026150864, + "learning_rate": 5.657028568040365e-07, + "loss": 0.2884, + "step": 2307 + }, + { + "epoch": 0.65, + "grad_norm": 2.4458748873204863, + "learning_rate": 5.64876323102717e-07, + "loss": 0.2608, + "step": 2308 + }, + { + "epoch": 0.65, + "grad_norm": 2.7200316283550663, + "learning_rate": 5.640501559222034e-07, + "loss": 0.278, + "step": 2309 + }, + { + "epoch": 0.65, + "grad_norm": 3.209972703940308, + "learning_rate": 5.63224355958406e-07, + "loss": 0.2916, + "step": 2310 + }, + { + "epoch": 0.65, + "grad_norm": 2.5998394218333085, + "learning_rate": 5.623989239069274e-07, + "loss": 0.3057, + "step": 2311 + }, + { + "epoch": 0.66, + "grad_norm": 2.38565340315908, + "learning_rate": 5.615738604630591e-07, + "loss": 0.268, + "step": 2312 + }, + { + "epoch": 0.66, + "grad_norm": 2.568508776646103, + "learning_rate": 5.607491663217838e-07, + "loss": 0.3047, + "step": 2313 + }, + { + "epoch": 0.66, + "grad_norm": 2.3544146184023838, + "learning_rate": 5.599248421777707e-07, + "loss": 0.292, + "step": 2314 + }, + { + "epoch": 0.66, + "grad_norm": 2.511187292747781, + "learning_rate": 5.591008887253792e-07, + "loss": 0.2809, + "step": 2315 + }, + { + "epoch": 0.66, + "grad_norm": 2.3207736387674878, + "learning_rate": 5.582773066586552e-07, + "loss": 0.2698, + "step": 2316 + }, + { + "epoch": 0.66, + "grad_norm": 2.396898601611306, + "learning_rate": 5.574540966713337e-07, + "loss": 0.2789, + "step": 2317 + }, + { + "epoch": 0.66, + "grad_norm": 2.5437105260347734, + "learning_rate": 5.566312594568339e-07, + "loss": 0.2654, + "step": 2318 + }, + { + "epoch": 0.66, + "grad_norm": 2.389316829041838, + "learning_rate": 5.558087957082623e-07, + "loss": 0.2631, + "step": 2319 + }, + { + "epoch": 0.66, + "grad_norm": 2.3657439718027082, + "learning_rate": 5.549867061184108e-07, + "loss": 0.2951, + "step": 2320 + }, + { + "epoch": 0.66, + "grad_norm": 2.613468479680014, + "learning_rate": 5.541649913797558e-07, + "loss": 0.2997, + "step": 2321 + }, + { + "epoch": 0.66, + "grad_norm": 2.7800748740297463, + "learning_rate": 5.533436521844581e-07, + "loss": 0.271, + "step": 2322 + }, + { + "epoch": 0.66, + "grad_norm": 2.4391679900312337, + "learning_rate": 5.525226892243623e-07, + "loss": 0.2932, + "step": 2323 + }, + { + "epoch": 0.66, + "grad_norm": 2.276031842799922, + "learning_rate": 5.517021031909958e-07, + "loss": 0.2882, + "step": 2324 + }, + { + "epoch": 0.66, + "grad_norm": 2.400813584246261, + "learning_rate": 5.508818947755686e-07, + "loss": 0.315, + "step": 2325 + }, + { + "epoch": 0.66, + "grad_norm": 2.3432150318418765, + "learning_rate": 5.500620646689728e-07, + "loss": 0.258, + "step": 2326 + }, + { + "epoch": 0.66, + "grad_norm": 2.34350737859966, + "learning_rate": 5.492426135617815e-07, + "loss": 0.2588, + "step": 2327 + }, + { + "epoch": 0.66, + "grad_norm": 2.461391081740737, + "learning_rate": 5.484235421442491e-07, + "loss": 0.2932, + "step": 2328 + }, + { + "epoch": 0.66, + "grad_norm": 2.2771199211489286, + "learning_rate": 5.476048511063095e-07, + "loss": 0.2641, + "step": 2329 + }, + { + "epoch": 0.66, + "grad_norm": 2.5021933751522076, + "learning_rate": 5.467865411375765e-07, + "loss": 0.282, + "step": 2330 + }, + { + "epoch": 0.66, + "grad_norm": 2.32908541394442, + "learning_rate": 5.459686129273432e-07, + "loss": 0.2632, + "step": 2331 + }, + { + "epoch": 0.66, + "grad_norm": 2.67185336758561, + "learning_rate": 5.451510671645806e-07, + "loss": 0.2974, + "step": 2332 + }, + { + "epoch": 0.66, + "grad_norm": 2.3283227432999998, + "learning_rate": 5.443339045379379e-07, + "loss": 0.2605, + "step": 2333 + }, + { + "epoch": 0.66, + "grad_norm": 2.419537722246254, + "learning_rate": 5.435171257357416e-07, + "loss": 0.2619, + "step": 2334 + }, + { + "epoch": 0.66, + "grad_norm": 2.3691225216183653, + "learning_rate": 5.427007314459948e-07, + "loss": 0.2878, + "step": 2335 + }, + { + "epoch": 0.66, + "grad_norm": 2.4319599179402496, + "learning_rate": 5.418847223563761e-07, + "loss": 0.2798, + "step": 2336 + }, + { + "epoch": 0.66, + "grad_norm": 2.3038917195379143, + "learning_rate": 5.410690991542407e-07, + "loss": 0.2465, + "step": 2337 + }, + { + "epoch": 0.66, + "grad_norm": 2.3787734383661636, + "learning_rate": 5.402538625266183e-07, + "loss": 0.2965, + "step": 2338 + }, + { + "epoch": 0.66, + "grad_norm": 2.2196406147403054, + "learning_rate": 5.394390131602132e-07, + "loss": 0.2643, + "step": 2339 + }, + { + "epoch": 0.66, + "grad_norm": 2.4182567726274042, + "learning_rate": 5.386245517414026e-07, + "loss": 0.245, + "step": 2340 + }, + { + "epoch": 0.66, + "grad_norm": 2.343449748727217, + "learning_rate": 5.378104789562373e-07, + "loss": 0.2887, + "step": 2341 + }, + { + "epoch": 0.66, + "grad_norm": 2.2836474205993706, + "learning_rate": 5.36996795490442e-07, + "loss": 0.2683, + "step": 2342 + }, + { + "epoch": 0.66, + "grad_norm": 2.3647325537081225, + "learning_rate": 5.361835020294122e-07, + "loss": 0.2615, + "step": 2343 + }, + { + "epoch": 0.66, + "grad_norm": 2.8029499896540897, + "learning_rate": 5.353705992582146e-07, + "loss": 0.2397, + "step": 2344 + }, + { + "epoch": 0.66, + "grad_norm": 2.3598025459643845, + "learning_rate": 5.345580878615877e-07, + "loss": 0.2764, + "step": 2345 + }, + { + "epoch": 0.66, + "grad_norm": 2.2772605562966164, + "learning_rate": 5.337459685239394e-07, + "loss": 0.2361, + "step": 2346 + }, + { + "epoch": 0.67, + "grad_norm": 2.398278736692514, + "learning_rate": 5.329342419293488e-07, + "loss": 0.277, + "step": 2347 + }, + { + "epoch": 0.67, + "grad_norm": 2.4037681130844675, + "learning_rate": 5.321229087615634e-07, + "loss": 0.2763, + "step": 2348 + }, + { + "epoch": 0.67, + "grad_norm": 2.485457451422128, + "learning_rate": 5.313119697039984e-07, + "loss": 0.2885, + "step": 2349 + }, + { + "epoch": 0.67, + "grad_norm": 2.3282359187966373, + "learning_rate": 5.305014254397377e-07, + "loss": 0.2649, + "step": 2350 + }, + { + "epoch": 0.67, + "grad_norm": 2.38908486454763, + "learning_rate": 5.296912766515338e-07, + "loss": 0.2835, + "step": 2351 + }, + { + "epoch": 0.67, + "grad_norm": 2.592192298182752, + "learning_rate": 5.288815240218048e-07, + "loss": 0.3013, + "step": 2352 + }, + { + "epoch": 0.67, + "grad_norm": 2.35159951282308, + "learning_rate": 5.280721682326348e-07, + "loss": 0.2669, + "step": 2353 + }, + { + "epoch": 0.67, + "grad_norm": 2.314369465048757, + "learning_rate": 5.272632099657743e-07, + "loss": 0.2702, + "step": 2354 + }, + { + "epoch": 0.67, + "grad_norm": 2.5082008674395184, + "learning_rate": 5.264546499026387e-07, + "loss": 0.2712, + "step": 2355 + }, + { + "epoch": 0.67, + "grad_norm": 2.1962925179788866, + "learning_rate": 5.256464887243094e-07, + "loss": 0.2556, + "step": 2356 + }, + { + "epoch": 0.67, + "grad_norm": 2.243902743791842, + "learning_rate": 5.248387271115291e-07, + "loss": 0.2622, + "step": 2357 + }, + { + "epoch": 0.67, + "grad_norm": 2.281956857693016, + "learning_rate": 5.240313657447057e-07, + "loss": 0.2766, + "step": 2358 + }, + { + "epoch": 0.67, + "grad_norm": 2.419361403863879, + "learning_rate": 5.232244053039099e-07, + "loss": 0.2697, + "step": 2359 + }, + { + "epoch": 0.67, + "grad_norm": 2.3917488210639513, + "learning_rate": 5.224178464688741e-07, + "loss": 0.2663, + "step": 2360 + }, + { + "epoch": 0.67, + "grad_norm": 2.331041543408664, + "learning_rate": 5.216116899189928e-07, + "loss": 0.2658, + "step": 2361 + }, + { + "epoch": 0.67, + "grad_norm": 2.821521842305285, + "learning_rate": 5.208059363333217e-07, + "loss": 0.296, + "step": 2362 + }, + { + "epoch": 0.67, + "grad_norm": 2.868996764673546, + "learning_rate": 5.200005863905767e-07, + "loss": 0.2982, + "step": 2363 + }, + { + "epoch": 0.67, + "grad_norm": 2.2425213278054152, + "learning_rate": 5.191956407691343e-07, + "loss": 0.2369, + "step": 2364 + }, + { + "epoch": 0.67, + "grad_norm": 2.332277471029029, + "learning_rate": 5.183911001470295e-07, + "loss": 0.2435, + "step": 2365 + }, + { + "epoch": 0.67, + "grad_norm": 2.493223595181339, + "learning_rate": 5.17586965201957e-07, + "loss": 0.268, + "step": 2366 + }, + { + "epoch": 0.67, + "grad_norm": 2.4352432752453614, + "learning_rate": 5.167832366112694e-07, + "loss": 0.2768, + "step": 2367 + }, + { + "epoch": 0.67, + "grad_norm": 2.461278109640111, + "learning_rate": 5.159799150519772e-07, + "loss": 0.3012, + "step": 2368 + }, + { + "epoch": 0.67, + "grad_norm": 2.146603860090843, + "learning_rate": 5.151770012007479e-07, + "loss": 0.2744, + "step": 2369 + }, + { + "epoch": 0.67, + "grad_norm": 2.2789999167061885, + "learning_rate": 5.143744957339056e-07, + "loss": 0.2775, + "step": 2370 + }, + { + "epoch": 0.67, + "grad_norm": 2.1648708799970486, + "learning_rate": 5.135723993274303e-07, + "loss": 0.2581, + "step": 2371 + }, + { + "epoch": 0.67, + "grad_norm": 2.40279231393275, + "learning_rate": 5.127707126569576e-07, + "loss": 0.2625, + "step": 2372 + }, + { + "epoch": 0.67, + "grad_norm": 2.879779242816951, + "learning_rate": 5.11969436397778e-07, + "loss": 0.2719, + "step": 2373 + }, + { + "epoch": 0.67, + "grad_norm": 2.4951831674750053, + "learning_rate": 5.111685712248363e-07, + "loss": 0.2983, + "step": 2374 + }, + { + "epoch": 0.67, + "grad_norm": 2.3148481953963143, + "learning_rate": 5.103681178127302e-07, + "loss": 0.2609, + "step": 2375 + }, + { + "epoch": 0.67, + "grad_norm": 2.4355470361980087, + "learning_rate": 5.095680768357122e-07, + "loss": 0.268, + "step": 2376 + }, + { + "epoch": 0.67, + "grad_norm": 2.957762249807943, + "learning_rate": 5.087684489676861e-07, + "loss": 0.2723, + "step": 2377 + }, + { + "epoch": 0.67, + "grad_norm": 2.3268375859011248, + "learning_rate": 5.079692348822085e-07, + "loss": 0.2763, + "step": 2378 + }, + { + "epoch": 0.67, + "grad_norm": 3.131960016007764, + "learning_rate": 5.071704352524862e-07, + "loss": 0.2984, + "step": 2379 + }, + { + "epoch": 0.67, + "grad_norm": 2.40458029742982, + "learning_rate": 5.06372050751378e-07, + "loss": 0.2836, + "step": 2380 + }, + { + "epoch": 0.67, + "grad_norm": 2.7074747538153594, + "learning_rate": 5.055740820513932e-07, + "loss": 0.2832, + "step": 2381 + }, + { + "epoch": 0.67, + "grad_norm": 2.2862593178591277, + "learning_rate": 5.047765298246907e-07, + "loss": 0.2952, + "step": 2382 + }, + { + "epoch": 0.68, + "grad_norm": 2.3946399410902326, + "learning_rate": 5.039793947430773e-07, + "loss": 0.2616, + "step": 2383 + }, + { + "epoch": 0.68, + "grad_norm": 2.3943577455752307, + "learning_rate": 5.031826774780097e-07, + "loss": 0.2822, + "step": 2384 + }, + { + "epoch": 0.68, + "grad_norm": 2.3088941950755317, + "learning_rate": 5.023863787005929e-07, + "loss": 0.2454, + "step": 2385 + }, + { + "epoch": 0.68, + "grad_norm": 2.28104166201968, + "learning_rate": 5.015904990815792e-07, + "loss": 0.2674, + "step": 2386 + }, + { + "epoch": 0.68, + "grad_norm": 2.4601117984407956, + "learning_rate": 5.007950392913662e-07, + "loss": 0.3097, + "step": 2387 + }, + { + "epoch": 0.68, + "grad_norm": 2.292624142786809, + "learning_rate": 5.000000000000002e-07, + "loss": 0.2736, + "step": 2388 + }, + { + "epoch": 0.68, + "grad_norm": 2.4788897424360155, + "learning_rate": 4.992053818771714e-07, + "loss": 0.2613, + "step": 2389 + }, + { + "epoch": 0.68, + "grad_norm": 2.359979008814565, + "learning_rate": 4.984111855922176e-07, + "loss": 0.2764, + "step": 2390 + }, + { + "epoch": 0.68, + "grad_norm": 2.2760545050797067, + "learning_rate": 4.976174118141185e-07, + "loss": 0.2722, + "step": 2391 + }, + { + "epoch": 0.68, + "grad_norm": 2.4792421447033077, + "learning_rate": 4.968240612114995e-07, + "loss": 0.2531, + "step": 2392 + }, + { + "epoch": 0.68, + "grad_norm": 2.3183540813170143, + "learning_rate": 4.960311344526292e-07, + "loss": 0.2784, + "step": 2393 + }, + { + "epoch": 0.68, + "grad_norm": 2.286214631829258, + "learning_rate": 4.952386322054188e-07, + "loss": 0.2646, + "step": 2394 + }, + { + "epoch": 0.68, + "grad_norm": 2.403792277262232, + "learning_rate": 4.944465551374238e-07, + "loss": 0.2963, + "step": 2395 + }, + { + "epoch": 0.68, + "grad_norm": 2.229517392197221, + "learning_rate": 4.936549039158385e-07, + "loss": 0.2491, + "step": 2396 + }, + { + "epoch": 0.68, + "grad_norm": 2.344675961661458, + "learning_rate": 4.928636792075007e-07, + "loss": 0.2838, + "step": 2397 + }, + { + "epoch": 0.68, + "grad_norm": 2.208281211784219, + "learning_rate": 4.920728816788883e-07, + "loss": 0.2643, + "step": 2398 + }, + { + "epoch": 0.68, + "grad_norm": 2.292389280601437, + "learning_rate": 4.912825119961194e-07, + "loss": 0.2835, + "step": 2399 + }, + { + "epoch": 0.68, + "grad_norm": 2.458712202984134, + "learning_rate": 4.904925708249516e-07, + "loss": 0.2845, + "step": 2400 + }, + { + "epoch": 0.68, + "grad_norm": 2.3774224537052615, + "learning_rate": 4.897030588307816e-07, + "loss": 0.2813, + "step": 2401 + }, + { + "epoch": 0.68, + "grad_norm": 2.297106016389134, + "learning_rate": 4.889139766786447e-07, + "loss": 0.2957, + "step": 2402 + }, + { + "epoch": 0.68, + "grad_norm": 2.4943961815678883, + "learning_rate": 4.881253250332141e-07, + "loss": 0.2811, + "step": 2403 + }, + { + "epoch": 0.68, + "grad_norm": 2.319088781552554, + "learning_rate": 4.873371045588001e-07, + "loss": 0.2814, + "step": 2404 + }, + { + "epoch": 0.68, + "grad_norm": 2.352193260486395, + "learning_rate": 4.865493159193504e-07, + "loss": 0.2689, + "step": 2405 + }, + { + "epoch": 0.68, + "grad_norm": 2.5413080645396913, + "learning_rate": 4.857619597784482e-07, + "loss": 0.3134, + "step": 2406 + }, + { + "epoch": 0.68, + "grad_norm": 2.420124609671713, + "learning_rate": 4.84975036799313e-07, + "loss": 0.2668, + "step": 2407 + }, + { + "epoch": 0.68, + "grad_norm": 2.8124452637866133, + "learning_rate": 4.841885476447995e-07, + "loss": 0.2866, + "step": 2408 + }, + { + "epoch": 0.68, + "grad_norm": 2.258369093161929, + "learning_rate": 4.834024929773956e-07, + "loss": 0.2565, + "step": 2409 + }, + { + "epoch": 0.68, + "grad_norm": 2.2445143082198666, + "learning_rate": 4.826168734592253e-07, + "loss": 0.2663, + "step": 2410 + }, + { + "epoch": 0.68, + "grad_norm": 2.439178512532609, + "learning_rate": 4.818316897520449e-07, + "loss": 0.2866, + "step": 2411 + }, + { + "epoch": 0.68, + "grad_norm": 2.4628414329677835, + "learning_rate": 4.810469425172439e-07, + "loss": 0.2673, + "step": 2412 + }, + { + "epoch": 0.68, + "grad_norm": 2.2814847882421305, + "learning_rate": 4.802626324158432e-07, + "loss": 0.2663, + "step": 2413 + }, + { + "epoch": 0.68, + "grad_norm": 2.436509033085469, + "learning_rate": 4.794787601084965e-07, + "loss": 0.2738, + "step": 2414 + }, + { + "epoch": 0.68, + "grad_norm": 2.347829927437003, + "learning_rate": 4.786953262554891e-07, + "loss": 0.2818, + "step": 2415 + }, + { + "epoch": 0.68, + "grad_norm": 2.2396757844804287, + "learning_rate": 4.779123315167361e-07, + "loss": 0.2528, + "step": 2416 + }, + { + "epoch": 0.68, + "grad_norm": 2.3879035459539453, + "learning_rate": 4.771297765517833e-07, + "loss": 0.264, + "step": 2417 + }, + { + "epoch": 0.69, + "grad_norm": 2.413721326849401, + "learning_rate": 4.763476620198047e-07, + "loss": 0.2489, + "step": 2418 + }, + { + "epoch": 0.69, + "grad_norm": 2.353494934214268, + "learning_rate": 4.755659885796054e-07, + "loss": 0.2713, + "step": 2419 + }, + { + "epoch": 0.69, + "grad_norm": 2.2907994697079594, + "learning_rate": 4.747847568896177e-07, + "loss": 0.2749, + "step": 2420 + }, + { + "epoch": 0.69, + "grad_norm": 2.340449182606632, + "learning_rate": 4.740039676079022e-07, + "loss": 0.295, + "step": 2421 + }, + { + "epoch": 0.69, + "grad_norm": 2.2745448209728005, + "learning_rate": 4.73223621392146e-07, + "loss": 0.2573, + "step": 2422 + }, + { + "epoch": 0.69, + "grad_norm": 2.326430648936994, + "learning_rate": 4.724437188996637e-07, + "loss": 0.2724, + "step": 2423 + }, + { + "epoch": 0.69, + "grad_norm": 2.574729677370753, + "learning_rate": 4.716642607873967e-07, + "loss": 0.3077, + "step": 2424 + }, + { + "epoch": 0.69, + "grad_norm": 2.368767987272835, + "learning_rate": 4.708852477119116e-07, + "loss": 0.2912, + "step": 2425 + }, + { + "epoch": 0.69, + "grad_norm": 2.327984474192384, + "learning_rate": 4.7010668032939925e-07, + "loss": 0.2689, + "step": 2426 + }, + { + "epoch": 0.69, + "grad_norm": 2.3809594525501567, + "learning_rate": 4.6932855929567606e-07, + "loss": 0.2723, + "step": 2427 + }, + { + "epoch": 0.69, + "grad_norm": 2.2477212537841136, + "learning_rate": 4.6855088526618204e-07, + "loss": 0.2677, + "step": 2428 + }, + { + "epoch": 0.69, + "grad_norm": 2.8939219591101, + "learning_rate": 4.6777365889598176e-07, + "loss": 0.2546, + "step": 2429 + }, + { + "epoch": 0.69, + "grad_norm": 2.3686432161421394, + "learning_rate": 4.6699688083976085e-07, + "loss": 0.2646, + "step": 2430 + }, + { + "epoch": 0.69, + "grad_norm": 2.425030952188078, + "learning_rate": 4.662205517518286e-07, + "loss": 0.2732, + "step": 2431 + }, + { + "epoch": 0.69, + "grad_norm": 2.50809725901002, + "learning_rate": 4.6544467228611584e-07, + "loss": 0.2584, + "step": 2432 + }, + { + "epoch": 0.69, + "grad_norm": 2.479077560409588, + "learning_rate": 4.646692430961744e-07, + "loss": 0.2749, + "step": 2433 + }, + { + "epoch": 0.69, + "grad_norm": 2.7200829790390317, + "learning_rate": 4.6389426483517736e-07, + "loss": 0.2805, + "step": 2434 + }, + { + "epoch": 0.69, + "grad_norm": 2.3144164267512886, + "learning_rate": 4.631197381559173e-07, + "loss": 0.2975, + "step": 2435 + }, + { + "epoch": 0.69, + "grad_norm": 2.459976750375668, + "learning_rate": 4.6234566371080697e-07, + "loss": 0.2956, + "step": 2436 + }, + { + "epoch": 0.69, + "grad_norm": 2.4508229207793812, + "learning_rate": 4.6157204215187795e-07, + "loss": 0.2788, + "step": 2437 + }, + { + "epoch": 0.69, + "grad_norm": 2.418521332143917, + "learning_rate": 4.6079887413078034e-07, + "loss": 0.2774, + "step": 2438 + }, + { + "epoch": 0.69, + "grad_norm": 2.3565178114840957, + "learning_rate": 4.6002616029878226e-07, + "loss": 0.2461, + "step": 2439 + }, + { + "epoch": 0.69, + "grad_norm": 2.318714557506511, + "learning_rate": 4.5925390130676913e-07, + "loss": 0.2673, + "step": 2440 + }, + { + "epoch": 0.69, + "grad_norm": 2.570508945139977, + "learning_rate": 4.584820978052434e-07, + "loss": 0.3228, + "step": 2441 + }, + { + "epoch": 0.69, + "grad_norm": 2.426739836009848, + "learning_rate": 4.5771075044432385e-07, + "loss": 0.2663, + "step": 2442 + }, + { + "epoch": 0.69, + "grad_norm": 2.5163005062865373, + "learning_rate": 4.5693985987374475e-07, + "loss": 0.3013, + "step": 2443 + }, + { + "epoch": 0.69, + "grad_norm": 2.397302513009581, + "learning_rate": 4.5616942674285596e-07, + "loss": 0.2689, + "step": 2444 + }, + { + "epoch": 0.69, + "grad_norm": 2.384358665290794, + "learning_rate": 4.553994517006219e-07, + "loss": 0.274, + "step": 2445 + }, + { + "epoch": 0.69, + "grad_norm": 2.3780130465359197, + "learning_rate": 4.54629935395621e-07, + "loss": 0.2703, + "step": 2446 + }, + { + "epoch": 0.69, + "grad_norm": 2.41175660090204, + "learning_rate": 4.5386087847604583e-07, + "loss": 0.2761, + "step": 2447 + }, + { + "epoch": 0.69, + "grad_norm": 2.2308940957310397, + "learning_rate": 4.5309228158970027e-07, + "loss": 0.2744, + "step": 2448 + }, + { + "epoch": 0.69, + "grad_norm": 2.2622794723399196, + "learning_rate": 4.523241453840033e-07, + "loss": 0.2634, + "step": 2449 + }, + { + "epoch": 0.69, + "grad_norm": 2.4892472914365382, + "learning_rate": 4.51556470505984e-07, + "loss": 0.251, + "step": 2450 + }, + { + "epoch": 0.69, + "grad_norm": 2.5240215459027504, + "learning_rate": 4.507892576022838e-07, + "loss": 0.2355, + "step": 2451 + }, + { + "epoch": 0.69, + "grad_norm": 2.206020087049321, + "learning_rate": 4.500225073191539e-07, + "loss": 0.2829, + "step": 2452 + }, + { + "epoch": 0.7, + "grad_norm": 2.254017885293891, + "learning_rate": 4.4925622030245645e-07, + "loss": 0.2649, + "step": 2453 + }, + { + "epoch": 0.7, + "grad_norm": 2.3498609298835365, + "learning_rate": 4.484903971976641e-07, + "loss": 0.2857, + "step": 2454 + }, + { + "epoch": 0.7, + "grad_norm": 2.0971087145254033, + "learning_rate": 4.4772503864985813e-07, + "loss": 0.2547, + "step": 2455 + }, + { + "epoch": 0.7, + "grad_norm": 2.321955789347521, + "learning_rate": 4.469601453037276e-07, + "loss": 0.2653, + "step": 2456 + }, + { + "epoch": 0.7, + "grad_norm": 2.205208474190725, + "learning_rate": 4.4619571780357046e-07, + "loss": 0.2622, + "step": 2457 + }, + { + "epoch": 0.7, + "grad_norm": 2.2349667352343308, + "learning_rate": 4.4543175679329337e-07, + "loss": 0.2675, + "step": 2458 + }, + { + "epoch": 0.7, + "grad_norm": 2.49822879969086, + "learning_rate": 4.4466826291640867e-07, + "loss": 0.2621, + "step": 2459 + }, + { + "epoch": 0.7, + "grad_norm": 2.338010522791837, + "learning_rate": 4.439052368160351e-07, + "loss": 0.2782, + "step": 2460 + }, + { + "epoch": 0.7, + "grad_norm": 2.5150398161628766, + "learning_rate": 4.43142679134898e-07, + "loss": 0.2697, + "step": 2461 + }, + { + "epoch": 0.7, + "grad_norm": 2.324998946630601, + "learning_rate": 4.4238059051532774e-07, + "loss": 0.2486, + "step": 2462 + }, + { + "epoch": 0.7, + "grad_norm": 2.748258160226059, + "learning_rate": 4.4161897159926044e-07, + "loss": 0.2896, + "step": 2463 + }, + { + "epoch": 0.7, + "grad_norm": 2.5017394373991007, + "learning_rate": 4.4085782302823604e-07, + "loss": 0.2904, + "step": 2464 + }, + { + "epoch": 0.7, + "grad_norm": 2.378543367007354, + "learning_rate": 4.400971454433975e-07, + "loss": 0.2693, + "step": 2465 + }, + { + "epoch": 0.7, + "grad_norm": 2.422935392437184, + "learning_rate": 4.39336939485492e-07, + "loss": 0.2735, + "step": 2466 + }, + { + "epoch": 0.7, + "grad_norm": 2.3311674451261903, + "learning_rate": 4.3857720579486887e-07, + "loss": 0.2516, + "step": 2467 + }, + { + "epoch": 0.7, + "grad_norm": 2.36415216404044, + "learning_rate": 4.3781794501148105e-07, + "loss": 0.2804, + "step": 2468 + }, + { + "epoch": 0.7, + "grad_norm": 2.6607955171263975, + "learning_rate": 4.3705915777488113e-07, + "loss": 0.2872, + "step": 2469 + }, + { + "epoch": 0.7, + "grad_norm": 2.4385599819583006, + "learning_rate": 4.363008447242239e-07, + "loss": 0.3045, + "step": 2470 + }, + { + "epoch": 0.7, + "grad_norm": 2.398963114273932, + "learning_rate": 4.355430064982646e-07, + "loss": 0.2633, + "step": 2471 + }, + { + "epoch": 0.7, + "grad_norm": 2.4263112035637553, + "learning_rate": 4.3478564373535844e-07, + "loss": 0.2872, + "step": 2472 + }, + { + "epoch": 0.7, + "grad_norm": 2.2860756043193895, + "learning_rate": 4.3402875707346033e-07, + "loss": 0.2481, + "step": 2473 + }, + { + "epoch": 0.7, + "grad_norm": 2.3064939908293614, + "learning_rate": 4.3327234715012373e-07, + "loss": 0.3014, + "step": 2474 + }, + { + "epoch": 0.7, + "grad_norm": 2.3342857525505023, + "learning_rate": 4.3251641460250086e-07, + "loss": 0.2614, + "step": 2475 + }, + { + "epoch": 0.7, + "grad_norm": 2.3856176203445134, + "learning_rate": 4.3176096006734175e-07, + "loss": 0.2783, + "step": 2476 + }, + { + "epoch": 0.7, + "grad_norm": 2.3528075219115725, + "learning_rate": 4.3100598418099377e-07, + "loss": 0.2615, + "step": 2477 + }, + { + "epoch": 0.7, + "grad_norm": 2.4978626002403077, + "learning_rate": 4.30251487579401e-07, + "loss": 0.2893, + "step": 2478 + }, + { + "epoch": 0.7, + "grad_norm": 2.3772114464772973, + "learning_rate": 4.2949747089810407e-07, + "loss": 0.277, + "step": 2479 + }, + { + "epoch": 0.7, + "grad_norm": 2.532502965255537, + "learning_rate": 4.2874393477223913e-07, + "loss": 0.283, + "step": 2480 + }, + { + "epoch": 0.7, + "grad_norm": 2.433748637942874, + "learning_rate": 4.279908798365378e-07, + "loss": 0.2877, + "step": 2481 + }, + { + "epoch": 0.7, + "grad_norm": 2.3727980561715913, + "learning_rate": 4.272383067253253e-07, + "loss": 0.2741, + "step": 2482 + }, + { + "epoch": 0.7, + "grad_norm": 2.3850416527902296, + "learning_rate": 4.264862160725229e-07, + "loss": 0.2602, + "step": 2483 + }, + { + "epoch": 0.7, + "grad_norm": 2.513270043584115, + "learning_rate": 4.25734608511644e-07, + "loss": 0.2897, + "step": 2484 + }, + { + "epoch": 0.7, + "grad_norm": 2.350862190964246, + "learning_rate": 4.2498348467579547e-07, + "loss": 0.2748, + "step": 2485 + }, + { + "epoch": 0.7, + "grad_norm": 2.197139592567514, + "learning_rate": 4.2423284519767735e-07, + "loss": 0.2445, + "step": 2486 + }, + { + "epoch": 0.7, + "grad_norm": 2.3065943115838583, + "learning_rate": 4.2348269070957977e-07, + "loss": 0.2764, + "step": 2487 + }, + { + "epoch": 0.7, + "grad_norm": 2.3619768477901895, + "learning_rate": 4.22733021843387e-07, + "loss": 0.3006, + "step": 2488 + }, + { + "epoch": 0.71, + "grad_norm": 2.207189526645788, + "learning_rate": 4.2198383923057224e-07, + "loss": 0.273, + "step": 2489 + }, + { + "epoch": 0.71, + "grad_norm": 2.358662330834118, + "learning_rate": 4.212351435022005e-07, + "loss": 0.291, + "step": 2490 + }, + { + "epoch": 0.71, + "grad_norm": 2.721161217426416, + "learning_rate": 4.2048693528892455e-07, + "loss": 0.2514, + "step": 2491 + }, + { + "epoch": 0.71, + "grad_norm": 2.41417573342347, + "learning_rate": 4.197392152209892e-07, + "loss": 0.2955, + "step": 2492 + }, + { + "epoch": 0.71, + "grad_norm": 2.4500824207202223, + "learning_rate": 4.189919839282264e-07, + "loss": 0.2735, + "step": 2493 + }, + { + "epoch": 0.71, + "grad_norm": 2.5618427216529427, + "learning_rate": 4.1824524204005706e-07, + "loss": 0.2856, + "step": 2494 + }, + { + "epoch": 0.71, + "grad_norm": 2.380341795132665, + "learning_rate": 4.1749899018548885e-07, + "loss": 0.2561, + "step": 2495 + }, + { + "epoch": 0.71, + "grad_norm": 2.144872498829846, + "learning_rate": 4.1675322899311736e-07, + "loss": 0.2487, + "step": 2496 + }, + { + "epoch": 0.71, + "grad_norm": 2.2617982886035275, + "learning_rate": 4.1600795909112564e-07, + "loss": 0.2319, + "step": 2497 + }, + { + "epoch": 0.71, + "grad_norm": 2.382061749693088, + "learning_rate": 4.152631811072822e-07, + "loss": 0.3004, + "step": 2498 + }, + { + "epoch": 0.71, + "grad_norm": 2.4247661850403164, + "learning_rate": 4.145188956689405e-07, + "loss": 0.3104, + "step": 2499 + }, + { + "epoch": 0.71, + "grad_norm": 2.3290005093252626, + "learning_rate": 4.137751034030399e-07, + "loss": 0.2591, + "step": 2500 + }, + { + "epoch": 0.71, + "grad_norm": 2.276287989665381, + "learning_rate": 4.130318049361039e-07, + "loss": 0.2786, + "step": 2501 + }, + { + "epoch": 0.71, + "grad_norm": 2.3270973591183304, + "learning_rate": 4.1228900089424155e-07, + "loss": 0.2857, + "step": 2502 + }, + { + "epoch": 0.71, + "grad_norm": 2.456702277321644, + "learning_rate": 4.1154669190314307e-07, + "loss": 0.2732, + "step": 2503 + }, + { + "epoch": 0.71, + "grad_norm": 2.306313481934988, + "learning_rate": 4.1080487858808334e-07, + "loss": 0.2913, + "step": 2504 + }, + { + "epoch": 0.71, + "grad_norm": 2.1824814739733234, + "learning_rate": 4.10063561573919e-07, + "loss": 0.2578, + "step": 2505 + }, + { + "epoch": 0.71, + "grad_norm": 2.3460665199937334, + "learning_rate": 4.0932274148508863e-07, + "loss": 0.2752, + "step": 2506 + }, + { + "epoch": 0.71, + "grad_norm": 2.3729008838322247, + "learning_rate": 4.085824189456135e-07, + "loss": 0.2646, + "step": 2507 + }, + { + "epoch": 0.71, + "grad_norm": 2.423768847342455, + "learning_rate": 4.0784259457909363e-07, + "loss": 0.2674, + "step": 2508 + }, + { + "epoch": 0.71, + "grad_norm": 2.430893115112683, + "learning_rate": 4.071032690087111e-07, + "loss": 0.2574, + "step": 2509 + }, + { + "epoch": 0.71, + "grad_norm": 2.4690843164366507, + "learning_rate": 4.0636444285722684e-07, + "loss": 0.2577, + "step": 2510 + }, + { + "epoch": 0.71, + "grad_norm": 2.293998049058781, + "learning_rate": 4.056261167469818e-07, + "loss": 0.2649, + "step": 2511 + }, + { + "epoch": 0.71, + "grad_norm": 2.295221692402047, + "learning_rate": 4.048882912998953e-07, + "loss": 0.2805, + "step": 2512 + }, + { + "epoch": 0.71, + "grad_norm": 2.2638696374900342, + "learning_rate": 4.0415096713746523e-07, + "loss": 0.249, + "step": 2513 + }, + { + "epoch": 0.71, + "grad_norm": 2.286203414947763, + "learning_rate": 4.0341414488076697e-07, + "loss": 0.2508, + "step": 2514 + }, + { + "epoch": 0.71, + "grad_norm": 2.5829535270097934, + "learning_rate": 4.026778251504532e-07, + "loss": 0.291, + "step": 2515 + }, + { + "epoch": 0.71, + "grad_norm": 2.5086684577605354, + "learning_rate": 4.0194200856675333e-07, + "loss": 0.2999, + "step": 2516 + }, + { + "epoch": 0.71, + "grad_norm": 2.360608262561292, + "learning_rate": 4.0120669574947297e-07, + "loss": 0.2708, + "step": 2517 + }, + { + "epoch": 0.71, + "grad_norm": 2.2937174901208697, + "learning_rate": 4.0047188731799343e-07, + "loss": 0.265, + "step": 2518 + }, + { + "epoch": 0.71, + "grad_norm": 2.3099838395487176, + "learning_rate": 3.99737583891271e-07, + "loss": 0.2728, + "step": 2519 + }, + { + "epoch": 0.71, + "grad_norm": 2.574811131070446, + "learning_rate": 3.9900378608783703e-07, + "loss": 0.2842, + "step": 2520 + }, + { + "epoch": 0.71, + "grad_norm": 2.524084353539817, + "learning_rate": 3.982704945257956e-07, + "loss": 0.2706, + "step": 2521 + }, + { + "epoch": 0.71, + "grad_norm": 2.5793952681688364, + "learning_rate": 3.9753770982282654e-07, + "loss": 0.265, + "step": 2522 + }, + { + "epoch": 0.71, + "grad_norm": 2.175096353388101, + "learning_rate": 3.9680543259618103e-07, + "loss": 0.2393, + "step": 2523 + }, + { + "epoch": 0.72, + "grad_norm": 2.2034119506106253, + "learning_rate": 3.960736634626838e-07, + "loss": 0.259, + "step": 2524 + }, + { + "epoch": 0.72, + "grad_norm": 2.187282761189505, + "learning_rate": 3.9534240303873e-07, + "loss": 0.2573, + "step": 2525 + }, + { + "epoch": 0.72, + "grad_norm": 2.2560105286323884, + "learning_rate": 3.9461165194028854e-07, + "loss": 0.2578, + "step": 2526 + }, + { + "epoch": 0.72, + "grad_norm": 2.2917686683304423, + "learning_rate": 3.9388141078289774e-07, + "loss": 0.261, + "step": 2527 + }, + { + "epoch": 0.72, + "grad_norm": 2.4807963920053657, + "learning_rate": 3.9315168018166676e-07, + "loss": 0.3061, + "step": 2528 + }, + { + "epoch": 0.72, + "grad_norm": 2.36922156845111, + "learning_rate": 3.924224607512753e-07, + "loss": 0.2702, + "step": 2529 + }, + { + "epoch": 0.72, + "grad_norm": 2.3190444175720692, + "learning_rate": 3.9169375310597054e-07, + "loss": 0.2649, + "step": 2530 + }, + { + "epoch": 0.72, + "grad_norm": 2.5630193048920256, + "learning_rate": 3.909655578595713e-07, + "loss": 0.2565, + "step": 2531 + }, + { + "epoch": 0.72, + "grad_norm": 2.3259427043076393, + "learning_rate": 3.9023787562546284e-07, + "loss": 0.2595, + "step": 2532 + }, + { + "epoch": 0.72, + "grad_norm": 2.2907661152770395, + "learning_rate": 3.895107070165995e-07, + "loss": 0.2744, + "step": 2533 + }, + { + "epoch": 0.72, + "grad_norm": 2.1958089968134478, + "learning_rate": 3.887840526455014e-07, + "loss": 0.2606, + "step": 2534 + }, + { + "epoch": 0.72, + "grad_norm": 2.0772019948377327, + "learning_rate": 3.880579131242566e-07, + "loss": 0.2445, + "step": 2535 + }, + { + "epoch": 0.72, + "grad_norm": 2.4373504411076095, + "learning_rate": 3.873322890645201e-07, + "loss": 0.2693, + "step": 2536 + }, + { + "epoch": 0.72, + "grad_norm": 2.5830036698453145, + "learning_rate": 3.8660718107751176e-07, + "loss": 0.2844, + "step": 2537 + }, + { + "epoch": 0.72, + "grad_norm": 2.484204909542249, + "learning_rate": 3.8588258977401636e-07, + "loss": 0.2637, + "step": 2538 + }, + { + "epoch": 0.72, + "grad_norm": 2.3622160811014967, + "learning_rate": 3.851585157643844e-07, + "loss": 0.2927, + "step": 2539 + }, + { + "epoch": 0.72, + "grad_norm": 2.3793320717187747, + "learning_rate": 3.844349596585298e-07, + "loss": 0.2663, + "step": 2540 + }, + { + "epoch": 0.72, + "grad_norm": 2.361965899246749, + "learning_rate": 3.8371192206593174e-07, + "loss": 0.2719, + "step": 2541 + }, + { + "epoch": 0.72, + "grad_norm": 2.384032621212719, + "learning_rate": 3.8298940359563057e-07, + "loss": 0.2671, + "step": 2542 + }, + { + "epoch": 0.72, + "grad_norm": 2.6938610355413224, + "learning_rate": 3.822674048562309e-07, + "loss": 0.2581, + "step": 2543 + }, + { + "epoch": 0.72, + "grad_norm": 2.3434426912533093, + "learning_rate": 3.8154592645589877e-07, + "loss": 0.2656, + "step": 2544 + }, + { + "epoch": 0.72, + "grad_norm": 2.465646994145014, + "learning_rate": 3.808249690023624e-07, + "loss": 0.2686, + "step": 2545 + }, + { + "epoch": 0.72, + "grad_norm": 2.3528660319084183, + "learning_rate": 3.801045331029108e-07, + "loss": 0.2803, + "step": 2546 + }, + { + "epoch": 0.72, + "grad_norm": 2.3126792038268906, + "learning_rate": 3.79384619364394e-07, + "loss": 0.2662, + "step": 2547 + }, + { + "epoch": 0.72, + "grad_norm": 2.381341397933177, + "learning_rate": 3.78665228393222e-07, + "loss": 0.2654, + "step": 2548 + }, + { + "epoch": 0.72, + "grad_norm": 2.427083572259114, + "learning_rate": 3.7794636079536436e-07, + "loss": 0.2747, + "step": 2549 + }, + { + "epoch": 0.72, + "grad_norm": 2.5762959209498875, + "learning_rate": 3.772280171763501e-07, + "loss": 0.2803, + "step": 2550 + }, + { + "epoch": 0.72, + "grad_norm": 2.445020108164999, + "learning_rate": 3.765101981412665e-07, + "loss": 0.2679, + "step": 2551 + }, + { + "epoch": 0.72, + "grad_norm": 2.3630588946128537, + "learning_rate": 3.757929042947593e-07, + "loss": 0.2836, + "step": 2552 + }, + { + "epoch": 0.72, + "grad_norm": 2.3140210795068104, + "learning_rate": 3.7507613624103165e-07, + "loss": 0.2908, + "step": 2553 + }, + { + "epoch": 0.72, + "grad_norm": 2.4066462075449193, + "learning_rate": 3.743598945838438e-07, + "loss": 0.3071, + "step": 2554 + }, + { + "epoch": 0.72, + "grad_norm": 2.4619938252669757, + "learning_rate": 3.7364417992651266e-07, + "loss": 0.2352, + "step": 2555 + }, + { + "epoch": 0.72, + "grad_norm": 2.3731347309841744, + "learning_rate": 3.7292899287191125e-07, + "loss": 0.2533, + "step": 2556 + }, + { + "epoch": 0.72, + "grad_norm": 2.3363844072722357, + "learning_rate": 3.7221433402246815e-07, + "loss": 0.2865, + "step": 2557 + }, + { + "epoch": 0.72, + "grad_norm": 2.5048502027566593, + "learning_rate": 3.715002039801671e-07, + "loss": 0.279, + "step": 2558 + }, + { + "epoch": 0.73, + "grad_norm": 2.5234443399823583, + "learning_rate": 3.707866033465461e-07, + "loss": 0.2743, + "step": 2559 + }, + { + "epoch": 0.73, + "grad_norm": 2.461815368653574, + "learning_rate": 3.700735327226976e-07, + "loss": 0.2504, + "step": 2560 + }, + { + "epoch": 0.73, + "grad_norm": 2.4228042349715087, + "learning_rate": 3.6936099270926734e-07, + "loss": 0.2828, + "step": 2561 + }, + { + "epoch": 0.73, + "grad_norm": 2.420179888833707, + "learning_rate": 3.686489839064543e-07, + "loss": 0.2846, + "step": 2562 + }, + { + "epoch": 0.73, + "grad_norm": 2.4906239765975884, + "learning_rate": 3.679375069140099e-07, + "loss": 0.309, + "step": 2563 + }, + { + "epoch": 0.73, + "grad_norm": 2.289665637068872, + "learning_rate": 3.6722656233123706e-07, + "loss": 0.2536, + "step": 2564 + }, + { + "epoch": 0.73, + "grad_norm": 2.3349740705759654, + "learning_rate": 3.6651615075699137e-07, + "loss": 0.2808, + "step": 2565 + }, + { + "epoch": 0.73, + "grad_norm": 2.4114347304296344, + "learning_rate": 3.658062727896788e-07, + "loss": 0.2927, + "step": 2566 + }, + { + "epoch": 0.73, + "grad_norm": 2.5596060525617372, + "learning_rate": 3.6509692902725597e-07, + "loss": 0.2798, + "step": 2567 + }, + { + "epoch": 0.73, + "grad_norm": 2.402246854525871, + "learning_rate": 3.6438812006722885e-07, + "loss": 0.3023, + "step": 2568 + }, + { + "epoch": 0.73, + "grad_norm": 2.369821125049631, + "learning_rate": 3.636798465066536e-07, + "loss": 0.2748, + "step": 2569 + }, + { + "epoch": 0.73, + "grad_norm": 2.2644575106815457, + "learning_rate": 3.629721089421359e-07, + "loss": 0.2624, + "step": 2570 + }, + { + "epoch": 0.73, + "grad_norm": 2.2918717960060118, + "learning_rate": 3.6226490796982925e-07, + "loss": 0.2728, + "step": 2571 + }, + { + "epoch": 0.73, + "grad_norm": 2.1864303860737917, + "learning_rate": 3.615582441854348e-07, + "loss": 0.2352, + "step": 2572 + }, + { + "epoch": 0.73, + "grad_norm": 2.4095803133808533, + "learning_rate": 3.6085211818420167e-07, + "loss": 0.3267, + "step": 2573 + }, + { + "epoch": 0.73, + "grad_norm": 2.1223371485411473, + "learning_rate": 3.6014653056092593e-07, + "loss": 0.2633, + "step": 2574 + }, + { + "epoch": 0.73, + "grad_norm": 2.207700617626701, + "learning_rate": 3.5944148190995073e-07, + "loss": 0.2399, + "step": 2575 + }, + { + "epoch": 0.73, + "grad_norm": 2.1559620057419866, + "learning_rate": 3.587369728251647e-07, + "loss": 0.2567, + "step": 2576 + }, + { + "epoch": 0.73, + "grad_norm": 2.403221431935094, + "learning_rate": 3.5803300390000133e-07, + "loss": 0.246, + "step": 2577 + }, + { + "epoch": 0.73, + "grad_norm": 2.8409300056102214, + "learning_rate": 3.5732957572744e-07, + "loss": 0.2835, + "step": 2578 + }, + { + "epoch": 0.73, + "grad_norm": 2.4152625007808504, + "learning_rate": 3.5662668890000415e-07, + "loss": 0.3176, + "step": 2579 + }, + { + "epoch": 0.73, + "grad_norm": 2.307449285104854, + "learning_rate": 3.559243440097622e-07, + "loss": 0.237, + "step": 2580 + }, + { + "epoch": 0.73, + "grad_norm": 4.0720001522452485, + "learning_rate": 3.5522254164832456e-07, + "loss": 0.3037, + "step": 2581 + }, + { + "epoch": 0.73, + "grad_norm": 4.432606930026203, + "learning_rate": 3.5452128240684556e-07, + "loss": 0.2782, + "step": 2582 + }, + { + "epoch": 0.73, + "grad_norm": 2.2833098442405837, + "learning_rate": 3.538205668760218e-07, + "loss": 0.2605, + "step": 2583 + }, + { + "epoch": 0.73, + "grad_norm": 2.239400351839413, + "learning_rate": 3.53120395646092e-07, + "loss": 0.2636, + "step": 2584 + }, + { + "epoch": 0.73, + "grad_norm": 2.330486004814981, + "learning_rate": 3.524207693068364e-07, + "loss": 0.284, + "step": 2585 + }, + { + "epoch": 0.73, + "grad_norm": 2.5464804766981235, + "learning_rate": 3.517216884475762e-07, + "loss": 0.2748, + "step": 2586 + }, + { + "epoch": 0.73, + "grad_norm": 2.1444559881500664, + "learning_rate": 3.5102315365717303e-07, + "loss": 0.2737, + "step": 2587 + }, + { + "epoch": 0.73, + "grad_norm": 2.167108947304041, + "learning_rate": 3.503251655240288e-07, + "loss": 0.2714, + "step": 2588 + }, + { + "epoch": 0.73, + "grad_norm": 2.502123863730012, + "learning_rate": 3.4962772463608457e-07, + "loss": 0.258, + "step": 2589 + }, + { + "epoch": 0.73, + "grad_norm": 2.518883768069017, + "learning_rate": 3.489308315808209e-07, + "loss": 0.2844, + "step": 2590 + }, + { + "epoch": 0.73, + "grad_norm": 2.340826659734753, + "learning_rate": 3.482344869452565e-07, + "loss": 0.2684, + "step": 2591 + }, + { + "epoch": 0.73, + "grad_norm": 2.3794594889726643, + "learning_rate": 3.475386913159483e-07, + "loss": 0.2825, + "step": 2592 + }, + { + "epoch": 0.73, + "grad_norm": 2.3529290513987755, + "learning_rate": 3.468434452789911e-07, + "loss": 0.2599, + "step": 2593 + }, + { + "epoch": 0.74, + "grad_norm": 2.4524978022353587, + "learning_rate": 3.461487494200154e-07, + "loss": 0.2631, + "step": 2594 + }, + { + "epoch": 0.74, + "grad_norm": 2.288189711546269, + "learning_rate": 3.4545460432419036e-07, + "loss": 0.2626, + "step": 2595 + }, + { + "epoch": 0.74, + "grad_norm": 2.4783830699080887, + "learning_rate": 3.4476101057621966e-07, + "loss": 0.3027, + "step": 2596 + }, + { + "epoch": 0.74, + "grad_norm": 2.3321205098430235, + "learning_rate": 3.4406796876034317e-07, + "loss": 0.2448, + "step": 2597 + }, + { + "epoch": 0.74, + "grad_norm": 2.334725211239991, + "learning_rate": 3.433754794603355e-07, + "loss": 0.2855, + "step": 2598 + }, + { + "epoch": 0.74, + "grad_norm": 2.259009020940633, + "learning_rate": 3.426835432595063e-07, + "loss": 0.2452, + "step": 2599 + }, + { + "epoch": 0.74, + "grad_norm": 2.504960317513778, + "learning_rate": 3.4199216074069903e-07, + "loss": 0.271, + "step": 2600 + }, + { + "epoch": 0.74, + "grad_norm": 2.4533948927974816, + "learning_rate": 3.4130133248629065e-07, + "loss": 0.2929, + "step": 2601 + }, + { + "epoch": 0.74, + "grad_norm": 2.1717537961063798, + "learning_rate": 3.40611059078192e-07, + "loss": 0.2325, + "step": 2602 + }, + { + "epoch": 0.74, + "grad_norm": 2.371460083952491, + "learning_rate": 3.399213410978446e-07, + "loss": 0.2476, + "step": 2603 + }, + { + "epoch": 0.74, + "grad_norm": 2.7576954694182683, + "learning_rate": 3.392321791262249e-07, + "loss": 0.2599, + "step": 2604 + }, + { + "epoch": 0.74, + "grad_norm": 2.3982745069910436, + "learning_rate": 3.3854357374383903e-07, + "loss": 0.2699, + "step": 2605 + }, + { + "epoch": 0.74, + "grad_norm": 2.2094367962380894, + "learning_rate": 3.3785552553072517e-07, + "loss": 0.2328, + "step": 2606 + }, + { + "epoch": 0.74, + "grad_norm": 2.304369436989134, + "learning_rate": 3.371680350664512e-07, + "loss": 0.2697, + "step": 2607 + }, + { + "epoch": 0.74, + "grad_norm": 2.3650813356848883, + "learning_rate": 3.364811029301159e-07, + "loss": 0.2831, + "step": 2608 + }, + { + "epoch": 0.74, + "grad_norm": 2.2515919828201922, + "learning_rate": 3.3579472970034814e-07, + "loss": 0.2401, + "step": 2609 + }, + { + "epoch": 0.74, + "grad_norm": 2.2827874062481417, + "learning_rate": 3.3510891595530564e-07, + "loss": 0.2491, + "step": 2610 + }, + { + "epoch": 0.74, + "grad_norm": 2.1908715491993274, + "learning_rate": 3.3442366227267425e-07, + "loss": 0.2541, + "step": 2611 + }, + { + "epoch": 0.74, + "grad_norm": 2.3717075447613394, + "learning_rate": 3.337389692296686e-07, + "loss": 0.2761, + "step": 2612 + }, + { + "epoch": 0.74, + "grad_norm": 2.2872731055219195, + "learning_rate": 3.330548374030309e-07, + "loss": 0.2577, + "step": 2613 + }, + { + "epoch": 0.74, + "grad_norm": 2.3441712807359547, + "learning_rate": 3.3237126736903166e-07, + "loss": 0.2887, + "step": 2614 + }, + { + "epoch": 0.74, + "grad_norm": 2.490986134566394, + "learning_rate": 3.316882597034663e-07, + "loss": 0.2656, + "step": 2615 + }, + { + "epoch": 0.74, + "grad_norm": 2.2382706884208705, + "learning_rate": 3.3100581498165783e-07, + "loss": 0.2658, + "step": 2616 + }, + { + "epoch": 0.74, + "grad_norm": 2.324163839691007, + "learning_rate": 3.303239337784547e-07, + "loss": 0.2477, + "step": 2617 + }, + { + "epoch": 0.74, + "grad_norm": 2.2028924069980684, + "learning_rate": 3.296426166682303e-07, + "loss": 0.2943, + "step": 2618 + }, + { + "epoch": 0.74, + "grad_norm": 2.7641272306318703, + "learning_rate": 3.289618642248846e-07, + "loss": 0.2767, + "step": 2619 + }, + { + "epoch": 0.74, + "grad_norm": 2.194964291798905, + "learning_rate": 3.282816770218394e-07, + "loss": 0.2871, + "step": 2620 + }, + { + "epoch": 0.74, + "grad_norm": 2.357021323177236, + "learning_rate": 3.276020556320419e-07, + "loss": 0.2768, + "step": 2621 + }, + { + "epoch": 0.74, + "grad_norm": 2.654814264416529, + "learning_rate": 3.2692300062796254e-07, + "loss": 0.3321, + "step": 2622 + }, + { + "epoch": 0.74, + "grad_norm": 2.29602703091403, + "learning_rate": 3.2624451258159447e-07, + "loss": 0.2763, + "step": 2623 + }, + { + "epoch": 0.74, + "grad_norm": 2.4739428126752783, + "learning_rate": 3.2556659206445327e-07, + "loss": 0.2721, + "step": 2624 + }, + { + "epoch": 0.74, + "grad_norm": 2.725494800009238, + "learning_rate": 3.248892396475765e-07, + "loss": 0.2962, + "step": 2625 + }, + { + "epoch": 0.74, + "grad_norm": 2.2994857273251244, + "learning_rate": 3.2421245590152335e-07, + "loss": 0.2613, + "step": 2626 + }, + { + "epoch": 0.74, + "grad_norm": 2.265001633392082, + "learning_rate": 3.235362413963738e-07, + "loss": 0.2489, + "step": 2627 + }, + { + "epoch": 0.74, + "grad_norm": 2.565517431965826, + "learning_rate": 3.228605967017284e-07, + "loss": 0.2866, + "step": 2628 + }, + { + "epoch": 0.74, + "grad_norm": 3.032018556302911, + "learning_rate": 3.221855223867076e-07, + "loss": 0.2603, + "step": 2629 + }, + { + "epoch": 0.75, + "grad_norm": 2.36995929708129, + "learning_rate": 3.215110190199518e-07, + "loss": 0.275, + "step": 2630 + }, + { + "epoch": 0.75, + "grad_norm": 2.2582387013579166, + "learning_rate": 3.2083708716961986e-07, + "loss": 0.2719, + "step": 2631 + }, + { + "epoch": 0.75, + "grad_norm": 2.6561695353472348, + "learning_rate": 3.201637274033899e-07, + "loss": 0.2949, + "step": 2632 + }, + { + "epoch": 0.75, + "grad_norm": 2.1972133812785444, + "learning_rate": 3.194909402884576e-07, + "loss": 0.262, + "step": 2633 + }, + { + "epoch": 0.75, + "grad_norm": 2.4304389915393703, + "learning_rate": 3.188187263915365e-07, + "loss": 0.2677, + "step": 2634 + }, + { + "epoch": 0.75, + "grad_norm": 2.374947867030814, + "learning_rate": 3.181470862788573e-07, + "loss": 0.2901, + "step": 2635 + }, + { + "epoch": 0.75, + "grad_norm": 2.3264780260706974, + "learning_rate": 3.174760205161678e-07, + "loss": 0.2657, + "step": 2636 + }, + { + "epoch": 0.75, + "grad_norm": 2.418903400270687, + "learning_rate": 3.168055296687305e-07, + "loss": 0.282, + "step": 2637 + }, + { + "epoch": 0.75, + "grad_norm": 2.25516044788765, + "learning_rate": 3.161356143013257e-07, + "loss": 0.2631, + "step": 2638 + }, + { + "epoch": 0.75, + "grad_norm": 2.3131577049947527, + "learning_rate": 3.154662749782476e-07, + "loss": 0.2647, + "step": 2639 + }, + { + "epoch": 0.75, + "grad_norm": 2.3814425730813724, + "learning_rate": 3.1479751226330566e-07, + "loss": 0.2769, + "step": 2640 + }, + { + "epoch": 0.75, + "grad_norm": 2.5373863660884393, + "learning_rate": 3.141293267198236e-07, + "loss": 0.2953, + "step": 2641 + }, + { + "epoch": 0.75, + "grad_norm": 2.4174812864003847, + "learning_rate": 3.13461718910638e-07, + "loss": 0.2943, + "step": 2642 + }, + { + "epoch": 0.75, + "grad_norm": 2.3206104540840973, + "learning_rate": 3.127946893981008e-07, + "loss": 0.2671, + "step": 2643 + }, + { + "epoch": 0.75, + "grad_norm": 3.5344070466388873, + "learning_rate": 3.1212823874407513e-07, + "loss": 0.3006, + "step": 2644 + }, + { + "epoch": 0.75, + "grad_norm": 2.344494797925107, + "learning_rate": 3.1146236750993757e-07, + "loss": 0.2756, + "step": 2645 + }, + { + "epoch": 0.75, + "grad_norm": 2.250948017574977, + "learning_rate": 3.107970762565755e-07, + "loss": 0.2626, + "step": 2646 + }, + { + "epoch": 0.75, + "grad_norm": 2.342448369121599, + "learning_rate": 3.1013236554438817e-07, + "loss": 0.2823, + "step": 2647 + }, + { + "epoch": 0.75, + "grad_norm": 2.4801146316539855, + "learning_rate": 3.094682359332871e-07, + "loss": 0.2701, + "step": 2648 + }, + { + "epoch": 0.75, + "grad_norm": 2.3519971171584273, + "learning_rate": 3.0880468798269286e-07, + "loss": 0.245, + "step": 2649 + }, + { + "epoch": 0.75, + "grad_norm": 2.3023529595295553, + "learning_rate": 3.0814172225153623e-07, + "loss": 0.2862, + "step": 2650 + }, + { + "epoch": 0.75, + "grad_norm": 2.479935156267912, + "learning_rate": 3.0747933929825786e-07, + "loss": 0.2595, + "step": 2651 + }, + { + "epoch": 0.75, + "grad_norm": 2.2767570811714357, + "learning_rate": 3.0681753968080735e-07, + "loss": 0.2454, + "step": 2652 + }, + { + "epoch": 0.75, + "grad_norm": 2.405319864949606, + "learning_rate": 3.061563239566439e-07, + "loss": 0.2668, + "step": 2653 + }, + { + "epoch": 0.75, + "grad_norm": 2.3523938041145374, + "learning_rate": 3.0549569268273314e-07, + "loss": 0.271, + "step": 2654 + }, + { + "epoch": 0.75, + "grad_norm": 2.2832859994293195, + "learning_rate": 3.048356464155495e-07, + "loss": 0.2562, + "step": 2655 + }, + { + "epoch": 0.75, + "grad_norm": 2.1814233292526097, + "learning_rate": 3.041761857110744e-07, + "loss": 0.2458, + "step": 2656 + }, + { + "epoch": 0.75, + "grad_norm": 2.3524324132974135, + "learning_rate": 3.0351731112479627e-07, + "loss": 0.2521, + "step": 2657 + }, + { + "epoch": 0.75, + "grad_norm": 2.4893308737060544, + "learning_rate": 3.0285902321170943e-07, + "loss": 0.2848, + "step": 2658 + }, + { + "epoch": 0.75, + "grad_norm": 2.58810016842196, + "learning_rate": 3.0220132252631416e-07, + "loss": 0.3069, + "step": 2659 + }, + { + "epoch": 0.75, + "grad_norm": 2.684462905942372, + "learning_rate": 3.015442096226163e-07, + "loss": 0.2914, + "step": 2660 + }, + { + "epoch": 0.75, + "grad_norm": 3.5481577407259155, + "learning_rate": 3.008876850541262e-07, + "loss": 0.2866, + "step": 2661 + }, + { + "epoch": 0.75, + "grad_norm": 2.4742375445466247, + "learning_rate": 3.00231749373859e-07, + "loss": 0.2657, + "step": 2662 + }, + { + "epoch": 0.75, + "grad_norm": 2.418562491235517, + "learning_rate": 2.995764031343336e-07, + "loss": 0.269, + "step": 2663 + }, + { + "epoch": 0.75, + "grad_norm": 2.238527067720981, + "learning_rate": 2.989216468875725e-07, + "loss": 0.2661, + "step": 2664 + }, + { + "epoch": 0.76, + "grad_norm": 2.3820690839491427, + "learning_rate": 2.9826748118510106e-07, + "loss": 0.2402, + "step": 2665 + }, + { + "epoch": 0.76, + "grad_norm": 2.5059711601939862, + "learning_rate": 2.9761390657794727e-07, + "loss": 0.2816, + "step": 2666 + }, + { + "epoch": 0.76, + "grad_norm": 2.336363737866262, + "learning_rate": 2.9696092361664125e-07, + "loss": 0.2732, + "step": 2667 + }, + { + "epoch": 0.76, + "grad_norm": 2.2351641737827395, + "learning_rate": 2.96308532851215e-07, + "loss": 0.2638, + "step": 2668 + }, + { + "epoch": 0.76, + "grad_norm": 2.455015050921343, + "learning_rate": 2.956567348312012e-07, + "loss": 0.2741, + "step": 2669 + }, + { + "epoch": 0.76, + "grad_norm": 2.4782570931707997, + "learning_rate": 2.9500553010563356e-07, + "loss": 0.2617, + "step": 2670 + }, + { + "epoch": 0.76, + "grad_norm": 2.412270563323574, + "learning_rate": 2.94354919223046e-07, + "loss": 0.2446, + "step": 2671 + }, + { + "epoch": 0.76, + "grad_norm": 2.208344037910064, + "learning_rate": 2.9370490273147217e-07, + "loss": 0.2516, + "step": 2672 + }, + { + "epoch": 0.76, + "grad_norm": 2.651043462680491, + "learning_rate": 2.9305548117844504e-07, + "loss": 0.2722, + "step": 2673 + }, + { + "epoch": 0.76, + "grad_norm": 2.4735510597437114, + "learning_rate": 2.9240665511099636e-07, + "loss": 0.2675, + "step": 2674 + }, + { + "epoch": 0.76, + "grad_norm": 2.258433909290774, + "learning_rate": 2.9175842507565695e-07, + "loss": 0.2557, + "step": 2675 + }, + { + "epoch": 0.76, + "grad_norm": 2.408322811827988, + "learning_rate": 2.911107916184539e-07, + "loss": 0.2982, + "step": 2676 + }, + { + "epoch": 0.76, + "grad_norm": 2.3516191496170067, + "learning_rate": 2.9046375528491376e-07, + "loss": 0.2785, + "step": 2677 + }, + { + "epoch": 0.76, + "grad_norm": 2.2900556855731247, + "learning_rate": 2.89817316620059e-07, + "loss": 0.2639, + "step": 2678 + }, + { + "epoch": 0.76, + "grad_norm": 3.0394892227608734, + "learning_rate": 2.891714761684093e-07, + "loss": 0.272, + "step": 2679 + }, + { + "epoch": 0.76, + "grad_norm": 2.2794822571116966, + "learning_rate": 2.8852623447397915e-07, + "loss": 0.248, + "step": 2680 + }, + { + "epoch": 0.76, + "grad_norm": 2.313592477288886, + "learning_rate": 2.8788159208027973e-07, + "loss": 0.2649, + "step": 2681 + }, + { + "epoch": 0.76, + "grad_norm": 2.537623352307677, + "learning_rate": 2.8723754953031777e-07, + "loss": 0.2874, + "step": 2682 + }, + { + "epoch": 0.76, + "grad_norm": 2.492465481071438, + "learning_rate": 2.8659410736659416e-07, + "loss": 0.267, + "step": 2683 + }, + { + "epoch": 0.76, + "grad_norm": 2.5525190678003082, + "learning_rate": 2.8595126613110363e-07, + "loss": 0.2814, + "step": 2684 + }, + { + "epoch": 0.76, + "grad_norm": 2.6229646083995553, + "learning_rate": 2.853090263653354e-07, + "loss": 0.3226, + "step": 2685 + }, + { + "epoch": 0.76, + "grad_norm": 2.2844022457053117, + "learning_rate": 2.846673886102714e-07, + "loss": 0.2898, + "step": 2686 + }, + { + "epoch": 0.76, + "grad_norm": 2.3647776025257463, + "learning_rate": 2.840263534063877e-07, + "loss": 0.263, + "step": 2687 + }, + { + "epoch": 0.76, + "grad_norm": 2.2855963169893183, + "learning_rate": 2.833859212936519e-07, + "loss": 0.272, + "step": 2688 + }, + { + "epoch": 0.76, + "grad_norm": 2.765864320378852, + "learning_rate": 2.827460928115232e-07, + "loss": 0.2951, + "step": 2689 + }, + { + "epoch": 0.76, + "grad_norm": 2.7069540638044005, + "learning_rate": 2.8210686849895307e-07, + "loss": 0.3033, + "step": 2690 + }, + { + "epoch": 0.76, + "grad_norm": 2.353518345754197, + "learning_rate": 2.8146824889438356e-07, + "loss": 0.3012, + "step": 2691 + }, + { + "epoch": 0.76, + "grad_norm": 2.2556279108688435, + "learning_rate": 2.808302345357486e-07, + "loss": 0.2602, + "step": 2692 + }, + { + "epoch": 0.76, + "grad_norm": 2.515343160953233, + "learning_rate": 2.8019282596047046e-07, + "loss": 0.2657, + "step": 2693 + }, + { + "epoch": 0.76, + "grad_norm": 2.5698374203328034, + "learning_rate": 2.7955602370546227e-07, + "loss": 0.2823, + "step": 2694 + }, + { + "epoch": 0.76, + "grad_norm": 2.372728140279168, + "learning_rate": 2.789198283071261e-07, + "loss": 0.2836, + "step": 2695 + }, + { + "epoch": 0.76, + "grad_norm": 2.528942553797293, + "learning_rate": 2.78284240301353e-07, + "loss": 0.2802, + "step": 2696 + }, + { + "epoch": 0.76, + "grad_norm": 2.3999481678317376, + "learning_rate": 2.776492602235223e-07, + "loss": 0.302, + "step": 2697 + }, + { + "epoch": 0.76, + "grad_norm": 2.334677905881842, + "learning_rate": 2.770148886085013e-07, + "loss": 0.259, + "step": 2698 + }, + { + "epoch": 0.76, + "grad_norm": 2.1839619355909523, + "learning_rate": 2.763811259906447e-07, + "loss": 0.268, + "step": 2699 + }, + { + "epoch": 0.77, + "grad_norm": 2.266845888284121, + "learning_rate": 2.7574797290379413e-07, + "loss": 0.232, + "step": 2700 + }, + { + "epoch": 0.77, + "grad_norm": 2.480842434672786, + "learning_rate": 2.751154298812781e-07, + "loss": 0.2678, + "step": 2701 + }, + { + "epoch": 0.77, + "grad_norm": 2.2677658357528414, + "learning_rate": 2.74483497455911e-07, + "loss": 0.2665, + "step": 2702 + }, + { + "epoch": 0.77, + "grad_norm": 2.4011260292496845, + "learning_rate": 2.73852176159993e-07, + "loss": 0.2534, + "step": 2703 + }, + { + "epoch": 0.77, + "grad_norm": 2.253785637191656, + "learning_rate": 2.732214665253092e-07, + "loss": 0.2379, + "step": 2704 + }, + { + "epoch": 0.77, + "grad_norm": 2.602202388621679, + "learning_rate": 2.7259136908312995e-07, + "loss": 0.3228, + "step": 2705 + }, + { + "epoch": 0.77, + "grad_norm": 2.3452687489865895, + "learning_rate": 2.719618843642095e-07, + "loss": 0.2966, + "step": 2706 + }, + { + "epoch": 0.77, + "grad_norm": 2.184038611317654, + "learning_rate": 2.713330128987864e-07, + "loss": 0.2524, + "step": 2707 + }, + { + "epoch": 0.77, + "grad_norm": 2.3034671990517026, + "learning_rate": 2.707047552165822e-07, + "loss": 0.2575, + "step": 2708 + }, + { + "epoch": 0.77, + "grad_norm": 2.405072030453159, + "learning_rate": 2.700771118468017e-07, + "loss": 0.2868, + "step": 2709 + }, + { + "epoch": 0.77, + "grad_norm": 2.252834431948804, + "learning_rate": 2.6945008331813224e-07, + "loss": 0.2549, + "step": 2710 + }, + { + "epoch": 0.77, + "grad_norm": 2.6519575726519546, + "learning_rate": 2.688236701587431e-07, + "loss": 0.3061, + "step": 2711 + }, + { + "epoch": 0.77, + "grad_norm": 2.3694153979510038, + "learning_rate": 2.6819787289628526e-07, + "loss": 0.2777, + "step": 2712 + }, + { + "epoch": 0.77, + "grad_norm": 2.5804112331460813, + "learning_rate": 2.6757269205789113e-07, + "loss": 0.2905, + "step": 2713 + }, + { + "epoch": 0.77, + "grad_norm": 2.3565782140319422, + "learning_rate": 2.6694812817017387e-07, + "loss": 0.267, + "step": 2714 + }, + { + "epoch": 0.77, + "grad_norm": 2.3170272128324116, + "learning_rate": 2.663241817592261e-07, + "loss": 0.2478, + "step": 2715 + }, + { + "epoch": 0.77, + "grad_norm": 2.3698915573154906, + "learning_rate": 2.6570085335062164e-07, + "loss": 0.2489, + "step": 2716 + }, + { + "epoch": 0.77, + "grad_norm": 2.2915645771544253, + "learning_rate": 2.6507814346941293e-07, + "loss": 0.2934, + "step": 2717 + }, + { + "epoch": 0.77, + "grad_norm": 2.268037849281149, + "learning_rate": 2.64456052640132e-07, + "loss": 0.2947, + "step": 2718 + }, + { + "epoch": 0.77, + "grad_norm": 2.402245114365464, + "learning_rate": 2.6383458138678827e-07, + "loss": 0.2441, + "step": 2719 + }, + { + "epoch": 0.77, + "grad_norm": 2.433258683555696, + "learning_rate": 2.6321373023287007e-07, + "loss": 0.2361, + "step": 2720 + }, + { + "epoch": 0.77, + "grad_norm": 2.638457709606131, + "learning_rate": 2.6259349970134403e-07, + "loss": 0.2723, + "step": 2721 + }, + { + "epoch": 0.77, + "grad_norm": 2.1506166647148444, + "learning_rate": 2.6197389031465324e-07, + "loss": 0.2288, + "step": 2722 + }, + { + "epoch": 0.77, + "grad_norm": 2.2800718518628473, + "learning_rate": 2.613549025947169e-07, + "loss": 0.2345, + "step": 2723 + }, + { + "epoch": 0.77, + "grad_norm": 2.2215835594373874, + "learning_rate": 2.60736537062932e-07, + "loss": 0.245, + "step": 2724 + }, + { + "epoch": 0.77, + "grad_norm": 2.4457713208187712, + "learning_rate": 2.6011879424017005e-07, + "loss": 0.3009, + "step": 2725 + }, + { + "epoch": 0.77, + "grad_norm": 2.3109213740501753, + "learning_rate": 2.5950167464677985e-07, + "loss": 0.2648, + "step": 2726 + }, + { + "epoch": 0.77, + "grad_norm": 2.367702983358573, + "learning_rate": 2.588851788025832e-07, + "loss": 0.2656, + "step": 2727 + }, + { + "epoch": 0.77, + "grad_norm": 2.267989855733225, + "learning_rate": 2.582693072268778e-07, + "loss": 0.2742, + "step": 2728 + }, + { + "epoch": 0.77, + "grad_norm": 2.3799453877005488, + "learning_rate": 2.5765406043843483e-07, + "loss": 0.2879, + "step": 2729 + }, + { + "epoch": 0.77, + "grad_norm": 2.4183363741864863, + "learning_rate": 2.5703943895549975e-07, + "loss": 0.2452, + "step": 2730 + }, + { + "epoch": 0.77, + "grad_norm": 2.352119842346004, + "learning_rate": 2.5642544329579085e-07, + "loss": 0.2769, + "step": 2731 + }, + { + "epoch": 0.77, + "grad_norm": 2.3630801759639217, + "learning_rate": 2.558120739764995e-07, + "loss": 0.2806, + "step": 2732 + }, + { + "epoch": 0.77, + "grad_norm": 2.511379597161267, + "learning_rate": 2.551993315142894e-07, + "loss": 0.2976, + "step": 2733 + }, + { + "epoch": 0.77, + "grad_norm": 2.1875569102599113, + "learning_rate": 2.5458721642529637e-07, + "loss": 0.2228, + "step": 2734 + }, + { + "epoch": 0.77, + "grad_norm": 2.3201124027880544, + "learning_rate": 2.5397572922512735e-07, + "loss": 0.2644, + "step": 2735 + }, + { + "epoch": 0.78, + "grad_norm": 2.5622681136534515, + "learning_rate": 2.53364870428861e-07, + "loss": 0.2763, + "step": 2736 + }, + { + "epoch": 0.78, + "grad_norm": 2.650004763755733, + "learning_rate": 2.527546405510461e-07, + "loss": 0.2547, + "step": 2737 + }, + { + "epoch": 0.78, + "grad_norm": 2.814911905875072, + "learning_rate": 2.5214504010570214e-07, + "loss": 0.2653, + "step": 2738 + }, + { + "epoch": 0.78, + "grad_norm": 2.2147385657579566, + "learning_rate": 2.515360696063179e-07, + "loss": 0.245, + "step": 2739 + }, + { + "epoch": 0.78, + "grad_norm": 2.306210277214575, + "learning_rate": 2.5092772956585205e-07, + "loss": 0.269, + "step": 2740 + }, + { + "epoch": 0.78, + "grad_norm": 2.239930705597454, + "learning_rate": 2.503200204967317e-07, + "loss": 0.2716, + "step": 2741 + }, + { + "epoch": 0.78, + "grad_norm": 2.2169610526187618, + "learning_rate": 2.497129429108531e-07, + "loss": 0.2659, + "step": 2742 + }, + { + "epoch": 0.78, + "grad_norm": 5.314771526733961, + "learning_rate": 2.491064973195798e-07, + "loss": 0.2949, + "step": 2743 + }, + { + "epoch": 0.78, + "grad_norm": 2.464722741180483, + "learning_rate": 2.485006842337437e-07, + "loss": 0.2622, + "step": 2744 + }, + { + "epoch": 0.78, + "grad_norm": 2.4414530378845556, + "learning_rate": 2.4789550416364347e-07, + "loss": 0.309, + "step": 2745 + }, + { + "epoch": 0.78, + "grad_norm": 2.3966166894823697, + "learning_rate": 2.4729095761904483e-07, + "loss": 0.2892, + "step": 2746 + }, + { + "epoch": 0.78, + "grad_norm": 2.293602180410425, + "learning_rate": 2.466870451091796e-07, + "loss": 0.2568, + "step": 2747 + }, + { + "epoch": 0.78, + "grad_norm": 2.570710790990384, + "learning_rate": 2.4608376714274617e-07, + "loss": 0.2488, + "step": 2748 + }, + { + "epoch": 0.78, + "grad_norm": 2.384964366245312, + "learning_rate": 2.454811242279069e-07, + "loss": 0.2628, + "step": 2749 + }, + { + "epoch": 0.78, + "grad_norm": 2.410683574063837, + "learning_rate": 2.4487911687229113e-07, + "loss": 0.2534, + "step": 2750 + }, + { + "epoch": 0.78, + "grad_norm": 2.3339434682085076, + "learning_rate": 2.4427774558299185e-07, + "loss": 0.2967, + "step": 2751 + }, + { + "epoch": 0.78, + "grad_norm": 2.2911955493857046, + "learning_rate": 2.4367701086656624e-07, + "loss": 0.2943, + "step": 2752 + }, + { + "epoch": 0.78, + "grad_norm": 2.3451456650946283, + "learning_rate": 2.430769132290357e-07, + "loss": 0.2765, + "step": 2753 + }, + { + "epoch": 0.78, + "grad_norm": 2.7071698183330892, + "learning_rate": 2.4247745317588397e-07, + "loss": 0.3126, + "step": 2754 + }, + { + "epoch": 0.78, + "grad_norm": 2.3761776008143247, + "learning_rate": 2.418786312120593e-07, + "loss": 0.2556, + "step": 2755 + }, + { + "epoch": 0.78, + "grad_norm": 2.188824650286055, + "learning_rate": 2.412804478419712e-07, + "loss": 0.2809, + "step": 2756 + }, + { + "epoch": 0.78, + "grad_norm": 2.3791611004508746, + "learning_rate": 2.406829035694923e-07, + "loss": 0.2718, + "step": 2757 + }, + { + "epoch": 0.78, + "grad_norm": 2.3031708128532293, + "learning_rate": 2.400859988979554e-07, + "loss": 0.2666, + "step": 2758 + }, + { + "epoch": 0.78, + "grad_norm": 2.507943026823091, + "learning_rate": 2.394897343301556e-07, + "loss": 0.2832, + "step": 2759 + }, + { + "epoch": 0.78, + "grad_norm": 2.1908843560087625, + "learning_rate": 2.388941103683493e-07, + "loss": 0.242, + "step": 2760 + }, + { + "epoch": 0.78, + "grad_norm": 2.3430749497591057, + "learning_rate": 2.382991275142524e-07, + "loss": 0.2607, + "step": 2761 + }, + { + "epoch": 0.78, + "grad_norm": 2.4206029000098783, + "learning_rate": 2.3770478626904068e-07, + "loss": 0.2676, + "step": 2762 + }, + { + "epoch": 0.78, + "grad_norm": 2.340638141700092, + "learning_rate": 2.3711108713334994e-07, + "loss": 0.2575, + "step": 2763 + }, + { + "epoch": 0.78, + "grad_norm": 2.398508636026643, + "learning_rate": 2.3651803060727482e-07, + "loss": 0.2855, + "step": 2764 + }, + { + "epoch": 0.78, + "grad_norm": 2.4459378713673647, + "learning_rate": 2.3592561719036952e-07, + "loss": 0.2749, + "step": 2765 + }, + { + "epoch": 0.78, + "grad_norm": 2.423230704899453, + "learning_rate": 2.3533384738164508e-07, + "loss": 0.2519, + "step": 2766 + }, + { + "epoch": 0.78, + "grad_norm": 3.0438979344407984, + "learning_rate": 2.3474272167957143e-07, + "loss": 0.2805, + "step": 2767 + }, + { + "epoch": 0.78, + "grad_norm": 2.3500512947448704, + "learning_rate": 2.341522405820756e-07, + "loss": 0.2851, + "step": 2768 + }, + { + "epoch": 0.78, + "grad_norm": 2.3808000861680303, + "learning_rate": 2.3356240458654185e-07, + "loss": 0.2639, + "step": 2769 + }, + { + "epoch": 0.78, + "grad_norm": 2.5585487785990346, + "learning_rate": 2.3297321418981075e-07, + "loss": 0.2867, + "step": 2770 + }, + { + "epoch": 0.79, + "grad_norm": 2.0898916437412027, + "learning_rate": 2.3238466988817928e-07, + "loss": 0.2489, + "step": 2771 + }, + { + "epoch": 0.79, + "grad_norm": 2.410983630793635, + "learning_rate": 2.3179677217740013e-07, + "loss": 0.2751, + "step": 2772 + }, + { + "epoch": 0.79, + "grad_norm": 2.2514253584456005, + "learning_rate": 2.3120952155268137e-07, + "loss": 0.2576, + "step": 2773 + }, + { + "epoch": 0.79, + "grad_norm": 2.4578035921751096, + "learning_rate": 2.3062291850868588e-07, + "loss": 0.2676, + "step": 2774 + }, + { + "epoch": 0.79, + "grad_norm": 2.185950464556516, + "learning_rate": 2.3003696353953117e-07, + "loss": 0.2439, + "step": 2775 + }, + { + "epoch": 0.79, + "grad_norm": 2.202339342263721, + "learning_rate": 2.29451657138789e-07, + "loss": 0.2578, + "step": 2776 + }, + { + "epoch": 0.79, + "grad_norm": 2.3231308517243865, + "learning_rate": 2.2886699979948444e-07, + "loss": 0.2893, + "step": 2777 + }, + { + "epoch": 0.79, + "grad_norm": 2.235137483563809, + "learning_rate": 2.2828299201409617e-07, + "loss": 0.2766, + "step": 2778 + }, + { + "epoch": 0.79, + "grad_norm": 2.194310608489236, + "learning_rate": 2.2769963427455552e-07, + "loss": 0.2535, + "step": 2779 + }, + { + "epoch": 0.79, + "grad_norm": 2.372712133821499, + "learning_rate": 2.2711692707224639e-07, + "loss": 0.2858, + "step": 2780 + }, + { + "epoch": 0.79, + "grad_norm": 2.214835367155243, + "learning_rate": 2.265348708980046e-07, + "loss": 0.2615, + "step": 2781 + }, + { + "epoch": 0.79, + "grad_norm": 2.6208064832788542, + "learning_rate": 2.2595346624211786e-07, + "loss": 0.2892, + "step": 2782 + }, + { + "epoch": 0.79, + "grad_norm": 2.4452656131894877, + "learning_rate": 2.2537271359432454e-07, + "loss": 0.2791, + "step": 2783 + }, + { + "epoch": 0.79, + "grad_norm": 2.400722142292888, + "learning_rate": 2.247926134438144e-07, + "loss": 0.2646, + "step": 2784 + }, + { + "epoch": 0.79, + "grad_norm": 2.3866676906776614, + "learning_rate": 2.2421316627922715e-07, + "loss": 0.2674, + "step": 2785 + }, + { + "epoch": 0.79, + "grad_norm": 2.4233247286668496, + "learning_rate": 2.236343725886527e-07, + "loss": 0.2808, + "step": 2786 + }, + { + "epoch": 0.79, + "grad_norm": 2.291558351108696, + "learning_rate": 2.230562328596306e-07, + "loss": 0.2673, + "step": 2787 + }, + { + "epoch": 0.79, + "grad_norm": 2.343505929479789, + "learning_rate": 2.2247874757914864e-07, + "loss": 0.267, + "step": 2788 + }, + { + "epoch": 0.79, + "grad_norm": 2.189849698105635, + "learning_rate": 2.2190191723364492e-07, + "loss": 0.2542, + "step": 2789 + }, + { + "epoch": 0.79, + "grad_norm": 2.421695068222722, + "learning_rate": 2.2132574230900482e-07, + "loss": 0.2883, + "step": 2790 + }, + { + "epoch": 0.79, + "grad_norm": 2.3300957788556116, + "learning_rate": 2.2075022329056192e-07, + "loss": 0.2993, + "step": 2791 + }, + { + "epoch": 0.79, + "grad_norm": 2.383095763086712, + "learning_rate": 2.2017536066309684e-07, + "loss": 0.2906, + "step": 2792 + }, + { + "epoch": 0.79, + "grad_norm": 2.1953601942491043, + "learning_rate": 2.1960115491083752e-07, + "loss": 0.237, + "step": 2793 + }, + { + "epoch": 0.79, + "grad_norm": 2.307913404713227, + "learning_rate": 2.1902760651745954e-07, + "loss": 0.2765, + "step": 2794 + }, + { + "epoch": 0.79, + "grad_norm": 2.4715336117778723, + "learning_rate": 2.1845471596608378e-07, + "loss": 0.278, + "step": 2795 + }, + { + "epoch": 0.79, + "grad_norm": 2.2935559162523127, + "learning_rate": 2.1788248373927675e-07, + "loss": 0.2311, + "step": 2796 + }, + { + "epoch": 0.79, + "grad_norm": 2.9425745468248645, + "learning_rate": 2.1731091031905113e-07, + "loss": 0.2453, + "step": 2797 + }, + { + "epoch": 0.79, + "grad_norm": 2.2150854366589776, + "learning_rate": 2.16739996186864e-07, + "loss": 0.2515, + "step": 2798 + }, + { + "epoch": 0.79, + "grad_norm": 2.353730791125616, + "learning_rate": 2.1616974182361825e-07, + "loss": 0.2687, + "step": 2799 + }, + { + "epoch": 0.79, + "grad_norm": 2.346278524063355, + "learning_rate": 2.1560014770966006e-07, + "loss": 0.264, + "step": 2800 + }, + { + "epoch": 0.79, + "grad_norm": 2.3421743418395375, + "learning_rate": 2.1503121432477932e-07, + "loss": 0.2769, + "step": 2801 + }, + { + "epoch": 0.79, + "grad_norm": 2.362493254521407, + "learning_rate": 2.1446294214820991e-07, + "loss": 0.244, + "step": 2802 + }, + { + "epoch": 0.79, + "grad_norm": 2.390546907904252, + "learning_rate": 2.1389533165862826e-07, + "loss": 0.2489, + "step": 2803 + }, + { + "epoch": 0.79, + "grad_norm": 2.298123026178687, + "learning_rate": 2.1332838333415447e-07, + "loss": 0.2389, + "step": 2804 + }, + { + "epoch": 0.79, + "grad_norm": 2.22722586200972, + "learning_rate": 2.1276209765234954e-07, + "loss": 0.2404, + "step": 2805 + }, + { + "epoch": 0.8, + "grad_norm": 2.399010284462047, + "learning_rate": 2.1219647509021698e-07, + "loss": 0.2709, + "step": 2806 + }, + { + "epoch": 0.8, + "grad_norm": 2.434832098004782, + "learning_rate": 2.116315161242015e-07, + "loss": 0.2922, + "step": 2807 + }, + { + "epoch": 0.8, + "grad_norm": 2.227811226214471, + "learning_rate": 2.110672212301896e-07, + "loss": 0.2842, + "step": 2808 + }, + { + "epoch": 0.8, + "grad_norm": 2.2952246407389745, + "learning_rate": 2.1050359088350723e-07, + "loss": 0.2774, + "step": 2809 + }, + { + "epoch": 0.8, + "grad_norm": 2.50504305057805, + "learning_rate": 2.0994062555892123e-07, + "loss": 0.2699, + "step": 2810 + }, + { + "epoch": 0.8, + "grad_norm": 2.342581244971091, + "learning_rate": 2.0937832573063818e-07, + "loss": 0.2346, + "step": 2811 + }, + { + "epoch": 0.8, + "grad_norm": 2.2551191727768205, + "learning_rate": 2.088166918723041e-07, + "loss": 0.2472, + "step": 2812 + }, + { + "epoch": 0.8, + "grad_norm": 2.2456476952574276, + "learning_rate": 2.0825572445700401e-07, + "loss": 0.2725, + "step": 2813 + }, + { + "epoch": 0.8, + "grad_norm": 2.3033253017776203, + "learning_rate": 2.076954239572616e-07, + "loss": 0.2669, + "step": 2814 + }, + { + "epoch": 0.8, + "grad_norm": 2.388877308526638, + "learning_rate": 2.0713579084503873e-07, + "loss": 0.2724, + "step": 2815 + }, + { + "epoch": 0.8, + "grad_norm": 2.3343989502499274, + "learning_rate": 2.0657682559173506e-07, + "loss": 0.2987, + "step": 2816 + }, + { + "epoch": 0.8, + "grad_norm": 2.2413289141575885, + "learning_rate": 2.060185286681878e-07, + "loss": 0.2364, + "step": 2817 + }, + { + "epoch": 0.8, + "grad_norm": 2.520433126733271, + "learning_rate": 2.0546090054467114e-07, + "loss": 0.2692, + "step": 2818 + }, + { + "epoch": 0.8, + "grad_norm": 2.4097024486795804, + "learning_rate": 2.0490394169089597e-07, + "loss": 0.2538, + "step": 2819 + }, + { + "epoch": 0.8, + "grad_norm": 2.243358516588946, + "learning_rate": 2.0434765257600928e-07, + "loss": 0.2506, + "step": 2820 + }, + { + "epoch": 0.8, + "grad_norm": 2.5078519127537215, + "learning_rate": 2.037920336685941e-07, + "loss": 0.2922, + "step": 2821 + }, + { + "epoch": 0.8, + "grad_norm": 2.4605547304027033, + "learning_rate": 2.0323708543666883e-07, + "loss": 0.2951, + "step": 2822 + }, + { + "epoch": 0.8, + "grad_norm": 2.293726838612549, + "learning_rate": 2.0268280834768692e-07, + "loss": 0.2494, + "step": 2823 + }, + { + "epoch": 0.8, + "grad_norm": 2.4860982798648648, + "learning_rate": 2.021292028685365e-07, + "loss": 0.283, + "step": 2824 + }, + { + "epoch": 0.8, + "grad_norm": 2.2593332441880984, + "learning_rate": 2.0157626946553995e-07, + "loss": 0.2997, + "step": 2825 + }, + { + "epoch": 0.8, + "grad_norm": 2.445996744842272, + "learning_rate": 2.01024008604454e-07, + "loss": 0.2568, + "step": 2826 + }, + { + "epoch": 0.8, + "grad_norm": 2.1971138861459263, + "learning_rate": 2.0047242075046744e-07, + "loss": 0.246, + "step": 2827 + }, + { + "epoch": 0.8, + "grad_norm": 2.592379392720621, + "learning_rate": 1.9992150636820415e-07, + "loss": 0.2868, + "step": 2828 + }, + { + "epoch": 0.8, + "grad_norm": 2.4169149111598256, + "learning_rate": 1.993712659217194e-07, + "loss": 0.2714, + "step": 2829 + }, + { + "epoch": 0.8, + "grad_norm": 2.4872947332284334, + "learning_rate": 1.9882169987450138e-07, + "loss": 0.273, + "step": 2830 + }, + { + "epoch": 0.8, + "grad_norm": 2.483937778500142, + "learning_rate": 1.982728086894694e-07, + "loss": 0.2737, + "step": 2831 + }, + { + "epoch": 0.8, + "grad_norm": 2.1706509090641597, + "learning_rate": 1.977245928289748e-07, + "loss": 0.2529, + "step": 2832 + }, + { + "epoch": 0.8, + "grad_norm": 4.210854981564041, + "learning_rate": 1.971770527548008e-07, + "loss": 0.2585, + "step": 2833 + }, + { + "epoch": 0.8, + "grad_norm": 2.67382194381519, + "learning_rate": 1.9663018892816063e-07, + "loss": 0.2802, + "step": 2834 + }, + { + "epoch": 0.8, + "grad_norm": 2.2123984409162762, + "learning_rate": 1.9608400180969743e-07, + "loss": 0.2357, + "step": 2835 + }, + { + "epoch": 0.8, + "grad_norm": 2.5011636033003968, + "learning_rate": 1.9553849185948512e-07, + "loss": 0.2659, + "step": 2836 + }, + { + "epoch": 0.8, + "grad_norm": 2.3650456015272416, + "learning_rate": 1.9499365953702674e-07, + "loss": 0.3112, + "step": 2837 + }, + { + "epoch": 0.8, + "grad_norm": 2.3662359880050072, + "learning_rate": 1.9444950530125548e-07, + "loss": 0.2701, + "step": 2838 + }, + { + "epoch": 0.8, + "grad_norm": 2.3835584666707947, + "learning_rate": 1.9390602961053194e-07, + "loss": 0.2649, + "step": 2839 + }, + { + "epoch": 0.8, + "grad_norm": 2.596380722618807, + "learning_rate": 1.933632329226459e-07, + "loss": 0.286, + "step": 2840 + }, + { + "epoch": 0.8, + "grad_norm": 2.7967913473061157, + "learning_rate": 1.9282111569481506e-07, + "loss": 0.2663, + "step": 2841 + }, + { + "epoch": 0.81, + "grad_norm": 2.2110276623027456, + "learning_rate": 1.9227967838368564e-07, + "loss": 0.2591, + "step": 2842 + }, + { + "epoch": 0.81, + "grad_norm": 2.249417596052053, + "learning_rate": 1.9173892144532956e-07, + "loss": 0.2357, + "step": 2843 + }, + { + "epoch": 0.81, + "grad_norm": 4.700061186446766, + "learning_rate": 1.9119884533524665e-07, + "loss": 0.2586, + "step": 2844 + }, + { + "epoch": 0.81, + "grad_norm": 3.0079039657582887, + "learning_rate": 1.9065945050836297e-07, + "loss": 0.2734, + "step": 2845 + }, + { + "epoch": 0.81, + "grad_norm": 2.594893242581233, + "learning_rate": 1.9012073741903068e-07, + "loss": 0.2745, + "step": 2846 + }, + { + "epoch": 0.81, + "grad_norm": 2.4748954516830683, + "learning_rate": 1.8958270652102858e-07, + "loss": 0.2767, + "step": 2847 + }, + { + "epoch": 0.81, + "grad_norm": 2.5129604741794878, + "learning_rate": 1.8904535826755908e-07, + "loss": 0.2863, + "step": 2848 + }, + { + "epoch": 0.81, + "grad_norm": 2.5981441940170984, + "learning_rate": 1.8850869311125096e-07, + "loss": 0.2985, + "step": 2849 + }, + { + "epoch": 0.81, + "grad_norm": 2.3356548455202435, + "learning_rate": 1.8797271150415705e-07, + "loss": 0.2584, + "step": 2850 + }, + { + "epoch": 0.81, + "grad_norm": 3.1820448800506997, + "learning_rate": 1.8743741389775469e-07, + "loss": 0.2552, + "step": 2851 + }, + { + "epoch": 0.81, + "grad_norm": 2.23023753403099, + "learning_rate": 1.8690280074294473e-07, + "loss": 0.2587, + "step": 2852 + }, + { + "epoch": 0.81, + "grad_norm": 2.5679736393340393, + "learning_rate": 1.8636887249005174e-07, + "loss": 0.2795, + "step": 2853 + }, + { + "epoch": 0.81, + "grad_norm": 2.177529079557188, + "learning_rate": 1.8583562958882327e-07, + "loss": 0.2363, + "step": 2854 + }, + { + "epoch": 0.81, + "grad_norm": 2.4742523821737907, + "learning_rate": 1.853030724884297e-07, + "loss": 0.3092, + "step": 2855 + }, + { + "epoch": 0.81, + "grad_norm": 2.2885878314952874, + "learning_rate": 1.847712016374634e-07, + "loss": 0.2582, + "step": 2856 + }, + { + "epoch": 0.81, + "grad_norm": 2.74968714414436, + "learning_rate": 1.8424001748393904e-07, + "loss": 0.2926, + "step": 2857 + }, + { + "epoch": 0.81, + "grad_norm": 2.2223393631269373, + "learning_rate": 1.8370952047529263e-07, + "loss": 0.2478, + "step": 2858 + }, + { + "epoch": 0.81, + "grad_norm": 2.2864500433093164, + "learning_rate": 1.831797110583817e-07, + "loss": 0.2364, + "step": 2859 + }, + { + "epoch": 0.81, + "grad_norm": 2.4266795093411857, + "learning_rate": 1.8265058967948433e-07, + "loss": 0.2833, + "step": 2860 + }, + { + "epoch": 0.81, + "grad_norm": 2.609719240431143, + "learning_rate": 1.8212215678429854e-07, + "loss": 0.2891, + "step": 2861 + }, + { + "epoch": 0.81, + "grad_norm": 2.4925100660920267, + "learning_rate": 1.8159441281794352e-07, + "loss": 0.2934, + "step": 2862 + }, + { + "epoch": 0.81, + "grad_norm": 2.5036731108979406, + "learning_rate": 1.8106735822495744e-07, + "loss": 0.2703, + "step": 2863 + }, + { + "epoch": 0.81, + "grad_norm": 2.302888816291254, + "learning_rate": 1.805409934492983e-07, + "loss": 0.2605, + "step": 2864 + }, + { + "epoch": 0.81, + "grad_norm": 2.3070765097071737, + "learning_rate": 1.8001531893434185e-07, + "loss": 0.2479, + "step": 2865 + }, + { + "epoch": 0.81, + "grad_norm": 2.511833666867204, + "learning_rate": 1.7949033512288346e-07, + "loss": 0.2459, + "step": 2866 + }, + { + "epoch": 0.81, + "grad_norm": 3.8799279280843644, + "learning_rate": 1.7896604245713686e-07, + "loss": 0.2613, + "step": 2867 + }, + { + "epoch": 0.81, + "grad_norm": 2.144326368427733, + "learning_rate": 1.7844244137873298e-07, + "loss": 0.2169, + "step": 2868 + }, + { + "epoch": 0.81, + "grad_norm": 2.3584488923021802, + "learning_rate": 1.779195323287208e-07, + "loss": 0.2616, + "step": 2869 + }, + { + "epoch": 0.81, + "grad_norm": 2.705678283702525, + "learning_rate": 1.7739731574756522e-07, + "loss": 0.2795, + "step": 2870 + }, + { + "epoch": 0.81, + "grad_norm": 2.2011956923784397, + "learning_rate": 1.768757920751489e-07, + "loss": 0.2416, + "step": 2871 + }, + { + "epoch": 0.81, + "grad_norm": 2.2331302113818396, + "learning_rate": 1.7635496175077081e-07, + "loss": 0.265, + "step": 2872 + }, + { + "epoch": 0.81, + "grad_norm": 2.3523724278598275, + "learning_rate": 1.7583482521314595e-07, + "loss": 0.2821, + "step": 2873 + }, + { + "epoch": 0.81, + "grad_norm": 3.9974888016714254, + "learning_rate": 1.7531538290040382e-07, + "loss": 0.2884, + "step": 2874 + }, + { + "epoch": 0.81, + "grad_norm": 2.55439064748466, + "learning_rate": 1.7479663525009037e-07, + "loss": 0.2989, + "step": 2875 + }, + { + "epoch": 0.81, + "grad_norm": 2.3600553815923035, + "learning_rate": 1.7427858269916563e-07, + "loss": 0.259, + "step": 2876 + }, + { + "epoch": 0.82, + "grad_norm": 2.3988600611863737, + "learning_rate": 1.737612256840053e-07, + "loss": 0.2789, + "step": 2877 + }, + { + "epoch": 0.82, + "grad_norm": 2.3773615435211157, + "learning_rate": 1.732445646403975e-07, + "loss": 0.2787, + "step": 2878 + }, + { + "epoch": 0.82, + "grad_norm": 2.467114711446906, + "learning_rate": 1.7272860000354538e-07, + "loss": 0.27, + "step": 2879 + }, + { + "epoch": 0.82, + "grad_norm": 2.227839666993782, + "learning_rate": 1.7221333220806477e-07, + "loss": 0.2531, + "step": 2880 + }, + { + "epoch": 0.82, + "grad_norm": 2.3135070130291506, + "learning_rate": 1.7169876168798558e-07, + "loss": 0.2465, + "step": 2881 + }, + { + "epoch": 0.82, + "grad_norm": 2.539791939043467, + "learning_rate": 1.7118488887674887e-07, + "loss": 0.2816, + "step": 2882 + }, + { + "epoch": 0.82, + "grad_norm": 2.6893138314421017, + "learning_rate": 1.7067171420720904e-07, + "loss": 0.3075, + "step": 2883 + }, + { + "epoch": 0.82, + "grad_norm": 2.2844568548404656, + "learning_rate": 1.7015923811163224e-07, + "loss": 0.2549, + "step": 2884 + }, + { + "epoch": 0.82, + "grad_norm": 2.5417677784666646, + "learning_rate": 1.696474610216958e-07, + "loss": 0.3035, + "step": 2885 + }, + { + "epoch": 0.82, + "grad_norm": 2.283990586546514, + "learning_rate": 1.691363833684889e-07, + "loss": 0.2711, + "step": 2886 + }, + { + "epoch": 0.82, + "grad_norm": 2.4459137162379867, + "learning_rate": 1.6862600558251095e-07, + "loss": 0.2835, + "step": 2887 + }, + { + "epoch": 0.82, + "grad_norm": 2.3833279688766233, + "learning_rate": 1.6811632809367204e-07, + "loss": 0.2697, + "step": 2888 + }, + { + "epoch": 0.82, + "grad_norm": 2.5303782470283664, + "learning_rate": 1.6760735133129267e-07, + "loss": 0.268, + "step": 2889 + }, + { + "epoch": 0.82, + "grad_norm": 2.232816947627661, + "learning_rate": 1.6709907572410265e-07, + "loss": 0.263, + "step": 2890 + }, + { + "epoch": 0.82, + "grad_norm": 2.296004011835285, + "learning_rate": 1.665915017002414e-07, + "loss": 0.246, + "step": 2891 + }, + { + "epoch": 0.82, + "grad_norm": 2.339328657097091, + "learning_rate": 1.6608462968725733e-07, + "loss": 0.2813, + "step": 2892 + }, + { + "epoch": 0.82, + "grad_norm": 2.188178965593008, + "learning_rate": 1.6557846011210751e-07, + "loss": 0.2206, + "step": 2893 + }, + { + "epoch": 0.82, + "grad_norm": 2.5645938693793346, + "learning_rate": 1.6507299340115744e-07, + "loss": 0.2885, + "step": 2894 + }, + { + "epoch": 0.82, + "grad_norm": 2.339243501741995, + "learning_rate": 1.645682299801804e-07, + "loss": 0.2825, + "step": 2895 + }, + { + "epoch": 0.82, + "grad_norm": 2.367728617536032, + "learning_rate": 1.6406417027435727e-07, + "loss": 0.2447, + "step": 2896 + }, + { + "epoch": 0.82, + "grad_norm": 2.462794940200196, + "learning_rate": 1.6356081470827633e-07, + "loss": 0.2586, + "step": 2897 + }, + { + "epoch": 0.82, + "grad_norm": 2.9373717395321797, + "learning_rate": 1.6305816370593262e-07, + "loss": 0.2825, + "step": 2898 + }, + { + "epoch": 0.82, + "grad_norm": 2.2061403889425364, + "learning_rate": 1.6255621769072803e-07, + "loss": 0.2256, + "step": 2899 + }, + { + "epoch": 0.82, + "grad_norm": 2.255180702409096, + "learning_rate": 1.6205497708546933e-07, + "loss": 0.2343, + "step": 2900 + }, + { + "epoch": 0.82, + "grad_norm": 2.326546351934489, + "learning_rate": 1.6155444231237104e-07, + "loss": 0.2713, + "step": 2901 + }, + { + "epoch": 0.82, + "grad_norm": 2.9170467115769405, + "learning_rate": 1.6105461379305186e-07, + "loss": 0.2874, + "step": 2902 + }, + { + "epoch": 0.82, + "grad_norm": 2.400555296130694, + "learning_rate": 1.60555491948536e-07, + "loss": 0.255, + "step": 2903 + }, + { + "epoch": 0.82, + "grad_norm": 2.0770253089306814, + "learning_rate": 1.6005707719925188e-07, + "loss": 0.2383, + "step": 2904 + }, + { + "epoch": 0.82, + "grad_norm": 2.264987088165909, + "learning_rate": 1.5955936996503284e-07, + "loss": 0.2391, + "step": 2905 + }, + { + "epoch": 0.82, + "grad_norm": 2.337389217903713, + "learning_rate": 1.590623706651164e-07, + "loss": 0.2457, + "step": 2906 + }, + { + "epoch": 0.82, + "grad_norm": 2.2921471576670096, + "learning_rate": 1.5856607971814374e-07, + "loss": 0.2613, + "step": 2907 + }, + { + "epoch": 0.82, + "grad_norm": 2.301923660369795, + "learning_rate": 1.580704975421584e-07, + "loss": 0.2399, + "step": 2908 + }, + { + "epoch": 0.82, + "grad_norm": 2.4213038071881754, + "learning_rate": 1.5757562455460805e-07, + "loss": 0.244, + "step": 2909 + }, + { + "epoch": 0.82, + "grad_norm": 2.5169986268703934, + "learning_rate": 1.5708146117234223e-07, + "loss": 0.2921, + "step": 2910 + }, + { + "epoch": 0.82, + "grad_norm": 2.4131508151166474, + "learning_rate": 1.5658800781161363e-07, + "loss": 0.2583, + "step": 2911 + }, + { + "epoch": 0.83, + "grad_norm": 2.3094984954712166, + "learning_rate": 1.5609526488807611e-07, + "loss": 0.2428, + "step": 2912 + }, + { + "epoch": 0.83, + "grad_norm": 2.297686613691211, + "learning_rate": 1.5560323281678512e-07, + "loss": 0.2762, + "step": 2913 + }, + { + "epoch": 0.83, + "grad_norm": 2.401738290881385, + "learning_rate": 1.5511191201219732e-07, + "loss": 0.2604, + "step": 2914 + }, + { + "epoch": 0.83, + "grad_norm": 2.2765127696756253, + "learning_rate": 1.5462130288817088e-07, + "loss": 0.274, + "step": 2915 + }, + { + "epoch": 0.83, + "grad_norm": 2.2840651600147557, + "learning_rate": 1.5413140585796426e-07, + "loss": 0.2799, + "step": 2916 + }, + { + "epoch": 0.83, + "grad_norm": 2.3612164284457617, + "learning_rate": 1.536422213342352e-07, + "loss": 0.2449, + "step": 2917 + }, + { + "epoch": 0.83, + "grad_norm": 2.2899490356795273, + "learning_rate": 1.5315374972904238e-07, + "loss": 0.2416, + "step": 2918 + }, + { + "epoch": 0.83, + "grad_norm": 2.4698956489551285, + "learning_rate": 1.5266599145384318e-07, + "loss": 0.2752, + "step": 2919 + }, + { + "epoch": 0.83, + "grad_norm": 2.240094198960351, + "learning_rate": 1.5217894691949518e-07, + "loss": 0.242, + "step": 2920 + }, + { + "epoch": 0.83, + "grad_norm": 2.3749916729226386, + "learning_rate": 1.5169261653625343e-07, + "loss": 0.2798, + "step": 2921 + }, + { + "epoch": 0.83, + "grad_norm": 2.7876388271161603, + "learning_rate": 1.5120700071377212e-07, + "loss": 0.2577, + "step": 2922 + }, + { + "epoch": 0.83, + "grad_norm": 3.781441804581668, + "learning_rate": 1.5072209986110373e-07, + "loss": 0.272, + "step": 2923 + }, + { + "epoch": 0.83, + "grad_norm": 2.4347166397268785, + "learning_rate": 1.5023791438669797e-07, + "loss": 0.2711, + "step": 2924 + }, + { + "epoch": 0.83, + "grad_norm": 2.458491054092882, + "learning_rate": 1.4975444469840238e-07, + "loss": 0.3303, + "step": 2925 + }, + { + "epoch": 0.83, + "grad_norm": 2.2599999748751727, + "learning_rate": 1.492716912034614e-07, + "loss": 0.223, + "step": 2926 + }, + { + "epoch": 0.83, + "grad_norm": 2.3778895116764045, + "learning_rate": 1.487896543085161e-07, + "loss": 0.2316, + "step": 2927 + }, + { + "epoch": 0.83, + "grad_norm": 2.2784230957221787, + "learning_rate": 1.48308334419604e-07, + "loss": 0.278, + "step": 2928 + }, + { + "epoch": 0.83, + "grad_norm": 2.3700733375661387, + "learning_rate": 1.4782773194215882e-07, + "loss": 0.2806, + "step": 2929 + }, + { + "epoch": 0.83, + "grad_norm": 2.2901041783321707, + "learning_rate": 1.473478472810097e-07, + "loss": 0.263, + "step": 2930 + }, + { + "epoch": 0.83, + "grad_norm": 2.409600698001674, + "learning_rate": 1.468686808403814e-07, + "loss": 0.2804, + "step": 2931 + }, + { + "epoch": 0.83, + "grad_norm": 2.5969716621663568, + "learning_rate": 1.4639023302389364e-07, + "loss": 0.2831, + "step": 2932 + }, + { + "epoch": 0.83, + "grad_norm": 2.394297205291279, + "learning_rate": 1.4591250423456046e-07, + "loss": 0.2396, + "step": 2933 + }, + { + "epoch": 0.83, + "grad_norm": 2.448689657248942, + "learning_rate": 1.454354948747909e-07, + "loss": 0.2427, + "step": 2934 + }, + { + "epoch": 0.83, + "grad_norm": 2.438782418407605, + "learning_rate": 1.449592053463874e-07, + "loss": 0.2961, + "step": 2935 + }, + { + "epoch": 0.83, + "grad_norm": 2.354138825480889, + "learning_rate": 1.4448363605054636e-07, + "loss": 0.2625, + "step": 2936 + }, + { + "epoch": 0.83, + "grad_norm": 2.2592559986699303, + "learning_rate": 1.440087873878574e-07, + "loss": 0.2509, + "step": 2937 + }, + { + "epoch": 0.83, + "grad_norm": 2.330818222709863, + "learning_rate": 1.4353465975830336e-07, + "loss": 0.2567, + "step": 2938 + }, + { + "epoch": 0.83, + "grad_norm": 2.4420070625144805, + "learning_rate": 1.4306125356125896e-07, + "loss": 0.2838, + "step": 2939 + }, + { + "epoch": 0.83, + "grad_norm": 2.2464288961476915, + "learning_rate": 1.4258856919549232e-07, + "loss": 0.2555, + "step": 2940 + }, + { + "epoch": 0.83, + "grad_norm": 2.3803013873706544, + "learning_rate": 1.4211660705916285e-07, + "loss": 0.2361, + "step": 2941 + }, + { + "epoch": 0.83, + "grad_norm": 2.410504833130406, + "learning_rate": 1.4164536754982203e-07, + "loss": 0.2519, + "step": 2942 + }, + { + "epoch": 0.83, + "grad_norm": 2.7036576648926016, + "learning_rate": 1.4117485106441186e-07, + "loss": 0.2542, + "step": 2943 + }, + { + "epoch": 0.83, + "grad_norm": 2.5126251019306363, + "learning_rate": 1.407050579992658e-07, + "loss": 0.2909, + "step": 2944 + }, + { + "epoch": 0.83, + "grad_norm": 2.246164260692375, + "learning_rate": 1.4023598875010844e-07, + "loss": 0.264, + "step": 2945 + }, + { + "epoch": 0.83, + "grad_norm": 2.268685222902053, + "learning_rate": 1.3976764371205418e-07, + "loss": 0.2696, + "step": 2946 + }, + { + "epoch": 0.84, + "grad_norm": 2.504224124261662, + "learning_rate": 1.39300023279607e-07, + "loss": 0.2633, + "step": 2947 + }, + { + "epoch": 0.84, + "grad_norm": 2.5660103800190637, + "learning_rate": 1.388331278466609e-07, + "loss": 0.2827, + "step": 2948 + }, + { + "epoch": 0.84, + "grad_norm": 2.4725541982526758, + "learning_rate": 1.3836695780649976e-07, + "loss": 0.3022, + "step": 2949 + }, + { + "epoch": 0.84, + "grad_norm": 2.446324380195441, + "learning_rate": 1.379015135517958e-07, + "loss": 0.2907, + "step": 2950 + }, + { + "epoch": 0.84, + "grad_norm": 2.2514612224451906, + "learning_rate": 1.374367954746094e-07, + "loss": 0.2706, + "step": 2951 + }, + { + "epoch": 0.84, + "grad_norm": 2.415819572419356, + "learning_rate": 1.3697280396639034e-07, + "loss": 0.2523, + "step": 2952 + }, + { + "epoch": 0.84, + "grad_norm": 2.1459371173307846, + "learning_rate": 1.365095394179754e-07, + "loss": 0.2396, + "step": 2953 + }, + { + "epoch": 0.84, + "grad_norm": 2.323396101986738, + "learning_rate": 1.360470022195902e-07, + "loss": 0.2739, + "step": 2954 + }, + { + "epoch": 0.84, + "grad_norm": 2.3080472437525437, + "learning_rate": 1.3558519276084635e-07, + "loss": 0.2503, + "step": 2955 + }, + { + "epoch": 0.84, + "grad_norm": 2.473968673368382, + "learning_rate": 1.3512411143074332e-07, + "loss": 0.296, + "step": 2956 + }, + { + "epoch": 0.84, + "grad_norm": 2.3430591542502466, + "learning_rate": 1.3466375861766698e-07, + "loss": 0.2451, + "step": 2957 + }, + { + "epoch": 0.84, + "grad_norm": 2.393077540550777, + "learning_rate": 1.3420413470938942e-07, + "loss": 0.2659, + "step": 2958 + }, + { + "epoch": 0.84, + "grad_norm": 2.6086430834408345, + "learning_rate": 1.3374524009306942e-07, + "loss": 0.2641, + "step": 2959 + }, + { + "epoch": 0.84, + "grad_norm": 2.621421328007362, + "learning_rate": 1.332870751552503e-07, + "loss": 0.2779, + "step": 2960 + }, + { + "epoch": 0.84, + "grad_norm": 2.840293461456054, + "learning_rate": 1.3282964028186172e-07, + "loss": 0.3053, + "step": 2961 + }, + { + "epoch": 0.84, + "grad_norm": 2.470822092275965, + "learning_rate": 1.3237293585821785e-07, + "loss": 0.2622, + "step": 2962 + }, + { + "epoch": 0.84, + "grad_norm": 2.252474082136468, + "learning_rate": 1.3191696226901795e-07, + "loss": 0.2718, + "step": 2963 + }, + { + "epoch": 0.84, + "grad_norm": 2.3074512131763516, + "learning_rate": 1.314617198983454e-07, + "loss": 0.243, + "step": 2964 + }, + { + "epoch": 0.84, + "grad_norm": 2.512138084713408, + "learning_rate": 1.3100720912966766e-07, + "loss": 0.272, + "step": 2965 + }, + { + "epoch": 0.84, + "grad_norm": 2.3739192198771737, + "learning_rate": 1.305534303458361e-07, + "loss": 0.2755, + "step": 2966 + }, + { + "epoch": 0.84, + "grad_norm": 2.2893167627464166, + "learning_rate": 1.301003839290853e-07, + "loss": 0.2408, + "step": 2967 + }, + { + "epoch": 0.84, + "grad_norm": 2.270167707949592, + "learning_rate": 1.296480702610332e-07, + "loss": 0.2501, + "step": 2968 + }, + { + "epoch": 0.84, + "grad_norm": 2.3914772364832166, + "learning_rate": 1.2919648972268027e-07, + "loss": 0.2323, + "step": 2969 + }, + { + "epoch": 0.84, + "grad_norm": 2.472406533385478, + "learning_rate": 1.2874564269440958e-07, + "loss": 0.3096, + "step": 2970 + }, + { + "epoch": 0.84, + "grad_norm": 2.5115059991420203, + "learning_rate": 1.2829552955598622e-07, + "loss": 0.3056, + "step": 2971 + }, + { + "epoch": 0.84, + "grad_norm": 2.1147586969077694, + "learning_rate": 1.2784615068655745e-07, + "loss": 0.2611, + "step": 2972 + }, + { + "epoch": 0.84, + "grad_norm": 2.3814559744375745, + "learning_rate": 1.273975064646512e-07, + "loss": 0.2569, + "step": 2973 + }, + { + "epoch": 0.84, + "grad_norm": 2.525852265833298, + "learning_rate": 1.2694959726817767e-07, + "loss": 0.2583, + "step": 2974 + }, + { + "epoch": 0.84, + "grad_norm": 2.4224050505105277, + "learning_rate": 1.2650242347442707e-07, + "loss": 0.2269, + "step": 2975 + }, + { + "epoch": 0.84, + "grad_norm": 2.3454068114753697, + "learning_rate": 1.260559854600709e-07, + "loss": 0.2746, + "step": 2976 + }, + { + "epoch": 0.84, + "grad_norm": 2.279877000511415, + "learning_rate": 1.2561028360116e-07, + "loss": 0.255, + "step": 2977 + }, + { + "epoch": 0.84, + "grad_norm": 2.416841526419076, + "learning_rate": 1.251653182731254e-07, + "loss": 0.2998, + "step": 2978 + }, + { + "epoch": 0.84, + "grad_norm": 2.4618454435380572, + "learning_rate": 1.2472108985077834e-07, + "loss": 0.2692, + "step": 2979 + }, + { + "epoch": 0.84, + "grad_norm": 2.5288543032906126, + "learning_rate": 1.242775987083088e-07, + "loss": 0.2515, + "step": 2980 + }, + { + "epoch": 0.84, + "grad_norm": 2.307446477470878, + "learning_rate": 1.23834845219286e-07, + "loss": 0.2711, + "step": 2981 + }, + { + "epoch": 0.84, + "grad_norm": 2.3118979472768664, + "learning_rate": 1.233928297566571e-07, + "loss": 0.2632, + "step": 2982 + }, + { + "epoch": 0.85, + "grad_norm": 3.0975762766164716, + "learning_rate": 1.2295155269274827e-07, + "loss": 0.2617, + "step": 2983 + }, + { + "epoch": 0.85, + "grad_norm": 2.252171146783034, + "learning_rate": 1.225110143992638e-07, + "loss": 0.296, + "step": 2984 + }, + { + "epoch": 0.85, + "grad_norm": 2.699367516738382, + "learning_rate": 1.220712152472856e-07, + "loss": 0.3007, + "step": 2985 + }, + { + "epoch": 0.85, + "grad_norm": 2.3609242051199897, + "learning_rate": 1.2163215560727214e-07, + "loss": 0.2853, + "step": 2986 + }, + { + "epoch": 0.85, + "grad_norm": 2.239716626762453, + "learning_rate": 1.2119383584905985e-07, + "loss": 0.2527, + "step": 2987 + }, + { + "epoch": 0.85, + "grad_norm": 2.281800284296882, + "learning_rate": 1.2075625634186205e-07, + "loss": 0.2509, + "step": 2988 + }, + { + "epoch": 0.85, + "grad_norm": 2.416589421795546, + "learning_rate": 1.203194174542682e-07, + "loss": 0.2919, + "step": 2989 + }, + { + "epoch": 0.85, + "grad_norm": 2.6405939356926527, + "learning_rate": 1.1988331955424347e-07, + "loss": 0.2755, + "step": 2990 + }, + { + "epoch": 0.85, + "grad_norm": 4.045979386286837, + "learning_rate": 1.194479630091294e-07, + "loss": 0.2376, + "step": 2991 + }, + { + "epoch": 0.85, + "grad_norm": 2.2126916596240935, + "learning_rate": 1.190133481856429e-07, + "loss": 0.2668, + "step": 2992 + }, + { + "epoch": 0.85, + "grad_norm": 2.2755646904017857, + "learning_rate": 1.1857947544987668e-07, + "loss": 0.2777, + "step": 2993 + }, + { + "epoch": 0.85, + "grad_norm": 2.431657462454105, + "learning_rate": 1.1814634516729726e-07, + "loss": 0.303, + "step": 2994 + }, + { + "epoch": 0.85, + "grad_norm": 2.38579299772321, + "learning_rate": 1.177139577027465e-07, + "loss": 0.2728, + "step": 2995 + }, + { + "epoch": 0.85, + "grad_norm": 2.1828610624233415, + "learning_rate": 1.1728231342044049e-07, + "loss": 0.2322, + "step": 2996 + }, + { + "epoch": 0.85, + "grad_norm": 2.286268924458742, + "learning_rate": 1.1685141268396902e-07, + "loss": 0.2507, + "step": 2997 + }, + { + "epoch": 0.85, + "grad_norm": 2.34834559713555, + "learning_rate": 1.1642125585629592e-07, + "loss": 0.2757, + "step": 2998 + }, + { + "epoch": 0.85, + "grad_norm": 2.446898983893535, + "learning_rate": 1.1599184329975809e-07, + "loss": 0.2751, + "step": 2999 + }, + { + "epoch": 0.85, + "grad_norm": 2.3969050527751103, + "learning_rate": 1.1556317537606586e-07, + "loss": 0.2653, + "step": 3000 + }, + { + "epoch": 0.85, + "grad_norm": 2.2928562405437947, + "learning_rate": 1.1513525244630196e-07, + "loss": 0.2647, + "step": 3001 + }, + { + "epoch": 0.85, + "grad_norm": 2.343027458930917, + "learning_rate": 1.1470807487092171e-07, + "loss": 0.2419, + "step": 3002 + }, + { + "epoch": 0.85, + "grad_norm": 2.4596000456064733, + "learning_rate": 1.1428164300975274e-07, + "loss": 0.2695, + "step": 3003 + }, + { + "epoch": 0.85, + "grad_norm": 2.1712507233695026, + "learning_rate": 1.1385595722199437e-07, + "loss": 0.2453, + "step": 3004 + }, + { + "epoch": 0.85, + "grad_norm": 2.1791223390148597, + "learning_rate": 1.1343101786621745e-07, + "loss": 0.2565, + "step": 3005 + }, + { + "epoch": 0.85, + "grad_norm": 2.2200097978674984, + "learning_rate": 1.1300682530036432e-07, + "loss": 0.2485, + "step": 3006 + }, + { + "epoch": 0.85, + "grad_norm": 2.3623281275120664, + "learning_rate": 1.1258337988174793e-07, + "loss": 0.2473, + "step": 3007 + }, + { + "epoch": 0.85, + "grad_norm": 2.315292263531697, + "learning_rate": 1.1216068196705208e-07, + "loss": 0.2621, + "step": 3008 + }, + { + "epoch": 0.85, + "grad_norm": 2.3133663961978814, + "learning_rate": 1.1173873191233096e-07, + "loss": 0.2478, + "step": 3009 + }, + { + "epoch": 0.85, + "grad_norm": 2.430708800052876, + "learning_rate": 1.1131753007300881e-07, + "loss": 0.2708, + "step": 3010 + }, + { + "epoch": 0.85, + "grad_norm": 2.5155824409494305, + "learning_rate": 1.1089707680387961e-07, + "loss": 0.2741, + "step": 3011 + }, + { + "epoch": 0.85, + "grad_norm": 2.9174229312451216, + "learning_rate": 1.1047737245910615e-07, + "loss": 0.275, + "step": 3012 + }, + { + "epoch": 0.85, + "grad_norm": 2.369638242381517, + "learning_rate": 1.1005841739222166e-07, + "loss": 0.2588, + "step": 3013 + }, + { + "epoch": 0.85, + "grad_norm": 2.2897156128120524, + "learning_rate": 1.0964021195612728e-07, + "loss": 0.2717, + "step": 3014 + }, + { + "epoch": 0.85, + "grad_norm": 2.3830666042149384, + "learning_rate": 1.0922275650309321e-07, + "loss": 0.2664, + "step": 3015 + }, + { + "epoch": 0.85, + "grad_norm": 2.3412654270408115, + "learning_rate": 1.0880605138475707e-07, + "loss": 0.2655, + "step": 3016 + }, + { + "epoch": 0.85, + "grad_norm": 2.382093627470339, + "learning_rate": 1.083900969521252e-07, + "loss": 0.2944, + "step": 3017 + }, + { + "epoch": 0.86, + "grad_norm": 2.492195238746543, + "learning_rate": 1.0797489355557188e-07, + "loss": 0.2826, + "step": 3018 + }, + { + "epoch": 0.86, + "grad_norm": 2.328835974383053, + "learning_rate": 1.0756044154483812e-07, + "loss": 0.2666, + "step": 3019 + }, + { + "epoch": 0.86, + "grad_norm": 2.2091174854967988, + "learning_rate": 1.07146741269032e-07, + "loss": 0.2796, + "step": 3020 + }, + { + "epoch": 0.86, + "grad_norm": 2.2501063439867477, + "learning_rate": 1.0673379307662855e-07, + "loss": 0.2591, + "step": 3021 + }, + { + "epoch": 0.86, + "grad_norm": 2.3549354214289524, + "learning_rate": 1.0632159731546964e-07, + "loss": 0.2486, + "step": 3022 + }, + { + "epoch": 0.86, + "grad_norm": 2.2242715824156876, + "learning_rate": 1.0591015433276306e-07, + "loss": 0.2486, + "step": 3023 + }, + { + "epoch": 0.86, + "grad_norm": 2.3033630244095527, + "learning_rate": 1.054994644750824e-07, + "loss": 0.2632, + "step": 3024 + }, + { + "epoch": 0.86, + "grad_norm": 2.343530716837555, + "learning_rate": 1.050895280883668e-07, + "loss": 0.258, + "step": 3025 + }, + { + "epoch": 0.86, + "grad_norm": 2.4582728084197667, + "learning_rate": 1.0468034551792083e-07, + "loss": 0.2733, + "step": 3026 + }, + { + "epoch": 0.86, + "grad_norm": 5.114240863591207, + "learning_rate": 1.0427191710841443e-07, + "loss": 0.2787, + "step": 3027 + }, + { + "epoch": 0.86, + "grad_norm": 2.8725573321328333, + "learning_rate": 1.0386424320388209e-07, + "loss": 0.2729, + "step": 3028 + }, + { + "epoch": 0.86, + "grad_norm": 2.2117518738730926, + "learning_rate": 1.0345732414772224e-07, + "loss": 0.2437, + "step": 3029 + }, + { + "epoch": 0.86, + "grad_norm": 2.4384125875736773, + "learning_rate": 1.0305116028269812e-07, + "loss": 0.2661, + "step": 3030 + }, + { + "epoch": 0.86, + "grad_norm": 2.38059904238801, + "learning_rate": 1.0264575195093628e-07, + "loss": 0.2384, + "step": 3031 + }, + { + "epoch": 0.86, + "grad_norm": 2.288709522342751, + "learning_rate": 1.022410994939279e-07, + "loss": 0.2578, + "step": 3032 + }, + { + "epoch": 0.86, + "grad_norm": 2.309509735794672, + "learning_rate": 1.0183720325252609e-07, + "loss": 0.2495, + "step": 3033 + }, + { + "epoch": 0.86, + "grad_norm": 2.3556405665317097, + "learning_rate": 1.0143406356694795e-07, + "loss": 0.2415, + "step": 3034 + }, + { + "epoch": 0.86, + "grad_norm": 2.3462410688998205, + "learning_rate": 1.0103168077677283e-07, + "loss": 0.2824, + "step": 3035 + }, + { + "epoch": 0.86, + "grad_norm": 2.1535339888759526, + "learning_rate": 1.006300552209427e-07, + "loss": 0.2374, + "step": 3036 + }, + { + "epoch": 0.86, + "grad_norm": 2.3690379200829725, + "learning_rate": 1.0022918723776175e-07, + "loss": 0.2625, + "step": 3037 + }, + { + "epoch": 0.86, + "grad_norm": 2.3032429338900373, + "learning_rate": 9.982907716489586e-08, + "loss": 0.272, + "step": 3038 + }, + { + "epoch": 0.86, + "grad_norm": 2.608970378774197, + "learning_rate": 9.942972533937266e-08, + "loss": 0.3033, + "step": 3039 + }, + { + "epoch": 0.86, + "grad_norm": 2.3119793826541364, + "learning_rate": 9.903113209758096e-08, + "loss": 0.2726, + "step": 3040 + }, + { + "epoch": 0.86, + "grad_norm": 2.381000490542755, + "learning_rate": 9.863329777527052e-08, + "loss": 0.2681, + "step": 3041 + }, + { + "epoch": 0.86, + "grad_norm": 2.4542599325019796, + "learning_rate": 9.823622270755205e-08, + "loss": 0.2564, + "step": 3042 + }, + { + "epoch": 0.86, + "grad_norm": 2.4368086277548415, + "learning_rate": 9.783990722889657e-08, + "loss": 0.2614, + "step": 3043 + }, + { + "epoch": 0.86, + "grad_norm": 2.644344757700451, + "learning_rate": 9.744435167313536e-08, + "loss": 0.2969, + "step": 3044 + }, + { + "epoch": 0.86, + "grad_norm": 2.506641792481987, + "learning_rate": 9.704955637345946e-08, + "loss": 0.2606, + "step": 3045 + }, + { + "epoch": 0.86, + "grad_norm": 2.428419743065912, + "learning_rate": 9.665552166241964e-08, + "loss": 0.2581, + "step": 3046 + }, + { + "epoch": 0.86, + "grad_norm": 2.308346279825781, + "learning_rate": 9.626224787192594e-08, + "loss": 0.2351, + "step": 3047 + }, + { + "epoch": 0.86, + "grad_norm": 2.4120125079732713, + "learning_rate": 9.586973533324738e-08, + "loss": 0.274, + "step": 3048 + }, + { + "epoch": 0.86, + "grad_norm": 2.1444026166759813, + "learning_rate": 9.547798437701193e-08, + "loss": 0.2458, + "step": 3049 + }, + { + "epoch": 0.86, + "grad_norm": 2.3280997441596494, + "learning_rate": 9.508699533320597e-08, + "loss": 0.2785, + "step": 3050 + }, + { + "epoch": 0.86, + "grad_norm": 2.468914423283346, + "learning_rate": 9.46967685311737e-08, + "loss": 0.3056, + "step": 3051 + }, + { + "epoch": 0.86, + "grad_norm": 2.251997965826707, + "learning_rate": 9.430730429961808e-08, + "loss": 0.2508, + "step": 3052 + }, + { + "epoch": 0.87, + "grad_norm": 2.3898049144957327, + "learning_rate": 9.391860296659915e-08, + "loss": 0.2694, + "step": 3053 + }, + { + "epoch": 0.87, + "grad_norm": 2.3671284513441533, + "learning_rate": 9.353066485953454e-08, + "loss": 0.2544, + "step": 3054 + }, + { + "epoch": 0.87, + "grad_norm": 2.4648509691775216, + "learning_rate": 9.314349030519842e-08, + "loss": 0.2771, + "step": 3055 + }, + { + "epoch": 0.87, + "grad_norm": 2.694199196787725, + "learning_rate": 9.275707962972279e-08, + "loss": 0.289, + "step": 3056 + }, + { + "epoch": 0.87, + "grad_norm": 2.2566611611970706, + "learning_rate": 9.237143315859552e-08, + "loss": 0.2622, + "step": 3057 + }, + { + "epoch": 0.87, + "grad_norm": 2.7771475571941817, + "learning_rate": 9.19865512166611e-08, + "loss": 0.2728, + "step": 3058 + }, + { + "epoch": 0.87, + "grad_norm": 2.395294527739055, + "learning_rate": 9.160243412811952e-08, + "loss": 0.2783, + "step": 3059 + }, + { + "epoch": 0.87, + "grad_norm": 2.342150068379509, + "learning_rate": 9.121908221652674e-08, + "loss": 0.2592, + "step": 3060 + }, + { + "epoch": 0.87, + "grad_norm": 3.3201108482311854, + "learning_rate": 9.083649580479491e-08, + "loss": 0.2906, + "step": 3061 + }, + { + "epoch": 0.87, + "grad_norm": 2.38692281040695, + "learning_rate": 9.045467521519045e-08, + "loss": 0.2888, + "step": 3062 + }, + { + "epoch": 0.87, + "grad_norm": 5.2488795924967935, + "learning_rate": 9.00736207693349e-08, + "loss": 0.2834, + "step": 3063 + }, + { + "epoch": 0.87, + "grad_norm": 2.5617539242024745, + "learning_rate": 8.969333278820445e-08, + "loss": 0.281, + "step": 3064 + }, + { + "epoch": 0.87, + "grad_norm": 2.2762094504720207, + "learning_rate": 8.931381159212981e-08, + "loss": 0.272, + "step": 3065 + }, + { + "epoch": 0.87, + "grad_norm": 2.397954895380229, + "learning_rate": 8.893505750079622e-08, + "loss": 0.2529, + "step": 3066 + }, + { + "epoch": 0.87, + "grad_norm": 2.451818318013084, + "learning_rate": 8.855707083324181e-08, + "loss": 0.2849, + "step": 3067 + }, + { + "epoch": 0.87, + "grad_norm": 2.330369584868695, + "learning_rate": 8.817985190785882e-08, + "loss": 0.2503, + "step": 3068 + }, + { + "epoch": 0.87, + "grad_norm": 2.347047792958819, + "learning_rate": 8.780340104239282e-08, + "loss": 0.2951, + "step": 3069 + }, + { + "epoch": 0.87, + "grad_norm": 2.4605820045070477, + "learning_rate": 8.742771855394204e-08, + "loss": 0.2863, + "step": 3070 + }, + { + "epoch": 0.87, + "grad_norm": 2.346271559452563, + "learning_rate": 8.705280475895848e-08, + "loss": 0.2666, + "step": 3071 + }, + { + "epoch": 0.87, + "grad_norm": 2.3650878311332355, + "learning_rate": 8.66786599732453e-08, + "loss": 0.2865, + "step": 3072 + }, + { + "epoch": 0.87, + "grad_norm": 2.4735231589506377, + "learning_rate": 8.630528451195873e-08, + "loss": 0.2652, + "step": 3073 + }, + { + "epoch": 0.87, + "grad_norm": 2.1078553801134716, + "learning_rate": 8.593267868960674e-08, + "loss": 0.2686, + "step": 3074 + }, + { + "epoch": 0.87, + "grad_norm": 2.367953894551575, + "learning_rate": 8.556084282004905e-08, + "loss": 0.2442, + "step": 3075 + }, + { + "epoch": 0.87, + "grad_norm": 2.6606227224227568, + "learning_rate": 8.518977721649679e-08, + "loss": 0.2446, + "step": 3076 + }, + { + "epoch": 0.87, + "grad_norm": 6.555949455627507, + "learning_rate": 8.481948219151225e-08, + "loss": 0.3033, + "step": 3077 + }, + { + "epoch": 0.87, + "grad_norm": 2.900102774963622, + "learning_rate": 8.444995805700872e-08, + "loss": 0.2805, + "step": 3078 + }, + { + "epoch": 0.87, + "grad_norm": 2.2788626417574056, + "learning_rate": 8.408120512424999e-08, + "loss": 0.2559, + "step": 3079 + }, + { + "epoch": 0.87, + "grad_norm": 2.240323058843195, + "learning_rate": 8.371322370385048e-08, + "loss": 0.2779, + "step": 3080 + }, + { + "epoch": 0.87, + "grad_norm": 2.3898302231497137, + "learning_rate": 8.334601410577436e-08, + "loss": 0.2853, + "step": 3081 + }, + { + "epoch": 0.87, + "grad_norm": 2.3482574141862327, + "learning_rate": 8.297957663933608e-08, + "loss": 0.2752, + "step": 3082 + }, + { + "epoch": 0.87, + "grad_norm": 2.319169634223261, + "learning_rate": 8.261391161319941e-08, + "loss": 0.24, + "step": 3083 + }, + { + "epoch": 0.87, + "grad_norm": 2.3155577880334555, + "learning_rate": 8.224901933537776e-08, + "loss": 0.2726, + "step": 3084 + }, + { + "epoch": 0.87, + "grad_norm": 2.2189740851909225, + "learning_rate": 8.18849001132329e-08, + "loss": 0.2405, + "step": 3085 + }, + { + "epoch": 0.87, + "grad_norm": 2.4746209392963223, + "learning_rate": 8.15215542534765e-08, + "loss": 0.268, + "step": 3086 + }, + { + "epoch": 0.87, + "grad_norm": 2.180319378355328, + "learning_rate": 8.115898206216798e-08, + "loss": 0.2591, + "step": 3087 + }, + { + "epoch": 0.87, + "grad_norm": 2.343287819286448, + "learning_rate": 8.079718384471557e-08, + "loss": 0.2634, + "step": 3088 + }, + { + "epoch": 0.88, + "grad_norm": 2.357102540519274, + "learning_rate": 8.043615990587494e-08, + "loss": 0.2686, + "step": 3089 + }, + { + "epoch": 0.88, + "grad_norm": 2.3110864751314604, + "learning_rate": 8.007591054975016e-08, + "loss": 0.2883, + "step": 3090 + }, + { + "epoch": 0.88, + "grad_norm": 3.5614262632707647, + "learning_rate": 7.971643607979273e-08, + "loss": 0.2842, + "step": 3091 + }, + { + "epoch": 0.88, + "grad_norm": 2.335392757844915, + "learning_rate": 7.93577367988012e-08, + "loss": 0.2757, + "step": 3092 + }, + { + "epoch": 0.88, + "grad_norm": 2.3085814088282475, + "learning_rate": 7.899981300892144e-08, + "loss": 0.258, + "step": 3093 + }, + { + "epoch": 0.88, + "grad_norm": 2.1934363925819382, + "learning_rate": 7.86426650116454e-08, + "loss": 0.2362, + "step": 3094 + }, + { + "epoch": 0.88, + "grad_norm": 2.1467726540158205, + "learning_rate": 7.828629310781265e-08, + "loss": 0.2312, + "step": 3095 + }, + { + "epoch": 0.88, + "grad_norm": 2.36570585706135, + "learning_rate": 7.793069759760829e-08, + "loss": 0.2702, + "step": 3096 + }, + { + "epoch": 0.88, + "grad_norm": 2.3436174242214376, + "learning_rate": 7.75758787805637e-08, + "loss": 0.2409, + "step": 3097 + }, + { + "epoch": 0.88, + "grad_norm": 2.430656309119998, + "learning_rate": 7.722183695555562e-08, + "loss": 0.2931, + "step": 3098 + }, + { + "epoch": 0.88, + "grad_norm": 2.4954309256933866, + "learning_rate": 7.686857242080669e-08, + "loss": 0.2587, + "step": 3099 + }, + { + "epoch": 0.88, + "grad_norm": 2.2823343280136217, + "learning_rate": 7.651608547388489e-08, + "loss": 0.2446, + "step": 3100 + }, + { + "epoch": 0.88, + "grad_norm": 2.4514656994841872, + "learning_rate": 7.616437641170315e-08, + "loss": 0.2562, + "step": 3101 + }, + { + "epoch": 0.88, + "grad_norm": 2.3284698907360495, + "learning_rate": 7.581344553051871e-08, + "loss": 0.2606, + "step": 3102 + }, + { + "epoch": 0.88, + "grad_norm": 2.6800251316823456, + "learning_rate": 7.54632931259338e-08, + "loss": 0.2645, + "step": 3103 + }, + { + "epoch": 0.88, + "grad_norm": 2.5123704990491103, + "learning_rate": 7.51139194928947e-08, + "loss": 0.2784, + "step": 3104 + }, + { + "epoch": 0.88, + "grad_norm": 2.3995486614027737, + "learning_rate": 7.47653249256922e-08, + "loss": 0.2573, + "step": 3105 + }, + { + "epoch": 0.88, + "grad_norm": 2.2445829265036874, + "learning_rate": 7.44175097179599e-08, + "loss": 0.2652, + "step": 3106 + }, + { + "epoch": 0.88, + "grad_norm": 2.516997912739564, + "learning_rate": 7.407047416267564e-08, + "loss": 0.2722, + "step": 3107 + }, + { + "epoch": 0.88, + "grad_norm": 2.197115715153637, + "learning_rate": 7.372421855216037e-08, + "loss": 0.251, + "step": 3108 + }, + { + "epoch": 0.88, + "grad_norm": 2.3742140188304464, + "learning_rate": 7.337874317807802e-08, + "loss": 0.2825, + "step": 3109 + }, + { + "epoch": 0.88, + "grad_norm": 2.1502066887265965, + "learning_rate": 7.303404833143522e-08, + "loss": 0.242, + "step": 3110 + }, + { + "epoch": 0.88, + "grad_norm": 2.580142884634076, + "learning_rate": 7.269013430258131e-08, + "loss": 0.3137, + "step": 3111 + }, + { + "epoch": 0.88, + "grad_norm": 2.3035826394343055, + "learning_rate": 7.234700138120776e-08, + "loss": 0.2367, + "step": 3112 + }, + { + "epoch": 0.88, + "grad_norm": 2.3843530141893963, + "learning_rate": 7.200464985634824e-08, + "loss": 0.2524, + "step": 3113 + }, + { + "epoch": 0.88, + "grad_norm": 2.3209743886788656, + "learning_rate": 7.166308001637811e-08, + "loss": 0.2899, + "step": 3114 + }, + { + "epoch": 0.88, + "grad_norm": 2.3303005873390066, + "learning_rate": 7.13222921490142e-08, + "loss": 0.2637, + "step": 3115 + }, + { + "epoch": 0.88, + "grad_norm": 2.426996076177478, + "learning_rate": 7.098228654131488e-08, + "loss": 0.2905, + "step": 3116 + }, + { + "epoch": 0.88, + "grad_norm": 2.2879093080690045, + "learning_rate": 7.064306347967952e-08, + "loss": 0.292, + "step": 3117 + }, + { + "epoch": 0.88, + "grad_norm": 2.337757585829966, + "learning_rate": 7.03046232498482e-08, + "loss": 0.248, + "step": 3118 + }, + { + "epoch": 0.88, + "grad_norm": 2.603799211351658, + "learning_rate": 6.996696613690156e-08, + "loss": 0.2754, + "step": 3119 + }, + { + "epoch": 0.88, + "grad_norm": 2.432035830910913, + "learning_rate": 6.963009242526096e-08, + "loss": 0.2708, + "step": 3120 + }, + { + "epoch": 0.88, + "grad_norm": 2.4988923447017273, + "learning_rate": 6.929400239868743e-08, + "loss": 0.2578, + "step": 3121 + }, + { + "epoch": 0.88, + "grad_norm": 2.1903315111152, + "learning_rate": 6.895869634028217e-08, + "loss": 0.2433, + "step": 3122 + }, + { + "epoch": 0.88, + "grad_norm": 2.182167491069264, + "learning_rate": 6.862417453248593e-08, + "loss": 0.224, + "step": 3123 + }, + { + "epoch": 0.89, + "grad_norm": 2.23234379480758, + "learning_rate": 6.82904372570785e-08, + "loss": 0.2661, + "step": 3124 + }, + { + "epoch": 0.89, + "grad_norm": 2.1799081969004988, + "learning_rate": 6.79574847951796e-08, + "loss": 0.2723, + "step": 3125 + }, + { + "epoch": 0.89, + "grad_norm": 2.6825595419677004, + "learning_rate": 6.76253174272472e-08, + "loss": 0.2784, + "step": 3126 + }, + { + "epoch": 0.89, + "grad_norm": 2.5886007659552006, + "learning_rate": 6.729393543307837e-08, + "loss": 0.3001, + "step": 3127 + }, + { + "epoch": 0.89, + "grad_norm": 2.4011926903124863, + "learning_rate": 6.696333909180796e-08, + "loss": 0.254, + "step": 3128 + }, + { + "epoch": 0.89, + "grad_norm": 2.379893316621335, + "learning_rate": 6.663352868191008e-08, + "loss": 0.2617, + "step": 3129 + }, + { + "epoch": 0.89, + "grad_norm": 2.6057157508985167, + "learning_rate": 6.630450448119617e-08, + "loss": 0.2767, + "step": 3130 + }, + { + "epoch": 0.89, + "grad_norm": 2.3009825684020466, + "learning_rate": 6.597626676681545e-08, + "loss": 0.2459, + "step": 3131 + }, + { + "epoch": 0.89, + "grad_norm": 2.3740096091255003, + "learning_rate": 6.564881581525449e-08, + "loss": 0.2746, + "step": 3132 + }, + { + "epoch": 0.89, + "grad_norm": 2.665005965150247, + "learning_rate": 6.532215190233747e-08, + "loss": 0.3207, + "step": 3133 + }, + { + "epoch": 0.89, + "grad_norm": 2.319434599678059, + "learning_rate": 6.499627530322582e-08, + "loss": 0.2632, + "step": 3134 + }, + { + "epoch": 0.89, + "grad_norm": 2.684205728247248, + "learning_rate": 6.467118629241718e-08, + "loss": 0.2644, + "step": 3135 + }, + { + "epoch": 0.89, + "grad_norm": 2.44892949559697, + "learning_rate": 6.434688514374632e-08, + "loss": 0.262, + "step": 3136 + }, + { + "epoch": 0.89, + "grad_norm": 2.3548233221374733, + "learning_rate": 6.402337213038378e-08, + "loss": 0.2764, + "step": 3137 + }, + { + "epoch": 0.89, + "grad_norm": 2.480068951853458, + "learning_rate": 6.370064752483661e-08, + "loss": 0.2784, + "step": 3138 + }, + { + "epoch": 0.89, + "grad_norm": 2.4592994564515664, + "learning_rate": 6.337871159894803e-08, + "loss": 0.2834, + "step": 3139 + }, + { + "epoch": 0.89, + "grad_norm": 2.2792956091900307, + "learning_rate": 6.305756462389644e-08, + "loss": 0.2547, + "step": 3140 + }, + { + "epoch": 0.89, + "grad_norm": 2.4306357067447553, + "learning_rate": 6.273720687019579e-08, + "loss": 0.2767, + "step": 3141 + }, + { + "epoch": 0.89, + "grad_norm": 2.379110567627786, + "learning_rate": 6.241763860769534e-08, + "loss": 0.2725, + "step": 3142 + }, + { + "epoch": 0.89, + "grad_norm": 2.2710952274288494, + "learning_rate": 6.209886010557907e-08, + "loss": 0.2485, + "step": 3143 + }, + { + "epoch": 0.89, + "grad_norm": 2.5652561077781746, + "learning_rate": 6.178087163236645e-08, + "loss": 0.252, + "step": 3144 + }, + { + "epoch": 0.89, + "grad_norm": 2.2341453207739805, + "learning_rate": 6.146367345591053e-08, + "loss": 0.253, + "step": 3145 + }, + { + "epoch": 0.89, + "grad_norm": 2.3158832187436107, + "learning_rate": 6.114726584339913e-08, + "loss": 0.2655, + "step": 3146 + }, + { + "epoch": 0.89, + "grad_norm": 2.4445635266013626, + "learning_rate": 6.08316490613543e-08, + "loss": 0.2669, + "step": 3147 + }, + { + "epoch": 0.89, + "grad_norm": 2.304129154768925, + "learning_rate": 6.051682337563158e-08, + "loss": 0.2705, + "step": 3148 + }, + { + "epoch": 0.89, + "grad_norm": 2.5757049619729018, + "learning_rate": 6.02027890514204e-08, + "loss": 0.2662, + "step": 3149 + }, + { + "epoch": 0.89, + "grad_norm": 2.565737017154383, + "learning_rate": 5.988954635324351e-08, + "loss": 0.3128, + "step": 3150 + }, + { + "epoch": 0.89, + "grad_norm": 2.3260226124352856, + "learning_rate": 5.957709554495682e-08, + "loss": 0.2633, + "step": 3151 + }, + { + "epoch": 0.89, + "grad_norm": 2.3858294402669435, + "learning_rate": 5.926543688974928e-08, + "loss": 0.264, + "step": 3152 + }, + { + "epoch": 0.89, + "grad_norm": 5.961869575469567, + "learning_rate": 5.8954570650142424e-08, + "loss": 0.2629, + "step": 3153 + }, + { + "epoch": 0.89, + "grad_norm": 2.474005827500162, + "learning_rate": 5.864449708799057e-08, + "loss": 0.2641, + "step": 3154 + }, + { + "epoch": 0.89, + "grad_norm": 2.4705398326799717, + "learning_rate": 5.833521646448003e-08, + "loss": 0.2926, + "step": 3155 + }, + { + "epoch": 0.89, + "grad_norm": 2.319310223557441, + "learning_rate": 5.8026729040129506e-08, + "loss": 0.2458, + "step": 3156 + }, + { + "epoch": 0.89, + "grad_norm": 2.411383748029571, + "learning_rate": 5.771903507478915e-08, + "loss": 0.3025, + "step": 3157 + }, + { + "epoch": 0.89, + "grad_norm": 6.037524189709264, + "learning_rate": 5.741213482764118e-08, + "loss": 0.2661, + "step": 3158 + }, + { + "epoch": 0.9, + "grad_norm": 2.534264378285265, + "learning_rate": 5.7106028557199036e-08, + "loss": 0.2815, + "step": 3159 + }, + { + "epoch": 0.9, + "grad_norm": 2.4799284033923366, + "learning_rate": 5.6800716521307356e-08, + "loss": 0.2857, + "step": 3160 + }, + { + "epoch": 0.9, + "grad_norm": 2.4307602896415217, + "learning_rate": 5.649619897714186e-08, + "loss": 0.2595, + "step": 3161 + }, + { + "epoch": 0.9, + "grad_norm": 2.309911730440573, + "learning_rate": 5.61924761812087e-08, + "loss": 0.2572, + "step": 3162 + }, + { + "epoch": 0.9, + "grad_norm": 2.2708443975298596, + "learning_rate": 5.588954838934523e-08, + "loss": 0.2263, + "step": 3163 + }, + { + "epoch": 0.9, + "grad_norm": 2.3928996224840224, + "learning_rate": 5.558741585671845e-08, + "loss": 0.262, + "step": 3164 + }, + { + "epoch": 0.9, + "grad_norm": 2.400883941359423, + "learning_rate": 5.528607883782599e-08, + "loss": 0.2699, + "step": 3165 + }, + { + "epoch": 0.9, + "grad_norm": 2.3632036616640164, + "learning_rate": 5.4985537586495157e-08, + "loss": 0.2488, + "step": 3166 + }, + { + "epoch": 0.9, + "grad_norm": 2.4002704113750326, + "learning_rate": 5.4685792355882664e-08, + "loss": 0.2626, + "step": 3167 + }, + { + "epoch": 0.9, + "grad_norm": 2.4063425935418663, + "learning_rate": 5.438684339847555e-08, + "loss": 0.2591, + "step": 3168 + }, + { + "epoch": 0.9, + "grad_norm": 2.430614312790665, + "learning_rate": 5.4088690966089254e-08, + "loss": 0.2611, + "step": 3169 + }, + { + "epoch": 0.9, + "grad_norm": 2.419754243863106, + "learning_rate": 5.379133530986901e-08, + "loss": 0.2727, + "step": 3170 + }, + { + "epoch": 0.9, + "grad_norm": 2.4662220396431747, + "learning_rate": 5.349477668028801e-08, + "loss": 0.283, + "step": 3171 + }, + { + "epoch": 0.9, + "grad_norm": 2.3426309066637545, + "learning_rate": 5.319901532714877e-08, + "loss": 0.2701, + "step": 3172 + }, + { + "epoch": 0.9, + "grad_norm": 2.4592232084554415, + "learning_rate": 5.2904051499582105e-08, + "loss": 0.2341, + "step": 3173 + }, + { + "epoch": 0.9, + "grad_norm": 2.46097236692037, + "learning_rate": 5.2609885446047165e-08, + "loss": 0.2851, + "step": 3174 + }, + { + "epoch": 0.9, + "grad_norm": 2.320945269240666, + "learning_rate": 5.231651741433063e-08, + "loss": 0.2515, + "step": 3175 + }, + { + "epoch": 0.9, + "grad_norm": 2.6406931382496315, + "learning_rate": 5.2023947651547275e-08, + "loss": 0.2882, + "step": 3176 + }, + { + "epoch": 0.9, + "grad_norm": 2.3858195376057347, + "learning_rate": 5.17321764041394e-08, + "loss": 0.249, + "step": 3177 + }, + { + "epoch": 0.9, + "grad_norm": 2.2130979645960913, + "learning_rate": 5.144120391787732e-08, + "loss": 0.2428, + "step": 3178 + }, + { + "epoch": 0.9, + "grad_norm": 2.4254984080961957, + "learning_rate": 5.115103043785718e-08, + "loss": 0.2485, + "step": 3179 + }, + { + "epoch": 0.9, + "grad_norm": 2.3045501628031704, + "learning_rate": 5.086165620850336e-08, + "loss": 0.2768, + "step": 3180 + }, + { + "epoch": 0.9, + "grad_norm": 2.2144689483497935, + "learning_rate": 5.0573081473566315e-08, + "loss": 0.2846, + "step": 3181 + }, + { + "epoch": 0.9, + "grad_norm": 2.3229900518667357, + "learning_rate": 5.028530647612306e-08, + "loss": 0.2937, + "step": 3182 + }, + { + "epoch": 0.9, + "grad_norm": 2.480173992557888, + "learning_rate": 4.999833145857768e-08, + "loss": 0.2651, + "step": 3183 + }, + { + "epoch": 0.9, + "grad_norm": 2.2829017879265145, + "learning_rate": 4.971215666265938e-08, + "loss": 0.2393, + "step": 3184 + }, + { + "epoch": 0.9, + "grad_norm": 2.698922135891398, + "learning_rate": 4.942678232942399e-08, + "loss": 0.2879, + "step": 3185 + }, + { + "epoch": 0.9, + "grad_norm": 2.409469865808042, + "learning_rate": 4.9142208699252893e-08, + "loss": 0.2864, + "step": 3186 + }, + { + "epoch": 0.9, + "grad_norm": 2.405615517945202, + "learning_rate": 4.885843601185291e-08, + "loss": 0.2805, + "step": 3187 + }, + { + "epoch": 0.9, + "grad_norm": 2.347723176787636, + "learning_rate": 4.857546450625649e-08, + "loss": 0.2615, + "step": 3188 + }, + { + "epoch": 0.9, + "grad_norm": 2.2570397287056916, + "learning_rate": 4.8293294420820754e-08, + "loss": 0.255, + "step": 3189 + }, + { + "epoch": 0.9, + "grad_norm": 2.386877051850636, + "learning_rate": 4.801192599322834e-08, + "loss": 0.2543, + "step": 3190 + }, + { + "epoch": 0.9, + "grad_norm": 2.4355257643620947, + "learning_rate": 4.773135946048601e-08, + "loss": 0.2548, + "step": 3191 + }, + { + "epoch": 0.9, + "grad_norm": 2.383789499942686, + "learning_rate": 4.7451595058925594e-08, + "loss": 0.2642, + "step": 3192 + }, + { + "epoch": 0.9, + "grad_norm": 2.22476355531068, + "learning_rate": 4.717263302420282e-08, + "loss": 0.2319, + "step": 3193 + }, + { + "epoch": 0.91, + "grad_norm": 2.3890981659570376, + "learning_rate": 4.689447359129794e-08, + "loss": 0.2672, + "step": 3194 + }, + { + "epoch": 0.91, + "grad_norm": 2.4735293733403685, + "learning_rate": 4.661711699451476e-08, + "loss": 0.2716, + "step": 3195 + }, + { + "epoch": 0.91, + "grad_norm": 2.225349534764869, + "learning_rate": 4.6340563467481164e-08, + "loss": 0.2744, + "step": 3196 + }, + { + "epoch": 0.91, + "grad_norm": 2.450129758872958, + "learning_rate": 4.606481324314848e-08, + "loss": 0.2513, + "step": 3197 + }, + { + "epoch": 0.91, + "grad_norm": 2.2568166974492883, + "learning_rate": 4.5789866553791245e-08, + "loss": 0.2402, + "step": 3198 + }, + { + "epoch": 0.91, + "grad_norm": 2.633689105888111, + "learning_rate": 4.551572363100731e-08, + "loss": 0.2336, + "step": 3199 + }, + { + "epoch": 0.91, + "grad_norm": 2.3558166985162363, + "learning_rate": 4.52423847057174e-08, + "loss": 0.2382, + "step": 3200 + }, + { + "epoch": 0.91, + "grad_norm": 2.4980777479857506, + "learning_rate": 4.496985000816489e-08, + "loss": 0.3011, + "step": 3201 + }, + { + "epoch": 0.91, + "grad_norm": 2.4025661382784196, + "learning_rate": 4.469811976791604e-08, + "loss": 0.3135, + "step": 3202 + }, + { + "epoch": 0.91, + "grad_norm": 2.288906258196443, + "learning_rate": 4.442719421385921e-08, + "loss": 0.2783, + "step": 3203 + }, + { + "epoch": 0.91, + "grad_norm": 2.3401562728012664, + "learning_rate": 4.415707357420517e-08, + "loss": 0.2381, + "step": 3204 + }, + { + "epoch": 0.91, + "grad_norm": 2.2560055476064536, + "learning_rate": 4.388775807648659e-08, + "loss": 0.2676, + "step": 3205 + }, + { + "epoch": 0.91, + "grad_norm": 2.307661722100612, + "learning_rate": 4.3619247947557445e-08, + "loss": 0.2451, + "step": 3206 + }, + { + "epoch": 0.91, + "grad_norm": 2.748113485649648, + "learning_rate": 4.3351543413594263e-08, + "loss": 0.2583, + "step": 3207 + }, + { + "epoch": 0.91, + "grad_norm": 2.2513931431439382, + "learning_rate": 4.308464470009432e-08, + "loss": 0.2774, + "step": 3208 + }, + { + "epoch": 0.91, + "grad_norm": 2.502034598201456, + "learning_rate": 4.2818552031876454e-08, + "loss": 0.2926, + "step": 3209 + }, + { + "epoch": 0.91, + "grad_norm": 2.291150374878783, + "learning_rate": 4.2553265633080146e-08, + "loss": 0.2662, + "step": 3210 + }, + { + "epoch": 0.91, + "grad_norm": 2.5215967746639354, + "learning_rate": 4.228878572716588e-08, + "loss": 0.266, + "step": 3211 + }, + { + "epoch": 0.91, + "grad_norm": 2.349662419335486, + "learning_rate": 4.202511253691521e-08, + "loss": 0.2708, + "step": 3212 + }, + { + "epoch": 0.91, + "grad_norm": 2.3748199771039564, + "learning_rate": 4.176224628442981e-08, + "loss": 0.2805, + "step": 3213 + }, + { + "epoch": 0.91, + "grad_norm": 2.3378089840973058, + "learning_rate": 4.150018719113147e-08, + "loss": 0.2765, + "step": 3214 + }, + { + "epoch": 0.91, + "grad_norm": 2.4869406885817775, + "learning_rate": 4.123893547776236e-08, + "loss": 0.2708, + "step": 3215 + }, + { + "epoch": 0.91, + "grad_norm": 2.3636653162321783, + "learning_rate": 4.097849136438436e-08, + "loss": 0.2616, + "step": 3216 + }, + { + "epoch": 0.91, + "grad_norm": 2.322882191550021, + "learning_rate": 4.071885507037953e-08, + "loss": 0.2479, + "step": 3217 + }, + { + "epoch": 0.91, + "grad_norm": 2.6038193510966265, + "learning_rate": 4.0460026814448934e-08, + "loss": 0.3161, + "step": 3218 + }, + { + "epoch": 0.91, + "grad_norm": 2.9175710529853762, + "learning_rate": 4.0202006814613165e-08, + "loss": 0.3271, + "step": 3219 + }, + { + "epoch": 0.91, + "grad_norm": 2.2335773040988847, + "learning_rate": 3.994479528821204e-08, + "loss": 0.2589, + "step": 3220 + }, + { + "epoch": 0.91, + "grad_norm": 2.29518040874271, + "learning_rate": 3.9688392451904475e-08, + "loss": 0.2662, + "step": 3221 + }, + { + "epoch": 0.91, + "grad_norm": 2.4345206626996085, + "learning_rate": 3.943279852166803e-08, + "loss": 0.2724, + "step": 3222 + }, + { + "epoch": 0.91, + "grad_norm": 2.619129736310827, + "learning_rate": 3.917801371279894e-08, + "loss": 0.2775, + "step": 3223 + }, + { + "epoch": 0.91, + "grad_norm": 2.2320266622210374, + "learning_rate": 3.8924038239911975e-08, + "loss": 0.2542, + "step": 3224 + }, + { + "epoch": 0.91, + "grad_norm": 2.4111141473121123, + "learning_rate": 3.8670872316939885e-08, + "loss": 0.2827, + "step": 3225 + }, + { + "epoch": 0.91, + "grad_norm": 2.453538199319764, + "learning_rate": 3.841851615713398e-08, + "loss": 0.3011, + "step": 3226 + }, + { + "epoch": 0.91, + "grad_norm": 2.3560675605285444, + "learning_rate": 3.816696997306301e-08, + "loss": 0.2999, + "step": 3227 + }, + { + "epoch": 0.91, + "grad_norm": 2.500501438703021, + "learning_rate": 3.79162339766137e-08, + "loss": 0.2847, + "step": 3228 + }, + { + "epoch": 0.91, + "grad_norm": 2.4567324123602363, + "learning_rate": 3.766630837899032e-08, + "loss": 0.2924, + "step": 3229 + }, + { + "epoch": 0.92, + "grad_norm": 2.369206056840079, + "learning_rate": 3.7417193390714476e-08, + "loss": 0.2833, + "step": 3230 + }, + { + "epoch": 0.92, + "grad_norm": 2.3537524798078473, + "learning_rate": 3.716888922162487e-08, + "loss": 0.2884, + "step": 3231 + }, + { + "epoch": 0.92, + "grad_norm": 2.2654121630396, + "learning_rate": 3.692139608087741e-08, + "loss": 0.2761, + "step": 3232 + }, + { + "epoch": 0.92, + "grad_norm": 2.367419390871166, + "learning_rate": 3.667471417694468e-08, + "loss": 0.2457, + "step": 3233 + }, + { + "epoch": 0.92, + "grad_norm": 2.2877453429907635, + "learning_rate": 3.642884371761601e-08, + "loss": 0.254, + "step": 3234 + }, + { + "epoch": 0.92, + "grad_norm": 2.573930551823851, + "learning_rate": 3.6183784909997187e-08, + "loss": 0.2634, + "step": 3235 + }, + { + "epoch": 0.92, + "grad_norm": 2.366805500727652, + "learning_rate": 3.593953796051041e-08, + "loss": 0.257, + "step": 3236 + }, + { + "epoch": 0.92, + "grad_norm": 5.863808786508515, + "learning_rate": 3.5696103074893793e-08, + "loss": 0.2966, + "step": 3237 + }, + { + "epoch": 0.92, + "grad_norm": 2.566493498708618, + "learning_rate": 3.545348045820173e-08, + "loss": 0.2675, + "step": 3238 + }, + { + "epoch": 0.92, + "grad_norm": 2.4598395736526837, + "learning_rate": 3.521167031480432e-08, + "loss": 0.2886, + "step": 3239 + }, + { + "epoch": 0.92, + "grad_norm": 2.2523848463832543, + "learning_rate": 3.497067284838673e-08, + "loss": 0.2659, + "step": 3240 + }, + { + "epoch": 0.92, + "grad_norm": 2.4175830736828057, + "learning_rate": 3.4730488261950574e-08, + "loss": 0.2866, + "step": 3241 + }, + { + "epoch": 0.92, + "grad_norm": 2.307963198877353, + "learning_rate": 3.449111675781202e-08, + "loss": 0.2474, + "step": 3242 + }, + { + "epoch": 0.92, + "grad_norm": 2.220977889209224, + "learning_rate": 3.4252558537602786e-08, + "loss": 0.2501, + "step": 3243 + }, + { + "epoch": 0.92, + "grad_norm": 2.3093402757426995, + "learning_rate": 3.401481380226889e-08, + "loss": 0.2674, + "step": 3244 + }, + { + "epoch": 0.92, + "grad_norm": 2.1868144442873, + "learning_rate": 3.3777882752071715e-08, + "loss": 0.2605, + "step": 3245 + }, + { + "epoch": 0.92, + "grad_norm": 2.219309920824285, + "learning_rate": 3.354176558658728e-08, + "loss": 0.2621, + "step": 3246 + }, + { + "epoch": 0.92, + "grad_norm": 2.5444203511053654, + "learning_rate": 3.33064625047057e-08, + "loss": 0.2821, + "step": 3247 + }, + { + "epoch": 0.92, + "grad_norm": 2.4616798789610947, + "learning_rate": 3.307197370463133e-08, + "loss": 0.2793, + "step": 3248 + }, + { + "epoch": 0.92, + "grad_norm": 2.300724708712612, + "learning_rate": 3.283829938388294e-08, + "loss": 0.2561, + "step": 3249 + }, + { + "epoch": 0.92, + "grad_norm": 3.3329227624540954, + "learning_rate": 3.260543973929286e-08, + "loss": 0.2779, + "step": 3250 + }, + { + "epoch": 0.92, + "grad_norm": 2.3194303907345537, + "learning_rate": 3.237339496700775e-08, + "loss": 0.2392, + "step": 3251 + }, + { + "epoch": 0.92, + "grad_norm": 2.602542644339946, + "learning_rate": 3.2142165262487365e-08, + "loss": 0.3197, + "step": 3252 + }, + { + "epoch": 0.92, + "grad_norm": 2.38983381123609, + "learning_rate": 3.1911750820505015e-08, + "loss": 0.2636, + "step": 3253 + }, + { + "epoch": 0.92, + "grad_norm": 2.1980480700182974, + "learning_rate": 3.168215183514733e-08, + "loss": 0.2387, + "step": 3254 + }, + { + "epoch": 0.92, + "grad_norm": 2.278995719811263, + "learning_rate": 3.145336849981395e-08, + "loss": 0.2432, + "step": 3255 + }, + { + "epoch": 0.92, + "grad_norm": 2.3941204185182294, + "learning_rate": 3.1225401007217934e-08, + "loss": 0.2821, + "step": 3256 + }, + { + "epoch": 0.92, + "grad_norm": 2.36472481649528, + "learning_rate": 3.0998249549384346e-08, + "loss": 0.2432, + "step": 3257 + }, + { + "epoch": 0.92, + "grad_norm": 2.261121367712428, + "learning_rate": 3.077191431765147e-08, + "loss": 0.266, + "step": 3258 + }, + { + "epoch": 0.92, + "grad_norm": 2.440840562496626, + "learning_rate": 3.0546395502669795e-08, + "loss": 0.2726, + "step": 3259 + }, + { + "epoch": 0.92, + "grad_norm": 2.416700603257129, + "learning_rate": 3.032169329440226e-08, + "loss": 0.278, + "step": 3260 + }, + { + "epoch": 0.92, + "grad_norm": 2.9480803404548483, + "learning_rate": 3.009780788212379e-08, + "loss": 0.2961, + "step": 3261 + }, + { + "epoch": 0.92, + "grad_norm": 2.3246552045964135, + "learning_rate": 2.9874739454421424e-08, + "loss": 0.2645, + "step": 3262 + }, + { + "epoch": 0.92, + "grad_norm": 2.2128528098505083, + "learning_rate": 2.965248819919397e-08, + "loss": 0.2527, + "step": 3263 + }, + { + "epoch": 0.92, + "grad_norm": 2.2228799005968125, + "learning_rate": 2.943105430365178e-08, + "loss": 0.2694, + "step": 3264 + }, + { + "epoch": 0.93, + "grad_norm": 2.3867533631022613, + "learning_rate": 2.921043795431699e-08, + "loss": 0.2819, + "step": 3265 + }, + { + "epoch": 0.93, + "grad_norm": 2.3560004236511376, + "learning_rate": 2.8990639337022838e-08, + "loss": 0.2677, + "step": 3266 + }, + { + "epoch": 0.93, + "grad_norm": 2.484908011066342, + "learning_rate": 2.8771658636913886e-08, + "loss": 0.2858, + "step": 3267 + }, + { + "epoch": 0.93, + "grad_norm": 2.414734426146304, + "learning_rate": 2.85534960384457e-08, + "loss": 0.2608, + "step": 3268 + }, + { + "epoch": 0.93, + "grad_norm": 2.384125145372157, + "learning_rate": 2.8336151725384727e-08, + "loss": 0.238, + "step": 3269 + }, + { + "epoch": 0.93, + "grad_norm": 2.539193357795301, + "learning_rate": 2.8119625880808183e-08, + "loss": 0.2764, + "step": 3270 + }, + { + "epoch": 0.93, + "grad_norm": 2.383186119315387, + "learning_rate": 2.7903918687103733e-08, + "loss": 0.2722, + "step": 3271 + }, + { + "epoch": 0.93, + "grad_norm": 2.1790139505551247, + "learning_rate": 2.7689030325969476e-08, + "loss": 0.2391, + "step": 3272 + }, + { + "epoch": 0.93, + "grad_norm": 2.7517357675515552, + "learning_rate": 2.7474960978414064e-08, + "loss": 0.2607, + "step": 3273 + }, + { + "epoch": 0.93, + "grad_norm": 2.4227957748137516, + "learning_rate": 2.7261710824755812e-08, + "loss": 0.2766, + "step": 3274 + }, + { + "epoch": 0.93, + "grad_norm": 2.236432074590604, + "learning_rate": 2.704928004462337e-08, + "loss": 0.2739, + "step": 3275 + }, + { + "epoch": 0.93, + "grad_norm": 4.958590582391775, + "learning_rate": 2.683766881695504e-08, + "loss": 0.2751, + "step": 3276 + }, + { + "epoch": 0.93, + "grad_norm": 2.3924049229226285, + "learning_rate": 2.6626877319998798e-08, + "loss": 0.2794, + "step": 3277 + }, + { + "epoch": 0.93, + "grad_norm": 2.3948885814981016, + "learning_rate": 2.641690573131228e-08, + "loss": 0.2757, + "step": 3278 + }, + { + "epoch": 0.93, + "grad_norm": 2.2237442319076095, + "learning_rate": 2.6207754227761892e-08, + "loss": 0.26, + "step": 3279 + }, + { + "epoch": 0.93, + "grad_norm": 2.2420191639484632, + "learning_rate": 2.5999422985524157e-08, + "loss": 0.2707, + "step": 3280 + }, + { + "epoch": 0.93, + "grad_norm": 2.3290621853224764, + "learning_rate": 2.579191218008403e-08, + "loss": 0.2739, + "step": 3281 + }, + { + "epoch": 0.93, + "grad_norm": 2.347622446698043, + "learning_rate": 2.5585221986235693e-08, + "loss": 0.2533, + "step": 3282 + }, + { + "epoch": 0.93, + "grad_norm": 2.323131153292077, + "learning_rate": 2.537935257808177e-08, + "loss": 0.272, + "step": 3283 + }, + { + "epoch": 0.93, + "grad_norm": 2.318533312951456, + "learning_rate": 2.5174304129033653e-08, + "loss": 0.2675, + "step": 3284 + }, + { + "epoch": 0.93, + "grad_norm": 2.481506357351729, + "learning_rate": 2.4970076811811513e-08, + "loss": 0.2553, + "step": 3285 + }, + { + "epoch": 0.93, + "grad_norm": 3.351142696730123, + "learning_rate": 2.4766670798443412e-08, + "loss": 0.2712, + "step": 3286 + }, + { + "epoch": 0.93, + "grad_norm": 2.319674755862298, + "learning_rate": 2.4564086260265847e-08, + "loss": 0.2722, + "step": 3287 + }, + { + "epoch": 0.93, + "grad_norm": 2.387010165950708, + "learning_rate": 2.436232336792321e-08, + "loss": 0.2832, + "step": 3288 + }, + { + "epoch": 0.93, + "grad_norm": 2.3906373099226284, + "learning_rate": 2.416138229136777e-08, + "loss": 0.2621, + "step": 3289 + }, + { + "epoch": 0.93, + "grad_norm": 2.347750085520224, + "learning_rate": 2.3961263199859915e-08, + "loss": 0.2609, + "step": 3290 + }, + { + "epoch": 0.93, + "grad_norm": 2.291141282114224, + "learning_rate": 2.3761966261967247e-08, + "loss": 0.2441, + "step": 3291 + }, + { + "epoch": 0.93, + "grad_norm": 2.3569274367279274, + "learning_rate": 2.3563491645564925e-08, + "loss": 0.2671, + "step": 3292 + }, + { + "epoch": 0.93, + "grad_norm": 2.3740086526710327, + "learning_rate": 2.336583951783555e-08, + "loss": 0.2842, + "step": 3293 + }, + { + "epoch": 0.93, + "grad_norm": 2.245727364756435, + "learning_rate": 2.3169010045268723e-08, + "loss": 0.2671, + "step": 3294 + }, + { + "epoch": 0.93, + "grad_norm": 2.3848931241980336, + "learning_rate": 2.2973003393661372e-08, + "loss": 0.2561, + "step": 3295 + }, + { + "epoch": 0.93, + "grad_norm": 2.368790604055993, + "learning_rate": 2.2777819728116988e-08, + "loss": 0.2614, + "step": 3296 + }, + { + "epoch": 0.93, + "grad_norm": 2.5764287797220984, + "learning_rate": 2.2583459213046162e-08, + "loss": 0.288, + "step": 3297 + }, + { + "epoch": 0.93, + "grad_norm": 2.8656696962228563, + "learning_rate": 2.238992201216594e-08, + "loss": 0.2705, + "step": 3298 + }, + { + "epoch": 0.93, + "grad_norm": 2.344589554713833, + "learning_rate": 2.219720828849969e-08, + "loss": 0.2329, + "step": 3299 + }, + { + "epoch": 0.94, + "grad_norm": 2.388564658275402, + "learning_rate": 2.2005318204377565e-08, + "loss": 0.289, + "step": 3300 + }, + { + "epoch": 0.94, + "grad_norm": 2.5719941834626807, + "learning_rate": 2.18142519214356e-08, + "loss": 0.3029, + "step": 3301 + }, + { + "epoch": 0.94, + "grad_norm": 2.4035422211888378, + "learning_rate": 2.1624009600616056e-08, + "loss": 0.2601, + "step": 3302 + }, + { + "epoch": 0.94, + "grad_norm": 2.5636385948099423, + "learning_rate": 2.1434591402166967e-08, + "loss": 0.286, + "step": 3303 + }, + { + "epoch": 0.94, + "grad_norm": 2.3912117460001006, + "learning_rate": 2.1245997485642485e-08, + "loss": 0.2394, + "step": 3304 + }, + { + "epoch": 0.94, + "grad_norm": 2.5857138602537892, + "learning_rate": 2.1058228009902092e-08, + "loss": 0.2813, + "step": 3305 + }, + { + "epoch": 0.94, + "grad_norm": 2.2418195013942563, + "learning_rate": 2.087128313311115e-08, + "loss": 0.2754, + "step": 3306 + }, + { + "epoch": 0.94, + "grad_norm": 2.411005422972666, + "learning_rate": 2.0685163012740036e-08, + "loss": 0.2702, + "step": 3307 + }, + { + "epoch": 0.94, + "grad_norm": 2.3891746886806846, + "learning_rate": 2.0499867805564784e-08, + "loss": 0.2797, + "step": 3308 + }, + { + "epoch": 0.94, + "grad_norm": 2.2603837327608254, + "learning_rate": 2.0315397667666433e-08, + "loss": 0.2452, + "step": 3309 + }, + { + "epoch": 0.94, + "grad_norm": 2.4929320500854577, + "learning_rate": 2.013175275443102e-08, + "loss": 0.2558, + "step": 3310 + }, + { + "epoch": 0.94, + "grad_norm": 2.569810976957224, + "learning_rate": 1.9948933220549248e-08, + "loss": 0.2974, + "step": 3311 + }, + { + "epoch": 0.94, + "grad_norm": 2.35226642441117, + "learning_rate": 1.9766939220017153e-08, + "loss": 0.2618, + "step": 3312 + }, + { + "epoch": 0.94, + "grad_norm": 2.3079105807378486, + "learning_rate": 1.9585770906134668e-08, + "loss": 0.2746, + "step": 3313 + }, + { + "epoch": 0.94, + "grad_norm": 2.3644399960317277, + "learning_rate": 1.940542843150683e-08, + "loss": 0.2819, + "step": 3314 + }, + { + "epoch": 0.94, + "grad_norm": 2.396111835884552, + "learning_rate": 1.9225911948042683e-08, + "loss": 0.2613, + "step": 3315 + }, + { + "epoch": 0.94, + "grad_norm": 2.3463888293024384, + "learning_rate": 1.9047221606955712e-08, + "loss": 0.2504, + "step": 3316 + }, + { + "epoch": 0.94, + "grad_norm": 2.3679034793186093, + "learning_rate": 1.886935755876329e-08, + "loss": 0.2733, + "step": 3317 + }, + { + "epoch": 0.94, + "grad_norm": 2.259794421485142, + "learning_rate": 1.8692319953286906e-08, + "loss": 0.2703, + "step": 3318 + }, + { + "epoch": 0.94, + "grad_norm": 2.3388390277874422, + "learning_rate": 1.8516108939651943e-08, + "loss": 0.2855, + "step": 3319 + }, + { + "epoch": 0.94, + "grad_norm": 2.543297868344573, + "learning_rate": 1.8340724666287555e-08, + "loss": 0.2757, + "step": 3320 + }, + { + "epoch": 0.94, + "grad_norm": 2.254819797018417, + "learning_rate": 1.816616728092646e-08, + "loss": 0.2511, + "step": 3321 + }, + { + "epoch": 0.94, + "grad_norm": 2.3420892520736505, + "learning_rate": 1.7992436930604483e-08, + "loss": 0.2922, + "step": 3322 + }, + { + "epoch": 0.94, + "grad_norm": 2.446367425978593, + "learning_rate": 1.7819533761661344e-08, + "loss": 0.2647, + "step": 3323 + }, + { + "epoch": 0.94, + "grad_norm": 2.4262242704514434, + "learning_rate": 1.7647457919739872e-08, + "loss": 0.2687, + "step": 3324 + }, + { + "epoch": 0.94, + "grad_norm": 2.3242793523904606, + "learning_rate": 1.7476209549785903e-08, + "loss": 0.2397, + "step": 3325 + }, + { + "epoch": 0.94, + "grad_norm": 2.426182963897848, + "learning_rate": 1.7305788796048272e-08, + "loss": 0.2973, + "step": 3326 + }, + { + "epoch": 0.94, + "grad_norm": 2.703243585335758, + "learning_rate": 1.7136195802078478e-08, + "loss": 0.2707, + "step": 3327 + }, + { + "epoch": 0.94, + "grad_norm": 2.543425586558075, + "learning_rate": 1.6967430710731258e-08, + "loss": 0.2497, + "step": 3328 + }, + { + "epoch": 0.94, + "grad_norm": 2.214936036687701, + "learning_rate": 1.6799493664163668e-08, + "loss": 0.2467, + "step": 3329 + }, + { + "epoch": 0.94, + "grad_norm": 2.394052290637097, + "learning_rate": 1.6632384803835332e-08, + "loss": 0.257, + "step": 3330 + }, + { + "epoch": 0.94, + "grad_norm": 2.4616826296848093, + "learning_rate": 1.6466104270508098e-08, + "loss": 0.235, + "step": 3331 + }, + { + "epoch": 0.94, + "grad_norm": 2.2261144395575623, + "learning_rate": 1.6300652204246255e-08, + "loss": 0.2369, + "step": 3332 + }, + { + "epoch": 0.94, + "grad_norm": 2.3220778215462743, + "learning_rate": 1.6136028744416218e-08, + "loss": 0.2586, + "step": 3333 + }, + { + "epoch": 0.94, + "grad_norm": 2.332569555366564, + "learning_rate": 1.5972234029686616e-08, + "loss": 0.2727, + "step": 3334 + }, + { + "epoch": 0.94, + "grad_norm": 2.43062826390679, + "learning_rate": 1.5809268198027524e-08, + "loss": 0.2972, + "step": 3335 + }, + { + "epoch": 0.95, + "grad_norm": 2.4845491954069296, + "learning_rate": 1.5647131386711367e-08, + "loss": 0.2835, + "step": 3336 + }, + { + "epoch": 0.95, + "grad_norm": 2.593439289006337, + "learning_rate": 1.5485823732311775e-08, + "loss": 0.2638, + "step": 3337 + }, + { + "epoch": 0.95, + "grad_norm": 2.473215011505199, + "learning_rate": 1.532534537070429e-08, + "loss": 0.2811, + "step": 3338 + }, + { + "epoch": 0.95, + "grad_norm": 2.256085966417221, + "learning_rate": 1.516569643706578e-08, + "loss": 0.2546, + "step": 3339 + }, + { + "epoch": 0.95, + "grad_norm": 2.4258033040572258, + "learning_rate": 1.5006877065874335e-08, + "loss": 0.2915, + "step": 3340 + }, + { + "epoch": 0.95, + "grad_norm": 2.268360402644574, + "learning_rate": 1.4848887390909614e-08, + "loss": 0.2953, + "step": 3341 + }, + { + "epoch": 0.95, + "grad_norm": 2.4874133228590356, + "learning_rate": 1.4691727545251942e-08, + "loss": 0.2913, + "step": 3342 + }, + { + "epoch": 0.95, + "grad_norm": 2.202250492491265, + "learning_rate": 1.4535397661283089e-08, + "loss": 0.2529, + "step": 3343 + }, + { + "epoch": 0.95, + "grad_norm": 2.343475727802179, + "learning_rate": 1.4379897870685498e-08, + "loss": 0.2372, + "step": 3344 + }, + { + "epoch": 0.95, + "grad_norm": 2.4606844418448905, + "learning_rate": 1.4225228304442172e-08, + "loss": 0.2593, + "step": 3345 + }, + { + "epoch": 0.95, + "grad_norm": 2.3663621030257858, + "learning_rate": 1.4071389092837338e-08, + "loss": 0.2785, + "step": 3346 + }, + { + "epoch": 0.95, + "grad_norm": 2.2640297556060873, + "learning_rate": 1.3918380365455228e-08, + "loss": 0.2446, + "step": 3347 + }, + { + "epoch": 0.95, + "grad_norm": 2.20404868372011, + "learning_rate": 1.3766202251180858e-08, + "loss": 0.2756, + "step": 3348 + }, + { + "epoch": 0.95, + "grad_norm": 2.1297206085618754, + "learning_rate": 1.3614854878199577e-08, + "loss": 0.2259, + "step": 3349 + }, + { + "epoch": 0.95, + "grad_norm": 2.4107497897973005, + "learning_rate": 1.3464338373996741e-08, + "loss": 0.2662, + "step": 3350 + }, + { + "epoch": 0.95, + "grad_norm": 2.581593802277168, + "learning_rate": 1.3314652865358156e-08, + "loss": 0.3059, + "step": 3351 + }, + { + "epoch": 0.95, + "grad_norm": 2.308818850381563, + "learning_rate": 1.3165798478369183e-08, + "loss": 0.2747, + "step": 3352 + }, + { + "epoch": 0.95, + "grad_norm": 2.2932563254591054, + "learning_rate": 1.3017775338415638e-08, + "loss": 0.2804, + "step": 3353 + }, + { + "epoch": 0.95, + "grad_norm": 2.525942716214574, + "learning_rate": 1.287058357018278e-08, + "loss": 0.2755, + "step": 3354 + }, + { + "epoch": 0.95, + "grad_norm": 2.4985923891428476, + "learning_rate": 1.2724223297655878e-08, + "loss": 0.3062, + "step": 3355 + }, + { + "epoch": 0.95, + "grad_norm": 2.356662166585638, + "learning_rate": 1.2578694644119425e-08, + "loss": 0.2611, + "step": 3356 + }, + { + "epoch": 0.95, + "grad_norm": 2.3775670454411615, + "learning_rate": 1.2433997732157586e-08, + "loss": 0.2657, + "step": 3357 + }, + { + "epoch": 0.95, + "grad_norm": 2.050051946828104, + "learning_rate": 1.2290132683654086e-08, + "loss": 0.2086, + "step": 3358 + }, + { + "epoch": 0.95, + "grad_norm": 2.5458546624396257, + "learning_rate": 1.2147099619791767e-08, + "loss": 0.2822, + "step": 3359 + }, + { + "epoch": 0.95, + "grad_norm": 2.415412671153464, + "learning_rate": 1.2004898661052588e-08, + "loss": 0.2783, + "step": 3360 + }, + { + "epoch": 0.95, + "grad_norm": 2.2751172201041, + "learning_rate": 1.186352992721773e-08, + "loss": 0.2502, + "step": 3361 + }, + { + "epoch": 0.95, + "grad_norm": 2.3132166812169714, + "learning_rate": 1.1722993537367277e-08, + "loss": 0.2829, + "step": 3362 + }, + { + "epoch": 0.95, + "grad_norm": 2.5768434717911473, + "learning_rate": 1.1583289609880308e-08, + "loss": 0.2876, + "step": 3363 + }, + { + "epoch": 0.95, + "grad_norm": 2.3452615889347994, + "learning_rate": 1.1444418262434586e-08, + "loss": 0.2531, + "step": 3364 + }, + { + "epoch": 0.95, + "grad_norm": 2.4335383089249945, + "learning_rate": 1.1306379612006645e-08, + "loss": 0.2877, + "step": 3365 + }, + { + "epoch": 0.95, + "grad_norm": 2.2737448478494238, + "learning_rate": 1.1169173774871477e-08, + "loss": 0.2422, + "step": 3366 + }, + { + "epoch": 0.95, + "grad_norm": 2.3643371162101996, + "learning_rate": 1.1032800866602632e-08, + "loss": 0.2475, + "step": 3367 + }, + { + "epoch": 0.95, + "grad_norm": 2.308016964987461, + "learning_rate": 1.0897261002072222e-08, + "loss": 0.2385, + "step": 3368 + }, + { + "epoch": 0.95, + "grad_norm": 2.396351551587896, + "learning_rate": 1.0762554295450366e-08, + "loss": 0.2635, + "step": 3369 + }, + { + "epoch": 0.95, + "grad_norm": 2.4150045202475745, + "learning_rate": 1.0628680860205518e-08, + "loss": 0.2565, + "step": 3370 + }, + { + "epoch": 0.96, + "grad_norm": 2.2957529859487034, + "learning_rate": 1.0495640809104256e-08, + "loss": 0.2808, + "step": 3371 + }, + { + "epoch": 0.96, + "grad_norm": 2.441843889242134, + "learning_rate": 1.0363434254211268e-08, + "loss": 0.2541, + "step": 3372 + }, + { + "epoch": 0.96, + "grad_norm": 2.4618524049071153, + "learning_rate": 1.0232061306888917e-08, + "loss": 0.2706, + "step": 3373 + }, + { + "epoch": 0.96, + "grad_norm": 2.5357395909750866, + "learning_rate": 1.0101522077797352e-08, + "loss": 0.2733, + "step": 3374 + }, + { + "epoch": 0.96, + "grad_norm": 2.619183803188383, + "learning_rate": 9.97181667689495e-09, + "loss": 0.2894, + "step": 3375 + }, + { + "epoch": 0.96, + "grad_norm": 2.534613440279317, + "learning_rate": 9.842945213437092e-09, + "loss": 0.2671, + "step": 3376 + }, + { + "epoch": 0.96, + "grad_norm": 2.230444791490229, + "learning_rate": 9.714907795977168e-09, + "loss": 0.2841, + "step": 3377 + }, + { + "epoch": 0.96, + "grad_norm": 2.7079530309001902, + "learning_rate": 9.587704532365681e-09, + "loss": 0.2649, + "step": 3378 + }, + { + "epoch": 0.96, + "grad_norm": 2.4746543060966806, + "learning_rate": 9.461335529750814e-09, + "loss": 0.2536, + "step": 3379 + }, + { + "epoch": 0.96, + "grad_norm": 2.474784362397472, + "learning_rate": 9.33580089457786e-09, + "loss": 0.2703, + "step": 3380 + }, + { + "epoch": 0.96, + "grad_norm": 2.449321332065921, + "learning_rate": 9.211100732589127e-09, + "loss": 0.278, + "step": 3381 + }, + { + "epoch": 0.96, + "grad_norm": 2.3510232296452678, + "learning_rate": 9.087235148824368e-09, + "loss": 0.2727, + "step": 3382 + }, + { + "epoch": 0.96, + "grad_norm": 2.2950727777946143, + "learning_rate": 8.964204247620011e-09, + "loss": 0.2902, + "step": 3383 + }, + { + "epoch": 0.96, + "grad_norm": 2.4002683065846058, + "learning_rate": 8.842008132609602e-09, + "loss": 0.2745, + "step": 3384 + }, + { + "epoch": 0.96, + "grad_norm": 2.462639292571834, + "learning_rate": 8.720646906723583e-09, + "loss": 0.2739, + "step": 3385 + }, + { + "epoch": 0.96, + "grad_norm": 2.419294993393598, + "learning_rate": 8.600120672188738e-09, + "loss": 0.2354, + "step": 3386 + }, + { + "epoch": 0.96, + "grad_norm": 2.2659319854149955, + "learning_rate": 8.480429530529076e-09, + "loss": 0.2479, + "step": 3387 + }, + { + "epoch": 0.96, + "grad_norm": 4.2481201180611, + "learning_rate": 8.361573582564729e-09, + "loss": 0.2959, + "step": 3388 + }, + { + "epoch": 0.96, + "grad_norm": 2.3327587692717473, + "learning_rate": 8.2435529284125e-09, + "loss": 0.2548, + "step": 3389 + }, + { + "epoch": 0.96, + "grad_norm": 3.024764248268901, + "learning_rate": 8.126367667485534e-09, + "loss": 0.291, + "step": 3390 + }, + { + "epoch": 0.96, + "grad_norm": 2.133213647977136, + "learning_rate": 8.010017898493315e-09, + "loss": 0.229, + "step": 3391 + }, + { + "epoch": 0.96, + "grad_norm": 2.2073354750539815, + "learning_rate": 7.89450371944167e-09, + "loss": 0.2341, + "step": 3392 + }, + { + "epoch": 0.96, + "grad_norm": 2.3459203903671693, + "learning_rate": 7.779825227632319e-09, + "loss": 0.2735, + "step": 3393 + }, + { + "epoch": 0.96, + "grad_norm": 2.2982394761984124, + "learning_rate": 7.665982519663327e-09, + "loss": 0.2733, + "step": 3394 + }, + { + "epoch": 0.96, + "grad_norm": 2.4779060772165153, + "learning_rate": 7.552975691428654e-09, + "loss": 0.2992, + "step": 3395 + }, + { + "epoch": 0.96, + "grad_norm": 2.3653518625493897, + "learning_rate": 7.440804838117931e-09, + "loss": 0.2544, + "step": 3396 + }, + { + "epoch": 0.96, + "grad_norm": 2.3191375340363085, + "learning_rate": 7.329470054217024e-09, + "loss": 0.2596, + "step": 3397 + }, + { + "epoch": 0.96, + "grad_norm": 2.233643860181891, + "learning_rate": 7.21897143350747e-09, + "loss": 0.2291, + "step": 3398 + }, + { + "epoch": 0.96, + "grad_norm": 3.645260025143113, + "learning_rate": 7.109309069065928e-09, + "loss": 0.2604, + "step": 3399 + }, + { + "epoch": 0.96, + "grad_norm": 2.2409004059704185, + "learning_rate": 7.000483053265505e-09, + "loss": 0.264, + "step": 3400 + }, + { + "epoch": 0.96, + "grad_norm": 2.0328585106470216, + "learning_rate": 6.892493477774097e-09, + "loss": 0.245, + "step": 3401 + }, + { + "epoch": 0.96, + "grad_norm": 2.4964028149676083, + "learning_rate": 6.7853404335554974e-09, + "loss": 0.2678, + "step": 3402 + }, + { + "epoch": 0.96, + "grad_norm": 2.558672079681962, + "learning_rate": 6.679024010868617e-09, + "loss": 0.3041, + "step": 3403 + }, + { + "epoch": 0.96, + "grad_norm": 2.40141182727304, + "learning_rate": 6.573544299267708e-09, + "loss": 0.2448, + "step": 3404 + }, + { + "epoch": 0.96, + "grad_norm": 2.516652123088787, + "learning_rate": 6.468901387602366e-09, + "loss": 0.3198, + "step": 3405 + }, + { + "epoch": 0.97, + "grad_norm": 2.3091424744946814, + "learning_rate": 6.36509536401697e-09, + "loss": 0.2587, + "step": 3406 + }, + { + "epoch": 0.97, + "grad_norm": 2.3951209136151093, + "learning_rate": 6.262126315951355e-09, + "loss": 0.2814, + "step": 3407 + }, + { + "epoch": 0.97, + "grad_norm": 2.313803926109103, + "learning_rate": 6.159994330140139e-09, + "loss": 0.2694, + "step": 3408 + }, + { + "epoch": 0.97, + "grad_norm": 2.5797502248297772, + "learning_rate": 6.0586994926128396e-09, + "loss": 0.2871, + "step": 3409 + }, + { + "epoch": 0.97, + "grad_norm": 2.359256899842382, + "learning_rate": 5.958241888693871e-09, + "loss": 0.2953, + "step": 3410 + }, + { + "epoch": 0.97, + "grad_norm": 2.35053897730414, + "learning_rate": 5.858621603002434e-09, + "loss": 0.2837, + "step": 3411 + }, + { + "epoch": 0.97, + "grad_norm": 2.329705678442523, + "learning_rate": 5.7598387194524035e-09, + "loss": 0.2842, + "step": 3412 + }, + { + "epoch": 0.97, + "grad_norm": 2.2477466985107877, + "learning_rate": 5.66189332125222e-09, + "loss": 0.2917, + "step": 3413 + }, + { + "epoch": 0.97, + "grad_norm": 2.255582150057296, + "learning_rate": 5.564785490904778e-09, + "loss": 0.2669, + "step": 3414 + }, + { + "epoch": 0.97, + "grad_norm": 2.3365220893647405, + "learning_rate": 5.468515310207866e-09, + "loss": 0.2792, + "step": 3415 + }, + { + "epoch": 0.97, + "grad_norm": 2.489205245133401, + "learning_rate": 5.373082860253286e-09, + "loss": 0.2997, + "step": 3416 + }, + { + "epoch": 0.97, + "grad_norm": 2.290026567655139, + "learning_rate": 5.278488221427402e-09, + "loss": 0.2529, + "step": 3417 + }, + { + "epoch": 0.97, + "grad_norm": 2.2195804476343555, + "learning_rate": 5.184731473410697e-09, + "loss": 0.2438, + "step": 3418 + }, + { + "epoch": 0.97, + "grad_norm": 2.440156596286958, + "learning_rate": 5.0918126951779995e-09, + "loss": 0.2575, + "step": 3419 + }, + { + "epoch": 0.97, + "grad_norm": 2.277947551672406, + "learning_rate": 4.999731964998255e-09, + "loss": 0.2365, + "step": 3420 + }, + { + "epoch": 0.97, + "grad_norm": 2.1878020075418747, + "learning_rate": 4.90848936043442e-09, + "loss": 0.2642, + "step": 3421 + }, + { + "epoch": 0.97, + "grad_norm": 2.3846931366612654, + "learning_rate": 4.818084958343571e-09, + "loss": 0.2614, + "step": 3422 + }, + { + "epoch": 0.97, + "grad_norm": 2.3490608640052697, + "learning_rate": 4.728518834876683e-09, + "loss": 0.2779, + "step": 3423 + }, + { + "epoch": 0.97, + "grad_norm": 2.631539271095303, + "learning_rate": 4.639791065478737e-09, + "loss": 0.2739, + "step": 3424 + }, + { + "epoch": 0.97, + "grad_norm": 2.2169767571560963, + "learning_rate": 4.551901724888063e-09, + "loss": 0.2424, + "step": 3425 + }, + { + "epoch": 0.97, + "grad_norm": 2.5267588387109967, + "learning_rate": 4.46485088713755e-09, + "loss": 0.2745, + "step": 3426 + }, + { + "epoch": 0.97, + "grad_norm": 2.516514525166744, + "learning_rate": 4.378638625553099e-09, + "loss": 0.2924, + "step": 3427 + }, + { + "epoch": 0.97, + "grad_norm": 2.8289199823796185, + "learning_rate": 4.29326501275451e-09, + "loss": 0.2984, + "step": 3428 + }, + { + "epoch": 0.97, + "grad_norm": 2.37941989685982, + "learning_rate": 4.208730120655257e-09, + "loss": 0.272, + "step": 3429 + }, + { + "epoch": 0.97, + "grad_norm": 2.3135649095456374, + "learning_rate": 4.125034020461937e-09, + "loss": 0.2414, + "step": 3430 + }, + { + "epoch": 0.97, + "grad_norm": 2.343939860797385, + "learning_rate": 4.042176782675266e-09, + "loss": 0.2724, + "step": 3431 + }, + { + "epoch": 0.97, + "grad_norm": 2.89142975164576, + "learning_rate": 3.9601584770887485e-09, + "loss": 0.2475, + "step": 3432 + }, + { + "epoch": 0.97, + "grad_norm": 2.4270415644781242, + "learning_rate": 3.878979172789454e-09, + "loss": 0.2627, + "step": 3433 + }, + { + "epoch": 0.97, + "grad_norm": 2.2244448517637934, + "learning_rate": 3.798638938157683e-09, + "loss": 0.2423, + "step": 3434 + }, + { + "epoch": 0.97, + "grad_norm": 2.5572297378768036, + "learning_rate": 3.7191378408670817e-09, + "loss": 0.2761, + "step": 3435 + }, + { + "epoch": 0.97, + "grad_norm": 2.4173689424138356, + "learning_rate": 3.640475947884303e-09, + "loss": 0.2652, + "step": 3436 + }, + { + "epoch": 0.97, + "grad_norm": 2.3181700082518804, + "learning_rate": 3.562653325469345e-09, + "loss": 0.2339, + "step": 3437 + }, + { + "epoch": 0.97, + "grad_norm": 2.358757031542546, + "learning_rate": 3.4856700391748817e-09, + "loss": 0.2793, + "step": 3438 + }, + { + "epoch": 0.97, + "grad_norm": 2.4888349477689635, + "learning_rate": 3.40952615384682e-09, + "loss": 0.2668, + "step": 3439 + }, + { + "epoch": 0.97, + "grad_norm": 2.4841252669275717, + "learning_rate": 3.3342217336239653e-09, + "loss": 0.2661, + "step": 3440 + }, + { + "epoch": 0.97, + "grad_norm": 2.535579975411954, + "learning_rate": 3.2597568419382437e-09, + "loss": 0.2823, + "step": 3441 + }, + { + "epoch": 0.98, + "grad_norm": 2.446986754997376, + "learning_rate": 3.1861315415139257e-09, + "loss": 0.2666, + "step": 3442 + }, + { + "epoch": 0.98, + "grad_norm": 2.304931837380515, + "learning_rate": 3.113345894368402e-09, + "loss": 0.2594, + "step": 3443 + }, + { + "epoch": 0.98, + "grad_norm": 2.359112779339876, + "learning_rate": 3.0413999618117415e-09, + "loss": 0.2738, + "step": 3444 + }, + { + "epoch": 0.98, + "grad_norm": 2.3442231737629338, + "learning_rate": 2.9702938044467994e-09, + "loss": 0.2618, + "step": 3445 + }, + { + "epoch": 0.98, + "grad_norm": 2.872008136124653, + "learning_rate": 2.9000274821687765e-09, + "loss": 0.2378, + "step": 3446 + }, + { + "epoch": 0.98, + "grad_norm": 2.393992867182129, + "learning_rate": 2.830601054165549e-09, + "loss": 0.2688, + "step": 3447 + }, + { + "epoch": 0.98, + "grad_norm": 2.4955852552862514, + "learning_rate": 2.7620145789177816e-09, + "loss": 0.3084, + "step": 3448 + }, + { + "epoch": 0.98, + "grad_norm": 2.394092561435307, + "learning_rate": 2.6942681141981506e-09, + "loss": 0.2509, + "step": 3449 + }, + { + "epoch": 0.98, + "grad_norm": 2.2975660397285913, + "learning_rate": 2.6273617170722295e-09, + "loss": 0.2669, + "step": 3450 + }, + { + "epoch": 0.98, + "grad_norm": 2.2922318671409196, + "learning_rate": 2.5612954438977154e-09, + "loss": 0.2503, + "step": 3451 + }, + { + "epoch": 0.98, + "grad_norm": 2.271404899827809, + "learning_rate": 2.4960693503245367e-09, + "loss": 0.2689, + "step": 3452 + }, + { + "epoch": 0.98, + "grad_norm": 2.173109962410258, + "learning_rate": 2.4316834912951887e-09, + "loss": 0.2373, + "step": 3453 + }, + { + "epoch": 0.98, + "grad_norm": 3.277192362094739, + "learning_rate": 2.3681379210442885e-09, + "loss": 0.255, + "step": 3454 + }, + { + "epoch": 0.98, + "grad_norm": 2.419018625423423, + "learning_rate": 2.3054326930984636e-09, + "loss": 0.2867, + "step": 3455 + }, + { + "epoch": 0.98, + "grad_norm": 2.704302423417181, + "learning_rate": 2.243567860276796e-09, + "loss": 0.296, + "step": 3456 + }, + { + "epoch": 0.98, + "grad_norm": 2.397799847013115, + "learning_rate": 2.1825434746903793e-09, + "loss": 0.2527, + "step": 3457 + }, + { + "epoch": 0.98, + "grad_norm": 2.35412657731211, + "learning_rate": 2.1223595877420953e-09, + "loss": 0.2718, + "step": 3458 + }, + { + "epoch": 0.98, + "grad_norm": 2.6890571841517277, + "learning_rate": 2.0630162501272806e-09, + "loss": 0.2542, + "step": 3459 + }, + { + "epoch": 0.98, + "grad_norm": 2.438948016197178, + "learning_rate": 2.0045135118328394e-09, + "loss": 0.2801, + "step": 3460 + }, + { + "epoch": 0.98, + "grad_norm": 2.267259101241706, + "learning_rate": 1.946851422138018e-09, + "loss": 0.2311, + "step": 3461 + }, + { + "epoch": 0.98, + "grad_norm": 2.551728847298024, + "learning_rate": 1.890030029613521e-09, + "loss": 0.2799, + "step": 3462 + }, + { + "epoch": 0.98, + "grad_norm": 2.479703785478036, + "learning_rate": 1.8340493821222824e-09, + "loss": 0.2788, + "step": 3463 + }, + { + "epoch": 0.98, + "grad_norm": 2.6460667172496315, + "learning_rate": 1.7789095268188058e-09, + "loss": 0.2908, + "step": 3464 + }, + { + "epoch": 0.98, + "grad_norm": 2.5203858422375642, + "learning_rate": 1.7246105101493825e-09, + "loss": 0.3041, + "step": 3465 + }, + { + "epoch": 0.98, + "grad_norm": 2.2142928954216696, + "learning_rate": 1.671152377852092e-09, + "loss": 0.2428, + "step": 3466 + }, + { + "epoch": 0.98, + "grad_norm": 2.265712772154652, + "learning_rate": 1.6185351749569142e-09, + "loss": 0.259, + "step": 3467 + }, + { + "epoch": 0.98, + "grad_norm": 2.2995218830476962, + "learning_rate": 1.5667589457849516e-09, + "loss": 0.2413, + "step": 3468 + }, + { + "epoch": 0.98, + "grad_norm": 3.1040150036059075, + "learning_rate": 1.5158237339494283e-09, + "loss": 0.234, + "step": 3469 + }, + { + "epoch": 0.98, + "grad_norm": 2.3875856781933744, + "learning_rate": 1.4657295823549132e-09, + "loss": 0.2668, + "step": 3470 + }, + { + "epoch": 0.98, + "grad_norm": 2.462460581054891, + "learning_rate": 1.4164765331976525e-09, + "loss": 0.2631, + "step": 3471 + }, + { + "epoch": 0.98, + "grad_norm": 2.4861816892269157, + "learning_rate": 1.3680646279651265e-09, + "loss": 0.2766, + "step": 3472 + }, + { + "epoch": 0.98, + "grad_norm": 2.455987767118866, + "learning_rate": 1.320493907436604e-09, + "loss": 0.2293, + "step": 3473 + }, + { + "epoch": 0.98, + "grad_norm": 2.7477765197169726, + "learning_rate": 1.2737644116826985e-09, + "loss": 0.305, + "step": 3474 + }, + { + "epoch": 0.98, + "grad_norm": 2.2249571672813317, + "learning_rate": 1.227876180065368e-09, + "loss": 0.258, + "step": 3475 + }, + { + "epoch": 0.98, + "grad_norm": 2.2543225771684656, + "learning_rate": 1.1828292512380267e-09, + "loss": 0.2526, + "step": 3476 + }, + { + "epoch": 0.99, + "grad_norm": 2.3257233204429535, + "learning_rate": 1.1386236631452107e-09, + "loss": 0.2569, + "step": 3477 + }, + { + "epoch": 0.99, + "grad_norm": 2.5350058521661056, + "learning_rate": 1.095259453023023e-09, + "loss": 0.2867, + "step": 3478 + }, + { + "epoch": 0.99, + "grad_norm": 2.4593700491079966, + "learning_rate": 1.0527366573986895e-09, + "loss": 0.2921, + "step": 3479 + }, + { + "epoch": 0.99, + "grad_norm": 2.2846641331816384, + "learning_rate": 1.0110553120908915e-09, + "loss": 0.2572, + "step": 3480 + }, + { + "epoch": 0.99, + "grad_norm": 2.3702436683714185, + "learning_rate": 9.70215452209211e-10, + "loss": 0.2596, + "step": 3481 + }, + { + "epoch": 0.99, + "grad_norm": 2.392817209094412, + "learning_rate": 9.302171121546853e-10, + "loss": 0.2476, + "step": 3482 + }, + { + "epoch": 0.99, + "grad_norm": 2.616727121844519, + "learning_rate": 8.910603256192529e-10, + "loss": 0.2712, + "step": 3483 + }, + { + "epoch": 0.99, + "grad_norm": 2.5477779958428575, + "learning_rate": 8.527451255863071e-10, + "loss": 0.2786, + "step": 3484 + }, + { + "epoch": 0.99, + "grad_norm": 2.42618092930159, + "learning_rate": 8.152715443300318e-10, + "loss": 0.2721, + "step": 3485 + }, + { + "epoch": 0.99, + "grad_norm": 2.5225716230160917, + "learning_rate": 7.786396134158435e-10, + "loss": 0.2834, + "step": 3486 + }, + { + "epoch": 0.99, + "grad_norm": 2.614699044791809, + "learning_rate": 7.42849363700282e-10, + "loss": 0.2844, + "step": 3487 + }, + { + "epoch": 0.99, + "grad_norm": 2.5011157795190915, + "learning_rate": 7.079008253306762e-10, + "loss": 0.2605, + "step": 3488 + }, + { + "epoch": 0.99, + "grad_norm": 2.493778563490711, + "learning_rate": 6.737940277454778e-10, + "loss": 0.2774, + "step": 3489 + }, + { + "epoch": 0.99, + "grad_norm": 2.257688702699588, + "learning_rate": 6.405289996741503e-10, + "loss": 0.2579, + "step": 3490 + }, + { + "epoch": 0.99, + "grad_norm": 2.427775485003037, + "learning_rate": 6.081057691370572e-10, + "loss": 0.2593, + "step": 3491 + }, + { + "epoch": 0.99, + "grad_norm": 2.2505185636951026, + "learning_rate": 5.76524363445463e-10, + "loss": 0.2815, + "step": 3492 + }, + { + "epoch": 0.99, + "grad_norm": 2.4718683751201636, + "learning_rate": 5.457848092015327e-10, + "loss": 0.2612, + "step": 3493 + }, + { + "epoch": 0.99, + "grad_norm": 2.3198523130298656, + "learning_rate": 5.158871322984426e-10, + "loss": 0.2698, + "step": 3494 + }, + { + "epoch": 0.99, + "grad_norm": 2.232091467187299, + "learning_rate": 4.868313579200479e-10, + "loss": 0.2533, + "step": 3495 + }, + { + "epoch": 0.99, + "grad_norm": 2.4747398852089417, + "learning_rate": 4.5861751054110385e-10, + "loss": 0.2753, + "step": 3496 + }, + { + "epoch": 0.99, + "grad_norm": 2.5764286483014556, + "learning_rate": 4.3124561392715584e-10, + "loss": 0.2567, + "step": 3497 + }, + { + "epoch": 0.99, + "grad_norm": 2.367029999731487, + "learning_rate": 4.047156911345384e-10, + "loss": 0.2753, + "step": 3498 + }, + { + "epoch": 0.99, + "grad_norm": 2.440774884458288, + "learning_rate": 3.7902776451048667e-10, + "loss": 0.2955, + "step": 3499 + }, + { + "epoch": 0.99, + "grad_norm": 2.2906056439515177, + "learning_rate": 3.5418185569280337e-10, + "loss": 0.2582, + "step": 3500 + }, + { + "epoch": 0.99, + "grad_norm": 2.2754598342103325, + "learning_rate": 3.3017798561030266e-10, + "loss": 0.2427, + "step": 3501 + }, + { + "epoch": 0.99, + "grad_norm": 2.401165101202187, + "learning_rate": 3.070161744820332e-10, + "loss": 0.2716, + "step": 3502 + }, + { + "epoch": 0.99, + "grad_norm": 2.1729922241238833, + "learning_rate": 2.846964418182773e-10, + "loss": 0.2476, + "step": 3503 + }, + { + "epoch": 0.99, + "grad_norm": 2.2677043074356003, + "learning_rate": 2.632188064196628e-10, + "loss": 0.2563, + "step": 3504 + }, + { + "epoch": 0.99, + "grad_norm": 2.449392876606644, + "learning_rate": 2.4258328637771776e-10, + "loss": 0.2601, + "step": 3505 + }, + { + "epoch": 0.99, + "grad_norm": 2.3055918692627793, + "learning_rate": 2.22789899074427e-10, + "loss": 0.2693, + "step": 3506 + }, + { + "epoch": 0.99, + "grad_norm": 2.4329321525802556, + "learning_rate": 2.0383866118245385e-10, + "loss": 0.2406, + "step": 3507 + }, + { + "epoch": 0.99, + "grad_norm": 2.2791612559079786, + "learning_rate": 1.8572958866514e-10, + "loss": 0.2761, + "step": 3508 + }, + { + "epoch": 0.99, + "grad_norm": 2.560549908458607, + "learning_rate": 1.684626967765057e-10, + "loss": 0.235, + "step": 3509 + }, + { + "epoch": 0.99, + "grad_norm": 2.5078118323749727, + "learning_rate": 1.5203800006102774e-10, + "loss": 0.3003, + "step": 3510 + }, + { + "epoch": 0.99, + "grad_norm": 2.493354960056434, + "learning_rate": 1.3645551235386133e-10, + "loss": 0.2504, + "step": 3511 + }, + { + "epoch": 1.0, + "grad_norm": 2.762856738714792, + "learning_rate": 1.2171524678061818e-10, + "loss": 0.2951, + "step": 3512 + }, + { + "epoch": 1.0, + "grad_norm": 2.4196022508390054, + "learning_rate": 1.0781721575781056e-10, + "loss": 0.2877, + "step": 3513 + }, + { + "epoch": 1.0, + "grad_norm": 2.1964914531229427, + "learning_rate": 9.476143099207412e-11, + "loss": 0.2594, + "step": 3514 + }, + { + "epoch": 1.0, + "grad_norm": 2.3678613204157104, + "learning_rate": 8.254790348072304e-11, + "loss": 0.2815, + "step": 3515 + }, + { + "epoch": 1.0, + "grad_norm": 2.3026631969295814, + "learning_rate": 7.117664351186103e-11, + "loss": 0.2821, + "step": 3516 + }, + { + "epoch": 1.0, + "grad_norm": 2.560841032353236, + "learning_rate": 6.06476606638262e-11, + "loss": 0.2817, + "step": 3517 + }, + { + "epoch": 1.0, + "grad_norm": 2.358566543205405, + "learning_rate": 5.096096380552417e-11, + "loss": 0.2727, + "step": 3518 + }, + { + "epoch": 1.0, + "grad_norm": 2.47453393582181, + "learning_rate": 4.211656109642803e-11, + "loss": 0.2822, + "step": 3519 + }, + { + "epoch": 1.0, + "grad_norm": 2.1677384205058012, + "learning_rate": 3.411445998668938e-11, + "loss": 0.2386, + "step": 3520 + }, + { + "epoch": 1.0, + "grad_norm": 2.405772186235948, + "learning_rate": 2.6954667216472217e-11, + "loss": 0.2798, + "step": 3521 + }, + { + "epoch": 1.0, + "grad_norm": 2.4163648834715747, + "learning_rate": 2.063718881695209e-11, + "loss": 0.2753, + "step": 3522 + }, + { + "epoch": 1.0, + "grad_norm": 2.1951499438686906, + "learning_rate": 1.516203010953898e-11, + "loss": 0.2416, + "step": 3523 + }, + { + "epoch": 1.0, + "grad_norm": 2.5347777261578788, + "learning_rate": 1.0529195706099337e-11, + "loss": 0.3174, + "step": 3524 + }, + { + "epoch": 1.0, + "grad_norm": 2.303176321896944, + "learning_rate": 6.738689509067086e-12, + "loss": 0.2859, + "step": 3525 + }, + { + "epoch": 1.0, + "grad_norm": 2.253837882658441, + "learning_rate": 3.790514711332626e-12, + "loss": 0.2613, + "step": 3526 + }, + { + "epoch": 1.0, + "grad_norm": 2.5952689897785053, + "learning_rate": 1.6846737963538415e-12, + "loss": 0.2775, + "step": 3527 + }, + { + "epoch": 1.0, + "grad_norm": 2.359674481777684, + "learning_rate": 4.211685378230356e-13, + "loss": 0.2754, + "step": 3528 + }, + { + "epoch": 1.0, + "grad_norm": 2.4963844359266916, + "learning_rate": 0.0, + "loss": 0.2803, + "step": 3529 + }, + { + "epoch": 1.0, + "step": 3529, + "total_flos": 696559979986944.0, + "train_loss": 0.3025049172588839, + "train_runtime": 35270.0898, + "train_samples_per_second": 3.202, + "train_steps_per_second": 0.1 + } + ], + "logging_steps": 1.0, + "max_steps": 3529, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 25000, + "total_flos": 696559979986944.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}